/* * Copyright 2014, Daehwan Kim * * This file is part of HISAT. * * HISAT is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * HISAT is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with HISAT. If not, see . */ #ifndef BP_ALIGNER_H_ #define BP_ALIGNER_H_ #include "hi_aligner.h" /** * With a hierarchical indexing, SplicedAligner provides several alignment strategies * , which enable effective alignment of RNA-seq reads */ template class BP_Aligner : public HI_Aligner { public: /** * Initialize with index. */ BP_Aligner( const Ebwt& ebwt, const EList& refnames, MUTEX_T* mutex, uint64_t threads_rids_mindist = 0, bool no_spliced_alignment = false) : HI_Aligner(ebwt, threads_rids_mindist, no_spliced_alignment), _refnames(refnames), _mutex(mutex), _done(false) { } ~BP_Aligner() { } /** * Aligns a read or a pair * This funcion is called per read or pair */ virtual int go( const Scoring& sc, const Ebwt& ebwtFw, const Ebwt& ebwtBw, const BitPairReference& ref, SwAligner& swa, SpliceSiteDB& ssdb, WalkMetrics& wlm, PerReadMetrics& prm, SwMetrics& swm, HIMetrics& him, RandomSource& rnd, AlnSinkWrap& sink) { _done = false; index_t rdi; bool fw; while(this->nextBWT(sc, ebwtFw, ebwtBw, ref, rdi, fw, wlm, prm, him, rnd, sink)) { // given the partial alignment, try to extend it to full alignments this->align(sc, ebwtFw, ebwtBw, ref, swa, ssdb, rdi, fw, wlm, prm, swm, him, rnd, sink); if(_done) break; } return EXTEND_POLICY_FULFILLED; } /** * Given a partial alignment of a read, try to further extend * the alignment bidirectionally using a combination of * local search, extension, and global search */ virtual void hybridSearch( const Scoring& sc, const Ebwt& ebwtFw, const Ebwt& ebwtBw, const BitPairReference& ref, SwAligner& swa, SpliceSiteDB& ssdb, index_t rdi, bool fw, WalkMetrics& wlm, PerReadMetrics& prm, SwMetrics& swm, HIMetrics& him, RandomSource& rnd, AlnSinkWrap& sink); /** * Given a partial alignment of a read, try to further extend * the alignment bidirectionally using a combination of * local search, extension, and global search */ virtual void hybridSearch_recur( const Scoring& sc, const Ebwt& ebwtFw, const Ebwt& ebwtBw, const BitPairReference& ref, SwAligner& swa, SpliceSiteDB& ssdb, index_t rdi, const GenomeHit& hit, index_t hitoff, index_t hitlen, WalkMetrics& wlm, PerReadMetrics& prm, SwMetrics& swm, HIMetrics& him, RandomSource& rnd, AlnSinkWrap& sink, index_t dep = 0); private: EList _refnames; MUTEX_T* _mutex; bool _done; }; /** * Given a partial alignment of a read, try to further extend * the alignment bidirectionally using a combination of * local search, extension, and global search */ template void BP_Aligner::hybridSearch( const Scoring& sc, const Ebwt& ebwtFw, const Ebwt& ebwtBw, const BitPairReference& ref, SwAligner& swa, SpliceSiteDB& ssdb, index_t rdi, bool fw, WalkMetrics& wlm, PerReadMetrics& prm, SwMetrics& swm, HIMetrics& him, RandomSource& rnd, AlnSinkWrap& sink) { assert_lt(rdi, 2); assert(this->_rds[rdi] != NULL); him.localatts++; // before further alignment using local search, extend the partial alignments directly // by comparing with the corresponding genomic sequences // this extension is performed without any mismatches allowed for(index_t hi = 0; hi < this->_genomeHits.size(); hi++) { GenomeHit& genomeHit = this->_genomeHits[hi]; index_t leftext = (index_t)OFF_MASK, rightext = (index_t)OFF_MASK; genomeHit.extend(*(this->_rds[rdi]), ref, ssdb, swa, swm, prm, sc, this->_minsc[rdi], rnd, this->_minK_local, leftext, rightext); } // for the candidate alignments, examine the longest (best) one first this->_genomeHits_done.resize(this->_genomeHits.size()); this->_genomeHits_done.fill(false); for(size_t hi = 0; hi < this->_genomeHits.size(); hi++) { index_t hj = 0; for(; hj < this->_genomeHits.size(); hj++) { if(!this->_genomeHits_done[hj]) break; } if(hj >= this->_genomeHits.size()) break; for(index_t hk = hj + 1; hk < this->_genomeHits.size(); hk++) { if(this->_genomeHits_done[hk]) continue; GenomeHit& genomeHit_j = this->_genomeHits[hj]; GenomeHit& genomeHit_k = this->_genomeHits[hk]; if(genomeHit_k.hitcount() > genomeHit_j.hitcount() || (genomeHit_k.hitcount() == genomeHit_j.hitcount() && genomeHit_k.len() > genomeHit_j.len())) { hj = hk; } } // given a candidate partial alignment, extend it bidirectionally him.anchoratts++; GenomeHit& genomeHit = this->_genomeHits[hj]; hybridSearch_recur( sc, ebwtFw, ebwtBw, ref, swa, ssdb, rdi, genomeHit, genomeHit.rdoff(), genomeHit.len(), wlm, prm, swm, him, rnd, sink); this->_genomeHits_done[hj] = true; if(_done) return; } } /** * Given a partial alignment of a read, try to further extend * the alignment bidirectionally using a combination of * local search, extension, and global search */ template void BP_Aligner::hybridSearch_recur( const Scoring& sc, const Ebwt& ebwtFw, const Ebwt& ebwtBw, const BitPairReference& ref, SwAligner& swa, SpliceSiteDB& ssdb, index_t rdi, const GenomeHit& hit, index_t hitoff, index_t hitlen, WalkMetrics& wlm, PerReadMetrics& prm, SwMetrics& swm, HIMetrics& him, RandomSource& rnd, AlnSinkWrap& sink, index_t dep) { if(_done) return; him.localsearchrecur++; assert_lt(rdi, 2); assert(this->_rds[rdi] != NULL); const Read& rd = *(this->_rds[rdi]); index_t rdlen = rd.length(); if(hit.score() < this->_minsc[rdi]) return; // if it's already examined, just return if(hitoff == hit.rdoff() - hit.trim5() && hitlen == hit.len() + hit.trim5() + hit.trim3()) { if(this->isSearched(hit, rdi)) return; this->addSearched(hit, rdi); } // for effective use of memory allocation and deallocation if(this->_coords.size() <= dep) { this->_coords.expand(); assert_leq(this->_local_genomeHits.size(), dep); this->_local_genomeHits.expand(); assert_leq(this->_spliceSites.size(), dep); this->_spliceSites.expand(); } EList& coords = this->_coords[dep]; EList >& local_genomeHits = this->_local_genomeHits[dep]; EList& spliceSites = this->_spliceSites[dep]; // daehwan - for debugging purposes #if 0 cout << rd.name << "\t" << (hit.fw() ? "+" : "-") << "\t" << hitoff << "\t" << hitoff + hitlen << "\t" << "( " << hit.rdoff() << "\t" << hit.rdoff() + hit.len() << " )" << "\t" << hit.refoff() << "\t" << hit.getRightOff() << "\t" << hit.score() << "\t" << "dep: " << dep << "\t"; Edit::print(cout, hit.edits()); cout << endl; #endif assert_leq(hitoff + hitlen, rdlen); // if this is a full alignment, report it if(hitoff == 0 && hitlen == rdlen) { if(!this->redundant(sink, rdi, hit)) { // this->reportHit(sc, ebwtFw, ref, sink, rdi, hit); return; } } else if(hitoff > 0 && (hitoff + hitlen == rdlen || hitoff + hitoff < rdlen - hitlen)) { // extend the partial alignment in the left direction index_t fragoff = 0, fraglen = 0, left = 0; hit.getLeft(fragoff, fraglen, left); const index_t minMatchLen = this->_minK_local; // make use of a list of known or novel splice sites to further align the read if(fraglen >= minMatchLen && left >= minMatchLen && !this->_no_spliced_alignment) { spliceSites.clear(); ssdb.getLeftSpliceSites(hit.ref(), left + minMatchLen, minMatchLen * 2, spliceSites); for(size_t si = 0; si < spliceSites.size(); si++) { const SpliceSite& ss = spliceSites[si]; if(!ss._fromfile && ss._readid + this->_thread_rids_mindist > rd.rdid) continue; if(left + fraglen - 1 < ss.right()) continue; index_t frag2off = ss.left() - (ss.right() - left); if(frag2off + 1 < hitoff) continue; GenomeHit tempHit; tempHit.init(hit.fw(), 0, hitoff, 0, // trim5 0, // trim3 hit.ref(), frag2off + 1 - hitoff, this->_sharedVars); if(!tempHit.compatibleWith(hit, this->_no_spliced_alignment)) continue; int64_t minsc = this->_minsc[rdi]; bool combined = tempHit.combineWith(hit, rd, ref, ssdb, swa, swm, sc, minsc, rnd, this->_minK_local, 1, 1, false); if(rdi == 0) minsc = max(minsc, sink.bestUnp1()); else minsc = max(minsc, sink.bestUnp2()); if(combined && tempHit.score() >= minsc) { assert_eq(tempHit.trim5(), 0); assert_leq(tempHit.rdoff() + tempHit.len() + tempHit.trim3(), rdlen); hybridSearch_recur( sc, ebwtFw, ebwtBw, ref, swa, ssdb, rdi, tempHit, tempHit.rdoff(), tempHit.len() + tempHit.trim3(), wlm, prm, swm, him, rnd, sink, dep + 1); } } } // choose a local index based on the genomic location of the partial alignment const HierEbwt* hierEbwtFw = (const HierEbwt*)(&ebwtFw); const LocalEbwt* localEbwtFw = hierEbwtFw->getLocalEbwt(hit.ref(), hit.refoff()); assert_leq(localEbwtFw->_localOffset, hit.refoff()); bool success = false, first = true; index_t count = 0; // consider at most two local indexes index_t max_count = 2; int64_t prev_score = hit.score(); local_genomeHits.clear(); while(!success && count++ < max_count) { if(him.localindexatts >= this->max_localindexatts) return; if(first) { first = false; } else { localEbwtFw = hierEbwtFw->prevLocalEbwt(localEbwtFw); if(localEbwtFw == NULL || localEbwtFw->empty()) break; } // local index search index_t extlen = 0; local_index_t top = (local_index_t)OFF_MASK, bot = (local_index_t)OFF_MASK; index_t extoff = hitoff - 1; if(extoff > 0) extoff -= 1; if(extoff < minAnchorLen) { extoff = minAnchorLen; } index_t nelt = (index_t)OFF_MASK; index_t max_nelt = std::max(5, extlen); bool no_extension = false; bool uniqueStop; // daehwan - for debugging purposes // index_t minUniqueLen = this->_minK_local; index_t minUniqueLen = (index_t)OFF_MASK; for(; extoff < rdlen; extoff++) { extlen = 0; // daehwan - for debugging purposes // uniqueStop = true; uniqueStop = false; him.localindexatts++; nelt = this->localEbwtSearch( localEbwtFw, // BWT index NULL, // BWT index rd, // read to align sc, // scoring scheme hit.fw(), false, // searchfw, extoff, extlen, top, bot, rnd, uniqueStop, minUniqueLen); if(extoff + 1 - extlen >= hitoff) { no_extension = true; break; } if(nelt <= max_nelt) break; } assert_leq(top, bot); assert_eq(nelt, (index_t)(bot - top)); assert_leq(extlen, extoff + 1); if(nelt > 0 && nelt <= max_nelt && extlen >= minAnchorLen && !no_extension) { assert_leq(nelt, max_nelt); coords.clear(); bool straddled = false; // get genomic locations for this local search this->getGenomeCoords_local( *localEbwtFw, ref, rnd, top, bot, hit.fw(), extoff + 1 - extlen, extlen, coords, wlm, prm, him, true, // reject straddled? straddled); assert_leq(coords.size(), nelt); coords.sort(); for(int ri = coords.size() - 1; ri >= 0; ri--) { const Coord& coord = coords[ri]; GenomeHit tempHit; tempHit.init(coord.orient(), extoff + 1 - extlen, extlen, 0, // trim5 0, // trim3 coord.ref(), coord.off(), this->_sharedVars); // daehwan - for debugging purposes if(coord.ref() == hit.ref() && coord.off() > hit.refoff() && coord.off() < hit.refoff() + 64000) { index_t leftext = (index_t)OFF_MASK, rightext = (index_t)0; index_t mm = 1; tempHit.extend( rd, ref, ssdb, swa, swm, prm, sc, this->_minsc[rdi], rnd, this->_minK_local, leftext, rightext, mm); #if 0 cout << endl; cout << rd.rdid << "\t" << rd.name << endl; cout << "\ttype: " << 1 << endl; cout << "\t" << hit.ref() << endl; cout << "\t\ttempHit " << (tempHit.fw() ? "+" : "-") << "\t" << tempHit.refoff() << "\t" << tempHit.rdoff() << "\t" << tempHit.len() << "\t" << tempHit.score() << endl; cout << "\t\tanchHit " << (hit.fw() ? "+" : "-") << "\t" << hit.refoff() << "\t" << hit.rdoff() << "\t" << hit.len() << "\t" << hit.score() << endl; spliceSites.clear(); ssdb.getRightSpliceSites(hit.ref(), hit.refoff() - minMatchLen, minMatchLen * 2, spliceSites); for(size_t si = 0; si < spliceSites.size(); si++) { const SpliceSite& ss = spliceSites[si]; if(ss.right() > tempHit.refoff()) { index_t dist = ss.right() - 1 - (tempHit.refoff() + tempHit.len() - 1); cout << rd.rdid << "s\t\t\t" << ss.left() + 1 << "\t" << ss.right() - 1 << "\t" << (ss.fw() ? "+" : "-") << "\t" << dist << endl; } } spliceSites.clear(); ssdb.getLeftSpliceSites(tempHit.ref(), tempHit.getRightOff() + minMatchLen, minMatchLen * 2, spliceSites); for(size_t si = 0; si < spliceSites.size(); si++) { const SpliceSite& ss = spliceSites[si]; if(ss.left() < hit.refoff()) { index_t dist = hit.refoff() - (ss.left() + 1); cout << rd.rdid << "s\t\t\t" << ss.left() + 1 << "\t" << ss.right() - 1 << "\t" << (ss.fw() ? "+" : "-") << "\t" << dist << endl; } } if(tempHit.rdoff() + tempHit.len() + 1 >= hit.rdoff()) return; #else assert_lt(hit.ref(), _refnames.size()); { ThreadSafe t(const_cast(_mutex), true); cout << rd.name << "\t" << _refnames[hit.ref()] << "\t" << (tempHit.fw() ? "+" : "-") << "\t" << tempHit.refoff() << "\t" << tempHit.rdoff() << "\t" << tempHit.len() << "\t" << tempHit.score() << "\t" << (hit.fw() ? "+" : "-") << "\t" << hit.refoff() << "\t" << hit.rdoff() << "\t" << hit.len() << "\t" << hit.score() << endl; } _done = true; return; #endif } // check if the partial alignment is compatible with the new alignment using the local index if(!tempHit.compatibleWith(hit, this->_no_spliced_alignment)) { if(count == 1) continue; else break; } if(uniqueStop) { assert_eq(coords.size(), 1); index_t leftext = (index_t)OFF_MASK, rightext = (index_t)0; tempHit.extend(rd, ref, ssdb, swa, swm, prm, sc, this->_minsc[rdi], rnd, this->_minK_local, leftext, rightext); } // combine the partial alignment and the new alignment int64_t minsc = this->_minsc[rdi]; bool combined = tempHit.combineWith(hit, rd, ref, ssdb, swa, swm, sc, minsc, rnd, this->_minK_local); if(rdi == 0) minsc = max(minsc, sink.bestUnp1()); else minsc = max(minsc, sink.bestUnp2()); if(combined && tempHit.score() >= minsc) { assert_eq(tempHit.trim5(), 0); assert_leq(tempHit.rdoff() + tempHit.len() + tempHit.trim3(), rdlen); if(tempHit.score() >= prev_score - sc.mmpMax) { // extend the new partial alignment recursively hybridSearch_recur( sc, ebwtFw, ebwtBw, ref, swa, ssdb, rdi, tempHit, tempHit.rdoff(), tempHit.len() + tempHit.trim3(), wlm, prm, swm, him, rnd, sink, dep + 1); } else { local_genomeHits.push_back(tempHit); } } } } int64_t minsc = (rdi == 0 ? sink.bestUnp1() : sink.bestUnp2()); if(minsc >= prev_score - sc.mmpMax) success = true; if(!success && (count == max_count || hierEbwtFw->prevLocalEbwt(localEbwtFw) == NULL)) { for(index_t ti = 0; ti < local_genomeHits.size(); ti++) { GenomeHit& tempHit = local_genomeHits[ti]; int64_t minsc = this->_minsc[rdi]; if(rdi == 0) minsc = max(minsc, sink.bestUnp1()); else minsc = max(minsc, sink.bestUnp2()); if(tempHit.score() >= minsc) { hybridSearch_recur( sc, ebwtFw, ebwtBw, ref, swa, ssdb, rdi, tempHit, tempHit.rdoff(), tempHit.len() + tempHit.trim3(), wlm, prm, swm, him, rnd, sink, dep + 1); } } } } // while(!success && count++ < 2) if(!success) { if(hitoff > this->_minK) { index_t extlen = 0; index_t top = (index_t)OFF_MASK, bot = (index_t)OFF_MASK; index_t extoff = hitoff - 1; bool uniqueStop = true; // perform global search for long introns index_t nelt = this->globalEbwtSearch( ebwtFw, // BWT index rd, // read to align sc, // scoring scheme hit.fw(), extoff, extlen, top, bot, rnd, uniqueStop); if(nelt <= 5 && extlen >= this->_minK) { coords.clear(); bool straddled = false; this->getGenomeCoords( ebwtFw, ref, rnd, top, bot, hit.fw(), bot - top, extoff + 1 - extlen, extlen, coords, wlm, prm, him, true, // reject straddled? straddled); assert_leq(coords.size(), nelt); coords.sort(); for(int ri = coords.size() - 1; ri >= 0; ri--) { const Coord& coord = coords[ri]; GenomeHit tempHit; tempHit.init(coord.orient(), extoff + 1 - extlen, extlen, 0, // trim5 0, // trim3 coord.ref(), coord.off(), this->_sharedVars); if(!tempHit.compatibleWith(hit, this->_no_spliced_alignment)) continue; if(uniqueStop) { assert_eq(coords.size(), 1); index_t leftext = (index_t)OFF_MASK, rightext = (index_t)0; tempHit.extend(rd, ref, ssdb, swa, swm, prm, sc, this->_minsc[rdi], rnd, this->_minK_local, leftext, rightext); } int64_t minsc = this->_minsc[rdi]; bool combined = tempHit.combineWith(hit, rd, ref, ssdb, swa, swm, sc, minsc, rnd, this->_minK_local); if(rdi == 0) minsc = max(minsc, sink.bestUnp1()); else minsc = max(minsc, sink.bestUnp2()); if(combined && tempHit.score() >= minsc) { assert_eq(tempHit.trim5(), 0); assert_leq(tempHit.rdoff() + tempHit.len() + tempHit.trim3(), rdlen); hybridSearch_recur( sc, ebwtFw, ebwtBw, ref, swa, ssdb, rdi, tempHit, tempHit.rdoff(), tempHit.len() + tempHit.trim3(), wlm, prm, swm, him, rnd, sink, dep + 1); } } } } GenomeHit tempHit = hit; if(tempHit.rdoff() <= 5) { index_t trim5 = tempHit.rdoff(); tempHit.trim5(trim5); assert_leq(tempHit.len() + tempHit.trim5() + tempHit.trim3(), rdlen); hybridSearch_recur( sc, ebwtFw, ebwtBw, ref, swa, ssdb, rdi, tempHit, 0, tempHit.len() + tempHit.trim5() + tempHit.trim3(), wlm, prm, swm, him, rnd, sink, dep + 1); return; } // extend the partial alignment directly comparing with the corresponding genomic sequence // with mismatches or a gap allowed int64_t minsc = this->_minsc[rdi]; assert_geq(tempHit.score(), minsc); index_t mm = (tempHit.score() - minsc) / sc.mmpMax; index_t leftext = (index_t)OFF_MASK, rightext = (index_t)0; index_t num_mismatch_allowed = 1; if(hitoff <= this->_minK_local) { num_mismatch_allowed = min(tempHit.rdoff(), mm); } him.localextatts++; tempHit.extend(rd, ref, ssdb, swa, swm, prm, sc, this->_minsc[rdi], rnd, this->_minK_local, leftext, rightext, num_mismatch_allowed); if(rdi == 0) minsc = max(minsc, sink.bestUnp1()); else minsc = max(minsc, sink.bestUnp2()); if(tempHit.score() >= minsc && leftext >= min(this->_minK_local, hit.rdoff())) { assert_eq(tempHit.trim5(), 0); assert_leq(tempHit.rdoff() + tempHit.len() + tempHit.trim3(), rdlen); hybridSearch_recur( sc, ebwtFw, ebwtBw, ref, swa, ssdb, rdi, tempHit, tempHit.rdoff(), tempHit.len() + tempHit.trim3(), wlm, prm, swm, him, rnd, sink, dep + 1); } else if(hitoff > this->_minK_local) { // skip some bases of a read index_t jumplen = hitoff > this->_minK ? this->_minK : this->_minK_local; assert_leq(hitoff, hit.rdoff()); int64_t expected_score = hit.score() - (hit.rdoff() - hitoff) / jumplen * sc.mmpMax - sc.mmpMax; if(expected_score >= minsc) { assert_lt(hitlen + jumplen, rdlen); assert_eq(hit.trim5(), 0); assert_leq(hitoff + hitlen, rdlen); hybridSearch_recur( sc, ebwtFw, ebwtBw, ref, swa, ssdb, rdi, hit, hitoff - jumplen, hitlen + jumplen, wlm, prm, swm, him, rnd, sink, dep + 1); } } } } else { // extend the partial alignment in the right direction assert_lt(hitoff + hitlen, rdlen); index_t fragoff = 0, fraglen = 0, right = 0; hit.getRight(fragoff, fraglen, right); const index_t minMatchLen = this->_minK_local; // make use of a list of known or novel splice sites to further align the read if(fraglen >= minMatchLen && !this->_no_spliced_alignment) { spliceSites.clear(); assert_gt(fraglen, 0); ssdb.getRightSpliceSites(hit.ref(), right + fraglen - minMatchLen, minMatchLen * 2, spliceSites); for(size_t si = 0; si < spliceSites.size(); si++) { const SpliceSite& ss = spliceSites[si]; if(!ss._fromfile && ss._readid + this->_thread_rids_mindist > rd.rdid) continue; if(right > ss.left()) continue; index_t frag2off = ss.right() - ss.left() + right + fraglen - 1; GenomeHit tempHit; tempHit.init(hit.fw(), fragoff + fraglen, rdlen - fragoff - fraglen, 0, // trim5 0, // trim3 hit.ref(), frag2off, this->_sharedVars); if(!hit.compatibleWith(tempHit, this->_no_spliced_alignment)) continue; GenomeHit combinedHit = hit; int64_t minsc = this->_minsc[rdi]; bool combined = combinedHit.combineWith(tempHit, rd, ref, ssdb, swa, swm, sc, minsc, rnd, this->_minK_local, 1, 1, false); if(rdi == 0) minsc = max(minsc, sink.bestUnp1()); else minsc = max(minsc, sink.bestUnp2()); if(combined && combinedHit.score() >= minsc) { assert_leq(combinedHit.trim5(), combinedHit.rdoff()); assert_eq(combinedHit.rdoff() + combinedHit.len(), rdlen); hybridSearch_recur( sc, ebwtFw, ebwtBw, ref, swa, ssdb, rdi, combinedHit, combinedHit.rdoff() - combinedHit.trim5(), combinedHit.len() + combinedHit.trim5(), wlm, prm, swm, him, rnd, sink, dep + 1); } } } // choose a local index based on the genomic location of the partial alignment const HierEbwt* hierEbwtFw = (const HierEbwt*)(&ebwtFw); const LocalEbwt* localEbwtFw = hierEbwtFw->getLocalEbwt(hit.ref(), hit.refoff()); bool success = false, first = true; index_t count = 0; index_t max_count = 2; int64_t prev_score = hit.score(); local_genomeHits.clear(); while(!success && count++ < max_count) { if(him.localindexatts >= this->max_localindexatts) return; if(first) { first = false; } else { localEbwtFw = hierEbwtFw->nextLocalEbwt(localEbwtFw); if(localEbwtFw == NULL || localEbwtFw->empty()) break; } // local index search index_t extlen = 0; local_index_t top = (local_index_t)OFF_MASK, bot = (local_index_t)OFF_MASK; // daehwan - for debugging purposes // index_t extoff = hitoff + hitlen + this->_minK_local; index_t extoff = hitoff + hitlen + this->_minK_local * 3; if(extoff + 1 < rdlen) extoff += 1; if(extoff >= rdlen) { extoff = rdlen - 1; } index_t nelt = (index_t)OFF_MASK; index_t max_nelt = std::max(5, extlen); bool no_extension = false; bool uniqueStop; // daehwan - for debugging purposes // index_t minUniqueLen = this->_minK_local; index_t minUniqueLen = (index_t)OFF_MASK; index_t maxHitLen = max(extoff + 1 - hitoff - hitlen, this->_minK_local); for(; maxHitLen < extoff + 1 && extoff < rdlen;) { extlen = 0; uniqueStop = false; him.localindexatts++; nelt = this->localEbwtSearch( localEbwtFw, // BWT index NULL, // BWT index rd, // read to align sc, // scoring scheme hit.fw(), false, // searchfw, extoff, extlen, top, bot, rnd, uniqueStop, minUniqueLen, maxHitLen); if(extoff < hitoff + hitlen) { no_extension = true; break; } if(nelt <= max_nelt) break; if(extoff + 1 < rdlen) extoff++; else { if(extlen < maxHitLen) break; else maxHitLen++; } } assert_leq(top, bot); assert_eq(nelt, (index_t)(bot - top)); assert_leq(extlen, extoff + 1); assert_leq(extoff, rdlen); if(nelt > 0 && nelt <= max_nelt && extlen >= minAnchorLen && !no_extension) { assert_leq(nelt, max_nelt); coords.clear(); bool straddled = false; // get genomic locations for this local search this->getGenomeCoords_local( *localEbwtFw, ref, rnd, top, bot, hit.fw(), extoff + 1 - extlen, extlen, coords, wlm, prm, him, true, // reject straddled? straddled); assert_leq(coords.size(), nelt); coords.sort(); for(index_t ri = 0; ri < coords.size(); ri++) { const Coord& coord = coords[ri]; GenomeHit tempHit; tempHit.init(coord.orient(), extoff + 1 - extlen, extlen, 0, // trim5 0, // trim3 coord.ref(), coord.off(), this->_sharedVars); // daehwan - for debugging purposes if(coord.ref() == hit.ref() && coord.off() < hit.refoff() && coord.off() + 64000 > hit.refoff()) { index_t leftext = (index_t)0, rightext = (index_t)OFF_MASK; index_t mm = 1; tempHit.extend( rd, ref, ssdb, swa, swm, prm, sc, this->_minsc[rdi], rnd, this->_minK_local, leftext, rightext, mm); #if 0 cout << endl; cout << rd.rdid << "\t" << rd.name << endl; cout << "\ttype: " << 2 << endl; cout << "\t" << hit.ref() << endl; cout << "\t\ttempHit " << (tempHit.fw() ? "+" : "-") << "\t" << tempHit.refoff() << "\t" << tempHit.rdoff() << "\t" << tempHit.len() << "\t" << tempHit.score() << endl; cout << "\t\tanchHit " << (hit.fw() ? "+" : "-") << "\t" << hit.refoff() << "\t" << hit.rdoff() << "\t" << hit.len() << "\t" << hit.score() << endl; spliceSites.clear(); ssdb.getRightSpliceSites(tempHit.ref(), tempHit.refoff() - minMatchLen, minMatchLen * 2, spliceSites); for(size_t si = 0; si < spliceSites.size(); si++) { const SpliceSite& ss = spliceSites[si]; if(ss.right() > hit.getRightOff()) { index_t dist = ss.right() - 1 - hit.getRightOff(); cout << rd.rdid << "s\t\t\t" << ss.left() + 1 << "\t" << ss.right() - 1 << "\t" << (ss.fw() ? "+" : "-") << "\t" << dist << endl; } } spliceSites.clear(); ssdb.getLeftSpliceSites(hit.ref(), hit.getRightOff() + minMatchLen, minMatchLen * 2, spliceSites); for(size_t si = 0; si < spliceSites.size(); si++) { const SpliceSite& ss = spliceSites[si]; if(ss.left() < tempHit.refoff()) { index_t dist = tempHit.refoff() - (ss.left() + 1); cout << rd.rdid << "s\t\t\t" << ss.left() + 1 << "\t" << ss.right() - 1 << "\t" << (ss.fw() ? "+" : "-") << "\t" << dist << endl; } } if(hit.rdoff() + hit.len() - 1 <= tempHit.rdoff()) return; #else assert_lt(hit.ref(), _refnames.size()); { ThreadSafe t(const_cast(_mutex), true); cout << rd.name << "\t" << _refnames[hit.ref()] << "\t" << (hit.fw() ? "+" : "-") << "\t" << hit.refoff() << "\t" << hit.rdoff() << "\t" << hit.len() << "\t" << hit.score() << "\t" << (tempHit.fw() ? "+" : "-") << "\t" << tempHit.refoff() << "\t" << tempHit.rdoff() << "\t" << tempHit.len() << "\t" << tempHit.score() << endl; } _done = true; return; #endif } // check if the partial alignment is compatible with the new alignment using the local index if(!hit.compatibleWith(tempHit, this->_no_spliced_alignment)) { if(count == 1) continue; else break; } index_t leftext = (index_t)0, rightext = (index_t)OFF_MASK; tempHit.extend(rd, ref, ssdb, swa, swm, prm, sc, this->_minsc[rdi], rnd, this->_minK_local, leftext, rightext); GenomeHit combinedHit = hit; int64_t minsc = this->_minsc[rdi]; // combine the partial alignment and the new alignment bool combined = combinedHit.combineWith(tempHit, rd, ref, ssdb, swa, swm, sc, minsc, rnd, this->_minK_local); if(rdi == 0) minsc = max(minsc, sink.bestUnp1()); else minsc = max(minsc, sink.bestUnp2()); if(combined && combinedHit.score() >= minsc) { assert_leq(combinedHit.trim5(), combinedHit.rdoff()); if(combinedHit.score() >= prev_score - sc.mmpMax) { // extend the new partial alignment recursively hybridSearch_recur( sc, ebwtFw, ebwtBw, ref, swa, ssdb, rdi, combinedHit, combinedHit.rdoff() - combinedHit.trim5(), combinedHit.len() + combinedHit.trim5(), wlm, prm, swm, him, rnd, sink, dep + 1); } else { local_genomeHits.push_back(combinedHit); } } } } int64_t minsc = (rdi == 0 ? sink.bestUnp1() : sink.bestUnp2()); if(minsc >= prev_score - sc.mmpMax) success = true; if(!success && (count == max_count || hierEbwtFw->nextLocalEbwt(localEbwtFw) == NULL) ) { for(index_t ti = 0; ti < local_genomeHits.size(); ti++) { GenomeHit& tempHit = local_genomeHits[ti]; int64_t minsc = this->_minsc[rdi]; if(rdi == 0) minsc = max(minsc, sink.bestUnp1()); else minsc = max(minsc, sink.bestUnp2()); if(tempHit.score() >= minsc) { hybridSearch_recur( sc, ebwtFw, ebwtBw, ref, swa, ssdb, rdi, tempHit, tempHit.rdoff() - tempHit.trim5(), tempHit.len() + tempHit.trim5(), wlm, prm, swm, him, rnd, sink, dep + 1); } } } } // while(!success && count++ < 2) if(!success) { // perform global search for long introns if(hitoff + hitlen + this->_minK + 1 < rdlen) { index_t extlen = 0; index_t top = (index_t)OFF_MASK, bot = (index_t)OFF_MASK; index_t extoff = hitoff + hitlen + this->_minK + 1; bool uniqueStop = true; index_t nelt = this->globalEbwtSearch( ebwtFw, // BWT index rd, // read to align sc, // scoring scheme hit.fw(), extoff, extlen, top, bot, rnd, uniqueStop); if(nelt <= 5 && extlen >= this->_minK) { coords.clear(); bool straddled = false; this->getGenomeCoords( ebwtFw, ref, rnd, top, bot, hit.fw(), bot - top, extoff + 1 - extlen, extlen, coords, wlm, prm, him, true, // reject straddled straddled); assert_leq(coords.size(), nelt); coords.sort(); for(index_t ri = 0; ri < coords.size(); ri++) { const Coord& coord = coords[ri]; GenomeHit tempHit; tempHit.init(coord.orient(), extoff + 1 - extlen, extlen, 0, // trim5 0, // trim3 coord.ref(), coord.off(), this->_sharedVars); if(!hit.compatibleWith(tempHit, this->_no_spliced_alignment)) continue; index_t leftext = (index_t)0, rightext = (index_t)OFF_MASK; tempHit.extend(rd, ref, ssdb, swa, swm, prm, sc, this->_minsc[rdi], rnd, this->_minK_local, leftext, rightext); GenomeHit combinedHit = hit; int64_t minsc = this->_minsc[rdi]; bool combined = combinedHit.combineWith(tempHit, rd, ref, ssdb, swa, swm, sc, minsc, rnd, this->_minK_local); if(rdi == 0) minsc = max(minsc, sink.bestUnp1()); else minsc = max(minsc, sink.bestUnp2()); if(combined && combinedHit.score() >= minsc) { assert_leq(combinedHit.trim5(), combinedHit.rdoff()); hybridSearch_recur( sc, ebwtFw, ebwtBw, ref, swa, ssdb, rdi, combinedHit, combinedHit.rdoff() - combinedHit.trim5(), combinedHit.len() + combinedHit.trim5(), wlm, prm, swm, him, rnd, sink, dep + 1); } } } } GenomeHit tempHit = hit; assert(tempHit.trim5() == 0 || hitoff == 0); if(rdlen - hitoff - tempHit.len() - tempHit.trim5() <= 5) { index_t trim3 = rdlen - hitoff - tempHit.len() - tempHit.trim5(); tempHit.trim3(trim3); assert_leq(tempHit.trim5(), tempHit.rdoff()); assert_leq(tempHit.len() + tempHit.trim5() + tempHit.trim3(), rdlen); hybridSearch_recur( sc, ebwtFw, ebwtBw, ref, swa, ssdb, rdi, tempHit, tempHit.rdoff() - tempHit.trim5(), tempHit.len() + tempHit.trim5() + tempHit.trim3(), wlm, prm, swm, him, rnd, sink, dep + 1); return; } // extend the partial alignment directly comparing with the corresponding genomic sequence // with mismatches or a gap allowed int64_t minsc = this->_minsc[rdi]; assert_geq(tempHit.score(), minsc); index_t leftext = (index_t)0, rightext = (index_t)OFF_MASK; index_t mm = (tempHit.score() - minsc) / sc.mmpMax; index_t num_mismatch_allowed = 1; if(rdlen - hitoff - hitlen <= this->_minK_local) { num_mismatch_allowed = min(rdlen - tempHit.rdoff() - tempHit.len(), mm); } him.localextatts++; tempHit.extend(rd, ref, ssdb, swa, swm, prm, sc, this->_minsc[rdi], rnd, this->_minK_local, leftext, rightext, num_mismatch_allowed); if(rdi == 0) minsc = max(minsc, sink.bestUnp1()); else minsc = max(minsc, sink.bestUnp2()); if(tempHit.score() >= minsc && rightext >= min(this->_minK_local, rdlen - hit.len() - hit.rdoff())) { assert_eq(tempHit.trim3(), 0); assert_leq(tempHit.trim5(), tempHit.rdoff()); hybridSearch_recur( sc, ebwtFw, ebwtBw, ref, swa, ssdb, rdi, tempHit, tempHit.rdoff() - tempHit.trim5(), tempHit.len() + tempHit.trim5(), wlm, prm, swm, him, rnd, sink, dep + 1); } else if(hitoff + hitlen + this->_minK_local < rdlen) { // skip some bases of a read index_t jumplen = hitoff + hitlen + this->_minK < rdlen ? this->_minK : this->_minK_local; assert_lt(hitoff + hitlen + jumplen, rdlen); assert_leq(hit.len(), hitlen); int64_t expected_score = hit.score() - (hitlen - hit.len()) / jumplen * sc.mmpMax - sc.mmpMax; if(expected_score >= minsc) { assert_eq(hit.trim3(), 0); hybridSearch_recur( sc, ebwtFw, ebwtBw, ref, swa, ssdb, rdi, hit, hitoff, hitlen + jumplen, wlm, prm, swm, him, rnd, sink, dep + 1); } } } } } #endif /*BP_ALIGNER_H_*/