/* * Copyright 2015, Daehwan Kim * * This file is part of HISAT 2. * * HISAT 2 is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * HISAT 2 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with HISAT 2. If not, see . */ #ifndef SPLICED_ALIGNER_H_ #define SPLICED_ALIGNER_H_ #include "hi_aligner.h" /** * With a hierarchical indexing, SplicedAligner provides several alignment strategies * , which enable effective alignment of RNA-seq reads */ template class SplicedAligner : public HI_Aligner { public: /** * Initialize with index. */ SplicedAligner( const GFM& gfm, bool anchorStop, uint64_t threads_rids_mindist = 0) : HI_Aligner(gfm, anchorStop, threads_rids_mindist) { } ~SplicedAligner() { } /** * Given a partial alignment of a read, try to further extend * the alignment bidirectionally using a combination of * local search, extension, and global search */ virtual void hybridSearch( const Scoring& sc, const PairedEndPolicy& pepol, // paired-end policy const TranscriptomePolicy& tpol, const GraphPolicy& gpol, const GFM& gfm, const ALTDB& altdb, const RepeatDB& repeatdb, const BitPairReference& ref, SwAligner& swa, SpliceSiteDB& ssdb, index_t rdi, bool fw, WalkMetrics& wlm, PerReadMetrics& prm, SwMetrics& swm, HIMetrics& him, RandomSource& rnd, AlnSinkWrap& sink); /** * Given a partial alignment of a read, try to further extend * the alignment bidirectionally using a combination of * local search, extension, and global search */ virtual int64_t hybridSearch_recur( const Scoring& sc, const PairedEndPolicy& pepol, // paired-end policy const TranscriptomePolicy& tpol, const GraphPolicy& gpol, const GFM& gfm, const ALTDB& altdb, const RepeatDB& repeatdb, const BitPairReference& ref, SwAligner& swa, SpliceSiteDB& ssdb, index_t rdi, const GenomeHit& hit, index_t hitoff, index_t hitlen, WalkMetrics& wlm, PerReadMetrics& prm, SwMetrics& swm, HIMetrics& him, RandomSource& rnd, AlnSinkWrap& sink, bool alignMate = false, index_t dep = 0); }; /** * Given a partial alignment of a read, try to further extend * the alignment bidirectionally using a combination of * local search, extension, and global search */ template void SplicedAligner::hybridSearch( const Scoring& sc, const PairedEndPolicy& pepol, // paired-end policy const TranscriptomePolicy& tpol, const GraphPolicy& gpol, const GFM& gfm, const ALTDB& altdb, const RepeatDB& repeatdb, const BitPairReference& ref, SwAligner& swa, SpliceSiteDB& ssdb, index_t rdi, bool fw, WalkMetrics& wlm, PerReadMetrics& prm, SwMetrics& swm, HIMetrics& him, RandomSource& rnd, AlnSinkWrap& sink) { assert_lt(rdi, 2); assert(this->_rds[rdi] != NULL); him.localatts++; const ReportingParams& rp = sink.reportingParams(); // before further alignment using local search, extend the partial alignments directly // by comparing with the corresponding genomic sequences // this extension is performed without any mismatches allowed for(index_t hi = 0; hi < this->_genomeHits.size(); hi++) { GenomeHit& genomeHit = this->_genomeHits[hi]; index_t leftext = (index_t)INDEX_MAX, rightext = (index_t)INDEX_MAX; genomeHit.extend( *(this->_rds[rdi]), gfm, ref, altdb, repeatdb, ssdb, swa, swm, prm, sc, this->_minsc[rdi], rnd, INDEX_MAX, tpol, gpol, leftext, rightext); } // for the candidate alignments, examine the longest (best) one first this->_genomeHits_done.resize(this->_genomeHits.size()); this->_genomeHits_done.fill(false); for(size_t hi = 0; hi < this->_genomeHits.size(); hi++) { index_t hj = 0; for(; hj < this->_genomeHits.size(); hj++) { if(!this->_genomeHits_done[hj]) break; } if(hj >= this->_genomeHits.size()) break; for(index_t hk = hj + 1; hk < this->_genomeHits.size(); hk++) { if(this->_genomeHits_done[hk]) continue; GenomeHit& genomeHit_j = this->_genomeHits[hj]; GenomeHit& genomeHit_k = this->_genomeHits[hk]; if(genomeHit_k.hitcount() > genomeHit_j.hitcount() || (genomeHit_k.hitcount() == genomeHit_j.hitcount() && genomeHit_k.len() > genomeHit_j.len())) { hj = hk; } } // given a candidate partial alignment, extend it bidirectionally him.anchoratts++; GenomeHit& genomeHit = this->_genomeHits[hj]; int64_t maxsc = std::numeric_limits::min(); maxsc = hybridSearch_recur(sc, pepol, tpol, gpol, gfm, altdb, repeatdb, ref, swa, ssdb, rdi, genomeHit, genomeHit.rdoff(), genomeHit.len(), wlm, prm, swm, him, rnd, sink); if(rp.bowtie2_dp == 2 || (rp.bowtie2_dp == 1 && maxsc < this->_minsc[rdi])) { const Read& rd = *this->_rds[rdi]; // Initialize the aligner with a new read swa.initRead(rd.patFw, // fw version of query rd.patRc, // rc version of query rd.qual, // fw version of qualities rd.qualRev, // rc version of qualities 0, // off of first char in 'rd' to consider rd.length(), // off of last char (excl) in 'rd' to consider sc); // scoring scheme bool found = genomeHit.len() >= rd.length(); if(!found) { DynProgFramer dpframe(false); // trimToRef size_t tlen = ref.approxLen(genomeHit.ref()); size_t readGaps = 10, refGaps = 10, nceil = 0, maxhalf = 10; index_t refoff = genomeHit.refoff() > genomeHit.rdoff() ? genomeHit.refoff() - genomeHit.rdoff() : 0; DPRect rect; dpframe.frameSeedExtensionRect(refoff, // ref offset implied by seed hit assuming no gaps rd.length(), // length of read sequence used in DP table tlen, // length of reference readGaps, // max # of read gaps permitted in opp mate alignment refGaps, // max # of ref gaps permitted in opp mate alignment (size_t)nceil, // # Ns permitted maxhalf, // max width in either direction rect); // DP rectangle assert(rect.repOk()); size_t cminlen = 2000, cpow2 = 4, nwindow = 10, nsInLeftShift = 0; swa.initRef(fw, // whether to align forward or revcomp read genomeHit.ref(), // reference aligned against rect, // DP rectangle ref, // Reference strings tlen, // length of reference sequence sc, // scoring scheme this->_minsc[rdi], // minimum score permitted true, // use 8-bit SSE if possible? cminlen, // minimum length for using checkpointing scheme cpow2, // interval b/t checkpointed diags; 1 << this false, // triangular mini-fills? true, // this is a seed extension - not finding a mate nwindow, nsInLeftShift); // Now fill the dynamic programming matrix and return true iff // there is at least one valid alignment TAlScore bestCell = std::numeric_limits::min(); found = swa.align(rnd, bestCell); if(found) { SwResult res; res.reset(); res.alres.init_raw_edits(&(this->_rawEdits)); found = swa.nextAlignment(res, this->_minsc[rdi], rnd); if(found) { if(!fw) res.alres.invertEdits(); const Coord& coord = res.alres.refcoord(); assert_geq(genomeHit._joinedOff + coord.off(), genomeHit.refoff()); index_t joinedOff = genomeHit._joinedOff + coord.off() - genomeHit.refoff(); genomeHit.init(fw, 0, // rdoff rd.length(), 0, // trim5 0, // trim3 coord.ref(), coord.off(), joinedOff, this->_sharedVars, genomeHit.repeat(), // repeat? &res.alres.ned(), NULL, res.alres.score().score()); genomeHit.replace_edits_with_alts(rd, altdb.alts(), ssdb, sc, this->_minK_local, (index_t)tpol.minIntronLen(), (index_t)tpol.maxIntronLen(), (index_t)tpol.minAnchorLen(), (index_t)tpol.minAnchorLen_noncan(), ref); } } } if(found) { hybridSearch_recur(sc, pepol, tpol, gpol, gfm, altdb, repeatdb, ref, swa, ssdb, rdi, genomeHit, genomeHit.rdoff(), genomeHit.len(), wlm, prm, swm, him, rnd, sink); } } this->_genomeHits_done[hj] = true; } } /** * Given a partial alignment of a read, try to further extend * the alignment bidirectionally using a combination of * local search, extension, and global search */ template int64_t SplicedAligner::hybridSearch_recur( const Scoring& sc, const PairedEndPolicy& pepol, // paired-end policy const TranscriptomePolicy& tpol, const GraphPolicy& gpol, const GFM& gfm, const ALTDB& altdb, const RepeatDB& repeatdb, const BitPairReference& ref, SwAligner& swa, SpliceSiteDB& ssdb, index_t rdi, const GenomeHit& hit, index_t hitoff, index_t hitlen, WalkMetrics& wlm, PerReadMetrics& prm, SwMetrics& swm, HIMetrics& him, RandomSource& rnd, AlnSinkWrap& sink, bool alignMate, index_t dep) { const ReportingParams& rp = sink.reportingParams(); int64_t maxsc = numeric_limits::min(); him.localsearchrecur++; assert_lt(rdi, 2); assert(this->_rds[rdi] != NULL); const Read& rd = *(this->_rds[rdi]); index_t rdlen = (index_t)rd.length(); TAlScore cushion = 0; if(tpol.no_spliced_alignment()) { cushion = alignMate ? rdlen * 0.03 * sc.mm(255) : 0; } if(hit.score() + cushion < this->_minsc[rdi]) return maxsc; if(dep >= 128) return maxsc; // if it's already examined, just return if(hitoff == hit.rdoff() - hit.trim5() && hitlen == hit.len() + hit.trim5() + hit.trim3()) { if(this->isSearched(hit, rdi)) return maxsc; this->addSearched(hit, rdi); } // for effective use of memory allocation and deallocation if(this->_coords.size() <= dep) { this->_coords.expand(); assert_leq(this->_local_genomeHits.size(), dep); this->_local_genomeHits.expand(); assert_leq(this->_spliceSites.size(), dep); this->_spliceSites.expand(); } EList& coords = this->_coords[dep]; EList& spliceSites = this->_spliceSites[dep]; // daehwan - for debugging purposes #if 0 cout << rd.name << "\t" << (hit.fw() ? "+" : "-") << "\t" << hitoff << "\t" << hitoff + hitlen << "\t" << "( " << hit.rdoff() << "\t" << hit.rdoff() + hit.len() << " )" << "\t" << hit.refoff() << "\t" << hit.getRightOff() << "\t" << hit.score() << "\t" << "dep: " << dep << "\t"; Edit::print(cout, hit.edits()); cout << endl; #endif assert_leq(hitoff + hitlen, rdlen); // if this is a full alignment, report it if(hitoff == 0 && hitlen == rdlen) { if(!this->redundant(sink, rdi, hit)) { bool another_spliced = false; if(!ssdb.empty()) { int64_t best_score = hit.score(); this->_local_genomeHits[dep].clear(); this->_anchors_added.clear(); this->_local_genomeHits[dep].expand(); this->_local_genomeHits[dep].back() = hit; this->_anchors_added.push_back(0); index_t fragoff = 0, fraglen = 0, left = 0, right = 0; hit.getLeft(fragoff, fraglen, left); const index_t minMatchLen = (index_t)this->_minK; index_t min_left_anchor = rdlen, min_right_anchor = rdlen; // make use of a list of known or novel splice sites to further align the read if(fraglen >= minMatchLen && left >= minMatchLen && hit.trim5() == 0 && !hit.repeat() && !tpol.no_spliced_alignment()) { spliceSites.clear(); ssdb.getLeftSpliceSites(hit.ref(), left + minMatchLen, minMatchLen, spliceSites); for(size_t si = 0; si < spliceSites.size(); si++) { const SpliceSite& ss = spliceSites[si]; if(!ss._fromfile && ss._readid + this->_thread_rids_mindist > rd.rdid) continue; if(left + fraglen - 1 < ss.right()) continue; index_t frag2off = ss.left() - (ss.right() - left); if(frag2off + 1 < hitoff) continue; GenomeHit tempHit; if(fragoff + ss.right() < left + 1) continue; index_t readoff = fragoff + ss.right() - left - 1; index_t joinedOff = 0; bool success = gfm.textOffToJoined(hit.ref(), ss.left(), joinedOff); if(!success) { continue; } #ifndef NDEBUG index_t debug_tid = 0, debug_toff = 0, debug_tlen = 0; bool debug_straddled = false; gfm.joinedToTextOff(1, // qlen joinedOff, debug_tid, debug_toff, debug_tlen, false, debug_straddled); assert_eq(hit.ref(), debug_tid); assert_eq(ss.left(), debug_toff); #endif tempHit.init(hit.fw(), readoff + 1, // rdoff 0, // len 0, // trim5 0, // trim3 hit.ref(), ss.left() + 1, joinedOff + 1, this->_sharedVars, gfm.repeat()); index_t leftext = readoff + 1, rightext = 0; tempHit.extend(rd, gfm, ref, altdb, repeatdb, ssdb, swa, swm, prm, sc, this->_minsc[rdi], rnd, (index_t)this->_minK_local, tpol, gpol, leftext, rightext); if(tempHit.len() <= 0) continue; if(!tempHit.compatibleWith( hit, (index_t)tpol.minIntronLen(), (index_t)tpol.maxIntronLen(), tpol.no_spliced_alignment())) continue; int64_t minsc = max(this->_minsc[rdi], best_score); bool combined = tempHit.combineWith( hit, rd, gfm, ref, altdb, repeatdb, ssdb, swa, swm, sc, minsc, rnd, (index_t)this->_minK_local, (index_t)tpol.minIntronLen(), (index_t)tpol.maxIntronLen(), 1, 1, gpol.maxAltsTried(), &ss, tpol.no_spliced_alignment()); if(rdi == 0) minsc = max(minsc, sink.bestUnp1()); else minsc = max(minsc, sink.bestUnp2()); index_t leftAnchorLen = 0, nedits = 0; tempHit.getLeftAnchor(leftAnchorLen, nedits); if(combined && tempHit.score() >= minsc && nedits <= leftAnchorLen / 4) { // prevent (short) anchors from having many mismatches if(this->isSearched(tempHit, rdi)) continue; if(!this->redundant(sink, rdi, tempHit)) { another_spliced = true; if(tempHit.score() > best_score) best_score = tempHit.score(); this->_local_genomeHits[dep].expand(); this->_local_genomeHits[dep].back() = tempHit; this->_anchors_added.push_back(1); index_t temp_fragoff = 0, temp_fraglen = 0, temp_left = 0; tempHit.getLeft(temp_fragoff, temp_fraglen, temp_left); if(temp_fraglen < min_left_anchor) min_left_anchor = temp_fraglen; } } } } size_t num_local_genomeHits = this->_local_genomeHits[dep].size(); for(size_t i = 0; i < num_local_genomeHits; i++) { this->_local_genomeHits[dep][i].getRight(fragoff, fraglen, right); if(this->_local_genomeHits[dep][i].score() < best_score) continue; // make use of a list of known or novel splice sites to further align the read if(fraglen >= minMatchLen && this->_local_genomeHits[dep][i].trim3() == 0 && !hit.repeat() && !tpol.no_spliced_alignment()) { spliceSites.clear(); assert_gt(fraglen, 0); ssdb.getRightSpliceSites(this->_local_genomeHits[dep][i].ref(), right + fraglen - minMatchLen, minMatchLen, spliceSites); for(size_t si = 0; si < spliceSites.size(); si++) { const GenomeHit& canHit = this->_local_genomeHits[dep][i]; const SpliceSite& ss = spliceSites[si]; if(!ss._fromfile && ss._readid + this->_thread_rids_mindist > rd.rdid) continue; if(right > ss.left()) continue; GenomeHit tempHit; index_t readoff = fragoff + ss.left() - right + 1; if(readoff >= rdlen) continue; index_t joinedOff = 0; bool success = gfm.textOffToJoined(canHit.ref(), ss.right(), joinedOff); if(!success) { continue; } #ifndef NDEBUG index_t debug_tid = 0, debug_toff = 0, debug_tlen = 0; bool debug_straddled = false; gfm.joinedToTextOff(1, // qlen joinedOff, debug_tid, debug_toff, debug_tlen, false, debug_straddled); assert_eq(canHit.ref(), debug_tid); assert_eq(ss.right(), debug_toff); #endif tempHit.init(canHit.fw(), readoff, 0, // len 0, // trim5 0, // trim3 canHit.ref(), ss.right(), joinedOff, this->_sharedVars, gfm.repeat()); index_t leftext = 0, rightext = rdlen - readoff; tempHit.extend(rd, gfm, ref, altdb, repeatdb, ssdb, swa, swm, prm, sc, this->_minsc[rdi], rnd, (index_t)this->_minK_local, tpol, gpol, leftext, rightext); if(tempHit.len() <= 0) continue; if(!canHit.compatibleWith(tempHit, (index_t)tpol.minIntronLen(), (index_t)tpol.maxIntronLen(), tpol.no_spliced_alignment())) continue; GenomeHit combinedHit = canHit; int64_t minsc = max(this->_minsc[rdi], best_score); bool combined = combinedHit.combineWith( tempHit, rd, gfm, ref, altdb, repeatdb, ssdb, swa, swm, sc, minsc, rnd, (index_t)this->_minK_local, (index_t)tpol.minIntronLen(), (index_t)tpol.maxIntronLen(), 1, 1, gpol.maxAltsTried(), &ss, tpol.no_spliced_alignment()); if(rdi == 0) minsc = max(minsc, sink.bestUnp1()); else minsc = max(minsc, sink.bestUnp2()); index_t rightAnchorLen = 0, nedits = 0; combinedHit.getRightAnchor(rightAnchorLen, nedits); if(combined && combinedHit.score() >= minsc && nedits <= rightAnchorLen / 4) { // prevent (short) anchors from having many mismatches if(this->isSearched(combinedHit, rdi)) continue; if(!this->redundant(sink, rdi, combinedHit)) { another_spliced = true; if(combinedHit.score() > best_score) best_score = tempHit.score(); this->_local_genomeHits[dep].expand(); this->_local_genomeHits[dep].back() = combinedHit; this->_anchors_added.push_back(this->_anchors_added[i] + 1); index_t temp_fragoff = 0, temp_fraglen = 0, temp_right = 0; combinedHit.getLeft(temp_fragoff, temp_fraglen, temp_right); if(temp_fraglen < min_right_anchor) min_right_anchor = temp_fraglen; } } } } } assert_eq(this->_local_genomeHits[dep].size(), this->_anchors_added.size()); for(size_t i = 0; i < this->_local_genomeHits[dep].size(); i++) { const GenomeHit& canHit = this->_local_genomeHits[dep][i]; if(!rp.secondary && canHit.score() < best_score) continue; // if(min(min_left_anchor, min_right_anchor) <= this->_minK_local) { // daehwan - for debugging purposes // if(this->_anchors_added[i] < this->_anchors_added.back()) continue; //} if(i > 0 && !this->isSearched(canHit, rdi)) { this->addSearched(canHit, rdi); } if(!this->redundant(sink, rdi, canHit)) { this->reportHit(sc, pepol, tpol, gpol, gfm, altdb, repeatdb, ref, ssdb, sink, rdi, canHit, alignMate); maxsc = max(maxsc, canHit.score()); } } } else { this->reportHit(sc, pepol, tpol, gpol, gfm, altdb, repeatdb, ref, ssdb, sink, rdi, hit, alignMate); maxsc = max(maxsc, hit.score()); } return maxsc; } } else if(hitoff > 0 && (hitoff + hitlen == rdlen || hitoff + hitoff < rdlen - hitlen)) { // Decide which side to extend first (left or right) if(!ssdb.empty()) { // extend the partial alignment in the left direction index_t fragoff = 0, fraglen = 0, left = 0; hit.getLeft(fragoff, fraglen, left); const index_t minMatchLen = (index_t)this->_minK_local; // make use of a list of known or novel splice sites to further align the read if(fraglen >= minMatchLen && left >= minMatchLen && !hit.repeat() && !tpol.no_spliced_alignment()) { spliceSites.clear(); ssdb.getLeftSpliceSites(hit.ref(), left + minMatchLen, minMatchLen + min(minMatchLen, fragoff), spliceSites); for(size_t si = 0; si < spliceSites.size(); si++) { const SpliceSite& ss = spliceSites[si]; if(!ss._fromfile && ss._readid + this->_thread_rids_mindist > rd.rdid) continue; if(left + fraglen - 1 < ss.right()) continue; if(fragoff + ss.right() < left + 1) continue; index_t readoff = fragoff + ss.right() - left - 1; index_t joinedOff = 0; bool success = gfm.textOffToJoined(hit.ref(), ss.left(), joinedOff); if(!success) { continue; } #ifndef NDEBUG index_t debug_tid = 0, debug_toff = 0, debug_tlen = 0; bool debug_straddled = false; gfm.joinedToTextOff(1, // qlen joinedOff, debug_tid, debug_toff, debug_tlen, false, debug_straddled); assert_eq(hit.ref(), debug_tid); assert_eq(ss.left(), debug_toff); #endif GenomeHit tempHit; tempHit.init(hit.fw(), readoff + 1, // rdoff 0, // len 0, // trim5 0, // trim3 hit.ref(), ss.left() + 1, joinedOff + 1, this->_sharedVars, gfm.repeat()); index_t leftext = readoff + 1, rightext = 0; tempHit.extend(rd, gfm, ref, altdb, repeatdb, ssdb, swa, swm, prm, sc, this->_minsc[rdi], rnd, (index_t)this->_minK_local, tpol, gpol, leftext, rightext); if(tempHit.len() <= 0) continue; if(!tempHit.compatibleWith(hit, (index_t)tpol.minIntronLen(), (index_t)tpol.maxIntronLen(), tpol.no_spliced_alignment())) continue; int64_t minsc = this->_minsc[rdi]; bool combined = tempHit.combineWith( hit, rd, gfm, ref, altdb, repeatdb, ssdb, swa, swm, sc, minsc, rnd, (index_t)this->_minK_local, (index_t)tpol.minIntronLen(), (index_t)tpol.maxIntronLen(), 1, 1, gpol.maxAltsTried(), &ss, tpol.no_spliced_alignment()); if(!rp.secondary) { if(rdi == 0) minsc = max(minsc, sink.bestUnp1() - cushion); else minsc = max(minsc, sink.bestUnp2() - cushion); } if(combined && tempHit.score() >= minsc && // soft-clipping might be better tempHit.score() + sc.sc(0) * hit.rdoff() >= hit.score()) { assert_eq(tempHit.trim5(), 0); assert_leq(tempHit.rdoff() + tempHit.len() + tempHit.trim3(), rdlen); int64_t tmp_maxsc = hybridSearch_recur( sc, pepol, tpol, gpol, gfm, altdb, repeatdb, ref, swa, ssdb, rdi, tempHit, tempHit.rdoff(), tempHit.len() + tempHit.trim3(), wlm, prm, swm, him, rnd, sink, alignMate, dep + 1); maxsc = max(maxsc, tmp_maxsc); } } } } bool use_localindex = true; if(hitoff == hit.rdoff() && hitoff <= this->_minK) { index_t leftext = (index_t)INDEX_MAX, rightext = (index_t)0; GenomeHit tempHit = hit; tempHit.extend( rd, gfm, ref, altdb, repeatdb, ssdb, swa, swm, prm, sc, this->_minsc[rdi], rnd, (index_t)this->_minK_local, tpol, gpol, leftext, rightext, 1); if(tempHit.rdoff() == 0) { use_localindex = false; } } // Choose a local index based on the genomic location of the partial alignment const HGFM* hGFM = (const HGFM*)(&gfm); const LocalGFM* lGFM = hGFM->getLocalGFM(hit.ref(), hit.refoff()); assert_leq(lGFM->_localOffset, hit.refoff()); bool success = false, first = true; index_t count = 0; // Use at most two local indexes const index_t max_count = 2; int64_t prev_score = hit.score(); this->_local_genomeHits[dep].clear(); while(!success && count++ < max_count && use_localindex) { if(him.localindexatts >= this->max_localindexatts) break; if(first) { first = false; } else { lGFM = hGFM->prevLocalGFM(lGFM); if(lGFM == NULL || lGFM->empty()) break; } // local index search index_t extlen = 0; local_index_t top = (local_index_t)INDEX_MAX, bot = (local_index_t)INDEX_MAX; local_index_t node_top = (local_index_t)INDEX_MAX, node_bot = (local_index_t)INDEX_MAX; index_t extoff = hitoff - 1; if(extoff > 0) extoff -= 1; if(extoff < tpol.minAnchorLen()) { extoff = tpol.minAnchorLen(); } index_t nelt = (index_t)INDEX_MAX; index_t max_nelt = std::max(5, extlen); bool no_extension = false; bool uniqueStop= false; index_t minUniqueLen = (index_t)this->_minK_local; for(; extoff < rdlen; extoff++) { extlen = 0; uniqueStop = true; him.localindexatts++; this->_local_node_iedge_count.clear(); nelt = this->localGFMSearch( *lGFM, // BWT index rd, // read to align sc, // scoring scheme sink.reportingParams(), hit.fw(), extoff, extlen, top, bot, node_top, node_bot, this->_local_node_iedge_count, rnd, uniqueStop, minUniqueLen); if(extoff + 1 - extlen >= hitoff) { no_extension = true; break; } if(nelt <= max_nelt) break; } assert_leq(node_top, node_bot); assert_eq(nelt, (index_t)(node_bot - node_top)); assert_leq(extlen, extoff + 1); if(nelt > 0 && nelt <= max_nelt && extlen >= tpol.minAnchorLen() && !no_extension) { assert_leq(nelt, max_nelt); coords.clear(); bool straddled = false; // get genomic locations for this local search this->getGenomeCoords_local( *lGFM, altdb, ref, rnd, top, bot, node_top, node_bot, this->_local_node_iedge_count, hit.fw(), extoff + 1 - extlen, extlen, coords, wlm, prm, him, true, // reject straddled? straddled); assert_leq(coords.size(), nelt); coords.sort(); for(int ri = (int)coords.size() - 1; ri >= 0; ri--) { const Coord& coord = coords[ri]; GenomeHit tempHit; tempHit.init(coord.orient(), extoff + 1 - extlen, extlen, 0, // trim5 0, // trim3 (index_t)coord.ref(), (index_t)coord.off(), (index_t)coord.joinedOff(), this->_sharedVars, gfm.repeat()); if(!tempHit.adjustWithALT(*this->_rds[rdi], gfm, altdb, ref, gpol)) continue; // check if the partial alignment is compatible with the new alignment using the local index if(!tempHit.compatibleWith(hit, (index_t)tpol.minIntronLen(), (index_t)tpol.maxIntronLen(), tpol.no_spliced_alignment())) { if(count == 1) continue; else break; } if(uniqueStop) { assert_eq(coords.size(), 1); index_t leftext = (index_t)INDEX_MAX, rightext = (index_t)0; tempHit.extend( rd, gfm, ref, altdb, repeatdb, ssdb, swa, swm, prm, sc, this->_minsc[rdi], rnd, (index_t)this->_minK_local, tpol, gpol, leftext, rightext); } // combine the partial alignment and the new alignment int64_t minsc = this->_minsc[rdi]; bool combined = tempHit.combineWith( hit, rd, gfm, ref, altdb, repeatdb, ssdb, swa, swm, sc, minsc, rnd, (index_t)this->_minK_local, (index_t)tpol.minIntronLen(), (index_t)tpol.maxIntronLen(), tpol.minAnchorLen(), tpol.minAnchorLen_noncan(), gpol.maxAltsTried(), NULL, // splice sites tpol.no_spliced_alignment()); if(!rp.secondary) { if(rdi == 0) minsc = max(minsc, sink.bestUnp1() - cushion); else minsc = max(minsc, sink.bestUnp2() - cushion); } if(combined && tempHit.score() >= minsc) { assert_eq(tempHit.trim5(), 0); assert_leq(tempHit.rdoff() + tempHit.len() + tempHit.trim3(), rdlen); if(tempHit.score() >= prev_score - sc.mmpMax) { // extend the new partial alignment recursively int64_t tmp_maxsc = hybridSearch_recur( sc, pepol, tpol, gpol, gfm, altdb, repeatdb, ref, swa, ssdb, rdi, tempHit, tempHit.rdoff(), tempHit.len() + tempHit.trim3(), wlm, prm, swm, him, rnd, sink, alignMate, dep + 1); maxsc = max(maxsc, tmp_maxsc); } else { this->_local_genomeHits[dep].push_back(tempHit); } } } } if(maxsc >= prev_score - sc.mmpMax) success = true; if(!success && (him.localindexatts >= this->max_localindexatts || count == max_count || hGFM->prevLocalGFM(lGFM) == NULL)) { for(index_t ti = 0; ti < this->_local_genomeHits[dep].size(); ti++) { GenomeHit& tempHit = this->_local_genomeHits[dep][ti]; int64_t minsc = this->_minsc[rdi]; if(!rp.secondary) { if(rdi == 0) minsc = max(minsc, sink.bestUnp1() - cushion); else minsc = max(minsc, sink.bestUnp2() - cushion); } if(tempHit.score() >= minsc) { int64_t tmp_maxsc = hybridSearch_recur( sc, pepol, tpol, gpol, gfm, altdb, repeatdb, ref, swa, ssdb, rdi, tempHit, tempHit.rdoff(), tempHit.len() + tempHit.trim3(), wlm, prm, swm, him, rnd, sink, alignMate, dep + 1); maxsc = max(maxsc, tmp_maxsc); } } } } // while(!success && count++ < 2) if(!success) { if(hitoff > this->_minK && him.localindexatts < this->max_localindexatts) { index_t extlen = 0; index_t top = (index_t)INDEX_MAX, bot = (index_t)INDEX_MAX; index_t node_top = (index_t)INDEX_MAX, node_bot = (index_t)INDEX_MAX; this->_node_iedge_count.clear(); index_t extoff = hitoff - 1; bool uniqueStop = true; // perform global search for long introns index_t nelt = this->globalGFMSearch( gfm, // GFM index rd, // read to align sc, // scoring scheme sink.reportingParams(), hit.fw(), extoff, extlen, top, bot, node_top, node_bot, this->_node_iedge_count, rnd, uniqueStop); if(nelt > 0 && nelt <= 5 && extlen >= this->_minK) { coords.clear(); bool straddled = false; this->getGenomeCoords( gfm, altdb, ref, rnd, top, bot, node_top, node_bot, this->_node_iedge_count, hit.fw(), bot - top, extoff + 1 - extlen, extlen, coords, wlm, prm, him, true, // reject straddled? straddled); assert_leq(coords.size(), nelt); if(coords.size() > 1) coords.sort(); for(int ri = (int)coords.size() - 1; ri >= 0; ri--) { const Coord& coord = coords[ri]; GenomeHit tempHit; tempHit.init(coord.orient(), extoff + 1 - extlen, extlen, 0, // trim5 0, // trim3 (index_t)coord.ref(), (index_t)coord.off(), (index_t)coord.joinedOff(), this->_sharedVars, gfm.repeat()); if(!tempHit.adjustWithALT(*this->_rds[rdi], gfm, altdb, ref, gpol)) continue; if(!tempHit.compatibleWith(hit, (index_t)tpol.minIntronLen(), (index_t)tpol.maxIntronLen(), tpol.no_spliced_alignment())) continue; if(uniqueStop) { assert_eq(coords.size(), 1); index_t leftext = (index_t)INDEX_MAX, rightext = (index_t)0; tempHit.extend( rd, gfm, ref, altdb, repeatdb, ssdb, swa, swm, prm, sc, this->_minsc[rdi], rnd, (index_t)this->_minK_local, tpol, gpol, leftext, rightext); } int64_t minsc = this->_minsc[rdi]; bool combined = tempHit.combineWith( hit, rd, gfm, ref, altdb, repeatdb, ssdb, swa, swm, sc, minsc, rnd, (index_t)this->_minK_local, (index_t)tpol.minIntronLen(), (index_t)tpol.maxIntronLen(), tpol.minAnchorLen(), tpol.minAnchorLen_noncan(), gpol.maxAltsTried(), NULL, // splice sites tpol.no_spliced_alignment()); if(!rp.secondary) { if(rdi == 0) minsc = max(minsc, sink.bestUnp1() - cushion); else minsc = max(minsc, sink.bestUnp2() - cushion); } if(combined && tempHit.score() >= minsc) { assert_eq(tempHit.trim5(), 0); assert_leq(tempHit.rdoff() + tempHit.len() + tempHit.trim3(), rdlen); int64_t tmp_maxsc = hybridSearch_recur( sc, pepol, tpol, gpol, gfm, altdb, repeatdb, ref, swa, ssdb, rdi, tempHit, tempHit.rdoff(), tempHit.len() + tempHit.trim3(), wlm, prm, swm, him, rnd, sink, alignMate, dep + 1); maxsc = max(maxsc, tmp_maxsc); } } } } GenomeHit tempHit = hit; index_t trimMax = (index_t)((tempHit.score() - max(maxsc, this->_minsc[rdi])) / sc.sc(0)); if(tempHit.rdoff() < trimMax) { index_t trim5 = tempHit.rdoff(); GenomeHit trimedHit = tempHit; trimedHit.trim5(trim5, rd, ssdb, sc, (index_t)this->_minK_local, (index_t)tpol.minIntronLen(), (index_t)tpol.maxIntronLen(), tpol.minAnchorLen(), tpol.minAnchorLen_noncan(), ref); assert_leq(trimedHit.len() + trimedHit.trim5() + trimedHit.trim3(), rdlen); int64_t tmp_score = trimedHit.score(); if(tmp_score > maxsc && tmp_score >= this->_minsc[rdi]) { int64_t tmp_maxsc = hybridSearch_recur( sc, pepol, tpol, gpol, gfm, altdb, repeatdb, ref, swa, ssdb, rdi, trimedHit, 0, trimedHit.len() + trimedHit.trim5() + trimedHit.trim3(), wlm, prm, swm, him, rnd, sink, alignMate, dep + 1); maxsc = max(maxsc, tmp_maxsc); // return maxsc; } } // extend the partial alignment directly comparing with the corresponding genomic sequence // with mismatches or a gap allowed int64_t minsc = this->_minsc[rdi]; assert_geq(tempHit.score(), minsc); index_t mm = (index_t)((tempHit.score() - minsc) / sc.mmpMax); index_t leftext = (index_t)INDEX_MAX, rightext = (index_t)0; index_t num_mismatch_allowed = 1; if(hitoff <= this->_minK_local) { num_mismatch_allowed = min(tempHit.rdoff(), mm); } him.localextatts++; tempHit.extend( rd, gfm, ref, altdb, repeatdb, ssdb, swa, swm, prm, sc, this->_minsc[rdi], rnd, (index_t)this->_minK_local, tpol, gpol, leftext, rightext, num_mismatch_allowed); if(!rp.secondary) { if(rdi == 0) minsc = max(minsc, sink.bestUnp1() - cushion); else minsc = max(minsc, sink.bestUnp2() - cushion); } if(tempHit.score() >= minsc && leftext >= min((index_t)this->_minK_local, hit.rdoff())) { assert_eq(tempHit.trim5(), 0); assert_leq(tempHit.rdoff() + tempHit.len() + tempHit.trim3(), rdlen); int64_t tmp_maxsc = hybridSearch_recur( sc, pepol, tpol, gpol, gfm, altdb, repeatdb, ref, swa, ssdb, rdi, tempHit, tempHit.rdoff(), tempHit.len() + tempHit.trim3(), wlm, prm, swm, him, rnd, sink, alignMate, dep + 1); maxsc = max(maxsc, tmp_maxsc); } else if(hitoff > this->_minK_local) { // skip some bases of a read index_t jumplen = hitoff > this->_minK ? (index_t)this->_minK : (index_t)this->_minK_local; assert_leq(hitoff, hit.rdoff()); int64_t expected_score = hit.score() - (hit.rdoff() - hitoff) / jumplen * sc.mmpMax - sc.mmpMax; if(expected_score >= minsc) { assert_lt(hitlen + jumplen, rdlen); assert_eq(hit.trim5(), 0); assert_leq(hitoff + hitlen, rdlen); int64_t tmp_maxsc = hybridSearch_recur( sc, pepol, tpol, gpol, gfm, altdb, repeatdb, ref, swa, ssdb, rdi, hit, hitoff - jumplen, hitlen + jumplen, wlm, prm, swm, him, rnd, sink, alignMate, dep + 1); maxsc = max(maxsc, tmp_maxsc); } } } } else { // extend the partial alignment in the right direction assert_lt(hitoff + hitlen, rdlen); if(!ssdb.empty()) { index_t fragoff = 0, fraglen = 0, right = 0; hit.getRight(fragoff, fraglen, right); const index_t minMatchLen = (index_t)this->_minK_local; // make use of a list of known or novel splice sites to further align the read if(fraglen >= minMatchLen && !hit.repeat() && !tpol.no_spliced_alignment()) { spliceSites.clear(); assert_gt(fraglen, 0); assert_leq(fragoff + fraglen, rdlen); index_t right_unmapped_len = rdlen - fragoff - fraglen; ssdb.getRightSpliceSites(hit.ref(), right + fraglen - minMatchLen, minMatchLen + min(minMatchLen, right_unmapped_len), spliceSites); for(size_t si = 0; si < spliceSites.size(); si++) { const SpliceSite& ss = spliceSites[si]; if(!ss._fromfile && ss._readid + this->_thread_rids_mindist > rd.rdid) continue; if(right > ss.left()) continue; GenomeHit tempHit; assert_leq(right, ss.left()); index_t readoff = fragoff + ss.left() - right + 1; if(readoff >= rdlen) continue; index_t joinedOff = 0; bool success = gfm.textOffToJoined(hit.ref(), ss.right(), joinedOff); if(!success) { continue; } #ifndef NDEBUG index_t debug_tid = 0, debug_toff = 0, debug_tlen = 0; bool debug_straddled = false; gfm.joinedToTextOff(1, // qlen joinedOff, debug_tid, debug_toff, debug_tlen, false, debug_straddled); assert_eq(hit.ref(), debug_tid); assert_eq(ss.right(), debug_toff); #endif tempHit.init(hit.fw(), readoff, 0, // len 0, // trim5 0, // trim3 hit.ref(), ss.right(), joinedOff, this->_sharedVars, gfm.repeat()); index_t leftext = 0, rightext = rdlen - readoff; tempHit.extend(rd, gfm, ref, altdb, repeatdb, ssdb, swa, swm, prm, sc, this->_minsc[rdi], rnd, (index_t)this->_minK_local, tpol, gpol, leftext, rightext); if(tempHit.len() <= 0) continue; if(!hit.compatibleWith(tempHit, (index_t)tpol.minIntronLen(), (index_t)tpol.maxIntronLen(), tpol.no_spliced_alignment())) continue; GenomeHit combinedHit = hit; int64_t minsc = this->_minsc[rdi]; bool combined = combinedHit.combineWith( tempHit, rd, gfm, ref, altdb, repeatdb, ssdb, swa, swm, sc, minsc, rnd, (index_t)this->_minK_local, (index_t)tpol.minIntronLen(), (index_t)tpol.maxIntronLen(), 1, 1, gpol.maxAltsTried(), &ss, tpol.no_spliced_alignment()); if(!rp.secondary) { if(rdi == 0) minsc = max(minsc, sink.bestUnp1() - cushion); else minsc = max(minsc, sink.bestUnp2() - cushion); } if(combined && combinedHit.score() >= minsc && // soft-clipping might be better combinedHit.score() + sc.sc(0) * (rdlen - hit.rdoff() - hit.len() - hit.trim5()) >= hit.score()) { assert_leq(combinedHit.trim5(), combinedHit.rdoff()); int64_t tmp_maxsc = hybridSearch_recur( sc, pepol, tpol, gpol, gfm, altdb, repeatdb, ref, swa, ssdb, rdi, combinedHit, combinedHit.rdoff() - combinedHit.trim5(), combinedHit.len() + combinedHit.trim5(), wlm, prm, swm, him, rnd, sink, alignMate, dep + 1); maxsc = max(maxsc, tmp_maxsc); } } } } bool use_localindex = true; if(hit.len() == hitlen && hitoff + hitlen + this->_minK > rdlen) { index_t leftext = (index_t)0, rightext = (index_t)INDEX_MAX; GenomeHit tempHit = hit; tempHit.extend( rd, gfm, ref, altdb, repeatdb, ssdb, swa, swm, prm, sc, this->_minsc[rdi], rnd, (index_t)this->_minK_local, tpol, gpol, leftext, rightext, 1); if(tempHit.rdoff() + tempHit.len()== rdlen) { use_localindex = false; } } // Choose a local index based on the genomic location of the partial alignment const HGFM* hGFM = (const HGFM*)(&gfm); const LocalGFM* lGFM = hGFM->getLocalGFM(hit.ref(), hit.refoff()); bool success = false, first = true; index_t count = 0; // Use at most two local indexes const index_t max_count = 2; int64_t prev_score = hit.score(); this->_local_genomeHits[dep].clear(); while(!success && count++ < max_count && use_localindex) { if(him.localindexatts >= this->max_localindexatts) break; if(first) { first = false; } else { lGFM = hGFM->nextLocalGFM(lGFM); if(lGFM == NULL || lGFM->empty()) break; } // local index search index_t extlen = 0; local_index_t top = (local_index_t)INDEX_MAX, bot = (local_index_t)INDEX_MAX; local_index_t node_top = (local_index_t)INDEX_MAX, node_bot = (local_index_t)INDEX_MAX; index_t extoff = hitoff + hitlen + (index_t)this->_minK_local; if(extoff + 1 < rdlen) extoff += 1; if(extoff >= rdlen) { extoff = rdlen - 1; } index_t nelt = (index_t)INDEX_MAX; index_t max_nelt = std::max(5, extlen); bool no_extension = false; bool uniqueStop; index_t minUniqueLen = (index_t)this->_minK_local; index_t maxHitLen = max(extoff - hitoff - hitlen, (index_t)this->_minK_local); for(; maxHitLen < extoff + 1 && extoff < rdlen;) { extlen = 0; uniqueStop = false; him.localindexatts++; this->_local_node_iedge_count.clear(); nelt = this->localGFMSearch( *lGFM, // GFM index rd, // read to align sc, // scoring scheme sink.reportingParams(), hit.fw(), extoff, extlen, top, bot, node_top, node_bot, this->_local_node_iedge_count, rnd, uniqueStop, minUniqueLen, maxHitLen); if(extoff < hitoff + hitlen) { no_extension = true; break; } if(nelt <= max_nelt) break; if(extoff + 1 < rdlen) extoff++; else { if(extlen < maxHitLen) break; else maxHitLen++; } } assert_leq(node_top, node_bot); assert_eq(nelt, (index_t)(node_bot - node_top)); assert_leq(extlen, extoff + 1); assert_leq(extoff, rdlen); if(nelt > 0 && nelt <= max_nelt && extlen >= tpol.minAnchorLen() && !no_extension) { assert_leq(nelt, max_nelt); coords.clear(); bool straddled = false; // get genomic locations for this local search this->getGenomeCoords_local( *lGFM, altdb, ref, rnd, top, bot, node_top, node_bot, this->_local_node_iedge_count, hit.fw(), extoff + 1 - extlen, extlen, coords, wlm, prm, him, true, // reject straddled? straddled); assert_leq(coords.size(), nelt); if(coords.size() > 1) coords.sort(); for(index_t ri = 0; ri < coords.size(); ri++) { const Coord& coord = coords[ri]; GenomeHit tempHit; tempHit.init(coord.orient(), extoff + 1 - extlen, extlen, 0, // trim5 0, // trim3 (index_t)coord.ref(), (index_t)coord.off(), (index_t)coord.joinedOff(), this->_sharedVars, gfm.repeat()); if(!tempHit.adjustWithALT(*this->_rds[rdi], gfm, altdb, ref, gpol)) continue; // check if the partial alignment is compatible with the new alignment using the local index if(!hit.compatibleWith(tempHit, (index_t)tpol.minIntronLen(), (index_t)tpol.maxIntronLen(), tpol.no_spliced_alignment())) { if(count == 1) continue; else break; } index_t leftext = (index_t)0, rightext = (index_t)INDEX_MAX; tempHit.extend( rd, gfm, ref, altdb, repeatdb, ssdb, swa, swm, prm, sc, this->_minsc[rdi], rnd, (index_t)this->_minK_local, tpol, gpol, leftext, rightext); GenomeHit combinedHit = hit; int64_t minsc = this->_minsc[rdi]; // combine the partial alignment and the new alignment bool combined = combinedHit.combineWith( tempHit, rd, gfm, ref, altdb, repeatdb, ssdb, swa, swm, sc, minsc, rnd, (index_t)this->_minK_local, (index_t)tpol.minIntronLen(), (index_t)tpol.maxIntronLen(), tpol.minAnchorLen(), tpol.minAnchorLen_noncan(), gpol.maxAltsTried(), NULL, // splice sites tpol.no_spliced_alignment()); if(!rp.secondary) { if(rdi == 0) minsc = max(minsc, sink.bestUnp1() - cushion); else minsc = max(minsc, sink.bestUnp2() - cushion); } if(combined && combinedHit.score() >= minsc) { assert_leq(combinedHit.trim5(), combinedHit.rdoff()); if(combinedHit.score() >= prev_score - sc.mmpMax) { // extend the new partial alignment recursively int64_t tmp_maxsc = hybridSearch_recur( sc, pepol, tpol, gpol, gfm, altdb, repeatdb, ref, swa, ssdb, rdi, combinedHit, combinedHit.rdoff() - combinedHit.trim5(), combinedHit.len() + combinedHit.trim5(), wlm, prm, swm, him, rnd, sink, alignMate, dep + 1); maxsc = max(maxsc, tmp_maxsc); } else { this->_local_genomeHits[dep].push_back(combinedHit); } } } } // int64_t minsc = (rdi == 0 ? sink.bestUnp1() : sink.bestUnp2()); if(maxsc >= prev_score - sc.mmpMax) success = true; if(!success && (him.localindexatts >= this->max_localindexatts || count == max_count || hGFM->nextLocalGFM(lGFM) == NULL) ) { for(index_t ti = 0; ti < this->_local_genomeHits[dep].size(); ti++) { GenomeHit& tempHit = this->_local_genomeHits[dep][ti]; int64_t minsc = this->_minsc[rdi]; if(!rp.secondary) { if(rdi == 0) minsc = max(minsc, sink.bestUnp1() - cushion); else minsc = max(minsc, sink.bestUnp2() - cushion); } if(tempHit.score() >= minsc) { int64_t tmp_maxsc = hybridSearch_recur( sc, pepol, tpol, gpol, gfm, altdb, repeatdb, ref, swa, ssdb, rdi, tempHit, tempHit.rdoff() - tempHit.trim5(), tempHit.len() + tempHit.trim5(), wlm, prm, swm, him, rnd, sink, alignMate, dep + 1); maxsc = max(maxsc, tmp_maxsc); } } } } // while(!success && count++ < 2) if(!success) { // perform global search for long introns if(hitoff + hitlen + this->_minK + 1 < rdlen && him.localindexatts < this->max_localindexatts) { index_t extlen = 0; index_t top = (index_t)INDEX_MAX, bot = (index_t)INDEX_MAX; index_t node_top = (index_t)INDEX_MAX, node_bot = (index_t)INDEX_MAX; this->_node_iedge_count.clear(); index_t extoff = hitoff + hitlen + (index_t)this->_minK + 1; bool uniqueStop = true; index_t nelt = this->globalGFMSearch( gfm, // GFM index rd, // read to align sc, // scoring scheme sink.reportingParams(), hit.fw(), extoff, extlen, top, bot, node_top, node_bot, this->_node_iedge_count, rnd, uniqueStop); if(nelt > 0 && nelt <= 5 && extlen >= this->_minK) { coords.clear(); bool straddled = false; this->getGenomeCoords( gfm, altdb, ref, rnd, top, bot, node_top, node_bot, this->_node_iedge_count, hit.fw(), bot - top, extoff + 1 - extlen, extlen, coords, wlm, prm, him, true, // reject straddled straddled); assert_leq(coords.size(), nelt); coords.sort(); for(index_t ri = 0; ri < coords.size(); ri++) { const Coord& coord = coords[ri]; GenomeHit tempHit; tempHit.init(coord.orient(), extoff + 1 - extlen, extlen, 0, // trim5 0, // trim3 (index_t)coord.ref(), (index_t)coord.off(), (index_t)coord.joinedOff(), this->_sharedVars, gfm.repeat()); if(!tempHit.adjustWithALT(*this->_rds[rdi], gfm, altdb, ref, gpol)) continue; if(!hit.compatibleWith(tempHit, (index_t)tpol.minIntronLen(), (index_t)tpol.maxIntronLen(), tpol.no_spliced_alignment())) continue; index_t leftext = (index_t)0, rightext = (index_t)INDEX_MAX; tempHit.extend( rd, gfm, ref, altdb, repeatdb, ssdb, swa, swm, prm, sc, this->_minsc[rdi], rnd, (index_t)this->_minK_local, tpol, gpol, leftext, rightext); GenomeHit combinedHit = hit; int64_t minsc = this->_minsc[rdi]; bool combined = combinedHit.combineWith( tempHit, rd, gfm, ref, altdb, repeatdb, ssdb, swa, swm, sc, minsc, rnd, (index_t)this->_minK_local, (index_t)tpol.minIntronLen(), (index_t)tpol.maxIntronLen(), tpol.minAnchorLen(), tpol.minAnchorLen_noncan(), gpol.maxAltsTried(), NULL, // splice sites tpol.no_spliced_alignment()); if(!rp.secondary) { if(rdi == 0) minsc = max(minsc, sink.bestUnp1() - cushion); else minsc = max(minsc, sink.bestUnp2() - cushion); } if(combined && combinedHit.score() >= minsc) { assert_leq(combinedHit.trim5(), combinedHit.rdoff()); int64_t tmp_maxsc = hybridSearch_recur( sc, pepol, tpol, gpol, gfm, altdb, repeatdb, ref, swa, ssdb, rdi, combinedHit, combinedHit.rdoff() - combinedHit.trim5(), combinedHit.len() + combinedHit.trim5(), wlm, prm, swm, him, rnd, sink, alignMate, dep + 1); maxsc = max(maxsc, tmp_maxsc); } } } } GenomeHit tempHit = hit; assert(tempHit.trim5() == 0 || hitoff == 0); index_t trimLen = rdlen - hitoff - tempHit.len() - tempHit.trim5(); index_t trimMax = (index_t)((tempHit.score() - max(maxsc, this->_minsc[rdi])) / sc.sc(0)); if(trimLen < trimMax) { index_t trim3 = rdlen - hitoff - tempHit.len() - tempHit.trim5(); GenomeHit trimedHit = tempHit; trimedHit.trim3(trim3, rd, ssdb, sc, (index_t)this->_minK_local, (index_t)tpol.minIntronLen(), (index_t)tpol.maxIntronLen(), tpol.minAnchorLen(), tpol.minAnchorLen_noncan(), ref); assert_leq(trimedHit.trim5(), trimedHit.rdoff()); assert_leq(trimedHit.len() + trimedHit.trim5() + trimedHit.trim3(), rdlen); int64_t tmp_score = trimedHit.score(); if(tmp_score > maxsc && tmp_score >= this->_minsc[rdi]) { int64_t tmp_maxsc = hybridSearch_recur( sc, pepol, tpol, gpol, gfm, altdb, repeatdb, ref, swa, ssdb, rdi, trimedHit, trimedHit.rdoff() - trimedHit.trim5(), trimedHit.len() + trimedHit.trim5() + trimedHit.trim3(), wlm, prm, swm, him, rnd, sink, alignMate, dep + 1); maxsc = max(maxsc, tmp_maxsc); // return maxsc; } } // extend the partial alignment directly comparing with the corresponding genomic sequence // with mismatches or a gap allowed int64_t minsc = this->_minsc[rdi]; assert_geq(tempHit.score(), minsc); index_t leftext = (index_t)0, rightext = (index_t)INDEX_MAX; index_t mm = (index_t)((tempHit.score() - minsc) / sc.mmpMax); index_t num_mismatch_allowed = 1; if(rdlen - hitoff - hitlen <= this->_minK_local) { num_mismatch_allowed = min(rdlen - tempHit.rdoff() - tempHit.len(), mm); } him.localextatts++; tempHit.extend( rd, gfm, ref, altdb, repeatdb, ssdb, swa, swm, prm, sc, this->_minsc[rdi], rnd, (index_t)this->_minK_local, tpol, gpol, leftext, rightext, num_mismatch_allowed); if(!rp.secondary) { if(rdi == 0) minsc = max(minsc, sink.bestUnp1() - cushion); else minsc = max(minsc, sink.bestUnp2() - cushion); } if(tempHit.score() >= minsc && rightext >= min((index_t)this->_minK_local, rdlen - hit.len() - hit.rdoff())) { assert_eq(tempHit.trim3(), 0); assert_leq(tempHit.trim5(), tempHit.rdoff()); int64_t tmp_maxsc = hybridSearch_recur( sc, pepol, tpol, gpol, gfm, altdb, repeatdb, ref, swa, ssdb, rdi, tempHit, tempHit.rdoff() - tempHit.trim5(), tempHit.len() + tempHit.trim5(), wlm, prm, swm, him, rnd, sink, alignMate, dep + 1); maxsc = max(maxsc, tmp_maxsc); } else if(hitoff + hitlen + this->_minK_local < rdlen) { // skip some bases of a read index_t jumplen = hitoff + hitlen + this->_minK < rdlen ? (index_t)this->_minK : (index_t)this->_minK_local; assert_lt(hitoff + hitlen + jumplen, rdlen); assert_leq(hit.len(), hitlen); int64_t expected_score = hit.score() - (hitlen - hit.len()) / jumplen * sc.mmpMax - sc.mmpMax; if(expected_score >= minsc) { assert_eq(hit.trim3(), 0); int64_t tmp_maxsc = hybridSearch_recur( sc, pepol, tpol, gpol, gfm, altdb, repeatdb, ref, swa, ssdb, rdi, hit, hitoff, hitlen + jumplen, wlm, prm, swm, him, rnd, sink, alignMate, dep + 1); maxsc = max(maxsc, tmp_maxsc); } } } } return maxsc; } #endif /*SPLICED_ALIGNER_H_*/