hisat-3n/spliced_aligner.h
2025-01-18 21:09:52 +08:00

2055 lines
113 KiB
C++

/*
* Copyright 2015, Daehwan Kim <infphilo@gmail.com>
*
* This file is part of HISAT 2.
*
* HISAT 2 is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* HISAT 2 is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with HISAT 2. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef SPLICED_ALIGNER_H_
#define SPLICED_ALIGNER_H_
#include "hi_aligner.h"
/**
* With a hierarchical indexing, SplicedAligner provides several alignment strategies
* , which enable effective alignment of RNA-seq reads
*/
template <typename index_t, typename local_index_t>
class SplicedAligner : public HI_Aligner<index_t, local_index_t> {
public:
/**
* Initialize with index.
*/
SplicedAligner(
const GFM<index_t>& gfm,
bool anchorStop,
uint64_t threads_rids_mindist = 0) :
HI_Aligner<index_t, local_index_t>(gfm,
anchorStop,
threads_rids_mindist)
{
}
~SplicedAligner() {
}
/**
* Given a partial alignment of a read, try to further extend
* the alignment bidirectionally using a combination of
* local search, extension, and global search
*/
virtual
void hybridSearch(
const Scoring& sc,
const PairedEndPolicy& pepol, // paired-end policy
const TranscriptomePolicy& tpol,
const GraphPolicy& gpol,
const GFM<index_t>& gfm,
const ALTDB<index_t>& altdb,
const RepeatDB<index_t>& repeatdb,
const BitPairReference& ref,
SwAligner& swa,
SpliceSiteDB& ssdb,
index_t rdi,
bool fw,
WalkMetrics& wlm,
PerReadMetrics& prm,
SwMetrics& swm,
HIMetrics& him,
RandomSource& rnd,
AlnSinkWrap<index_t>& sink);
/**
* Given a partial alignment of a read, try to further extend
* the alignment bidirectionally using a combination of
* local search, extension, and global search
*/
virtual
int64_t hybridSearch_recur(
const Scoring& sc,
const PairedEndPolicy& pepol, // paired-end policy
const TranscriptomePolicy& tpol,
const GraphPolicy& gpol,
const GFM<index_t>& gfm,
const ALTDB<index_t>& altdb,
const RepeatDB<index_t>& repeatdb,
const BitPairReference& ref,
SwAligner& swa,
SpliceSiteDB& ssdb,
index_t rdi,
const GenomeHit<index_t>& hit,
index_t hitoff,
index_t hitlen,
WalkMetrics& wlm,
PerReadMetrics& prm,
SwMetrics& swm,
HIMetrics& him,
RandomSource& rnd,
AlnSinkWrap<index_t>& sink,
bool alignMate = false,
index_t dep = 0);
};
/**
* Given a partial alignment of a read, try to further extend
* the alignment bidirectionally using a combination of
* local search, extension, and global search
*/
template <typename index_t, typename local_index_t>
void SplicedAligner<index_t, local_index_t>::hybridSearch(
const Scoring& sc,
const PairedEndPolicy& pepol, // paired-end policy
const TranscriptomePolicy& tpol,
const GraphPolicy& gpol,
const GFM<index_t>& gfm,
const ALTDB<index_t>& altdb,
const RepeatDB<index_t>& repeatdb,
const BitPairReference& ref,
SwAligner& swa,
SpliceSiteDB& ssdb,
index_t rdi,
bool fw,
WalkMetrics& wlm,
PerReadMetrics& prm,
SwMetrics& swm,
HIMetrics& him,
RandomSource& rnd,
AlnSinkWrap<index_t>& sink)
{
assert_lt(rdi, 2);
assert(this->_rds[rdi] != NULL);
him.localatts++;
const ReportingParams& rp = sink.reportingParams();
// before further alignment using local search, extend the partial alignments directly
// by comparing with the corresponding genomic sequences
// this extension is performed without any mismatches allowed
for(index_t hi = 0; hi < this->_genomeHits.size(); hi++) {
GenomeHit<index_t>& genomeHit = this->_genomeHits[hi];
index_t leftext = (index_t)INDEX_MAX, rightext = (index_t)INDEX_MAX;
genomeHit.extend(
*(this->_rds[rdi]),
gfm,
ref,
altdb,
repeatdb,
ssdb,
swa,
swm,
prm,
sc,
this->_minsc[rdi],
rnd,
INDEX_MAX,
tpol,
gpol,
leftext,
rightext);
}
// for the candidate alignments, examine the longest (best) one first
this->_genomeHits_done.resize(this->_genomeHits.size());
this->_genomeHits_done.fill(false);
for(size_t hi = 0; hi < this->_genomeHits.size(); hi++) {
index_t hj = 0;
for(; hj < this->_genomeHits.size(); hj++) {
if(!this->_genomeHits_done[hj]) break;
}
if(hj >= this->_genomeHits.size()) break;
for(index_t hk = hj + 1; hk < this->_genomeHits.size(); hk++) {
if(this->_genomeHits_done[hk]) continue;
GenomeHit<index_t>& genomeHit_j = this->_genomeHits[hj];
GenomeHit<index_t>& genomeHit_k = this->_genomeHits[hk];
if(genomeHit_k.hitcount() > genomeHit_j.hitcount() ||
(genomeHit_k.hitcount() == genomeHit_j.hitcount() && genomeHit_k.len() > genomeHit_j.len())) {
hj = hk;
}
}
// given a candidate partial alignment, extend it bidirectionally
him.anchoratts++;
GenomeHit<index_t>& genomeHit = this->_genomeHits[hj];
int64_t maxsc = std::numeric_limits<int64_t>::min();
maxsc = hybridSearch_recur(sc,
pepol,
tpol,
gpol,
gfm,
altdb,
repeatdb,
ref,
swa,
ssdb,
rdi,
genomeHit,
genomeHit.rdoff(),
genomeHit.len(),
wlm,
prm,
swm,
him,
rnd,
sink);
if(rp.bowtie2_dp == 2 || (rp.bowtie2_dp == 1 && maxsc < this->_minsc[rdi])) {
const Read& rd = *this->_rds[rdi];
// Initialize the aligner with a new read
swa.initRead(rd.patFw, // fw version of query
rd.patRc, // rc version of query
rd.qual, // fw version of qualities
rd.qualRev, // rc version of qualities
0, // off of first char in 'rd' to consider
rd.length(), // off of last char (excl) in 'rd' to consider
sc); // scoring scheme
bool found = genomeHit.len() >= rd.length();
if(!found) {
DynProgFramer dpframe(false); // trimToRef
size_t tlen = ref.approxLen(genomeHit.ref());
size_t readGaps = 10, refGaps = 10, nceil = 0, maxhalf = 10;
index_t refoff = genomeHit.refoff() > genomeHit.rdoff() ? genomeHit.refoff() - genomeHit.rdoff() : 0;
DPRect rect;
dpframe.frameSeedExtensionRect(refoff, // ref offset implied by seed hit assuming no gaps
rd.length(), // length of read sequence used in DP table
tlen, // length of reference
readGaps, // max # of read gaps permitted in opp mate alignment
refGaps, // max # of ref gaps permitted in opp mate alignment
(size_t)nceil, // # Ns permitted
maxhalf, // max width in either direction
rect); // DP rectangle
assert(rect.repOk());
size_t cminlen = 2000, cpow2 = 4, nwindow = 10, nsInLeftShift = 0;
swa.initRef(fw, // whether to align forward or revcomp read
genomeHit.ref(), // reference aligned against
rect, // DP rectangle
ref, // Reference strings
tlen, // length of reference sequence
sc, // scoring scheme
this->_minsc[rdi], // minimum score permitted
true, // use 8-bit SSE if possible?
cminlen, // minimum length for using checkpointing scheme
cpow2, // interval b/t checkpointed diags; 1 << this
false, // triangular mini-fills?
true, // this is a seed extension - not finding a mate
nwindow,
nsInLeftShift);
// Now fill the dynamic programming matrix and return true iff
// there is at least one valid alignment
TAlScore bestCell = std::numeric_limits<TAlScore>::min();
found = swa.align(rnd, bestCell);
if(found) {
SwResult res;
res.reset();
res.alres.init_raw_edits(&(this->_rawEdits));
found = swa.nextAlignment(res, this->_minsc[rdi], rnd);
if(found) {
if(!fw) res.alres.invertEdits();
const Coord& coord = res.alres.refcoord();
assert_geq(genomeHit._joinedOff + coord.off(), genomeHit.refoff());
index_t joinedOff = genomeHit._joinedOff + coord.off() - genomeHit.refoff();
genomeHit.init(fw,
0, // rdoff
rd.length(),
0, // trim5
0, // trim3
coord.ref(),
coord.off(),
joinedOff,
this->_sharedVars,
genomeHit.repeat(), // repeat?
&res.alres.ned(),
NULL,
res.alres.score().score());
genomeHit.replace_edits_with_alts(rd,
altdb.alts(),
ssdb,
sc,
this->_minK_local,
(index_t)tpol.minIntronLen(),
(index_t)tpol.maxIntronLen(),
(index_t)tpol.minAnchorLen(),
(index_t)tpol.minAnchorLen_noncan(),
ref);
}
}
}
if(found) {
hybridSearch_recur(sc,
pepol,
tpol,
gpol,
gfm,
altdb,
repeatdb,
ref,
swa,
ssdb,
rdi,
genomeHit,
genomeHit.rdoff(),
genomeHit.len(),
wlm,
prm,
swm,
him,
rnd,
sink);
}
}
this->_genomeHits_done[hj] = true;
}
}
/**
* Given a partial alignment of a read, try to further extend
* the alignment bidirectionally using a combination of
* local search, extension, and global search
*/
template <typename index_t, typename local_index_t>
int64_t SplicedAligner<index_t, local_index_t>::hybridSearch_recur(
const Scoring& sc,
const PairedEndPolicy& pepol, // paired-end policy
const TranscriptomePolicy& tpol,
const GraphPolicy& gpol,
const GFM<index_t>& gfm,
const ALTDB<index_t>& altdb,
const RepeatDB<index_t>& repeatdb,
const BitPairReference& ref,
SwAligner& swa,
SpliceSiteDB& ssdb,
index_t rdi,
const GenomeHit<index_t>& hit,
index_t hitoff,
index_t hitlen,
WalkMetrics& wlm,
PerReadMetrics& prm,
SwMetrics& swm,
HIMetrics& him,
RandomSource& rnd,
AlnSinkWrap<index_t>& sink,
bool alignMate,
index_t dep)
{
const ReportingParams& rp = sink.reportingParams();
int64_t maxsc = numeric_limits<int64_t>::min();
him.localsearchrecur++;
assert_lt(rdi, 2);
assert(this->_rds[rdi] != NULL);
const Read& rd = *(this->_rds[rdi]);
index_t rdlen = (index_t)rd.length();
TAlScore cushion = 0;
if(tpol.no_spliced_alignment()) {
cushion = alignMate ? rdlen * 0.03 * sc.mm(255) : 0;
}
if(hit.score() + cushion < this->_minsc[rdi]) return maxsc;
if(dep >= 128) return maxsc;
// if it's already examined, just return
if(hitoff == hit.rdoff() - hit.trim5() && hitlen == hit.len() + hit.trim5() + hit.trim3()) {
if(this->isSearched(hit, rdi)) return maxsc;
this->addSearched(hit, rdi);
}
// for effective use of memory allocation and deallocation
if(this->_coords.size() <= dep) {
this->_coords.expand();
assert_leq(this->_local_genomeHits.size(), dep);
this->_local_genomeHits.expand();
assert_leq(this->_spliceSites.size(), dep);
this->_spliceSites.expand();
}
EList<Coord>& coords = this->_coords[dep];
EList<SpliceSite>& spliceSites = this->_spliceSites[dep];
// daehwan - for debugging purposes
#if 0
cout << rd.name << "\t"
<< (hit.fw() ? "+" : "-") << "\t"
<< hitoff << "\t"
<< hitoff + hitlen << "\t"
<< "( " << hit.rdoff() << "\t"
<< hit.rdoff() + hit.len() << " )" << "\t"
<< hit.refoff() << "\t"
<< hit.getRightOff() << "\t"
<< hit.score() << "\t"
<< "dep: " << dep << "\t";
Edit::print(cout, hit.edits());
cout << endl;
#endif
assert_leq(hitoff + hitlen, rdlen);
// if this is a full alignment, report it
if(hitoff == 0 && hitlen == rdlen) {
if(!this->redundant(sink, rdi, hit)) {
bool another_spliced = false;
if(!ssdb.empty()) {
int64_t best_score = hit.score();
this->_local_genomeHits[dep].clear();
this->_anchors_added.clear();
this->_local_genomeHits[dep].expand();
this->_local_genomeHits[dep].back() = hit;
this->_anchors_added.push_back(0);
index_t fragoff = 0, fraglen = 0, left = 0, right = 0;
hit.getLeft(fragoff, fraglen, left);
const index_t minMatchLen = (index_t)this->_minK;
index_t min_left_anchor = rdlen, min_right_anchor = rdlen;
// make use of a list of known or novel splice sites to further align the read
if(fraglen >= minMatchLen &&
left >= minMatchLen &&
hit.trim5() == 0 &&
!hit.repeat() &&
!tpol.no_spliced_alignment()) {
spliceSites.clear();
ssdb.getLeftSpliceSites(hit.ref(), left + minMatchLen, minMatchLen, spliceSites);
for(size_t si = 0; si < spliceSites.size(); si++) {
const SpliceSite& ss = spliceSites[si];
if(!ss._fromfile && ss._readid + this->_thread_rids_mindist > rd.rdid) continue;
if(left + fraglen - 1 < ss.right()) continue;
index_t frag2off = ss.left() - (ss.right() - left);
if(frag2off + 1 < hitoff) continue;
GenomeHit<index_t> tempHit;
if(fragoff + ss.right() < left + 1) continue;
index_t readoff = fragoff + ss.right() - left - 1;
index_t joinedOff = 0;
bool success = gfm.textOffToJoined(hit.ref(), ss.left(), joinedOff);
if(!success) {
continue;
}
#ifndef NDEBUG
index_t debug_tid = 0, debug_toff = 0, debug_tlen = 0;
bool debug_straddled = false;
gfm.joinedToTextOff(1, // qlen
joinedOff,
debug_tid,
debug_toff,
debug_tlen,
false,
debug_straddled);
assert_eq(hit.ref(), debug_tid);
assert_eq(ss.left(), debug_toff);
#endif
tempHit.init(hit.fw(),
readoff + 1, // rdoff
0, // len
0, // trim5
0, // trim3
hit.ref(),
ss.left() + 1,
joinedOff + 1,
this->_sharedVars,
gfm.repeat());
index_t leftext = readoff + 1, rightext = 0;
tempHit.extend(rd,
gfm,
ref,
altdb,
repeatdb,
ssdb,
swa,
swm,
prm,
sc,
this->_minsc[rdi],
rnd,
(index_t)this->_minK_local,
tpol,
gpol,
leftext,
rightext);
if(tempHit.len() <= 0)
continue;
if(!tempHit.compatibleWith(
hit,
(index_t)tpol.minIntronLen(),
(index_t)tpol.maxIntronLen(),
tpol.no_spliced_alignment()))
continue;
int64_t minsc = max<int64_t>(this->_minsc[rdi], best_score);
bool combined = tempHit.combineWith(
hit,
rd,
gfm,
ref,
altdb,
repeatdb,
ssdb,
swa,
swm,
sc,
minsc,
rnd,
(index_t)this->_minK_local,
(index_t)tpol.minIntronLen(),
(index_t)tpol.maxIntronLen(),
1,
1,
gpol.maxAltsTried(),
&ss,
tpol.no_spliced_alignment());
if(rdi == 0) minsc = max(minsc, sink.bestUnp1());
else minsc = max(minsc, sink.bestUnp2());
index_t leftAnchorLen = 0, nedits = 0;
tempHit.getLeftAnchor(leftAnchorLen, nedits);
if(combined &&
tempHit.score() >= minsc &&
nedits <= leftAnchorLen / 4) { // prevent (short) anchors from having many mismatches
if(this->isSearched(tempHit, rdi)) continue;
if(!this->redundant(sink, rdi, tempHit)) {
another_spliced = true;
if(tempHit.score() > best_score)
best_score = tempHit.score();
this->_local_genomeHits[dep].expand();
this->_local_genomeHits[dep].back() = tempHit;
this->_anchors_added.push_back(1);
index_t temp_fragoff = 0, temp_fraglen = 0, temp_left = 0;
tempHit.getLeft(temp_fragoff, temp_fraglen, temp_left);
if(temp_fraglen < min_left_anchor)
min_left_anchor = temp_fraglen;
}
}
}
}
size_t num_local_genomeHits = this->_local_genomeHits[dep].size();
for(size_t i = 0; i < num_local_genomeHits; i++) {
this->_local_genomeHits[dep][i].getRight(fragoff, fraglen, right);
if(this->_local_genomeHits[dep][i].score() < best_score) continue;
// make use of a list of known or novel splice sites to further align the read
if(fraglen >= minMatchLen &&
this->_local_genomeHits[dep][i].trim3() == 0 &&
!hit.repeat() &&
!tpol.no_spliced_alignment()) {
spliceSites.clear();
assert_gt(fraglen, 0);
ssdb.getRightSpliceSites(this->_local_genomeHits[dep][i].ref(), right + fraglen - minMatchLen, minMatchLen, spliceSites);
for(size_t si = 0; si < spliceSites.size(); si++) {
const GenomeHit<index_t>& canHit = this->_local_genomeHits[dep][i];
const SpliceSite& ss = spliceSites[si];
if(!ss._fromfile && ss._readid + this->_thread_rids_mindist > rd.rdid) continue;
if(right > ss.left()) continue;
GenomeHit<index_t> tempHit;
index_t readoff = fragoff + ss.left() - right + 1;
if(readoff >= rdlen)
continue;
index_t joinedOff = 0;
bool success = gfm.textOffToJoined(canHit.ref(), ss.right(), joinedOff);
if(!success) {
continue;
}
#ifndef NDEBUG
index_t debug_tid = 0, debug_toff = 0, debug_tlen = 0;
bool debug_straddled = false;
gfm.joinedToTextOff(1, // qlen
joinedOff,
debug_tid,
debug_toff,
debug_tlen,
false,
debug_straddled);
assert_eq(canHit.ref(), debug_tid);
assert_eq(ss.right(), debug_toff);
#endif
tempHit.init(canHit.fw(),
readoff,
0, // len
0, // trim5
0, // trim3
canHit.ref(),
ss.right(),
joinedOff,
this->_sharedVars,
gfm.repeat());
index_t leftext = 0, rightext = rdlen - readoff;
tempHit.extend(rd,
gfm,
ref,
altdb,
repeatdb,
ssdb,
swa,
swm,
prm,
sc,
this->_minsc[rdi],
rnd,
(index_t)this->_minK_local,
tpol,
gpol,
leftext,
rightext);
if(tempHit.len() <= 0)
continue;
if(!canHit.compatibleWith(tempHit, (index_t)tpol.minIntronLen(), (index_t)tpol.maxIntronLen(), tpol.no_spliced_alignment())) continue;
GenomeHit<index_t> combinedHit = canHit;
int64_t minsc = max<int64_t>(this->_minsc[rdi], best_score);
bool combined = combinedHit.combineWith(
tempHit,
rd,
gfm,
ref,
altdb,
repeatdb,
ssdb,
swa,
swm,
sc,
minsc,
rnd,
(index_t)this->_minK_local,
(index_t)tpol.minIntronLen(),
(index_t)tpol.maxIntronLen(),
1,
1,
gpol.maxAltsTried(),
&ss,
tpol.no_spliced_alignment());
if(rdi == 0) minsc = max(minsc, sink.bestUnp1());
else minsc = max(minsc, sink.bestUnp2());
index_t rightAnchorLen = 0, nedits = 0;
combinedHit.getRightAnchor(rightAnchorLen, nedits);
if(combined &&
combinedHit.score() >= minsc &&
nedits <= rightAnchorLen / 4) { // prevent (short) anchors from having many mismatches
if(this->isSearched(combinedHit, rdi)) continue;
if(!this->redundant(sink, rdi, combinedHit)) {
another_spliced = true;
if(combinedHit.score() > best_score)
best_score = tempHit.score();
this->_local_genomeHits[dep].expand();
this->_local_genomeHits[dep].back() = combinedHit;
this->_anchors_added.push_back(this->_anchors_added[i] + 1);
index_t temp_fragoff = 0, temp_fraglen = 0, temp_right = 0;
combinedHit.getLeft(temp_fragoff, temp_fraglen, temp_right);
if(temp_fraglen < min_right_anchor)
min_right_anchor = temp_fraglen;
}
}
}
}
}
assert_eq(this->_local_genomeHits[dep].size(), this->_anchors_added.size());
for(size_t i = 0; i < this->_local_genomeHits[dep].size(); i++) {
const GenomeHit<index_t>& canHit = this->_local_genomeHits[dep][i];
if(!rp.secondary && canHit.score() < best_score) continue;
// if(min(min_left_anchor, min_right_anchor) <= this->_minK_local) {
// daehwan - for debugging purposes
// if(this->_anchors_added[i] < this->_anchors_added.back()) continue;
//}
if(i > 0 && !this->isSearched(canHit, rdi)) {
this->addSearched(canHit, rdi);
}
if(!this->redundant(sink, rdi, canHit)) {
this->reportHit(sc, pepol, tpol, gpol, gfm, altdb, repeatdb, ref, ssdb, sink, rdi, canHit, alignMate);
maxsc = max<int64_t>(maxsc, canHit.score());
}
}
}
else {
this->reportHit(sc, pepol, tpol, gpol, gfm, altdb, repeatdb, ref, ssdb, sink, rdi, hit, alignMate);
maxsc = max<int64_t>(maxsc, hit.score());
}
return maxsc;
}
} else if(hitoff > 0 && (hitoff + hitlen == rdlen || hitoff + hitoff < rdlen - hitlen)) {
// Decide which side to extend first (left or right)
if(!ssdb.empty()) {
// extend the partial alignment in the left direction
index_t fragoff = 0, fraglen = 0, left = 0;
hit.getLeft(fragoff, fraglen, left);
const index_t minMatchLen = (index_t)this->_minK_local;
// make use of a list of known or novel splice sites to further align the read
if(fraglen >= minMatchLen &&
left >= minMatchLen &&
!hit.repeat() &&
!tpol.no_spliced_alignment()) {
spliceSites.clear();
ssdb.getLeftSpliceSites(hit.ref(), left + minMatchLen, minMatchLen + min<index_t>(minMatchLen, fragoff), spliceSites);
for(size_t si = 0; si < spliceSites.size(); si++) {
const SpliceSite& ss = spliceSites[si];
if(!ss._fromfile && ss._readid + this->_thread_rids_mindist > rd.rdid) continue;
if(left + fraglen - 1 < ss.right()) continue;
if(fragoff + ss.right() < left + 1) continue;
index_t readoff = fragoff + ss.right() - left - 1;
index_t joinedOff = 0;
bool success = gfm.textOffToJoined(hit.ref(), ss.left(), joinedOff);
if(!success) {
continue;
}
#ifndef NDEBUG
index_t debug_tid = 0, debug_toff = 0, debug_tlen = 0;
bool debug_straddled = false;
gfm.joinedToTextOff(1, // qlen
joinedOff,
debug_tid,
debug_toff,
debug_tlen,
false,
debug_straddled);
assert_eq(hit.ref(), debug_tid);
assert_eq(ss.left(), debug_toff);
#endif
GenomeHit<index_t> tempHit;
tempHit.init(hit.fw(),
readoff + 1, // rdoff
0, // len
0, // trim5
0, // trim3
hit.ref(),
ss.left() + 1,
joinedOff + 1,
this->_sharedVars,
gfm.repeat());
index_t leftext = readoff + 1, rightext = 0;
tempHit.extend(rd,
gfm,
ref,
altdb,
repeatdb,
ssdb,
swa,
swm,
prm,
sc,
this->_minsc[rdi],
rnd,
(index_t)this->_minK_local,
tpol,
gpol,
leftext,
rightext);
if(tempHit.len() <= 0)
continue;
if(!tempHit.compatibleWith(hit, (index_t)tpol.minIntronLen(), (index_t)tpol.maxIntronLen(), tpol.no_spliced_alignment())) continue;
int64_t minsc = this->_minsc[rdi];
bool combined = tempHit.combineWith(
hit,
rd,
gfm,
ref,
altdb,
repeatdb,
ssdb,
swa,
swm,
sc,
minsc,
rnd,
(index_t)this->_minK_local,
(index_t)tpol.minIntronLen(),
(index_t)tpol.maxIntronLen(),
1,
1,
gpol.maxAltsTried(),
&ss,
tpol.no_spliced_alignment());
if(!rp.secondary) {
if(rdi == 0) minsc = max(minsc, sink.bestUnp1() - cushion);
else minsc = max(minsc, sink.bestUnp2() - cushion);
}
if(combined &&
tempHit.score() >= minsc &&
// soft-clipping might be better
tempHit.score() + sc.sc(0) * hit.rdoff() >= hit.score()) {
assert_eq(tempHit.trim5(), 0);
assert_leq(tempHit.rdoff() + tempHit.len() + tempHit.trim3(), rdlen);
int64_t tmp_maxsc = hybridSearch_recur(
sc,
pepol,
tpol,
gpol,
gfm,
altdb,
repeatdb,
ref,
swa,
ssdb,
rdi,
tempHit,
tempHit.rdoff(),
tempHit.len() + tempHit.trim3(),
wlm,
prm,
swm,
him,
rnd,
sink,
alignMate,
dep + 1);
maxsc = max<int64_t>(maxsc, tmp_maxsc);
}
}
}
}
bool use_localindex = true;
if(hitoff == hit.rdoff() && hitoff <= this->_minK) {
index_t leftext = (index_t)INDEX_MAX, rightext = (index_t)0;
GenomeHit<index_t> tempHit = hit;
tempHit.extend(
rd,
gfm,
ref,
altdb,
repeatdb,
ssdb,
swa,
swm,
prm,
sc,
this->_minsc[rdi],
rnd,
(index_t)this->_minK_local,
tpol,
gpol,
leftext,
rightext,
1);
if(tempHit.rdoff() == 0) {
use_localindex = false;
}
}
// Choose a local index based on the genomic location of the partial alignment
const HGFM<index_t, local_index_t>* hGFM = (const HGFM<index_t, local_index_t>*)(&gfm);
const LocalGFM<local_index_t, index_t>* lGFM = hGFM->getLocalGFM(hit.ref(), hit.refoff());
assert_leq(lGFM->_localOffset, hit.refoff());
bool success = false, first = true;
index_t count = 0;
// Use at most two local indexes
const index_t max_count = 2;
int64_t prev_score = hit.score();
this->_local_genomeHits[dep].clear();
while(!success && count++ < max_count && use_localindex) {
if(him.localindexatts >= this->max_localindexatts) break;
if(first) {
first = false;
} else {
lGFM = hGFM->prevLocalGFM(lGFM);
if(lGFM == NULL || lGFM->empty()) break;
}
// local index search
index_t extlen = 0;
local_index_t top = (local_index_t)INDEX_MAX, bot = (local_index_t)INDEX_MAX;
local_index_t node_top = (local_index_t)INDEX_MAX, node_bot = (local_index_t)INDEX_MAX;
index_t extoff = hitoff - 1;
if(extoff > 0) extoff -= 1;
if(extoff < tpol.minAnchorLen()) {
extoff = tpol.minAnchorLen();
}
index_t nelt = (index_t)INDEX_MAX;
index_t max_nelt = std::max<index_t>(5, extlen);
bool no_extension = false;
bool uniqueStop= false;
index_t minUniqueLen = (index_t)this->_minK_local;
for(; extoff < rdlen; extoff++) {
extlen = 0;
uniqueStop = true;
him.localindexatts++;
this->_local_node_iedge_count.clear();
nelt = this->localGFMSearch(
*lGFM, // BWT index
rd, // read to align
sc, // scoring scheme
sink.reportingParams(),
hit.fw(),
extoff,
extlen,
top,
bot,
node_top,
node_bot,
this->_local_node_iedge_count,
rnd,
uniqueStop,
minUniqueLen);
if(extoff + 1 - extlen >= hitoff) {
no_extension = true;
break;
}
if(nelt <= max_nelt) break;
}
assert_leq(node_top, node_bot);
assert_eq(nelt, (index_t)(node_bot - node_top));
assert_leq(extlen, extoff + 1);
if(nelt > 0 &&
nelt <= max_nelt &&
extlen >= tpol.minAnchorLen() &&
!no_extension) {
assert_leq(nelt, max_nelt);
coords.clear();
bool straddled = false;
// get genomic locations for this local search
this->getGenomeCoords_local(
*lGFM,
altdb,
ref,
rnd,
top,
bot,
node_top,
node_bot,
this->_local_node_iedge_count,
hit.fw(),
extoff + 1 - extlen,
extlen,
coords,
wlm,
prm,
him,
true, // reject straddled?
straddled);
assert_leq(coords.size(), nelt);
coords.sort();
for(int ri = (int)coords.size() - 1; ri >= 0; ri--) {
const Coord& coord = coords[ri];
GenomeHit<index_t> tempHit;
tempHit.init(coord.orient(),
extoff + 1 - extlen,
extlen,
0, // trim5
0, // trim3
(index_t)coord.ref(),
(index_t)coord.off(),
(index_t)coord.joinedOff(),
this->_sharedVars,
gfm.repeat());
if(!tempHit.adjustWithALT(*this->_rds[rdi], gfm, altdb, ref, gpol)) continue;
// check if the partial alignment is compatible with the new alignment using the local index
if(!tempHit.compatibleWith(hit, (index_t)tpol.minIntronLen(), (index_t)tpol.maxIntronLen(), tpol.no_spliced_alignment())) {
if(count == 1) continue;
else break;
}
if(uniqueStop) {
assert_eq(coords.size(), 1);
index_t leftext = (index_t)INDEX_MAX, rightext = (index_t)0;
tempHit.extend(
rd,
gfm,
ref,
altdb,
repeatdb,
ssdb,
swa,
swm,
prm,
sc,
this->_minsc[rdi],
rnd,
(index_t)this->_minK_local,
tpol,
gpol,
leftext,
rightext);
}
// combine the partial alignment and the new alignment
int64_t minsc = this->_minsc[rdi];
bool combined = tempHit.combineWith(
hit,
rd,
gfm,
ref,
altdb,
repeatdb,
ssdb,
swa,
swm,
sc,
minsc,
rnd,
(index_t)this->_minK_local,
(index_t)tpol.minIntronLen(),
(index_t)tpol.maxIntronLen(),
tpol.minAnchorLen(),
tpol.minAnchorLen_noncan(),
gpol.maxAltsTried(),
NULL, // splice sites
tpol.no_spliced_alignment());
if(!rp.secondary) {
if(rdi == 0) minsc = max(minsc, sink.bestUnp1() - cushion);
else minsc = max(minsc, sink.bestUnp2() - cushion);
}
if(combined && tempHit.score() >= minsc) {
assert_eq(tempHit.trim5(), 0);
assert_leq(tempHit.rdoff() + tempHit.len() + tempHit.trim3(), rdlen);
if(tempHit.score() >= prev_score - sc.mmpMax) {
// extend the new partial alignment recursively
int64_t tmp_maxsc = hybridSearch_recur(
sc,
pepol,
tpol,
gpol,
gfm,
altdb,
repeatdb,
ref,
swa,
ssdb,
rdi,
tempHit,
tempHit.rdoff(),
tempHit.len() + tempHit.trim3(),
wlm,
prm,
swm,
him,
rnd,
sink,
alignMate,
dep + 1);
maxsc = max<int64_t>(maxsc, tmp_maxsc);
} else {
this->_local_genomeHits[dep].push_back(tempHit);
}
}
}
}
if(maxsc >= prev_score - sc.mmpMax) success = true;
if(!success &&
(him.localindexatts >= this->max_localindexatts || count == max_count || hGFM->prevLocalGFM(lGFM) == NULL)) {
for(index_t ti = 0; ti < this->_local_genomeHits[dep].size(); ti++) {
GenomeHit<index_t>& tempHit = this->_local_genomeHits[dep][ti];
int64_t minsc = this->_minsc[rdi];
if(!rp.secondary) {
if(rdi == 0) minsc = max(minsc, sink.bestUnp1() - cushion);
else minsc = max(minsc, sink.bestUnp2() - cushion);
}
if(tempHit.score() >= minsc) {
int64_t tmp_maxsc = hybridSearch_recur(
sc,
pepol,
tpol,
gpol,
gfm,
altdb,
repeatdb,
ref,
swa,
ssdb,
rdi,
tempHit,
tempHit.rdoff(),
tempHit.len() + tempHit.trim3(),
wlm,
prm,
swm,
him,
rnd,
sink,
alignMate,
dep + 1);
maxsc = max<int64_t>(maxsc, tmp_maxsc);
}
}
}
} // while(!success && count++ < 2)
if(!success) {
if(hitoff > this->_minK &&
him.localindexatts < this->max_localindexatts) {
index_t extlen = 0;
index_t top = (index_t)INDEX_MAX, bot = (index_t)INDEX_MAX;
index_t node_top = (index_t)INDEX_MAX, node_bot = (index_t)INDEX_MAX;
this->_node_iedge_count.clear();
index_t extoff = hitoff - 1;
bool uniqueStop = true;
// perform global search for long introns
index_t nelt = this->globalGFMSearch(
gfm, // GFM index
rd, // read to align
sc, // scoring scheme
sink.reportingParams(),
hit.fw(),
extoff,
extlen,
top,
bot,
node_top,
node_bot,
this->_node_iedge_count,
rnd,
uniqueStop);
if(nelt > 0 && nelt <= 5 && extlen >= this->_minK) {
coords.clear();
bool straddled = false;
this->getGenomeCoords(
gfm,
altdb,
ref,
rnd,
top,
bot,
node_top,
node_bot,
this->_node_iedge_count,
hit.fw(),
bot - top,
extoff + 1 - extlen,
extlen,
coords,
wlm,
prm,
him,
true, // reject straddled?
straddled);
assert_leq(coords.size(), nelt);
if(coords.size() > 1) coords.sort();
for(int ri = (int)coords.size() - 1; ri >= 0; ri--) {
const Coord& coord = coords[ri];
GenomeHit<index_t> tempHit;
tempHit.init(coord.orient(),
extoff + 1 - extlen,
extlen,
0, // trim5
0, // trim3
(index_t)coord.ref(),
(index_t)coord.off(),
(index_t)coord.joinedOff(),
this->_sharedVars,
gfm.repeat());
if(!tempHit.adjustWithALT(*this->_rds[rdi], gfm, altdb, ref, gpol)) continue;
if(!tempHit.compatibleWith(hit, (index_t)tpol.minIntronLen(), (index_t)tpol.maxIntronLen(), tpol.no_spliced_alignment())) continue;
if(uniqueStop) {
assert_eq(coords.size(), 1);
index_t leftext = (index_t)INDEX_MAX, rightext = (index_t)0;
tempHit.extend(
rd,
gfm,
ref,
altdb,
repeatdb,
ssdb,
swa,
swm,
prm,
sc,
this->_minsc[rdi],
rnd,
(index_t)this->_minK_local,
tpol,
gpol,
leftext,
rightext);
}
int64_t minsc = this->_minsc[rdi];
bool combined = tempHit.combineWith(
hit,
rd,
gfm,
ref,
altdb,
repeatdb,
ssdb,
swa,
swm,
sc,
minsc,
rnd,
(index_t)this->_minK_local,
(index_t)tpol.minIntronLen(),
(index_t)tpol.maxIntronLen(),
tpol.minAnchorLen(),
tpol.minAnchorLen_noncan(),
gpol.maxAltsTried(),
NULL, // splice sites
tpol.no_spliced_alignment());
if(!rp.secondary) {
if(rdi == 0) minsc = max(minsc, sink.bestUnp1() - cushion);
else minsc = max(minsc, sink.bestUnp2() - cushion);
}
if(combined && tempHit.score() >= minsc) {
assert_eq(tempHit.trim5(), 0);
assert_leq(tempHit.rdoff() + tempHit.len() + tempHit.trim3(), rdlen);
int64_t tmp_maxsc = hybridSearch_recur(
sc,
pepol,
tpol,
gpol,
gfm,
altdb,
repeatdb,
ref,
swa,
ssdb,
rdi,
tempHit,
tempHit.rdoff(),
tempHit.len() + tempHit.trim3(),
wlm,
prm,
swm,
him,
rnd,
sink,
alignMate,
dep + 1);
maxsc = max<int64_t>(maxsc, tmp_maxsc);
}
}
}
}
GenomeHit<index_t> tempHit = hit;
index_t trimMax = (index_t)((tempHit.score() - max<int64_t>(maxsc, this->_minsc[rdi])) / sc.sc(0));
if(tempHit.rdoff() < trimMax) {
index_t trim5 = tempHit.rdoff();
GenomeHit<index_t> trimedHit = tempHit;
trimedHit.trim5(trim5,
rd,
ssdb,
sc,
(index_t)this->_minK_local,
(index_t)tpol.minIntronLen(),
(index_t)tpol.maxIntronLen(),
tpol.minAnchorLen(),
tpol.minAnchorLen_noncan(),
ref);
assert_leq(trimedHit.len() + trimedHit.trim5() + trimedHit.trim3(), rdlen);
int64_t tmp_score = trimedHit.score();
if(tmp_score > maxsc && tmp_score >= this->_minsc[rdi]) {
int64_t tmp_maxsc = hybridSearch_recur(
sc,
pepol,
tpol,
gpol,
gfm,
altdb,
repeatdb,
ref,
swa,
ssdb,
rdi,
trimedHit,
0,
trimedHit.len() + trimedHit.trim5() + trimedHit.trim3(),
wlm,
prm,
swm,
him,
rnd,
sink,
alignMate,
dep + 1);
maxsc = max<int64_t>(maxsc, tmp_maxsc);
// return maxsc;
}
}
// extend the partial alignment directly comparing with the corresponding genomic sequence
// with mismatches or a gap allowed
int64_t minsc = this->_minsc[rdi];
assert_geq(tempHit.score(), minsc);
index_t mm = (index_t)((tempHit.score() - minsc) / sc.mmpMax);
index_t leftext = (index_t)INDEX_MAX, rightext = (index_t)0;
index_t num_mismatch_allowed = 1;
if(hitoff <= this->_minK_local) {
num_mismatch_allowed = min<index_t>(tempHit.rdoff(), mm);
}
him.localextatts++;
tempHit.extend(
rd,
gfm,
ref,
altdb,
repeatdb,
ssdb,
swa,
swm,
prm,
sc,
this->_minsc[rdi],
rnd,
(index_t)this->_minK_local,
tpol,
gpol,
leftext,
rightext,
num_mismatch_allowed);
if(!rp.secondary) {
if(rdi == 0) minsc = max(minsc, sink.bestUnp1() - cushion);
else minsc = max(minsc, sink.bestUnp2() - cushion);
}
if(tempHit.score() >= minsc && leftext >= min<index_t>((index_t)this->_minK_local, hit.rdoff())) {
assert_eq(tempHit.trim5(), 0);
assert_leq(tempHit.rdoff() + tempHit.len() + tempHit.trim3(), rdlen);
int64_t tmp_maxsc = hybridSearch_recur(
sc,
pepol,
tpol,
gpol,
gfm,
altdb,
repeatdb,
ref,
swa,
ssdb,
rdi,
tempHit,
tempHit.rdoff(),
tempHit.len() + tempHit.trim3(),
wlm,
prm,
swm,
him,
rnd,
sink,
alignMate,
dep + 1);
maxsc = max<int64_t>(maxsc, tmp_maxsc);
} else if(hitoff > this->_minK_local) {
// skip some bases of a read
index_t jumplen = hitoff > this->_minK ? (index_t)this->_minK : (index_t)this->_minK_local;
assert_leq(hitoff, hit.rdoff());
int64_t expected_score = hit.score() - (hit.rdoff() - hitoff) / jumplen * sc.mmpMax - sc.mmpMax;
if(expected_score >= minsc) {
assert_lt(hitlen + jumplen, rdlen);
assert_eq(hit.trim5(), 0);
assert_leq(hitoff + hitlen, rdlen);
int64_t tmp_maxsc = hybridSearch_recur(
sc,
pepol,
tpol,
gpol,
gfm,
altdb,
repeatdb,
ref,
swa,
ssdb,
rdi,
hit,
hitoff - jumplen,
hitlen + jumplen,
wlm,
prm,
swm,
him,
rnd,
sink,
alignMate,
dep + 1);
maxsc = max<int64_t>(maxsc, tmp_maxsc);
}
}
}
} else {
// extend the partial alignment in the right direction
assert_lt(hitoff + hitlen, rdlen);
if(!ssdb.empty()) {
index_t fragoff = 0, fraglen = 0, right = 0;
hit.getRight(fragoff, fraglen, right);
const index_t minMatchLen = (index_t)this->_minK_local;
// make use of a list of known or novel splice sites to further align the read
if(fraglen >= minMatchLen &&
!hit.repeat() &&
!tpol.no_spliced_alignment()) {
spliceSites.clear();
assert_gt(fraglen, 0);
assert_leq(fragoff + fraglen, rdlen);
index_t right_unmapped_len = rdlen - fragoff - fraglen;
ssdb.getRightSpliceSites(hit.ref(), right + fraglen - minMatchLen, minMatchLen + min<index_t>(minMatchLen, right_unmapped_len), spliceSites);
for(size_t si = 0; si < spliceSites.size(); si++) {
const SpliceSite& ss = spliceSites[si];
if(!ss._fromfile && ss._readid + this->_thread_rids_mindist > rd.rdid) continue;
if(right > ss.left()) continue;
GenomeHit<index_t> tempHit;
assert_leq(right, ss.left());
index_t readoff = fragoff + ss.left() - right + 1;
if(readoff >= rdlen)
continue;
index_t joinedOff = 0;
bool success = gfm.textOffToJoined(hit.ref(), ss.right(), joinedOff);
if(!success) {
continue;
}
#ifndef NDEBUG
index_t debug_tid = 0, debug_toff = 0, debug_tlen = 0;
bool debug_straddled = false;
gfm.joinedToTextOff(1, // qlen
joinedOff,
debug_tid,
debug_toff,
debug_tlen,
false,
debug_straddled);
assert_eq(hit.ref(), debug_tid);
assert_eq(ss.right(), debug_toff);
#endif
tempHit.init(hit.fw(),
readoff,
0, // len
0, // trim5
0, // trim3
hit.ref(),
ss.right(),
joinedOff,
this->_sharedVars,
gfm.repeat());
index_t leftext = 0, rightext = rdlen - readoff;
tempHit.extend(rd,
gfm,
ref,
altdb,
repeatdb,
ssdb,
swa,
swm,
prm,
sc,
this->_minsc[rdi],
rnd,
(index_t)this->_minK_local,
tpol,
gpol,
leftext,
rightext);
if(tempHit.len() <= 0)
continue;
if(!hit.compatibleWith(tempHit, (index_t)tpol.minIntronLen(), (index_t)tpol.maxIntronLen(), tpol.no_spliced_alignment())) continue;
GenomeHit<index_t> combinedHit = hit;
int64_t minsc = this->_minsc[rdi];
bool combined = combinedHit.combineWith(
tempHit,
rd,
gfm,
ref,
altdb,
repeatdb,
ssdb,
swa,
swm,
sc,
minsc,
rnd,
(index_t)this->_minK_local,
(index_t)tpol.minIntronLen(),
(index_t)tpol.maxIntronLen(),
1,
1,
gpol.maxAltsTried(),
&ss,
tpol.no_spliced_alignment());
if(!rp.secondary) {
if(rdi == 0) minsc = max(minsc, sink.bestUnp1() - cushion);
else minsc = max(minsc, sink.bestUnp2() - cushion);
}
if(combined && combinedHit.score() >= minsc &&
// soft-clipping might be better
combinedHit.score() + sc.sc(0) * (rdlen - hit.rdoff() - hit.len() - hit.trim5()) >= hit.score()) {
assert_leq(combinedHit.trim5(), combinedHit.rdoff());
int64_t tmp_maxsc = hybridSearch_recur(
sc,
pepol,
tpol,
gpol,
gfm,
altdb,
repeatdb,
ref,
swa,
ssdb,
rdi,
combinedHit,
combinedHit.rdoff() - combinedHit.trim5(),
combinedHit.len() + combinedHit.trim5(),
wlm,
prm,
swm,
him,
rnd,
sink,
alignMate,
dep + 1);
maxsc = max<int64_t>(maxsc, tmp_maxsc);
}
}
}
}
bool use_localindex = true;
if(hit.len() == hitlen && hitoff + hitlen + this->_minK > rdlen) {
index_t leftext = (index_t)0, rightext = (index_t)INDEX_MAX;
GenomeHit<index_t> tempHit = hit;
tempHit.extend(
rd,
gfm,
ref,
altdb,
repeatdb,
ssdb,
swa,
swm,
prm,
sc,
this->_minsc[rdi],
rnd,
(index_t)this->_minK_local,
tpol,
gpol,
leftext,
rightext,
1);
if(tempHit.rdoff() + tempHit.len()== rdlen) {
use_localindex = false;
}
}
// Choose a local index based on the genomic location of the partial alignment
const HGFM<index_t, local_index_t>* hGFM = (const HGFM<index_t, local_index_t>*)(&gfm);
const LocalGFM<local_index_t, index_t>* lGFM = hGFM->getLocalGFM(hit.ref(), hit.refoff());
bool success = false, first = true;
index_t count = 0;
// Use at most two local indexes
const index_t max_count = 2;
int64_t prev_score = hit.score();
this->_local_genomeHits[dep].clear();
while(!success && count++ < max_count && use_localindex) {
if(him.localindexatts >= this->max_localindexatts) break;
if(first) {
first = false;
} else {
lGFM = hGFM->nextLocalGFM(lGFM);
if(lGFM == NULL || lGFM->empty()) break;
}
// local index search
index_t extlen = 0;
local_index_t top = (local_index_t)INDEX_MAX, bot = (local_index_t)INDEX_MAX;
local_index_t node_top = (local_index_t)INDEX_MAX, node_bot = (local_index_t)INDEX_MAX;
index_t extoff = hitoff + hitlen + (index_t)this->_minK_local;
if(extoff + 1 < rdlen) extoff += 1;
if(extoff >= rdlen) {
extoff = rdlen - 1;
}
index_t nelt = (index_t)INDEX_MAX;
index_t max_nelt = std::max<index_t>(5, extlen);
bool no_extension = false;
bool uniqueStop;
index_t minUniqueLen = (index_t)this->_minK_local;
index_t maxHitLen = max<index_t>(extoff - hitoff - hitlen, (index_t)this->_minK_local);
for(; maxHitLen < extoff + 1 && extoff < rdlen;) {
extlen = 0;
uniqueStop = false;
him.localindexatts++;
this->_local_node_iedge_count.clear();
nelt = this->localGFMSearch(
*lGFM, // GFM index
rd, // read to align
sc, // scoring scheme
sink.reportingParams(),
hit.fw(),
extoff,
extlen,
top,
bot,
node_top,
node_bot,
this->_local_node_iedge_count,
rnd,
uniqueStop,
minUniqueLen,
maxHitLen);
if(extoff < hitoff + hitlen) {
no_extension = true;
break;
}
if(nelt <= max_nelt) break;
if(extoff + 1 < rdlen) extoff++;
else {
if(extlen < maxHitLen) break;
else maxHitLen++;
}
}
assert_leq(node_top, node_bot);
assert_eq(nelt, (index_t)(node_bot - node_top));
assert_leq(extlen, extoff + 1);
assert_leq(extoff, rdlen);
if(nelt > 0 &&
nelt <= max_nelt &&
extlen >= tpol.minAnchorLen() &&
!no_extension) {
assert_leq(nelt, max_nelt);
coords.clear();
bool straddled = false;
// get genomic locations for this local search
this->getGenomeCoords_local(
*lGFM,
altdb,
ref,
rnd,
top,
bot,
node_top,
node_bot,
this->_local_node_iedge_count,
hit.fw(),
extoff + 1 - extlen,
extlen,
coords,
wlm,
prm,
him,
true, // reject straddled?
straddled);
assert_leq(coords.size(), nelt);
if(coords.size() > 1) coords.sort();
for(index_t ri = 0; ri < coords.size(); ri++) {
const Coord& coord = coords[ri];
GenomeHit<index_t> tempHit;
tempHit.init(coord.orient(),
extoff + 1 - extlen,
extlen,
0, // trim5
0, // trim3
(index_t)coord.ref(),
(index_t)coord.off(),
(index_t)coord.joinedOff(),
this->_sharedVars,
gfm.repeat());
if(!tempHit.adjustWithALT(*this->_rds[rdi], gfm, altdb, ref, gpol)) continue;
// check if the partial alignment is compatible with the new alignment using the local index
if(!hit.compatibleWith(tempHit, (index_t)tpol.minIntronLen(), (index_t)tpol.maxIntronLen(), tpol.no_spliced_alignment())) {
if(count == 1) continue;
else break;
}
index_t leftext = (index_t)0, rightext = (index_t)INDEX_MAX;
tempHit.extend(
rd,
gfm,
ref,
altdb,
repeatdb,
ssdb,
swa,
swm,
prm,
sc,
this->_minsc[rdi],
rnd,
(index_t)this->_minK_local,
tpol,
gpol,
leftext,
rightext);
GenomeHit<index_t> combinedHit = hit;
int64_t minsc = this->_minsc[rdi];
// combine the partial alignment and the new alignment
bool combined = combinedHit.combineWith(
tempHit,
rd,
gfm,
ref,
altdb,
repeatdb,
ssdb,
swa,
swm,
sc,
minsc,
rnd,
(index_t)this->_minK_local,
(index_t)tpol.minIntronLen(),
(index_t)tpol.maxIntronLen(),
tpol.minAnchorLen(),
tpol.minAnchorLen_noncan(),
gpol.maxAltsTried(),
NULL, // splice sites
tpol.no_spliced_alignment());
if(!rp.secondary) {
if(rdi == 0) minsc = max(minsc, sink.bestUnp1() - cushion);
else minsc = max(minsc, sink.bestUnp2() - cushion);
}
if(combined && combinedHit.score() >= minsc) {
assert_leq(combinedHit.trim5(), combinedHit.rdoff());
if(combinedHit.score() >= prev_score - sc.mmpMax) {
// extend the new partial alignment recursively
int64_t tmp_maxsc = hybridSearch_recur(
sc,
pepol,
tpol,
gpol,
gfm,
altdb,
repeatdb,
ref,
swa,
ssdb,
rdi,
combinedHit,
combinedHit.rdoff() - combinedHit.trim5(),
combinedHit.len() + combinedHit.trim5(),
wlm,
prm,
swm,
him,
rnd,
sink,
alignMate,
dep + 1);
maxsc = max<int64_t>(maxsc, tmp_maxsc);
} else {
this->_local_genomeHits[dep].push_back(combinedHit);
}
}
}
}
// int64_t minsc = (rdi == 0 ? sink.bestUnp1() : sink.bestUnp2());
if(maxsc >= prev_score - sc.mmpMax) success = true;
if(!success &&
(him.localindexatts >= this->max_localindexatts || count == max_count || hGFM->nextLocalGFM(lGFM) == NULL) ) {
for(index_t ti = 0; ti < this->_local_genomeHits[dep].size(); ti++) {
GenomeHit<index_t>& tempHit = this->_local_genomeHits[dep][ti];
int64_t minsc = this->_minsc[rdi];
if(!rp.secondary) {
if(rdi == 0) minsc = max(minsc, sink.bestUnp1() - cushion);
else minsc = max(minsc, sink.bestUnp2() - cushion);
}
if(tempHit.score() >= minsc) {
int64_t tmp_maxsc = hybridSearch_recur(
sc,
pepol,
tpol,
gpol,
gfm,
altdb,
repeatdb,
ref,
swa,
ssdb,
rdi,
tempHit,
tempHit.rdoff() - tempHit.trim5(),
tempHit.len() + tempHit.trim5(),
wlm,
prm,
swm,
him,
rnd,
sink,
alignMate,
dep + 1);
maxsc = max<int64_t>(maxsc, tmp_maxsc);
}
}
}
} // while(!success && count++ < 2)
if(!success) {
// perform global search for long introns
if(hitoff + hitlen + this->_minK + 1 < rdlen &&
him.localindexatts < this->max_localindexatts) {
index_t extlen = 0;
index_t top = (index_t)INDEX_MAX, bot = (index_t)INDEX_MAX;
index_t node_top = (index_t)INDEX_MAX, node_bot = (index_t)INDEX_MAX;
this->_node_iedge_count.clear();
index_t extoff = hitoff + hitlen + (index_t)this->_minK + 1;
bool uniqueStop = true;
index_t nelt = this->globalGFMSearch(
gfm, // GFM index
rd, // read to align
sc, // scoring scheme
sink.reportingParams(),
hit.fw(),
extoff,
extlen,
top,
bot,
node_top,
node_bot,
this->_node_iedge_count,
rnd,
uniqueStop);
if(nelt > 0 && nelt <= 5 && extlen >= this->_minK) {
coords.clear();
bool straddled = false;
this->getGenomeCoords(
gfm,
altdb,
ref,
rnd,
top,
bot,
node_top,
node_bot,
this->_node_iedge_count,
hit.fw(),
bot - top,
extoff + 1 - extlen,
extlen,
coords,
wlm,
prm,
him,
true, // reject straddled
straddled);
assert_leq(coords.size(), nelt);
coords.sort();
for(index_t ri = 0; ri < coords.size(); ri++) {
const Coord& coord = coords[ri];
GenomeHit<index_t> tempHit;
tempHit.init(coord.orient(),
extoff + 1 - extlen,
extlen,
0, // trim5
0, // trim3
(index_t)coord.ref(),
(index_t)coord.off(),
(index_t)coord.joinedOff(),
this->_sharedVars,
gfm.repeat());
if(!tempHit.adjustWithALT(*this->_rds[rdi], gfm, altdb, ref, gpol)) continue;
if(!hit.compatibleWith(tempHit, (index_t)tpol.minIntronLen(), (index_t)tpol.maxIntronLen(), tpol.no_spliced_alignment())) continue;
index_t leftext = (index_t)0, rightext = (index_t)INDEX_MAX;
tempHit.extend(
rd,
gfm,
ref,
altdb,
repeatdb,
ssdb,
swa,
swm,
prm,
sc,
this->_minsc[rdi],
rnd,
(index_t)this->_minK_local,
tpol,
gpol,
leftext,
rightext);
GenomeHit<index_t> combinedHit = hit;
int64_t minsc = this->_minsc[rdi];
bool combined = combinedHit.combineWith(
tempHit,
rd,
gfm,
ref,
altdb,
repeatdb,
ssdb,
swa,
swm,
sc,
minsc,
rnd,
(index_t)this->_minK_local,
(index_t)tpol.minIntronLen(),
(index_t)tpol.maxIntronLen(),
tpol.minAnchorLen(),
tpol.minAnchorLen_noncan(),
gpol.maxAltsTried(),
NULL, // splice sites
tpol.no_spliced_alignment());
if(!rp.secondary) {
if(rdi == 0) minsc = max(minsc, sink.bestUnp1() - cushion);
else minsc = max(minsc, sink.bestUnp2() - cushion);
}
if(combined && combinedHit.score() >= minsc) {
assert_leq(combinedHit.trim5(), combinedHit.rdoff());
int64_t tmp_maxsc = hybridSearch_recur(
sc,
pepol,
tpol,
gpol,
gfm,
altdb,
repeatdb,
ref,
swa,
ssdb,
rdi,
combinedHit,
combinedHit.rdoff() - combinedHit.trim5(),
combinedHit.len() + combinedHit.trim5(),
wlm,
prm,
swm,
him,
rnd,
sink,
alignMate,
dep + 1);
maxsc = max<int64_t>(maxsc, tmp_maxsc);
}
}
}
}
GenomeHit<index_t> tempHit = hit;
assert(tempHit.trim5() == 0 || hitoff == 0);
index_t trimLen = rdlen - hitoff - tempHit.len() - tempHit.trim5();
index_t trimMax = (index_t)((tempHit.score() - max<int64_t>(maxsc, this->_minsc[rdi])) / sc.sc(0));
if(trimLen < trimMax) {
index_t trim3 = rdlen - hitoff - tempHit.len() - tempHit.trim5();
GenomeHit<index_t> trimedHit = tempHit;
trimedHit.trim3(trim3,
rd,
ssdb,
sc,
(index_t)this->_minK_local,
(index_t)tpol.minIntronLen(),
(index_t)tpol.maxIntronLen(),
tpol.minAnchorLen(),
tpol.minAnchorLen_noncan(),
ref);
assert_leq(trimedHit.trim5(), trimedHit.rdoff());
assert_leq(trimedHit.len() + trimedHit.trim5() + trimedHit.trim3(), rdlen);
int64_t tmp_score = trimedHit.score();
if(tmp_score > maxsc && tmp_score >= this->_minsc[rdi]) {
int64_t tmp_maxsc = hybridSearch_recur(
sc,
pepol,
tpol,
gpol,
gfm,
altdb,
repeatdb,
ref,
swa,
ssdb,
rdi,
trimedHit,
trimedHit.rdoff() - trimedHit.trim5(),
trimedHit.len() + trimedHit.trim5() + trimedHit.trim3(),
wlm,
prm,
swm,
him,
rnd,
sink,
alignMate,
dep + 1);
maxsc = max<int64_t>(maxsc, tmp_maxsc);
// return maxsc;
}
}
// extend the partial alignment directly comparing with the corresponding genomic sequence
// with mismatches or a gap allowed
int64_t minsc = this->_minsc[rdi];
assert_geq(tempHit.score(), minsc);
index_t leftext = (index_t)0, rightext = (index_t)INDEX_MAX;
index_t mm = (index_t)((tempHit.score() - minsc) / sc.mmpMax);
index_t num_mismatch_allowed = 1;
if(rdlen - hitoff - hitlen <= this->_minK_local) {
num_mismatch_allowed = min<index_t>(rdlen - tempHit.rdoff() - tempHit.len(), mm);
}
him.localextatts++;
tempHit.extend(
rd,
gfm,
ref,
altdb,
repeatdb,
ssdb,
swa,
swm,
prm,
sc,
this->_minsc[rdi],
rnd,
(index_t)this->_minK_local,
tpol,
gpol,
leftext,
rightext,
num_mismatch_allowed);
if(!rp.secondary) {
if(rdi == 0) minsc = max(minsc, sink.bestUnp1() - cushion);
else minsc = max(minsc, sink.bestUnp2() - cushion);
}
if(tempHit.score() >= minsc && rightext >= min<index_t>((index_t)this->_minK_local, rdlen - hit.len() - hit.rdoff())) {
assert_eq(tempHit.trim3(), 0);
assert_leq(tempHit.trim5(), tempHit.rdoff());
int64_t tmp_maxsc = hybridSearch_recur(
sc,
pepol,
tpol,
gpol,
gfm,
altdb,
repeatdb,
ref,
swa,
ssdb,
rdi,
tempHit,
tempHit.rdoff() - tempHit.trim5(),
tempHit.len() + tempHit.trim5(),
wlm,
prm,
swm,
him,
rnd,
sink,
alignMate,
dep + 1);
maxsc = max<int64_t>(maxsc, tmp_maxsc);
} else if(hitoff + hitlen + this->_minK_local < rdlen) {
// skip some bases of a read
index_t jumplen = hitoff + hitlen + this->_minK < rdlen ? (index_t)this->_minK : (index_t)this->_minK_local;
assert_lt(hitoff + hitlen + jumplen, rdlen);
assert_leq(hit.len(), hitlen);
int64_t expected_score = hit.score() - (hitlen - hit.len()) / jumplen * sc.mmpMax - sc.mmpMax;
if(expected_score >= minsc) {
assert_eq(hit.trim3(), 0);
int64_t tmp_maxsc = hybridSearch_recur(
sc,
pepol,
tpol,
gpol,
gfm,
altdb,
repeatdb,
ref,
swa,
ssdb,
rdi,
hit,
hitoff,
hitlen + jumplen,
wlm,
prm,
swm,
him,
rnd,
sink,
alignMate,
dep + 1);
maxsc = max<int64_t>(maxsc, tmp_maxsc);
}
}
}
}
return maxsc;
}
#endif /*SPLICED_ALIGNER_H_*/