1773 lines
53 KiB
C++
1773 lines
53 KiB
C++
|
/*
 * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
 *
 * This file is part of Bowtie 2.
 *
 * Bowtie 2 is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Bowtie 2 is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
 */
|
||
|
|
||
|
#include "aligner_bt.h"
|
||
|
#include "mask.h"
|
||
|
|
||
|
using namespace std;
|
||
|
|
||
|
/**
 * Record that the backtrace is entering cell (rowc, colc).
 *
 * If both coordinates are non-negative, the row is inserted into the
 * per-column set sawcell_[colc].  When the insert reports that the row was
 * already present, `abort` is set to true and the *enclosing function
 * returns immediately*.  Cells with a negative coordinate are ignored.
 *
 * The macro is not self-contained: it must be expanded inside a function
 * returning void that has `sawcell_`, `abort`, `local`, `prob_` and `hefc`
 * in scope.  Both arguments are evaluated more than once, so they must be
 * free of side effects.
 *
 * The body is wrapped in do { ... } while(0) so that the expansion plus the
 * caller's trailing ';' forms exactly one statement (safe in un-braced
 * if/else), and the arguments are parenthesized so that arbitrary
 * expressions can be passed.
 */
#define CHECK_ROW_COL(rowc, colc) \
	do { \
		if((rowc) >= 0 && (colc) >= 0) { \
			if(!sawcell_[(colc)].insert((rowc))) { \
				/* was already in there */ \
				abort = true; \
				return; \
			} \
			assert(local || prob_.cper_->debugCell((rowc), (colc), hefc)); \
		} \
	} while(0)
|
||
|
|
||
|
/**
|
||
|
* Fill in a triangle of the DP table and backtrace from the given cell to
|
||
|
* a cell in the previous checkpoint, or to the terminal cell.
|
||
|
*/
|
||
|
void BtBranchTracer::triangleFill(
|
||
|
int64_t rw, // row of cell to backtrace from
|
||
|
int64_t cl, // column of cell to backtrace from
|
||
|
int hef, // cell to backtrace from is H (0), E (1), or F (2)
|
||
|
TAlScore targ, // score of cell to backtrace from
|
||
|
TAlScore targ_final, // score of alignment we're looking for
|
||
|
RandomSource& rnd, // pseudo-random generator
|
||
|
int64_t& row_new, // out: row we ended up in after backtrace
|
||
|
int64_t& col_new, // out: column we ended up in after backtrace
|
||
|
int& hef_new, // out: H/E/F after backtrace
|
||
|
TAlScore& targ_new, // out: score up to cell we ended up in
|
||
|
bool& done, // out: finished tracing out an alignment?
|
||
|
bool& abort) // out: aborted b/c cell was seen before?
|
||
|
{
|
||
|
assert_geq(rw, 0);
|
||
|
assert_geq(cl, 0);
|
||
|
assert_range(0, 2, hef);
|
||
|
assert_lt(rw, (int64_t)prob_.qrylen_);
|
||
|
assert_lt(cl, (int64_t)prob_.reflen_);
|
||
|
assert(prob_.usecp_ && prob_.fill_);
|
||
|
int64_t row = rw, col = cl;
|
||
|
const int64_t colmin = 0;
|
||
|
const int64_t rowmin = 0;
|
||
|
const int64_t colmax = prob_.reflen_ - 1;
|
||
|
const int64_t rowmax = prob_.qrylen_ - 1;
|
||
|
assert_leq(prob_.reflen_, (TRefOff)sawcell_.size());
|
||
|
assert_leq(col, (int64_t)prob_.cper_->hicol());
|
||
|
assert_geq(col, (int64_t)prob_.cper_->locol());
|
||
|
assert_geq(prob_.cper_->per(), 2);
|
||
|
size_t mod = (row + col) & prob_.cper_->lomask();
|
||
|
assert_lt(mod, prob_.cper_->per());
|
||
|
// Allocate room for diags
|
||
|
size_t depth = mod+1;
|
||
|
assert_leq(depth, prob_.cper_->per());
|
||
|
size_t breadth = depth;
|
||
|
tri_.resize(depth);
|
||
|
// Allocate room for each diag
|
||
|
for(size_t i = 0; i < depth; i++) {
|
||
|
tri_[i].resize(breadth - i);
|
||
|
}
|
||
|
bool upperleft = false;
|
||
|
size_t off = (row + col) >> prob_.cper_->perpow2();
|
||
|
if(off == 0) {
|
||
|
upperleft = true;
|
||
|
} else {
|
||
|
off--;
|
||
|
}
|
||
|
const TAlScore sc_rdo = prob_.sc_->readGapOpen();
|
||
|
const TAlScore sc_rde = prob_.sc_->readGapExtend();
|
||
|
const TAlScore sc_rfo = prob_.sc_->refGapOpen();
|
||
|
const TAlScore sc_rfe = prob_.sc_->refGapExtend();
|
||
|
const bool local = !prob_.sc_->monotone;
|
||
|
int64_t row_lo = row - (int64_t)mod;
|
||
|
const CpQuad *prev2 = NULL, *prev1 = NULL;
|
||
|
if(!upperleft) {
|
||
|
// Read-only pointer to cells in diagonal -2. Start one row above the
|
||
|
// target row.
|
||
|
prev2 = prob_.cper_->qdiag1sPtr() + (off * prob_.cper_->nrow() + row_lo - 1);
|
||
|
// Read-only pointer to cells in diagonal -1. Start one row above the
|
||
|
// target row
|
||
|
prev1 = prob_.cper_->qdiag2sPtr() + (off * prob_.cper_->nrow() + row_lo - 1);
|
||
|
#ifndef NDEBUG
|
||
|
if(row >= (int64_t)mod) {
|
||
|
size_t rowc = row - mod, colc = col;
|
||
|
if(rowc > 0 && prob_.cper_->isCheckpointed(rowc-1, colc)) {
|
||
|
TAlScore al = prev1[0].sc[0];
|
||
|
if(al == MIN_I16) al = MIN_I64;
|
||
|
assert_eq(prob_.cper_->scoreTriangle(rowc-1, colc, 0), al);
|
||
|
}
|
||
|
if(rowc > 0 && colc > 0 && prob_.cper_->isCheckpointed(rowc-1, colc-1)) {
|
||
|
TAlScore al = prev2[0].sc[0];
|
||
|
if(al == MIN_I16) al = MIN_I64;
|
||
|
assert_eq(prob_.cper_->scoreTriangle(rowc-1, colc-1, 0), al);
|
||
|
}
|
||
|
}
|
||
|
#endif
|
||
|
}
|
||
|
// Pointer to cells in current diagonal
|
||
|
// For each diagonal we need to fill in
|
||
|
for(size_t i = 0; i < depth; i++) {
|
||
|
CpQuad * cur = tri_[i].ptr();
|
||
|
CpQuad * curc = cur;
|
||
|
size_t doff = mod - i; // # diagonals we are away from target diag
|
||
|
//assert_geq(row, (int64_t)doff);
|
||
|
int64_t rowc = row - doff;
|
||
|
int64_t colc = col;
|
||
|
size_t neval = 0; // # cells evaluated in this diag
|
||
|
ASSERT_ONLY(const CpQuad *last = NULL);
|
||
|
// Fill this diagonal from upper right to lower left
|
||
|
for(size_t j = 0; j < breadth; j++) {
|
||
|
if(rowc >= rowmin && rowc <= rowmax &&
|
||
|
colc >= colmin && colc <= colmax)
|
||
|
{
|
||
|
neval++;
|
||
|
int64_t fromend = prob_.qrylen_ - rowc - 1;
|
||
|
bool allowGaps = fromend >= prob_.sc_->gapbar && rowc >= prob_.sc_->gapbar;
|
||
|
// Fill this cell
|
||
|
// Some things we might want to calculate about this cell up front:
|
||
|
// 1. How many matches are possible from this cell to the cell in
|
||
|
// row, col, in case this allows us to prune
|
||
|
// Get character from read
|
||
|
int qc = prob_.qry_[rowc];
|
||
|
// Get quality value from read
|
||
|
int qq = prob_.qual_[rowc];
|
||
|
assert_geq(qq, 33);
|
||
|
// Get character from reference
|
||
|
int rc = prob_.ref_[colc];
|
||
|
assert_range(0, 16, rc);
|
||
|
int16_t sc_diag = prob_.sc_->score(qc, rc, qq - 33);
|
||
|
int16_t sc_h_up = MIN_I16;
|
||
|
int16_t sc_f_up = MIN_I16;
|
||
|
int16_t sc_h_lf = MIN_I16;
|
||
|
int16_t sc_e_lf = MIN_I16;
|
||
|
if(allowGaps) {
|
||
|
if(rowc > 0) {
|
||
|
assert(local || prev1[j+0].sc[2] < 0);
|
||
|
if(prev1[j+0].sc[0] > MIN_I16) {
|
||
|
sc_h_up = prev1[j+0].sc[0] - sc_rfo;
|
||
|
if(local) sc_h_up = max<int16_t>(sc_h_up, 0);
|
||
|
}
|
||
|
if(prev1[j+0].sc[2] > MIN_I16) {
|
||
|
sc_f_up = prev1[j+0].sc[2] - sc_rfe;
|
||
|
if(local) sc_f_up = max<int16_t>(sc_f_up, 0);
|
||
|
}
|
||
|
#ifndef NDEBUG
|
||
|
TAlScore hup = prev1[j+0].sc[0];
|
||
|
TAlScore fup = prev1[j+0].sc[2];
|
||
|
if(hup == MIN_I16) hup = MIN_I64;
|
||
|
if(fup == MIN_I16) fup = MIN_I64;
|
||
|
if(local) {
|
||
|
hup = max<int16_t>(hup, 0);
|
||
|
fup = max<int16_t>(fup, 0);
|
||
|
}
|
||
|
if(prob_.cper_->isCheckpointed(rowc-1, colc)) {
|
||
|
assert_eq(hup, prob_.cper_->scoreTriangle(rowc-1, colc, 0));
|
||
|
assert_eq(fup, prob_.cper_->scoreTriangle(rowc-1, colc, 2));
|
||
|
}
|
||
|
#endif
|
||
|
}
|
||
|
if(colc > 0) {
|
||
|
assert(local || prev1[j+1].sc[1] < 0);
|
||
|
if(prev1[j+1].sc[0] > MIN_I16) {
|
||
|
sc_h_lf = prev1[j+1].sc[0] - sc_rdo;
|
||
|
if(local) sc_h_lf = max<int16_t>(sc_h_lf, 0);
|
||
|
}
|
||
|
if(prev1[j+1].sc[1] > MIN_I16) {
|
||
|
sc_e_lf = prev1[j+1].sc[1] - sc_rde;
|
||
|
if(local) sc_e_lf = max<int16_t>(sc_e_lf, 0);
|
||
|
}
|
||
|
#ifndef NDEBUG
|
||
|
TAlScore hlf = prev1[j+1].sc[0];
|
||
|
TAlScore elf = prev1[j+1].sc[1];
|
||
|
if(hlf == MIN_I16) hlf = MIN_I64;
|
||
|
if(elf == MIN_I16) elf = MIN_I64;
|
||
|
if(local) {
|
||
|
hlf = max<int16_t>(hlf, 0);
|
||
|
elf = max<int16_t>(elf, 0);
|
||
|
}
|
||
|
if(prob_.cper_->isCheckpointed(rowc, colc-1)) {
|
||
|
assert_eq(hlf, prob_.cper_->scoreTriangle(rowc, colc-1, 0));
|
||
|
assert_eq(elf, prob_.cper_->scoreTriangle(rowc, colc-1, 1));
|
||
|
}
|
||
|
#endif
|
||
|
}
|
||
|
}
|
||
|
assert(rowc <= 1 || colc <= 0 || prev2 != NULL);
|
||
|
int16_t sc_h_dg = ((rowc > 0 && colc > 0) ? prev2[j+0].sc[0] : 0);
|
||
|
if(colc == 0 && rowc > 0 && !local) {
|
||
|
sc_h_dg = MIN_I16;
|
||
|
}
|
||
|
if(sc_h_dg > MIN_I16) {
|
||
|
sc_h_dg += sc_diag;
|
||
|
}
|
||
|
if(local) sc_h_dg = max<int16_t>(sc_h_dg, 0);
|
||
|
// cerr << sc_diag << " " << sc_h_dg << " " << sc_h_up << " " << sc_f_up << " " << sc_h_lf << " " << sc_e_lf << endl;
|
||
|
int mask = 0;
|
||
|
// Calculate best ways into H, E, F cells starting with H.
|
||
|
// Mask bits:
|
||
|
// H: 1=diag, 2=hhoriz, 4=ehoriz, 8=hvert, 16=fvert
|
||
|
// E: 32=hhoriz, 64=ehoriz
|
||
|
// F: 128=hvert, 256=fvert
|
||
|
int16_t sc_best = sc_h_dg;
|
||
|
if(sc_h_dg > MIN_I64) {
|
||
|
mask = 1;
|
||
|
}
|
||
|
if(colc > 0 && sc_h_lf >= sc_best && sc_h_lf > MIN_I64) {
|
||
|
if(sc_h_lf > sc_best) mask = 0;
|
||
|
mask |= 2;
|
||
|
sc_best = sc_h_lf;
|
||
|
}
|
||
|
if(colc > 0 && sc_e_lf >= sc_best && sc_e_lf > MIN_I64) {
|
||
|
if(sc_e_lf > sc_best) mask = 0;
|
||
|
mask |= 4;
|
||
|
sc_best = sc_e_lf;
|
||
|
}
|
||
|
if(rowc > 0 && sc_h_up >= sc_best && sc_h_up > MIN_I64) {
|
||
|
if(sc_h_up > sc_best) mask = 0;
|
||
|
mask |= 8;
|
||
|
sc_best = sc_h_up;
|
||
|
}
|
||
|
if(rowc > 0 && sc_f_up >= sc_best && sc_f_up > MIN_I64) {
|
||
|
if(sc_f_up > sc_best) mask = 0;
|
||
|
mask |= 16;
|
||
|
sc_best = sc_f_up;
|
||
|
}
|
||
|
// Calculate best way into E cell
|
||
|
int16_t sc_e_best = sc_h_lf;
|
||
|
if(colc > 0) {
|
||
|
if(sc_h_lf >= sc_e_lf && sc_h_lf > MIN_I64) {
|
||
|
if(sc_h_lf == sc_e_lf) {
|
||
|
mask |= 64;
|
||
|
}
|
||
|
mask |= 32;
|
||
|
} else if(sc_e_lf > MIN_I64) {
|
||
|
sc_e_best = sc_e_lf;
|
||
|
mask |= 64;
|
||
|
}
|
||
|
}
|
||
|
if(sc_e_best > sc_best) {
|
||
|
sc_best = sc_e_best;
|
||
|
mask &= ~31; // don't go diagonal
|
||
|
}
|
||
|
// Calculate best way into F cell
|
||
|
int16_t sc_f_best = sc_h_up;
|
||
|
if(rowc > 0) {
|
||
|
if(sc_h_up >= sc_f_up && sc_h_up > MIN_I64) {
|
||
|
if(sc_h_up == sc_f_up) {
|
||
|
mask |= 256;
|
||
|
}
|
||
|
mask |= 128;
|
||
|
} else if(sc_f_up > MIN_I64) {
|
||
|
sc_f_best = sc_f_up;
|
||
|
mask |= 256;
|
||
|
}
|
||
|
}
|
||
|
if(sc_f_best > sc_best) {
|
||
|
sc_best = sc_f_best;
|
||
|
mask &= ~127; // don't go horizontal or diagonal
|
||
|
}
|
||
|
// Install results in cur
|
||
|
assert(!prob_.sc_->monotone || sc_best <= 0);
|
||
|
assert(!prob_.sc_->monotone || sc_e_best <= 0);
|
||
|
assert(!prob_.sc_->monotone || sc_f_best <= 0);
|
||
|
curc->sc[0] = sc_best;
|
||
|
assert( local || sc_e_best < 0);
|
||
|
assert( local || sc_f_best < 0);
|
||
|
assert(!local || sc_e_best >= 0 || sc_e_best == MIN_I16);
|
||
|
assert(!local || sc_f_best >= 0 || sc_f_best == MIN_I16);
|
||
|
curc->sc[1] = sc_e_best;
|
||
|
curc->sc[2] = sc_f_best;
|
||
|
curc->sc[3] = mask;
|
||
|
// cerr << curc->sc[0] << " " << curc->sc[1] << " " << curc->sc[2] << " " << curc->sc[3] << endl;
|
||
|
ASSERT_ONLY(last = curc);
|
||
|
#ifndef NDEBUG
|
||
|
if(prob_.cper_->isCheckpointed(rowc, colc)) {
|
||
|
if(local) {
|
||
|
sc_e_best = max<int16_t>(sc_e_best, 0);
|
||
|
sc_f_best = max<int16_t>(sc_f_best, 0);
|
||
|
}
|
||
|
TAlScore sc_best64 = sc_best; if(sc_best == MIN_I16) sc_best64 = MIN_I64;
|
||
|
TAlScore sc_e_best64 = sc_e_best; if(sc_e_best == MIN_I16) sc_e_best64 = MIN_I64;
|
||
|
TAlScore sc_f_best64 = sc_f_best; if(sc_f_best == MIN_I16) sc_f_best64 = MIN_I64;
|
||
|
assert_eq(prob_.cper_->scoreTriangle(rowc, colc, 0), sc_best64);
|
||
|
assert_eq(prob_.cper_->scoreTriangle(rowc, colc, 1), sc_e_best64);
|
||
|
assert_eq(prob_.cper_->scoreTriangle(rowc, colc, 2), sc_f_best64);
|
||
|
}
|
||
|
#endif
|
||
|
}
|
||
|
// Update row, col
|
||
|
assert_lt(rowc, (int64_t)prob_.qrylen_);
|
||
|
rowc++;
|
||
|
colc--;
|
||
|
curc++;
|
||
|
} // for(size_t j = 0; j < breadth; j++)
|
||
|
if(i == depth-1) {
|
||
|
// Final iteration
|
||
|
assert(last != NULL);
|
||
|
assert_eq(1, neval);
|
||
|
assert_neq(0, last->sc[3]);
|
||
|
assert_eq(targ, last->sc[hef]);
|
||
|
} else {
|
||
|
breadth--;
|
||
|
prev2 = prev1 + 1;
|
||
|
prev1 = cur;
|
||
|
}
|
||
|
} // for(size_t i = 0; i < depth; i++)
|
||
|
//
|
||
|
// Now backtrack through the triangle. Abort as soon as we enter a cell
|
||
|
// that was visited by a previous backtrace.
|
||
|
//
|
||
|
int64_t rowc = row, colc = col;
|
||
|
size_t curid;
|
||
|
int hefc = hef;
|
||
|
if(bs_.empty()) {
|
||
|
// Start an initial branch
|
||
|
CHECK_ROW_COL(rowc, colc);
|
||
|
curid = bs_.alloc();
|
||
|
assert_eq(0, curid);
|
||
|
Edit e;
|
||
|
bs_[curid].init(
|
||
|
prob_,
|
||
|
0, // parent ID
|
||
|
0, // penalty
|
||
|
0, // score_en
|
||
|
rowc, // row
|
||
|
colc, // col
|
||
|
e, // edit
|
||
|
0, // hef
|
||
|
true, // I am the root
|
||
|
false); // don't try to extend with exact matches
|
||
|
bs_[curid].len_ = 0;
|
||
|
} else {
|
||
|
curid = bs_.size()-1;
|
||
|
}
|
||
|
size_t idx_orig = (row + col) >> prob_.cper_->perpow2();
|
||
|
while(true) {
|
||
|
// What depth are we?
|
||
|
size_t mod = (rowc + colc) & prob_.cper_->lomask();
|
||
|
assert_lt(mod, prob_.cper_->per());
|
||
|
CpQuad * cur = tri_[mod].ptr();
|
||
|
int64_t row_off = rowc - row_lo - mod;
|
||
|
assert(!local || cur[row_off].sc[0] > 0);
|
||
|
assert_geq(row_off, 0);
|
||
|
int mask = cur[row_off].sc[3];
|
||
|
assert_gt(mask, 0);
|
||
|
int sel = -1;
|
||
|
// Select what type of move to make, which depends on whether we're
|
||
|
// currently in H, E, F:
|
||
|
if(hefc == 0) {
|
||
|
if( (mask & 1) != 0) {
|
||
|
// diagonal
|
||
|
sel = 0;
|
||
|
} else if((mask & 8) != 0) {
|
||
|
// up to H
|
||
|
sel = 3;
|
||
|
} else if((mask & 16) != 0) {
|
||
|
// up to F
|
||
|
sel = 4;
|
||
|
} else if((mask & 2) != 0) {
|
||
|
// left to H
|
||
|
sel = 1;
|
||
|
} else if((mask & 4) != 0) {
|
||
|
// left to E
|
||
|
sel = 2;
|
||
|
}
|
||
|
} else if(hefc == 1) {
|
||
|
if( (mask & 32) != 0) {
|
||
|
// left to H
|
||
|
sel = 5;
|
||
|
} else if((mask & 64) != 0) {
|
||
|
// left to E
|
||
|
sel = 6;
|
||
|
}
|
||
|
} else {
|
||
|
assert_eq(2, hefc);
|
||
|
if( (mask & 128) != 0) {
|
||
|
// up to H
|
||
|
sel = 7;
|
||
|
} else if((mask & 256) != 0) {
|
||
|
// up to F
|
||
|
sel = 8;
|
||
|
}
|
||
|
}
|
||
|
assert_geq(sel, 0);
|
||
|
// Get character from read
|
||
|
int qc = prob_.qry_[rowc], qq = prob_.qual_[rowc];
|
||
|
// Get character from reference
|
||
|
int rc = prob_.ref_[colc];
|
||
|
assert_range(0, 16, rc);
|
||
|
// Now that we know what type of move to make, make it, updating our
|
||
|
// row and column and moving updating the branch.
|
||
|
if(sel == 0) {
|
||
|
assert_geq(rowc, 0);
|
||
|
assert_geq(colc, 0);
|
||
|
TAlScore scd = prob_.sc_->score(qc, rc, qq - 33);
|
||
|
if((rc & (1 << qc)) == 0) {
|
||
|
// Mismatch
|
||
|
size_t id = curid;
|
||
|
// Check if the previous branch was the initial (bottommost)
|
||
|
// branch with no matches. If so, the mismatch should be added
|
||
|
// to the initial branch, instead of starting a new branch.
|
||
|
bool empty = (bs_[curid].len_ == 0 && curid == 0);
|
||
|
if(!empty) {
|
||
|
id = bs_.alloc();
|
||
|
}
|
||
|
Edit e((int)rowc, mask2dna[rc], "ACGTN"[qc], EDIT_TYPE_MM);
|
||
|
assert_lt(scd, 0);
|
||
|
TAlScore score_en = bs_[curid].score_st_ + scd;
|
||
|
bs_[id].init(
|
||
|
prob_,
|
||
|
curid, // parent ID
|
||
|
-scd, // penalty
|
||
|
score_en, // score_en
|
||
|
rowc, // row
|
||
|
colc, // col
|
||
|
e, // edit
|
||
|
hefc, // hef
|
||
|
empty, // root?
|
||
|
false); // don't try to extend with exact matches
|
||
|
//assert(!local || bs_[id].score_st_ >= 0);
|
||
|
curid = id;
|
||
|
} else {
|
||
|
// Match
|
||
|
bs_[curid].score_st_ += prob_.sc_->match();
|
||
|
bs_[curid].len_++;
|
||
|
assert_leq((int64_t)bs_[curid].len_, bs_[curid].row_ + 1);
|
||
|
}
|
||
|
rowc--;
|
||
|
colc--;
|
||
|
assert(local || bs_[curid].score_st_ >= targ_final);
|
||
|
hefc = 0;
|
||
|
} else if((sel >= 1 && sel <= 2) || (sel >= 5 && sel <= 6)) {
|
||
|
assert_gt(colc, 0);
|
||
|
// Read gap
|
||
|
size_t id = bs_.alloc();
|
||
|
Edit e((int)rowc+1, mask2dna[rc], '-', EDIT_TYPE_READ_GAP);
|
||
|
TAlScore gapp = prob_.sc_->readGapOpen();
|
||
|
if(bs_[curid].len_ == 0 && bs_[curid].e_.inited() && bs_[curid].e_.isReadGap()) {
|
||
|
gapp = prob_.sc_->readGapExtend();
|
||
|
}
|
||
|
TAlScore score_en = bs_[curid].score_st_ - gapp;
|
||
|
bs_[id].init(
|
||
|
prob_,
|
||
|
curid, // parent ID
|
||
|
gapp, // penalty
|
||
|
score_en, // score_en
|
||
|
rowc, // row
|
||
|
colc-1, // col
|
||
|
e, // edit
|
||
|
hefc, // hef
|
||
|
false, // root?
|
||
|
false); // don't try to extend with exact matches
|
||
|
colc--;
|
||
|
curid = id;
|
||
|
assert( local || bs_[curid].score_st_ >= targ_final);
|
||
|
//assert(!local || bs_[curid].score_st_ >= 0);
|
||
|
if(sel == 1 || sel == 5) {
|
||
|
hefc = 0;
|
||
|
} else {
|
||
|
hefc = 1;
|
||
|
}
|
||
|
} else {
|
||
|
assert_gt(rowc, 0);
|
||
|
// Reference gap
|
||
|
size_t id = bs_.alloc();
|
||
|
Edit e((int)rowc, '-', "ACGTN"[qc], EDIT_TYPE_REF_GAP);
|
||
|
TAlScore gapp = prob_.sc_->refGapOpen();
|
||
|
if(bs_[curid].len_ == 0 && bs_[curid].e_.inited() && bs_[curid].e_.isRefGap()) {
|
||
|
gapp = prob_.sc_->refGapExtend();
|
||
|
}
|
||
|
TAlScore score_en = bs_[curid].score_st_ - gapp;
|
||
|
bs_[id].init(
|
||
|
prob_,
|
||
|
curid, // parent ID
|
||
|
gapp, // penalty
|
||
|
score_en, // score_en
|
||
|
rowc-1, // row
|
||
|
colc, // col
|
||
|
e, // edit
|
||
|
hefc, // hef
|
||
|
false, // root?
|
||
|
false); // don't try to extend with exact matches
|
||
|
rowc--;
|
||
|
curid = id;
|
||
|
//assert(!local || bs_[curid].score_st_ >= 0);
|
||
|
if(sel == 3 || sel == 7) {
|
||
|
hefc = 0;
|
||
|
} else {
|
||
|
hefc = 2;
|
||
|
}
|
||
|
}
|
||
|
CHECK_ROW_COL(rowc, colc);
|
||
|
size_t mod_new = (rowc + colc) & prob_.cper_->lomask();
|
||
|
size_t idx = (rowc + colc) >> prob_.cper_->perpow2();
|
||
|
assert_lt(mod_new, prob_.cper_->per());
|
||
|
int64_t row_off_new = rowc - row_lo - mod_new;
|
||
|
CpQuad * cur_new = NULL;
|
||
|
if(colc >= 0 && rowc >= 0 && idx == idx_orig) {
|
||
|
cur_new = tri_[mod_new].ptr();
|
||
|
}
|
||
|
bool hit_new_tri = (idx < idx_orig && colc >= 0 && rowc >= 0);
|
||
|
// Check whether we made it to the top row or to a cell with score 0
|
||
|
if(colc < 0 || rowc < 0 ||
|
||
|
(cur_new != NULL && (local && cur_new[row_off_new].sc[0] == 0)))
|
||
|
{
|
||
|
done = true;
|
||
|
assert(bs_[curid].isSolution(prob_));
|
||
|
addSolution(curid);
|
||
|
#ifndef NDEBUG
|
||
|
// A check to see if any two adjacent branches in the backtrace
|
||
|
// overlap. If they do, the whole alignment will be filtered out
|
||
|
// in trySolution(...)
|
||
|
size_t cur = curid;
|
||
|
if(!bs_[cur].root_) {
|
||
|
size_t next = bs_[cur].parentId_;
|
||
|
while(!bs_[next].root_) {
|
||
|
assert_neq(cur, next);
|
||
|
if(bs_[next].len_ != 0 || bs_[cur].len_ == 0) {
|
||
|
assert(!bs_[cur].overlap(prob_, bs_[next]));
|
||
|
}
|
||
|
cur = next;
|
||
|
next = bs_[cur].parentId_;
|
||
|
}
|
||
|
}
|
||
|
#endif
|
||
|
return;
|
||
|
}
|
||
|
if(hit_new_tri) {
|
||
|
assert(rowc < 0 || colc < 0 || prob_.cper_->isCheckpointed(rowc, colc));
|
||
|
row_new = rowc; col_new = colc;
|
||
|
hef_new = hefc;
|
||
|
done = false;
|
||
|
if(rowc < 0 || colc < 0) {
|
||
|
assert(local);
|
||
|
targ_new = 0;
|
||
|
} else {
|
||
|
targ_new = prob_.cper_->scoreTriangle(rowc, colc, hefc);
|
||
|
}
|
||
|
if(local && targ_new == 0) {
|
||
|
done = true;
|
||
|
assert(bs_[curid].isSolution(prob_));
|
||
|
addSolution(curid);
|
||
|
}
|
||
|
assert((row_new >= 0 && col_new >= 0) || done);
|
||
|
return;
|
||
|
}
|
||
|
}
|
||
|
assert(false);
|
||
|
}
|
||
|
|
||
|
/**
 * DEBUG_CHECK(ss, row, col, hef): in debug builds, compare a score we just
 * computed or loaded (ss) with the value the checkpointer reports for cell
 * (row, col, hef) via prob_.cper_->debugCell().  The check only runs when
 * prob_.cper_->debug() is true and both coordinates are non-negative.
 * Before comparing, MIN_I16 is widened to MIN_I64 and, in local mode,
 * negative values on either side are clamped to 0.
 *
 * The `hef` value is whatever debugCell() expects; the callers visible in
 * this file pass 2 for H, 0 for E and 1 for F.
 *
 * Must be expanded where `prob_` and `local` are in scope.  Arguments may be
 * evaluated more than once.  Both variants expand to a single
 * do { ... } while(0) statement so the macro is safe in un-braced if/else
 * and never leaves an empty `if` body behind in release builds.
 */
#ifndef NDEBUG
#define DEBUG_CHECK(ss, row, col, hef) \
	do { \
		if(prob_.cper_->debug() && (row) >= 0 && (col) >= 0) { \
			TAlScore s = (ss); \
			if(s == MIN_I16) s = MIN_I64; \
			if(local && s < 0) s = 0; \
			TAlScore deb = prob_.cper_->debugCell((row), (col), (hef)); \
			if(local && deb < 0) deb = 0; \
			assert_eq(s, deb); \
		} \
	} while(0)
#else
#define DEBUG_CHECK(ss, row, col, hef) do { } while(0)
#endif
|
||
|
|
||
|
|
||
|
/**
|
||
|
* Fill in a square of the DP table and backtrace from the given cell to
|
||
|
* a cell in the previous checkpoint, or to the terminal cell.
|
||
|
*/
|
||
|
void BtBranchTracer::squareFill(
|
||
|
int64_t rw, // row of cell to backtrace from
|
||
|
int64_t cl, // column of cell to backtrace from
|
||
|
int hef, // cell to backtrace from is H (0), E (1), or F (2)
|
||
|
TAlScore targ, // score of cell to backtrace from
|
||
|
TAlScore targ_final, // score of alignment we're looking for
|
||
|
RandomSource& rnd, // pseudo-random generator
|
||
|
int64_t& row_new, // out: row we ended up in after backtrace
|
||
|
int64_t& col_new, // out: column we ended up in after backtrace
|
||
|
int& hef_new, // out: H/E/F after backtrace
|
||
|
TAlScore& targ_new, // out: score up to cell we ended up in
|
||
|
bool& done, // out: finished tracing out an alignment?
|
||
|
bool& abort) // out: aborted b/c cell was seen before?
|
||
|
{
|
||
|
assert_geq(rw, 0);
|
||
|
assert_geq(cl, 0);
|
||
|
assert_range(0, 2, hef);
|
||
|
assert_lt(rw, (int64_t)prob_.qrylen_);
|
||
|
assert_lt(cl, (int64_t)prob_.reflen_);
|
||
|
assert(prob_.usecp_ && prob_.fill_);
|
||
|
const bool is8_ = prob_.cper_->is8_;
|
||
|
int64_t row = rw, col = cl;
|
||
|
assert_leq(prob_.reflen_, (TRefOff)sawcell_.size());
|
||
|
assert_leq(col, (int64_t)prob_.cper_->hicol());
|
||
|
assert_geq(col, (int64_t)prob_.cper_->locol());
|
||
|
assert_geq(prob_.cper_->per(), 2);
|
||
|
size_t xmod = col & prob_.cper_->lomask();
|
||
|
size_t ymod = row & prob_.cper_->lomask();
|
||
|
size_t xdiv = col >> prob_.cper_->perpow2();
|
||
|
size_t ydiv = row >> prob_.cper_->perpow2();
|
||
|
size_t sq_ncol = xmod+1, sq_nrow = ymod+1;
|
||
|
sq_.resize(sq_ncol * sq_nrow);
|
||
|
bool upper = ydiv == 0;
|
||
|
bool left = xdiv == 0;
|
||
|
const TAlScore sc_rdo = prob_.sc_->readGapOpen();
|
||
|
const TAlScore sc_rde = prob_.sc_->readGapExtend();
|
||
|
const TAlScore sc_rfo = prob_.sc_->refGapOpen();
|
||
|
const TAlScore sc_rfe = prob_.sc_->refGapExtend();
|
||
|
const bool local = !prob_.sc_->monotone;
|
||
|
const CpQuad *qup = NULL;
|
||
|
const __m128i *qlf = NULL;
|
||
|
size_t per = prob_.cper_->per_;
|
||
|
ASSERT_ONLY(size_t nrow = prob_.cper_->nrow());
|
||
|
size_t ncol = prob_.cper_->ncol();
|
||
|
assert_eq(prob_.qrylen_, nrow);
|
||
|
assert_eq(prob_.reflen_, (TRefOff)ncol);
|
||
|
size_t niter = prob_.cper_->niter_;
|
||
|
if(!upper) {
|
||
|
qup = prob_.cper_->qrows_.ptr() + (ncol * (ydiv-1)) + xdiv * per;
|
||
|
}
|
||
|
if(!left) {
|
||
|
// Set up the column pointers to point to the first __m128i word in the
|
||
|
// relevant column
|
||
|
size_t off = (niter << 2) * (xdiv-1);
|
||
|
qlf = prob_.cper_->qcols_.ptr() + off;
|
||
|
}
|
||
|
size_t xedge = xdiv * per; // absolute offset of leftmost cell in square
|
||
|
size_t yedge = ydiv * per; // absolute offset of topmost cell in square
|
||
|
size_t xi = xedge, yi = yedge; // iterators for columns, rows
|
||
|
size_t ii = 0; // iterator into packed square
|
||
|
// Iterate over rows, then over columns
|
||
|
size_t m128mod = yi % prob_.cper_->niter_;
|
||
|
size_t m128div = yi / prob_.cper_->niter_;
|
||
|
int16_t sc_h_dg_lastrow = MIN_I16;
|
||
|
for(size_t i = 0; i <= ymod; i++, yi++) {
|
||
|
assert_lt(yi, nrow);
|
||
|
xi = xedge;
|
||
|
// Handling for first column is done outside the loop
|
||
|
size_t fromend = prob_.qrylen_ - yi - 1;
|
||
|
bool allowGaps = fromend >= (size_t)prob_.sc_->gapbar && yi >= (size_t)prob_.sc_->gapbar;
|
||
|
// Get character, quality from read
|
||
|
int qc = prob_.qry_[yi], qq = prob_.qual_[yi];
|
||
|
assert_geq(qq, 33);
|
||
|
int16_t sc_h_lf_last = MIN_I16;
|
||
|
int16_t sc_e_lf_last = MIN_I16;
|
||
|
for(size_t j = 0; j <= xmod; j++, xi++) {
|
||
|
assert_lt(xi, ncol);
|
||
|
// Get character from reference
|
||
|
int rc = prob_.ref_[xi];
|
||
|
assert_range(0, 16, rc);
|
||
|
int16_t sc_diag = prob_.sc_->score(qc, rc, qq - 33);
|
||
|
int16_t sc_h_up = MIN_I16, sc_f_up = MIN_I16,
|
||
|
sc_h_lf = MIN_I16, sc_e_lf = MIN_I16,
|
||
|
sc_h_dg = MIN_I16;
|
||
|
int16_t sc_h_up_c = MIN_I16, sc_f_up_c = MIN_I16,
|
||
|
sc_h_lf_c = MIN_I16, sc_e_lf_c = MIN_I16,
|
||
|
sc_h_dg_c = MIN_I16;
|
||
|
if(yi == 0) {
|
||
|
// If I'm in the first first row or column set it to 0
|
||
|
sc_h_dg = 0;
|
||
|
} else if(xi == 0) {
|
||
|
// Do nothing; leave it at min
|
||
|
if(local) {
|
||
|
sc_h_dg = 0;
|
||
|
}
|
||
|
} else if(i == 0 && j == 0) {
|
||
|
// Otherwise, if I'm in the upper-left square corner, I can get
|
||
|
// it from the checkpoint
|
||
|
sc_h_dg = qup[-1].sc[0];
|
||
|
} else if(j == 0) {
|
||
|
// Otherwise, if I'm in the leftmost cell of this row, I can
|
||
|
// get it from sc_h_lf in first column of previous row
|
||
|
sc_h_dg = sc_h_dg_lastrow;
|
||
|
} else {
|
||
|
// Otherwise, I can get it from qup
|
||
|
sc_h_dg = qup[j-1].sc[0];
|
||
|
}
|
||
|
if(yi > 0 && xi > 0) DEBUG_CHECK(sc_h_dg, yi-1, xi-1, 2);
|
||
|
|
||
|
// If we're in the leftmost column, calculate sc_h_lf regardless of
|
||
|
// allowGaps.
|
||
|
if(j == 0 && xi > 0) {
|
||
|
// Get values for left neighbors from the checkpoint
|
||
|
if(is8_) {
|
||
|
size_t vecoff = (m128mod << 6) + m128div;
|
||
|
sc_e_lf = ((uint8_t*)(qlf + 0))[vecoff];
|
||
|
sc_h_lf = ((uint8_t*)(qlf + 2))[vecoff];
|
||
|
if(local) {
|
||
|
// No adjustment
|
||
|
} else {
|
||
|
if(sc_h_lf == 0) sc_h_lf = MIN_I16;
|
||
|
else sc_h_lf -= 0xff;
|
||
|
if(sc_e_lf == 0) sc_e_lf = MIN_I16;
|
||
|
else sc_e_lf -= 0xff;
|
||
|
}
|
||
|
} else {
|
||
|
size_t vecoff = (m128mod << 5) + m128div;
|
||
|
sc_e_lf = ((int16_t*)(qlf + 0))[vecoff];
|
||
|
sc_h_lf = ((int16_t*)(qlf + 2))[vecoff];
|
||
|
if(local) {
|
||
|
sc_h_lf += 0x8000; assert_geq(sc_h_lf, 0);
|
||
|
sc_e_lf += 0x8000; assert_geq(sc_e_lf, 0);
|
||
|
} else {
|
||
|
if(sc_h_lf != MIN_I16) sc_h_lf -= 0x7fff;
|
||
|
if(sc_e_lf != MIN_I16) sc_e_lf -= 0x7fff;
|
||
|
}
|
||
|
}
|
||
|
DEBUG_CHECK(sc_e_lf, yi, xi-1, 0);
|
||
|
DEBUG_CHECK(sc_h_lf, yi, xi-1, 2);
|
||
|
sc_h_dg_lastrow = sc_h_lf;
|
||
|
}
|
||
|
|
||
|
if(allowGaps) {
|
||
|
if(j == 0 /* at left edge */ && xi > 0 /* not extreme */) {
|
||
|
sc_h_lf_c = sc_h_lf;
|
||
|
sc_e_lf_c = sc_e_lf;
|
||
|
if(sc_h_lf_c != MIN_I16) sc_h_lf_c -= sc_rdo;
|
||
|
if(sc_e_lf_c != MIN_I16) sc_e_lf_c -= sc_rde;
|
||
|
assert_leq(sc_h_lf_c, prob_.cper_->perf_);
|
||
|
assert_leq(sc_e_lf_c, prob_.cper_->perf_);
|
||
|
} else if(xi > 0) {
|
||
|
// Get values for left neighbors from the previous iteration
|
||
|
if(sc_h_lf_last != MIN_I16) {
|
||
|
sc_h_lf = sc_h_lf_last;
|
||
|
sc_h_lf_c = sc_h_lf - sc_rdo;
|
||
|
}
|
||
|
if(sc_e_lf_last != MIN_I16) {
|
||
|
sc_e_lf = sc_e_lf_last;
|
||
|
sc_e_lf_c = sc_e_lf - sc_rde;
|
||
|
}
|
||
|
}
|
||
|
if(yi > 0 /* not extreme */) {
|
||
|
// Get column values
|
||
|
assert(qup != NULL);
|
||
|
assert(local || qup[j].sc[2] < 0);
|
||
|
if(qup[j].sc[0] > MIN_I16) {
|
||
|
DEBUG_CHECK(qup[j].sc[0], yi-1, xi, 2);
|
||
|
sc_h_up = qup[j].sc[0];
|
||
|
sc_h_up_c = sc_h_up - sc_rfo;
|
||
|
}
|
||
|
if(qup[j].sc[2] > MIN_I16) {
|
||
|
DEBUG_CHECK(qup[j].sc[2], yi-1, xi, 1);
|
||
|
sc_f_up = qup[j].sc[2];
|
||
|
sc_f_up_c = sc_f_up - sc_rfe;
|
||
|
}
|
||
|
}
|
||
|
if(local) {
|
||
|
sc_h_up_c = max<int16_t>(sc_h_up_c, 0);
|
||
|
sc_f_up_c = max<int16_t>(sc_f_up_c, 0);
|
||
|
sc_h_lf_c = max<int16_t>(sc_h_lf_c, 0);
|
||
|
sc_e_lf_c = max<int16_t>(sc_e_lf_c, 0);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if(sc_h_dg > MIN_I16) {
|
||
|
sc_h_dg_c = sc_h_dg + sc_diag;
|
||
|
}
|
||
|
if(local) sc_h_dg_c = max<int16_t>(sc_h_dg_c, 0);
|
||
|
|
||
|
int mask = 0;
|
||
|
// Calculate best ways into H, E, F cells starting with H.
|
||
|
// Mask bits:
|
||
|
// H: 1=diag, 2=hhoriz, 4=ehoriz, 8=hvert, 16=fvert
|
||
|
// E: 32=hhoriz, 64=ehoriz
|
||
|
// F: 128=hvert, 256=fvert
|
||
|
int16_t sc_best = sc_h_dg_c;
|
||
|
if(sc_h_dg_c > MIN_I64) {
|
||
|
mask = 1;
|
||
|
}
|
||
|
if(xi > 0 && sc_h_lf_c >= sc_best && sc_h_lf_c > MIN_I64) {
|
||
|
if(sc_h_lf_c > sc_best) mask = 0;
|
||
|
mask |= 2;
|
||
|
sc_best = sc_h_lf_c;
|
||
|
}
|
||
|
if(xi > 0 && sc_e_lf_c >= sc_best && sc_e_lf_c > MIN_I64) {
|
||
|
if(sc_e_lf_c > sc_best) mask = 0;
|
||
|
mask |= 4;
|
||
|
sc_best = sc_e_lf_c;
|
||
|
}
|
||
|
if(yi > 0 && sc_h_up_c >= sc_best && sc_h_up_c > MIN_I64) {
|
||
|
if(sc_h_up_c > sc_best) mask = 0;
|
||
|
mask |= 8;
|
||
|
sc_best = sc_h_up_c;
|
||
|
}
|
||
|
if(yi > 0 && sc_f_up_c >= sc_best && sc_f_up_c > MIN_I64) {
|
||
|
if(sc_f_up_c > sc_best) mask = 0;
|
||
|
mask |= 16;
|
||
|
sc_best = sc_f_up_c;
|
||
|
}
|
||
|
// Calculate best way into E cell
|
||
|
int16_t sc_e_best = sc_h_lf_c;
|
||
|
if(xi > 0) {
|
||
|
if(sc_h_lf_c >= sc_e_lf_c && sc_h_lf_c > MIN_I64) {
|
||
|
if(sc_h_lf_c == sc_e_lf_c) {
|
||
|
mask |= 64;
|
||
|
}
|
||
|
mask |= 32;
|
||
|
} else if(sc_e_lf_c > MIN_I64) {
|
||
|
sc_e_best = sc_e_lf_c;
|
||
|
mask |= 64;
|
||
|
}
|
||
|
}
|
||
|
if(sc_e_best > sc_best) {
|
||
|
sc_best = sc_e_best;
|
||
|
mask &= ~31; // don't go diagonal
|
||
|
}
|
||
|
// Calculate best way into F cell
|
||
|
int16_t sc_f_best = sc_h_up_c;
|
||
|
if(yi > 0) {
|
||
|
if(sc_h_up_c >= sc_f_up_c && sc_h_up_c > MIN_I64) {
|
||
|
if(sc_h_up_c == sc_f_up_c) {
|
||
|
mask |= 256;
|
||
|
}
|
||
|
mask |= 128;
|
||
|
} else if(sc_f_up_c > MIN_I64) {
|
||
|
sc_f_best = sc_f_up_c;
|
||
|
mask |= 256;
|
||
|
}
|
||
|
}
|
||
|
if(sc_f_best > sc_best) {
|
||
|
sc_best = sc_f_best;
|
||
|
mask &= ~127; // don't go horizontal or diagonal
|
||
|
}
|
||
|
// Install results in cur
|
||
|
assert( local || sc_best <= 0);
|
||
|
sq_[ii+j].sc[0] = sc_best;
|
||
|
assert( local || sc_e_best < 0);
|
||
|
assert( local || sc_f_best < 0);
|
||
|
assert(!local || sc_e_best >= 0 || sc_e_best == MIN_I16);
|
||
|
assert(!local || sc_f_best >= 0 || sc_f_best == MIN_I16);
|
||
|
sq_[ii+j].sc[1] = sc_e_best;
|
||
|
sq_[ii+j].sc[2] = sc_f_best;
|
||
|
sq_[ii+j].sc[3] = mask;
|
||
|
DEBUG_CHECK(sq_[ii+j].sc[0], yi, xi, 2); // H
|
||
|
DEBUG_CHECK(sq_[ii+j].sc[1], yi, xi, 0); // E
|
||
|
DEBUG_CHECK(sq_[ii+j].sc[2], yi, xi, 1); // F
|
||
|
// Update sc_h_lf_last, sc_e_lf_last
|
||
|
sc_h_lf_last = sc_best;
|
||
|
sc_e_lf_last = sc_e_best;
|
||
|
}
|
||
|
// Update m128mod, m128div
|
||
|
m128mod++;
|
||
|
if(m128mod == prob_.cper_->niter_) {
|
||
|
m128mod = 0;
|
||
|
m128div++;
|
||
|
}
|
||
|
// update qup
|
||
|
ii += sq_ncol;
|
||
|
// dimensions of sq_
|
||
|
qup = sq_.ptr() + sq_ncol * i;
|
||
|
}
|
||
|
assert_eq(targ, sq_[ymod * sq_ncol + xmod].sc[hef]);
|
||
|
//
|
||
|
// Now backtrack through the triangle. Abort as soon as we enter a cell
|
||
|
// that was visited by a previous backtrace.
|
||
|
//
|
||
|
int64_t rowc = row, colc = col;
|
||
|
size_t curid;
|
||
|
int hefc = hef;
|
||
|
if(bs_.empty()) {
|
||
|
// Start an initial branch
|
||
|
CHECK_ROW_COL(rowc, colc);
|
||
|
curid = bs_.alloc();
|
||
|
assert_eq(0, curid);
|
||
|
Edit e;
|
||
|
bs_[curid].init(
|
||
|
prob_,
|
||
|
0, // parent ID
|
||
|
0, // penalty
|
||
|
0, // score_en
|
||
|
rowc, // row
|
||
|
colc, // col
|
||
|
e, // edit
|
||
|
0, // hef
|
||
|
true, // root?
|
||
|
false); // don't try to extend with exact matches
|
||
|
bs_[curid].len_ = 0;
|
||
|
} else {
|
||
|
curid = bs_.size()-1;
|
||
|
}
|
||
|
size_t ymodTimesNcol = ymod * sq_ncol;
|
||
|
while(true) {
|
||
|
// What depth are we?
|
||
|
assert_eq(ymodTimesNcol, ymod * sq_ncol);
|
||
|
CpQuad * cur = sq_.ptr() + ymodTimesNcol + xmod;
|
||
|
int mask = cur->sc[3];
|
||
|
assert_gt(mask, 0);
|
||
|
int sel = -1;
|
||
|
// Select what type of move to make, which depends on whether we're
|
||
|
// currently in H, E, F:
|
||
|
if(hefc == 0) {
|
||
|
if( (mask & 1) != 0) {
|
||
|
// diagonal
|
||
|
sel = 0;
|
||
|
} else if((mask & 8) != 0) {
|
||
|
// up to H
|
||
|
sel = 3;
|
||
|
} else if((mask & 16) != 0) {
|
||
|
// up to F
|
||
|
sel = 4;
|
||
|
} else if((mask & 2) != 0) {
|
||
|
// left to H
|
||
|
sel = 1;
|
||
|
} else if((mask & 4) != 0) {
|
||
|
// left to E
|
||
|
sel = 2;
|
||
|
}
|
||
|
} else if(hefc == 1) {
|
||
|
if( (mask & 32) != 0) {
|
||
|
// left to H
|
||
|
sel = 5;
|
||
|
} else if((mask & 64) != 0) {
|
||
|
// left to E
|
||
|
sel = 6;
|
||
|
}
|
||
|
} else {
|
||
|
assert_eq(2, hefc);
|
||
|
if( (mask & 128) != 0) {
|
||
|
// up to H
|
||
|
sel = 7;
|
||
|
} else if((mask & 256) != 0) {
|
||
|
// up to F
|
||
|
sel = 8;
|
||
|
}
|
||
|
}
|
||
|
assert_geq(sel, 0);
|
||
|
// Get character from read
|
||
|
int qc = prob_.qry_[rowc], qq = prob_.qual_[rowc];
|
||
|
// Get character from reference
|
||
|
int rc = prob_.ref_[colc];
|
||
|
assert_range(0, 16, rc);
|
||
|
bool xexit = false, yexit = false;
|
||
|
// Now that we know what type of move to make, make it, updating our
|
||
|
// row and column and moving updating the branch.
|
||
|
if(sel == 0) {
|
||
|
assert_geq(rowc, 0);
|
||
|
assert_geq(colc, 0);
|
||
|
TAlScore scd = prob_.sc_->score(qc, rc, qq - 33);
|
||
|
if((rc & (1 << qc)) == 0) {
|
||
|
// Mismatch
|
||
|
size_t id = curid;
|
||
|
// Check if the previous branch was the initial (bottommost)
|
||
|
// branch with no matches. If so, the mismatch should be added
|
||
|
// to the initial branch, instead of starting a new branch.
|
||
|
bool empty = (bs_[curid].len_ == 0 && curid == 0);
|
||
|
if(!empty) {
|
||
|
id = bs_.alloc();
|
||
|
}
|
||
|
Edit e((int)rowc, mask2dna[rc], "ACGTN"[qc], EDIT_TYPE_MM);
|
||
|
assert_lt(scd, 0);
|
||
|
TAlScore score_en = bs_[curid].score_st_ + scd;
|
||
|
bs_[id].init(
|
||
|
prob_,
|
||
|
curid, // parent ID
|
||
|
-scd, // penalty
|
||
|
score_en, // score_en
|
||
|
rowc, // row
|
||
|
colc, // col
|
||
|
e, // edit
|
||
|
hefc, // hef
|
||
|
empty, // root?
|
||
|
false); // don't try to extend with exact matches
|
||
|
curid = id;
|
||
|
//assert(!local || bs_[curid].score_st_ >= 0);
|
||
|
} else {
|
||
|
// Match
|
||
|
bs_[curid].score_st_ += prob_.sc_->match();
|
||
|
bs_[curid].len_++;
|
||
|
assert_leq((int64_t)bs_[curid].len_, bs_[curid].row_ + 1);
|
||
|
}
|
||
|
if(xmod == 0) xexit = true;
|
||
|
if(ymod == 0) yexit = true;
|
||
|
rowc--; ymod--; ymodTimesNcol -= sq_ncol;
|
||
|
colc--; xmod--;
|
||
|
assert(local || bs_[curid].score_st_ >= targ_final);
|
||
|
hefc = 0;
|
||
|
} else if((sel >= 1 && sel <= 2) || (sel >= 5 && sel <= 6)) {
|
||
|
assert_gt(colc, 0);
|
||
|
// Read gap
|
||
|
size_t id = bs_.alloc();
|
||
|
Edit e((int)rowc+1, mask2dna[rc], '-', EDIT_TYPE_READ_GAP);
|
||
|
TAlScore gapp = prob_.sc_->readGapOpen();
|
||
|
if(bs_[curid].len_ == 0 && bs_[curid].e_.inited() && bs_[curid].e_.isReadGap()) {
|
||
|
gapp = prob_.sc_->readGapExtend();
|
||
|
}
|
||
|
//assert(!local || bs_[curid].score_st_ >= gapp);
|
||
|
TAlScore score_en = bs_[curid].score_st_ - gapp;
|
||
|
bs_[id].init(
|
||
|
prob_,
|
||
|
curid, // parent ID
|
||
|
gapp, // penalty
|
||
|
score_en, // score_en
|
||
|
rowc, // row
|
||
|
colc-1, // col
|
||
|
e, // edit
|
||
|
hefc, // hef
|
||
|
false, // root?
|
||
|
false); // don't try to extend with exact matches
|
||
|
if(xmod == 0) xexit = true;
|
||
|
colc--; xmod--;
|
||
|
curid = id;
|
||
|
assert( local || bs_[curid].score_st_ >= targ_final);
|
||
|
//assert(!local || bs_[curid].score_st_ >= 0);
|
||
|
if(sel == 1 || sel == 5) {
|
||
|
hefc = 0;
|
||
|
} else {
|
||
|
hefc = 1;
|
||
|
}
|
||
|
} else {
|
||
|
assert_gt(rowc, 0);
|
||
|
// Reference gap
|
||
|
size_t id = bs_.alloc();
|
||
|
Edit e((int)rowc, '-', "ACGTN"[qc], EDIT_TYPE_REF_GAP);
|
||
|
TAlScore gapp = prob_.sc_->refGapOpen();
|
||
|
if(bs_[curid].len_ == 0 && bs_[curid].e_.inited() && bs_[curid].e_.isRefGap()) {
|
||
|
gapp = prob_.sc_->refGapExtend();
|
||
|
}
|
||
|
//assert(!local || bs_[curid].score_st_ >= gapp);
|
||
|
TAlScore score_en = bs_[curid].score_st_ - gapp;
|
||
|
bs_[id].init(
|
||
|
prob_,
|
||
|
curid, // parent ID
|
||
|
gapp, // penalty
|
||
|
score_en, // score_en
|
||
|
rowc-1, // row
|
||
|
colc, // col
|
||
|
e, // edit
|
||
|
hefc, // hef
|
||
|
false, // root?
|
||
|
false); // don't try to extend with exact matches
|
||
|
if(ymod == 0) yexit = true;
|
||
|
rowc--; ymod--; ymodTimesNcol -= sq_ncol;
|
||
|
curid = id;
|
||
|
assert( local || bs_[curid].score_st_ >= targ_final);
|
||
|
//assert(!local || bs_[curid].score_st_ >= 0);
|
||
|
if(sel == 3 || sel == 7) {
|
||
|
hefc = 0;
|
||
|
} else {
|
||
|
hefc = 2;
|
||
|
}
|
||
|
}
|
||
|
CHECK_ROW_COL(rowc, colc);
|
||
|
CpQuad * cur_new = NULL;
|
||
|
if(!xexit && !yexit) {
|
||
|
cur_new = sq_.ptr() + ymodTimesNcol + xmod;
|
||
|
}
|
||
|
// Check whether we made it to the top row or to a cell with score 0
|
||
|
if(colc < 0 || rowc < 0 ||
|
||
|
(cur_new != NULL && local && cur_new->sc[0] == 0))
|
||
|
{
|
||
|
done = true;
|
||
|
assert(bs_[curid].isSolution(prob_));
|
||
|
addSolution(curid);
|
||
|
#ifndef NDEBUG
|
||
|
// A check to see if any two adjacent branches in the backtrace
|
||
|
// overlap. If they do, the whole alignment will be filtered out
|
||
|
// in trySolution(...)
|
||
|
size_t cur = curid;
|
||
|
if(!bs_[cur].root_) {
|
||
|
size_t next = bs_[cur].parentId_;
|
||
|
while(!bs_[next].root_) {
|
||
|
assert_neq(cur, next);
|
||
|
if(bs_[next].len_ != 0 || bs_[cur].len_ == 0) {
|
||
|
assert(!bs_[cur].overlap(prob_, bs_[next]));
|
||
|
}
|
||
|
cur = next;
|
||
|
next = bs_[cur].parentId_;
|
||
|
}
|
||
|
}
|
||
|
#endif
|
||
|
return;
|
||
|
}
|
||
|
assert(!xexit || hefc == 0 || hefc == 1);
|
||
|
assert(!yexit || hefc == 0 || hefc == 2);
|
||
|
if(xexit || yexit) {
|
||
|
//assert(rowc < 0 || colc < 0 || prob_.cper_->isCheckpointed(rowc, colc));
|
||
|
row_new = rowc; col_new = colc;
|
||
|
hef_new = hefc;
|
||
|
done = false;
|
||
|
if(rowc < 0 || colc < 0) {
|
||
|
assert(local);
|
||
|
targ_new = 0;
|
||
|
} else {
|
||
|
// TODO: Don't use scoreSquare
|
||
|
targ_new = prob_.cper_->scoreSquare(rowc, colc, hefc);
|
||
|
assert(local || targ_new >= targ);
|
||
|
assert(local || targ_new >= targ_final);
|
||
|
}
|
||
|
if(local && targ_new == 0) {
|
||
|
assert_eq(0, hefc);
|
||
|
done = true;
|
||
|
assert(bs_[curid].isSolution(prob_));
|
||
|
addSolution(curid);
|
||
|
}
|
||
|
assert((row_new >= 0 && col_new >= 0) || done);
|
||
|
return;
|
||
|
}
|
||
|
}
|
||
|
assert(false);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Caller gives us score_en, row and col. We figure out score_st and len_
|
||
|
* by comparing characters from the strings.
|
||
|
*
|
||
|
* If this branch comes after a mismatch, (row, col) describe the cell that the
|
||
|
* mismatch occurs in. len_ is initially set to 1, and the next cell we test
|
||
|
* is the next cell up and to the left (row-1, col-1).
|
||
|
*
|
||
|
* If this branch comes after a read gap, (row, col) describe the leftmost cell
|
||
|
* involved in the gap. len_ is initially set to 0, and the next cell we test
|
||
|
* is the current cell (row, col).
|
||
|
*
|
||
|
* If this branch comes after a reference gap, (row, col) describe the upper
|
||
|
* cell involved in the gap. len_ is initially set to 0, and the next cell we
|
||
|
* test is the current cell (row, col).
|
||
|
*/
|
||
|
void BtBranch::init(
|
||
|
const BtBranchProblem& prob,
|
||
|
size_t parentId,
|
||
|
TAlScore penalty,
|
||
|
TAlScore score_en,
|
||
|
int64_t row,
|
||
|
int64_t col,
|
||
|
Edit e,
|
||
|
int hef,
|
||
|
bool root,
|
||
|
bool extend)
|
||
|
{
|
||
|
score_en_ = score_en;
|
||
|
penalty_ = penalty;
|
||
|
score_st_ = score_en_;
|
||
|
row_ = row;
|
||
|
col_ = col;
|
||
|
parentId_ = parentId;
|
||
|
e_ = e;
|
||
|
root_ = root;
|
||
|
assert(!root_ || parentId == 0);
|
||
|
assert_lt(row, (int64_t)prob.qrylen_);
|
||
|
assert_lt(col, (int64_t)prob.reflen_);
|
||
|
// First match to check is diagonally above and to the left of the cell
|
||
|
// where the edit occurs
|
||
|
int64_t rowc = row;
|
||
|
int64_t colc = col;
|
||
|
len_ = 0;
|
||
|
if(e.inited() && e.isMismatch()) {
|
||
|
rowc--; colc--;
|
||
|
len_ = 1;
|
||
|
}
|
||
|
int64_t match = prob.sc_->match();
|
||
|
bool cp = prob.usecp_;
|
||
|
size_t iters = 0;
|
||
|
curtailed_ = false;
|
||
|
if(extend) {
|
||
|
while(rowc >= 0 && colc >= 0) {
|
||
|
int rfm = prob.ref_[colc];
|
||
|
assert_range(0, 16, rfm);
|
||
|
int rdc = prob.qry_[rowc];
|
||
|
bool matches = (rfm & (1 << rdc)) != 0;
|
||
|
if(!matches) {
|
||
|
// What's the mismatch penalty?
|
||
|
break;
|
||
|
}
|
||
|
// Get score from checkpointer
|
||
|
score_st_ += match;
|
||
|
if(cp && rowc - 1 >= 0 && colc - 1 >= 0 &&
|
||
|
prob.cper_->isCheckpointed(rowc - 1, colc - 1))
|
||
|
{
|
||
|
// Possibly prune
|
||
|
int16_t cpsc;
|
||
|
cpsc = prob.cper_->scoreTriangle(rowc - 1, colc - 1, hef);
|
||
|
if(cpsc + score_st_ < prob.targ_) {
|
||
|
curtailed_ = true;
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
iters++;
|
||
|
rowc--; colc--;
|
||
|
}
|
||
|
}
|
||
|
assert_geq(rowc, -1);
|
||
|
assert_geq(colc, -1);
|
||
|
len_ = (int64_t)row - rowc;
|
||
|
assert_leq((int64_t)len_, row_+1);
|
||
|
assert_leq((int64_t)len_, col_+1);
|
||
|
assert_leq((int64_t)score_st_, (int64_t)prob.qrylen_ * match);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Given a potential branch to add to the queue, see if we can follow the
|
||
|
* branch a little further first. If it's still valid, or if we reach a
|
||
|
* choice between valid outgoing paths, go ahead and add it to the queue.
|
||
|
*/
|
||
|
void BtBranchTracer::examineBranch(
|
||
|
int64_t row,
|
||
|
int64_t col,
|
||
|
const Edit& e,
|
||
|
TAlScore pen, // penalty associated with edit
|
||
|
TAlScore sc,
|
||
|
size_t parentId)
|
||
|
{
|
||
|
size_t id = bs_.alloc();
|
||
|
bs_[id].init(prob_, parentId, pen, sc, row, col, e, 0, false, true);
|
||
|
if(bs_[id].isSolution(prob_)) {
|
||
|
assert(bs_[id].isValid(prob_));
|
||
|
addSolution(id);
|
||
|
} else {
|
||
|
// Check if this branch is legit
|
||
|
if(bs_[id].isValid(prob_)) {
|
||
|
add(id);
|
||
|
} else {
|
||
|
bs_.pop();
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Take all possible ways of leaving the given branch and add them to the
|
||
|
* branch queue.
|
||
|
*/
|
||
|
void BtBranchTracer::addOffshoots(size_t bid) {
|
||
|
BtBranch& b = bs_[bid];
|
||
|
TAlScore sc = b.score_en_;
|
||
|
int64_t match = prob_.sc_->match();
|
||
|
int64_t scoreFloor = prob_.sc_->monotone ? MIN_I64 : 0;
|
||
|
bool cp = prob_.usecp_; // Are there are any checkpoints?
|
||
|
ASSERT_ONLY(TAlScore perfectScore = prob_.sc_->perfectScore(prob_.qrylen_));
|
||
|
assert_leq(prob_.targ_, perfectScore);
|
||
|
// For each cell in the branch
|
||
|
for(size_t i = 0 ; i < b.len_; i++) {
|
||
|
assert_leq((int64_t)i, b.row_+1);
|
||
|
assert_leq((int64_t)i, b.col_+1);
|
||
|
int64_t row = b.row_ - i, col = b.col_ - i;
|
||
|
int64_t bonusLeft = (row + 1) * match;
|
||
|
int64_t fromend = prob_.qrylen_ - row - 1;
|
||
|
bool allowGaps = fromend >= prob_.sc_->gapbar && row >= prob_.sc_->gapbar;
|
||
|
if(allowGaps && row >= 0 && col >= 0) {
|
||
|
if(col > 0) {
|
||
|
// Try a read gap - it's either an extension or an open
|
||
|
bool extend = b.e_.inited() && b.e_.isReadGap() && i == 0;
|
||
|
TAlScore rdgapPen = extend ?
|
||
|
prob_.sc_->readGapExtend() : prob_.sc_->readGapOpen();
|
||
|
bool prune = false;
|
||
|
assert_gt(rdgapPen, 0);
|
||
|
if(cp && prob_.cper_->isCheckpointed(row, col - 1)) {
|
||
|
// Possibly prune
|
||
|
int16_t cpsc = (int16_t)prob_.cper_->scoreTriangle(row, col - 1, 0);
|
||
|
assert_leq(cpsc, perfectScore);
|
||
|
assert_geq(prob_.sc_->readGapOpen(), prob_.sc_->readGapExtend());
|
||
|
TAlScore bonus = prob_.sc_->readGapOpen() - prob_.sc_->readGapExtend();
|
||
|
assert_geq(bonus, 0);
|
||
|
if(cpsc + bonus + sc - rdgapPen < prob_.targ_) {
|
||
|
prune = true;
|
||
|
}
|
||
|
}
|
||
|
if(prune) {
|
||
|
if(extend) { nrdexPrune_++; } else { nrdopPrune_++; }
|
||
|
} else if(sc - rdgapPen >= scoreFloor && sc - rdgapPen + bonusLeft >= prob_.targ_) {
|
||
|
// Yes, we can introduce a read gap here
|
||
|
Edit e((int)row + 1, mask2dna[(int)prob_.ref_[col]], '-', EDIT_TYPE_READ_GAP);
|
||
|
assert(e.isReadGap());
|
||
|
examineBranch(row, col - 1, e, rdgapPen, sc - rdgapPen, bid);
|
||
|
if(extend) { nrdex_++; } else { nrdop_++; }
|
||
|
}
|
||
|
}
|
||
|
if(row > 0) {
|
||
|
// Try a reference gap - it's either an extension or an open
|
||
|
bool extend = b.e_.inited() && b.e_.isRefGap() && i == 0;
|
||
|
TAlScore rfgapPen = (b.e_.inited() && b.e_.isRefGap()) ?
|
||
|
prob_.sc_->refGapExtend() : prob_.sc_->refGapOpen();
|
||
|
bool prune = false;
|
||
|
assert_gt(rfgapPen, 0);
|
||
|
if(cp && prob_.cper_->isCheckpointed(row - 1, col)) {
|
||
|
// Possibly prune
|
||
|
int16_t cpsc = (int16_t)prob_.cper_->scoreTriangle(row - 1, col, 0);
|
||
|
assert_leq(cpsc, perfectScore);
|
||
|
assert_geq(prob_.sc_->refGapOpen(), prob_.sc_->refGapExtend());
|
||
|
TAlScore bonus = prob_.sc_->refGapOpen() - prob_.sc_->refGapExtend();
|
||
|
assert_geq(bonus, 0);
|
||
|
if(cpsc + bonus + sc - rfgapPen < prob_.targ_) {
|
||
|
prune = true;
|
||
|
}
|
||
|
}
|
||
|
if(prune) {
|
||
|
if(extend) { nrfexPrune_++; } else { nrfopPrune_++; }
|
||
|
} else if(sc - rfgapPen >= scoreFloor && sc - rfgapPen + bonusLeft >= prob_.targ_) {
|
||
|
// Yes, we can introduce a ref gap here
|
||
|
Edit e((int)row, '-', "ACGTN"[(int)prob_.qry_[row]], EDIT_TYPE_REF_GAP);
|
||
|
assert(e.isRefGap());
|
||
|
examineBranch(row - 1, col, e, rfgapPen, sc - rfgapPen, bid);
|
||
|
if(extend) { nrfex_++; } else { nrfop_++; }
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
// If we're at the top of the branch but not yet at the top of
|
||
|
// the DP table, a mismatch branch is also possible.
|
||
|
if(i == b.len_ && !b.curtailed_ && row >= 0 && col >= 0) {
|
||
|
int rfm = prob_.ref_[col];
|
||
|
assert_lt(row, (int64_t)prob_.qrylen_);
|
||
|
int rdc = prob_.qry_[row];
|
||
|
int rdq = prob_.qual_[row];
|
||
|
int scdiff = prob_.sc_->score(rdc, rfm, rdq - 33);
|
||
|
assert_lt(scdiff, 0); // at end of branch, so can't match
|
||
|
bool prune = false;
|
||
|
if(cp && row > 0 && col > 0 && prob_.cper_->isCheckpointed(row - 1, col - 1)) {
|
||
|
// Possibly prune
|
||
|
int16_t cpsc = prob_.cper_->scoreTriangle(row - 1, col - 1, 0);
|
||
|
assert_leq(cpsc, perfectScore);
|
||
|
assert_leq(cpsc + scdiff + sc, perfectScore);
|
||
|
if(cpsc + scdiff + sc < prob_.targ_) {
|
||
|
prune = true;
|
||
|
}
|
||
|
}
|
||
|
if(prune) {
|
||
|
nmm_++;
|
||
|
} else {
|
||
|
// Yes, we can introduce a mismatch here
|
||
|
if(sc + scdiff >= scoreFloor && sc + scdiff + bonusLeft >= prob_.targ_) {
|
||
|
Edit e((int)row, mask2dna[rfm], "ACGTN"[rdc], EDIT_TYPE_MM);
|
||
|
bool nmm = (mask2dna[rfm] == 'N' || rdc > 4);
|
||
|
assert_neq(e.chr, e.qchr);
|
||
|
assert_lt(scdiff, 0);
|
||
|
examineBranch(row - 1, col - 1, e, -scdiff, sc + scdiff, bid);
|
||
|
if(nmm) { nnmm_++; } else { nmm_++; }
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
sc += match;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Sort unsorted branches, merge them with master sorted list.
|
||
|
*/
|
||
|
void BtBranchTracer::flushUnsorted() {
|
||
|
if(unsorted_.empty()) {
|
||
|
return;
|
||
|
}
|
||
|
unsorted_.sort();
|
||
|
unsorted_.reverse();
|
||
|
#ifndef NDEBUG
|
||
|
for(size_t i = 1; i < unsorted_.size(); i++) {
|
||
|
assert_leq(bs_[unsorted_[i].second].score_st_, bs_[unsorted_[i-1].second].score_st_);
|
||
|
}
|
||
|
#endif
|
||
|
EList<size_t> *src2 = sortedSel_ ? &sorted1_ : &sorted2_;
|
||
|
EList<size_t> *dest = sortedSel_ ? &sorted2_ : &sorted1_;
|
||
|
// Merge src1 and src2 into dest
|
||
|
dest->clear();
|
||
|
size_t cur1 = 0, cur2 = cur_;
|
||
|
while(cur1 < unsorted_.size() || cur2 < src2->size()) {
|
||
|
// Take from 1 or 2 next?
|
||
|
bool take1 = true;
|
||
|
if(cur1 == unsorted_.size()) {
|
||
|
take1 = false;
|
||
|
} else if(cur2 == src2->size()) {
|
||
|
take1 = true;
|
||
|
} else {
|
||
|
assert_neq(unsorted_[cur1].second, (*src2)[cur2]);
|
||
|
take1 = bs_[unsorted_[cur1].second] < bs_[(*src2)[cur2]];
|
||
|
}
|
||
|
if(take1) {
|
||
|
dest->push_back(unsorted_[cur1++].second); // Take from list 1
|
||
|
} else {
|
||
|
dest->push_back((*src2)[cur2++]); // Take from list 2
|
||
|
}
|
||
|
}
|
||
|
assert_eq(cur1, unsorted_.size());
|
||
|
assert_eq(cur2, src2->size());
|
||
|
sortedSel_ = !sortedSel_;
|
||
|
cur_ = 0;
|
||
|
unsorted_.clear();
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Try all the solutions accumulated so far. Solutions might be rejected
|
||
|
* if they, for instance, overlap a previous solution, have too many Ns,
|
||
|
* fail to overlap a core diagonal, etc.
|
||
|
*/
|
||
|
bool BtBranchTracer::trySolutions(
|
||
|
bool lookForOlap,
|
||
|
SwResult& res,
|
||
|
size_t& off,
|
||
|
size_t& nrej,
|
||
|
RandomSource& rnd,
|
||
|
bool& success)
|
||
|
{
|
||
|
if(solutions_.size() > 0) {
|
||
|
for(size_t i = 0; i < solutions_.size(); i++) {
|
||
|
int ret = trySolution(solutions_[i], lookForOlap, res, off, nrej, rnd);
|
||
|
if(ret == BT_FOUND) {
|
||
|
success = true;
|
||
|
return true; // there were solutions and one was good
|
||
|
}
|
||
|
}
|
||
|
solutions_.clear();
|
||
|
success = false;
|
||
|
return true; // there were solutions but none were good
|
||
|
}
|
||
|
return false; // there were no solutions to check
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Given the id of a branch that completes a successful backtrace, turn the
|
||
|
* chain of branches into
|
||
|
*/
|
||
|
int BtBranchTracer::trySolution(
|
||
|
size_t id,
|
||
|
bool lookForOlap,
|
||
|
SwResult& res,
|
||
|
size_t& off,
|
||
|
size_t& nrej,
|
||
|
RandomSource& rnd)
|
||
|
{
|
||
|
AlnScore score;
|
||
|
BtBranch *br = &bs_[id];
|
||
|
// 'br' corresponds to the leftmost edit in a right-to-left
|
||
|
// chain of edits.
|
||
|
EList<Edit>& ned = res.alres.ned();
|
||
|
const BtBranch *cur = br, *prev = NULL;
|
||
|
size_t ns = 0, nrefns = 0;
|
||
|
size_t ngap = 0;
|
||
|
while(true) {
|
||
|
if(cur->e_.inited()) {
|
||
|
if(cur->e_.isMismatch()) {
|
||
|
if(cur->e_.qchr == 'N' || cur->e_.chr == 'N') {
|
||
|
ns++;
|
||
|
}
|
||
|
} else if(cur->e_.isGap()) {
|
||
|
ngap++;
|
||
|
}
|
||
|
if(cur->e_.chr == 'N') {
|
||
|
nrefns++;
|
||
|
}
|
||
|
ned.push_back(cur->e_);
|
||
|
}
|
||
|
if(cur->root_) {
|
||
|
break;
|
||
|
}
|
||
|
cur = &bs_[cur->parentId_];
|
||
|
}
|
||
|
if(ns > prob_.nceil_) {
|
||
|
// Alignment has too many Ns in it!
|
||
|
res.reset();
|
||
|
assert(res.alres.ned().empty());
|
||
|
nrej++;
|
||
|
return BT_REJECTED_N;
|
||
|
}
|
||
|
// Update 'seenPaths_'
|
||
|
cur = br;
|
||
|
bool rejSeen = false; // set =true if we overlap prev path
|
||
|
bool rejCore = true; // set =true if we don't touch core diag
|
||
|
while(true) {
|
||
|
// Consider row, col, len, then do something
|
||
|
int64_t row = cur->row_, col = cur->col_;
|
||
|
assert_lt(row, (int64_t)prob_.qrylen_);
|
||
|
size_t fromend = prob_.qrylen_ - row - 1;
|
||
|
size_t diag = fromend + col;
|
||
|
// Calculate the diagonal within the *trimmed* rectangle,
|
||
|
// i.e. the rectangle we dealt with in align, gather and
|
||
|
// backtrack.
|
||
|
int64_t diagi = col - row;
|
||
|
// Now adjust to the diagonal within the *untrimmed*
|
||
|
// rectangle by adding on the amount trimmed from the left.
|
||
|
diagi += prob_.rect_->triml;
|
||
|
assert_lt(diag, seenPaths_.size());
|
||
|
// Does it overlap a core diagonal?
|
||
|
if(diagi >= 0) {
|
||
|
size_t diag = (size_t)diagi;
|
||
|
if(diag >= prob_.rect_->corel &&
|
||
|
diag <= prob_.rect_->corer)
|
||
|
{
|
||
|
// Yes it does - it's OK
|
||
|
rejCore = false;
|
||
|
}
|
||
|
}
|
||
|
if(lookForOlap) {
|
||
|
int64_t newlo, newhi;
|
||
|
if(cur->len_ == 0) {
|
||
|
if(prev != NULL && prev->len_ > 0) {
|
||
|
// If there's a gap at the base of a non-0 length branch, the
|
||
|
// gap will appear to overlap the branch if we give it length 1.
|
||
|
newhi = newlo = 0;
|
||
|
} else {
|
||
|
// Read or ref gap with no matches coming off of it
|
||
|
newlo = row;
|
||
|
newhi = row + 1;
|
||
|
}
|
||
|
} else {
|
||
|
// Diagonal with matches
|
||
|
newlo = row - (cur->len_ - 1);
|
||
|
newhi = row + 1;
|
||
|
}
|
||
|
assert_geq(newlo, 0);
|
||
|
assert_geq(newhi, 0);
|
||
|
// Does the diagonal cover cells?
|
||
|
if(newhi > newlo) {
|
||
|
// Check whether there is any overlap with previously traversed
|
||
|
// cells
|
||
|
bool added = false;
|
||
|
const size_t sz = seenPaths_[diag].size();
|
||
|
for(size_t i = 0; i < sz; i++) {
|
||
|
// Does the new interval overlap this already-seen
|
||
|
// interval? Also of interest: does it abut this
|
||
|
// already-seen interval? If so, we should merge them.
|
||
|
size_t lo = seenPaths_[diag][i].first;
|
||
|
size_t hi = seenPaths_[diag][i].second;
|
||
|
assert_lt(lo, hi);
|
||
|
size_t lo_sm = newlo, hi_sm = newhi;
|
||
|
if(hi - lo < hi_sm - lo_sm) {
|
||
|
swap(lo, lo_sm);
|
||
|
swap(hi, hi_sm);
|
||
|
}
|
||
|
if((lo <= lo_sm && hi > lo_sm) ||
|
||
|
(lo < hi_sm && hi >= hi_sm))
|
||
|
{
|
||
|
// One or both of the shorter interval's end points
|
||
|
// are contained in the longer interval - so they
|
||
|
// overlap.
|
||
|
rejSeen = true;
|
||
|
// Merge them into one longer interval
|
||
|
seenPaths_[diag][i].first = min(lo, lo_sm);
|
||
|
seenPaths_[diag][i].second = max(hi, hi_sm);
|
||
|
#ifndef NDEBUG
|
||
|
for(int64_t ii = seenPaths_[diag][i].first;
|
||
|
ii < (int64_t)seenPaths_[diag][i].second;
|
||
|
ii++)
|
||
|
{
|
||
|
//cerr << "trySolution rejected (" << ii << ", " << (ii + col - row) << ")" << endl;
|
||
|
}
|
||
|
#endif
|
||
|
added = true;
|
||
|
break;
|
||
|
} else if(hi == lo_sm || lo == hi_sm) {
|
||
|
// Merge them into one longer interval
|
||
|
seenPaths_[diag][i].first = min(lo, lo_sm);
|
||
|
seenPaths_[diag][i].second = max(hi, hi_sm);
|
||
|
#ifndef NDEBUG
|
||
|
for(int64_t ii = seenPaths_[diag][i].first;
|
||
|
ii < (int64_t)seenPaths_[diag][i].second;
|
||
|
ii++)
|
||
|
{
|
||
|
//cerr << "trySolution rejected (" << ii << ", " << (ii + col - row) << ")" << endl;
|
||
|
}
|
||
|
#endif
|
||
|
added = true;
|
||
|
// Keep going in case it overlaps one of the other
|
||
|
// intervals
|
||
|
}
|
||
|
}
|
||
|
if(!added) {
|
||
|
seenPaths_[diag].push_back(make_pair(newlo, newhi));
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
// After the merging that may have occurred above, it's no
|
||
|
// longer guarnateed that all the overlapping intervals in
|
||
|
// the list have been merged. That's OK though. We'll
|
||
|
// still get correct answers to overlap queries.
|
||
|
if(cur->root_) {
|
||
|
assert_eq(0, cur->parentId_);
|
||
|
break;
|
||
|
}
|
||
|
prev = cur;
|
||
|
cur = &bs_[cur->parentId_];
|
||
|
} // while(cur->e_.inited())
|
||
|
if(rejSeen) {
|
||
|
res.reset();
|
||
|
assert(res.alres.ned().empty());
|
||
|
nrej++;
|
||
|
return BT_NOT_FOUND;
|
||
|
}
|
||
|
if(rejCore) {
|
||
|
res.reset();
|
||
|
assert(res.alres.ned().empty());
|
||
|
nrej++;
|
||
|
return BT_REJECTED_CORE_DIAG;
|
||
|
}
|
||
|
off = br->leftmostCol();
|
||
|
score.score_ = prob_.targ_;
|
||
|
score.ns_ = ns;
|
||
|
score.gaps_ = ngap;
|
||
|
res.alres.setScore(score);
|
||
|
res.alres.setRefNs(nrefns);
|
||
|
size_t trimBeg = br->uppermostRow();
|
||
|
size_t trimEnd = prob_.qrylen_ - prob_.row_ - 1;
|
||
|
assert_leq(trimBeg, prob_.qrylen_);
|
||
|
assert_leq(trimEnd, prob_.qrylen_);
|
||
|
TRefOff refoff = off + prob_.refoff_ + prob_.rect_->refl;
|
||
|
res.alres.setShape(
|
||
|
prob_.refid_, // ref id
|
||
|
refoff, // 0-based ref offset
|
||
|
prob_.treflen(), // ref length
|
||
|
prob_.fw_, // aligned to Watson?
|
||
|
prob_.qrylen_, // read length
|
||
|
0, // read id
|
||
|
true, // pretrim soft?
|
||
|
0, // pretrim 5' end
|
||
|
0, // pretrim 3' end
|
||
|
true, // alignment trim soft?
|
||
|
prob_.fw_ ? trimBeg : trimEnd, // alignment trim 5' end
|
||
|
prob_.fw_ ? trimEnd : trimBeg); // alignment trim 3' end
|
||
|
return BT_FOUND;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Get the next valid alignment given a backtrace problem. Return false
|
||
|
* if there is no valid solution. Use a backtracking search to find the
|
||
|
* solution. This can be very slow.
|
||
|
*/
|
||
|
bool BtBranchTracer::nextAlignmentBacktrace(
|
||
|
size_t maxiter,
|
||
|
SwResult& res,
|
||
|
size_t& off,
|
||
|
size_t& nrej,
|
||
|
size_t& niter,
|
||
|
RandomSource& rnd)
|
||
|
{
|
||
|
assert(!empty() || !emptySolution());
|
||
|
assert(prob_.inited());
|
||
|
// There's a subtle case where we might fail to backtracing in
|
||
|
// local-alignment mode. The basic fact to remember is that when we're
|
||
|
// backtracing from the highest-scoring cell in the table, we're guaranteed
|
||
|
// to be able to backtrace without ever dipping below 0. But if we're
|
||
|
// backtracing from a cell other than the highest-scoring cell in the
|
||
|
// table, we might dip below 0. Dipping below 0 implies that there's a
|
||
|
// shorted local alignment with a better score. In which case, it's
|
||
|
// perfectly fair for us to abandon any path that dips below the floor, and
|
||
|
// this might result in the queue becoming empty before we finish.
|
||
|
bool result = false;
|
||
|
niter = 0;
|
||
|
while(!empty()) {
|
||
|
if(trySolutions(true, res, off, nrej, rnd, result)) {
|
||
|
return result;
|
||
|
}
|
||
|
if(niter++ >= maxiter) {
|
||
|
break;
|
||
|
}
|
||
|
size_t brid = best(rnd); // put best branch in 'br'
|
||
|
assert(!seen_.contains(brid));
|
||
|
ASSERT_ONLY(seen_.insert(brid));
|
||
|
#if 0
|
||
|
BtBranch *br = &bs_[brid];
|
||
|
cerr << brid
|
||
|
<< ": targ:" << prob_.targ_
|
||
|
<< ", sc:" << br->score_st_
|
||
|
<< ", row:" << br->uppermostRow()
|
||
|
<< ", nmm:" << nmm_
|
||
|
<< ", nnmm:" << nnmm_
|
||
|
<< ", nrdop:" << nrdop_
|
||
|
<< ", nrfop:" << nrfop_
|
||
|
<< ", nrdex:" << nrdex_
|
||
|
<< ", nrfex:" << nrfex_
|
||
|
<< ", nrdop_pr: " << nrdopPrune_
|
||
|
<< ", nrfop_pr: " << nrfopPrune_
|
||
|
<< ", nrdex_pr: " << nrdexPrune_
|
||
|
<< ", nrfex_pr: " << nrfexPrune_
|
||
|
<< endl;
|
||
|
#endif
|
||
|
addOffshoots(brid);
|
||
|
}
|
||
|
if(trySolutions(true, res, off, nrej, rnd, result)) {
|
||
|
return result;
|
||
|
}
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Get the next valid alignment given a backtrace problem. Return false
|
||
|
* if there is no valid solution. Use a triangle-fill backtrace to find
|
||
|
* the solution. This is usually fast (it's O(m + n)).
|
||
|
*/
|
||
|
bool BtBranchTracer::nextAlignmentFill(
|
||
|
size_t maxiter,
|
||
|
SwResult& res,
|
||
|
size_t& off,
|
||
|
size_t& nrej,
|
||
|
size_t& niter,
|
||
|
RandomSource& rnd)
|
||
|
{
|
||
|
assert(prob_.inited());
|
||
|
assert(!emptySolution());
|
||
|
bool result = false;
|
||
|
if(trySolutions(false, res, off, nrej, rnd, result)) {
|
||
|
return result;
|
||
|
}
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Get the next valid alignment given the backtrace problem. Return false
|
||
|
* if there is no valid solution, e.g., if
|
||
|
*/
|
||
|
bool BtBranchTracer::nextAlignment(
|
||
|
size_t maxiter,
|
||
|
SwResult& res,
|
||
|
size_t& off,
|
||
|
size_t& nrej,
|
||
|
size_t& niter,
|
||
|
RandomSource& rnd)
|
||
|
{
|
||
|
if(prob_.fill_) {
|
||
|
return nextAlignmentFill(
|
||
|
maxiter,
|
||
|
res,
|
||
|
off,
|
||
|
nrej,
|
||
|
niter,
|
||
|
rnd);
|
||
|
} else {
|
||
|
return nextAlignmentBacktrace(
|
||
|
maxiter,
|
||
|
res,
|
||
|
off,
|
||
|
nrej,
|
||
|
niter,
|
||
|
rnd);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
#ifdef MAIN_ALIGNER_BT
|
||
|
|
||
|
#include <iostream>
|
||
|
|
||
|
/**
 * Minimal stand-alone driver (built only when MAIN_ALIGNER_BT is defined):
 * sets up an 8-base read against an identical 8-base reference and asks the
 * tracer for one alignment, then reports the outcome.
 */
int main(int argc, char **argv) {
	size_t off = 0;
	RandomSource rnd(77);
	BtBranchTracer tr;
	Scoring sc = Scoring::base1();
	SwResult res;
	// NOTE(review): BtBranchTracer::init is declared outside this file's
	// visible portion; this argument list is kept as it was -- confirm it
	// still matches the current declaration.
	tr.init(
		"ACGTACGT", // in: read sequence
		"IIIIIIII", // in: quality sequence
		8,          // in: read sequence length
		"ACGTACGT", // in: reference sequence
		8,          // in: reference sequence length
		0,          // in: reference id
		0,          // in: reference offset
		true,       // in: orientation
		sc,         // in: scoring scheme
		0,          // in: N ceiling
		8,          // in: alignment score
		7,          // start in this row
		7,          // start in this column
		rnd);       // random gen, to choose among equal paths
	size_t nrej = 0;
	size_t niter = 0;
	// nextAlignment takes an iteration cap and an iteration counter in
	// addition to the result/offset/reject-count/random arguments
	const size_t maxiter = 1000;
	const bool found = tr.nextAlignment(
		maxiter,
		res,
		off,
		nrej,
		niter,
		rnd);
	std::cout << "found: " << (found ? "yes" : "no")
	          << ", off: " << off
	          << ", nrej: " << nrej
	          << ", niter: " << niter << '\n';
	return 0;
}
|
||
|
|
||
|
#endif /*def MAIN_ALIGNER_BT*/
|