hisat-3n/aligner_bt.h
2025-01-18 21:09:52 +08:00

948 lines
30 KiB
C++

/*
* Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
*
* This file is part of Bowtie 2.
*
* Bowtie 2 is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Bowtie 2 is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef ALIGNER_BT_H_
#define ALIGNER_BT_H_
#include <utility>
#include <stdint.h>
#include "aligner_sw_common.h"
#include "aligner_result.h"
#include "scoring.h"
#include "edit.h"
#include "limit.h"
#include "dp_framer.h"
#include "sse_util.h"
/* Say we've filled in a DP matrix in a cost-only manner, not saving the scores
* for each of the cells. At the end, we obtain a list of candidate cells and
* we'd like to backtrace from them. The per-cell scores are gone, but we have
* to re-create the correct path somehow. Hopefully we can do this without
* recreating most or al of the score matrix, since this takes too much memory.
*
* Approach 1: Naively refill the matrix.
*
* Just refill the matrix, perhaps backwards starting from the backtrace cell.
* Since this involves recreating all or most of the score matrix, this is not
* a good approach.
*
* Approach 2: Naive backtracking.
*
* Conduct a search through the space of possible backtraces, rooted at the
* candidate cell. To speed things along, we can prioritize paths that have a
* high score and that align more characters from the read.
*
* The approach is simple, but it's neither fast nor memory-efficient in
* general.
*
* Approach 3: Refilling with checkpoints.
*
* Refill the matrix "backwards" starting from the candidate cell, but use
* checkpoints to ensure that only a series of relatively small triangles or
* rectangles need to be refilled. The checkpoints must include elements from
* the H, E and F matrices; not just H. After each refill, we backtrace
* through the refilled area, then discard/reuse the fill memory. I call each
* such fill/backtrace a mini-fill/backtrace.
*
* If there's only one path to be found, then this is O(m+n). But what if
* there are many? And what if we would like to avoid paths that overlap in
* one or more cells? There are two ways we can make this more efficient:
*
* 1. Remember the re-calculated E/F/H values and try to retrieve them
* 2. Keep a record of cells that have already been traversed
*
* Legend:
*
* 1: Candidate cell
* 2: Final cell from first mini-fill/backtrace
* 3: Final cell from second mini-fill/backtrace (third not shown)
* +: Checkpointed cell
* *: Cell filled from first or second mini-fill/backtrace
* -: Unfilled cell
*
* ---++--------++--------++----
* --++--------++*-------++-----
* -++--(etc)-++**------++------
* ++--------+3***-----++-------
* +--------++****----++--------
* --------++*****---++--------+
* -------++******--++--------++
* ------++*******-++*-------++-
* -----++********++**------++--
* ----++********2+***-----++---
* ---++--------++****----++----
* --++--------++*****---++-----
* -++--------++*****1--++------
* ++--------++--------++-------
*
* Approach 4: Backtracking with checkpoints.
*
* Conduct a search through the space of possible backtraces, rooted at the
* candidate cell. Use "checkpoints" to prune. That is, when a backtrace
* moves through a cell with a checkpointed score, consider the score
* accumulated so far and the cell's saved score; abort if those two scores
* add to something less than a valid score. Note we're only checkpointing H
* in this case (possibly; see "subtle point"), not E or F.
*
* Subtle point: checkpoint scores are a result of moving forward through
* the matrix whereas backtracking scores result from moving backward. This
* matters becuase the two paths that meet up at a cell might have both
* factored in a gap open penalty for the same gap, in which case we will
* underestimate the overall score and prune a good path. Here are two ideas
* for how to resolve this:
*
* Idea 1: when we combine the forward and backward scores to find an overall
* score, and our backtrack procedure *just* made a horizontal or vertical
* move, add in a "bonus" equal to the gap open penalty of the appropraite
* type (read gap open for horizontal, ref gap open for vertical). This might
* overcompensate, since
*
* Idea 2: keep the E and F values for the checkpoints around, in addition to
* the H values. When it comes time to combine the score from the forward
* and backward paths, we consider the last move we made in the backward
* backtrace. If it's a read gap (horizontal move), then we calculate the
* overall score as:
*
* max(Score-backward + H-forward, Score-backward + E-forward + read-open)
*
* If it's a reference gap (vertical move), then we calculate the overall
* score as:
*
* max(Score-backward + H-forward, Score-backward + F-forward + ref-open)
*
* What does it mean to abort a backtrack? If we're starting a new branch
* and there is a checkpoing in the bottommost cell of the branch, and the
* overall score is less than the target, then we can simply ignore the
* branch. If the checkpoint occurs in the middle of a string of matches, we
* need to curtail the branch such that it doesn't include the checkpointed
* cell and we won't ever try to enter the checkpointed cell, e.g., on a
* mismatch.
*
* Approaches 3 and 4 seem reasonable, and could be combined. For simplicity,
* we implement only approach 4 for now.
*
* Checkpoint information is propagated from the fill process to the backtracer
* via a
*/
enum {
BT_NOT_FOUND = 1, // could not obtain the backtrace because it
// overlapped a previous solution
BT_FOUND, // obtained a valid backtrace
BT_REJECTED_N, // backtrace rejected because it had too many Ns
BT_REJECTED_CORE_DIAG // backtrace rejected because it failed to overlap a
// core diagonal
};
/**
* Parameters for a matrix of potential backtrace problems to solve.
* Encapsulates information about:
*
* The problem given a particular reference substring:
*
* - The query string (nucleotides and qualities)
* - The reference substring (incl. orientation, offset into overall sequence)
* - Checkpoints (i.e. values of matrix cells)
* - Scoring scheme and other thresholds
*
* The problem given a particular reference substring AND a particular row and
* column from which to backtrace:
*
* - The row and column
* - The target score
*/
class BtBranchProblem {
public:
/**
* Create new uninitialized problem.
*/
BtBranchProblem() { reset(); }
/**
* Initialize a new problem.
*/
void initRef(
const char *qry, // query string (along rows)
const char *qual, // query quality string (along rows)
size_t qrylen, // query string (along rows) length
const char *ref, // reference string (along columns)
TRefOff reflen, // in-rectangle reference string length
TRefOff treflen,// total reference string length
TRefId refid, // reference id
TRefOff refoff, // reference offset
bool fw, // orientation of problem
const DPRect* rect, // dynamic programming rectangle filled out
const Checkpointer* cper, // checkpointer
const Scoring *sc, // scoring scheme
size_t nceil) // max # Ns allowed in alignment
{
qry_ = qry;
qual_ = qual;
qrylen_ = qrylen;
ref_ = ref;
reflen_ = reflen;
treflen_ = treflen;
refid_ = refid;
refoff_ = refoff;
fw_ = fw;
rect_ = rect;
cper_ = cper;
sc_ = sc;
nceil_ = nceil;
}
/**
* Initialize a new problem.
*/
void initBt(
size_t row, // row
size_t col, // column
bool fill, // use a filling rather than a backtracking strategy
bool usecp, // use checkpoints to short-circuit while backtracking
TAlScore targ) // target score
{
row_ = row;
col_ = col;
targ_ = targ;
fill_ = fill;
usecp_ = usecp;
if(fill) {
assert(usecp_);
}
}
/**
* Reset to uninitialized state.
*/
void reset() {
qry_ = qual_ = ref_ = NULL;
cper_ = NULL;
rect_ = NULL;
sc_ = NULL;
qrylen_ = reflen_ = treflen_ = refid_ = refoff_ = row_ = col_ = targ_ = nceil_ = 0;
fill_ = fw_ = usecp_ = false;
}
/**
* Return true iff the BtBranchProblem has been initialized.
*/
bool inited() const {
return qry_ != NULL;
}
#ifndef NDEBUG
/**
* Sanity-check the problem.
*/
bool repOk() const {
assert_gt(qrylen_, 0);
assert_gt(reflen_, 0);
assert_gt(treflen_, 0);
assert_lt(row_, qrylen_);
assert_lt((TRefOff)col_, reflen_);
return true;
}
#endif
size_t reflen() const { return reflen_; }
size_t treflen() const { return treflen_; }
protected:
const char *qry_; // query string (along rows)
const char *qual_; // query quality string (along rows)
size_t qrylen_; // query string (along rows) length
const char *ref_; // reference string (along columns)
TRefOff reflen_; // in-rectangle reference string length
TRefOff treflen_;// total reference string length
TRefId refid_; // reference id
TRefOff refoff_; // reference offset
bool fw_; // orientation of problem
const DPRect* rect_; // dynamic programming rectangle filled out
size_t row_; // starting row
size_t col_; // starting column
TAlScore targ_; // target score
const Checkpointer *cper_; // checkpointer
bool fill_; // use mini-fills
bool usecp_; // use checkpointing?
const Scoring *sc_; // scoring scheme
size_t nceil_; // max # Ns allowed in alignment
friend class BtBranch;
friend class BtBranchQ;
friend class BtBranchTracer;
};
/**
* Encapsulates a "branch" which is a diagonal of cells (possibly of length 0)
* in the matrix where all the cells are matches. These stretches are linked
* together by edits to form a full backtrace path through the matrix. Lengths
* are measured w/r/t to the number of rows traversed by the path, so a branch
* that represents a read gap extension could have length = 0.
*
* At the end of the day, the full backtrace path is represented as a list of
* BtBranch's where each BtBranch represents a stretch of matching cells (and
* up to one mismatching cell at its bottom extreme) ending in an edit (or in
* the bottommost row, in which case the edit is uninitialized). Each
* BtBranch's row and col fields indicate the bottommost cell involved in the
* diagonal stretch of matches, and the len_ field indicates the length of the
* stretch of matches. Note that the edits themselves also correspond to
* movement through the matrix.
*
* A related issue is how we record which cells have been visited so that we
* never report a pair of paths both traversing the same (row, col) of the
* overall DP matrix. This gets a little tricky because we have to take into
* account the cells covered by *edits* in addition to the cells covered by the
* stretches of matches. For instance: imagine a mismatch. That takes up a
* cell of the DP matrix, but it may or may not be preceded by a string of
* matches. It's hard to imagine how to represent this unless we let the
* mismatch "count toward" the len_ of the branch and let (row, col) refer to
* the cell where the mismatch occurs.
*
* We need BtBranches to "live forever" so that we can make some BtBranches
* parents of others using parent pointers. For this reason, BtBranch's are
* stored in an EFactory object in the BtBranchTracer class.
*/
class BtBranch {
public:
BtBranch() { reset(); }
BtBranch(
const BtBranchProblem& prob,
size_t parentId,
TAlScore penalty,
TAlScore score_en,
int64_t row,
int64_t col,
Edit e,
int hef,
bool root,
bool extend)
{
init(prob, parentId, penalty, score_en, row, col, e, hef, root, extend);
}
/**
* Reset to uninitialized state.
*/
void reset() {
parentId_ = 0;
score_st_ = score_en_ = len_ = row_ = col_ = 0;
curtailed_ = false;
e_.reset();
}
/**
* Caller gives us score_en, row and col. We figure out score_st and len_
* by comparing characters from the strings.
*/
void init(
const BtBranchProblem& prob,
size_t parentId,
TAlScore penalty,
TAlScore score_en,
int64_t row,
int64_t col,
Edit e,
int hef,
bool root,
bool extend);
/**
* Return true iff this branch ends in a solution to the backtrace problem.
*/
bool isSolution(const BtBranchProblem& prob) const {
const bool end2end = prob.sc_->monotone;
return score_st_ == prob.targ_ && (!end2end || endsInFirstRow());
}
/**
* Return true iff this branch could potentially lead to a valid alignment.
*/
bool isValid(const BtBranchProblem& prob) const {
int64_t scoreFloor = prob.sc_->monotone ? MIN_I64 : 0;
if(score_st_ < scoreFloor) {
// Dipped below the score floor
return false;
}
if(isSolution(prob)) {
// It's a solution, so it's also valid
return true;
}
if((int64_t)len_ > row_) {
// Went all the way to the top row
//assert_leq(score_st_, prob.targ_);
return score_st_ == prob.targ_;
} else {
int64_t match = prob.sc_->match();
int64_t bonusLeft = (row_ + 1 - len_) * match;
return score_st_ + bonusLeft >= prob.targ_;
}
}
/**
* Return true iff this branch overlaps with the given branch.
*/
bool overlap(const BtBranchProblem& prob, const BtBranch& bt) const {
// Calculate this branch's diagonal
assert_lt(row_, (int64_t)prob.qrylen_);
size_t fromend = prob.qrylen_ - row_ - 1;
size_t diag = fromend + col_;
int64_t lo = 0, hi = row_ + 1;
if(len_ == 0) {
lo = row_;
} else {
lo = row_ - (len_ - 1);
}
// Calculate other branch's diagonal
assert_lt(bt.row_, (int64_t)prob.qrylen_);
size_t ofromend = prob.qrylen_ - bt.row_ - 1;
size_t odiag = ofromend + bt.col_;
if(diag != odiag) {
return false;
}
int64_t olo = 0, ohi = bt.row_ + 1;
if(bt.len_ == 0) {
olo = bt.row_;
} else {
olo = bt.row_ - (bt.len_ - 1);
}
int64_t losm = olo, hism = ohi;
if(hi - lo < ohi - olo) {
swap(lo, losm);
swap(hi, hism);
}
if((lo <= losm && hi > losm) || (lo < hism && hi >= hism)) {
return true;
}
return false;
}
/**
* Return true iff this branch is higher priority than the branch 'o'.
*/
bool operator<(const BtBranch& o) const {
// Prioritize uppermost above score
if(uppermostRow() != o.uppermostRow()) {
return uppermostRow() < o.uppermostRow();
}
if(score_st_ != o.score_st_) return score_st_ > o.score_st_;
if(row_ != o.row_) return row_ < o.row_;
if(col_ != o.col_) return col_ > o.col_;
if(parentId_ != o.parentId_) return parentId_ > o.parentId_;
assert(false);
return false;
}
/**
* Return true iff the topmost cell involved in this branch is in the top
* row.
*/
bool endsInFirstRow() const {
assert_leq((int64_t)len_, row_ + 1);
return (int64_t)len_ == row_+1;
}
/**
* Return the uppermost row covered by this branch.
*/
size_t uppermostRow() const {
assert_geq(row_ + 1, (int64_t)len_);
return row_ + 1 - (int64_t)len_;
}
/**
* Return the leftmost column covered by this branch.
*/
size_t leftmostCol() const {
assert_geq(col_ + 1, (int64_t)len_);
return col_ + 1 - (int64_t)len_;
}
#ifndef NDEBUG
/**
* Sanity-check this BtBranch.
*/
bool repOk() const {
assert(root_ || e_.inited());
assert_gt(len_, 0);
assert_geq(col_ + 1, (int64_t)len_);
assert_geq(row_ + 1, (int64_t)len_);
return true;
}
#endif
protected:
// ID of the parent branch.
size_t parentId_;
// Penalty associated with the edit at the bottom of this branch (0 if
// there is no edit)
TAlScore penalty_;
// Score at the beginning of the branch
TAlScore score_st_;
// Score at the end of the branch (taking the edit into account)
TAlScore score_en_;
// Length of the branch. That is, the total number of diagonal cells
// involved in all the matches and in the edit (if any). Should always be
// > 0.
size_t len_;
// The row of the final (bottommost) cell in the branch. This might be the
// bottommost match if the branch has no associated edit. Otherwise, it's
// the cell occupied by the edit.
int64_t row_;
// The column of the final (bottommost) cell in the branch.
int64_t col_;
// The edit at the bottom of the branch. If this is the bottommost branch
// in the alignment and it does not end in an edit, then this remains
// uninitialized.
Edit e_;
// True iff this is the bottommost branch in the alignment. We can't just
// use row_ to tell us this because local alignments don't necessarily end
// in the last row.
bool root_;
bool curtailed_; // true -> pruned at a checkpoint where we otherwise
// would have had a match
friend class BtBranchQ;
friend class BtBranchTracer;
};
/**
* Instantiate and solve best-first branch-based backtraces.
*/
class BtBranchTracer {
public:
explicit BtBranchTracer() :
prob_(), bs_(), seenPaths_(DP_CAT), sawcell_(DP_CAT), doTri_() { }
/**
* Add a branch to the queue.
*/
void add(size_t id) {
assert(!bs_[id].isSolution(prob_));
unsorted_.push_back(make_pair(bs_[id].score_st_, id));
}
/**
* Add a branch to the list of solutions.
*/
void addSolution(size_t id) {
assert(bs_[id].isSolution(prob_));
solutions_.push_back(id);
}
/**
* Given a potential branch to add to the queue, see if we can follow the
* branch a little further first. If it's still valid, or if we reach a
* choice between valid outgoing paths, go ahead and add it to the queue.
*/
void examineBranch(
int64_t row,
int64_t col,
const Edit& e,
TAlScore pen,
TAlScore sc,
size_t parentId);
/**
* Take all possible ways of leaving the given branch and add them to the
* branch queue.
*/
void addOffshoots(size_t bid);
/**
* Get the best branch and remove it from the priority queue.
*/
size_t best(RandomSource& rnd) {
assert(!empty());
flushUnsorted();
assert_gt(sortedSel_ ? sorted1_.size() : sorted2_.size(), cur_);
// Perhaps shuffle everyone who's tied for first?
size_t id = sortedSel_ ? sorted1_[cur_] : sorted2_[cur_];
cur_++;
return id;
}
/**
* Return true iff there are no branches left to try.
*/
bool empty() const {
return size() == 0;
}
/**
* Return the size, i.e. the total number of branches contained.
*/
size_t size() const {
return unsorted_.size() +
(sortedSel_ ? sorted1_.size() : sorted2_.size()) - cur_;
}
/**
* Return true iff there are no solutions left to try.
*/
bool emptySolution() const {
return sizeSolution() == 0;
}
/**
* Return the size of the solution set so far.
*/
size_t sizeSolution() const {
return solutions_.size();
}
/**
* Sort unsorted branches, merge them with master sorted list.
*/
void flushUnsorted();
#ifndef NDEBUG
/**
* Sanity-check the queue.
*/
bool repOk() const {
assert_lt(cur_, (sortedSel_ ? sorted1_.size() : sorted2_.size()));
return true;
}
#endif
/**
* Initialize the tracer with respect to a new read. This involves
* resetting all the state relating to the set of cells already visited
*/
void initRef(
const char* rd, // in: read sequence
const char* qu, // in: quality sequence
size_t rdlen, // in: read sequence length
const char* rf, // in: reference sequence
size_t rflen, // in: in-rectangle reference sequence length
TRefOff trflen, // in: total reference sequence length
TRefId refid, // in: reference id
TRefOff refoff, // in: reference offset
bool fw, // in: orientation
const DPRect *rect, // in: DP rectangle
const Checkpointer *cper, // in: checkpointer
const Scoring& sc, // in: scoring scheme
size_t nceil) // in: N ceiling
{
prob_.initRef(rd, qu, rdlen, rf, rflen, trflen, refid, refoff, fw, rect, cper, &sc, nceil);
const size_t ndiag = rflen + rdlen - 1;
seenPaths_.resize(ndiag);
for(size_t i = 0; i < ndiag; i++) {
seenPaths_[i].clear();
}
// clear each of the per-column sets
if(sawcell_.size() < rflen) {
size_t isz = sawcell_.size();
sawcell_.resize(rflen);
for(size_t i = isz; i < rflen; i++) {
sawcell_[i].setCat(DP_CAT);
}
}
for(size_t i = 0; i < rflen; i++) {
sawcell_[i].setCat(DP_CAT);
sawcell_[i].clear(); // clear the set
}
}
/**
* Initialize with a new backtrace.
*/
void initBt(
TAlScore escore, // in: alignment score
size_t row, // in: start in this row
size_t col, // in: start in this column
bool fill, // in: use mini-filling?
bool usecp, // in: use checkpointing?
bool doTri, // in: triangle-shaped mini-fills?
RandomSource& rnd) // in: random gen, to choose among equal paths
{
prob_.initBt(row, col, fill, usecp, escore);
Edit e; e.reset();
unsorted_.clear();
solutions_.clear();
sorted1_.clear();
sorted2_.clear();
cur_ = 0;
nmm_ = 0; // number of mismatches attempted
nnmm_ = 0; // number of mismatches involving N attempted
nrdop_ = 0; // number of read gap opens attempted
nrfop_ = 0; // number of ref gap opens attempted
nrdex_ = 0; // number of read gap extensions attempted
nrfex_ = 0; // number of ref gap extensions attempted
nmmPrune_ = 0; // number of mismatches attempted
nnmmPrune_ = 0; // number of mismatches involving N attempted
nrdopPrune_ = 0; // number of read gap opens attempted
nrfopPrune_ = 0; // number of ref gap opens attempted
nrdexPrune_ = 0; // number of read gap extensions attempted
nrfexPrune_ = 0; // number of ref gap extensions attempted
row_ = row;
col_ = col;
doTri_ = doTri;
bs_.clear();
if(!prob_.fill_) {
size_t id = bs_.alloc();
bs_[id].init(
prob_,
0, // parent id
0, // penalty
0, // starting score
row, // row
col, // column
e,
0,
true, // this is the root
true); // this should be extend with exact matches
if(bs_[id].isSolution(prob_)) {
addSolution(id);
} else {
add(id);
}
} else {
int64_t row = row_, col = col_;
TAlScore targsc = prob_.targ_;
int hef = 0;
bool done = false, abort = false;
size_t depth = 0;
while(!done && !abort) {
// Accumulate edits as we go. We can do this by adding
// BtBranches to the bs_ structure. Each step of the backtrace
// either involves an edit (thereby starting a new branch) or
// extends the previous branch by one more position.
//
// Note: if the BtBranches are in line, then trySolution can be
// used to populate the SwResult and check for various
// situations where we might reject the alignment (i.e. due to
// a cell having been visited previously).
if(doTri_) {
triangleFill(
row, // row of cell to backtrace from
col, // column of cell to backtrace from
hef, // cell to bt from: H (0), E (1), or F (2)
targsc, // score of cell to backtrace from
prob_.targ_, // score of alignment we're looking for
rnd, // pseudo-random generator
row, // out: row we ended up in after bt
col, // out: column we ended up in after bt
hef, // out: H/E/F after backtrace
targsc, // out: score up to cell we ended up in
done, // out: finished tracing out an alignment?
abort); // out: aborted b/c cell was seen before?
} else {
squareFill(
row, // row of cell to backtrace from
col, // column of cell to backtrace from
hef, // cell to bt from: H (0), E (1), or F (2)
targsc, // score of cell to backtrace from
prob_.targ_, // score of alignment we're looking for
rnd, // pseudo-random generator
row, // out: row we ended up in after bt
col, // out: column we ended up in after bt
hef, // out: H/E/F after backtrace
targsc, // out: score up to cell we ended up in
done, // out: finished tracing out an alignment?
abort); // out: aborted b/c cell was seen before?
}
if(depth >= ndep_.size()) {
ndep_.resize(depth+1);
ndep_[depth] = 1;
} else {
ndep_[depth]++;
}
depth++;
assert((row >= 0 && col >= 0) || done);
}
}
ASSERT_ONLY(seen_.clear());
}
/**
* Get the next valid alignment given the backtrace problem. Return false
* if there is no valid solution, e.g., if
*/
bool nextAlignment(
size_t maxiter,
SwResult& res,
size_t& off,
size_t& nrej,
size_t& niter,
RandomSource& rnd);
/**
* Return true iff this tracer has been initialized
*/
bool inited() const {
return prob_.inited();
}
/**
* Return true iff the mini-fills are triangle-shaped.
*/
bool doTri() const { return doTri_; }
/**
* Fill in a triangle of the DP table and backtrace from the given cell to
* a cell in the previous checkpoint, or to the terminal cell.
*/
void triangleFill(
int64_t rw, // row of cell to backtrace from
int64_t cl, // column of cell to backtrace from
int hef, // cell to backtrace from is H (0), E (1), or F (2)
TAlScore targ, // score of cell to backtrace from
TAlScore targ_final, // score of alignment we're looking for
RandomSource& rnd, // pseudo-random generator
int64_t& row_new, // out: row we ended up in after backtrace
int64_t& col_new, // out: column we ended up in after backtrace
int& hef_new, // out: H/E/F after backtrace
TAlScore& targ_new, // out: score up to cell we ended up in
bool& done, // out: finished tracing out an alignment?
bool& abort); // out: aborted b/c cell was seen before?
/**
* Fill in a square of the DP table and backtrace from the given cell to
* a cell in the previous checkpoint, or to the terminal cell.
*/
void squareFill(
int64_t rw, // row of cell to backtrace from
int64_t cl, // column of cell to backtrace from
int hef, // cell to backtrace from is H (0), E (1), or F (2)
TAlScore targ, // score of cell to backtrace from
TAlScore targ_final, // score of alignment we're looking for
RandomSource& rnd, // pseudo-random generator
int64_t& row_new, // out: row we ended up in after backtrace
int64_t& col_new, // out: column we ended up in after backtrace
int& hef_new, // out: H/E/F after backtrace
TAlScore& targ_new, // out: score up to cell we ended up in
bool& done, // out: finished tracing out an alignment?
bool& abort); // out: aborted b/c cell was seen before?
protected:
/**
* Get the next valid alignment given a backtrace problem. Return false
* if there is no valid solution. Use a backtracking search to find the
* solution. This can be very slow.
*/
bool nextAlignmentBacktrace(
size_t maxiter,
SwResult& res,
size_t& off,
size_t& nrej,
size_t& niter,
RandomSource& rnd);
/**
* Get the next valid alignment given a backtrace problem. Return false
* if there is no valid solution. Use a triangle-fill backtrace to find
* the solution. This is usually fast (it's O(m + n)).
*/
bool nextAlignmentFill(
size_t maxiter,
SwResult& res,
size_t& off,
size_t& nrej,
size_t& niter,
RandomSource& rnd);
/**
* Try all the solutions accumulated so far. Solutions might be rejected
* if they, for instance, overlap a previous solution, have too many Ns,
* fail to overlap a core diagonal, etc.
*/
bool trySolutions(
bool lookForOlap,
SwResult& res,
size_t& off,
size_t& nrej,
RandomSource& rnd,
bool& success);
/**
* See if a given solution branch works as a solution (i.e. doesn't overlap
* another one, have too many Ns, fail to overlap a core diagonal, etc.)
*/
int trySolution(
size_t id,
bool lookForOlap,
SwResult& res,
size_t& off,
size_t& nrej,
RandomSource& rnd);
BtBranchProblem prob_; // problem configuration
EFactory<BtBranch> bs_; // global BtBranch factory
// already reported alignments going through these diagonal segments
ELList<std::pair<size_t, size_t> > seenPaths_;
ELSet<size_t> sawcell_; // cells already backtraced through
EList<std::pair<TAlScore, size_t> > unsorted_; // unsorted list of as-yet-unflished BtBranches
EList<size_t> sorted1_; // list of BtBranch, sorted by score
EList<size_t> sorted2_; // list of BtBranch, sorted by score
EList<size_t> solutions_; // list of solution branches
bool sortedSel_; // true -> 1, false -> 2
size_t cur_; // cursor into sorted list to start from
size_t nmm_; // number of mismatches attempted
size_t nnmm_; // number of mismatches involving N attempted
size_t nrdop_; // number of read gap opens attempted
size_t nrfop_; // number of ref gap opens attempted
size_t nrdex_; // number of read gap extensions attempted
size_t nrfex_; // number of ref gap extensions attempted
size_t nmmPrune_; //
size_t nnmmPrune_; //
size_t nrdopPrune_; //
size_t nrfopPrune_; //
size_t nrdexPrune_; //
size_t nrfexPrune_; //
size_t row_; // row
size_t col_; // column
bool doTri_; // true -> fill in triangles; false -> squares
EList<CpQuad> sq_; // square to fill when doing mini-fills
ELList<CpQuad> tri_; // triangle to fill when doing mini-fills
EList<size_t> ndep_; // # triangles mini-filled at various depths
#ifndef NDEBUG
ESet<size_t> seen_; // seedn branch ids; should never see same twice
#endif
};
#endif /*ndef ALIGNER_BT_H_*/