/*
 * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
 *
 * This file is part of Bowtie 2.
 *
 * Bowtie 2 is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Bowtie 2 is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
 */

#ifndef ALIGNER_SEED2_H_
#define ALIGNER_SEED2_H_

/**
 * The user of the DescentDriver class specifies a collection of search roots.
 * Logic for picking these search roots is located elsewhere, not in this
 * module.  The search roots are annotated with a priority score, which
 * indicates how promising each is as a starting point for the search.
 *
 * The heap is a min-heap over pairs, where the first element of each pair is
 * the score associated with a descent and the second element of each pair is
 * the descent ID.
 *
 * Weeding out redundant descents is key; otherwise we end up reporting slight
 * variations on the same alignment repeatedly, including variations with poor
 * scores.  What criteria do we use to determine whether two paths are
 * redundant?
 *
 * Here's an example where the same set of read characters have been aligned in
 * all three cases:
 *
 * Alignment 1 (sc = 0):
 * Rd: GCTATATAGCGCGCTCGCATCATTTTGTGT
 *     ||||||||||||||||||||||||||||||
 * Rf: GCTATATAGCGCGCTCGCATCATTTTGTGT
 *
 * Alignment 2 (sc = -22):
 * Rd: GCTATATAGCGCGCTCGCATCATTTTGTGT
 *     |||||||||||||||||||||||  | |||
 * Rf: GCTATATAGCGCGCTCGCATCAT--TTTGT
 *
 * Alignment 3 (sc = -22):
 * Rd: GCTATATAGCGCGCTCGCATCATT--TTGTGT
 *     ||||||||||||||||||||||||   |||||
 * Rf: GCTATATAGCGCGCTCGCATCATTTTGTGTGT
 *
 * Rf from aln 1: GCTATATAGCGCGCTCGCATCATTTTGTGT
 * Rf from aln 2: GCTATATAGCGCGCTCGCATCATTTTGT
 * Rf from aln 3: GCTATATAGCGCGCTCGCATCATTTTGTGTGT
 *
 * Are alignments 2 and 3 redundant with alignment 1?  We can't totally say
 * without knowing the associated SA ranges.  Take alignments 1 and 2.  Either
 * the SA ranges are the same or the SA range for 2 contains the SA range for
 * 1.  If they're the same, then alignment 2 is redundant with alignment 1.
 * Otherwise, *some* of the elements in the SA range for alignment 2 are not
 * redundant.
 *
 * In that example, the same read characters are aligned in all three
 * alignments.  Is it possible and profitable to consider scenarios where an
 * alignment might be redundant with another alignment that aligns a different
 * set of read characters?
 *
 * Another question is *when* do we try to detect the redundancy?  Before we
 * try to extend through the matches, or after.  After is easier, but less work
 * has been avoided.
 *
 * What data structure do we query to determine whether there's redundancy?
 * The situation is harder when we try to detect overlaps between SA ranges
 * rather than identical SA ranges.  Maybe: read intervals -> intersection
 * tree -> penalties.
 *
 * 1. If we're introducing a gap and we could have introduced it deeper in the
 *    descent with the same effect w/r/t homopolymer length.
 * 2. If we have Descent A with penalty B and Descent a with penalty b, and A
 *    aligns read characters [X, Y] to SA range [Z, W], and B aligns read
 *    characters [x, y] to SA range [z, w], then A is redundant with B if
 *    [x, y] is within [X, Y].
 *
 * Found an alignment with total penalty = 3
 * GCAATATAGCGCGCTCGCATCATTTTGTGT
 * || |||||||||||||||||||||||||||
 * GCTATATAGCGCGCTCGCATCATTTTGTGT
 *
 * Found an alignment with total penalty = 27
 * gCAATATAGCGCGCTCGCATCATTTTGTGT
 *  |   ||||||||||||||||||||||||
 *  TATA-TAGCGCGCTCGCATCATTTTGTGT
 */

#include <stdint.h>
#include <math.h>
#include <utility>
#include <limits>
#include "assert_helpers.h"
#include "random_util.h"
#include "aligner_result.h"
#include "gfm.h"
#include "simple_func.h"
#include "scoring.h"
#include "edit.h"
#include "read.h"
#include "ds.h"
#include "group_walk.h"
#include "btypes.h"

typedef size_t TReadOff;
typedef int64_t TScore;
typedef float TRootPri;
typedef size_t TDescentId;
typedef size_t TRootId;

/**
 * Enum encapsulating a few different policies for how we might extend descents
 * in the direction opposite from their primary direction.
 */
enum {
    // Never extend in the direction opposite from the primary.  Just go in
    // the primary direction until the bounce.
    DESC_EX_NONE = 1,

    // When we're finished extending out the matches for a descent, try to
    // extend in the opposite direction in a way that extends all branches
    // simultaneously.  The Descent.nex_ field contains the number of positions
    // we were able to extend through in this way.
    DESC_EX_FROM_1ST_BRANCH = 2,

    // Each time we add an edge to the summary, extend it in the opposite
    // direction.  The DescentEdge.nex field contains the number of positions
    // we were able to extend through, and this in turn gets propagated to
    // Descent.nex_ if and when we branch from the DescentEdge.
    DESC_EX_EACH_EDGE = 3
};

/**
 * Counters to keep track of how much work is being done.
 */
struct DescentMetrics {

    DescentMetrics() { reset(); }

    void reset() {
        bwops = bwops_1 = bwops_bi = recalc = branch = branch_mm =
        branch_del = branch_ins = heap_max = descent_max = descentpos_max =
        nex = 0;
    }

    uint64_t bwops;          // # FM Index ops
    uint64_t bwops_1;        // # LF1 FM Index ops
    uint64_t bwops_bi;       // # BiEx FM Index ops
    uint64_t recalc;         // # times outgoing edge summary was recalculated
    uint64_t branch;         // # times we descended from another descent
    uint64_t branch_mm;      // # times branch was on a mismatch
    uint64_t branch_del;     // # times branch was on a deletion
    uint64_t branch_ins;     // # times branch was on an insertion
    uint64_t heap_max;       // maximum size of Descent heap
    uint64_t descent_max;    // maximum size of Descent factory
    uint64_t descentpos_max; // maximum size of DescentPos factory
    uint64_t nex;            // # extensions
};

/**
 * Priority used to rank which descent we should branch from next.  Right now,
 * priority is governed by a 4-tuple.  From higher to lower priority:
 *
 * 1. Penalty accumulated so far
 * 2. Depth into the search space, including extensions
 * 3. Width of the SA range (i.e. uniqueness)
 * 4. Root priority
 */
struct DescentPriority {

    DescentPriority() { reset(); }

    DescentPriority(
        TScore pen_,
        size_t depth_,
        TIndexOffU width_,
        float rootpri_)
    {
        pen = pen_;
        depth = depth_;
        width = width_;
        rootpri = rootpri_;
    }

    /**
     * Initialize new DescentPriority.
     */
    void init(TScore pen_, size_t depth_, TIndexOffU width_, float rootpri_) {
        pen = pen_;
        depth = depth_;
        width = width_;
        rootpri = rootpri_;
    }

    /**
     * Reset to uninitialized state.
     */
    void reset() {
        width = 0;
    }

    /**
     * Return true iff DescentPriority is initialized.
     */
    bool inited() const {
        return width > 0;
    }

    /**
     * Return true iff this priority is prior to given priority.
     */
    bool operator<(const DescentPriority& o) const {
        assert(inited());
        assert(o.inited());
        // 1st priority: penalty accumulated so far
        if(pen < o.pen) return true;
        if(pen > o.pen) return false;
        // 2nd priority: depth into the search space, including extensions
        if(depth > o.depth) return true;
        if(depth < o.depth) return false;
        // 3rd priority: width of the SA range (i.e. uniqueness)
        if(width < o.width) return true;
        if(width > o.width) return false;
        // 4th priority: root priority
        if(rootpri > o.rootpri) return true;
        return false;
    }

    /**
     * Return true iff this priority is prior to or equal to given priority.
     */
    bool operator<=(const DescentPriority& o) const {
        assert(inited());
        assert(o.inited());
        // 1st priority: penalty accumulated so far
        if(pen < o.pen) return true;
        if(pen > o.pen) return false;
        // 2nd priority: depth into the search space, including extensions
        if(depth > o.depth) return true;
        if(depth < o.depth) return false;
        // 3rd priority: width of the SA range (i.e. uniqueness)
        if(width < o.width) return true;
        if(width > o.width) return false;
        // 4th priority: root priority
        if(rootpri > o.rootpri) return true;
        if(rootpri < o.rootpri) return false;
        return true;
    }

    /**
     * Return true iff this priority is equal to the given priority.
     */
    bool operator==(const DescentPriority& o) const {
        assert(inited());
        assert(o.inited());
        return pen == o.pen && depth == o.depth && width == o.width && rootpri == o.rootpri;
    }

    TScore     pen;     // total penalty accumulated so far
    size_t     depth;   // depth from root of descent
    TIndexOffU width;   // width of the SA range
    float      rootpri; // priority of the root
};
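
/*
 * Example: a minimal sketch of the ordering implemented by operator< above.
 * Lower accumulated penalty always wins; at equal penalty, the deeper
 * descent is preferred.
 *
 *   DescentPriority a(0, 20, 5, 1.0f); // pen=0, depth=20, width=5, rootpri=1
 *   DescentPriority b(2, 30, 1, 1.0f); // higher penalty, though deeper/narrower
 *   assert(a < b);                     // 1st criterion: penalty
 *   DescentPriority c(0, 25, 5, 1.0f); // same penalty as 'a', greater depth
 *   assert(c < a);                     // 2nd criterion: depth (deeper is better)
 */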

static inline std::ostream& operator<<(
    std::ostream& os,
    const DescentPriority& o)
{
    os << "[" << o.pen << ", " << o.depth << ", " << o.width << ", " << o.rootpri << "]";
    return os;
}

static inline std::ostream& operator<<(
    std::ostream& os,
    const std::pair<DescentPriority, TDescentId>& o)
{
    os << "{[" << o.first.pen << ", " << o.first.depth << ", "
       << o.first.width << ", " << o.first.rootpri << "], " << o.second << "}";
    return os;
}

typedef std::pair<DescentPriority, TDescentId> TDescentPair;

/**
 * Encapsulates the constraints limiting which outgoing edges are permitted.
 * Specifically, we constrain the total penalty accumulated so far so that some
 * outgoing edges will exceed the limit and be pruned.  The limit is set
 * according to our "depth" into the search, as measured by the number of read
 * characters aligned so far.  We divide the depth domain into two pieces, a
 * piece close to the root, where the penalty is constrained to be 0, and the
 * remainder, where the maximum penalty is an interpolation between 0 and the
 * overall maximum penalty.
 */
struct DescentConstraints {

    DescentConstraints() { reset(); }

    /**
     * Initialize with new constraint function.
     */
    DescentConstraints(size_t nzero, double exp) {
        init(nzero, exp);
    }

    /**
     * Initialize with given function.
     */
    void init(size_t nzero_, double exp_) {
        nzero = nzero_ > 0 ? nzero_ : 1;
        exp = exp_;
#ifndef NDEBUG
        for(size_t i = 1; i < nzero_ + 5; i++) {
            assert_geq(get(i, nzero_ + 10, 100), get(i-1, nzero_ + 10, 100));
        }
#endif
    }

    /**
     * Reset to uninitialized state.
     */
    void reset() {
        nzero = 0;
        exp = -1.0f;
    }

    /**
     * Return true iff the DescentConstraints has been initialized.
     */
    bool inited() const {
        return exp >= 0.0f;
    }

    /**
     * Get the maximum penalty total for depth 'off'.
     */
    inline TScore get(TReadOff off, TReadOff rdlen, TAlScore maxpen) const {
        if(off < nzero || nzero >= rdlen) {
            return 0;
        }
        double frac = (double)(off - nzero) / (rdlen - nzero);
        if(fabs(exp - 1.0f) > 0.00001) {
            if(fabs(exp - 2.0f) < 0.00001) {
                frac *= frac;
            } else {
                frac = pow(frac, exp);
            }
        }
        return (TAlScore)(frac * maxpen + 0.5f);
    }

    size_t nzero; // # positions from the root where the max penalty is 0
    double exp;   // exponent of the interpolation function
};
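
/*
 * Example: a minimal sketch of the penalty envelope computed by get() above,
 * assuming a 100 bp read, a 20-position zero-penalty zone, a linear
 * interpolant (exp = 1.0), and an overall maximum penalty of 60.
 *
 *   DescentConstraints cons(20, 1.0);
 *   TScore a = cons.get(10, 100, 60); // inside the zero zone -> 0
 *   TScore b = cons.get(60, 100, 60); // (60-20)/(100-20) * 60 + 0.5 -> 30
 *   TScore c = cons.get(99, 100, 60); // near the read's end -> almost 60
 */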

/**
 * Encapsulates settings governing how we descend.
 */
struct DescentConfig {

    DescentConfig() { reset(); }

    /**
     * Reset the DescentConfig to an uninitialized state.
     */
    void reset() { expol = 0; }

    /**
     * Return true iff this DescentConfig is initialized.
     */
    bool inited() const { return expol != 0; }

    DescentConstraints cons; // constraints
    int expol;               // extend policy
};

/**
 * Encapsulates the state of a Descent that allows us to determine whether it
 * is redundant with another Descent.  Two Descents are redundant if:
 *
 * 1. Both are aligning the same read orientation (fw or rc)
 * 2. Both are growing the alignment in the same direction (left-to-right or
 *    right-to-left)
 * 3. They have aligned exactly the same read characters (which are always
 *    consecutive in the read)
 * 4. The corresponding reference strings are identical
 */
struct DescentRedundancyKey {

    DescentRedundancyKey() { reset(); }

    DescentRedundancyKey(
        TReadOff al5pf_,
        size_t rflen_,
        TIndexOffU topf_,
        TIndexOffU botf_)
    {
        init(al5pf_, rflen_, topf_, botf_);
    }

    void reset() {
        al5pf = 0;
        rflen = 0;
        topf = botf = 0;
    }

    bool inited() const { return rflen > 0; }

    void init(
        TReadOff al5pf_,
        size_t rflen_,
        TIndexOffU topf_,
        TIndexOffU botf_)
    {
        al5pf = al5pf_;
        rflen = rflen_;
        topf = topf_;
        botf = botf_;
    }

    bool operator==(const DescentRedundancyKey& o) const {
        return al5pf == o.al5pf && rflen == o.rflen && topf == o.topf && botf == o.botf;
    }

    bool operator<(const DescentRedundancyKey& o) const {
        if(al5pf < o.al5pf) return true;
        if(al5pf > o.al5pf) return false;
        if(rflen < o.rflen) return true;
        if(rflen > o.rflen) return false;
        if(topf < o.topf) return true;
        if(topf > o.topf) return false;
        return botf < o.botf;
    }

    TReadOff al5pf;  // 3'-most aligned char, as offset from 5' end
    size_t rflen;    // number of reference characters involved in alignment
    TIndexOffU topf; // top w/r/t forward index
    TIndexOffU botf; // bot w/r/t forward index
};

/**
 * Checks partial alignments for redundancy.  Conceptually a map from
 * (read interval, SA range) pairs to the best penalties already seen for
 * them.
 */
class DescentRedundancyChecker {

public:

    DescentRedundancyChecker() { reset(); }

    void clear() { reset(); }

    /**
     * Reset to uninitialized state.
     */
    void reset() {
        bits_.reset();
        inited_ = false;
        totsz_ = 0;  // total size
        totcap_ = 0; // total capacity
    }

    const static int NPARTS = 8;
    const static int PART_MASK = 7;
    const static int NBITS = (1 << 16);

    /**
     * Initialize using given read length.
     */
    void init(TReadOff rdlen) {
        reset();
        // daehwan - for debugging purposes
#if 0
        bits_.resize(NBITS);
        maplist_fl_.resize(NPARTS);
        maplist_fr_.resize(NPARTS);
        maplist_rl_.resize(NPARTS);
        maplist_rr_.resize(NPARTS);
        for(int i = 0; i < NPARTS; i++) {
            maplist_fl_[i].resize(rdlen);
            maplist_fr_[i].resize(rdlen);
            maplist_rl_[i].resize(rdlen);
            maplist_rr_[i].resize(rdlen);
            totcap_ += maplist_fl_[i].totalCapacityBytes();
            totcap_ += maplist_fr_[i].totalCapacityBytes();
            totcap_ += maplist_rl_[i].totalCapacityBytes();
            totcap_ += maplist_rr_[i].totalCapacityBytes();
            for(size_t j = 0; j < rdlen; j++) {
                maplist_fl_[i][j].clear();
                maplist_fr_[i][j].clear();
                maplist_rl_[i][j].clear();
                maplist_rr_[i][j].clear();
                totcap_ += maplist_fl_[i][j].totalCapacityBytes();
                totcap_ += maplist_fr_[i][j].totalCapacityBytes();
                totcap_ += maplist_rl_[i][j].totalCapacityBytes();
                totcap_ += maplist_rr_[i][j].totalCapacityBytes();
            }
        }
#endif
        inited_ = true;
    }

    /**
     * Return true iff the checker is initialized.
     */
    bool inited() const {
        return inited_;
    }

    /**
     * Check if this partial alignment is redundant with one that we've already
     * explored.  Return true iff it is novel (and was recorded), false iff it
     * is redundant.
     */
    bool check(
        bool fw,
        bool l2r,
        TReadOff al5pi,
        TReadOff al5pf,
        size_t rflen,
        TIndexOffU topf,
        TIndexOffU botf,
        TScore pen)
    {
        // daehwan - for debugging purposes; short-circuits the check below
        return true;

        assert(inited_);
        assert(topf > 0 || botf > 0);
        DescentRedundancyKey k(al5pf, rflen, topf, botf);
        size_t i = std::numeric_limits<size_t>::max();
        size_t mask = topf & PART_MASK;
        EMap<DescentRedundancyKey, TScore>& map =
            (fw ? (l2r ? maplist_fl_[mask][al5pi] : maplist_fr_[mask][al5pi]) :
                  (l2r ? maplist_rl_[mask][al5pi] : maplist_rr_[mask][al5pi]));
        size_t key = (topf & 255) | ((botf & 255) << 8);
        if(bits_.test(key) && map.containsEx(k, i)) {
            // Already contains the key
            assert_lt(i, map.size());
            assert_geq(pen, map[i].second);
            return false;
        }
        assert(!map.containsEx(k, i));
        size_t oldsz = map.totalSizeBytes();
        size_t oldcap = map.totalCapacityBytes();
        map.insert(make_pair(k, pen));
        bits_.set(key);
        totsz_ += (map.totalSizeBytes() - oldsz);
        totcap_ += (map.totalCapacityBytes() - oldcap);
        return true;
    }

    /**
     * Check if this partial alignment is redundant with one that we've already
     * explored using the Bw index SA range.
     */
    bool contains(
        bool fw,
        bool l2r,
        TReadOff al5pi,
        TReadOff al5pf,
        size_t rflen,
        TIndexOffU topf,
        TIndexOffU botf,
        TScore pen)
    {
        // daehwan - for debugging purposes; short-circuits the check below
        return false;

        assert(inited_);
        size_t key = (topf & 255) | ((botf & 255) << 8);
        if(!bits_.test(key)) {
            return false;
        }
        DescentRedundancyKey k(al5pf, rflen, topf, botf);
        size_t mask = topf & PART_MASK;
        EMap<DescentRedundancyKey, TScore>& map =
            (fw ? (l2r ? maplist_fl_[mask][al5pi] : maplist_fr_[mask][al5pi]) :
                  (l2r ? maplist_rl_[mask][al5pi] : maplist_rr_[mask][al5pi]));
        return map.contains(k);
    }

    /**
     * Return the total size of the redundancy map.
     */
    size_t totalSizeBytes() const {
        return totsz_;
    }

    /**
     * Return the total capacity of the redundancy map.
     */
    size_t totalCapacityBytes() const {
        return totcap_;
    }

protected:

    bool inited_;   // initialized?
    size_t totsz_;  // total size
    size_t totcap_; // total capacity

    // List of maps.  Each entry is a map for all the DescentRedundancyKeys
    // with al5pi equal to the offset into the list.
    ELList<EMap<DescentRedundancyKey, TScore>, NPARTS, 100> maplist_fl_; //  fw, l2r
    ELList<EMap<DescentRedundancyKey, TScore>, NPARTS, 100> maplist_rl_; // !fw, l2r
    ELList<EMap<DescentRedundancyKey, TScore>, NPARTS, 100> maplist_fr_; //  fw, !l2r
    ELList<EMap<DescentRedundancyKey, TScore>, NPARTS, 100> maplist_rr_; // !fw, !l2r

    EBitList<128> bits_;
};
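
/*
 * Example: a minimal sketch of the intended check() protocol, with
 * hypothetical read-interval and SA-range values.  A true return means the
 * partial alignment is novel and was recorded; false means an equal- or
 * better-scoring alignment over the same read characters and SA range was
 * seen before.  (As written above, check() short-circuits to true for
 * debugging, so the bookkeeping is currently bypassed.)
 *
 *   DescentRedundancyChecker re;
 *   re.init(100); // read length
 *   bool novel = re.check(
 *       true,     // fw: aligning the forward read
 *       true,     // l2r: growing left-to-right
 *       10, 40,   // al5pi, al5pf: read chars aligned so far
 *       31,       // rflen (hypothetical)
 *       1000,     // topf  (hypothetical)
 *       1005,     // botf  (hypothetical)
 *       0);       // penalty so far
 */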

/**
 * A search root.  Consists of an offset from the 5' end of the read and flags
 * indicating (a) whether we're initially heading left-to-right or
 * right-to-left, and (b) whether we're examining the read or its reverse
 * complement.
 *
 * A root also comes with a priority ("pri") score indicating how promising it
 * is as a root.  Promising roots have long stretches of high-quality,
 * non-repetitive nucleotides in the first several ply of the search tree.
 * Also, roots beginning at the 5' end of the read may receive a higher
 * priority.
 */
struct DescentRoot {

    DescentRoot() { reset(); }

    DescentRoot(size_t off5p_, bool l2r_, bool fw_, size_t len, float pri_) {
        init(off5p_, l2r_, fw_, len, pri_);
    }

    /**
     * Reset this DescentRoot to uninitialized state.
     */
    void reset() {
        off5p = std::numeric_limits<size_t>::max();
    }

    /**
     * Return true iff this DescentRoot is initialized.
     */
    bool inited() const {
        return off5p != std::numeric_limits<size_t>::max();
    }

    /**
     * Initialize a new descent root.
     */
    void init(size_t off5p_, bool l2r_, bool fw_, size_t len, float pri_) {
        off5p = off5p_;
        l2r = l2r_;
        fw = fw_;
        pri = pri_;
        assert_lt(off5p, len);
    }

    TReadOff off5p; // root origin offset, expressed as offset from 5' end
    bool l2r;       // true -> move in left-to-right direction
    bool fw;        // true -> work with forward read, false -> revcomp
    float pri;      // priority of seed
};

/**
 * Set of flags indicating outgoing edges we've tried from a DescentPos.
 */
struct DescentPosFlags {

    DescentPosFlags() { reset(); }

    /**
     * Set all flags to 1, indicating all outgoing edges are yet to be
     * explored.
     */
    void reset() {
        mm_a = mm_c = mm_g = mm_t = rdg_a = rdg_c = rdg_g = rdg_t = rfg = 1;
        reserved = 0;
    }

    /**
     * Return true iff all outgoing edges have already been explored.
     */
    bool exhausted() const {
        return ((uint16_t*)this)[0] == 0;
    }

    /**
     * Return false iff the specified mismatch has already been explored.
     */
    bool mmExplore(int c) {
        assert_range(0, 3, c);
        if(c == 0) {
            return mm_a;
        } else if(c == 1) {
            return mm_c;
        } else if(c == 2) {
            return mm_g;
        } else {
            return mm_t;
        }
    }

    /**
     * Try to explore a mismatch.  Return false iff it has already been
     * explored.
     */
    bool mmSet(int c) {
        assert_range(0, 3, c);
        if(c == 0) {
            bool ret = mm_a; mm_a = 0; return ret;
        } else if(c == 1) {
            bool ret = mm_c; mm_c = 0; return ret;
        } else if(c == 2) {
            bool ret = mm_g; mm_g = 0; return ret;
        } else {
            bool ret = mm_t; mm_t = 0; return ret;
        }
    }

    /**
     * Return false iff specified read gap has already been explored.
     */
    bool rdgExplore(int c) {
        assert_range(0, 3, c);
        if(c == 0) {
            return rdg_a;
        } else if(c == 1) {
            return rdg_c;
        } else if(c == 2) {
            return rdg_g;
        } else {
            return rdg_t;
        }
    }

    /**
     * Try to explore a read gap.  Return false iff it has already been
     * explored.
     */
    bool rdgSet(int c) {
        assert_range(0, 3, c);
        if(c == 0) {
            bool ret = rdg_a; rdg_a = 0; return ret;
        } else if(c == 1) {
            bool ret = rdg_c; rdg_c = 0; return ret;
        } else if(c == 2) {
            bool ret = rdg_g; rdg_g = 0; return ret;
        } else {
            bool ret = rdg_t; rdg_t = 0; return ret;
        }
    }

    /**
     * Return false iff the reference gap has already been explored.
     */
    bool rfgExplore() {
        return rfg;
    }

    /**
     * Try to explore a reference gap.  Return false iff it has already been
     * explored.
     */
    bool rfgSet() {
        bool ret = rfg; rfg = 0; return ret;
    }

    uint16_t mm_a : 1;
    uint16_t mm_c : 1;
    uint16_t mm_g : 1;
    uint16_t mm_t : 1;

    uint16_t rdg_a : 1;
    uint16_t rdg_c : 1;
    uint16_t rdg_g : 1;
    uint16_t rdg_t : 1;

    uint16_t rfg : 1;

    uint16_t reserved : 7;
};
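
/*
 * Example: a minimal sketch of the explore/set protocol above.  Each flag
 * starts at 1 ("unexplored"); the *Set() calls return the old value and
 * clear the flag, so only the first attempt succeeds.
 *
 *   DescentPosFlags fl;
 *   assert(fl.mmExplore(2));  // mismatch to 'G' (c == 2) not yet tried
 *   assert(fl.mmSet(2));      // first attempt succeeds...
 *   assert(!fl.mmExplore(2)); // ...and the flag is now clear
 *   assert(!fl.mmSet(2));     // second attempt reports already-explored
 */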

/**
 * FM Index state associated with a single position in a descent.  For both the
 * forward and backward indexes, it stores the four SA ranges corresponding to
 * the four nucleotides.
 */
struct DescentPos {

    /**
     * Reset all tops and bots to 0.
     */
    void reset() {
        topf[0] = topf[1] = topf[2] = topf[3] = 0;
        botf[0] = botf[1] = botf[2] = botf[3] = 0;
        topb[0] = topb[1] = topb[2] = topb[3] = 0;
        botb[0] = botb[1] = botb[2] = botb[3] = 0;
        c = -1;
        flags.reset();
    }

    /**
     * Return true iff DescentPos has been initialized.
     */
    bool inited() const {
        return c >= 0;
    }

#ifndef NDEBUG
    /**
     * Check that DescentPos is internally consistent.
     */
    bool repOk() const {
        assert_range(0, 3, (int)c);
        return true;
    }
#endif

    TIndexOffU topf[4];    // SA range top indexes in fw index
    TIndexOffU botf[4];    // SA range bottom indexes (exclusive) in fw index
    TIndexOffU topb[4];    // SA range top indexes in bw index
    TIndexOffU botb[4];    // SA range bottom indexes (exclusive) in bw index
    char c;                // read char that would yield match
    DescentPosFlags flags; // flags
};

/**
 * Encapsulates an edge outgoing from a descent.
 */
struct DescentEdge {

    DescentEdge() { reset(); }

    DescentEdge(
        Edit e_,
        TReadOff off5p_,
        DescentPriority pri_,
        size_t posFlag_,
        TReadOff nex_
#ifndef NDEBUG
        ,
        size_t d_,
        TIndexOffU topf_,
        TIndexOffU botf_,
        TIndexOffU topb_,
        TIndexOffU botb_
#endif
        )
    {
        init(e_, off5p_, pri_, posFlag_
#ifndef NDEBUG
             , d_, topf_, botf_, topb_, botb_
#endif
             );
    }

    /**
     * Return true iff edge is initialized.
     */
    bool inited() const { return e.inited(); }

    /**
     * Reset to uninitialized state.
     */
    void reset() { e.reset(); }

    /**
     * Initialize DescentEdge given 5' offset, nucleotide, and priority.
     */
    void init(
        Edit e_,
        TReadOff off5p_,
        DescentPriority pri_,
        size_t posFlag_
#ifndef NDEBUG
        ,
        size_t d_,
        TIndexOffU topf_,
        TIndexOffU botf_,
        TIndexOffU topb_,
        TIndexOffU botb_
#endif
        )
    {
        e = e_;
        off5p = off5p_;
        pri = pri_;
        posFlag = posFlag_;
#ifndef NDEBUG
        d = d_;
        topf = topf_;
        botf = botf_;
        topb = topb_;
        botb = botb_;
#endif
    }

    /**
     * Update flags to show this edge as visited.
     */
    void updateFlags(EFactory<DescentPos>& pf) {
        if(inited()) {
            if(e.isReadGap()) {
                assert_neq('-', e.chr);
                pf[posFlag].flags.rdgSet(asc2dna[e.chr]);
            } else if(e.isRefGap()) {
                pf[posFlag].flags.rfgSet();
            } else {
                assert_neq('-', e.chr);
                pf[posFlag].flags.mmSet(asc2dna[e.chr]);
            }
        }
    }

    /**
     * Return true iff this edge has higher priority than the given edge.
     */
    bool operator<(const DescentEdge& o) const {
        if(inited() && !o.inited()) {
            return true;
        } else if(!inited()) {
            return false;
        }
        return pri < o.pri;
    }

    DescentPriority pri; // priority of the edge
    //TReadOff nex;      // # extends possible from this edge
    size_t posFlag;      // depth of DescentPos where flag should be set

#ifndef NDEBUG
    // This can be recreated by looking at the edit and the parent descent's
    // len_, al5pi_, al5pf_.  I have it here so we can sanity check.
    size_t d;
    TIndexOffU topf, botf, topb, botb;
#endif

    Edit e;
    TReadOff off5p;
};

/**
 * Encapsulates an incomplete summary of the outgoing edges from a descent.  We
 * don't try to store information about all outgoing edges, because doing so
 * will generally be wasteful.  We'll typically only try a handful of them per
 * descent.
 */
class DescentOutgoing {

public:

    /**
     * Return the best edge and rotate in preparation for next call.
     */
    DescentEdge rotate() {
        DescentEdge tmp = best1;
        assert(!(best2 < tmp));
        best1 = best2;
        assert(!(best3 < best2));
        best2 = best3;
        assert(!(best4 < best3));
        best3 = best4;
        assert(!(best5 < best4));
        best4 = best5;
        best5.reset();
        return tmp;
    }

    /**
     * Given a potential outgoing edge, place it where it belongs in the
     * running list of best 5 outgoing edges from this descent.
     */
    void update(DescentEdge e) {
        if(!best1.inited()) {
            best1 = e;
        } else if(e < best1) {
            best5 = best4;
            best4 = best3;
            best3 = best2;
            best2 = best1;
            best1 = e;
        } else if(!best2.inited()) {
            best2 = e;
        } else if(e < best2) {
            best5 = best4;
            best4 = best3;
            best3 = best2;
            best2 = e;
        } else if(!best3.inited()) {
            best3 = e;
        } else if(e < best3) {
            best5 = best4;
            best4 = best3;
            best3 = e;
        } else if(!best4.inited()) {
            best4 = e;
        } else if(e < best4) {
            best5 = best4;
            best4 = e;
        } else if(!best5.inited() || e < best5) {
            best5 = e;
        }
    }

    /**
     * Clear all the outgoing edges stored here.
     */
    void clear() {
        best1.reset();
        best2.reset();
        best3.reset();
        best4.reset();
        best5.reset();
    }

    /**
     * Return true iff there are no outgoing edges currently represented in
     * this summary.  There may still be outgoing edges; they just haven't
     * been added to the summary.
     */
    bool empty() const {
        return !best1.inited();
    }

    /**
     * Return the DescentPriority of the best outgoing edge.
     */
    DescentPriority bestPri() const {
        assert(!empty());
        return best1.pri;
    }

    DescentEdge best1; // best
    DescentEdge best2; // 2nd-best
    DescentEdge best3; // 3rd-best
    DescentEdge best4; // 4th-best
    DescentEdge best5; // 5th-best
};
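
/*
 * Example: a minimal sketch of how the summary is used, assuming DescentEdge
 * objects e1, e2, e3 have been initialized with priorities e1 < e2 < e3.
 * Arrival order does not matter; rotate() always yields the current best.
 *
 *   DescentOutgoing out;
 *   out.update(e2);
 *   out.update(e1);
 *   out.update(e3);
 *   DescentEdge best = out.rotate(); // yields e1; e2 becomes best1
 *   assert(!out.empty());            // e2 and e3 are still summarized
 */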

template <typename index_t>
class DescentAlignmentSink;

/**
 * Encapsulates a descent through a search tree, along a path of matches.
 * Descents that are part of the same alignment form a chain.  Two descents
 * adjacent in the chain are connected either by an edit, or by a switch in
 * direction.  Because a descent might have a different direction from the
 * DescentRoot it ultimately came from, it has its own 'l2r' field, which might
 * differ from the root's.
 */
template <typename index_t>
class Descent {

public:

    Descent() { reset(); }

    /**
     * Initialize a new descent branching from the given descent via the given
     * edit.  Return false if the Descent has no outgoing edges (and can
     * therefore have its memory freed), true otherwise.
     */
    bool init(
        const Read& q,                         // query
        TRootId rid,                           // root id
        const Scoring& sc,                     // scoring scheme
        TAlScore minsc,                        // minimum score
        TAlScore maxpen,                       // maximum penalty
        TReadOff al5pi,                        // offset from 5' of 1st aligned char
        TReadOff al5pf,                        // offset from 5' of last aligned char
        TIndexOffU topf,                       // SA range top in FW index
        TIndexOffU botf,                       // SA range bottom in FW index
        TIndexOffU topb,                       // SA range top in BW index
        TIndexOffU botb,                       // SA range bottom in BW index
        bool l2r,                              // direction this descent will go in
        size_t descid,                         // my ID
        TDescentId parent,                     // parent ID
        TScore pen,                            // total penalties so far
        const Edit& e,                         // edit for incoming edge
        const GFM<index_t>& gfmFw,             // forward index
        const GFM<index_t>& gfmBw,             // mirror index
        DescentRedundancyChecker& re,          // redundancy checker
        EFactory<Descent>& df,                 // Descent factory
        EFactory<DescentPos>& pf,              // DescentPos factory
        const EList<DescentRoot>& rs,          // roots
        const EList<DescentConfig>& cs,        // configs
        EHeap<TDescentPair>& heap,             // heap
        DescentAlignmentSink<index_t>& alsink, // alignment sink
        DescentMetrics& met,                   // metrics
        PerReadMetrics& prm);                  // per-read metrics

    /**
     * Initialize a new descent beginning at the given root.  Return false if
     * the Descent has no outgoing edges (and can therefore have its memory
     * freed), true otherwise.
     */
    bool init(
        const Read& q,                         // query
        TRootId rid,                           // root id
        const Scoring& sc,                     // scoring scheme
        TAlScore minsc,                        // minimum score
        TAlScore maxpen,                       // maximum penalty
        size_t descid,                         // id of this Descent
        const GFM<index_t>& gfmFw,             // forward index
        const GFM<index_t>& gfmBw,             // mirror index
        DescentRedundancyChecker& re,          // redundancy checker
        EFactory<Descent>& df,                 // Descent factory
        EFactory<DescentPos>& pf,              // DescentPos factory
        const EList<DescentRoot>& rs,          // roots
        const EList<DescentConfig>& cs,        // configs
        EHeap<TDescentPair>& heap,             // heap
        DescentAlignmentSink<index_t>& alsink, // alignment sink
        DescentMetrics& met,                   // metrics
        PerReadMetrics& prm);                  // per-read metrics

    /**
     * Return true iff this Descent has been initialized.
     */
    bool inited() const {
        return descid_ != std::numeric_limits<size_t>::max();
    }

    /**
     * Reset to uninitialized state.
     */
    void reset() {
        lastRecalc_ = true;
        descid_ = std::numeric_limits<size_t>::max();
    }

    /**
     * Return true iff this Descent is a search root.
     */
    bool root() const {
        return parent_ == std::numeric_limits<TDescentId>::max();
    }

    /**
     * Return the edit.
     */
    const Edit& edit() const {
        return edit_;
    }

    /**
     * Return id of parent.
     */
    TDescentId parent() const {
        return parent_;
    }

    /**
     * Take the best outgoing edge and follow it.
     */
    void followBestOutgoing(
        const Read& q,                         // read
        const GFM<index_t>& gfmFw,             // forward index
        const GFM<index_t>& gfmBw,             // mirror index
        const Scoring& sc,                     // scoring scheme
        TAlScore minsc,                        // minimum score
        TAlScore maxpen,                       // maximum penalty
        DescentRedundancyChecker& re,          // redundancy checker
        EFactory<Descent>& df,                 // factory with Descent
        EFactory<DescentPos>& pf,              // factory with DescentPoss
        const EList<DescentRoot>& rs,          // roots
        const EList<DescentConfig>& cs,        // configs
        EHeap<TDescentPair>& heap,             // heap of descents
        DescentAlignmentSink<index_t>& alsink, // alignment sink
        DescentMetrics& met,                   // metrics
        PerReadMetrics& prm);                  // per-read metrics

    /**
     * Return true iff no outgoing edges from this descent remain unexplored.
     */
    bool empty() const { return lastRecalc_ && out_.empty(); }

#ifndef NDEBUG
    /**
     * Return true iff the Descent is internally consistent.
     */
    bool repOk(const Read *q) const {
        // A non-root can have an uninitialized edit_ if it is from a bounce
        //assert( root() || edit_.inited());
        assert(!root() || !edit_.inited());
        assert_eq(botf_ - topf_, botb_ - topb_);
        if(q != NULL) {
            assert_leq(len_, q->length());
        }
        return true;
    }
#endif

    size_t al5pi() const { return al5pi_; }
    size_t al5pf() const { return al5pf_; }
    bool l2r() const { return l2r_; }

    /**
     * Print a stacked representation of this descent and all its parents.
     */
    void print(
        std::ostream* os,
        const char *prefix,
        const Read& q,
        size_t trimLf,
        size_t trimRg,
        bool fw,
        const EList<Edit>& edits,
        size_t ei,
        size_t en,
        BTDnaString& rf) const;

    /**
     * Collect all the edits
     */
    void collectEdits(
        EList<Edit>& edits,
        const Edit *e,
        EFactory<Descent>& df)
    {
        // Take just the portion of the read that has aligned up until this
        // point
        size_t nuninited = 0;
        size_t ei = edits.size();
        size_t en = 0;
        if(e != NULL && e->inited()) {
            edits.push_back(*e);
            en++;
        }
        size_t cur = descid_;
        while(cur != std::numeric_limits<TDescentId>::max()) {
            if(!df[cur].edit().inited()) {
                nuninited++;
                assert_leq(nuninited, 2);
            } else {
                edits.push_back(df[cur].edit());
                en++;
            }
            cur = df[cur].parent();
        }
        // Sort just the edits we just added
        edits.sortPortion(ei, en);
    }

protected:

    /**
     * Handle the case where this descent hit the end of the read while its SA
     * range was still non-empty: set up a new descent that continues the
     * search in the opposite direction.
     */
    bool bounce(
        const Read& q,                         // query string
        TIndexOffU topf,                       // SA range top in fw index
        TIndexOffU botf,                       // SA range bottom in fw index
        TIndexOffU topb,                       // SA range top in bw index
        TIndexOffU botb,                       // SA range bottom in bw index
        const GFM<index_t>& gfmFw,             // forward index
        const GFM<index_t>& gfmBw,             // mirror index
        const Scoring& sc,                     // scoring scheme
        TAlScore minsc,                        // minimum score
        TAlScore maxpen,                       // maximum penalty
        DescentRedundancyChecker& re,          // redundancy checker
        EFactory<Descent>& df,                 // factory with Descent
        EFactory<DescentPos>& pf,              // factory with DescentPoss
        const EList<DescentRoot>& rs,          // roots
        const EList<DescentConfig>& cs,        // configs
        EHeap<TDescentPair>& heap,             // heap of descents
        DescentAlignmentSink<index_t>& alsink, // alignment sink
        DescentMetrics& met,                   // metrics
        PerReadMetrics& prm);                  // per-read metrics

    /**
     * Given the forward and backward indexes, and given topf/botf/topb/botb,
     * get tloc, bloc ready for the next step.
     */
    void nextLocsBi(
        const GFM<index_t>& gfmFw, // forward index
        const GFM<index_t>& gfmBw, // mirror index
        SideLocus<index_t>& tloc,  // top locus
        SideLocus<index_t>& bloc,  // bot locus
        index_t topf,              // top in BWT
        index_t botf,              // bot in BWT
        index_t topb,              // top in BWT'
        index_t botb);             // bot in BWT'

    /**
     * Advance this descent by following read matches as far as possible.
     */
    bool followMatches(
        const Read& q,                         // query string
        const Scoring& sc,                     // scoring scheme
        const GFM<index_t>& gfmFw,             // forward index
        const GFM<index_t>& gfmBw,             // mirror index
        DescentRedundancyChecker& re,          // redundancy checker
        EFactory<Descent>& df,                 // Descent factory
        EFactory<DescentPos>& pf,              // DescentPos factory
        const EList<DescentRoot>& rs,          // roots
        const EList<DescentConfig>& cs,        // configs
        EHeap<TDescentPair>& heap,             // heap
        DescentAlignmentSink<index_t>& alsink, // alignment sink
        DescentMetrics& met,                   // metrics
        PerReadMetrics& prm,                   // per-read metrics
        bool& branches,                        // out: true -> there are > 0 ways to branch
        bool& hitEnd,                          // out: true -> hit read end with non-empty range
        bool& done,                            // out: true -> we made a full alignment
        TReadOff& off5p_i,                     // out: initial 5' offset
        TIndexOffU& topf_bounce,               // out: top of SA range for fw idx for bounce
        TIndexOffU& botf_bounce,               // out: bot of SA range for fw idx for bounce
        TIndexOffU& topb_bounce,               // out: top of SA range for bw idx for bounce
        TIndexOffU& botb_bounce);              // out: bot of SA range for bw idx for bounce

    /**
     * Recalculate our summary of the outgoing edges from this descent.  When
     * deciding what outgoing edges are legal, we abide by constraints.
     * Typically, they limit the total of the penalties accumulated so far, as
     * a function of distance from the search root.  E.g. a constraint might
     * disallow any gaps or mismatches within 20 ply of the search root, then
     * allow 1 mismatch within 30 ply, then allow up to 1 mismatch or 1 gap
     * within 40 ply, etc.
     */
    size_t recalcOutgoing(
        const Read& q,                  // query string
        const Scoring& sc,              // scoring scheme
        TAlScore minsc,                 // minimum score
        TAlScore maxpen,                // maximum penalty
        DescentRedundancyChecker& re,   // redundancy checker
        EFactory<DescentPos>& pf,       // factory with DescentPoss
        const EList<DescentRoot>& rs,   // roots
        const EList<DescentConfig>& cs, // configs
        PerReadMetrics& prm);           // per-read metrics

    TRootId rid_;            // root id

    TReadOff al5pi_;         // lo offset from 5' end of aligned read char
    TReadOff al5pf_;         // hi offset from 5' end of aligned read char
    bool l2r_;               // left-to-right?
    int gapadd_;             // net ref characters additional
    TReadOff off5p_i_;       // offset we started out at for this descent

    TIndexOffU topf_, botf_; // incoming SA range w/r/t forward index
    TIndexOffU topb_, botb_; // incoming SA range w/r/t backward (mirror) index

    size_t descid_;          // ID of this descent
    TDescentId parent_;      // ID of parent descent
    TScore pen_;             // total penalties accumulated so far
    size_t posid_;           // ID of 1st elt of the DescentPos factory w/
                             // descent pos info for this descent
    size_t len_;             // length of stretch of matches
    DescentOutgoing out_;    // summary of outgoing edges
    Edit edit_;              // edit joining this descent with parent
    bool lastRecalc_;        // set by recalcOutgoing if out edges empty
};

/**
 * An alignment result from a Descent.
 */
struct DescentAlignment {

    DescentAlignment() { reset(); }

    /**
     * Reset DescentAlignment to be uninitialized.
     */
    void reset() {
        topf = botf = 0;
        pen = 0;
        fw = false;
        ei = en = 0;
    }

    /**
     * Initialize this DescentAlignment.
     */
    void init(
        TScore pen_,
        bool fw_,
        TIndexOffU topf_,
        TIndexOffU botf_,
        size_t ei_,
        size_t en_)
    {
        assert_gt(botf_, topf_);
        pen = pen_;
        fw = fw_;
        topf = topf_;
        botf = botf_;
        ei = ei_;
        en = en_;
    }

    /**
     * Return true iff DescentAlignment is initialized.
     */
    bool inited() const {
        return botf > topf;
    }

    /**
     * Return true iff the alignment is perfect (has no edits)
     */
    bool perfect() const {
        return pen == 0;
    }

    /**
     * Return the number of elements in this range.
     */
    size_t size() const {
        return botf - topf;
    }

    TScore pen; // score

    bool fw; // forward or revcomp aligned?

    TIndexOffU topf; // top in forward index
    TIndexOffU botf; // bot in forward index

    size_t ei; // First edit in DescentAlignmentSink::edits_ involved in aln
    size_t en; // # edits in DescentAlignmentSink::edits_ involved in aln
};

/**
 * A partial alignment result from a Descent where the reference offset has
 * been resolved.
 */
struct DescentPartialResolvedAlignment {

    DescentPartialResolvedAlignment() { reset(); }

    /**
     * Reset DescentPartialResolvedAlignment to be uninitialized.
     */
    void reset() {
        topf = botf = 0;
        pen = 0;
        fw = false;
        ei = en = 0;
        refcoord.reset();
    }

    /**
     * Initialize this DescentPartialResolvedAlignment.
     */
    void init(
        TScore pen_,
        bool fw_,
        TIndexOffU topf_,
        TIndexOffU botf_,
        size_t ei_,
        size_t en_,
        const Coord& refcoord_)
    {
        assert_gt(botf_, topf_);
        pen = pen_;
        fw = fw_;
        topf = topf_;
        botf = botf_;
        ei = ei_;
        en = en_;
        refcoord = refcoord_;
    }

    /**
     * Return true iff DescentPartialResolvedAlignment is initialized.
     */
    bool inited() const {
        return botf > topf;
    }

    /**
     * Return the number of elements in this range.
     */
    size_t size() const {
        return botf - topf;
    }

    TScore pen; // score

    bool fw; // forward or revcomp aligned?

    TIndexOffU topf; // top in forward index
    TIndexOffU botf; // bot in forward index

    size_t ei; // First edit in DescentAlignmentSink::edits_ involved in aln
    size_t en; // # edits in DescentAlignmentSink::edits_ involved in aln

    Coord refcoord; // reference coord of leftmost ref char involved
};

/**
 * Class that accepts alignments found during descent and maintains the state
 * required to dispense them to consumers in an appropriate order.
 *
 * As for order in which they are dispensed, in order to maintain uniform
 * distribution over equal-scoring alignments, a good policy may be not to
 * dispense alignments at a given score stratum until *all* alignments at that
 * stratum have been accumulated (i.e. until our best-first search has moved on
 * to a worse stratum).  This also has the advantage that, for each alignment,
 * we can also report the number of other alignments in that cost stratum.
 *
 * A lazier alternative is to assume that the order in which alignments in a
 * given stratum arrive is already pseudo-random, which frees us from having to
 * wait until the entire stratum has been explored.  But there is reason to
 * think that this order is not truly pseudo-random, since our root placement
 * and root priorities will tend to first lead us to alignments with certain
 * patterns of edits.
 */
template <typename index_t>
class DescentAlignmentSink {

public:

    /**
     * If this is the final descent in a complete end-to-end alignment, report
     * the alignment.
     */
    bool reportAlignment(
        const Read& q,                   // query string
        const GFM<index_t>& gfmFw,       // forward index
        const GFM<index_t>& gfmBw,       // mirror index
        TIndexOffU topf,                 // SA range top in forward index
        TIndexOffU botf,                 // SA range bottom in forward index
        TIndexOffU topb,                 // SA range top in backward index
        TIndexOffU botb,                 // SA range bottom in backward index
        TDescentId id,                   // id of leaf Descent
        TRootId rid,                     // id of search root
        const Edit& e,                   // final edit, if needed
        TScore pen,                      // total penalty
        EFactory<Descent<index_t> >& df, // factory with Descent
        EFactory<DescentPos>& pf,        // factory with DescentPoss
        const EList<DescentRoot>& rs,    // roots
        const EList<DescentConfig>& cs); // configs

    /**
     * Reset to uninitialized state.
     */
    void reset() {
        edits_.clear();
        als_.clear();
        lhs_.clear();
        rhs_.clear();
        nelt_ = 0;
        bestPen_ = worstPen_ = std::numeric_limits<TAlScore>::max();
    }

    /**
     * Return the total size occupied by this sink and all its constituent
     * parts.
     */
    size_t totalSizeBytes() const {
        return edits_.totalSizeBytes() +
               als_.totalSizeBytes() +
               lhs_.totalSizeBytes() +
               rhs_.totalSizeBytes() +
               sizeof(size_t);
    }

    /**
     * Return the total capacity of this sink and all its constituent parts.
     */
    size_t totalCapacityBytes() const {
        return edits_.totalCapacityBytes() +
               als_.totalCapacityBytes() +
               lhs_.totalCapacityBytes() +
               rhs_.totalCapacityBytes() +
               sizeof(size_t);
    }

    /**
     * Return the number of SA ranges involved in hits.
     */
    size_t nrange() const {
        return als_.size();
    }

    /**
     * Return the number of SA elements involved in hits.
     */
    size_t nelt() const {
        return nelt_;
    }

    /**
     * The caller provides 'i', which is an offset of a particular element in
     * one of the SA ranges in the current stratum.  This function returns, in
     * 'al' and 'off', information about the element in terms of the range it's
     * part of and its offset into that range.
     */
    void elt(size_t i, DescentAlignment& al, size_t& ri, size_t& off) const {
        assert_lt(i, nelt());
        for(size_t j = 0; j < als_.size(); j++) {
            if(i < als_[j].size()) {
                al = als_[j];
                ri = j;
                off = i;
                return;
            }
            i -= als_[j].size();
        }
        assert(false);
    }

    /**
     * Get a particular alignment.
     */
    const DescentAlignment& operator[](size_t i) const {
        return als_[i];
    }

    /**
     * Return true iff (a) we found an alignment since the sink was initialized
     * or since the last time advanceStratum() was called, and (b) the penalty
     * associated with the current-best task on the heap ('best') is worse
     * (higher) than the penalty associated with the alignments found most
     * recently (worstPen_).
     */
    bool stratumDone(TAlScore bestPen) const {
        if(nelt_ > 0 && bestPen > worstPen_) {
            return true;
        }
        return false;
    }

    /**
     * The alignment consumer calls this to indicate that they are done with
     * all the alignments in the current best non-empty stratum.  We can
     * therefore mark all those alignments as "reported" and start collecting
     * results for the next stratum.
     */
    void advanceStratum() {
        assert_gt(nelt_, 0);
        edits_.clear();
        als_.clear();
        // Don't reset lhs_ or rhs_
        nelt_ = 0;
        bestPen_ = worstPen_ = std::numeric_limits<TAlScore>::max();
    }

#ifndef NDEBUG
    /**
     * Check that alignment sink is internally consistent.
     */
    bool repOk() const {
        assert_geq(nelt_, als_.size());
        for(size_t i = 1; i < als_.size(); i++) {
            assert_geq(als_[i].pen, als_[i-1].pen);
        }
        assert(bestPen_ == std::numeric_limits<TAlScore>::max() || worstPen_ >= bestPen_);
        return true;
    }
#endif

    TAlScore bestPenalty() const { return bestPen_; }
    TAlScore worstPenalty() const { return worstPen_; }

    size_t editsSize() const { return edits_.size(); }
    size_t alsSize() const { return als_.size(); }
    size_t lhsSize() const { return lhs_.size(); }
    size_t rhsSize() const { return rhs_.size(); }

    const EList<Edit>& edits() const { return edits_; }

protected:

    EList<Edit> edits_;
    EList<DescentAlignment> als_;
    ESet<Triple<TIndexOffU, TIndexOffU, size_t> > lhs_;
    ESet<Triple<TIndexOffU, TIndexOffU, size_t> > rhs_;
    size_t nelt_;
    TAlScore bestPen_;  // best (smallest) penalty among as-yet-unreported alns
    TAlScore worstPen_; // worst (greatest) penalty among as-yet-unreported alns
#ifndef NDEBUG
    BTDnaString tmprfdnastr_;
#endif
};

/**
 * Class that aggregates partial alignments taken from a snapshot of the
 * DescentDriver heap.
 */
class DescentPartialResolvedAlignmentSink {

public:

    /**
     * Reset to uninitialized state.
     */
    void reset() {
        edits_.clear();
        als_.clear();
        nelt_ = 0;
        bestPen_ = worstPen_ = std::numeric_limits<TAlScore>::max();
    }

    /**
     * Return the total size occupied by this sink and all its constituent
     * parts.
     */
    size_t totalSizeBytes() const {
        return edits_.totalSizeBytes() +
               als_.totalSizeBytes() +
               sizeof(size_t);
    }

    /**
     * Return the total capacity of this sink and all its constituent parts.
     */
    size_t totalCapacityBytes() const {
        return edits_.totalCapacityBytes() +
               als_.totalCapacityBytes() +
               sizeof(size_t);
    }

    /**
     * Return the number of SA ranges involved in hits.
     */
    size_t nrange() const {
        return als_.size();
    }

    /**
     * Return the number of SA elements involved in hits.
     */
    size_t nelt() const {
        return nelt_;
    }

    /**
     * The caller provides 'i', which is an offset of a particular element in
     * one of the SA ranges in the current stratum.  This function returns, in
     * 'al' and 'off', information about the element in terms of the range it's
     * part of and its offset into that range.
     */
    void elt(size_t i, DescentPartialResolvedAlignment& al, size_t& ri, size_t& off) const {
        assert_lt(i, nelt());
        for(size_t j = 0; j < als_.size(); j++) {
            if(i < als_[j].size()) {
                al = als_[j];
                ri = j;
                off = i;
                return;
            }
            i -= als_[j].size();
        }
        assert(false);
    }

    /**
     * Get a particular alignment.
     */
    const DescentPartialResolvedAlignment& operator[](size_t i) const {
        return als_[i];
    }

    /**
     * Return true iff (a) we found an alignment since the sink was initialized
     * or since the last time advanceStratum() was called, and (b) the penalty
     * associated with the current-best task on the heap ('best') is worse
     * (higher) than the penalty associated with the alignments found most
     * recently (worstPen_).
     */
    bool stratumDone(TAlScore bestPen) const {
        if(nelt_ > 0 && bestPen > worstPen_) {
            return true;
        }
        return false;
    }

    /**
     * The alignment consumer calls this to indicate that they are done with
     * all the alignments in the current best non-empty stratum.  We can
     * therefore mark all those alignments as "reported" and start collecting
     * results for the next stratum.
     */
    void advanceStratum() {
        assert_gt(nelt_, 0);
        edits_.clear();
        als_.clear();
        nelt_ = 0;
        bestPen_ = worstPen_ = std::numeric_limits<TAlScore>::max();
    }

#ifndef NDEBUG
    /**
     * Check that partial alignment sink is internally consistent.
     */
    bool repOk() const {
        assert_geq(nelt_, als_.size());
        //for(size_t i = 1; i < als_.size(); i++) {
        //    assert_geq(als_[i].pen, als_[i-1].pen);
        //}
        assert(bestPen_ == std::numeric_limits<TAlScore>::max() || worstPen_ >= bestPen_);
        return true;
    }
#endif

    TAlScore bestPenalty() const { return bestPen_; }
    TAlScore worstPenalty() const { return worstPen_; }

    size_t editsSize() const { return edits_.size(); }
    size_t alsSize() const { return als_.size(); }

    const EList<Edit>& edits() const { return edits_; }

protected:

    EList<Edit> edits_;
    EList<DescentPartialResolvedAlignment> als_;
    size_t nelt_;
    TAlScore bestPen_;  // best (smallest) penalty among as-yet-unreported alns
    TAlScore worstPen_; // worst (greatest) penalty among as-yet-unreported alns
};

/**
 * Abstract parent for classes that select descent roots and descent
 * configurations given information about the read.
 */
class DescentRootSelector {

public:

    virtual ~DescentRootSelector() { }

    virtual void select(
        const Read& q,                  // read that we're selecting roots for
        const Read* qo,                 // opposite mate, if applicable
        bool nofw,                      // don't add roots for fw read
        bool norc,                      // don't add roots for rc read
        EList<DescentConfig>& confs,    // put DescentConfigs here
        EList<DescentRoot>& roots) = 0; // put DescentRoots here
};

/**
 * Encapsulates a set of conditions governing when the DescentDriver should
 * stop.
 */
struct DescentStoppingConditions {

    DescentStoppingConditions() { reset(); }

    DescentStoppingConditions(
        size_t totsz_,
        size_t nfound_,
        bool stra_,
        size_t nbwop_)
    {
        init(totsz_, nfound_, stra_, nbwop_);
    }

    /**
     * Reset to uninitialized state.
     */
    void reset() {
        totsz = nfound = nbwop = std::numeric_limits<size_t>::max();
        stra = false;
        assert(!inited());
    }

    /**
     * Initialize this DescentStoppingConditions.
     */
    void init(
        size_t totsz_,
        size_t nfound_,
        bool stra_,
        size_t nbwop_)
    {
        totsz = totsz_;
        nfound = nfound_;
        stra = stra_;
        nbwop = nbwop_;
        assert(inited());
    }

    /**
     * Return true iff this instance is initialized.
     */
    bool inited() const {
        return totsz != std::numeric_limits<size_t>::max();
    }

    size_t totsz;  // total size of all the expandable data structures in bytes
    size_t nfound; // # alignments found
    bool stra;     // stop after each non-empty stratum
    size_t nbwop;  // # Burrows-Wheeler (rank) operations performed
};
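
/*
 * Example: a minimal sketch of a stopping policy that caps expandable memory
 * at 64 MB, stops after 16 alignments or 1 million Burrows-Wheeler ops, and
 * pauses at the end of each non-empty stratum.
 *
 *   DescentStoppingConditions stopc(
 *       64 * 1024 * 1024, // totsz
 *       16,               // nfound
 *       true,             // stra
 *       1000000);         // nbwop
 */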

enum {
    DESCENT_DRIVER_ALN    = 1,
    DESCENT_DRIVER_STRATA = 2,
    DESCENT_DRIVER_MEM    = 4,
    DESCENT_DRIVER_BWOPS  = 8,
    DESCENT_DRIVER_DONE   = 16
};
/**
|
|
* Class responsible for advancing all the descents. The initial descents may
|
|
* emanate from several different locations in the read. Note that descents
|
|
* may become redundant with each other, and should then be eliminated.
|
|
*/
|
|
template <typename index_t>
|
|
class DescentDriver {
|
|
public:
|
|
|
|
DescentDriver(bool veryVerbose) :
|
|
veryVerbose_(veryVerbose)
|
|
{
|
|
reset();
|
|
}
|
|
|
|
/**
|
|
* Initialize driver with respect to a new read. If a DescentRootSelector
|
|
* is specified, then it is used to obtain roots as well.
|
|
*/
|
|
void initRead(
|
|
const Read& q,
|
|
bool nofw,
|
|
bool norc,
|
|
TAlScore minsc,
|
|
TAlScore maxpen,
|
|
const Read* qu = NULL,
|
|
DescentRootSelector *sel = NULL)
|
|
{
|
|
reset();
|
|
q_ = q;
|
|
minsc_ = minsc;
|
|
maxpen_ = maxpen;
|
|
if(sel != NULL) {
|
|
sel->select(q_, qu, nofw, norc, confs_, roots_);
|
|
}
|
|
re_.init(q.length());
|
|
}
|
|
|
|
/**
|
|
* Add a new search root, which might (a) prefer to move in a left-to-right
|
|
* direction, and might (b) be with respect to the read or its reverse
|
|
* complement.
|
|
*/
|
|
void addRoot(
|
|
const DescentConfig& conf,
|
|
TReadOff off,
|
|
bool l2r,
|
|
bool fw,
|
|
float pri)
|
|
{
|
|
confs_.push_back(conf);
|
|
assert_lt(off, q_.length());
|
|
if(l2r && off == q_.length()-1) {
|
|
l2r = !l2r;
|
|
} else if(!l2r && off == 0) {
|
|
l2r = !l2r;
|
|
}
|
|
roots_.push_back(DescentRoot(off, l2r, fw, q_.length(), pri));
|
|
}
|
|
|
|
/**
|
|
* Clear out the DescentRoots currently configured.
|
|
*/
|
|
void clearRoots() {
|
|
confs_.clear();
|
|
roots_.clear();
|
|
}

    /**
     * Clear the Descent driver so that we're ready to re-start seed alignment
     * for the current read.
     */
    void resetRead() {
        df_.clear();      // clear Descents
        assert_leq(df_.totalSizeBytes(), 100);
        pf_.clear();      // clear DescentPoss
        assert_leq(pf_.totalSizeBytes(), 100);
        heap_.clear();    // clear Heap
        assert_leq(heap_.totalSizeBytes(), 100);
        roots_.clear();   // clear roots
        assert_leq(roots_.totalSizeBytes(), 100);
        confs_.clear();   // clear confs
        assert_leq(confs_.totalSizeBytes(), 100);
        alsink_.reset();  // clear alignment sink
        assert_leq(alsink_.totalSizeBytes(), 100);
        re_.reset();      // clear redundancy checker
        assert_leq(re_.totalSizeBytes(), 100);
        rootsInited_ = 0; // haven't yet created initial descents
        curPen_ = 0;      // accumulated penalty back to zero
    }

    /**
     * Clear the Descent driver entirely; equivalent to resetRead().
     */
    void reset() {
        resetRead();
    }

    /**
     * Perform seed alignment.
     */
    void go(
        const Scoring& sc,         // scoring scheme
        const GFM<index_t>& gfmFw, // forward index
        const GFM<index_t>& gfmBw, // mirror index
        DescentMetrics& met,       // metrics
        PerReadMetrics& prm);      // per-read metrics

    /**
     * Perform seed alignment until some stopping condition is satisfied.
     */
    int advance(
        const DescentStoppingConditions& stopc, // stopping conditions
        const Scoring& sc,         // scoring scheme
        const GFM<index_t>& gfmFw, // forward index
        const GFM<index_t>& gfmBw, // mirror index
        DescentMetrics& met,       // metrics
        PerReadMetrics& prm);      // per-read metrics

#ifndef NDEBUG
    /**
     * Return true iff this DescentDriver is well formed. Throw an assertion
     * otherwise.
     */
    bool repOk() const {
        return true;
    }
#endif

    /**
     * Return the number of end-to-end alignments reported.
     */
    size_t numAlignments() const {
        return alsink_.nelt();
    }

    /**
     * Return the associated DescentAlignmentSink object.
     */
    const DescentAlignmentSink<index_t>& sink() const {
        return alsink_;
    }

    /**
     * Return the associated DescentAlignmentSink object.
     */
    DescentAlignmentSink<index_t>& sink() {
        return alsink_;
    }

    /**
     * Return the total size occupied by the Descent driver and all its
     * constituent parts.
     */
    size_t totalSizeBytes() const {
        return df_.totalSizeBytes() +
               pf_.totalSizeBytes() +
               heap_.totalSizeBytes() +
               roots_.totalSizeBytes() +
               confs_.totalSizeBytes() +
               alsink_.totalSizeBytes() +
               re_.totalSizeBytes();
    }

    /**
     * Return the total capacity of the Descent driver and all its constituent
     * parts.
     */
    size_t totalCapacityBytes() const {
        return df_.totalCapacityBytes() +
               pf_.totalCapacityBytes() +
               heap_.totalCapacityBytes() +
               roots_.totalCapacityBytes() +
               confs_.totalCapacityBytes() +
               alsink_.totalCapacityBytes() +
               re_.totalCapacityBytes();
    }

    /**
     * Return a const ref to the query.
     */
    const Read& query() const {
        return q_;
    }

    /**
     * Return the minimum score that must be achieved by an alignment in order
     * for it to be considered "valid".
     */
    TAlScore minScore() const {
        return minsc_;
    }

protected:

    Read q_;                     // query nucleotide and quality strings
    TAlScore minsc_;             // minimum score
    TAlScore maxpen_;            // maximum penalty
    EFactory<Descent<index_t> > df_; // factory holding all the Descents, which
                                     // must be referred to by ID
    EFactory<DescentPos> pf_;    // factory holding all the DescentPoss, which
                                 // must be referred to by ID
    EList<DescentRoot> roots_;   // search roots
    EList<DescentConfig> confs_; // configuration params for each root
    size_t rootsInited_;         // # initial Descents already created
    EHeap<TDescentPair> heap_;   // priority queue of Descents
    DescentAlignmentSink<index_t> alsink_; // alignment sink
    DescentRedundancyChecker re_; // redundancy checker
    TAlScore curPen_;            // current penalty
    bool veryVerbose_;           // print lots of partial alignments

    EList<Edit> tmpedit_;
    BTDnaString tmprfdnastr_;
};
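
/*
 * Illustrative usage sketch (hypothetical caller code; the Read, Scoring,
 * index and metrics objects are assumed to be constructed elsewhere):
 *
 *   DescentDriver<TIndexOffU> drv(false);      // not very verbose
 *   drv.initRead(rd, false, false, minsc, maxpen);
 *   DescentConfig conf;                        // assumed configured
 *   drv.addRoot(conf, 0, true, true, 1.0f);    // root at 5' end, l2r, fw
 *   drv.go(sc, gfmFw, gfmBw, met, prm);        // run to completion
 *   size_t nals = drv.numAlignments();         // then inspect drv.sink()
 */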

/**
 * Selects alignments to report from a complete non-empty stratum of
 * alignments stored in the DescentAlignmentSink.
 */
template <typename index_t>
class DescentAlignmentSelector {

public:

    DescentAlignmentSelector() : gwstate_(GW_CAT) { reset(); }

    /**
     * Initialize a new selector w/r/t a DescentAlignmentSink holding a
     * non-empty alignment stratum.
     */
    void init(
        const Read& q,
        const DescentAlignmentSink<index_t>& sink,
        const GFM<index_t>& gfmFw,   // forward Bowtie index for walking left
        const BitPairReference& ref, // bitpair-encoded reference
        RandomSource& rnd,           // pseudo-random generator for sampling rows
        WalkMetrics& met)
    {
        // We're going to sample from space of *alignments*, not ranges. So
        // when we extract a sample, we'll have to do a little extra work to
        // convert it to a <range, offset> coordinate.
        rnd_.init(
            sink.nelt(), // # elements to choose from
            true);       // without replacement
        offs_.resize(sink.nelt());
        offs_.fill(std::numeric_limits<TIndexOffU>::max());
        sas_.resize(sink.nrange());
        gws_.resize(sink.nrange());
        size_t ei = 0;
        for(size_t i = 0; i < sas_.size(); i++) {
            size_t en = sink[i].botf - sink[i].topf;
            sas_[i].init(sink[i].topf, q.length(), EListSlice<TIndexOffU, 16>(offs_, ei, en));
            gws_[i].init(gfmFw, ref, sas_[i], rnd, met);
            ei += en;
        }
    }

    /**
     * Reset the selector.
     */
    void reset() {
        rnd_.reset();
    }

    /**
     * Return true iff the selector is currently initialized.
     */
    bool inited() const {
        return rnd_.size() > 0;
    }

    /**
     * Get next alignment and convert it to an AlnRes.
     */
    bool next(
        const DescentDriver<index_t>& dr,
        const GFM<index_t>& gfmFw,   // forward Bowtie index for walking left
        const BitPairReference& ref, // bitpair-encoded reference
        RandomSource& rnd,
        AlnRes& rs,
        WalkMetrics& met,
        PerReadMetrics& prm)
    {
        // Sample one alignment randomly from pool of remaining alignments
        size_t ri = (size_t)rnd_.next(rnd);
        size_t off = 0;
        DescentAlignment al;
        size_t rangei = 0;
        // Convert random alignment index into a <range, offset> coordinate
        dr.sink().elt(ri, al, rangei, off);
        assert_lt(off, al.size());
        Coord refcoord;
        WalkResult<index_t> wr;
        TIndexOffU tidx = 0, toff = 0, tlen = 0;
        gws_[rangei].advanceElement(
            (TIndexOffU)off,
            gfmFw,        // forward Bowtie index for walking left
            ref,          // bitpair-encoded reference
            sas_[rangei], // SA range with offsets
            gwstate_,     // GroupWalk state; scratch space
            wr,           // put the result here
            met,          // metrics
            prm);         // per-read metrics
        assert_neq(OFF_MASK, wr.toff);
        bool straddled = false;
        gfmFw.joinedToTextOff(
            wr.elt.len,
            wr.toff,
            tidx,
            toff,
            tlen,
            true,       // reject straddlers?
            straddled); // straddled?
        if(tidx == OFF_MASK) {
            // The seed hit straddled a reference boundary so the seed
            // hit isn't valid
            return false;
        }
        // Coordinate of the seed hit w/r/t the pasted reference string
        refcoord.init(tidx, (int64_t)toff, dr.sink()[rangei].fw);
        const EList<Edit>& edits = dr.sink().edits();
        size_t ns = 0, ngap = 0, nrefn = 0;
        for(size_t i = al.ei; i < al.ei + al.en; i++) {
            if(edits[i].qchr == 'N' || edits[i].chr == 'N') ns++;
            if(edits[i].chr == 'N') nrefn++;
            if(edits[i].isGap()) ngap++;
        }
        AlnScore asc(
            -dr.sink().bestPenalty(), // numeric score
            ns,                       // # Ns
            ngap);                    // # gaps
        rs.init(
            dr.query().length(), // # chars after hard trimming
            asc,                 // alignment score
            &dr.sink().edits(),  // nucleotide edits array
            al.ei,               // nucleotide edits first pos
            al.en,               // nucleotide edits last pos
            NULL,                // ambig base array
            0,                   // ambig base first pos
            0,                   // ambig base last pos
            refcoord,            // coord of leftmost aligned char in ref
            tlen,                // length of reference aligned to
            -1,                  // # seed mms allowed
            -1,                  // seed length
            -1,                  // seed interval
            dr.minScore(),       // minimum score for valid alignment
            -1,                  // nuc5p (for colorspace)
            -1,                  // nuc3p (for colorspace)
            false,               // soft pre-trimming?
            0,                   // 5p pre-trimming
            0,                   // 3p pre-trimming
            false,               // soft trimming?
            0,                   // 5p trimming
            0);                  // 3p trimming
        rs.setRefNs(nrefn);
        return true;
    }

    /**
     * Return true iff all elements have been reported.
     */
    bool done() const {
        return rnd_.done();
    }

    /**
     * Return the total size occupied by this selector and all its
     * constituent parts.
     */
    size_t totalSizeBytes() const {
        return rnd_.totalSizeBytes() +
               offs_.totalSizeBytes() +
               sas_.totalSizeBytes() +
               gws_.totalSizeBytes();
    }

    /**
     * Return the total capacity of this selector and all its constituent
     * parts.
     */
    size_t totalCapacityBytes() const {
        return rnd_.totalCapacityBytes() +
               offs_.totalCapacityBytes() +
               sas_.totalCapacityBytes() +
               gws_.totalCapacityBytes();
    }

protected:

    Random1toN rnd_;
    EList<TIndexOffU, 16> offs_;
    EList<SARangeWithOffs<EListSlice<TIndexOffU, 16>, index_t> > sas_;
    EList<GroupWalk2S<index_t, EListSlice<TIndexOffU, 16>, 16> > gws_;
    GroupWalkState<index_t> gwstate_;
};
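
/*
 * Illustrative usage sketch (hypothetical caller code): drain one stratum of
 * alignments from a finished DescentDriver through the selector. All objects
 * other than the selector itself are assumed to exist already.
 *
 *   DescentAlignmentSelector<TIndexOffU> sel;
 *   sel.init(rd, drv.sink(), gfmFw, ref, rnd, wmet);
 *   AlnRes res;
 *   while(!sel.done()) {
 *       if(sel.next(drv, gfmFw, ref, rnd, res, wmet, prm)) {
 *           // res now holds one sampled alignment; report it
 *       } // next() returns false for hits straddling reference boundaries
 *   }
 */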

/**
 * Selects and prioritizes partial alignments from the heap of the
 * DescentDriver. We assume that the heap is no longer changing (i.e. that the
 * DescentDriver is done). Usually, the user will then attempt to extend the
 * partial alignments into full alignments. This can happen incrementally;
 * that is, the user might ask for the partial alignments one "batch" at a
 * time, and the selector will only do as much work as is necessary to supply
 * each requested batch.
 *
 * The actual work done here includes: (a) scanning the heap for high-priority
 * partial alignments, (b) setting up the rnd_, offs_, sas_, gws_, and gwstate_
 * fields and resolving offsets of partial alignments, (c) packaging and
 * delivering batches of results to the caller.
 *
 * How to prioritize partial alignments? One idea is to use the same
 * penalty-based prioritization used in the heap. This has pros: (a) it
 * maintains the guarantee that we're visiting alignments in best-to-worst
 * order in end-to-end alignment mode, (b) the heap is already prioritized
 * this way, so it's easier for us to compile high-priority partial
 * alignments. But the con is that it doesn't take depth into account, which
 * could mean that we're extending a lot of very short partial alignments
 * first.
 *
 * A problem we should keep in mind is that some
 */
template <typename index_t>
class DescentPartialAlignmentSelector {

public:

    DescentPartialAlignmentSelector() : gwstate_(GW_CAT) { reset(); }

    /**
     * Initialize a new selector w/r/t a read, index and heap of partial
     * alignments.
     */
    void init(
        const Read& q,                   // read
        const EHeap<TDescentPair>& heap, // the heap w/ the partial alns
        TAlScore depthBonus,             // use depth when prioritizing
        size_t nbatch,                   // # of alignments in a batch
        const GFM<index_t>& gfmFw,       // forward Bowtie index for walk-left
        const BitPairReference& ref,     // bitpair-encoded reference
        RandomSource& rnd,               // pseudo-randoms for sampling rows
        WalkMetrics& met)                // metrics re: offset resolution
    {
        // Make our internal heap
        if(depthBonus > 0) {
            heap_.clear();
            for(size_t i = 0; i < heap.size(); i++) {
                TDescentPair p = heap[i];
                p.first.pen += depthBonus * p.first.depth;
                heap_.insert(p);
            }
        } else {
            heap_ = heap;
        }
#if 0
        // We're going to sample from space of *alignments*, not ranges. So
        // when we extract a sample, we'll have to do a little extra work to
        // convert it to a <range, offset> coordinate.
        rnd_.init(
            sink.nelt(), // # elements to choose from
            true);       // without replacement
        offs_.resize(sink.nelt());
        offs_.fill(std::numeric_limits<TIndexOff>::max());
        sas_.resize(sink.nrange());
        gws_.resize(sink.nrange());
        size_t ei = 0;
        for(size_t i = 0; i < sas_.size(); i++) {
            size_t en = sink[i].botf - sink[i].topf;
            sas_[i].init(sink[i].topf, q.length(), EListSlice<TIndexOff, 16>(offs_, ei, en));
            gws_[i].init(gfmFw, ref, sas_[i], rnd, met);
            ei += en;
        }
#endif
    }
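
    /*
     * Illustrative note on the depth bonus above (hypothetical numbers): the
     * adjusted key inserted into heap_ is pen + depthBonus * depth. Since the
     * heap is a min-heap over this value, an entry with pen = 10 at depth 30
     * gets key 40 under depthBonus = 1, while one with pen = 12 at depth 5
     * gets key 17 and is popped first.
     */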

    /**
     * Compile the next batch of partial alignments from the heap. (Currently
     * a stub; the body has not been implemented.)
     */
    void compileBatch() {
    }

    /**
     * Reset the selector.
     */
    void reset() {
        heap_.clear();
    }

    /**
     * Return true iff the selector is currently initialized.
     */
    bool inited() const {
        return !heap_.empty();
    }

    /**
     * Get next alignment and convert it to an AlnRes.
     */
    bool next(
        const DescentDriver<index_t>& dr,
        const GFM<index_t>& gfmFw,   // forward Bowtie index for walking left
        const BitPairReference& ref, // bitpair-encoded reference
        RandomSource& rnd,
        AlnRes& rs,
        WalkMetrics& met,
        PerReadMetrics& prm)
    {
        // Sample one alignment randomly from pool of remaining alignments
        size_t ri = (size_t)rnd_.next(rnd);
        size_t off = 0;
        DescentAlignment al;
        size_t rangei = 0;
        // Convert random alignment index into a <range, offset> coordinate
        dr.sink().elt(ri, al, rangei, off);
        assert_lt(off, al.size());
        Coord refcoord;
        WalkResult<index_t> wr;
        uint32_t tidx = 0, toff = 0, tlen = 0;
        gws_[rangei].advanceElement(
            (uint32_t)off,
            gfmFw,        // forward Bowtie index for walking left
            ref,          // bitpair-encoded reference
            sas_[rangei], // SA range with offsets
            gwstate_,     // GroupWalk state; scratch space
            wr,           // put the result here
            met,          // metrics
            prm);         // per-read metrics
        assert_neq(0xffffffff, wr.toff);
        bool straddled = false;
        gfmFw.joinedToTextOff(
            wr.elt.len,
            wr.toff,
            tidx,
            toff,
            tlen,
            true,       // reject straddlers?
            straddled); // straddled?
        if(tidx == 0xffffffff) {
            // The seed hit straddled a reference boundary so the seed
            // hit isn't valid
            return false;
        }
        // Coordinate of the seed hit w/r/t the pasted reference string
        refcoord.init(tidx, (int64_t)toff, dr.sink()[rangei].fw);
        const EList<Edit>& edits = dr.sink().edits();
        size_t ns = 0, ngap = 0, nrefn = 0;
        for(size_t i = al.ei; i < al.ei + al.en; i++) {
            if(edits[i].qchr == 'N' || edits[i].chr == 'N') ns++;
            if(edits[i].chr == 'N') nrefn++;
            if(edits[i].isGap()) ngap++;
        }
        return true;
    }

    /**
     * Return true iff all elements have been reported.
     */
    bool done() const {
        return rnd_.done();
    }

    /**
     * Return the total size occupied by this selector and all its
     * constituent parts.
     */
    size_t totalSizeBytes() const {
        return heap_.totalSizeBytes() +
               rnd_.totalSizeBytes() +
               offs_.totalSizeBytes() +
               sas_.totalSizeBytes() +
               gws_.totalSizeBytes();
    }

    /**
     * Return the total capacity of this selector and all its constituent
     * parts.
     */
    size_t totalCapacityBytes() const {
        return heap_.totalCapacityBytes() +
               rnd_.totalCapacityBytes() +
               offs_.totalCapacityBytes() +
               sas_.totalCapacityBytes() +
               gws_.totalCapacityBytes();
    }

protected:

    // This class's working heap. This might simply be a copy of the original
    // heap, or it might be re-prioritized in some way.
    EHeap<TDescentPair> heap_;

    Random1toN rnd_;
    EList<TIndexOff, 16> offs_;
    EList<SARangeWithOffs<EListSlice<TIndexOff, 16>, index_t> > sas_;
    EList<GroupWalk2S<index_t, EListSlice<TIndexOff, 16>, 16> > gws_;
    GroupWalkState<index_t> gwstate_;
};

/**
 * Drive the process of descending from all search roots.
 */
template <typename index_t>
void DescentDriver<index_t>::go(
    const Scoring& sc,         // scoring scheme
    const GFM<index_t>& gfmFw, // forward index
    const GFM<index_t>& gfmBw, // mirror index
    DescentMetrics& met,       // metrics
    PerReadMetrics& prm)       // per-read metrics
{
    assert(q_.repOk());
    // Convert DescentRoots to the initial Descents
    for(size_t i = 0; i < roots_.size(); i++) {
        size_t dfsz = df_.size();
        size_t pfsz = pf_.size();
        TDescentId id = df_.alloc();
        Edit e_null;
        assert(!e_null.inited());
        bool succ = df_[id].init(
            q_,      // read
            i,       // root and conf id
            sc,      // scoring scheme
            minsc_,  // minimum score
            maxpen_, // maximum penalty
            id,      // new Descent's id
            gfmFw,   // forward index
            gfmBw,   // mirror index
            re_,     // redundancy checker
            df_,     // Descent factory
            pf_,     // DescentPos factory
            roots_,  // DescentRoots
            confs_,  // DescentConfs
            heap_,   // heap
            alsink_, // alignment sink
            met,     // metrics
            prm);    // per-read metrics
        if(veryVerbose_) {
            bool fw = roots_[i].fw;
            tmpedit_.clear();
            df_[id].print(
                &cerr,
                "",
                q_,
                0,
                0,
                fw,
                tmpedit_,
                0,
                tmpedit_.size(),
                tmprfdnastr_);
        }
        if(!succ) {
            // Reclaim memory we had used for this descent and its DescentPos info
            df_.resize(dfsz);
            pf_.resize(pfsz);
        }
    }
    // Advance until some stopping condition
    bool stop = heap_.empty();
    while(!stop) {
        // Pop off the highest-priority descent. Note that some of its
        // outgoing edges might have been explored in the meantime, which
        // could have reduced the descent's priority since it was inserted.
        TDescentPair p = heap_.pop();
        df_.alloc(); df_.pop();
        df_[p.second].followBestOutgoing(
            q_,      // read
            gfmFw,   // index over text
            gfmBw,   // index over reverse text
            sc,      // scoring scheme
            minsc_,  // minimum score
            maxpen_, // maximum penalty
            re_,     // redundancy checker
            df_,     // Descent factory
            pf_,     // DescentPos factory
            roots_,  // DescentRoots
            confs_,  // DescentConfs
            heap_,   // priority queue for Descents
            alsink_, // alignment sink
            met,     // metrics
            prm);    // per-read metrics
        stop = heap_.empty();
    }
}

/**
 * Perform seed alignment until some stopping condition is satisfied.
 */
template <typename index_t>
int DescentDriver<index_t>::advance(
    const DescentStoppingConditions& stopc, // stopping conditions
    const Scoring& sc,         // scoring scheme
    const GFM<index_t>& gfmFw, // forward index
    const GFM<index_t>& gfmBw, // mirror index
    DescentMetrics& met,       // metrics
    PerReadMetrics& prm)       // per-read metrics
{
    size_t nbwop_i = met.bwops;
    while(rootsInited_ < roots_.size()) {
        size_t dfsz = df_.size();
        size_t pfsz = pf_.size();
        TDescentId id = df_.alloc();
        Edit e_null;
        assert(!e_null.inited());
        bool succ = df_[id].init(
            q_,           // query
            rootsInited_, // root and conf id
            sc,           // scoring scheme
            minsc_,       // minimum score
            maxpen_,      // maximum penalty
            id,           // new Descent's id
            gfmFw,        // forward index
            gfmBw,        // mirror index
            re_,          // redundancy checker
            df_,          // Descent factory
            pf_,          // DescentPos factory
            roots_,       // DescentRoots
            confs_,       // DescentConfs
            heap_,        // heap
            alsink_,      // alignment sink
            met,          // metrics
            prm);         // per-read metrics
        if(!succ) {
            // Reclaim memory we had used for this descent and its DescentPos info
            df_.resize(dfsz);
            pf_.resize(pfsz);
        }
        rootsInited_++;
        TAlScore best = std::numeric_limits<TAlScore>::max();
        if(!heap_.empty()) {
            best = heap_.top().first.pen;
        }
        if(stopc.nfound > 0 && alsink_.nelt() > stopc.nfound) {
            return DESCENT_DRIVER_ALN;
        }
        if(alsink_.stratumDone(best)) {
            return DESCENT_DRIVER_STRATA;
        }
        if(stopc.nbwop > 0 && (met.bwops - nbwop_i) > stopc.nbwop) {
            return DESCENT_DRIVER_BWOPS;
        }
        if(stopc.totsz > 0 && totalSizeBytes() > stopc.totsz) {
            return DESCENT_DRIVER_MEM;
        }
    }
    // Advance until some stopping condition
    bool stop = heap_.empty();
    while(!stop) {
        // Pop off the highest-priority descent. Note that some of its
        // outgoing edges might have been explored in the meantime, which
        // could have reduced the descent's priority since it was inserted.
        TDescentPair p = heap_.pop();
        df_.alloc(); df_.pop();
        df_[p.second].followBestOutgoing(
            q_,      // read
            gfmFw,   // forward index
            gfmBw,   // mirror index
            sc,      // scoring scheme
            minsc_,  // minimum score
            maxpen_, // maximum penalty
            re_,     // redundancy checker
            df_,     // Descent factory
            pf_,     // DescentPos factory
            roots_,  // DescentRoots
            confs_,  // DescentConfs
            heap_,   // heap
            alsink_, // alignment sink
            met,     // metrics
            prm);    // per-read metrics
        TAlScore best = std::numeric_limits<TAlScore>::max();
        if(!heap_.empty()) {
            best = heap_.top().first.pen;
        }
        if(stopc.nfound > 0 && alsink_.nelt() > stopc.nfound) {
            return DESCENT_DRIVER_ALN;
        }
        if(alsink_.stratumDone(best)) {
            return DESCENT_DRIVER_STRATA;
        }
        if(stopc.nbwop > 0 && (met.bwops - nbwop_i) > stopc.nbwop) {
            return DESCENT_DRIVER_BWOPS;
        }
        if(stopc.totsz > 0 && totalSizeBytes() > stopc.totsz) {
            return DESCENT_DRIVER_MEM;
        }
        stop = heap_.empty();
    }
    return DESCENT_DRIVER_DONE;
}
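
/*
 * Note: advance() is resumable. rootsInited_ records how many roots have
 * already been turned into initial Descents, so a call that returns early
 * (e.g. with DESCENT_DRIVER_MEM) can be followed by another call that picks
 * up where the previous one left off. A hypothetical caller might loosen the
 * stopping conditions between calls:
 *
 *   int ret = drv.advance(stopc, sc, gfmFw, gfmBw, met, prm);
 *   if(ret == DESCENT_DRIVER_MEM) {
 *       stopc.totsz *= 2; // grant more memory, then resume
 *       ret = drv.advance(stopc, sc, gfmFw, gfmBw, met, prm);
 *   }
 */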

/**
 * If this is the final descent in a complete end-to-end alignment, report
 * the alignment.
 */
template <typename index_t>
bool DescentAlignmentSink<index_t>::reportAlignment(
    const Read& q,             // query string
    const GFM<index_t>& gfmFw, // forward index
    const GFM<index_t>& gfmBw, // mirror index
    TIndexOffU topf,           // SA range top in forward index
    TIndexOffU botf,           // SA range bottom in forward index
    TIndexOffU topb,           // SA range top in backward index
    TIndexOffU botb,           // SA range bottom in backward index
    TDescentId id,             // id of leaf Descent
    TRootId rid,               // id of search root
    const Edit& e,             // final edit, if needed
    TScore pen,                // total penalty
    EFactory<Descent<index_t> >& df, // factory with Descent
    EFactory<DescentPos>& pf,        // factory with DescentPoss
    const EList<DescentRoot>& rs,    // roots
    const EList<DescentConfig>& cs)  // configs
{
    TDescentId cur = id;
    ASSERT_ONLY(const Descent<index_t>& desc = df[id]);
    const bool fw = rs[rid].fw;
    ASSERT_ONLY(size_t len = q.length());
    assert(q.repOk());
    assert_lt(desc.al5pf(), len);
    // Adjust al5pi and al5pf to take the final edit into account (if
    // there is one)
    // Check if this is redundant with a previous reported alignment
    Triple<TIndexOffU, TIndexOffU, size_t> lhs(topf, botf, 0);
    Triple<TIndexOffU, TIndexOffU, size_t> rhs(topb, botb, q.length()-1);
    if(!lhs_.insert(lhs)) {
        rhs_.insert(rhs);
        return false; // Already there
    }
    if(!rhs_.insert(rhs)) {
        return false; // Already there
    }
    size_t ei = edits_.size();
    df[cur].collectEdits(edits_, &e, df);
    size_t en = edits_.size() - ei;
#ifndef NDEBUG
    {
        for(size_t i = 1; i < en; i++) {
            assert_geq(edits_[ei+i].pos, edits_[ei+i-1].pos);
        }
        // Now figure out how much we refrained from aligning on either
        // side.
        size_t trimLf = 0;
        size_t trimRg = 0;
        BTDnaString& rf = tmprfdnastr_;
        rf.clear();
        if(!fw) {
            // Edit offsets are w/r/t 5' end, but desc.print wants them w/r/t
            // the *left* end of the read sequence that aligned
            Edit::invertPoss(edits_, len, ei, en, true);
        }
        desc.print(NULL, "", q, trimLf, trimRg, fw, edits_, ei, en, rf);
        if(!fw) {
            // Invert them back to how they were before
            Edit::invertPoss(edits_, len, ei, en, true);
        }
        ASSERT_ONLY(TIndexOffU toptmp = 0);
        ASSERT_ONLY(TIndexOffU bottmp = 0);
        // Check that the edited string occurs in the reference
        if(!gfmFw.contains(rf, &toptmp, &bottmp)) {
            std::cerr << rf << std::endl;
            assert(false);
        }
    }
#endif
    als_.expand();
    als_.back().init(pen, fw, topf, botf, ei, en);
    nelt_ += (botf - topf);
    if(bestPen_ == std::numeric_limits<TAlScore>::max() || pen < bestPen_) {
        bestPen_ = pen;
    }
    if(worstPen_ == std::numeric_limits<TAlScore>::max() || pen > worstPen_) {
        worstPen_ = pen;
    }
    return true;
}
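
/*
 * Design note on the redundancy check above: two key sets are consulted.
 * lhs_ keys on the forward-index SA range paired with offset 0; rhs_ keys on
 * the backward-index SA range paired with the read's last offset. An
 * alignment is recorded only if its key was new to both sets.
 */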

/**
 * Initialize a new descent branching from the given descent via the given
 * edit. Return false if the Descent has no outgoing edges (and can
 * therefore have its memory freed), true otherwise.
 */
template <typename index_t>
bool Descent<index_t>::init(
    const Read& q,     // query
    TRootId rid,       // root id
    const Scoring& sc, // scoring scheme
    TAlScore minsc,    // minimum score
    TAlScore maxpen,   // maximum penalty
    TReadOff al5pi,    // offset from 5' of 1st aligned char
    TReadOff al5pf,    // offset from 5' of last aligned char
    TIndexOffU topf,   // SA range top in FW index
    TIndexOffU botf,   // SA range bottom in FW index
    TIndexOffU topb,   // SA range top in BW index
    TIndexOffU botb,   // SA range bottom in BW index
    bool l2r,          // direction this descent will go in
    size_t descid,     // my ID
    TDescentId parent, // parent ID
    TScore pen,        // total penalties so far
    const Edit& e,     // edit for incoming edge
    const GFM<index_t>& gfmFw, // forward index
    const GFM<index_t>& gfmBw, // mirror index
    DescentRedundancyChecker& re, // redundancy checker
    EFactory<Descent>& df,        // Descent factory
    EFactory<DescentPos>& pf,     // DescentPos factory
    const EList<DescentRoot>& rs,   // roots
    const EList<DescentConfig>& cs, // configs
    EHeap<TDescentPair>& heap,      // heap
    DescentAlignmentSink<index_t>& alsink, // alignment sink
    DescentMetrics& met, // metrics
    PerReadMetrics& prm) // per-read metrics
{
    assert(q.repOk());
    rid_ = rid;
    al5pi_ = al5pi;
    al5pf_ = al5pf;
    l2r_ = l2r;
    topf_ = topf;
    botf_ = botf;
    topb_ = topb;
    botb_ = botb;
    descid_ = descid;
    parent_ = parent;
    pen_ = pen;
    posid_ = std::numeric_limits<size_t>::max();
    len_ = 0;
    out_.clear();
    edit_ = e;
    lastRecalc_ = true;
    gapadd_ = df[parent].gapadd_;
    if(e.inited()) {
        if(e.isReadGap()) {
            gapadd_++;
        } else if(e.isRefGap()) {
            gapadd_--;
        }
    }
    bool branches = false, hitEnd = false, done = false;
    TIndexOffU topf_new = 0, botf_new = 0, topb_new = 0, botb_new = 0;
    off5p_i_ = 0;
#ifndef NDEBUG
    size_t depth = al5pf_ - al5pi_ + 1;
    TAlScore maxpend = cs[rid_].cons.get(depth, q.length(), maxpen);
    assert_geq(maxpend, pen_); // can't have already exceeded max penalty
#endif
    bool matchSucc = followMatches(
        q,
        sc,
        gfmFw,
        gfmBw,
        re,
        df,
        pf,
        rs,
        cs,
        heap,
        alsink,
        met,
        prm,
        branches,
        hitEnd,
        done,
        off5p_i_,
        topf_new,
        botf_new,
        topb_new,
        botb_new);
    bool bounceSucc = false;
    if(matchSucc && hitEnd && !done) {
        assert(topf_new > 0 || botf_new > 0);
        bounceSucc = bounce(
            q,
            topf_new,
            botf_new,
            topb_new,
            botb_new,
            gfmFw,
            gfmBw,
            sc,
            minsc,  // minimum score
            maxpen, // maximum penalty
            re,
            df,
            pf,
            rs,
            cs,
            heap,
            alsink,
            met,  // descent metrics
            prm); // per-read metrics
    }
    if(matchSucc) {
        // Calculate info about outgoing edges
        recalcOutgoing(q, sc, minsc, maxpen, re, pf, rs, cs, prm);
        if(!empty()) {
            heap.insert(make_pair(out_.bestPri(), descid)); // Add to heap
        }
    }
    return !empty() || bounceSucc;
}

/**
 * Initialize a new descent beginning at the given root. Return false if
 * the Descent has no outgoing edges (and can therefore have its memory
 * freed), true otherwise.
 */
template <typename index_t>
bool Descent<index_t>::init(
    const Read& q,     // query
    TRootId rid,       // root id
    const Scoring& sc, // scoring scheme
    TAlScore minsc,    // minimum score
    TAlScore maxpen,   // maximum penalty
    size_t descid,     // id of this Descent
    const GFM<index_t>& gfmFw, // forward index
    const GFM<index_t>& gfmBw, // mirror index
    DescentRedundancyChecker& re,    // redundancy checker
    EFactory<Descent<index_t> >& df, // Descent factory
    EFactory<DescentPos>& pf,        // DescentPos factory
    const EList<DescentRoot>& rs,    // roots
    const EList<DescentConfig>& cs,  // configs
    EHeap<TDescentPair>& heap,       // heap
    DescentAlignmentSink<index_t>& alsink, // alignment sink
    DescentMetrics& met, // metrics
    PerReadMetrics& prm) // per-read metrics
{
    rid_ = rid;
    al5pi_ = rs[rid].off5p;
    al5pf_ = rs[rid].off5p;
    assert_lt(al5pi_, q.length());
    assert_lt(al5pf_, q.length());
    l2r_ = rs[rid].l2r;
    topf_ = botf_ = topb_ = botb_ = 0;
    descid_ = descid;
    parent_ = std::numeric_limits<size_t>::max();
    pen_ = 0;
    posid_ = std::numeric_limits<size_t>::max();
    len_ = 0;
    out_.clear();
    edit_.reset();
    lastRecalc_ = true;
    gapadd_ = 0;
    bool branches = false, hitEnd = false, done = false;
    TIndexOffU topf_new = 0, botf_new = 0, topb_new = 0, botb_new = 0;
    off5p_i_ = 0;
    bool matchSucc = followMatches(
        q,
        sc,
        gfmFw,
        gfmBw,
        re,
        df,
        pf,
        rs,
        cs,
        heap,
        alsink,
        met,
        prm,
        branches,
        hitEnd,
        done,
        off5p_i_,
        topf_new,
        botf_new,
        topb_new,
        botb_new);
    bool bounceSucc = false;
    if(matchSucc && hitEnd && !done) {
        assert(topf_new > 0 || botf_new > 0);
        bounceSucc = bounce(
            q,
            topf_new,
            botf_new,
            topb_new,
            botb_new,
            gfmFw,
            gfmBw,
            sc,
            minsc,  // minimum score
            maxpen, // maximum penalty
            re,
            df,
            pf,
            rs,
            cs,
            heap,
            alsink,
            met,  // descent metrics
            prm); // per-read metrics
    }
    // Calculate info about outgoing edges
    assert(empty());
    if(matchSucc) {
        recalcOutgoing(q, sc, minsc, maxpen, re, pf, rs, cs, prm);
        if(!empty()) {
            heap.insert(make_pair(out_.bestPri(), descid)); // Add to heap
        }
    }
    return !empty() || bounceSucc;
}

/**
 * Recalculate our summary of the outgoing edges from this descent. When
 * deciding what outgoing edges are legal, we abide by constraints.
 * Typically, they limit the total of the penalties accumulated so far, as
 * a function of distance from the search root. E.g. a constraint might
 * disallow any gaps or mismatches within 20 ply of the search root, then
 * allow 1 mismatch within 30 ply, then allow up to 1 mismatch or 1 gap
 * within 40 ply, etc.
 *
 * Return the total number of valid outgoing edges found.
 *
 * TODO: Eliminate outgoing gap edges that are redundant with others owing to
 * the DNA sequence and the fact that we don't care to distinguish among
 * "equivalent" homopolymer extensions and retractions.
 */
template <typename index_t>
size_t Descent<index_t>::recalcOutgoing(
    const Read& q,     // query string
    const Scoring& sc, // scoring scheme
    TAlScore minsc,    // minimum score
    TAlScore maxpen,   // maximum penalty
    DescentRedundancyChecker& re,   // redundancy checker
    EFactory<DescentPos>& pf,       // factory with DescentPoss
    const EList<DescentRoot>& rs,   // roots
    const EList<DescentConfig>& cs, // configs
    PerReadMetrics& prm) // per-read metrics
{
    assert_eq(botf_ - topf_, botb_ - topb_);
    assert(out_.empty());
    assert(repOk(&q));
    // Get initial 5' and 3' offsets
    bool fw = rs[rid_].fw;
    float rootpri = rs[rid_].pri;
    bool toward3p = (l2r_ == fw);
    size_t off5p = off5p_i_;
    assert_geq(al5pf_, al5pi_);
    size_t off3p = q.length() - off5p - 1;
    // By "depth" we essentially mean the number of characters already aligned
    size_t depth, extrai = 0, extraf = 0;
    size_t cur5pi = al5pi_, cur5pf = al5pf_;
    if(toward3p) {
        // Toward 3'
        cur5pf = off5p;
        depth = off5p - al5pi_;
        // Failed to match out to the end?
        if(al5pf_ < q.length() - 1) {
            extraf = 1; // extra
        }
    } else {
        // Toward 5'
        cur5pi = off5p;
        depth = al5pf_ - off5p;
        if(al5pi_ > 0) {
            extrai = 1;
        }
    }
    // Get gap penalties
    TScore pen_rdg_ex = sc.readGapExtend(), pen_rfg_ex = sc.refGapExtend();
    TScore pen_rdg_op = sc.readGapOpen(),   pen_rfg_op = sc.refGapOpen();
    // Top and bot in the direction of the descent
    TIndexOffU top = l2r_ ? topb_ : topf_;
    TIndexOffU bot = l2r_ ? botb_ : botf_;
    // Top and bot in the opposite direction
    TIndexOffU topp = l2r_ ? topf_ : topb_;
    TIndexOffU botp = l2r_ ? botf_ : botb_;
    assert_eq(botp - topp, bot - top);
    DescentEdge edge;
    size_t nout = 0;
    // Enumerate all outgoing edges, starting at the root and going out
    size_t d = posid_;
    // At first glance, we might think we should be bounded by al5pi_ and
    // al5pf_, but those delimit the positions that matched between reference
    // and read. If we hit a position that failed to match as part of
    // followMatches, then we also want to evaluate ways of leaving that
    // position, which adds one more position to visit.
    while(off5p >= al5pi_ - extrai && off5p <= al5pf_ + extraf) {
        assert_lt(off5p, q.length());
        assert_lt(off3p, q.length());
        TScore maxpend = cs[rid_].cons.get(depth, q.length(), maxpen);
        assert(depth > 0 || maxpend == 0);
        assert_geq(maxpend, pen_); // can't have already exceeded max penalty
        TScore diff = maxpend - pen_; // room we have left
        // Get pointer to SA ranges in the direction of descent
        const TIndexOffU *t  = l2r_ ? pf[d].topb : pf[d].topf;
        const TIndexOffU *b  = l2r_ ? pf[d].botb : pf[d].botf;
        const TIndexOffU *tp = l2r_ ? pf[d].topf : pf[d].topb;
        const TIndexOffU *bp = l2r_ ? pf[d].botf : pf[d].botb;
        assert_eq(pf[d].botf - pf[d].topf, pf[d].botb - pf[d].topb);
        // What are the read char / quality?
        std::pair<int, int> p = q.get(off5p, fw);
        int c = p.first;
        assert_range(0, 4, c);
        // Only entertain edits if there is at least one type of edit left and
        // there is some penalty budget left
        if(!pf[d].flags.exhausted() && diff > 0) {
            // What would the penalty be if we mismatched at this position?
            // This includes the case where the mismatch is for an N in the
            // read.
            int qq = p.second;
            assert_geq(qq, 0);
            TScore pen_mm = sc.mm(c, qq);
            if(pen_mm <= diff) {
                for(int j = 0; j < 4; j++) {
                    if(j == c) continue; // Match, not mismatch
                    if(b[j] <= t[j]) {
                        continue; // No outgoing edge with this nucleotide
                    }
                    if(!pf[d].flags.mmExplore(j)) {
                        continue; // Already been explored
                    }
                    TIndexOffU topf = pf[d].topf[j], botf = pf[d].botf[j];
                    ASSERT_ONLY(TIndexOffU topb = pf[d].topb[j], botb = pf[d].botb[j]);
                    if(re.contains(fw, l2r_, cur5pi, cur5pf, cur5pf - cur5pi + 1 + gapadd_, topf, botf, pen_ + pen_mm)) {
                        prm.nRedSkip++;
                        continue; // Redundant with a path already explored
                    }
                    prm.nRedFail++;
                    TIndexOffU width = b[j] - t[j];
                    Edit edit((uint32_t)off5p, (int)("ACGTN"[j]), (int)("ACGTN"[c]), EDIT_TYPE_MM);
                    DescentPriority pri(pen_ + pen_mm, depth, width, rootpri);
                    assert(topf != 0 || botf != 0);
                    assert(topb != 0 || botb != 0);
                    assert_eq(botb - topb, botf - topf);
                    edge.init(edit, off5p, pri, d
#ifndef NDEBUG
                              , d, topf, botf, topb, botb
#endif
                              );
                    out_.update(edge);
                    nout++;
                }
            }
            bool gapsAllowed = (off5p >= (size_t)sc.gapbar && off3p >= (size_t)sc.gapbar);
            if(gapsAllowed) {
                assert_gt(depth, 0);
                // An easy redundancy check is: if all ways of proceeding are
                // matches, then there's no need to entertain gaps here.
                // Shifting the gap one position further downstream is
                // guaranteed not to be worse.
                size_t totwidth = (b[0] - t[0]) +
                                  (b[1] - t[1]) +
                                  (b[2] - t[2]) +
                                  (b[3] - t[3]);
                assert(c > 3 || b[c] - t[c] <= totwidth);
                bool allmatch = c < 4 && (totwidth == (b[c] - t[c]));
                bool rdex = false, rfex = false;
                size_t cur5pi_i = cur5pi, cur5pf_i = cur5pf;
                if(toward3p) {
                    cur5pf_i--;
                } else {
                    cur5pi_i++;
                }
                if(off5p == off5p_i_ && edit_.inited()) {
                    // If we're at the root of the descent, and the descent
                    // branched on a gap, then this could be scored as an
                    // extension of that gap.
                    if(pen_rdg_ex <= diff && edit_.isReadGap()) {
                        // Extension of a read gap
                        rdex = true;
                        for(int j = 0; j < 4; j++) {
                            if(b[j] <= t[j]) {
                                continue; // No outgoing edge with this nucleotide
                            }
                            if(!pf[d].flags.rdgExplore(j)) {
                                continue; // Already been explored
                            }
                            TIndexOffU topf = pf[d].topf[j], botf = pf[d].botf[j];
                            ASSERT_ONLY(TIndexOffU topb = pf[d].topb[j], botb = pf[d].botb[j]);
                            assert(topf != 0 || botf != 0);
                            assert(topb != 0 || botb != 0);
                            if(re.contains(fw, l2r_, cur5pi_i, cur5pf_i, cur5pf - cur5pi + 1 + gapadd_, topf, botf, pen_ + pen_rdg_ex)) {
                                prm.nRedSkip++;
                                continue; // Redundant with a path already explored
                            }
                            prm.nRedFail++;
                            TIndexOffU width = b[j] - t[j];
                            // off5p holds the offset from the 5' of the next
                            // character we were trying to align when we decided to
                            // introduce a read gap (before that character). If we
                            // were walking toward the 5' end, we need to increment
                            // by 1.
                            uint32_t off = (uint32_t)off5p + (toward3p ? 0 : 1);
                            Edit edit(off, (int)("ACGTN"[j]), '-', EDIT_TYPE_READ_GAP);
                            assert(edit.pos2 != std::numeric_limits<uint32_t>::max());
                            edit.pos2 = edit_.pos2 + (toward3p ? 1 : -1);
                            DescentPriority pri(pen_ + pen_rdg_ex, depth, width, rootpri);
                            assert(topf != 0 || botf != 0);
                            assert(topb != 0 || botb != 0);
                            assert_eq(botb - topb, botf - topf);
                            edge.init(edit, off5p, pri, d
#ifndef NDEBUG
                                      , d,
                                      topf, botf, topb, botb
#endif
                                      );
                            out_.update(edge);
                            nout++;
                        }
                    }
                    if(pen_rfg_ex <= diff && edit_.isRefGap()) {
                        // Extension of a reference gap
                        rfex = true;
                        if(pf[d].flags.rfgExplore()) {
                            TIndexOffU topf = l2r_ ? topp : top;
                            TIndexOffU botf = l2r_ ? botp : bot;
                            ASSERT_ONLY(TIndexOffU topb = l2r_ ? top : topp);
                            ASSERT_ONLY(TIndexOffU botb = l2r_ ? bot : botp);
                            assert(topf != 0 || botf != 0);
                            assert(topb != 0 || botb != 0);
                            size_t nrefal = cur5pf - cur5pi + gapadd_;
                            if(!re.contains(fw, l2r_, cur5pi, cur5pf, nrefal, topf, botf, pen_ + pen_rfg_ex)) {
                                TIndexOffU width = bot - top;
                                Edit edit((uint32_t)off5p, '-', (int)("ACGTN"[c]), EDIT_TYPE_REF_GAP);
                                DescentPriority pri(pen_ + pen_rfg_ex, depth, width, rootpri);
                                assert(topf != 0 || botf != 0);
                                assert(topb != 0 || botb != 0);
                                edge.init(edit, off5p, pri, d
#ifndef NDEBUG
                                          // It's a little unclear what the depth ought to be.
                                          // Is it the depth we were at when we did the ref
                                          // gap? I.e. the depth of the flags where rfgExplore()
                                          // returned true? Or is it the depth where we can
                                          // retrieve the appropriate top/bot? We make it the
                                          // latter, which might wrap around, indicating we
                                          // should get top/bot from the descent's topf_, ...
                                          // fields.
                                          , (d == posid_) ? std::numeric_limits<size_t>::max() : (d - 1),
                                          topf, botf, topb, botb
#endif
                                          );
                                out_.update(edge);
                                nout++;
                                prm.nRedFail++;
                            } else {
                                prm.nRedSkip++;
                            }
                        }
                    }
                }
                if(!allmatch && pen_rdg_op <= diff && !rdex) {
                    // Opening a new read gap
                    for(int j = 0; j < 4; j++) {
                        if(b[j] <= t[j]) {
                            continue; // No outgoing edge with this nucleotide
                        }
                        if(!pf[d].flags.rdgExplore(j)) {
                            continue; // Already been explored
                        }
                        TIndexOffU topf = pf[d].topf[j], botf = pf[d].botf[j];
                        ASSERT_ONLY(TIndexOffU topb = pf[d].topb[j], botb = pf[d].botb[j]);
                        assert(topf != 0 || botf != 0);
                        assert(topb != 0 || botb != 0);
                        if(re.contains(fw, l2r_, cur5pi_i, cur5pf_i, cur5pf - cur5pi + 1 + gapadd_, topf, botf, pen_ + pen_rdg_op)) {
                            prm.nRedSkip++;
                            continue; // Redundant with a path already explored
                        }
                        prm.nRedFail++;
                        TIndexOffU width = b[j] - t[j];
                        // off5p holds the offset from the 5' of the next
                        // character we were trying to align when we decided to
                        // introduce a read gap (before that character). If we
                        // were walking toward the 5' end, we need to increment
                        // by 1.
                        uint32_t off = (uint32_t)off5p + (toward3p ? 0 : 1);
                        Edit edit(off, (int)("ACGTN"[j]), '-', EDIT_TYPE_READ_GAP);
                        assert(edit.pos2 != std::numeric_limits<uint32_t>::max());
                        DescentPriority pri(pen_ + pen_rdg_op, depth, width, rootpri);
                        assert(topf != 0 || botf != 0);
                        assert(topb != 0 || botb != 0);
                        assert_eq(botb - topb, botf - topf);
                        edge.init(edit, off5p, pri, d
#ifndef NDEBUG
                                  , d, topf, botf, topb, botb
#endif
                                  );
                        out_.update(edge);
                        nout++;
                    }
                }
                if(!allmatch && pen_rfg_op <= diff && !rfex) {
                    // Opening a new reference gap
                    if(pf[d].flags.rfgExplore()) {
                        TIndexOffU topf = l2r_ ? topp : top;
                        TIndexOffU botf = l2r_ ? botp : bot;
                        ASSERT_ONLY(TIndexOffU topb = l2r_ ? top : topp);
                        ASSERT_ONLY(TIndexOffU botb = l2r_ ? bot : botp);
                        assert(topf != 0 || botf != 0);
                        assert(topb != 0 || botb != 0);
                        size_t nrefal = cur5pf - cur5pi + gapadd_;
                        if(!re.contains(fw, l2r_, cur5pi, cur5pf, nrefal, topf, botf, pen_ + pen_rfg_op)) {
                            TIndexOffU width = bot - top;
                            Edit edit((uint32_t)off5p, '-', (int)("ACGTN"[c]), EDIT_TYPE_REF_GAP);
                            DescentPriority pri(pen_ + pen_rfg_op, depth, width, rootpri);
                            assert(topf != 0 || botf != 0);
                            assert(topb != 0 || botb != 0);
                            edge.init(edit, off5p, pri, d
#ifndef NDEBUG
                                      // It's a little unclear what the depth ought to be.
                                      // Is it the depth we were at when we did the ref
                                      // gap? I.e. the depth of the flags where rfgExplore()
                                      // returned true? Or is it the depth where we can
                                      // retrieve the appropriate top/bot? We make it the
                                      // latter, which might wrap around, indicating we
                                      // should get top/bot from the descent's topf_, ...
                                      // fields.
                                      , (d == posid_) ? std::numeric_limits<size_t>::max() : (d - 1),
                                      topf, botf, topb, botb
#endif
                                      );
                            out_.update(edge);
                            nout++;
                            prm.nRedFail++;
                        } else {
                            prm.nRedSkip++;
                        }
                    }
                }
            }
        }
        // Update off5p, off3p, depth
        d++;
        depth++;
        assert_leq(depth, al5pf_ - al5pi_ + 2);
        if(toward3p) {
            if(off3p == 0) {
                break;
            }
            off5p++;
            off3p--;
            cur5pf++;
        } else {
            if(off5p == 0) {
                break;
            }
            off3p++;
            off5p--;
            cur5pi--;
        }
        // Update top and bot
        if(off5p >= al5pi_ - extrai && off5p <= al5pf_ + extraf) {
            assert_range(0, 3, c);
            top = t[c]; topp = tp[c];
            bot = b[c]; botp = bp[c];
            assert_eq(bot-top, botp-topp);
        }
    }
    lastRecalc_ = (nout <= 5);
    out_.best1.updateFlags(pf);
    out_.best2.updateFlags(pf);
    out_.best3.updateFlags(pf);
    out_.best4.updateFlags(pf);
    out_.best5.updateFlags(pf);
    return nout;
}
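
/*
 * Illustrative sketch of the constraint schedule described in the comment
 * above recalcOutgoing() (hypothetical values; cs[rid].cons.get(depth,
 * rdlen, maxpen) is assumed to implement such a mapping from depth to a
 * maximum allowed penalty):
 *
 *   depth (ply):  0-19       20-29        30-39
 *   allowance:    no edits   1 mismatch   1 mismatch or 1 gap
 *
 * Inside the loop, diff = maxpend - pen_ is the remaining budget; an edge is
 * admitted only while its penalty fits within that budget.
 */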

/**
 * Print this partial alignment as three lines: read, match bars, and
 * reference. Also reconstructs the aligned reference string into 'rf'.
 */
template <typename index_t>
void Descent<index_t>::print(
    std::ostream *os,
    const char *prefix,
    const Read& q,
    size_t trimLf,
    size_t trimRg,
    bool fw,
    const EList<Edit>& edits,
    size_t ei,
    size_t en,
    BTDnaString& rf) const
{
    const BTDnaString& read = fw ? q.patFw : q.patRc;
    size_t eidx = ei;
    if(os != NULL) { *os << prefix; }
    // Print read
    for(size_t i = 0; i < read.length(); i++) {
        if(i < trimLf || i >= read.length() - trimRg) {
            if(os != NULL) { *os << (char)tolower(read.toChar(i)); }
            continue;
        }
        bool del = false, mm = false;
        while(eidx < ei + en && edits[eidx].pos == i) {
            if(edits[eidx].isReadGap()) {
                if(os != NULL) { *os << '-'; }
            } else if(edits[eidx].isRefGap()) {
                del = true;
                assert_eq((int)edits[eidx].qchr, read.toChar(i));
                if(os != NULL) { *os << read.toChar(i); }
            } else {
                mm = true;
                assert(edits[eidx].isMismatch());
                assert_eq((int)edits[eidx].qchr, read.toChar(i));
                if(os != NULL) { *os << (char)edits[eidx].qchr; }
            }
            eidx++;
        }
        if(!del && !mm) {
            // Print read character
            if(os != NULL) { *os << read.toChar(i); }
        }
    }
    if(os != NULL) {
        *os << endl;
        *os << prefix;
    }
    eidx = ei;
    // Print match bars
    for(size_t i = 0; i < read.length(); i++) {
        if(i < trimLf || i >= read.length() - trimRg) {
            if(os != NULL) { *os << ' '; }
            continue;
        }
        bool del = false, mm = false;
        while(eidx < ei + en && edits[eidx].pos == i) {
            if(edits[eidx].isReadGap()) {
                if(os != NULL) { *os << ' '; }
            } else if(edits[eidx].isRefGap()) {
                del = true;
                if(os != NULL) { *os << ' '; }
            } else {
                mm = true;
                assert(edits[eidx].isMismatch());
                if(os != NULL) { *os << ' '; }
            }
            eidx++;
        }
        if(!del && !mm && os != NULL) { *os << '|'; }
    }
    if(os != NULL) {
        *os << endl;
        *os << prefix;
    }
    eidx = ei;
    // Print reference
    for(size_t i = 0; i < read.length(); i++) {
        if(i < trimLf || i >= read.length() - trimRg) {
            if(os != NULL) { *os << ' '; }
            continue;
        }
        bool del = false, mm = false;
        while(eidx < ei + en && edits[eidx].pos == i) {
            if(edits[eidx].isReadGap()) {
                rf.appendChar((char)edits[eidx].chr);
                if(os != NULL) { *os << (char)edits[eidx].chr; }
            } else if(edits[eidx].isRefGap()) {
                del = true;
                if(os != NULL) { *os << '-'; }
            } else {
                mm = true;
                assert(edits[eidx].isMismatch());
                rf.appendChar((char)edits[eidx].chr);
                if(os != NULL) { *os << (char)edits[eidx].chr; }
            }
            eidx++;
        }
        if(!del && !mm) {
            rf.append(read[i]);
            if(os != NULL) { *os << read.toChar(i); }
        }
    }
    if(os != NULL) { *os << endl; }
}

/**
 * Create a new Descent that proceeds in the direction opposite to this one.
 * Used when a descent hits one extreme of the read ("bounces") before the
 * whole read has been aligned.
 */
template <typename index_t>
bool Descent<index_t>::bounce(
    const Read& q,   // query string
    TIndexOffU topf, // SA range top in fw index
    TIndexOffU botf, // SA range bottom in fw index
    TIndexOffU topb, // SA range top in bw index
    TIndexOffU botb, // SA range bottom in bw index
    const GFM<index_t>& gfmFw, // forward index
    const GFM<index_t>& gfmBw, // mirror index
    const Scoring& sc, // scoring scheme
    TAlScore minsc,    // minimum score
    TAlScore maxpen,   // maximum penalty
    DescentRedundancyChecker& re,    // redundancy checker
    EFactory<Descent<index_t> >& df, // factory with Descent
    EFactory<DescentPos>& pf,        // factory with DescentPoss
    const EList<DescentRoot>& rs,    // roots
    const EList<DescentConfig>& cs,  // configs
    EHeap<TDescentPair>& heap,       // heap of descents
    DescentAlignmentSink<index_t>& alsink, // alignment sink
    DescentMetrics& met, // metrics
    PerReadMetrics& prm) // per-read metrics
{
    assert_gt(botf, topf);
    assert(al5pi_ == 0 || al5pf_ == q.length()-1);
    assert(!(al5pi_ == 0 && al5pf_ == q.length()-1));
    size_t dfsz = df.size();
    size_t pfsz = pf.size();
    TDescentId id = df.alloc();
    Edit e_null;
    assert(!e_null.inited());
    // Follow matches
    bool succ = df[id].init(
        q,       // query
        rid_,    // root id
        sc,      // scoring scheme
        minsc,   // minimum score
        maxpen,  // maximum penalty
        al5pi_,  // new near-5' extreme
        al5pf_,  // new far-5' extreme
        topf,    // SA range top in FW index
        botf,    // SA range bottom in FW index
        topb,    // SA range top in BW index
        botb,    // SA range bottom in BW index
        !l2r_,   // direction this descent will go in; opposite from parent
        id,      // my ID
        descid_, // parent ID
        pen_,    // total penalties so far - same as parent
        e_null,  // edit for incoming edge; uninitialized if bounced
        gfmFw,   // forward index
        gfmBw,   // mirror index
        re,      // redundancy checker
        df,      // Descent factory
        pf,      // DescentPos factory
        rs,      // DescentRoot list
        cs,      // DescentConfig list
        heap,    // heap
        alsink,  // alignment sink
        met,     // metrics
        prm);    // per-read metrics
    if(!succ) {
        // Reclaim memory we had used for this descent and its DescentPos info
        df.resize(dfsz);
        pf.resize(pfsz);
    }
    return succ;
}

/**
 * Take the best outgoing edge and place it in the heap. When deciding what
 * outgoing edges exist, abide by constraints in DescentConfig. These
 * constraints limit total penalty accumulated so far versus distance from
 * search root. E.g. a constraint might disallow any gaps or mismatches within
 * 20 ply of the root, then allow 1 mismatch within 30 ply, 1 mismatch or 1 gap
 * within 40 ply, etc.
 */
template <typename index_t>
void Descent<index_t>::followBestOutgoing(
    const Read& q,             // query string
    const GFM<index_t>& gfmFw, // forward index
    const GFM<index_t>& gfmBw, // mirror index
    const Scoring& sc, // scoring scheme
    TAlScore minsc,    // minimum score
    TAlScore maxpen,   // maximum penalty
    DescentRedundancyChecker& re,    // redundancy checker
    EFactory<Descent<index_t> >& df, // factory with Descent
    EFactory<DescentPos>& pf,        // factory with DescentPoss
    const EList<DescentRoot>& rs,    // roots
    const EList<DescentConfig>& cs,  // configs
    EHeap<TDescentPair>& heap,       // heap of descents
    DescentAlignmentSink<index_t>& alsink, // alignment sink
    DescentMetrics& met, // metrics
    PerReadMetrics& prm) // per-read metrics
{
    // We assume this descent has been popped off the heap. We'll re-add it if
    // it hasn't been exhausted.
    assert(q.repOk());
    assert(!empty());
    assert(!out_.empty());
    while(!out_.empty()) {
        DescentPriority best = out_.bestPri();
        DescentEdge e = out_.rotate();
        TReadOff al5pi_new = al5pi_, al5pf_new = al5pf_;
        bool fw = rs[rid_].fw;
        bool toward3p = (l2r_ == fw);
        TReadOff edoff = e.off5p; // 5' offset of edit
        assert_leq(edoff, al5pf_ + 1);
        assert_geq(edoff + 1, al5pi_);
        if(out_.empty()) {
            if(!lastRecalc_) {
                // This might allocate new Descents
                recalcOutgoing(q, sc, minsc, maxpen, re, pf, rs, cs, prm);
                if(empty()) {
                    // Could happen, since some outgoing edges may have become
                    // redundant in the meantime.
                    break;
                }
            } else {
                assert(empty());
            }
        }
        TReadOff doff; // edit's offset into this descent
        int chr = asc2dna[e.e.chr];
        // hitEnd is set to true iff this edit pushes us to the extreme 5' or 3'
        // end of the alignment
        bool hitEnd = false;
        // done is set to true iff this edit aligns the only remaining character of
        // the read
        bool done = false;
        if(toward3p) {
            // The 3' extreme of the new Descent is further in (away from the 3'
            // end) than the parent's.
            al5pf_new = doff = edoff;
            if(e.e.isReadGap()) {
                // We didn't actually consume the read character at 'edoff', so
                // retract al5pf_new by one position. This doesn't affect the
                // "depth" (doff) of the SA range we took, though.
                assert_gt(al5pf_new, 0);
                al5pf_new--;
            }
            assert_lt(al5pf_new, q.length());
            hitEnd = (al5pf_new == q.length() - 1);
            done = (hitEnd && al5pi_new == 0);
            assert_geq(doff, off5p_i_);
            doff = doff - off5p_i_;
            assert_leq(doff, len_);
        } else {
            // The 5' extreme of the new Descent is further in (away from the 5'
            // end) than the parent's.
            al5pi_new = doff = edoff;
            if(e.e.isReadGap()) {
                // We didn't actually consume the read character at 'edoff', so
                // move al5pi_new closer to the 3' end by one position. This
                // doesn't affect the "depth" (doff) of the SA range we took,
                // though.
                al5pi_new++;
            }
            hitEnd = (al5pi_new == 0);
            done = (hitEnd && al5pf_new == q.length() - 1);
            assert_geq(off5p_i_, doff);
            doff = off5p_i_ - doff;
            assert_leq(doff, len_);
        }
        // Check if this is redundant with an already-explored path
        bool l2r = l2r_; // gets overridden if we bounce
        if(!done && hitEnd) {
            // Alignment finished extending in one direction
            l2r = !l2r;
        }
        size_t dfsz = df.size();
        size_t pfsz = pf.size();
        TIndexOffU topf, botf, topb, botb;
        size_t d = posid_ + doff;
        if(e.e.isRefGap()) {
            d--; // might underflow
            if(doff == 0) {
                topf = topf_;
                botf = botf_;
                topb = topb_;
                botb = botb_;
                d = std::numeric_limits<size_t>::max();
                assert_eq(botf-topf, botb-topb);
            } else {
                assert_gt(al5pf_new, 0);
                assert_gt(d, 0);
                chr = pf[d].c;
                assert(pf[d].inited());
                assert_range(0, 3, chr);
                topf = pf[d].topf[chr];
                botf = pf[d].botf[chr];
                topb = pf[d].topb[chr];
                botb = pf[d].botb[chr];
                assert_eq(botf-topf, botb-topb);
            }
        } else {
            // A read gap or a mismatch
            assert(pf[d].inited());
            topf = pf[d].topf[chr];
            botf = pf[d].botf[chr];
            topb = pf[d].topb[chr];
            botb = pf[d].botb[chr];
            assert_eq(botf-topf, botb-topb);
        }
        assert_eq(d, e.d);
        assert_eq(topf, e.topf);
        assert_eq(botf, e.botf);
        assert_eq(topb, e.topb);
        assert_eq(botb, e.botb);
        if(done) {
            // Aligned the entire read end-to-end. Presumably there's no need to
            // create a new Descent object. We just report the alignment.
            alsink.reportAlignment(
                q,        // query
                gfmFw,    // forward index
                gfmBw,    // backward index
                topf,     // top of SA range in forward index
                botf,     // bottom of SA range in forward index
                topb,     // top of SA range in backward index
                botb,     // bottom of SA range in backward index
                descid_,  // Descent at the leaf
                rid_,     // root id
                e.e,      // extra edit, if necessary
                best.pen, // penalty
                df,       // factory with Descent
                pf,       // factory with DescentPoss
                rs,       // roots
                cs);      // configs
            assert(alsink.repOk());
            return;
        }
        assert(al5pi_new != 0 || al5pf_new != q.length() - 1);
        TDescentId id = df.alloc();
        bool succ = df[id].init(
            q,         // query
            rid_,      // root id
            sc,        // scoring scheme
            minsc,     // minimum score
            maxpen,    // maximum penalty
            al5pi_new, // new near-5' extreme
            al5pf_new, // new far-5' extreme
            topf,      // SA range top in FW index
            botf,      // SA range bottom in FW index
            topb,      // SA range top in BW index
            botb,      // SA range bottom in BW index
            l2r,       // direction this descent will go in
            id,        // my ID
            descid_,   // parent ID
            best.pen,  // total penalties so far
            e.e,       // edit for incoming edge; uninitialized if bounced
            gfmFw,     // forward index
            gfmBw,     // mirror index
            re,        // redundancy checker
            df,        // Descent factory
            pf,        // DescentPos factory
            rs,        // DescentRoot list
            cs,        // DescentConfig list
            heap,      // heap
            alsink,    // alignment sink
            met,       // metrics
            prm);      // per-read metrics
        if(!succ) {
            // Reclaim memory we had used for this descent and its DescentPos info
            df.resize(dfsz);
            pf.resize(pfsz);
        }
        break;
    }
    if(!empty()) {
        // Re-insert this Descent with its new priority
        heap.insert(make_pair(out_.bestPri(), descid_));
    }
}

/**
 * Given the forward and backward indexes, and given topf/botf/topb/botb, get
 * tloc, bloc ready for the next step.
 */
template <typename index_t>
void Descent<index_t>::nextLocsBi(
    const GFM<index_t>& gfmFw, // forward index
    const GFM<index_t>& gfmBw, // mirror index
    SideLocus<index_t>& tloc,  // top locus
    SideLocus<index_t>& bloc,  // bot locus
    index_t topf, // top in BWT
    index_t botf, // bot in BWT
    index_t topb, // top in BWT'
    index_t botb) // bot in BWT'
{
    assert_gt(botf, 0);
    // Which direction are we going in next?
    if(l2r_) {
        // Left to right; use BWT'
        if(botb - topb == 1) {
            // Already down to 1 row; just init top locus
            tloc.initFromRow(topb, gfmBw.gh(), gfmBw.gfm());
            bloc.invalidate();
        } else {
            SideLocus<index_t>::initFromTopBot(
                topb, botb, gfmBw.gh(), gfmBw.gfm(), tloc, bloc);
            assert(bloc.valid());
        }
    } else {
        // Right to left; use BWT
        if(botf - topf == 1) {
            // Already down to 1 row; just init top locus
            tloc.initFromRow(topf, gfmFw.gh(), gfmFw.gfm());
            bloc.invalidate();
        } else {
            SideLocus<index_t>::initFromTopBot(
                topf, botf, gfmFw.gh(), gfmFw.gfm(), tloc, bloc);
            assert(bloc.valid());
        }
    }
    // Check if we should update the tracker with this refinement
    assert(botf - topf == 1 || bloc.valid());
    assert(botf - topf > 1 || !bloc.valid());
}
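
/*
 * Design note on nextLocsBi() above: when the SA range has narrowed to a
 * single row (bot - top == 1), only the top locus is initialized and the
 * bottom locus is invalidated, so subsequent steps can take a cheaper
 * single-row path instead of performing two rank operations per step.
 */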
|
|
|
|
/**
 * Advance this descent by following read matches as far as possible.
 *
 * This routine doesn't have to consider the whole gamut of constraints on
 * which outgoing edges can be followed. If it is a root descent, it does have
 * to know how deep the no-edit constraint goes, though, so we can decide
 * whether using the ftab would potentially jump over relevant branch points.
 * Apart from that, though, we simply proceed as far as we can by matching
 * characters in the query, irrespective of the constraints.
 * recalcOutgoing(...) and followBestOutgoing(...) do have to consider these
 * constraints, though.
 *
 * Conceptually, as we make descending steps, we have:
 * 1. Before each step, a single range indicating how we departed the previous
 *    step
 * 2. As part of each step, a quad of ranges indicating what range would
 *    result if we proceeded on an A, C, G or T
 * (See the illustrative sketch following this function.)
 *
 * Return true iff it is possible to branch from this descent, i.e. iff we
 * haven't exceeded the no-branch depth.
 */
template <typename index_t>
bool Descent<index_t>::followMatches(
	const Read& q,                     // query string
	const Scoring& sc,                 // scoring scheme
	const GFM<index_t>& gfmFw,         // forward index
	const GFM<index_t>& gfmBw,         // mirror index
	DescentRedundancyChecker& re,      // redundancy checker
	EFactory<Descent<index_t> >& df,   // Descent factory
	EFactory<DescentPos>& pf,          // DescentPos factory
	const EList<DescentRoot>& rs,      // roots
	const EList<DescentConfig>& cs,    // configs
	EHeap<TDescentPair>& heap,         // heap
	DescentAlignmentSink<index_t>& alsink, // alignment sink
	DescentMetrics& met,               // metrics
	PerReadMetrics& prm,               // per-read metrics
	bool& branches,                    // out: true -> there are > 0 ways to branch
	bool& hitEnd,                      // out: true -> hit read end with non-empty range
	bool& done,                        // out: true -> we made a full alignment
	TReadOff& off5p_i,                 // out: initial 5' offset
	TIndexOffU& topf_bounce,           // out: top of SA range for fw idx for bounce
	TIndexOffU& botf_bounce,           // out: bot of SA range for fw idx for bounce
	TIndexOffU& topb_bounce,           // out: top of SA range for bw idx for bounce
	TIndexOffU& botb_bounce)           // out: bot of SA range for bw idx for bounce
{
	// TODO: make these full-fledged parameters
	size_t nobranchDepth = 20;
	bool stopOnN = true;
	assert(q.repOk());
	assert(repOk(&q));
	assert_eq(gfmFw.gh().ftabChars(), gfmBw.gh().ftabChars());
#ifndef NDEBUG
	for(int i = 0; i < 4; i++) {
		assert_eq(gfmFw.fchr()[i], gfmBw.fchr()[i]);
	}
#endif
	SideLocus<index_t> tloc, bloc;
	TIndexOffU topf = topf_, botf = botf_, topb = topb_, botb = botb_;
	bool fw = rs[rid_].fw;
	bool toward3p;
	size_t off5p;
	assert_lt(al5pi_, q.length());
	assert_lt(al5pf_, q.length());
	while(true) {
		toward3p = (l2r_ == fw);
		assert_geq(al5pf_, al5pi_);
		assert(al5pi_ != 0 || al5pf_ != q.length() - 1);
		if(toward3p) {
			if(al5pf_ == q.length()-1) {
				l2r_ = !l2r_;
				continue;
			}
			if(al5pi_ == al5pf_ && root()) {
				off5p = off5p_i = al5pi_;
			} else {
				off5p = off5p_i = (al5pf_ + 1);
			}
		} else {
			if(al5pi_ == 0) {
				l2r_ = !l2r_;
				continue;
			}
			assert_gt(al5pi_, 0);
			if(al5pi_ == al5pf_ && root()) {
				off5p = off5p_i = al5pi_;
			} else {
				off5p = off5p_i = (al5pi_ - 1);
			}
		}
		break;
	}
	size_t off3p = q.length() - off5p - 1;
	assert_lt(off5p, q.length());
	assert_lt(off3p, q.length());
	bool firstPos = true;
	assert_eq(0, len_);

	// Number of times pf.alloc() is called. So we can sanity check it.
	size_t nalloc = 0;
	// Set to true as soon as we encounter a branch point along this descent.
	branches = false;
	// hitEnd is set to true iff this edit pushes us to the extreme 5' or 3'
	// end of the alignment
	hitEnd = false;
	// done is set to true iff this edit aligns the only remaining character of
	// the read
	done = false;
	if(root()) {
		assert_eq(al5pi_, al5pf_);
		// Check whether/how far we can jump using ftab
		int ftabLen = gfmFw.gh().ftabChars();
		bool ftabFits = true;
		if(toward3p && ftabLen + off5p > q.length()) {
			ftabFits = false;
		} else if(!toward3p && off5p < (size_t)ftabLen) {
			ftabFits = false;
		}
		bool useFtab = ftabLen > 1 && (size_t)ftabLen <= nobranchDepth && ftabFits;
		bool ftabFailed = false;
		if(useFtab) {
			prm.nFtabs++;
			// Forward index: right-to-left
			size_t off_r2l = fw ? off5p : q.length() - off5p - 1;
			if(l2r_) {
				// going left-to-right: offset already points at leftmost char
			} else {
				assert_geq((int)off_r2l, ftabLen - 1);
				off_r2l -= (ftabLen - 1);
			}
			bool ret = gfmFw.ftabLoHi(fw ? q.patFw : q.patRc, off_r2l,
			                          false, // reverse
			                          topf, botf);
			if(!ret) {
				// Encountered an N or something else that made it impossible
				// to use the ftab
				ftabFailed = true;
			} else {
				if(botf - topf == 0) {
					return false;
				}
				int c_r2l = fw ? q.patFw[off_r2l] : q.patRc[off_r2l];
				// Backward index: left-to-right
				size_t off_l2r = fw ? off5p : q.length() - off5p - 1;
				if(l2r_) {
					// going left-to-right: offset already points at leftmost char
				} else {
					assert_geq((int)off_l2r, ftabLen - 1);
					off_l2r -= (ftabLen - 1);
				}
				ASSERT_ONLY(bool ret2 = )
				gfmBw.ftabLoHi(fw ? q.patFw : q.patRc, off_l2r,
				               false, // don't reverse
				               topb, botb);
				assert(ret == ret2);
				int c_l2r = fw ? q.patFw[off_l2r + ftabLen - 1] :
				                 q.patRc[off_l2r + ftabLen - 1];
				assert_eq(botf - topf, botb - topb);
				if(toward3p) {
					assert_geq((int)off3p, ftabLen - 1);
					off5p += ftabLen; off3p -= ftabLen;
				} else {
					assert_geq((int)off5p, ftabLen - 1);
					off5p -= ftabLen; off3p += ftabLen;
				}
				len_ += ftabLen;
				if(toward3p) {
					// By convention, al5pf_ and al5pi_ start out equal, so we only
					// advance al5pf_ by ftabLen - 1 (not ftabLen)
					al5pf_ += (ftabLen - 1); // -1 accounts for inclusive al5pf_
					if(al5pf_ == q.length() - 1) {
						hitEnd = true;
						done = (al5pi_ == 0);
					}
				} else {
					// By convention, al5pf_ and al5pi_ start out equal, so we only
					// advance al5pi_ by ftabLen - 1 (not ftabLen)
					al5pi_ -= (ftabLen - 1);
					if(al5pi_ == 0) {
						hitEnd = true;
						done = (al5pf_ == q.length()-1);
					}
				}
				// Allocate DescentPos data structures and leave them empty. We
				// jumped over them by doing our lookup in the ftab, so we have no
				// info about outgoing edges from them, besides the matching
				// outgoing edge from the last pos which is in topf/botf and
				// topb/botb.
				size_t id = 0;
				if(firstPos) {
					posid_ = pf.alloc();
					pf[posid_].reset();
					firstPos = false;
					for(int i = 1; i < ftabLen; i++) {
						id = pf.alloc();
						pf[id].reset();
					}
				} else {
					for(int i = 0; i < ftabLen; i++) {
						id = pf.alloc();
						pf[id].reset();
					}
				}
				assert_eq(botf-topf, botb-topb);
				pf[id].c = l2r_ ? c_l2r : c_r2l;
				pf[id].topf[l2r_ ? c_l2r : c_r2l] = topf;
				pf[id].botf[l2r_ ? c_l2r : c_r2l] = botf;
				pf[id].topb[l2r_ ? c_l2r : c_r2l] = topb;
				pf[id].botb[l2r_ ? c_l2r : c_r2l] = botb;
				assert(pf[id].inited());
				nalloc += ftabLen;
			}
		}
		if(!useFtab || ftabFailed) {
			// Can't use ftab, use fchr instead
			int rdc = q.getc(off5p, fw);
			// If rdc is N, that's pretty bad! That means we placed a root
			// right on an N. Since there is no sensible range to descend
			// into, give up on this descent.
			if(rdc > 3) {
				return false;
			}
			assert_range(0, 3, rdc);
			topf = topb = gfmFw.fchr()[rdc];
			botf = botb = gfmFw.fchr()[rdc+1];
			if(botf - topf == 0) {
				return false;
			}
			if(toward3p) {
				off5p++; off3p--;
			} else {
				off5p--; off3p++;
			}
			len_++;
			if(toward3p) {
				if(al5pf_ == q.length()-1) {
					hitEnd = true;
					done = (al5pi_ == 0);
				}
			} else {
				if(al5pi_ == 0) {
					hitEnd = true;
					done = (al5pf_ == q.length()-1);
				}
			}
			// Allocate DescentPos data structure. We could fill it with the
			// four ranges from fchr if we wanted to, but that will never be
			// relevant.
			size_t id = 0;
			if(firstPos) {
				posid_ = id = pf.alloc();
				firstPos = false;
			} else {
				id = pf.alloc();
			}
			assert_eq(botf-topf, botb-topb);
			pf[id].c = rdc;
			pf[id].topf[rdc] = topf;
			pf[id].botf[rdc] = botf;
			pf[id].topb[rdc] = topb;
			pf[id].botb[rdc] = botb;
			assert(pf[id].inited());
			nalloc++;
		}
		assert_gt(botf, topf);
		assert_eq(botf - topf, botb - topb);
		// Check if this is redundant with an already-explored path
		if(!re.check(fw, l2r_, al5pi_, al5pf_, al5pf_ - al5pi_ + 1 + gapadd_,
		             topf, botf, pen_))
		{
			prm.nRedSkip++;
			return false;
		}
		prm.nRedFail++; // not pruned by redundancy list
		prm.nRedIns++;  // inserted into redundancy list
	}
	if(done) {
		Edit eempty;
		alsink.reportAlignment(
			q,        // query
			gfmFw,    // forward index
			gfmBw,    // backward index
			topf,     // top of SA range in forward index
			botf,     // bottom of SA range in forward index
			topb,     // top of SA range in backward index
			botb,     // bottom of SA range in backward index
			descid_,  // Descent at the leaf
			rid_,     // root id
			eempty,   // extra edit, if necessary
			pen_,     // penalty
			df,       // factory with Descent
			pf,       // factory with DescentPoss
			rs,       // roots
			cs);      // configs
		assert(alsink.repOk());
		return true;
	} else if(hitEnd) {
		assert(botf > 0 || topf > 0);
		assert_gt(botf, topf);
		topf_bounce = topf;
		botf_bounce = botf;
		topb_bounce = topb;
		botb_bounce = botb;
		return true; // Bounced
	}
	// We just advanced either ftabLen characters, or 1 character,
	// depending on whether we used ftab or fchr.
	nextLocsBi(gfmFw, gfmBw, tloc, bloc, topf, botf, topb, botb);
	assert(tloc.valid());
	assert(botf - topf == 1 ||  bloc.valid());
	assert(botf - topf > 1  || !bloc.valid());
	TIndexOffU t[4], b[4];   // dest BW ranges
	TIndexOffU tp[4], bp[4]; // dest BW ranges for "prime" index
	ASSERT_ONLY(TIndexOff lasttot = botf - topf);
	bool fail = false;
	while(!fail && !hitEnd) {
		assert(!done);
		int rdc = q.getc(off5p, fw);
		int rdq = q.getq(off5p);
		assert_range(0, 4, rdc);
		assert_gt(botf, topf);
		assert(botf - topf == 1 ||  bloc.valid());
		assert(botf - topf > 1  || !bloc.valid());
		assert(tloc.valid());
		TIndexOffU width = botf - topf;
		bool ltr = l2r_;
		const GFM<index_t>& gfm = ltr ? gfmBw : gfmFw;
		t[0] = t[1] = t[2] = t[3] = b[0] = b[1] = b[2] = b[3] = 0;
		int only = -1; // if we only get 1 non-empty range, this is the char
		size_t nopts = 1;
		if(bloc.valid()) {
			// Set up initial values for the primes
			if(ltr) {
				tp[0] = tp[1] = tp[2] = tp[3] = topf;
				bp[0] = bp[1] = bp[2] = bp[3] = botf;
			} else {
				tp[0] = tp[1] = tp[2] = tp[3] = topb;
				bp[0] = bp[1] = bp[2] = bp[3] = botb;
			}
			// Range delimited by tloc/bloc has size >1. If size == 1,
			// we use a simpler query (see if(!bloc.valid()) blocks below)
			met.bwops++;
			met.bwops_bi++;
			prm.nSdFmops++;
			if(prm.doFmString) {
				prm.fmString.add(false, pen_, 1);
			}
			gfm.mapBiLFEx(tloc, bloc, t, b, tp, bp);
			// t, b, tp and bp now filled
			ASSERT_ONLY(TIndexOffU tot = (b[0]-t[0])+(b[1]-t[1])+(b[2]-t[2])+(b[3]-t[3]));
			ASSERT_ONLY(TIndexOffU totp = (bp[0]-tp[0])+(bp[1]-tp[1])+(bp[2]-tp[2])+(bp[3]-tp[3]));
			assert_eq(tot, totp);
			assert_leq(tot, lasttot);
			ASSERT_ONLY(lasttot = tot);
			fail = (rdc > 3 || b[rdc] <= t[rdc]);
			// Count non-empty outgoing ranges. Note: assign the outer
			// nopts rather than declaring a new one; a fresh declaration
			// here would shadow the variable that the N-handling code
			// below reads.
			nopts = 0;
			if(b[0] > t[0]) { nopts++; only = 0; }
			if(b[1] > t[1]) { nopts++; only = 1; }
			if(b[2] > t[2]) { nopts++; only = 2; }
			if(b[3] > t[3]) { nopts++; only = 3; }
			if(!fail && b[rdc] - t[rdc] < width) {
				branches = true;
			}
		} else {
			tp[0] = tp[1] = tp[2] = tp[3] = bp[0] = bp[1] = bp[2] = bp[3] = 0;
			// Range delimited by tloc/bloc has size 1
			TIndexOffU ntop = ltr ? topb : topf;
			met.bwops++;
			met.bwops_1++;
			prm.nSdFmops++;
			if(prm.doFmString) {
				prm.fmString.add(false, pen_, 1);
			}
			int cc = gfm.mapLF1(ntop, tloc);
			assert_range(-1, 3, cc);
			fail = (cc != rdc);
			if(fail) {
				branches = true;
			}
			if(cc >= 0) {
				only = cc;
				t[cc] = ntop; b[cc] = ntop+1;
				tp[cc] = ltr ? topf : topb;
				bp[cc] = ltr ? botf : botb;
			}
		}
		// Now figure out what to do with our N.
		int origRdc = rdc;
		if(rdc == 4) {
			fail = true;
		} else {
			topf = ltr ? tp[rdc] : t[rdc];
			botf = ltr ? bp[rdc] : b[rdc];
			topb = ltr ? t[rdc] : tp[rdc];
			botb = ltr ? b[rdc] : bp[rdc];
			assert_eq(botf - topf, botb - topb);
		}
		// The trouble with !stopOnN is that we don't have a way to store the N
		// edits. There could be several per Descent.
		if(rdc == 4 && !stopOnN && nopts == 1) {
			fail = false;
			rdc = only;
			int pen = sc.n(rdq);
			assert_gt(pen, 0);
			pen_ += pen;
		}
		assert_range(0, 4, origRdc);
		assert_range(0, 4, rdc);
		// If 'fail' is true, we failed to align this read character. We still
		// install the SA ranges into the DescentPos and increment len_ in this
		// case.

		// Convert t, tp, b, bp into tf, bf, tb, bb
		TIndexOffU *tf = ltr ? tp : t;
		TIndexOffU *bf = ltr ? bp : b;
		TIndexOffU *tb = ltr ? t : tp;
		TIndexOffU *bb = ltr ? b : bp;
		// Allocate DescentPos data structure.
		if(firstPos) {
			posid_ = pf.alloc();
			firstPos = false;
		} else {
			pf.alloc();
		}
		nalloc++;
		pf[posid_ + len_].reset();
		pf[posid_ + len_].c = origRdc;
		for(size_t i = 0; i < 4; i++) {
			pf[posid_ + len_].topf[i] = tf[i];
			pf[posid_ + len_].botf[i] = bf[i];
			pf[posid_ + len_].topb[i] = tb[i];
			pf[posid_ + len_].botb[i] = bb[i];
			assert_eq(pf[posid_ + len_].botf[i] - pf[posid_ + len_].topf[i],
			          pf[posid_ + len_].botb[i] - pf[posid_ + len_].topb[i]);
		}
		if(!fail) {
			// Check if this is redundant with an already-explored path
			size_t al5pf = al5pf_, al5pi = al5pi_;
			if(toward3p) {
				al5pf++;
			} else {
				al5pi--;
			}
			fail = !re.check(fw, l2r_, al5pi, al5pf,
			                 al5pf - al5pi + 1 + gapadd_, topf, botf, pen_);
			if(fail) {
				prm.nRedSkip++;
			} else {
				prm.nRedFail++; // not pruned by redundancy list
				prm.nRedIns++;  // inserted into redundancy list
			}
		}
		if(!fail) {
			len_++;
			if(toward3p) {
				al5pf_++;
				off5p++;
				off3p--;
				if(al5pf_ == q.length() - 1) {
					hitEnd = true;
					done = (al5pi_ == 0);
				}
			} else {
				assert_gt(al5pi_, 0);
				al5pi_--;
				off5p--;
				off3p++;
				if(al5pi_ == 0) {
					hitEnd = true;
					done = (al5pf_ == q.length() - 1);
				}
			}
		}
		if(!fail && !hitEnd) {
			nextLocsBi(gfmFw, gfmBw, tloc, bloc, tf[rdc], bf[rdc], tb[rdc], bb[rdc]);
		}
	}
	assert_geq(al5pf_, al5pi_);
	assert(!root() || al5pf_ - al5pi_ + 1 == nalloc || al5pf_ - al5pi_ + 2 == nalloc);
	assert_geq(pf.size(), nalloc);
	if(done) {
		Edit eempty;
		alsink.reportAlignment(
			q,        // query
			gfmFw,    // forward index
			gfmBw,    // backward index
			topf,     // top of SA range in forward index
			botf,     // bottom of SA range in forward index
			topb,     // top of SA range in backward index
			botb,     // bottom of SA range in backward index
			descid_,  // Descent at the leaf
			rid_,     // root id
			eempty,   // extra edit, if necessary
			pen_,     // penalty
			df,       // factory with Descent
			pf,       // factory with DescentPoss
			rs,       // roots
			cs);      // configs
		assert(alsink.repOk());
		return true;
	} else if(hitEnd) {
		assert(botf > 0 || topf > 0);
		assert_gt(botf, topf);
		topf_bounce = topf;
		botf_bounce = botf;
		topb_bounce = topb;
		botb_bounce = botb;
		return true; // Bounced
	}
	assert(repOk(&q));
	assert(!hitEnd || topf_bounce > 0 || botf_bounce > 0);
	return true;
}
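
/*
 * Illustrative sketch (editor's addition, not part of the aligner): the core
 * of the matching loop in followMatches() is "one SA range in, four SA
 * ranges out". Each step splits the current range into the ranges reachable
 * on A, C, G and T, follows the range for the next read character, and notes
 * a branch point whenever part of the range went elsewhere. A simplified,
 * standard-library-only version of that bookkeeping (the real code also
 * tracks the mirror-index ranges tp[]/bp[]):
 *
 *   #include <cstdint>
 *
 *   struct QuadStep {
 *       std::uint64_t t[4], b[4]; // dest range per character (top, bot)
 *   };
 *
 *   // Follow read character rdc (0..3). Returns false if its range is
 *   // empty; sets 'branches' if the step didn't carry the whole range.
 *   inline bool followChar(const QuadStep& q, int rdc,
 *                          std::uint64_t& top, std::uint64_t& bot,
 *                          bool& branches)
 *   {
 *       std::uint64_t width = bot - top;
 *       if(q.b[rdc] <= q.t[rdc]) return false;         // dead end
 *       if(q.b[rdc] - q.t[rdc] < width) branches = true;
 *       top = q.t[rdc]; bot = q.b[rdc];                // descend
 *       return true;
 *   }
 */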

#endif