248 lines
8.2 KiB
C++
248 lines
8.2 KiB
C++
/*
|
|
* Copyright 2012, Ben Langmead <langmea@cs.jhu.edu>
|
|
*
|
|
* This file is part of Bowtie 2.
|
|
*
|
|
* Bowtie 2 is free software: you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation, either version 3 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* Bowtie 2 is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
/*
|
|
* aligner_driver.h
|
|
*
|
|
* REDUNDANT SEED HITS
|
|
*
|
|
* We say that two seed hits are redundant if they trigger identical
|
|
* seed-extend dynamic programming problems. Put another way, they both lie on
|
|
* the same diagonal of the overall read/reference dynamic programming matrix.
|
|
* Detecting redundant seed hits is simple when the seed hits are ungapped. We
|
|
* do this after offset resolution but before the offset is converted to genome
|
|
* coordinates (see uses of the seenDiags1_/seenDiags2_ fields for examples).
|
|
*
|
|
* REDUNDANT ALIGNMENTS
|
|
*
|
|
* In an unpaired context, we say that two alignments are redundant if they
|
|
* share any cells in the global DP table. Roughly speaking, this is like
|
|
* saying that two alignments are redundant if any read character aligns to the
|
|
* same reference character (same reference sequence, same strand, same offset)
|
|
* in both alignments.
|
|
*
|
|
* In a paired-end context, we say that two paired-end alignments are redundant
|
|
* if the mate #1s are redundant and the mate #2s are redundant.
|
|
*
|
|
* How do we enforce this? In the unpaired context, this is relatively simple:
|
|
* the cells from each alignment are checked against a set containing all cells
|
|
* from all previous alignments. Given a new alignment, for each cell in the
|
|
* new alignment we check whether it is in the set. If there is any overlap,
|
|
* the new alignment is rejected as redundant. Otherwise, the new alignment is
|
|
* accepted and its cells are added to the set.
|
|
*
|
|
* Enforcement in a paired context is a little trickier. Consider the
|
|
* following approaches:
|
|
*
|
|
* 1. Skip anchors that are redundant with any previous anchor or opposite
|
|
* alignment. This is sufficient to ensure no two concordant alignments
|
|
* found are redundant.
|
|
*
|
|
* 2. Same as scheme 1, but with a "transitive closure" scheme for finding all
|
|
* concordant pairs in the vicinity of an anchor. Consider the AB/AC
|
|
* scenario from the previous paragraph. If B is the anchor alignment, we
|
|
* will find AB but not AC. But under this scheme, once we find AB we then
|
|
* let B be a new anchor and immediately look for its opposites. Likewise,
|
|
* if we find any opposite, we make them anchors and continue searching. We
|
|
* don't stop searching until every opposite is used as an anchor.
|
|
*
|
|
* 3. Skip anchors that are redundant with any previous anchor alignment (but
|
|
* allow anchors that are redundant with previous opposite alignments).
|
|
* This isn't sufficient to avoid redundant concordant alignments. To avoid
|
|
* redundant concordants, we need an additional procedure that checks each
|
|
* new concordant alignment one-by-one against a list of previous concordant
|
|
* alignments to see if it is redundant.
|
|
*
|
|
* We take approach 1.
|
|
*/
|
|
|
|
#ifndef ALIGNER_DRIVER_H_
|
|
#define ALIGNER_DRIVER_H_
|
|
|
|
#include "aligner_seed2.h"
|
|
#include "simple_func.h"
|
|
#include "aln_sink.h"
|
|
|
|
/**
|
|
* Concrete subclass of DescentRootSelector. Puts a root every 'ival' chars,
|
|
* where 'ival' is determined by user-specified parameters. A root is filtered
|
|
* out if the end of the read is less than 'landing' positions away, in the
|
|
* direction of the search.
|
|
*/
|
|
class AlignerDriverRootSelector : public DescentRootSelector {
|
|
|
|
public:
|
|
|
|
AlignerDriverRootSelector(
|
|
double consExp,
|
|
const SimpleFunc& rootIval,
|
|
size_t landing)
|
|
{
|
|
consExp_ = consExp;
|
|
rootIval_ = rootIval;
|
|
landing_ = landing;
|
|
}
|
|
|
|
virtual ~AlignerDriverRootSelector() { }
|
|
|
|
virtual void select(
|
|
const Read& q, // read that we're selecting roots for
|
|
const Read* qo, // opposite mate, if applicable
|
|
bool nofw, // don't add roots for fw read
|
|
bool norc, // don't add roots for rc read
|
|
EList<DescentConfig>& confs, // put DescentConfigs here
|
|
EList<DescentRoot>& roots); // put DescentRoot here
|
|
|
|
protected:
|
|
|
|
double consExp_;
|
|
SimpleFunc rootIval_;
|
|
size_t landing_;
|
|
};
|
|
|
|
/**
|
|
* Return values from extendSeeds and extendSeedsPaired.
|
|
*/
|
|
enum {
|
|
// Candidates were examined exhaustively
|
|
ALDRIVER_EXHAUSTED_CANDIDATES = 1,
|
|
// The policy does not need us to look any further
|
|
ALDRIVER_POLICY_FULFILLED,
|
|
// We stopped because we ran up against a limit on how much work we should
|
|
// do for one set of seed ranges, e.g. the limit on number of consecutive
|
|
// unproductive DP extensions
|
|
ALDRIVER_EXCEEDED_LIMIT
|
|
};
|
|
|
|
/**
|
|
* This class is the glue between a DescentDriver and the dynamic programming
|
|
* implementations in Bowtie 2. The DescentDriver is used to find some very
|
|
* high-scoring alignments, but is additionally used to rank partial alignments
|
|
* so that they can be extended using dynamic programming.
|
|
*/
|
|
template <typename index_t>
|
|
class AlignerDriver {
|
|
|
|
public:
|
|
|
|
AlignerDriver(
|
|
double consExp,
|
|
const SimpleFunc& rootIval,
|
|
size_t landing,
|
|
bool veryVerbose,
|
|
const SimpleFunc& totsz,
|
|
const SimpleFunc& totfmops) :
|
|
sel_(consExp, rootIval, landing),
|
|
alsel_(),
|
|
dr1_(veryVerbose),
|
|
dr2_(veryVerbose)
|
|
{
|
|
totsz_ = totsz;
|
|
totfmops_ = totfmops;
|
|
}
|
|
|
|
/**
|
|
* Initialize driver with respect to a new read or pair.
|
|
*/
|
|
void initRead(
|
|
const Read& q1,
|
|
bool nofw,
|
|
bool norc,
|
|
TAlScore minsc,
|
|
TAlScore maxpen,
|
|
const Read* q2)
|
|
{
|
|
dr1_.initRead(q1, nofw, norc, minsc, maxpen, q2, &sel_);
|
|
red1_.init(q1.length());
|
|
paired_ = false;
|
|
if(q2 != NULL) {
|
|
dr2_.initRead(*q2, nofw, norc, minsc, maxpen, &q1, &sel_);
|
|
red2_.init(q2->length());
|
|
paired_ = true;
|
|
} else {
|
|
dr2_.reset();
|
|
}
|
|
size_t totsz = totsz_.f<size_t>(q1.length());
|
|
size_t totfmops = totfmops_.f<size_t>(q1.length());
|
|
stop_.init(
|
|
totsz,
|
|
0,
|
|
true,
|
|
totfmops);
|
|
}
|
|
|
|
/**
|
|
* Start the driver. The driver will begin by conducting a best-first,
|
|
* index-assisted search through the space of possible full and partial
|
|
* alignments. This search may be followed up with a dynamic programming
|
|
* extension step, taking a prioritized set of partial SA ranges found
|
|
* during the search and extending each with DP. The process might also be
|
|
* iterated, with the search being occasioanally halted so that DPs can be
|
|
* tried, then restarted, etc.
|
|
*/
|
|
int go(
|
|
const Scoring& sc,
|
|
const GFM<index_t>& gfmFw,
|
|
const GFM<index_t>& gfmBw,
|
|
const BitPairReference& ref,
|
|
DescentMetrics& met,
|
|
WalkMetrics& wlm,
|
|
PerReadMetrics& prm,
|
|
RandomSource& rnd,
|
|
AlnSinkWrap<index_t>& sink);
|
|
|
|
/**
|
|
* Reset state of all DescentDrivers.
|
|
*/
|
|
void reset() {
|
|
dr1_.reset();
|
|
dr2_.reset();
|
|
red1_.reset();
|
|
red2_.reset();
|
|
}
|
|
|
|
protected:
|
|
|
|
AlignerDriverRootSelector sel_; // selects where roots should go
|
|
DescentAlignmentSelector<index_t> alsel_; // one selector can deal with >1 drivers
|
|
DescentDriver<index_t> dr1_; // driver for mate 1/unpaired reads
|
|
DescentDriver<index_t> dr2_; // driver for paired-end reads
|
|
DescentStoppingConditions stop_; // when to pause index-assisted BFS
|
|
bool paired_; // current read is paired?
|
|
|
|
SimpleFunc totsz_; // memory limit on best-first search data
|
|
SimpleFunc totfmops_; // max # FM ops for best-first search
|
|
|
|
// For detecting redundant alignments
|
|
RedundantAlns red1_; // database of cells used for mate 1 alignments
|
|
RedundantAlns red2_; // database of cells used for mate 2 alignments
|
|
|
|
// For AlnRes::matchesRef
|
|
ASSERT_ONLY(SStringExpandable<char> raw_refbuf_);
|
|
ASSERT_ONLY(SStringExpandable<uint32_t> raw_destU32_);
|
|
ASSERT_ONLY(EList<bool> raw_matches_);
|
|
ASSERT_ONLY(BTDnaString tmp_rf_);
|
|
ASSERT_ONLY(BTDnaString tmp_rdseq_);
|
|
ASSERT_ONLY(BTString tmp_qseq_);
|
|
ASSERT_ONLY(EList<index_t> tmp_reflens_);
|
|
ASSERT_ONLY(EList<index_t> tmp_refoffs_);
|
|
};
|
|
|
|
#endif /* defined(ALIGNER_DRIVER_H_) */
|