hisat-3n/aligner_sw.h

649 lines
25 KiB
C
Raw Permalink Normal View History

2025-01-18 13:09:52 +00:00
/*
* Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
*
* This file is part of Bowtie 2.
*
* Bowtie 2 is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Bowtie 2 is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* aligner_sw.h
*
* Classes and routines for solving dynamic programming problems in aid of read
* alignment. Goals include the ability to handle:
*
* - Both read alignment, where the query must align end-to-end, and local
* alignment, where we seek a high-scoring alignment that need not involve
* the entire query.
* - Situations where: (a) we've found a seed hit and are trying to extend it
* into a larger hit, (b) we've found an alignment for one mate of a pair and
* are trying to find a nearby alignment for the other mate, (c) we're
* aligning against an entire reference sequence.
* - Caller-specified indicators for what columns of the dynamic programming
* matrix we are allowed to start in or end in.
*
* TODO:
*
* - A slicker way to filter out alignments that violate a ceiling placed on
* the number of Ns permitted in the reference portion of the alignment.
* Right now we accomplish this by masking out ending columns that correspond
* to *ungapped* alignments with too many Ns. This results in false
* positives and false negatives for gapped alignments. The margin of error
* (# of Ns by which we might miscount) is bounded by the number of gaps.
*/
/**
* |-maxgaps-|
* ***********oooooooooooooooooooooo -
* ***********ooooooooooooooooooooo |
* ***********oooooooooooooooooooo |
* ***********ooooooooooooooooooo |
* ***********oooooooooooooooooo |
* ***********ooooooooooooooooo read len
* ***********oooooooooooooooo |
* ***********ooooooooooooooo |
* ***********oooooooooooooo |
* ***********ooooooooooooo |
* ***********oooooooooooo -
* |-maxgaps-|
* |-readlen-|
* |-------skip--------|
*/
#ifndef ALIGNER_SW_H_
#define ALIGNER_SW_H_
#define INLINE_CUPS
#include <stdint.h>
#include <iostream>
#include <limits>
#include "threading.h"
#include <emmintrin.h>
#include "aligner_sw_common.h"
#include "aligner_sw_nuc.h"
#include "ds.h"
#include "aligner_seed.h"
#include "reference.h"
#include "random_source.h"
#include "mem_ids.h"
#include "aligner_result.h"
#include "mask.h"
#include "dp_framer.h"
#include "aligner_swsse.h"
#include "aligner_bt.h"
// Scoring shorthands. They refer to rd_, qu_, rf_, rdi_, rfi_ and sc_, which
// are members of SwAligner, so they are only usable where those names are in
// scope. Every macro argument is parenthesized so that an arbitrary
// expression (e.g. 'i << 1' or 'a ? b : c') can be passed without changing
// how the index arithmetic binds.

// Result of sc_->mm() for the read character at offset 'd' past rdi_ versus
// the reference character at offset 'f' past rfi_, given the quality
// character at the same read offset. 33 is subtracted from the quality
// character (presumably the Phred+33 ASCII offset -- confirm against Scoring).
#define QUAL2(d, f) sc_->mm((int)(*rd_)[rdi_ + (d)], \
                            (int)  rf_ [rfi_ + (f)], \
                            (int)(*qu_)[rdi_ + (d)] - 33)

// Two-argument form of the above: read character and quality only, with no
// reference character involved.
#define QUAL(d)     sc_->mm((int)(*rd_)[rdi_ + (d)], \
                            (int)(*qu_)[rdi_ + (d)] - 33)

// If the reference character at offset 'c' past rfi_ is greater than 15, use
// sc_->n(30); otherwise use sc_->penSnp.
// NOTE(review): 15 looks like the largest value of a 4-bit reference mask and
// 30 like a fixed quality value -- confirm both against the reference
// encoding and Scoring::n().
#define N_SNP_PEN(c) (((int)rf_[rfi_ + (c)] > 15) ? sc_->n(30) : sc_->penSnp)
/**
* SwAligner
* =========
*
* Encapsulates facilities for alignment using dynamic programming. Handles
* alignment of nucleotide reads against known reference nucleotides.
*
* The class is stateful. First the user must call init() to initialize the
* object with details regarding the dynamic programming problem to be solved.
* Next, the user calls align() to fill the dynamic programming matrix and
* calculate summaries describing the solutions. Finally the user calls
* nextAlignment(...), perhaps repeatedly, to populate the SwResult object with
* the next result. Results are dispensed in best-to-worst, left-to-right
* order.
*
* The class expects the read string, quality string, and reference string
* provided by the caller live at least until the user is finished aligning and
* obtaining alignments from this object.
*
* There is a design tradeoff between hiding/exposing details of the genome and
* its strands to the SwAligner. In a sense, a better design is to hide
* details such as the id of the reference sequence aligned to, or whether
* we're aligning the read in its original forward orientation or its reverse
* complement. But this means that any alignment results returned by SwAligner
* have to be extended to include those details before they're useful to the
* caller. We opt for messy but expedient - the reference id and orientation
* of the read are given to SwAligner, remembered, and used to populate
* SwResults.
*
* LOCAL VS GLOBAL
*
* The dynamic programming aligner supports both local and global alignment,
* and one option in between. To implement these modes, the aligner can (a)
* allow negative scores (i.e. not necessarily clamp them up to 0), (b) check
* in rows other than the last row for acceptable solutions, and (c)
* optionally add a bonus to the score for matches.
*
* For global alignment, we:
*
* (a) Allow negative scores
* (b) Check only in the last row
* (c) Either add a bonus for matches or not (doesn't matter)
*
* For local alignment, we:
*
* (a) Clamp scores to 0
* (b) Check in any row for a sufficiently high score
* (c) Add a bonus for matches
*
* An in-between solution is to allow alignments to be curtailed on the
* right-hand side if a better score can be achieved thereby, but not on the
* left. For this, we:
*
* (a) Allow negative scores
* (b) Check in any row for a sufficiently high score
* (c) Either add a bonus for matches or not (doesn't matter)
*
* REDUNDANT ALIGNMENTS
*
* When are two alignments distinct and when are they redundant (not distinct)?
* At one extreme, we might say the best alignment from any given dynamic
* programming problem is redundant with all other alignments from that
* problem. At the other extreme, we might say that any two alignments with
* distinct starting points and edits are distinct. The former is probably too
* conservative for mate-finding DP problems. The latter is certainly too
* permissive, since two alignments that differ only in how gaps are arranged
* should not be considered distinct.
*
* Some in-between solutions are:
*
* (a) If two alignments share an end point on either end, they are redundant.
* Otherwise, they are distinct.
* (b) If two alignments share *both* end points, they are redundant.
* (c) If two alignments share any cells in the DP table, they are redundant.
* (d) Two alignments are redundant if their end points on either end are
* within N positions of each other
* (e) Like (d) but both instead of either
* (f, g) Like d, e, but where N is tied to maxgaps somehow
*
* Why not (a)? One reason is that it's possible for two alignments to have
* different start & end positions but share many cells. Consider alignments 1
* and 2 below; their end-points are labeled.
*
* 1 2
* \ \
* -\
* \
* \
* \
* -\
* \ \
* 1 2
*
* 1 and 2 are distinct according to (a) but they share many cells in common.
*
* Why not (f, g)? It fixes the problem with (a) above by forcing the
* alignments to be spread so far that they can't possibly share diagonal cells
* in common
*/
class SwAligner {

	typedef std::pair<size_t, size_t> SizeTPair;

	// States that the aligner can be in
	enum {
		STATE_UNINIT,  // init() hasn't been called yet
		STATE_INITED,  // init() has been called, but not align()
		STATE_ALIGNED, // align() has been called
	};

	const static size_t ALPHA_SIZE = 5;

public:

	/**
	 * Construct an aligner that has neither a read nor a reference attached
	 * (initedRead() and initedRef() both return false).
	 *
	 * Every scalar and pointer member is given a defined starting value:
	 * members written as 'x_()' are value-initialized (null pointer, zero or
	 * false).  In particular the nbtfilt*_ tallies start at 0, because
	 * merge() adds them into the caller's totals and may be called before
	 * resetCounters().  Initializers are listed in member declaration order.
	 */
	explicit SwAligner() :
		rd_(),
		qu_(),
		rdfw_(),
		rdrc_(),
		qufw_(),
		qurc_(),
		rdi_(),
		rdf_(),
		fw_(),
		refidx_(),
		reflen_(),
		rect_(),
		rf_(),
		rfi_(),
		rff_(),
		rdgap_(),
		rfgap_(),
		enable8_(),
		extend_(),
		sc_(),
		minsc_(),
		nceil_(),
		sse8succ_(),
		sse16succ_(),
		sseU8fw_(DP_CAT),
		sseU8rc_(DP_CAT),
		sseI16fw_(DP_CAT),
		sseI16rc_(DP_CAT),
		sseU8fwBuilt_(),
		sseU8rcBuilt_(),
		sseI16fwBuilt_(),
		sseI16rcBuilt_(),
		state_(STATE_UNINIT),
		initedRead_(false),
		readSse16_(false),
		initedRef_(false),
		rfwbuf_(DP_CAT),
		btnstack_(DP_CAT),
		btcells_(DP_CAT),
		btdiag_(),
		btncand_(DP_CAT),
		btncanddone_(DP_CAT),
		btncanddoneSucc_(0),
		btncanddoneFail_(0),
		cper_(),
		cperMinlen_(),
		cperPerPow2_(),
		cperEf_(),
		cperTri_(),
		colstop_(0),
		lastsolcol_(0),
		cural_(0),
		nbtfiltst_(0),
		nbtfiltsc_(0),
		nbtfiltdo_(0)
		ASSERT_ONLY(, cand_tmp_(DP_CAT))
	{ }

	/**
	 * Prepare the dynamic programming driver with a new read and a new scoring
	 * scheme.
	 */
	void initRead(
		const BTDnaString& rdfw, // read sequence for fw read
		const BTDnaString& rdrc, // read sequence for rc read
		const BTString& qufw,    // read qualities for fw read
		const BTString& qurc,    // read qualities for rc read
		size_t rdi,              // offset of first read char to align
		size_t rdf,              // offset of last read char to align
		const Scoring& sc);      // scoring scheme

	/**
	 * Initialize with a new alignment problem.  In this overload the caller
	 * supplies the reference characters directly through 'rf'.
	 */
	void initRef(
		bool fw,               // whether the fw or the revcomp read is aligned
		TRefId refidx,         // id of reference aligned against
		const DPRect& rect,    // DP rectangle
		char *rf,              // reference sequence
		size_t rfi,            // offset of first reference char to align to
		size_t rff,            // offset of last reference char to align to
		TRefOff reflen,        // length of reference sequence
		const Scoring& sc,     // scoring scheme
		TAlScore minsc,        // minimum score
		bool enable8,          // use 8-bit SSE if possible?
		size_t cminlen,        // minimum length for using checkpointing scheme
		size_t cpow2,          // interval b/t checkpointed diags; 1 << this
		bool doTri,            // triangular mini-fills?
		bool extend);          // true iff this is a seed extension

	/**
	 * Given a read, an alignment orientation, a range of characters in a
	 * reference sequence, and a bit-encoded version of the reference, set up
	 * the corresponding dynamic programming problem.
	 *
	 * Here we expect that the caller has already narrowed down the relevant
	 * portion of the reference (e.g. using a seed hit) and all we do is
	 * banded dynamic programming in the vicinity of that portion.  This is not
	 * the function to call if we are trying to solve the whole alignment
	 * problem with dynamic programming (that is TODO).
	 *
	 * This function returns nothing; whether an alignment exists is reported
	 * by align() and nextAlignment().
	 */
	void initRef(
		bool fw,               // whether the fw or the revcomp read is aligned
		TRefId refidx,         // reference aligned against
		const DPRect& rect,    // DP rectangle
		const BitPairReference& refs, // Reference strings
		TRefOff reflen,        // length of reference sequence
		const Scoring& sc,     // scoring scheme
		TAlScore minsc,        // minimum alignment score
		bool enable8,          // use 8-bit SSE if possible?
		size_t cminlen,        // minimum length for using checkpointing scheme
		size_t cpow2,          // interval b/t checkpointed diags; 1 << this
		bool doTri,            // triangular mini-fills?
		bool extend,           // true iff this is a seed extension
		size_t  upto,          // count the number of Ns up to this offset
		size_t& nsUpto);       // output: the number of Ns up to 'upto'

	/**
	 * Given a read, an alignment orientation, a range of characters in a
	 * reference sequence, and a bit-encoded version of the reference, set up
	 * and execute the corresponding ungapped alignment problem.  There can
	 * only be one solution.
	 *
	 * The caller has already narrowed down the relevant portion of the
	 * reference using, e.g., the location of a seed hit, or the range of
	 * possible fragment lengths if we're searching for the opposite mate in a
	 * pair.
	 *
	 * NOTE(review): the meaning of the int return value is defined by the
	 * implementation, which is not part of this header -- confirm there.
	 */
	int ungappedAlign(
		const BTDnaString&      rd,     // read sequence (could be RC)
		const BTString&         qu,     // qual sequence (could be rev)
		const Coord&            coord,  // coordinate aligned to
		const BitPairReference& refs,   // Reference strings
		size_t                  reflen, // length of reference sequence
		const Scoring&          sc,     // scoring scheme
		bool                    ohang,  // allow overhang?
		TAlScore                minsc,  // minimum score
		SwResult&               res);   // put alignment result here

	/**
	 * Align read 'rd' to reference using read & reference information given
	 * last time init() was called.  Uses dynamic programming.
	 */
	bool align(RandomSource& rnd, TAlScore& best);

	/**
	 * Populate the given SwResult with information about the "next best"
	 * alignment if there is one.  If there isn't one, false is returned.  Note
	 * that false might be returned even though a call to done() would have
	 * returned false.
	 */
	bool nextAlignment(
		SwResult& res,
		TAlScore minsc,
		RandomSource& rnd);

	/**
	 * Print out an alignment result as an ASCII DP table.  Uses the read
	 * currently pointed to by rd_, so a read must have been set up first.
	 */
	void printResultStacked(
		const SwResult& res,
		std::ostream& os)
	{
		res.alres.printStacked(*rd_, os);
	}

	/**
	 * Return true iff there are no more solution cells to backtrace from,
	 * i.e. the index of the next alignment equals the number of candidates.
	 * Note that this may return false in situations where there are actually
	 * no more solutions, but that hasn't been discovered yet.
	 *
	 * Asserts that both a read and a reference have been set up.
	 */
	bool done() const {
		assert(initedRead() && initedRef());
		return cural_ == btncand_.size();
	}

	/**
	 * Return true iff this SwAligner has been initialized with a reference to
	 * align against.
	 */
	inline bool initedRef() const { return initedRef_; }

	/**
	 * Return true iff this SwAligner has been initialized with a read to
	 * align.
	 */
	inline bool initedRead() const { return initedRead_; }

	/**
	 * Reset, signaling that we're done with this dynamic programming problem
	 * and won't be asking for any more alignments.  Only the two "inited"
	 * flags are cleared here.
	 */
	inline void reset() { initedRef_ = initedRead_ = false; }

#ifndef NDEBUG
	/**
	 * Check that aligner is internally consistent: there is at least one DP
	 * row, and every backtrace candidate is itself consistent and scores at
	 * least minsc_.
	 */
	bool repOk() const {
		assert_gt(dpRows(), 0);
		// Check btncand_
		for(size_t i = 0; i < btncand_.size(); i++) {
			assert(btncand_[i].repOk());
			assert_geq(btncand_[i].score, minsc_);
		}
		return true;
	}
#endif

	/**
	 * Return the number of alignments given out so far by nextAlignment().
	 */
	size_t numAlignmentsReported() const { return cural_; }

	/**
	 * Merge tallies in the counters related to filling the DP table into the
	 * caller-supplied accumulators.  This object's own counters are left
	 * unchanged; use resetCounters() to zero them.
	 */
	void merge(
		SSEMetrics& sseU8ExtendMet,
		SSEMetrics& sseU8MateMet,
		SSEMetrics& sseI16ExtendMet,
		SSEMetrics& sseI16MateMet,
		uint64_t&   nbtfiltst,
		uint64_t&   nbtfiltsc,
		uint64_t&   nbtfiltdo)
	{
		sseU8ExtendMet.merge(sseU8ExtendMet_);
		sseU8MateMet.merge(sseU8MateMet_);
		sseI16ExtendMet.merge(sseI16ExtendMet_);
		sseI16MateMet.merge(sseI16MateMet_);
		nbtfiltst += nbtfiltst_;
		nbtfiltsc += nbtfiltsc_;
		nbtfiltdo += nbtfiltdo_;
	}

	/**
	 * Reset all the counters related to filling in the DP table to 0.
	 */
	void resetCounters() {
		sseU8ExtendMet_.reset();
		sseU8MateMet_.reset();
		sseI16ExtendMet_.reset();
		sseI16MateMet_.reset();
		nbtfiltst_ = nbtfiltsc_ = nbtfiltdo_ = 0;
	}

	/**
	 * Return the size of the DP problem: the number of DP rows times the
	 * width of the reference window (rff_ - rfi_).
	 */
	size_t size() const {
		return dpRows() * (rff_ - rfi_);
	}

protected:

	/**
	 * Return the number of rows that will be in the dynamic programming table
	 * (rdf_ - rdi_).  Asserts that a read has been set up.
	 */
	inline size_t dpRows() const {
		assert(initedRead_);
		return rdf_ - rdi_;
	}

	/**
	 * Align nucleotides from read 'rd' to the reference string 'rf' using
	 * vector instructions.  Return the score of the best alignment found, or
	 * the minimum integer if an alignment could not be found.  Flag is set to
	 * 0 if an alignment is found, -1 if no valid alignment is found, or -2 if
	 * the score saturated at any point during alignment.
	 */
	TAlScore alignNucleotidesEnd2EndSseU8(  // unsigned 8-bit elements
		int& flag, bool debug);
	TAlScore alignNucleotidesLocalSseU8(    // unsigned 8-bit elements
		int& flag, bool debug);
	TAlScore alignNucleotidesEnd2EndSseI16( // signed 16-bit elements
		int& flag, bool debug);
	TAlScore alignNucleotidesLocalSseI16(   // signed 16-bit elements
		int& flag, bool debug);

	/**
	 * Aligns by filling a dynamic programming matrix with the SSE-accelerated,
	 * banded DP approach of Farrar.  As it goes, it determines which cells we
	 * might backtrace from and tallies the best (highest-scoring) N backtrace
	 * candidate cells per diagonal.  Also returns the alignment score of the
	 * best alignment in the matrix.
	 *
	 * This routine does *not* maintain a matrix holding the entire matrix
	 * worth of scores, nor does it maintain any other dense O(mn) data
	 * structure, as this would quickly exhaust memory for queries longer than
	 * about 10,000 kb.  Instead, in the fill stage it maintains two columns
	 * worth of scores at a time (current/previous, or right/left) - these take
	 * O(m) space.  When finished with the current column, it determines which
	 * cells from the previous column, if any, are candidates we might
	 * backtrace from to find a full alignment.  A candidate cell has a score
	 * that rises above the threshold and isn't improved upon by a match in the
	 * next column.  The best N candidates per diagonal are stored in a
	 * O(m + n) data structure.
	 */
	TAlScore alignGatherEE8(   // unsigned 8-bit elements
		int& flag, bool debug);
	TAlScore alignGatherLoc8(  // unsigned 8-bit elements
		int& flag, bool debug);
	TAlScore alignGatherEE16(  // signed 16-bit elements
		int& flag, bool debug);
	TAlScore alignGatherLoc16( // signed 16-bit elements
		int& flag, bool debug);

	/**
	 * Build query profile look up tables for the read (8-bit scores).  The
	 * query profile look up table is organized as a 1D array indexed by
	 * [i][j] where i is the reference character in the current DP column
	 * (0=A, 1=C, etc), and j is the segment of the query we're currently
	 * working on.
	 */
	void buildQueryProfileEnd2EndSseU8(bool fw);
	void buildQueryProfileLocalSseU8(bool fw);

	/**
	 * Build query profile look up tables for the read (16-bit scores).  The
	 * query profile look up table is organized as a 1D array indexed by
	 * [i][j] where i is the reference character in the current DP column
	 * (0=A, 1=C, etc), and j is the segment of the query we're currently
	 * working on.
	 */
	void buildQueryProfileEnd2EndSseI16(bool fw);
	void buildQueryProfileLocalSseI16(bool fw);

	// Candidate-cell gathering, one variant per alignment mode and score
	// width.  Implemented outside this header.
	bool gatherCellsNucleotidesLocalSseU8(TAlScore best);
	bool gatherCellsNucleotidesEnd2EndSseU8(TAlScore best);
	bool gatherCellsNucleotidesLocalSseI16(TAlScore best);
	bool gatherCellsNucleotidesEnd2EndSseI16(TAlScore best);

	// Backtrace from cell (row, col), one variant per alignment mode and
	// score width.  Implemented outside this header.
	bool backtraceNucleotidesLocalSseU8(
		TAlScore      escore, // in: expected score
		SwResult&     res,    // out: store results (edits and scores) here
		size_t&       off,    // out: store diagonal projection of origin
		size_t&       nbts,   // out: # backtracks
		size_t        row,    // start in this rectangle row
		size_t        col,    // start in this rectangle column
		RandomSource& rand);  // random gen, to choose among equal paths

	bool backtraceNucleotidesLocalSseI16(
		TAlScore      escore, // in: expected score
		SwResult&     res,    // out: store results (edits and scores) here
		size_t&       off,    // out: store diagonal projection of origin
		size_t&       nbts,   // out: # backtracks
		size_t        row,    // start in this rectangle row
		size_t        col,    // start in this rectangle column
		RandomSource& rand);  // random gen, to choose among equal paths

	bool backtraceNucleotidesEnd2EndSseU8(
		TAlScore      escore, // in: expected score
		SwResult&     res,    // out: store results (edits and scores) here
		size_t&       off,    // out: store diagonal projection of origin
		size_t&       nbts,   // out: # backtracks
		size_t        row,    // start in this rectangle row
		size_t        col,    // start in this rectangle column
		RandomSource& rand);  // random gen, to choose among equal paths

	bool backtraceNucleotidesEnd2EndSseI16(
		TAlScore      escore, // in: expected score
		SwResult&     res,    // out: store results (edits and scores) here
		size_t&       off,    // out: store diagonal projection of origin
		size_t&       nbts,   // out: # backtracks
		size_t        row,    // start in this rectangle row
		size_t        col,    // start in this rectangle column
		RandomSource& rand);  // random gen, to choose among equal paths

	/**
	 * Set up the branch tracer bter_ to start from cell (row, col) with the
	 * expected score, then ask it for an alignment.
	 *
	 * Returns false if bter_ reports an empty solution after initBt();
	 * otherwise returns whatever bter_.nextAlignment() returns.
	 */
	bool backtrace(
		TAlScore      escore, // in: expected score
		bool          fill,   // in: use mini-fill?
		bool          usecp,  // in: use checkpoints?
		SwResult&     res,    // out: store results (edits and scores) here
		size_t&       off,    // out: store diagonal projection of origin
		size_t        row,    // start in this rectangle row
		size_t        col,    // start in this rectangle column
		size_t        maxiter,// max # extensions to try
		size_t&       niter,  // # extensions tried
		RandomSource& rnd)    // random gen, to choose among equal paths
	{
		bter_.initBt(
			escore,   // in: alignment score
			row,      // in: start in this row
			col,      // in: start in this column
			fill,     // in: use mini-fill?
			usecp,    // in: use checkpoints?
			cperTri_, // in: triangle-shaped mini-fills?
			rnd);     // in: random gen, to choose among equal paths
		assert(bter_.inited());
		// nrej is handed to nextAlignment() and not used afterwards
		size_t nrej = 0;
		if(bter_.emptySolution()) {
			return false;
		} else {
			return bter_.nextAlignment(maxiter, res, off, nrej, niter, rnd);
		}
	}

	const BTDnaString  *rd_;     // read sequence
	const BTString     *qu_;     // read qualities
	const BTDnaString  *rdfw_;   // read sequence for fw read
	const BTDnaString  *rdrc_;   // read sequence for rc read
	const BTString     *qufw_;   // read qualities for fw read
	const BTString     *qurc_;   // read qualities for rc read
	TReadOff            rdi_;    // offset of first read char to align
	TReadOff            rdf_;    // offset of last read char to align
	bool                fw_;     // true iff read sequence is original fw read
	TRefId              refidx_; // id of reference aligned against
	TRefOff             reflen_; // length of entire reference sequence
	const DPRect*       rect_;   // DP rectangle
	char               *rf_;     // reference sequence
	TRefOff             rfi_;    // offset of first ref char to align to
	TRefOff             rff_;    // offset of last ref char to align to (excl)
	size_t              rdgap_;  // max # gaps in read
	size_t              rfgap_;  // max # gaps in reference
	bool                enable8_;// enable 8-bit sse
	bool                extend_; // true iff this is a seed-extend problem
	const Scoring      *sc_;     // penalties for edit types
	TAlScore            minsc_;  // penalty ceiling for valid alignments
	int                 nceil_;  // max # Ns allowed in ref portion of aln

	bool                sse8succ_;  // whether 8-bit worked
	bool                sse16succ_; // whether 16-bit worked
	SSEData             sseU8fw_;   // buf for fw query, 8-bit score
	SSEData             sseU8rc_;   // buf for rc query, 8-bit score
	SSEData             sseI16fw_;  // buf for fw query, 16-bit score
	SSEData             sseI16rc_;  // buf for rc query, 16-bit score
	bool                sseU8fwBuilt_;   // built fw query profile, 8-bit score
	bool                sseU8rcBuilt_;   // built rc query profile, 8-bit score
	bool                sseI16fwBuilt_;  // built fw query profile, 16-bit score
	bool                sseI16rcBuilt_;  // built rc query profile, 16-bit score

	// Fill-stage tallies; handed to callers by merge(), zeroed by
	// resetCounters()
	SSEMetrics          sseU8ExtendMet_;
	SSEMetrics          sseU8MateMet_;
	SSEMetrics          sseI16ExtendMet_;
	SSEMetrics          sseI16MateMet_;

	int                 state_;        // state
	bool                initedRead_;   // true iff initialized with initRead
	bool                readSse16_;    // true -> sse16 from now on for read
	bool                initedRef_;    // true iff initialized with initRef
	EList<uint32_t>     rfwbuf_;       // buffer for wordized ref stretches

	EList<DpNucFrame>    btnstack_;    // backtrace stack for nucleotides
	EList<SizeTPair>     btcells_;     // cells involved in current backtrace

	NBest<DpBtCandidate> btdiag_;      // per-diagonal backtrace candidates
	EList<DpBtCandidate> btncand_;     // cells we might backtrace from
	EList<DpBtCandidate> btncanddone_; // candidates that we investigated
	size_t              btncanddoneSucc_; // # investigated and succeeded
	size_t              btncanddoneFail_; // # investigated and failed

	BtBranchTracer       bter_;        // backtracer

	Checkpointer         cper_;        // structure for saving checkpoint cells
	size_t               cperMinlen_;  // minimum length for using checkpointer
	size_t               cperPerPow2_; // checkpoint every 1 << perpow2 diags (& next)
	bool                 cperEf_;      // store E and F in addition to H?
	bool                 cperTri_;     // checkpoint for triangular mini-fills?

	size_t              colstop_;      // bailed on DP loop after this many cols
	size_t              lastsolcol_;   // last DP col with valid cell
	size_t              cural_;        // index of next alignment to be given

	uint64_t nbtfiltst_; // # candidates filtered b/c starting cell was seen
	uint64_t nbtfiltsc_; // # candidates filtered b/c score uninteresting
	uint64_t nbtfiltdo_; // # candidates filtered b/c dominated by other cell

	ASSERT_ONLY(SStringExpandable<uint32_t> tmp_destU32_);
	ASSERT_ONLY(BTDnaString tmp_editstr_, tmp_refstr_);
	ASSERT_ONLY(EList<DpBtCandidate> cand_tmp_);
};
#endif /*ALIGNER_SW_H_*/