2267 lines
73 KiB
C++
2267 lines
73 KiB
C++
|
/*
|
||
|
* Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
|
||
|
*
|
||
|
* This file is part of Bowtie 2.
|
||
|
*
|
||
|
* Bowtie 2 is free software: you can redistribute it and/or modify
|
||
|
* it under the terms of the GNU General Public License as published by
|
||
|
* the Free Software Foundation, either version 3 of the License, or
|
||
|
* (at your option) any later version.
|
||
|
*
|
||
|
* Bowtie 2 is distributed in the hope that it will be useful,
|
||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
|
* GNU General Public License for more details.
|
||
|
*
|
||
|
* You should have received a copy of the GNU General Public License
|
||
|
* along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
|
||
|
*/
|
||
|
|
||
|
/**
|
||
|
* aligner_sw_sse.cpp
|
||
|
*
|
||
|
* Versions of key alignment functions that use vector instructions to
|
||
|
* accelerate dynamic programming. Based chiefly on the striped Smith-Waterman
|
||
|
* paper and implementation by Michael Farrar. See:
|
||
|
*
|
||
|
* Farrar M. Striped Smith-Waterman speeds database searches six times over
|
||
|
* other SIMD implementations. Bioinformatics. 2007 Jan 15;23(2):156-61.
|
||
|
* http://sites.google.com/site/farrarmichael/smith-waterman
|
||
|
*
|
||
|
* While the paper describes an implementation of Smith-Waterman, we extend it
|
||
|
* do end-to-end read alignment as well as local alignment. The change
|
||
|
* required for this is minor: we simply let vmax be the maximum element in the
|
||
|
* score domain rather than the minimum.
|
||
|
*
|
||
|
* The vectorized dynamic programming implementation lacks some features that
|
||
|
* make it hard to adapt to solving the entire dynamic-programming alignment
|
||
|
* problem. For instance:
|
||
|
*
|
||
|
* - It doesn't respect gap barriers on either end of the read
|
||
|
* - It just gives a maximum; not enough information to backtrace without
|
||
|
* redoing some alignment
|
||
|
* - It's a little difficult to handle st_ and en_, especially st_.
|
||
|
* - The query profile mechanism makes handling of ambiguous reference bases a
|
||
|
* little tricky (16 cols in query profile lookup table instead of 5)
|
||
|
*
|
||
|
* Given the drawbacks, it is tempting to use SSE dynamic programming as a
|
||
|
* filter rather than as an aligner per se. Here are a few ideas for how it
|
||
|
* can be extended to handle more of the alignment problem:
|
||
|
*
|
||
|
* - Save calculated scores to a big array as we go. We return to this array
|
||
|
* to find and backtrace from good solutions.
|
||
|
*/
|
||
|
|
||
|
#include <limits>
|
||
|
#include "aligner_sw.h"
|
||
|
|
||
|
// static const size_t NBYTES_PER_REG = 16;
|
||
|
static const size_t NWORDS_PER_REG = 16;
|
||
|
// static const size_t NBITS_PER_WORD = 8;
|
||
|
static const size_t NBYTES_PER_WORD = 1;
|
||
|
|
||
|
// In local mode, we start low (0) and go high (255). Factoring in a query
|
||
|
// profile involves unsigned saturating addition. All query profile elements
|
||
|
// should be expressed as a positive number; this is done by adding -min
|
||
|
// where min is the smallest (negative) score in the query profile.
|
||
|
|
||
|
typedef uint8_t TCScore;
|
||
|
|
||
|
/**
|
||
|
* Build query profile look up tables for the read. The query profile look
|
||
|
* up table is organized as a 1D array indexed by [i][j] where i is the
|
||
|
* reference character in the current DP column (0=A, 1=C, etc), and j is
|
||
|
* the segment of the query we're currently working on.
|
||
|
*/
|
||
|
void SwAligner::buildQueryProfileLocalSseU8(bool fw) {
|
||
|
bool& done = fw ? sseU8fwBuilt_ : sseU8rcBuilt_;
|
||
|
if(done) {
|
||
|
return;
|
||
|
}
|
||
|
done = true;
|
||
|
const BTDnaString* rd = fw ? rdfw_ : rdrc_;
|
||
|
const BTString* qu = fw ? qufw_ : qurc_;
|
||
|
const size_t len = rd->length();
|
||
|
const size_t seglen = (len + (NWORDS_PER_REG-1)) / NWORDS_PER_REG;
|
||
|
// How many __m128i's are needed
|
||
|
size_t n128s =
|
||
|
64 + // slack bytes, for alignment?
|
||
|
(seglen * ALPHA_SIZE) // query profile data
|
||
|
* 2; // & gap barrier data
|
||
|
assert_gt(n128s, 0);
|
||
|
SSEData& d = fw ? sseU8fw_ : sseU8rc_;
|
||
|
d.profbuf_.resizeNoCopy(n128s);
|
||
|
assert(!d.profbuf_.empty());
|
||
|
d.maxPen_ = d.maxBonus_ = 0;
|
||
|
d.lastIter_ = d.lastWord_ = 0;
|
||
|
d.qprofStride_ = d.gbarStride_ = 2;
|
||
|
d.bias_ = 0;
|
||
|
// Calculate bias
|
||
|
for(size_t refc = 0; refc < ALPHA_SIZE; refc++) {
|
||
|
for(size_t i = 0; i < len; i++) {
|
||
|
int readc = (*rd)[i];
|
||
|
int readq = (*qu)[i];
|
||
|
int sc = sc_->score(readc, (int)(1 << refc), readq - 33);
|
||
|
if(sc < 0 && sc < d.bias_) {
|
||
|
d.bias_ = sc;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
assert_leq(d.bias_, 0);
|
||
|
d.bias_ = -d.bias_;
|
||
|
// For each reference character A, C, G, T, N ...
|
||
|
for(size_t refc = 0; refc < ALPHA_SIZE; refc++) {
|
||
|
// For each segment ...
|
||
|
for(size_t i = 0; i < seglen; i++) {
|
||
|
size_t j = i;
|
||
|
uint8_t *qprofWords =
|
||
|
reinterpret_cast<uint8_t*>(d.profbuf_.ptr() + (refc * seglen * 2) + (i * 2));
|
||
|
uint8_t *gbarWords =
|
||
|
reinterpret_cast<uint8_t*>(d.profbuf_.ptr() + (refc * seglen * 2) + (i * 2) + 1);
|
||
|
// For each sub-word (byte) ...
|
||
|
for(size_t k = 0; k < NWORDS_PER_REG; k++) {
|
||
|
int sc = 0;
|
||
|
*gbarWords = 0;
|
||
|
if(j < len) {
|
||
|
int readc = (*rd)[j];
|
||
|
int readq = (*qu)[j];
|
||
|
sc = sc_->score(readc, (int)(1 << refc), readq - 33);
|
||
|
assert_range(0, 255, sc + d.bias_);
|
||
|
size_t j_from_end = len - j - 1;
|
||
|
if(j < (size_t)sc_->gapbar ||
|
||
|
j_from_end < (size_t)sc_->gapbar)
|
||
|
{
|
||
|
// Inside the gap barrier
|
||
|
*gbarWords = 0xff;
|
||
|
}
|
||
|
}
|
||
|
if(refc == 0 && j == len-1) {
|
||
|
// Remember which 128-bit word and which smaller word has
|
||
|
// the final row
|
||
|
d.lastIter_ = i;
|
||
|
d.lastWord_ = k;
|
||
|
}
|
||
|
if(sc < 0) {
|
||
|
if((size_t)(-sc) > d.maxPen_) {
|
||
|
d.maxPen_ = (size_t)(-sc);
|
||
|
}
|
||
|
} else {
|
||
|
if((size_t)sc > d.maxBonus_) {
|
||
|
d.maxBonus_ = (size_t)sc;
|
||
|
}
|
||
|
}
|
||
|
*qprofWords = (uint8_t)(sc + d.bias_);
|
||
|
gbarWords++;
|
||
|
qprofWords++;
|
||
|
j += seglen; // update offset into query
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
#ifndef NDEBUG
|
||
|
/**
|
||
|
* Return true iff the cell has sane E/F/H values w/r/t its predecessors.
|
||
|
*/
|
||
|
static bool cellOkLocalU8(
|
||
|
SSEData& d,
|
||
|
size_t row,
|
||
|
size_t col,
|
||
|
int refc,
|
||
|
int readc,
|
||
|
int readq,
|
||
|
const Scoring& sc) // scoring scheme
|
||
|
{
|
||
|
TCScore floorsc = 0;
|
||
|
TCScore ceilsc = 255 - d.bias_ - 1;
|
||
|
TAlScore offsetsc = 0;
|
||
|
TAlScore sc_h_cur = (TAlScore)d.mat_.helt(row, col);
|
||
|
TAlScore sc_e_cur = (TAlScore)d.mat_.eelt(row, col);
|
||
|
TAlScore sc_f_cur = (TAlScore)d.mat_.felt(row, col);
|
||
|
if(sc_h_cur > floorsc) {
|
||
|
sc_h_cur += offsetsc;
|
||
|
}
|
||
|
if(sc_e_cur > floorsc) {
|
||
|
sc_e_cur += offsetsc;
|
||
|
}
|
||
|
if(sc_f_cur > floorsc) {
|
||
|
sc_f_cur += offsetsc;
|
||
|
}
|
||
|
bool gapsAllowed = true;
|
||
|
size_t rowFromEnd = d.mat_.nrow() - row - 1;
|
||
|
if(row < (size_t)sc.gapbar || rowFromEnd < (size_t)sc.gapbar) {
|
||
|
gapsAllowed = false;
|
||
|
}
|
||
|
bool e_left_trans = false, h_left_trans = false;
|
||
|
bool f_up_trans = false, h_up_trans = false;
|
||
|
bool h_diag_trans = false;
|
||
|
if(gapsAllowed) {
|
||
|
TAlScore sc_h_left = floorsc;
|
||
|
TAlScore sc_e_left = floorsc;
|
||
|
TAlScore sc_h_up = floorsc;
|
||
|
TAlScore sc_f_up = floorsc;
|
||
|
if(col > 0 && sc_e_cur > floorsc && sc_e_cur <= ceilsc) {
|
||
|
sc_h_left = d.mat_.helt(row, col-1) + offsetsc;
|
||
|
sc_e_left = d.mat_.eelt(row, col-1) + offsetsc;
|
||
|
e_left_trans = (sc_e_left > floorsc && sc_e_cur == sc_e_left - sc.readGapExtend());
|
||
|
h_left_trans = (sc_h_left > floorsc && sc_e_cur == sc_h_left - sc.readGapOpen());
|
||
|
assert(e_left_trans || h_left_trans);
|
||
|
}
|
||
|
if(row > 0 && sc_f_cur > floorsc && sc_f_cur <= ceilsc) {
|
||
|
sc_h_up = d.mat_.helt(row-1, col) + offsetsc;
|
||
|
sc_f_up = d.mat_.felt(row-1, col) + offsetsc;
|
||
|
f_up_trans = (sc_f_up > floorsc && sc_f_cur == sc_f_up - sc.refGapExtend());
|
||
|
h_up_trans = (sc_h_up > floorsc && sc_f_cur == sc_h_up - sc.refGapOpen());
|
||
|
assert(f_up_trans || h_up_trans);
|
||
|
}
|
||
|
} else {
|
||
|
assert_geq(floorsc, sc_e_cur);
|
||
|
assert_geq(floorsc, sc_f_cur);
|
||
|
}
|
||
|
if(col > 0 && row > 0 && sc_h_cur > floorsc && sc_h_cur <= ceilsc) {
|
||
|
TAlScore sc_h_upleft = d.mat_.helt(row-1, col-1) + offsetsc;
|
||
|
TAlScore sc_diag = sc.score(readc, (int)refc, readq - 33);
|
||
|
h_diag_trans = sc_h_cur == sc_h_upleft + sc_diag;
|
||
|
}
|
||
|
assert(
|
||
|
sc_h_cur <= floorsc ||
|
||
|
e_left_trans ||
|
||
|
h_left_trans ||
|
||
|
f_up_trans ||
|
||
|
h_up_trans ||
|
||
|
h_diag_trans ||
|
||
|
sc_h_cur > ceilsc ||
|
||
|
row == 0 ||
|
||
|
col == 0);
|
||
|
return true;
|
||
|
}
|
||
|
#endif /*ndef NDEBUG*/
|
||
|
|
||
|
#ifdef NDEBUG
|
||
|
|
||
|
#define assert_all_eq0(x)
|
||
|
#define assert_all_gt(x, y)
|
||
|
#define assert_all_gt_lo(x)
|
||
|
#define assert_all_lt(x, y)
|
||
|
#define assert_all_lt_hi(x)
|
||
|
|
||
|
#else
|
||
|
|
||
|
#define assert_all_eq0(x) { \
|
||
|
__m128i z = _mm_setzero_si128(); \
|
||
|
__m128i tmp = _mm_setzero_si128(); \
|
||
|
z = _mm_xor_si128(z, z); \
|
||
|
tmp = _mm_cmpeq_epi16(x, z); \
|
||
|
assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
|
||
|
}
|
||
|
|
||
|
#define assert_all_gt(x, y) { \
|
||
|
__m128i tmp = _mm_cmpgt_epu8(x, y); \
|
||
|
assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
|
||
|
}
|
||
|
|
||
|
#define assert_all_gt_lo(x) { \
|
||
|
__m128i z = _mm_setzero_si128(); \
|
||
|
__m128i tmp = _mm_setzero_si128(); \
|
||
|
z = _mm_xor_si128(z, z); \
|
||
|
tmp = _mm_cmpgt_epu8(x, z); \
|
||
|
assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
|
||
|
}
|
||
|
|
||
|
#define assert_all_lt(x, y) { \
|
||
|
__m128i z = _mm_setzero_si128(); \
|
||
|
z = _mm_xor_si128(z, z); \
|
||
|
__m128i tmp = _mm_subs_epu8(y, x); \
|
||
|
tmp = _mm_cmpeq_epi16(tmp, z); \
|
||
|
assert_eq(0x0000, _mm_movemask_epi8(tmp)); \
|
||
|
}
|
||
|
|
||
|
#define assert_all_lt_hi(x) { \
|
||
|
__m128i z = _mm_setzero_si128(); \
|
||
|
__m128i tmp = _mm_setzero_si128(); \
|
||
|
z = _mm_cmpeq_epu8(z, z); \
|
||
|
z = _mm_srli_epu8(z, 1); \
|
||
|
tmp = _mm_cmplt_epu8(x, z); \
|
||
|
assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
/**
|
||
|
* Aligns by filling a dynamic programming matrix with the SSE-accelerated,
|
||
|
* banded DP approach of Farrar. As it goes, it determines which cells we
|
||
|
* might backtrace from and tallies the best (highest-scoring) N backtrace
|
||
|
* candidate cells per diagonal. Also returns the alignment score of the best
|
||
|
* alignment in the matrix.
|
||
|
*
|
||
|
* This routine does *not* maintain a matrix holding the entire matrix worth of
|
||
|
* scores, nor does it maintain any other dense O(mn) data structure, as this
|
||
|
* would quickly exhaust memory for queries longer than about 10,000 kb.
|
||
|
* Instead, in the fill stage it maintains two columns worth of scores at a
|
||
|
* time (current/previous, or right/left) - these take O(m) space. When
|
||
|
* finished with the current column, it determines which cells from the
|
||
|
* previous column, if any, are candidates we might backtrace from to find a
|
||
|
* full alignment. A candidate cell has a score that rises above the threshold
|
||
|
* and isn't improved upon by a match in the next column. The best N
|
||
|
* candidates per diagonal are stored in a O(m + n) data structure.
|
||
|
*/
|
||
|
TAlScore SwAligner::alignGatherLoc8(int& flag, bool debug) {
|
||
|
assert_leq(rdf_, rd_->length());
|
||
|
assert_leq(rdf_, qu_->length());
|
||
|
assert_lt(rfi_, rff_);
|
||
|
assert_lt(rdi_, rdf_);
|
||
|
assert_eq(rd_->length(), qu_->length());
|
||
|
assert_geq(sc_->gapbar, 1);
|
||
|
assert_gt(minsc_, 0);
|
||
|
assert(repOk());
|
||
|
#ifndef NDEBUG
|
||
|
for(size_t i = (size_t)rfi_; i < (size_t)rff_; i++) {
|
||
|
assert_range(0, 16, (int)rf_[i]);
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
SSEData& d = fw_ ? sseU8fw_ : sseU8rc_;
|
||
|
SSEMetrics& met = extend_ ? sseU8ExtendMet_ : sseU8MateMet_;
|
||
|
if(!debug) met.dp++;
|
||
|
buildQueryProfileLocalSseU8(fw_);
|
||
|
assert(!d.profbuf_.empty());
|
||
|
assert_gt(d.bias_, 0);
|
||
|
assert_lt(d.bias_, 127);
|
||
|
|
||
|
assert_gt(d.maxBonus_, 0);
|
||
|
size_t iter =
|
||
|
(dpRows() + (NWORDS_PER_REG-1)) / NWORDS_PER_REG; // iter = segLen
|
||
|
|
||
|
// Now set up the score vectors. We just need two columns worth, which
|
||
|
// we'll call "left" and "right".
|
||
|
d.vecbuf_.resize(ROWSTRIDE_2COL * iter * 2);
|
||
|
d.vecbuf_.zero();
|
||
|
__m128i *vbuf_l = d.vecbuf_.ptr();
|
||
|
__m128i *vbuf_r = d.vecbuf_.ptr() + (ROWSTRIDE_2COL * iter);
|
||
|
|
||
|
// This is the data structure that holds candidate cells per diagonal.
|
||
|
const size_t ndiags = rff_ - rfi_ + dpRows() - 1;
|
||
|
if(!debug) {
|
||
|
btdiag_.init(ndiags, 2);
|
||
|
}
|
||
|
|
||
|
// Data structure that holds checkpointed anti-diagonals
|
||
|
TAlScore perfectScore = sc_->perfectScore(dpRows());
|
||
|
bool checkpoint = true;
|
||
|
bool cpdebug = false;
|
||
|
#ifndef NDEBUG
|
||
|
cpdebug = dpRows() < 1000;
|
||
|
#endif
|
||
|
cper_.init(
|
||
|
dpRows(), // # rows
|
||
|
rff_ - rfi_, // # columns
|
||
|
cperPerPow2_, // checkpoint every 1 << perpow2 diags (& next)
|
||
|
perfectScore, // perfect score (for sanity checks)
|
||
|
true, // matrix cells have 8-bit scores?
|
||
|
cperTri_, // triangular mini-fills?
|
||
|
true, // alignment is local?
|
||
|
cpdebug); // save all cells for debugging?
|
||
|
|
||
|
// Many thanks to Michael Farrar for releasing his striped Smith-Waterman
|
||
|
// implementation:
|
||
|
//
|
||
|
// http://sites.google.com/site/farrarmichael/smith-waterman
|
||
|
//
|
||
|
// Much of the implmentation below is adapted from Michael's code.
|
||
|
|
||
|
// Set all elts to reference gap open penalty
|
||
|
__m128i rfgapo = _mm_setzero_si128();
|
||
|
__m128i rfgape = _mm_setzero_si128();
|
||
|
__m128i rdgapo = _mm_setzero_si128();
|
||
|
__m128i rdgape = _mm_setzero_si128();
|
||
|
__m128i vlo = _mm_setzero_si128();
|
||
|
__m128i vhi = _mm_setzero_si128();
|
||
|
__m128i vmax = _mm_setzero_si128();
|
||
|
__m128i vcolmax = _mm_setzero_si128();
|
||
|
__m128i vmaxtmp = _mm_setzero_si128();
|
||
|
__m128i ve = _mm_setzero_si128();
|
||
|
__m128i vf = _mm_setzero_si128();
|
||
|
__m128i vh = _mm_setzero_si128();
|
||
|
__m128i vhd = _mm_setzero_si128();
|
||
|
__m128i vhdtmp = _mm_setzero_si128();
|
||
|
__m128i vtmp = _mm_setzero_si128();
|
||
|
__m128i vzero = _mm_setzero_si128();
|
||
|
__m128i vbias = _mm_setzero_si128();
|
||
|
__m128i vbiasm1 = _mm_setzero_si128();
|
||
|
__m128i vminsc = _mm_setzero_si128();
|
||
|
|
||
|
int dup;
|
||
|
|
||
|
assert_gt(sc_->refGapOpen(), 0);
|
||
|
assert_leq(sc_->refGapOpen(), MAX_U8);
|
||
|
dup = (sc_->refGapOpen() << 8) | (sc_->refGapOpen() & 0x00ff);
|
||
|
rfgapo = _mm_insert_epi16(rfgapo, dup, 0);
|
||
|
rfgapo = _mm_shufflelo_epi16(rfgapo, 0);
|
||
|
rfgapo = _mm_shuffle_epi32(rfgapo, 0);
|
||
|
|
||
|
// Set all elts to reference gap extension penalty
|
||
|
assert_gt(sc_->refGapExtend(), 0);
|
||
|
assert_leq(sc_->refGapExtend(), MAX_U8);
|
||
|
assert_leq(sc_->refGapExtend(), sc_->refGapOpen());
|
||
|
dup = (sc_->refGapExtend() << 8) | (sc_->refGapExtend() & 0x00ff);
|
||
|
rfgape = _mm_insert_epi16(rfgape, dup, 0);
|
||
|
rfgape = _mm_shufflelo_epi16(rfgape, 0);
|
||
|
rfgape = _mm_shuffle_epi32(rfgape, 0);
|
||
|
|
||
|
// Set all elts to read gap open penalty
|
||
|
assert_gt(sc_->readGapOpen(), 0);
|
||
|
assert_leq(sc_->readGapOpen(), MAX_U8);
|
||
|
dup = (sc_->readGapOpen() << 8) | (sc_->readGapOpen() & 0x00ff);
|
||
|
rdgapo = _mm_insert_epi16(rdgapo, dup, 0);
|
||
|
rdgapo = _mm_shufflelo_epi16(rdgapo, 0);
|
||
|
rdgapo = _mm_shuffle_epi32(rdgapo, 0);
|
||
|
|
||
|
// Set all elts to read gap extension penalty
|
||
|
assert_gt(sc_->readGapExtend(), 0);
|
||
|
assert_leq(sc_->readGapExtend(), MAX_U8);
|
||
|
assert_leq(sc_->readGapExtend(), sc_->readGapOpen());
|
||
|
dup = (sc_->readGapExtend() << 8) | (sc_->readGapExtend() & 0x00ff);
|
||
|
rdgape = _mm_insert_epi16(rdgape, dup, 0);
|
||
|
rdgape = _mm_shufflelo_epi16(rdgape, 0);
|
||
|
rdgape = _mm_shuffle_epi32(rdgape, 0);
|
||
|
|
||
|
// Set all elts to minimum score threshold. Actually, to 1 less than the
|
||
|
// threshold so we can use gt instead of geq.
|
||
|
dup = (((int)minsc_ - 1) << 8) | (((int)minsc_ - 1) & 0x00ff);
|
||
|
vminsc = _mm_insert_epi16(vminsc, dup, 0);
|
||
|
vminsc = _mm_shufflelo_epi16(vminsc, 0);
|
||
|
vminsc = _mm_shuffle_epi32(vminsc, 0);
|
||
|
|
||
|
dup = ((d.bias_ - 1) << 8) | ((d.bias_ - 1) & 0x00ff);
|
||
|
vbiasm1 = _mm_insert_epi16(vbiasm1, dup, 0);
|
||
|
vbiasm1 = _mm_shufflelo_epi16(vbiasm1, 0);
|
||
|
vbiasm1 = _mm_shuffle_epi32(vbiasm1, 0);
|
||
|
vhi = _mm_cmpeq_epi16(vhi, vhi); // all elts = 0xffff
|
||
|
vlo = _mm_xor_si128(vlo, vlo); // all elts = 0
|
||
|
vmax = vlo;
|
||
|
|
||
|
// Make a vector of bias offsets
|
||
|
dup = (d.bias_ << 8) | (d.bias_ & 0x00ff);
|
||
|
vbias = _mm_insert_epi16(vbias, dup, 0);
|
||
|
vbias = _mm_shufflelo_epi16(vbias, 0);
|
||
|
vbias = _mm_shuffle_epi32(vbias, 0);
|
||
|
|
||
|
// Points to a long vector of __m128i where each element is a block of
|
||
|
// contiguous cells in the E, F or H matrix. If the index % 3 == 0, then
|
||
|
// the block of cells is from the E matrix. If index % 3 == 1, they're
|
||
|
// from the F matrix. If index % 3 == 2, then they're from the H matrix.
|
||
|
// Blocks of cells are organized in the same interleaved manner as they are
|
||
|
// calculated by the Farrar algorithm.
|
||
|
const __m128i *pvScore; // points into the query profile
|
||
|
|
||
|
const size_t colstride = ROWSTRIDE_2COL * iter;
|
||
|
|
||
|
// Initialize the H and E vectors in the first matrix column
|
||
|
__m128i *pvELeft = vbuf_l + 0; __m128i *pvERight = vbuf_r + 0;
|
||
|
/* __m128i *pvFLeft = vbuf_l + 1; */ __m128i *pvFRight = vbuf_r + 1;
|
||
|
__m128i *pvHLeft = vbuf_l + 2; __m128i *pvHRight = vbuf_r + 2;
|
||
|
|
||
|
for(size_t i = 0; i < iter; i++) {
|
||
|
// start low in local mode
|
||
|
_mm_store_si128(pvERight, vlo); pvERight += ROWSTRIDE_2COL;
|
||
|
_mm_store_si128(pvHRight, vlo); pvHRight += ROWSTRIDE_2COL;
|
||
|
}
|
||
|
|
||
|
assert_gt(sc_->gapbar, 0);
|
||
|
size_t nfixup = 0;
|
||
|
TAlScore matchsc = sc_->match(30);
|
||
|
TAlScore leftmax = MIN_I64;
|
||
|
|
||
|
// Fill in the table as usual but instead of using the same gap-penalty
|
||
|
// vector for each iteration of the inner loop, load words out of a
|
||
|
// pre-calculated gap vector parallel to the query profile. The pre-
|
||
|
// calculated gap vectors enforce the gap barrier constraint by making it
|
||
|
// infinitely costly to introduce a gap in barrier rows.
|
||
|
//
|
||
|
// AND use a separate loop to fill in the first row of the table, enforcing
|
||
|
// the st_ constraints in the process. This is awkward because it
|
||
|
// separates the processing of the first row from the others and might make
|
||
|
// it difficult to use the first-row results in the next row, but it might
|
||
|
// be the simplest and least disruptive way to deal with the st_ constraint.
|
||
|
|
||
|
size_t off = MAX_SIZE_T, lastoff;
|
||
|
bool bailed = false;
|
||
|
for(size_t i = (size_t)rfi_; i < (size_t)rff_; i++) {
|
||
|
// Swap left and right; vbuf_l is the vector on the left, which we
|
||
|
// generally load from, and vbuf_r is the vector on the right, which we
|
||
|
// generally store to.
|
||
|
swap(vbuf_l, vbuf_r);
|
||
|
pvELeft = vbuf_l + 0; pvERight = vbuf_r + 0;
|
||
|
/* pvFLeft = vbuf_l + 1; */ pvFRight = vbuf_r + 1;
|
||
|
pvHLeft = vbuf_l + 2; pvHRight = vbuf_r + 2;
|
||
|
|
||
|
// Fetch this column's reference mask
|
||
|
const int refm = (int)rf_[i];
|
||
|
|
||
|
// Fetch the appropriate query profile
|
||
|
lastoff = off;
|
||
|
off = (size_t)firsts5[refm] * iter * 2;
|
||
|
pvScore = d.profbuf_.ptr() + off; // even elts = query profile, odd = gap barrier
|
||
|
|
||
|
// Load H vector from the final row of the previous column.
|
||
|
// ??? perhaps we should calculate the next iter's F instead of the
|
||
|
// current iter's? The way we currently do it, seems like it will
|
||
|
// almost always require at least one fixup loop iter (to recalculate
|
||
|
// this topmost F).
|
||
|
vh = _mm_load_si128(pvHLeft + colstride - ROWSTRIDE_2COL);
|
||
|
|
||
|
// Set all cells to low value
|
||
|
vf = _mm_xor_si128(vf, vf);
|
||
|
// vf now contains the vertical contribution
|
||
|
|
||
|
// Store cells in F, calculated previously
|
||
|
// No need to veto ref gap extensions, they're all 0x00s
|
||
|
_mm_store_si128(pvFRight, vf);
|
||
|
pvFRight += ROWSTRIDE_2COL;
|
||
|
|
||
|
// Shift down so that topmost (least sig) cell gets 0
|
||
|
vh = _mm_slli_si128(vh, NBYTES_PER_WORD);
|
||
|
|
||
|
// We pull out one loop iteration to make it easier to veto values in the top row
|
||
|
|
||
|
// Load cells from E, calculated previously
|
||
|
ve = _mm_load_si128(pvELeft);
|
||
|
vhd = _mm_load_si128(pvHLeft);
|
||
|
assert_all_lt(ve, vhi);
|
||
|
pvELeft += ROWSTRIDE_2COL;
|
||
|
// ve now contains the horizontal contribution
|
||
|
|
||
|
// Factor in query profile (matches and mismatches)
|
||
|
vh = _mm_adds_epu8(vh, pvScore[0]);
|
||
|
vh = _mm_subs_epu8(vh, vbias);
|
||
|
// vh now contains the diagonal contribution
|
||
|
|
||
|
vhdtmp = vhd;
|
||
|
vhd = _mm_subs_epu8(vhd, rdgapo);
|
||
|
vhd = _mm_subs_epu8(vhd, pvScore[1]); // veto some read gap opens
|
||
|
ve = _mm_subs_epu8(ve, rdgape);
|
||
|
ve = _mm_max_epu8(ve, vhd);
|
||
|
|
||
|
vh = _mm_max_epu8(vh, ve);
|
||
|
vf = vh;
|
||
|
|
||
|
// Update highest score so far
|
||
|
vcolmax = vh;
|
||
|
|
||
|
// Save the new vH values
|
||
|
_mm_store_si128(pvHRight, vh);
|
||
|
|
||
|
vh = vhdtmp;
|
||
|
assert_all_lt(ve, vhi);
|
||
|
pvHRight += ROWSTRIDE_2COL;
|
||
|
pvHLeft += ROWSTRIDE_2COL;
|
||
|
|
||
|
// Save E values
|
||
|
_mm_store_si128(pvERight, ve);
|
||
|
pvERight += ROWSTRIDE_2COL;
|
||
|
|
||
|
// Update vf value
|
||
|
vf = _mm_subs_epu8(vf, rfgapo);
|
||
|
assert_all_lt(vf, vhi);
|
||
|
|
||
|
pvScore += 2; // move on to next query profile
|
||
|
|
||
|
// For each character in the reference text:
|
||
|
size_t j;
|
||
|
for(j = 1; j < iter; j++) {
|
||
|
// Load cells from E, calculated previously
|
||
|
ve = _mm_load_si128(pvELeft);
|
||
|
vhd = _mm_load_si128(pvHLeft);
|
||
|
assert_all_lt(ve, vhi);
|
||
|
pvELeft += ROWSTRIDE_2COL;
|
||
|
|
||
|
// Store cells in F, calculated previously
|
||
|
vf = _mm_subs_epu8(vf, pvScore[1]); // veto some ref gap extensions
|
||
|
_mm_store_si128(pvFRight, vf);
|
||
|
pvFRight += ROWSTRIDE_2COL;
|
||
|
|
||
|
// Factor in query profile (matches and mismatches)
|
||
|
vh = _mm_adds_epu8(vh, pvScore[0]);
|
||
|
vh = _mm_subs_epu8(vh, vbias);
|
||
|
|
||
|
// Update H, factoring in E and F
|
||
|
vh = _mm_max_epu8(vh, vf);
|
||
|
|
||
|
vhdtmp = vhd;
|
||
|
vhd = _mm_subs_epu8(vhd, rdgapo);
|
||
|
vhd = _mm_subs_epu8(vhd, pvScore[1]); // veto some read gap opens
|
||
|
ve = _mm_subs_epu8(ve, rdgape);
|
||
|
ve = _mm_max_epu8(ve, vhd);
|
||
|
|
||
|
vh = _mm_max_epu8(vh, ve);
|
||
|
vtmp = vh;
|
||
|
|
||
|
// Update highest score encountered this far
|
||
|
vcolmax = _mm_max_epu8(vcolmax, vh);
|
||
|
|
||
|
// Save the new vH values
|
||
|
_mm_store_si128(pvHRight, vh);
|
||
|
|
||
|
vh = vhdtmp;
|
||
|
|
||
|
assert_all_lt(ve, vhi);
|
||
|
pvHRight += ROWSTRIDE_2COL;
|
||
|
pvHLeft += ROWSTRIDE_2COL;
|
||
|
|
||
|
// Save E values
|
||
|
_mm_store_si128(pvERight, ve);
|
||
|
pvERight += ROWSTRIDE_2COL;
|
||
|
|
||
|
// Update vf value
|
||
|
vtmp = _mm_subs_epu8(vtmp, rfgapo);
|
||
|
vf = _mm_subs_epu8(vf, rfgape);
|
||
|
assert_all_lt(vf, vhi);
|
||
|
vf = _mm_max_epu8(vf, vtmp);
|
||
|
|
||
|
pvScore += 2; // move on to next query profile / gap veto
|
||
|
}
|
||
|
// pvHStore, pvELoad, pvEStore have all rolled over to the next column
|
||
|
pvFRight -= colstride; // reset to start of column
|
||
|
vtmp = _mm_load_si128(pvFRight);
|
||
|
|
||
|
pvHRight -= colstride; // reset to start of column
|
||
|
vh = _mm_load_si128(pvHRight);
|
||
|
|
||
|
pvScore = d.profbuf_.ptr() + off + 1; // reset veto vector
|
||
|
|
||
|
// vf from last row gets shifted down by one to overlay the first row
|
||
|
// rfgape has already been subtracted from it.
|
||
|
vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
|
||
|
|
||
|
vf = _mm_subs_epu8(vf, *pvScore); // veto some ref gap extensions
|
||
|
vf = _mm_max_epu8(vtmp, vf);
|
||
|
// TODO: We're testing whether F changed. Can't we just assume that F
|
||
|
// did change and instead check whether H changed? Might save us from
|
||
|
// entering the fixup loop.
|
||
|
vtmp = _mm_subs_epu8(vf, vtmp);
|
||
|
vtmp = _mm_cmpeq_epi8(vtmp, vzero);
|
||
|
int cmp = _mm_movemask_epi8(vtmp);
|
||
|
|
||
|
// If any element of vtmp is greater than H - gap-open...
|
||
|
j = 0;
|
||
|
while(cmp != 0xffff) {
|
||
|
// Store this vf
|
||
|
_mm_store_si128(pvFRight, vf);
|
||
|
pvFRight += ROWSTRIDE_2COL;
|
||
|
|
||
|
// Update vh w/r/t new vf
|
||
|
vh = _mm_max_epu8(vh, vf);
|
||
|
|
||
|
// Save vH values
|
||
|
_mm_store_si128(pvHRight, vh);
|
||
|
pvHRight += ROWSTRIDE_2COL;
|
||
|
|
||
|
// Update highest score encountered so far.
|
||
|
vcolmax = _mm_max_epu8(vcolmax, vh);
|
||
|
|
||
|
pvScore += 2;
|
||
|
|
||
|
assert_lt(j, iter);
|
||
|
if(++j == iter) {
|
||
|
pvFRight -= colstride;
|
||
|
vtmp = _mm_load_si128(pvFRight); // load next vf ASAP
|
||
|
pvHRight -= colstride;
|
||
|
vh = _mm_load_si128(pvHRight); // load next vh ASAP
|
||
|
pvScore = d.profbuf_.ptr() + off + 1;
|
||
|
j = 0;
|
||
|
vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
|
||
|
} else {
|
||
|
vtmp = _mm_load_si128(pvFRight); // load next vf ASAP
|
||
|
vh = _mm_load_si128(pvHRight); // load next vh ASAP
|
||
|
}
|
||
|
|
||
|
// Update F with another gap extension
|
||
|
vf = _mm_subs_epu8(vf, rfgape);
|
||
|
vf = _mm_subs_epu8(vf, *pvScore); // veto some ref gap extensions
|
||
|
vf = _mm_max_epu8(vtmp, vf);
|
||
|
vtmp = _mm_subs_epu8(vf, vtmp);
|
||
|
vtmp = _mm_cmpeq_epi8(vtmp, vzero);
|
||
|
cmp = _mm_movemask_epi8(vtmp);
|
||
|
nfixup++;
|
||
|
}
|
||
|
|
||
|
// Now we'd like to know exactly which cells in the left column are
|
||
|
// candidates we might backtrace from. First question is: did *any*
|
||
|
// elements in the column exceed the minimum score threshold?
|
||
|
if(!debug && leftmax >= minsc_) {
|
||
|
// Yes. Next question is: which cells are candidates? We have to
|
||
|
// allow matches in the right column to override matches above and
|
||
|
// to the left in the left column.
|
||
|
assert_gt(i - rfi_, 0);
|
||
|
pvHLeft = vbuf_l + 2;
|
||
|
assert_lt(lastoff, MAX_SIZE_T);
|
||
|
pvScore = d.profbuf_.ptr() + lastoff; // even elts = query profile, odd = gap barrier
|
||
|
for(size_t k = 0; k < iter; k++) {
|
||
|
vh = _mm_load_si128(pvHLeft);
|
||
|
vtmp = _mm_cmpgt_epi8(pvScore[0], vbiasm1);
|
||
|
int cmp = _mm_movemask_epi8(vtmp);
|
||
|
if(cmp != 0xffff) {
|
||
|
// At least one candidate in this mask. Now iterate
|
||
|
// through vm/vh to evaluate individual cells.
|
||
|
for(size_t m = 0; m < NWORDS_PER_REG; m++) {
|
||
|
size_t row = k + m * iter;
|
||
|
if(row >= dpRows()) {
|
||
|
break;
|
||
|
}
|
||
|
if(((TCScore *)&vtmp)[m] > 0 && ((TCScore *)&vh)[m] >= minsc_) {
|
||
|
TCScore sc = ((TCScore *)&vh)[m];
|
||
|
assert_geq(sc, minsc_);
|
||
|
// Add to data structure holding all candidates
|
||
|
size_t col = i - rfi_ - 1; // -1 b/c prev col
|
||
|
size_t frombot = dpRows() - row - 1;
|
||
|
DpBtCandidate cand(row, col, sc);
|
||
|
btdiag_.add(frombot + col, cand);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
pvHLeft += ROWSTRIDE_2COL;
|
||
|
pvScore += 2;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Save some elements to checkpoints
|
||
|
if(checkpoint) {
|
||
|
|
||
|
__m128i *pvE = vbuf_r + 0;
|
||
|
__m128i *pvF = vbuf_r + 1;
|
||
|
__m128i *pvH = vbuf_r + 2;
|
||
|
size_t coli = i - rfi_;
|
||
|
if(coli < cper_.locol_) cper_.locol_ = coli;
|
||
|
if(coli > cper_.hicol_) cper_.hicol_ = coli;
|
||
|
if(cperTri_) {
|
||
|
// Checkpoint for triangular mini-fills
|
||
|
size_t rc_mod = coli & cper_.lomask_;
|
||
|
assert_lt(rc_mod, cper_.per_);
|
||
|
int64_t row = -(int64_t)rc_mod-1;
|
||
|
int64_t row_mod = row;
|
||
|
int64_t row_div = 0;
|
||
|
size_t idx = coli >> cper_.perpow2_;
|
||
|
size_t idxrow = idx * cper_.nrow_;
|
||
|
assert_eq(4, ROWSTRIDE_2COL);
|
||
|
bool done = false;
|
||
|
while(true) {
|
||
|
row += (cper_.per_ - 2);
|
||
|
row_mod += (cper_.per_ - 2);
|
||
|
for(size_t j = 0; j < 2; j++) {
|
||
|
row++;
|
||
|
row_mod++;
|
||
|
if(row >= 0 && (size_t)row < cper_.nrow_) {
|
||
|
// Update row divided by iter_ and mod iter_
|
||
|
while(row_mod >= (int64_t)iter) {
|
||
|
row_mod -= (int64_t)iter;
|
||
|
row_div++;
|
||
|
}
|
||
|
size_t delt = idxrow + row;
|
||
|
size_t vecoff = (row_mod << 6) + row_div;
|
||
|
assert_lt(row_div, 16);
|
||
|
int16_t h_sc = ((uint8_t*)pvH)[vecoff];
|
||
|
int16_t e_sc = ((uint8_t*)pvE)[vecoff];
|
||
|
int16_t f_sc = ((uint8_t*)pvF)[vecoff];
|
||
|
assert_leq(h_sc, cper_.perf_);
|
||
|
assert_leq(e_sc, cper_.perf_);
|
||
|
assert_leq(f_sc, cper_.perf_);
|
||
|
CpQuad *qdiags = ((j == 0) ? cper_.qdiag1s_.ptr() : cper_.qdiag2s_.ptr());
|
||
|
qdiags[delt].sc[0] = h_sc;
|
||
|
qdiags[delt].sc[1] = e_sc;
|
||
|
qdiags[delt].sc[2] = f_sc;
|
||
|
} // if(row >= 0 && row < nrow_)
|
||
|
else if(row >= 0 && (size_t)row >= cper_.nrow_) {
|
||
|
done = true;
|
||
|
break;
|
||
|
}
|
||
|
} // for(size_t j = 0; j < 2; j++)
|
||
|
if(done) {
|
||
|
break;
|
||
|
}
|
||
|
idx++;
|
||
|
idxrow += cper_.nrow_;
|
||
|
} // while(true)
|
||
|
} else {
|
||
|
// Checkpoint for square mini-fills
|
||
|
|
||
|
// If this is the first column, take this opportunity to
|
||
|
// pre-calculate the coordinates of the elements we're going to
|
||
|
// checkpoint.
|
||
|
if(coli == 0) {
|
||
|
size_t cpi = cper_.per_-1;
|
||
|
size_t cpimod = cper_.per_-1;
|
||
|
size_t cpidiv = 0;
|
||
|
cper_.commitMap_.clear();
|
||
|
while(cpi < cper_.nrow_) {
|
||
|
while(cpimod >= iter) {
|
||
|
cpimod -= iter;
|
||
|
cpidiv++;
|
||
|
}
|
||
|
size_t vecoff = (cpimod << 6) + cpidiv;
|
||
|
cper_.commitMap_.push_back(vecoff);
|
||
|
cpi += cper_.per_;
|
||
|
cpimod += cper_.per_;
|
||
|
}
|
||
|
}
|
||
|
// Save all the rows
|
||
|
size_t rowoff = 0;
|
||
|
size_t sz = cper_.commitMap_.size();
|
||
|
for(size_t i = 0; i < sz; i++, rowoff += cper_.ncol_) {
|
||
|
size_t vecoff = cper_.commitMap_[i];
|
||
|
int16_t h_sc = ((uint8_t*)pvH)[vecoff];
|
||
|
//int16_t e_sc = ((uint8_t*)pvE)[vecoff];
|
||
|
int16_t f_sc = ((uint8_t*)pvF)[vecoff];
|
||
|
assert_leq(h_sc, cper_.perf_);
|
||
|
//assert_leq(e_sc, cper_.perf_);
|
||
|
assert_leq(f_sc, cper_.perf_);
|
||
|
CpQuad& dst = cper_.qrows_[rowoff + coli];
|
||
|
dst.sc[0] = h_sc;
|
||
|
//dst.sc[1] = e_sc;
|
||
|
dst.sc[2] = f_sc;
|
||
|
}
|
||
|
// Is this a column we'd like to checkpoint?
|
||
|
if((coli & cper_.lomask_) == cper_.lomask_) {
|
||
|
// Save the column using memcpys
|
||
|
assert_gt(coli, 0);
|
||
|
size_t wordspercol = cper_.niter_ * ROWSTRIDE_2COL;
|
||
|
size_t coloff = (coli >> cper_.perpow2_) * wordspercol;
|
||
|
__m128i *dst = cper_.qcols_.ptr() + coloff;
|
||
|
memcpy(dst, vbuf_r, sizeof(__m128i) * wordspercol);
|
||
|
}
|
||
|
}
|
||
|
if(cper_.debug_) {
|
||
|
// Save the column using memcpys
|
||
|
size_t wordspercol = cper_.niter_ * ROWSTRIDE_2COL;
|
||
|
size_t coloff = coli * wordspercol;
|
||
|
__m128i *dst = cper_.qcolsD_.ptr() + coloff;
|
||
|
memcpy(dst, vbuf_r, sizeof(__m128i) * wordspercol);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Store column maximum vector in first element of tmp
|
||
|
vmax = _mm_max_epu8(vmax, vcolmax);
|
||
|
|
||
|
{
|
||
|
// Get single largest score in this column
|
||
|
vmaxtmp = vcolmax;
|
||
|
vtmp = _mm_srli_si128(vmaxtmp, 8);
|
||
|
vmaxtmp = _mm_max_epu8(vmaxtmp, vtmp);
|
||
|
vtmp = _mm_srli_si128(vmaxtmp, 4);
|
||
|
vmaxtmp = _mm_max_epu8(vmaxtmp, vtmp);
|
||
|
vtmp = _mm_srli_si128(vmaxtmp, 2);
|
||
|
vmaxtmp = _mm_max_epu8(vmaxtmp, vtmp);
|
||
|
vtmp = _mm_srli_si128(vmaxtmp, 1);
|
||
|
vmaxtmp = _mm_max_epu8(vmaxtmp, vtmp);
|
||
|
int score = _mm_extract_epi16(vmaxtmp, 0);
|
||
|
score = score & 0x00ff;
|
||
|
|
||
|
// Could we have saturated?
|
||
|
if(score + d.bias_ >= 255) {
|
||
|
flag = -2; // yes
|
||
|
if(!debug) met.dpsat++;
|
||
|
return MIN_I64;
|
||
|
}
|
||
|
|
||
|
if(score < minsc_) {
|
||
|
size_t ncolleft = rff_ - i - 1;
|
||
|
if(score + (TAlScore)ncolleft * matchsc < minsc_) {
|
||
|
// Bail! There can't possibly be a valid alignment that
|
||
|
// passes through this column.
|
||
|
bailed = true;
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
leftmax = score;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
lastoff = off;
|
||
|
|
||
|
// Now we'd like to know exactly which cells in the *rightmost* column are
|
||
|
// candidates we might backtrace from. Did *any* elements exceed the
|
||
|
// minimum score threshold?
|
||
|
if(!debug && !bailed && leftmax >= minsc_) {
|
||
|
// Yes. Next question is: which cells are candidates? We have to
|
||
|
// allow matches in the right column to override matches above and
|
||
|
// to the left in the left column.
|
||
|
pvHLeft = vbuf_r + 2;
|
||
|
assert_lt(lastoff, MAX_SIZE_T);
|
||
|
pvScore = d.profbuf_.ptr() + lastoff; // even elts = query profile, odd = gap barrier
|
||
|
for(size_t k = 0; k < iter; k++) {
|
||
|
vh = _mm_load_si128(pvHLeft);
|
||
|
vtmp = _mm_cmpgt_epi8(pvScore[0], vbiasm1);
|
||
|
int cmp = _mm_movemask_epi8(vtmp);
|
||
|
if(cmp != 0xffff) {
|
||
|
// At least one candidate in this mask. Now iterate
|
||
|
// through vm/vh to evaluate individual cells.
|
||
|
for(size_t m = 0; m < NWORDS_PER_REG; m++) {
|
||
|
size_t row = k + m * iter;
|
||
|
if(row >= dpRows()) {
|
||
|
break;
|
||
|
}
|
||
|
if(((TCScore *)&vtmp)[m] > 0 && ((TCScore *)&vh)[m] >= minsc_) {
|
||
|
TCScore sc = ((TCScore *)&vh)[m];
|
||
|
assert_geq(sc, minsc_);
|
||
|
// Add to data structure holding all candidates
|
||
|
size_t col = rff_ - rfi_ - 1; // -1 b/c prev col
|
||
|
size_t frombot = dpRows() - row - 1;
|
||
|
DpBtCandidate cand(row, col, sc);
|
||
|
btdiag_.add(frombot + col, cand);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
pvHLeft += ROWSTRIDE_2COL;
|
||
|
pvScore += 2;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Find largest score in vmax
|
||
|
vtmp = _mm_srli_si128(vmax, 8);
|
||
|
vmax = _mm_max_epu8(vmax, vtmp);
|
||
|
vtmp = _mm_srli_si128(vmax, 4);
|
||
|
vmax = _mm_max_epu8(vmax, vtmp);
|
||
|
vtmp = _mm_srli_si128(vmax, 2);
|
||
|
vmax = _mm_max_epu8(vmax, vtmp);
|
||
|
vtmp = _mm_srli_si128(vmax, 1);
|
||
|
vmax = _mm_max_epu8(vmax, vtmp);
|
||
|
|
||
|
// Update metrics
|
||
|
if(!debug) {
|
||
|
size_t ninner = (rff_ - rfi_) * iter;
|
||
|
met.col += (rff_ - rfi_); // DP columns
|
||
|
met.cell += (ninner * NWORDS_PER_REG); // DP cells
|
||
|
met.inner += ninner; // DP inner loop iters
|
||
|
met.fixup += nfixup; // DP fixup loop iters
|
||
|
}
|
||
|
|
||
|
int score = _mm_extract_epi16(vmax, 0);
|
||
|
score = score & 0x00ff;
|
||
|
|
||
|
flag = 0;
|
||
|
|
||
|
// Could we have saturated?
|
||
|
if(score + d.bias_ >= 255) {
|
||
|
flag = -2; // yes
|
||
|
if(!debug) met.dpsat++;
|
||
|
return MIN_I64;
|
||
|
}
|
||
|
|
||
|
// Did we find a solution?
|
||
|
if(score == MIN_U8 || score < minsc_) {
|
||
|
flag = -1; // no
|
||
|
if(!debug) met.dpfail++;
|
||
|
return (TAlScore)score;
|
||
|
}
|
||
|
|
||
|
// Now take all the backtrace candidates in the btdaig_ structure and
|
||
|
// dump them into the btncand_ array. They'll be sorted later.
|
||
|
if(!debug) {
|
||
|
assert(!btdiag_.empty());
|
||
|
btdiag_.dump(btncand_);
|
||
|
assert(!btncand_.empty());
|
||
|
}
|
||
|
|
||
|
// Return largest score
|
||
|
if(!debug) met.dpsucc++;
|
||
|
return (TAlScore)score;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Solve the current alignment problem using SSE instructions that operate on 16
|
||
|
* unsigned 8-bit values packed into a single 128-bit register.
|
||
|
*/
|
||
|
TAlScore SwAligner::alignNucleotidesLocalSseU8(int& flag, bool debug) {
|
||
|
assert_leq(rdf_, rd_->length());
|
||
|
assert_leq(rdf_, qu_->length());
|
||
|
assert_lt(rfi_, rff_);
|
||
|
assert_lt(rdi_, rdf_);
|
||
|
assert_eq(rd_->length(), qu_->length());
|
||
|
assert_geq(sc_->gapbar, 1);
|
||
|
assert(repOk());
|
||
|
#ifndef NDEBUG
|
||
|
for(size_t i = (size_t)rfi_; i < (size_t)rff_; i++) {
|
||
|
assert_range(0, 16, (int)rf_[i]);
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
SSEData& d = fw_ ? sseU8fw_ : sseU8rc_;
|
||
|
SSEMetrics& met = extend_ ? sseU8ExtendMet_ : sseU8MateMet_;
|
||
|
if(!debug) met.dp++;
|
||
|
buildQueryProfileLocalSseU8(fw_);
|
||
|
assert(!d.profbuf_.empty());
|
||
|
assert_geq(d.bias_, 0);
|
||
|
|
||
|
assert_gt(d.maxBonus_, 0);
|
||
|
size_t iter =
|
||
|
(dpRows() + (NWORDS_PER_REG-1)) / NWORDS_PER_REG; // iter = segLen
|
||
|
|
||
|
int dup;
|
||
|
|
||
|
// Many thanks to Michael Farrar for releasing his striped Smith-Waterman
|
||
|
// implementation:
|
||
|
//
|
||
|
// http://sites.google.com/site/farrarmichael/smith-waterman
|
||
|
//
|
||
|
// Much of the implmentation below is adapted from Michael's code.
|
||
|
|
||
|
// Set all elts to reference gap open penalty
|
||
|
__m128i rfgapo = _mm_setzero_si128();
|
||
|
__m128i rfgape = _mm_setzero_si128();
|
||
|
__m128i rdgapo = _mm_setzero_si128();
|
||
|
__m128i rdgape = _mm_setzero_si128();
|
||
|
__m128i vlo = _mm_setzero_si128();
|
||
|
__m128i vhi = _mm_setzero_si128();
|
||
|
__m128i vmax = _mm_setzero_si128();
|
||
|
__m128i vcolmax = _mm_setzero_si128();
|
||
|
__m128i vmaxtmp = _mm_setzero_si128();
|
||
|
__m128i ve = _mm_setzero_si128();
|
||
|
__m128i vf = _mm_setzero_si128();
|
||
|
__m128i vh = _mm_setzero_si128();
|
||
|
__m128i vtmp = _mm_setzero_si128();
|
||
|
__m128i vzero = _mm_setzero_si128();
|
||
|
__m128i vbias = _mm_setzero_si128();
|
||
|
|
||
|
assert_gt(sc_->refGapOpen(), 0);
|
||
|
assert_leq(sc_->refGapOpen(), MAX_U8);
|
||
|
dup = (sc_->refGapOpen() << 8) | (sc_->refGapOpen() & 0x00ff);
|
||
|
rfgapo = _mm_insert_epi16(rfgapo, dup, 0);
|
||
|
rfgapo = _mm_shufflelo_epi16(rfgapo, 0);
|
||
|
rfgapo = _mm_shuffle_epi32(rfgapo, 0);
|
||
|
|
||
|
// Set all elts to reference gap extension penalty
|
||
|
assert_gt(sc_->refGapExtend(), 0);
|
||
|
assert_leq(sc_->refGapExtend(), MAX_U8);
|
||
|
assert_leq(sc_->refGapExtend(), sc_->refGapOpen());
|
||
|
dup = (sc_->refGapExtend() << 8) | (sc_->refGapExtend() & 0x00ff);
|
||
|
rfgape = _mm_insert_epi16(rfgape, dup, 0);
|
||
|
rfgape = _mm_shufflelo_epi16(rfgape, 0);
|
||
|
rfgape = _mm_shuffle_epi32(rfgape, 0);
|
||
|
|
||
|
// Set all elts to read gap open penalty
|
||
|
assert_gt(sc_->readGapOpen(), 0);
|
||
|
assert_leq(sc_->readGapOpen(), MAX_U8);
|
||
|
dup = (sc_->readGapOpen() << 8) | (sc_->readGapOpen() & 0x00ff);
|
||
|
rdgapo = _mm_insert_epi16(rdgapo, dup, 0);
|
||
|
rdgapo = _mm_shufflelo_epi16(rdgapo, 0);
|
||
|
rdgapo = _mm_shuffle_epi32(rdgapo, 0);
|
||
|
|
||
|
// Set all elts to read gap extension penalty
|
||
|
assert_gt(sc_->readGapExtend(), 0);
|
||
|
assert_leq(sc_->readGapExtend(), MAX_U8);
|
||
|
assert_leq(sc_->readGapExtend(), sc_->readGapOpen());
|
||
|
dup = (sc_->readGapExtend() << 8) | (sc_->readGapExtend() & 0x00ff);
|
||
|
rdgape = _mm_insert_epi16(rdgape, dup, 0);
|
||
|
rdgape = _mm_shufflelo_epi16(rdgape, 0);
|
||
|
rdgape = _mm_shuffle_epi32(rdgape, 0);
|
||
|
|
||
|
vhi = _mm_cmpeq_epi16(vhi, vhi); // all elts = 0xffff
|
||
|
vlo = _mm_xor_si128(vlo, vlo); // all elts = 0
|
||
|
vmax = vlo;
|
||
|
|
||
|
// Make a vector of bias offsets
|
||
|
dup = (d.bias_ << 8) | (d.bias_ & 0x00ff);
|
||
|
vbias = _mm_insert_epi16(vbias, dup, 0);
|
||
|
vbias = _mm_shufflelo_epi16(vbias, 0);
|
||
|
vbias = _mm_shuffle_epi32(vbias, 0);
|
||
|
|
||
|
// Points to a long vector of __m128i where each element is a block of
|
||
|
// contiguous cells in the E, F or H matrix. If the index % 3 == 0, then
|
||
|
// the block of cells is from the E matrix. If index % 3 == 1, they're
|
||
|
// from the F matrix. If index % 3 == 2, then they're from the H matrix.
|
||
|
// Blocks of cells are organized in the same interleaved manner as they are
|
||
|
// calculated by the Farrar algorithm.
|
||
|
const __m128i *pvScore; // points into the query profile
|
||
|
|
||
|
d.mat_.init(dpRows(), rff_ - rfi_, NWORDS_PER_REG);
|
||
|
const size_t colstride = d.mat_.colstride();
|
||
|
//const size_t rowstride = d.mat_.rowstride();
|
||
|
assert_eq(ROWSTRIDE, colstride / iter);
|
||
|
|
||
|
// Initialize the H and E vectors in the first matrix column
|
||
|
__m128i *pvHTmp = d.mat_.tmpvec(0, 0);
|
||
|
__m128i *pvETmp = d.mat_.evec(0, 0);
|
||
|
|
||
|
for(size_t i = 0; i < iter; i++) {
|
||
|
_mm_store_si128(pvETmp, vlo);
|
||
|
_mm_store_si128(pvHTmp, vlo); // start low in local mode
|
||
|
pvETmp += ROWSTRIDE;
|
||
|
pvHTmp += ROWSTRIDE;
|
||
|
}
|
||
|
// These are swapped just before the innermost loop
|
||
|
__m128i *pvHStore = d.mat_.hvec(0, 0);
|
||
|
__m128i *pvHLoad = d.mat_.tmpvec(0, 0);
|
||
|
__m128i *pvELoad = d.mat_.evec(0, 0);
|
||
|
__m128i *pvEStore = d.mat_.evecUnsafe(0, 1);
|
||
|
__m128i *pvFStore = d.mat_.fvec(0, 0);
|
||
|
__m128i *pvFTmp = NULL;
|
||
|
|
||
|
assert_gt(sc_->gapbar, 0);
|
||
|
size_t nfixup = 0;
|
||
|
TAlScore matchsc = sc_->match(30);
|
||
|
|
||
|
// Fill in the table as usual but instead of using the same gap-penalty
|
||
|
// vector for each iteration of the inner loop, load words out of a
|
||
|
// pre-calculated gap vector parallel to the query profile. The pre-
|
||
|
// calculated gap vectors enforce the gap barrier constraint by making it
|
||
|
// infinitely costly to introduce a gap in barrier rows.
|
||
|
//
|
||
|
// AND use a separate loop to fill in the first row of the table, enforcing
|
||
|
// the st_ constraints in the process. This is awkward because it
|
||
|
// separates the processing of the first row from the others and might make
|
||
|
// it difficult to use the first-row results in the next row, but it might
|
||
|
// be the simplest and least disruptive way to deal with the st_ constraint.
|
||
|
|
||
|
colstop_ = rff_ - rfi_;
|
||
|
lastsolcol_ = 0;
|
||
|
for(size_t i = (size_t)rfi_; i < (size_t)rff_; i++) {
|
||
|
assert(pvFStore == d.mat_.fvec(0, i - rfi_));
|
||
|
assert(pvHStore == d.mat_.hvec(0, i - rfi_));
|
||
|
|
||
|
// Fetch this column's reference mask
|
||
|
const int refm = (int)rf_[i];
|
||
|
|
||
|
// Fetch the appropriate query profile
|
||
|
size_t off = (size_t)firsts5[refm] * iter * 2;
|
||
|
pvScore = d.profbuf_.ptr() + off; // even elts = query profile, odd = gap barrier
|
||
|
|
||
|
// Load H vector from the final row of the previous column
|
||
|
vh = _mm_load_si128(pvHLoad + colstride - ROWSTRIDE);
|
||
|
|
||
|
// Set all cells to low value
|
||
|
vf = _mm_xor_si128(vf, vf);
|
||
|
|
||
|
// Store cells in F, calculated previously
|
||
|
// No need to veto ref gap extensions, they're all 0x00s
|
||
|
_mm_store_si128(pvFStore, vf);
|
||
|
pvFStore += ROWSTRIDE;
|
||
|
|
||
|
// Shift down so that topmost (least sig) cell gets 0
|
||
|
vh = _mm_slli_si128(vh, NBYTES_PER_WORD);
|
||
|
|
||
|
// We pull out one loop iteration to make it easier to veto values in the top row
|
||
|
|
||
|
// Load cells from E, calculated previously
|
||
|
ve = _mm_load_si128(pvELoad);
|
||
|
assert_all_lt(ve, vhi);
|
||
|
pvELoad += ROWSTRIDE;
|
||
|
|
||
|
// Factor in query profile (matches and mismatches)
|
||
|
vh = _mm_adds_epu8(vh, pvScore[0]);
|
||
|
vh = _mm_subs_epu8(vh, vbias);
|
||
|
|
||
|
// Update H, factoring in E and F
|
||
|
vh = _mm_max_epu8(vh, ve);
|
||
|
vh = _mm_max_epu8(vh, vf);
|
||
|
|
||
|
// Update highest score so far
|
||
|
vcolmax = _mm_xor_si128(vcolmax, vcolmax);
|
||
|
vcolmax = _mm_max_epu8(vcolmax, vh);
|
||
|
|
||
|
// Save the new vH values
|
||
|
_mm_store_si128(pvHStore, vh);
|
||
|
pvHStore += ROWSTRIDE;
|
||
|
|
||
|
// Update vE value
|
||
|
vf = vh;
|
||
|
vh = _mm_subs_epu8(vh, rdgapo);
|
||
|
vh = _mm_subs_epu8(vh, pvScore[1]); // veto some read gap opens
|
||
|
ve = _mm_subs_epu8(ve, rdgape);
|
||
|
ve = _mm_max_epu8(ve, vh);
|
||
|
assert_all_lt(ve, vhi);
|
||
|
|
||
|
// Load the next h value
|
||
|
vh = _mm_load_si128(pvHLoad);
|
||
|
pvHLoad += ROWSTRIDE;
|
||
|
|
||
|
// Save E values
|
||
|
_mm_store_si128(pvEStore, ve);
|
||
|
pvEStore += ROWSTRIDE;
|
||
|
|
||
|
// Update vf value
|
||
|
vf = _mm_subs_epu8(vf, rfgapo);
|
||
|
assert_all_lt(vf, vhi);
|
||
|
|
||
|
pvScore += 2; // move on to next query profile
|
||
|
|
||
|
// For each character in the reference text:
|
||
|
size_t j;
|
||
|
for(j = 1; j < iter; j++) {
|
||
|
// Load cells from E, calculated previously
|
||
|
ve = _mm_load_si128(pvELoad);
|
||
|
assert_all_lt(ve, vhi);
|
||
|
pvELoad += ROWSTRIDE;
|
||
|
|
||
|
// Store cells in F, calculated previously
|
||
|
vf = _mm_subs_epu8(vf, pvScore[1]); // veto some ref gap extensions
|
||
|
_mm_store_si128(pvFStore, vf);
|
||
|
pvFStore += ROWSTRIDE;
|
||
|
|
||
|
// Factor in query profile (matches and mismatches)
|
||
|
vh = _mm_adds_epu8(vh, pvScore[0]);
|
||
|
vh = _mm_subs_epu8(vh, vbias);
|
||
|
|
||
|
// Update H, factoring in E and F
|
||
|
vh = _mm_max_epu8(vh, ve);
|
||
|
vh = _mm_max_epu8(vh, vf);
|
||
|
|
||
|
// Update highest score encountered this far
|
||
|
vcolmax = _mm_max_epu8(vcolmax, vh);
|
||
|
|
||
|
// Save the new vH values
|
||
|
_mm_store_si128(pvHStore, vh);
|
||
|
pvHStore += ROWSTRIDE;
|
||
|
|
||
|
// Update vE value
|
||
|
vtmp = vh;
|
||
|
vh = _mm_subs_epu8(vh, rdgapo);
|
||
|
vh = _mm_subs_epu8(vh, pvScore[1]); // veto some read gap opens
|
||
|
ve = _mm_subs_epu8(ve, rdgape);
|
||
|
ve = _mm_max_epu8(ve, vh);
|
||
|
assert_all_lt(ve, vhi);
|
||
|
|
||
|
// Load the next h value
|
||
|
vh = _mm_load_si128(pvHLoad);
|
||
|
pvHLoad += ROWSTRIDE;
|
||
|
|
||
|
// Save E values
|
||
|
_mm_store_si128(pvEStore, ve);
|
||
|
pvEStore += ROWSTRIDE;
|
||
|
|
||
|
// Update vf value
|
||
|
vtmp = _mm_subs_epu8(vtmp, rfgapo);
|
||
|
vf = _mm_subs_epu8(vf, rfgape);
|
||
|
assert_all_lt(vf, vhi);
|
||
|
vf = _mm_max_epu8(vf, vtmp);
|
||
|
|
||
|
pvScore += 2; // move on to next query profile / gap veto
|
||
|
}
|
||
|
// pvHStore, pvELoad, pvEStore have all rolled over to the next column
|
||
|
pvFTmp = pvFStore;
|
||
|
pvFStore -= colstride; // reset to start of column
|
||
|
vtmp = _mm_load_si128(pvFStore);
|
||
|
|
||
|
pvHStore -= colstride; // reset to start of column
|
||
|
vh = _mm_load_si128(pvHStore);
|
||
|
|
||
|
pvEStore -= colstride; // reset to start of column
|
||
|
ve = _mm_load_si128(pvEStore);
|
||
|
|
||
|
pvHLoad = pvHStore; // new pvHLoad = pvHStore
|
||
|
pvScore = d.profbuf_.ptr() + off + 1; // reset veto vector
|
||
|
|
||
|
// vf from last row gets shifted down by one to overlay the first row
|
||
|
// rfgape has already been subtracted from it.
|
||
|
vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
|
||
|
|
||
|
vf = _mm_subs_epu8(vf, *pvScore); // veto some ref gap extensions
|
||
|
vf = _mm_max_epu8(vtmp, vf);
|
||
|
vtmp = _mm_subs_epu8(vf, vtmp);
|
||
|
vtmp = _mm_cmpeq_epi8(vtmp, vzero);
|
||
|
int cmp = _mm_movemask_epi8(vtmp);
|
||
|
|
||
|
// If any element of vtmp is greater than H - gap-open...
|
||
|
j = 0;
|
||
|
while(cmp != 0xffff) {
|
||
|
// Store this vf
|
||
|
_mm_store_si128(pvFStore, vf);
|
||
|
pvFStore += ROWSTRIDE;
|
||
|
|
||
|
// Update vh w/r/t new vf
|
||
|
vh = _mm_max_epu8(vh, vf);
|
||
|
|
||
|
// Save vH values
|
||
|
_mm_store_si128(pvHStore, vh);
|
||
|
pvHStore += ROWSTRIDE;
|
||
|
|
||
|
// Update highest score encountered this far
|
||
|
vcolmax = _mm_max_epu8(vcolmax, vh);
|
||
|
|
||
|
// Update E in case it can be improved using our new vh
|
||
|
vh = _mm_subs_epu8(vh, rdgapo);
|
||
|
vh = _mm_subs_epu8(vh, *pvScore); // veto some read gap opens
|
||
|
ve = _mm_max_epu8(ve, vh);
|
||
|
_mm_store_si128(pvEStore, ve);
|
||
|
pvEStore += ROWSTRIDE;
|
||
|
pvScore += 2;
|
||
|
|
||
|
assert_lt(j, iter);
|
||
|
if(++j == iter) {
|
||
|
pvFStore -= colstride;
|
||
|
vtmp = _mm_load_si128(pvFStore); // load next vf ASAP
|
||
|
pvHStore -= colstride;
|
||
|
vh = _mm_load_si128(pvHStore); // load next vh ASAP
|
||
|
pvEStore -= colstride;
|
||
|
ve = _mm_load_si128(pvEStore); // load next ve ASAP
|
||
|
pvScore = d.profbuf_.ptr() + off + 1;
|
||
|
j = 0;
|
||
|
vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
|
||
|
} else {
|
||
|
vtmp = _mm_load_si128(pvFStore); // load next vf ASAP
|
||
|
vh = _mm_load_si128(pvHStore); // load next vh ASAP
|
||
|
ve = _mm_load_si128(pvEStore); // load next vh ASAP
|
||
|
}
|
||
|
|
||
|
// Update F with another gap extension
|
||
|
vf = _mm_subs_epu8(vf, rfgape);
|
||
|
vf = _mm_subs_epu8(vf, *pvScore); // veto some ref gap extensions
|
||
|
vf = _mm_max_epu8(vtmp, vf);
|
||
|
vtmp = _mm_subs_epu8(vf, vtmp);
|
||
|
vtmp = _mm_cmpeq_epi8(vtmp, vzero);
|
||
|
cmp = _mm_movemask_epi8(vtmp);
|
||
|
nfixup++;
|
||
|
}
|
||
|
|
||
|
#ifndef NDEBUG
|
||
|
if((rand() & 15) == 0) {
|
||
|
// This is a work-intensive sanity check; each time we finish filling
|
||
|
// a column, we check that each H, E, and F is sensible.
|
||
|
for(size_t k = 0; k < dpRows(); k++) {
|
||
|
assert(cellOkLocalU8(
|
||
|
d,
|
||
|
k, // row
|
||
|
i - rfi_, // col
|
||
|
refm, // reference mask
|
||
|
(int)(*rd_)[rdi_+k], // read char
|
||
|
(int)(*qu_)[rdi_+k], // read quality
|
||
|
*sc_)); // scoring scheme
|
||
|
}
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
// Store column maximum vector in first element of tmp
|
||
|
vmax = _mm_max_epu8(vmax, vcolmax);
|
||
|
_mm_store_si128(d.mat_.tmpvec(0, i - rfi_), vcolmax);
|
||
|
|
||
|
{
|
||
|
// Get single largest score in this column
|
||
|
vmaxtmp = vcolmax;
|
||
|
vtmp = _mm_srli_si128(vmaxtmp, 8);
|
||
|
vmaxtmp = _mm_max_epu8(vmaxtmp, vtmp);
|
||
|
vtmp = _mm_srli_si128(vmaxtmp, 4);
|
||
|
vmaxtmp = _mm_max_epu8(vmaxtmp, vtmp);
|
||
|
vtmp = _mm_srli_si128(vmaxtmp, 2);
|
||
|
vmaxtmp = _mm_max_epu8(vmaxtmp, vtmp);
|
||
|
vtmp = _mm_srli_si128(vmaxtmp, 1);
|
||
|
vmaxtmp = _mm_max_epu8(vmaxtmp, vtmp);
|
||
|
int score = _mm_extract_epi16(vmaxtmp, 0);
|
||
|
score = score & 0x00ff;
|
||
|
|
||
|
// Could we have saturated?
|
||
|
if(score + d.bias_ >= 255) {
|
||
|
flag = -2; // yes
|
||
|
if(!debug) met.dpsat++;
|
||
|
return MIN_I64;
|
||
|
}
|
||
|
|
||
|
if(score < minsc_) {
|
||
|
size_t ncolleft = rff_ - i - 1;
|
||
|
if(score + (TAlScore)ncolleft * matchsc < minsc_) {
|
||
|
// Bail! We're guaranteed not to see a valid alignment in
|
||
|
// the rest of the matrix
|
||
|
colstop_ = (i+1) - rfi_;
|
||
|
break;
|
||
|
}
|
||
|
} else {
|
||
|
lastsolcol_ = i - rfi_;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// pvELoad and pvHLoad are already where they need to be
|
||
|
|
||
|
// Adjust the load and store vectors here.
|
||
|
pvHStore = pvHLoad + colstride;
|
||
|
pvEStore = pvELoad + colstride;
|
||
|
pvFStore = pvFTmp;
|
||
|
}
|
||
|
|
||
|
// Find largest score in vmax
|
||
|
vtmp = _mm_srli_si128(vmax, 8);
|
||
|
vmax = _mm_max_epu8(vmax, vtmp);
|
||
|
vtmp = _mm_srli_si128(vmax, 4);
|
||
|
vmax = _mm_max_epu8(vmax, vtmp);
|
||
|
vtmp = _mm_srli_si128(vmax, 2);
|
||
|
vmax = _mm_max_epu8(vmax, vtmp);
|
||
|
vtmp = _mm_srli_si128(vmax, 1);
|
||
|
vmax = _mm_max_epu8(vmax, vtmp);
|
||
|
|
||
|
// Update metrics
|
||
|
if(!debug) {
|
||
|
size_t ninner = (rff_ - rfi_) * iter;
|
||
|
met.col += (rff_ - rfi_); // DP columns
|
||
|
met.cell += (ninner * NWORDS_PER_REG); // DP cells
|
||
|
met.inner += ninner; // DP inner loop iters
|
||
|
met.fixup += nfixup; // DP fixup loop iters
|
||
|
}
|
||
|
|
||
|
int score = _mm_extract_epi16(vmax, 0);
|
||
|
score = score & 0x00ff;
|
||
|
|
||
|
flag = 0;
|
||
|
|
||
|
// Could we have saturated?
|
||
|
if(score + d.bias_ >= 255) {
|
||
|
flag = -2; // yes
|
||
|
if(!debug) met.dpsat++;
|
||
|
return MIN_I64;
|
||
|
}
|
||
|
|
||
|
// Did we find a solution?
|
||
|
if(score == MIN_U8 || score < minsc_) {
|
||
|
flag = -1; // no
|
||
|
if(!debug) met.dpfail++;
|
||
|
return (TAlScore)score;
|
||
|
}
|
||
|
|
||
|
// Return largest score
|
||
|
if(!debug) met.dpsucc++;
|
||
|
return (TAlScore)score;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Given a filled-in DP table, populate the btncand_ list with candidate cells
|
||
|
* that might be at the ends of valid alignments. No need to do this unless
|
||
|
* the maximum score returned by the align*() func is >= the minimum.
|
||
|
*
|
||
|
* We needn't consider cells that have no chance of reaching any of the core
|
||
|
* diagonals. These are the cells that are more than 'maxgaps' cells away from
|
||
|
* a core diagonal.
|
||
|
*
|
||
|
* We need to be careful to consider that the rectangle might be truncated on
|
||
|
* one or both ends.
|
||
|
*
|
||
|
* The seed extend case looks like this:
|
||
|
*
|
||
|
* |Rectangle| 0: seed diagonal
|
||
|
* **OO0oo---- o: "RHS gap" diagonals
|
||
|
* -**OO0oo--- O: "LHS gap" diagonals
|
||
|
* --**OO0oo-- *: "LHS extra" diagonals
|
||
|
* ---**OO0oo- -: cells that can't possibly be involved in a valid
|
||
|
* ----**OO0oo alignment that overlaps one of the core diagonals
|
||
|
*
|
||
|
* The anchor-to-left case looks like this:
|
||
|
*
|
||
|
* |Anchor| | ---- Rectangle ---- |
|
||
|
* o---------OO0000000000000oo------ 0: mate diagonal (also core diags!)
|
||
|
* -o---------OO0000000000000oo----- o: "RHS gap" diagonals
|
||
|
* --o---------OO0000000000000oo---- O: "LHS gap" diagonals
|
||
|
* ---oo--------OO0000000000000oo--- *: "LHS extra" diagonals
|
||
|
* -----o--------OO0000000000000oo-- -: cells that can't possibly be
|
||
|
* ------o--------OO0000000000000oo- involved in a valid alignment that
|
||
|
* -------o--------OO0000000000000oo overlaps one of the core diagonals
|
||
|
* XXXXXXXXXXXXX
|
||
|
* | RHS Range |
|
||
|
* ^ ^
|
||
|
* rl rr
|
||
|
*
|
||
|
* The anchor-to-right case looks like this:
|
||
|
*
|
||
|
* ll lr
|
||
|
* v v
|
||
|
* | LHS Range |
|
||
|
* XXXXXXXXXXXXX |Anchor|
|
||
|
* OO0000000000000oo--------o-------- 0: mate diagonal (also core diags!)
|
||
|
* -OO0000000000000oo--------o------- o: "RHS gap" diagonals
|
||
|
* --OO0000000000000oo--------o------ O: "LHS gap" diagonals
|
||
|
* ---OO0000000000000oo--------oo---- *: "LHS extra" diagonals
|
||
|
* ----OO0000000000000oo---------o--- -: cells that can't possibly be
|
||
|
* -----OO0000000000000oo---------o-- involved in a valid alignment that
|
||
|
* ------OO0000000000000oo---------o- overlaps one of the core diagonals
|
||
|
* | ---- Rectangle ---- |
|
||
|
*/
|
||
|
bool SwAligner::gatherCellsNucleotidesLocalSseU8(TAlScore best) {
|
||
|
// What's the minimum number of rows that can possibly be spanned by an
|
||
|
// alignment that meets the minimum score requirement?
|
||
|
assert(sse8succ_);
|
||
|
size_t bonus = (size_t)sc_->match(30);
|
||
|
const size_t ncol = lastsolcol_ + 1;
|
||
|
const size_t nrow = dpRows();
|
||
|
assert_gt(nrow, 0);
|
||
|
btncand_.clear();
|
||
|
btncanddone_.clear();
|
||
|
SSEData& d = fw_ ? sseU8fw_ : sseU8rc_;
|
||
|
SSEMetrics& met = extend_ ? sseU8ExtendMet_ : sseU8MateMet_;
|
||
|
assert(!d.profbuf_.empty());
|
||
|
//const size_t rowstride = d.mat_.rowstride();
|
||
|
//const size_t colstride = d.mat_.colstride();
|
||
|
size_t iter = (dpRows() + (NWORDS_PER_REG - 1)) / NWORDS_PER_REG;
|
||
|
assert_gt(iter, 0);
|
||
|
assert_geq(minsc_, 0);
|
||
|
assert_gt(bonus, 0);
|
||
|
size_t minrow = (size_t)(((minsc_ + bonus - 1) / bonus) - 1);
|
||
|
for(size_t j = 0; j < ncol; j++) {
|
||
|
// Establish the range of rows where a backtrace from the cell in this
|
||
|
// row/col is close enough to one of the core diagonals that it could
|
||
|
// conceivably count
|
||
|
size_t nrow_lo = MIN_SIZE_T;
|
||
|
size_t nrow_hi = nrow;
|
||
|
// First, check if there is a cell in this column with a score
|
||
|
// above the score threshold
|
||
|
__m128i vmax = *d.mat_.tmpvec(0, j);
|
||
|
__m128i vtmp = _mm_srli_si128(vmax, 8);
|
||
|
vmax = _mm_max_epu8(vmax, vtmp);
|
||
|
vtmp = _mm_srli_si128(vmax, 4);
|
||
|
vmax = _mm_max_epu8(vmax, vtmp);
|
||
|
vtmp = _mm_srli_si128(vmax, 2);
|
||
|
vmax = _mm_max_epu8(vmax, vtmp);
|
||
|
vtmp = _mm_srli_si128(vmax, 1);
|
||
|
vmax = _mm_max_epu8(vmax, vtmp);
|
||
|
int score = _mm_extract_epi16(vmax, 0);
|
||
|
score = score & 0x00ff;
|
||
|
#ifndef NDEBUG
|
||
|
{
|
||
|
// Start in upper vector row and move down
|
||
|
TAlScore max = 0;
|
||
|
__m128i *pvH = d.mat_.hvec(0, j);
|
||
|
for(size_t i = 0; i < iter; i++) {
|
||
|
for(size_t k = 0; k < NWORDS_PER_REG; k++) {
|
||
|
TAlScore sc = (TAlScore)((TCScore*)pvH)[k];
|
||
|
if(sc > max) {
|
||
|
max = sc;
|
||
|
}
|
||
|
}
|
||
|
pvH += ROWSTRIDE;
|
||
|
}
|
||
|
assert_eq(max, score);
|
||
|
}
|
||
|
#endif
|
||
|
if((TAlScore)score < minsc_) {
|
||
|
// Scores in column aren't good enough
|
||
|
continue;
|
||
|
}
|
||
|
// Get pointer to first cell in column to examine:
|
||
|
__m128i *pvHorig = d.mat_.hvec(0, j);
|
||
|
__m128i *pvH = pvHorig;
|
||
|
// Get pointer to the vector in the following column that corresponds
|
||
|
// to the cells diagonally down and to the right from the cells in pvH
|
||
|
__m128i *pvHSucc = (j < ncol-1) ? d.mat_.hvec(0, j+1) : NULL;
|
||
|
// Start in upper vector row and move down
|
||
|
for(size_t i = 0; i < iter; i++) {
|
||
|
if(pvHSucc != NULL) {
|
||
|
pvHSucc += ROWSTRIDE;
|
||
|
if(i == iter-1) {
|
||
|
pvHSucc = d.mat_.hvec(0, j+1);
|
||
|
}
|
||
|
}
|
||
|
// Which elements of this vector are exhaustively scored?
|
||
|
size_t rdoff = i;
|
||
|
for(size_t k = 0; k < NWORDS_PER_REG; k++) {
|
||
|
// Is this row, col one that we can potential backtrace from?
|
||
|
// I.e. are we close enough to a core diagonal?
|
||
|
if(rdoff >= nrow_lo && rdoff < nrow_hi) {
|
||
|
// This cell has been exhaustively scored
|
||
|
if(rdoff >= minrow) {
|
||
|
// ... and it could potentially score high enough
|
||
|
TAlScore sc = (TAlScore)((TCScore*)pvH)[k];
|
||
|
assert_leq(sc, best);
|
||
|
if(sc >= minsc_) {
|
||
|
// This is a potential solution
|
||
|
bool matchSucc = false;
|
||
|
int readc = (*rd_)[rdoff];
|
||
|
int refc = rf_[j + rfi_];
|
||
|
bool match = ((refc & (1 << readc)) != 0);
|
||
|
if(rdoff < dpRows()-1) {
|
||
|
int readcSucc = (*rd_)[rdoff+1];
|
||
|
int refcSucc = rf_[j + rfi_ + 1];
|
||
|
assert_range(0, 16, refcSucc);
|
||
|
matchSucc = ((refcSucc & (1 << readcSucc)) != 0);
|
||
|
}
|
||
|
if(match && !matchSucc) {
|
||
|
// Yes, this is legit
|
||
|
met.gathsol++;
|
||
|
btncand_.expand();
|
||
|
btncand_.back().init(rdoff, j, sc);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
} else {
|
||
|
// Already saw every element in the vector that's been
|
||
|
// exhaustively scored
|
||
|
break;
|
||
|
}
|
||
|
rdoff += iter;
|
||
|
}
|
||
|
pvH += ROWSTRIDE;
|
||
|
}
|
||
|
}
|
||
|
if(!btncand_.empty()) {
|
||
|
d.mat_.initMasks();
|
||
|
}
|
||
|
return !btncand_.empty();
|
||
|
}
|
||
|
|
||
|
#define MOVE_VEC_PTR_UP(vec, rowvec, rowelt) { \
|
||
|
if(rowvec == 0) { \
|
||
|
rowvec += d.mat_.nvecrow_; \
|
||
|
vec += d.mat_.colstride_; \
|
||
|
rowelt--; \
|
||
|
} \
|
||
|
rowvec--; \
|
||
|
vec -= ROWSTRIDE; \
|
||
|
}
|
||
|
|
||
|
#define MOVE_VEC_PTR_LEFT(vec, rowvec, rowelt) { vec -= d.mat_.colstride_; }
|
||
|
|
||
|
#define MOVE_VEC_PTR_UPLEFT(vec, rowvec, rowelt) { \
|
||
|
MOVE_VEC_PTR_UP(vec, rowvec, rowelt); \
|
||
|
MOVE_VEC_PTR_LEFT(vec, rowvec, rowelt); \
|
||
|
}
|
||
|
|
||
|
#define MOVE_ALL_LEFT() { \
|
||
|
MOVE_VEC_PTR_LEFT(cur_vec, rowvec, rowelt); \
|
||
|
MOVE_VEC_PTR_LEFT(left_vec, left_rowvec, left_rowelt); \
|
||
|
MOVE_VEC_PTR_LEFT(up_vec, up_rowvec, up_rowelt); \
|
||
|
MOVE_VEC_PTR_LEFT(upleft_vec, upleft_rowvec, upleft_rowelt); \
|
||
|
}
|
||
|
|
||
|
#define MOVE_ALL_UP() { \
|
||
|
MOVE_VEC_PTR_UP(cur_vec, rowvec, rowelt); \
|
||
|
MOVE_VEC_PTR_UP(left_vec, left_rowvec, left_rowelt); \
|
||
|
MOVE_VEC_PTR_UP(up_vec, up_rowvec, up_rowelt); \
|
||
|
MOVE_VEC_PTR_UP(upleft_vec, upleft_rowvec, upleft_rowelt); \
|
||
|
}
|
||
|
|
||
|
#define MOVE_ALL_UPLEFT() { \
|
||
|
MOVE_VEC_PTR_UPLEFT(cur_vec, rowvec, rowelt); \
|
||
|
MOVE_VEC_PTR_UPLEFT(left_vec, left_rowvec, left_rowelt); \
|
||
|
MOVE_VEC_PTR_UPLEFT(up_vec, up_rowvec, up_rowelt); \
|
||
|
MOVE_VEC_PTR_UPLEFT(upleft_vec, upleft_rowvec, upleft_rowelt); \
|
||
|
}
|
||
|
|
||
|
#define NEW_ROW_COL(row, col) { \
|
||
|
rowelt = row / d.mat_.nvecrow_; \
|
||
|
rowvec = row % d.mat_.nvecrow_; \
|
||
|
eltvec = (col * d.mat_.colstride_) + (rowvec * ROWSTRIDE); \
|
||
|
cur_vec = d.mat_.matbuf_.ptr() + eltvec; \
|
||
|
left_vec = cur_vec; \
|
||
|
left_rowelt = rowelt; \
|
||
|
left_rowvec = rowvec; \
|
||
|
MOVE_VEC_PTR_LEFT(left_vec, left_rowvec, left_rowelt); \
|
||
|
up_vec = cur_vec; \
|
||
|
up_rowelt = rowelt; \
|
||
|
up_rowvec = rowvec; \
|
||
|
MOVE_VEC_PTR_UP(up_vec, up_rowvec, up_rowelt); \
|
||
|
upleft_vec = up_vec; \
|
||
|
upleft_rowelt = up_rowelt; \
|
||
|
upleft_rowvec = up_rowvec; \
|
||
|
MOVE_VEC_PTR_LEFT(upleft_vec, upleft_rowvec, upleft_rowelt); \
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Given the dynamic programming table and a cell, trace backwards from the
|
||
|
* cell and install the edits and score/penalty in the appropriate fields
|
||
|
* of SwResult res, which contains an AlnRes. The RandomSource is used to
|
||
|
* break ties among equally good ways of tracing back.
|
||
|
*
|
||
|
* Upon entering a cell, we check if the read/ref coordinates of the cell
|
||
|
* correspond to a cell we traversed constructing a previous alignment. If so,
|
||
|
* we backtrack to the last decision point, mask out the path that led to the
|
||
|
* previously observed cell, and continue along a different path; or, if there
|
||
|
* are no more paths to try, we give up.
|
||
|
*
|
||
|
* An alignment found is subject to a filtering step designed to remove
|
||
|
* alignments that could spuriously trump a better alignment falling partially
|
||
|
* outside the rectangle.
|
||
|
*
|
||
|
* 1
|
||
|
* 67890123456 0: seed diagonal
|
||
|
* **OO0oo---- o: right-hand "gap" diagonals: band of 'maxgap' diags
|
||
|
* -**OO0oo--- O: left-hand "gap" diagonals: band of 'maxgap' diags
|
||
|
* --**OO0oo-- *: "extra" diagonals: additional band of 'maxgap' diags
|
||
|
* ---**OO0oo- +: cells not in any of the above
|
||
|
* ----**OO0oo
|
||
|
* |-|
|
||
|
* Gotta touch one of these diags
|
||
|
*
|
||
|
* Basically, the filtering step removes alignments that do not at some point
|
||
|
* touch a cell labeled '0' or 'O' in the diagram above.
|
||
|
*
|
||
|
*/
|
||
|
bool SwAligner::backtraceNucleotidesLocalSseU8(
|
||
|
TAlScore escore, // in: expected score
|
||
|
SwResult& res, // out: store results (edits and scores) here
|
||
|
size_t& off, // out: store diagonal projection of origin
|
||
|
size_t& nbts, // out: # backtracks
|
||
|
size_t row, // start in this row
|
||
|
size_t col, // start in this column
|
||
|
RandomSource& rnd) // random gen, to choose among equal paths
|
||
|
{
|
||
|
assert_lt(row, dpRows());
|
||
|
assert_lt(col, (size_t)(rff_ - rfi_));
|
||
|
SSEData& d = fw_ ? sseU8fw_ : sseU8rc_;
|
||
|
SSEMetrics& met = extend_ ? sseU8ExtendMet_ : sseU8MateMet_;
|
||
|
met.bt++;
|
||
|
assert(!d.profbuf_.empty());
|
||
|
assert_lt(row, rd_->length());
|
||
|
btnstack_.clear(); // empty the backtrack stack
|
||
|
btcells_.clear(); // empty the cells-so-far list
|
||
|
AlnScore score; score.score_ = 0;
|
||
|
score.gaps_ = score.ns_ = 0;
|
||
|
size_t origCol = col;
|
||
|
size_t gaps = 0, readGaps = 0, refGaps = 0;
|
||
|
res.alres.reset();
|
||
|
EList<Edit>& ned = res.alres.ned();
|
||
|
assert(ned.empty());
|
||
|
assert_gt(dpRows(), row);
|
||
|
size_t trimEnd = dpRows() - row - 1;
|
||
|
size_t trimBeg = 0;
|
||
|
size_t ct = SSEMatrix::H; // cell type
|
||
|
// Row and col in terms of where they fall in the SSE vector matrix
|
||
|
size_t rowelt, rowvec, eltvec;
|
||
|
size_t left_rowelt, up_rowelt, upleft_rowelt;
|
||
|
size_t left_rowvec, up_rowvec, upleft_rowvec;
|
||
|
__m128i *cur_vec, *left_vec, *up_vec, *upleft_vec;
|
||
|
NEW_ROW_COL(row, col);
|
||
|
while((int)row >= 0) {
|
||
|
met.btcell++;
|
||
|
nbts++;
|
||
|
int readc = (*rd_)[rdi_ + row];
|
||
|
int refm = (int)rf_[rfi_ + col];
|
||
|
int readq = (*qu_)[row];
|
||
|
assert_leq(col, origCol);
|
||
|
// Get score in this cell
|
||
|
bool empty = false, reportedThru, canMoveThru, branch = false;
|
||
|
int cur = SSEMatrix::H;
|
||
|
if(!d.mat_.reset_[row]) {
|
||
|
d.mat_.resetRow(row);
|
||
|
}
|
||
|
reportedThru = d.mat_.reportedThrough(row, col);
|
||
|
canMoveThru = true;
|
||
|
if(reportedThru) {
|
||
|
canMoveThru = false;
|
||
|
} else {
|
||
|
empty = false;
|
||
|
if(row > 0) {
|
||
|
assert_gt(row, 0);
|
||
|
size_t rowFromEnd = d.mat_.nrow() - row - 1;
|
||
|
bool gapsAllowed = true;
|
||
|
if(row < (size_t)sc_->gapbar ||
|
||
|
rowFromEnd < (size_t)sc_->gapbar)
|
||
|
{
|
||
|
gapsAllowed = false;
|
||
|
}
|
||
|
const int floorsc = 0;
|
||
|
const int offsetsc = 0;
|
||
|
// Move to beginning of column/row
|
||
|
if(ct == SSEMatrix::E) { // AKA rdgap
|
||
|
assert_gt(col, 0);
|
||
|
TAlScore sc_cur = ((TCScore*)(cur_vec + SSEMatrix::E))[rowelt] + offsetsc;
|
||
|
assert(gapsAllowed);
|
||
|
// Currently in the E matrix; incoming transition must come from the
|
||
|
// left. It's either a gap open from the H matrix or a gap extend from
|
||
|
// the E matrix.
|
||
|
// TODO: save and restore origMask as well as mask
|
||
|
int origMask = 0, mask = 0;
|
||
|
// Get H score of cell to the left
|
||
|
TAlScore sc_h_left = ((TCScore*)(left_vec + SSEMatrix::H))[left_rowelt] + offsetsc;
|
||
|
if(sc_h_left > 0 && sc_h_left - sc_->readGapOpen() == sc_cur) {
|
||
|
mask |= (1 << 0);
|
||
|
}
|
||
|
// Get E score of cell to the left
|
||
|
TAlScore sc_e_left = ((TCScore*)(left_vec + SSEMatrix::E))[left_rowelt] + offsetsc;
|
||
|
if(sc_e_left > 0 && sc_e_left - sc_->readGapExtend() == sc_cur) {
|
||
|
mask |= (1 << 1);
|
||
|
}
|
||
|
origMask = mask;
|
||
|
assert(origMask > 0 || sc_cur <= sc_->match());
|
||
|
if(d.mat_.isEMaskSet(row, col)) {
|
||
|
mask = (d.mat_.masks_[row][col] >> 8) & 3;
|
||
|
}
|
||
|
if(mask == 3) {
|
||
|
#if 1
|
||
|
// Pick H -> E cell
|
||
|
cur = SW_BT_OALL_READ_OPEN;
|
||
|
d.mat_.eMaskSet(row, col, 2); // might choose E later
|
||
|
#else
|
||
|
if(rnd.nextU2()) {
|
||
|
// Pick H -> E cell
|
||
|
cur = SW_BT_OALL_READ_OPEN;
|
||
|
d.mat_.eMaskSet(row, col, 2); // might choose E later
|
||
|
} else {
|
||
|
// Pick E -> E cell
|
||
|
cur = SW_BT_RDGAP_EXTEND;
|
||
|
d.mat_.eMaskSet(row, col, 1); // might choose H later
|
||
|
}
|
||
|
#endif
|
||
|
branch = true;
|
||
|
} else if(mask == 2) {
|
||
|
// I chose the E cell
|
||
|
cur = SW_BT_RDGAP_EXTEND;
|
||
|
d.mat_.eMaskSet(row, col, 0); // done
|
||
|
} else if(mask == 1) {
|
||
|
// I chose the H cell
|
||
|
cur = SW_BT_OALL_READ_OPEN;
|
||
|
d.mat_.eMaskSet(row, col, 0); // done
|
||
|
} else {
|
||
|
empty = true;
|
||
|
// It's empty, so the only question left is whether we should be
|
||
|
// allowed in terimnate in this cell. If it's got a valid score
|
||
|
// then we *shouldn't* be allowed to terminate here because that
|
||
|
// means it's part of a larger alignment that was already reported.
|
||
|
canMoveThru = (origMask == 0);
|
||
|
}
|
||
|
assert(!empty || !canMoveThru);
|
||
|
} else if(ct == SSEMatrix::F) { // AKA rfgap
|
||
|
assert_gt(row, 0);
|
||
|
assert(gapsAllowed);
|
||
|
TAlScore sc_h_up = ((TCScore*)(up_vec + SSEMatrix::H))[up_rowelt] + offsetsc;
|
||
|
TAlScore sc_f_up = ((TCScore*)(up_vec + SSEMatrix::F))[up_rowelt] + offsetsc;
|
||
|
TAlScore sc_cur = ((TCScore*)(cur_vec + SSEMatrix::F))[rowelt] + offsetsc;
|
||
|
// Currently in the F matrix; incoming transition must come from above.
|
||
|
// It's either a gap open from the H matrix or a gap extend from the F
|
||
|
// matrix.
|
||
|
// TODO: save and restore origMask as well as mask
|
||
|
int origMask = 0, mask = 0;
|
||
|
// Get H score of cell above
|
||
|
if(sc_h_up > floorsc && sc_h_up - sc_->refGapOpen() == sc_cur) {
|
||
|
mask |= (1 << 0);
|
||
|
}
|
||
|
// Get F score of cell above
|
||
|
if(sc_f_up > floorsc && sc_f_up - sc_->refGapExtend() == sc_cur) {
|
||
|
mask |= (1 << 1);
|
||
|
}
|
||
|
origMask = mask;
|
||
|
assert(origMask > 0 || sc_cur <= sc_->match());
|
||
|
if(d.mat_.isFMaskSet(row, col)) {
|
||
|
mask = (d.mat_.masks_[row][col] >> 11) & 3;
|
||
|
}
|
||
|
if(mask == 3) {
|
||
|
#if 1
|
||
|
// I chose the H cell
|
||
|
cur = SW_BT_OALL_REF_OPEN;
|
||
|
d.mat_.fMaskSet(row, col, 2); // might choose E later
|
||
|
#else
|
||
|
if(rnd.nextU2()) {
|
||
|
// I chose the H cell
|
||
|
cur = SW_BT_OALL_REF_OPEN;
|
||
|
d.mat_.fMaskSet(row, col, 2); // might choose E later
|
||
|
} else {
|
||
|
// I chose the F cell
|
||
|
cur = SW_BT_RFGAP_EXTEND;
|
||
|
d.mat_.fMaskSet(row, col, 1); // might choose E later
|
||
|
}
|
||
|
#endif
|
||
|
branch = true;
|
||
|
} else if(mask == 2) {
|
||
|
// I chose the F cell
|
||
|
cur = SW_BT_RFGAP_EXTEND;
|
||
|
d.mat_.fMaskSet(row, col, 0); // done
|
||
|
} else if(mask == 1) {
|
||
|
// I chose the H cell
|
||
|
cur = SW_BT_OALL_REF_OPEN;
|
||
|
d.mat_.fMaskSet(row, col, 0); // done
|
||
|
} else {
|
||
|
empty = true;
|
||
|
// It's empty, so the only question left is whether we should be
|
||
|
// allowed in terimnate in this cell. If it's got a valid score
|
||
|
// then we *shouldn't* be allowed to terminate here because that
|
||
|
// means it's part of a larger alignment that was already reported.
|
||
|
canMoveThru = (origMask == 0);
|
||
|
}
|
||
|
assert(!empty || !canMoveThru);
|
||
|
} else {
|
||
|
assert_eq(SSEMatrix::H, ct);
|
||
|
TAlScore sc_cur = ((TCScore*)(cur_vec + SSEMatrix::H))[rowelt] + offsetsc;
|
||
|
TAlScore sc_f_up = ((TCScore*)(up_vec + SSEMatrix::F))[up_rowelt] + offsetsc;
|
||
|
TAlScore sc_h_up = ((TCScore*)(up_vec + SSEMatrix::H))[up_rowelt] + offsetsc;
|
||
|
TAlScore sc_h_left = col > 0 ? (((TCScore*)(left_vec + SSEMatrix::H))[left_rowelt] + offsetsc) : floorsc;
|
||
|
TAlScore sc_e_left = col > 0 ? (((TCScore*)(left_vec + SSEMatrix::E))[left_rowelt] + offsetsc) : floorsc;
|
||
|
TAlScore sc_h_upleft = col > 0 ? (((TCScore*)(upleft_vec + SSEMatrix::H))[upleft_rowelt] + offsetsc) : floorsc;
|
||
|
TAlScore sc_diag = sc_->score(readc, refm, readq - 33);
|
||
|
// TODO: save and restore origMask as well as mask
|
||
|
int origMask = 0, mask = 0;
|
||
|
if(gapsAllowed) {
|
||
|
if(sc_h_up > floorsc && sc_cur == sc_h_up - sc_->refGapOpen()) {
|
||
|
mask |= (1 << 0);
|
||
|
}
|
||
|
if(sc_h_left > floorsc && sc_cur == sc_h_left - sc_->readGapOpen()) {
|
||
|
mask |= (1 << 1);
|
||
|
}
|
||
|
if(sc_f_up > floorsc && sc_cur == sc_f_up - sc_->refGapExtend()) {
|
||
|
mask |= (1 << 2);
|
||
|
}
|
||
|
if(sc_e_left > floorsc && sc_cur == sc_e_left - sc_->readGapExtend()) {
|
||
|
mask |= (1 << 3);
|
||
|
}
|
||
|
}
|
||
|
if(sc_h_upleft > floorsc && sc_cur == sc_h_upleft + sc_diag) {
|
||
|
mask |= (1 << 4);
|
||
|
}
|
||
|
origMask = mask;
|
||
|
assert(origMask > 0 || sc_cur <= sc_->match());
|
||
|
if(d.mat_.isHMaskSet(row, col)) {
|
||
|
mask = (d.mat_.masks_[row][col] >> 2) & 31;
|
||
|
}
|
||
|
assert(gapsAllowed || mask == (1 << 4) || mask == 0);
|
||
|
int opts = alts5[mask];
|
||
|
int select = -1;
|
||
|
if(opts == 1) {
|
||
|
select = firsts5[mask];
|
||
|
assert_geq(mask, 0);
|
||
|
d.mat_.hMaskSet(row, col, 0);
|
||
|
} else if(opts > 1) {
|
||
|
#if 1
|
||
|
if( (mask & 16) != 0) {
|
||
|
select = 4; // H diag
|
||
|
} else if((mask & 1) != 0) {
|
||
|
select = 0; // H up
|
||
|
} else if((mask & 4) != 0) {
|
||
|
select = 2; // F up
|
||
|
} else if((mask & 2) != 0) {
|
||
|
select = 1; // H left
|
||
|
} else if((mask & 8) != 0) {
|
||
|
select = 3; // E left
|
||
|
}
|
||
|
#else
|
||
|
select = randFromMask(rnd, mask);
|
||
|
#endif
|
||
|
assert_geq(mask, 0);
|
||
|
mask &= ~(1 << select);
|
||
|
assert(gapsAllowed || mask == (1 << 4) || mask == 0);
|
||
|
d.mat_.hMaskSet(row, col, mask);
|
||
|
branch = true;
|
||
|
} else { /* No way to backtrack! */ }
|
||
|
if(select != -1) {
|
||
|
if(select == 4) {
|
||
|
cur = SW_BT_OALL_DIAG;
|
||
|
} else if(select == 0) {
|
||
|
cur = SW_BT_OALL_REF_OPEN;
|
||
|
} else if(select == 1) {
|
||
|
cur = SW_BT_OALL_READ_OPEN;
|
||
|
} else if(select == 2) {
|
||
|
cur = SW_BT_RFGAP_EXTEND;
|
||
|
} else {
|
||
|
assert_eq(3, select)
|
||
|
cur = SW_BT_RDGAP_EXTEND;
|
||
|
}
|
||
|
} else {
|
||
|
empty = true;
|
||
|
// It's empty, so the only question left is whether we should be
|
||
|
// allowed in terimnate in this cell. If it's got a valid score
|
||
|
// then we *shouldn't* be allowed to terminate here because that
|
||
|
// means it's part of a larger alignment that was already reported.
|
||
|
canMoveThru = (origMask == 0);
|
||
|
}
|
||
|
}
|
||
|
assert(!empty || !canMoveThru || ct == SSEMatrix::H);
|
||
|
}
|
||
|
}
|
||
|
d.mat_.setReportedThrough(row, col);
|
||
|
assert_eq(gaps, Edit::numGaps(ned));
|
||
|
assert_leq(gaps, rdgap_ + rfgap_);
|
||
|
// Cell was involved in a previously-reported alignment?
|
||
|
if(!canMoveThru) {
|
||
|
if(!btnstack_.empty()) {
|
||
|
// Remove all the cells from list back to and including the
|
||
|
// cell where the branch occurred
|
||
|
btcells_.resize(btnstack_.back().celsz);
|
||
|
// Pop record off the top of the stack
|
||
|
ned.resize(btnstack_.back().nedsz);
|
||
|
//aed.resize(btnstack_.back().aedsz);
|
||
|
row = btnstack_.back().row;
|
||
|
col = btnstack_.back().col;
|
||
|
gaps = btnstack_.back().gaps;
|
||
|
readGaps = btnstack_.back().readGaps;
|
||
|
refGaps = btnstack_.back().refGaps;
|
||
|
score = btnstack_.back().score;
|
||
|
ct = btnstack_.back().ct;
|
||
|
btnstack_.pop_back();
|
||
|
assert(!sc_->monotone || score.score() >= escore);
|
||
|
NEW_ROW_COL(row, col);
|
||
|
continue;
|
||
|
} else {
|
||
|
// No branch points to revisit; just give up
|
||
|
res.reset();
|
||
|
met.btfail++; // DP backtraces failed
|
||
|
return false;
|
||
|
}
|
||
|
}
|
||
|
assert(!reportedThru);
|
||
|
assert(!sc_->monotone || score.score() >= minsc_);
|
||
|
if(empty || row == 0) {
|
||
|
assert_eq(SSEMatrix::H, ct);
|
||
|
btcells_.expand();
|
||
|
btcells_.back().first = row;
|
||
|
btcells_.back().second = col;
|
||
|
// This cell is at the end of a legitimate alignment
|
||
|
trimBeg = row;
|
||
|
assert_eq(btcells_.size(), dpRows() - trimBeg - trimEnd + readGaps);
|
||
|
break;
|
||
|
}
|
||
|
if(branch) {
|
||
|
// Add a frame to the backtrack stack
|
||
|
btnstack_.expand();
|
||
|
btnstack_.back().init(
|
||
|
ned.size(),
|
||
|
0, // aed.size()
|
||
|
btcells_.size(),
|
||
|
row,
|
||
|
col,
|
||
|
gaps,
|
||
|
readGaps,
|
||
|
refGaps,
|
||
|
score,
|
||
|
(int)ct);
|
||
|
}
|
||
|
btcells_.expand();
|
||
|
btcells_.back().first = row;
|
||
|
btcells_.back().second = col;
|
||
|
switch(cur) {
|
||
|
// Move up and to the left. If the reference nucleotide in the
|
||
|
// source row mismatches the read nucleotide, penalize
|
||
|
// it and add a nucleotide mismatch.
|
||
|
case SW_BT_OALL_DIAG: {
|
||
|
assert_gt(row, 0); assert_gt(col, 0);
|
||
|
// Check for color mismatch
|
||
|
int readC = (*rd_)[row];
|
||
|
int refNmask = (int)rf_[rfi_+col];
|
||
|
assert_gt(refNmask, 0);
|
||
|
int m = matchesEx(readC, refNmask);
|
||
|
ct = SSEMatrix::H;
|
||
|
if(m != 1) {
|
||
|
Edit e(
|
||
|
(int)row,
|
||
|
mask2dna[refNmask],
|
||
|
"ACGTN"[readC],
|
||
|
EDIT_TYPE_MM);
|
||
|
assert(e.repOk());
|
||
|
assert(ned.empty() || ned.back().pos >= row);
|
||
|
ned.push_back(e);
|
||
|
int pen = QUAL2(row, col);
|
||
|
score.score_ -= pen;
|
||
|
assert(!sc_->monotone || score.score() >= escore);
|
||
|
} else {
|
||
|
// Reward a match
|
||
|
int64_t bonus = sc_->match(30);
|
||
|
score.score_ += bonus;
|
||
|
assert(!sc_->monotone || score.score() >= escore);
|
||
|
}
|
||
|
if(m == -1) {
|
||
|
score.ns_++;
|
||
|
}
|
||
|
row--; col--;
|
||
|
MOVE_ALL_UPLEFT();
|
||
|
assert(VALID_AL_SCORE(score));
|
||
|
break;
|
||
|
}
|
||
|
// Move up. Add an edit encoding the ref gap.
|
||
|
case SW_BT_OALL_REF_OPEN:
|
||
|
{
|
||
|
assert_gt(row, 0);
|
||
|
Edit e(
|
||
|
(int)row,
|
||
|
'-',
|
||
|
"ACGTN"[(int)(*rd_)[row]],
|
||
|
EDIT_TYPE_REF_GAP);
|
||
|
assert(e.repOk());
|
||
|
assert(ned.empty() || ned.back().pos >= row);
|
||
|
ned.push_back(e);
|
||
|
assert_geq(row, (size_t)sc_->gapbar);
|
||
|
assert_geq((int)(rdf_-rdi_-row-1), sc_->gapbar-1);
|
||
|
row--;
|
||
|
ct = SSEMatrix::H;
|
||
|
int pen = sc_->refGapOpen();
|
||
|
score.score_ -= pen;
|
||
|
assert(!sc_->monotone || score.score() >= minsc_);
|
||
|
gaps++; refGaps++;
|
||
|
assert_eq(gaps, Edit::numGaps(ned));
|
||
|
assert_leq(gaps, rdgap_ + rfgap_);
|
||
|
MOVE_ALL_UP();
|
||
|
break;
|
||
|
}
|
||
|
// Move up. Add an edit encoding the ref gap.
|
||
|
case SW_BT_RFGAP_EXTEND:
|
||
|
{
|
||
|
assert_gt(row, 1);
|
||
|
Edit e(
|
||
|
(int)row,
|
||
|
'-',
|
||
|
"ACGTN"[(int)(*rd_)[row]],
|
||
|
EDIT_TYPE_REF_GAP);
|
||
|
assert(e.repOk());
|
||
|
assert(ned.empty() || ned.back().pos >= row);
|
||
|
ned.push_back(e);
|
||
|
assert_geq(row, (size_t)sc_->gapbar);
|
||
|
assert_geq((int)(rdf_-rdi_-row-1), sc_->gapbar-1);
|
||
|
row--;
|
||
|
ct = SSEMatrix::F;
|
||
|
int pen = sc_->refGapExtend();
|
||
|
score.score_ -= pen;
|
||
|
assert(!sc_->monotone || score.score() >= minsc_);
|
||
|
gaps++; refGaps++;
|
||
|
assert_eq(gaps, Edit::numGaps(ned));
|
||
|
assert_leq(gaps, rdgap_ + rfgap_);
|
||
|
MOVE_ALL_UP();
|
||
|
break;
|
||
|
}
|
||
|
case SW_BT_OALL_READ_OPEN:
|
||
|
{
|
||
|
assert_gt(col, 0);
|
||
|
Edit e(
|
||
|
(int)row+1,
|
||
|
mask2dna[(int)rf_[rfi_+col]],
|
||
|
'-',
|
||
|
EDIT_TYPE_READ_GAP);
|
||
|
assert(e.repOk());
|
||
|
assert(ned.empty() || ned.back().pos >= row);
|
||
|
ned.push_back(e);
|
||
|
assert_geq(row, (size_t)sc_->gapbar);
|
||
|
assert_geq((int)(rdf_-rdi_-row-1), sc_->gapbar-1);
|
||
|
col--;
|
||
|
ct = SSEMatrix::H;
|
||
|
int pen = sc_->readGapOpen();
|
||
|
score.score_ -= pen;
|
||
|
assert(!sc_->monotone || score.score() >= minsc_);
|
||
|
gaps++; readGaps++;
|
||
|
assert_eq(gaps, Edit::numGaps(ned));
|
||
|
assert_leq(gaps, rdgap_ + rfgap_);
|
||
|
MOVE_ALL_LEFT();
|
||
|
break;
|
||
|
}
|
||
|
case SW_BT_RDGAP_EXTEND:
|
||
|
{
|
||
|
assert_gt(col, 1);
|
||
|
Edit e(
|
||
|
(int)row+1,
|
||
|
mask2dna[(int)rf_[rfi_+col]],
|
||
|
'-',
|
||
|
EDIT_TYPE_READ_GAP);
|
||
|
assert(e.repOk());
|
||
|
assert(ned.empty() || ned.back().pos >= row);
|
||
|
ned.push_back(e);
|
||
|
assert_geq(row, (size_t)sc_->gapbar);
|
||
|
assert_geq((int)(rdf_-rdi_-row-1), sc_->gapbar-1);
|
||
|
col--;
|
||
|
ct = SSEMatrix::E;
|
||
|
int pen = sc_->readGapExtend();
|
||
|
score.score_ -= pen;
|
||
|
assert(!sc_->monotone || score.score() >= minsc_);
|
||
|
gaps++; readGaps++;
|
||
|
assert_eq(gaps, Edit::numGaps(ned));
|
||
|
assert_leq(gaps, rdgap_ + rfgap_);
|
||
|
MOVE_ALL_LEFT();
|
||
|
break;
|
||
|
}
|
||
|
default: throw 1;
|
||
|
}
|
||
|
} // while((int)row > 0)
|
||
|
assert_geq(col, 0);
|
||
|
assert_eq(SSEMatrix::H, ct);
|
||
|
// The number of cells in the backtracs should equal the number of read
|
||
|
// bases after trimming plus the number of gaps
|
||
|
assert_eq(btcells_.size(), dpRows() - trimBeg - trimEnd + readGaps);
|
||
|
// Check whether we went through a core diagonal and set 'reported' flag on
|
||
|
// each cell
|
||
|
bool overlappedCoreDiag = false;
|
||
|
for(size_t i = 0; i < btcells_.size(); i++) {
|
||
|
size_t rw = btcells_[i].first;
|
||
|
size_t cl = btcells_[i].second;
|
||
|
// Calculate the diagonal within the *trimmed* rectangle, i.e. the
|
||
|
// rectangle we dealt with in align, gather and backtrack.
|
||
|
int64_t diagi = cl - rw;
|
||
|
// Now adjust to the diagonal within the *untrimmed* rectangle by
|
||
|
// adding on the amount trimmed from the left.
|
||
|
diagi += rect_->triml;
|
||
|
if(diagi >= 0) {
|
||
|
size_t diag = (size_t)diagi;
|
||
|
if(diag >= rect_->corel && diag <= rect_->corer) {
|
||
|
overlappedCoreDiag = true;
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
#ifndef NDEBUG
|
||
|
//assert(!d.mat_.reportedThrough(rw, cl));
|
||
|
//d.mat_.setReportedThrough(rw, cl);
|
||
|
assert(d.mat_.reportedThrough(rw, cl));
|
||
|
#endif
|
||
|
}
|
||
|
if(!overlappedCoreDiag) {
|
||
|
// Must overlap a core diagonal. Otherwise, we run the risk of
|
||
|
// reporting an alignment that overlaps (and trumps) a higher-scoring
|
||
|
// alignment that lies partially outside the dynamic programming
|
||
|
// rectangle.
|
||
|
res.reset();
|
||
|
met.corerej++;
|
||
|
return false;
|
||
|
}
|
||
|
int readC = (*rd_)[rdi_+row]; // get last char in read
|
||
|
int refNmask = (int)rf_[rfi_+col]; // get last ref char ref involved in aln
|
||
|
assert_gt(refNmask, 0);
|
||
|
int m = matchesEx(readC, refNmask);
|
||
|
if(m != 1) {
|
||
|
Edit e((int)row, mask2dna[refNmask], "ACGTN"[readC], EDIT_TYPE_MM);
|
||
|
assert(e.repOk());
|
||
|
assert(ned.empty() || ned.back().pos >= row);
|
||
|
ned.push_back(e);
|
||
|
score.score_ -= QUAL2(row, col);
|
||
|
assert_geq(score.score(), minsc_);
|
||
|
} else {
|
||
|
score.score_ += sc_->match(30);
|
||
|
}
|
||
|
if(m == -1) {
|
||
|
score.ns_++;
|
||
|
}
|
||
|
if(score.ns_ > nceil_) {
|
||
|
// Alignment has too many Ns in it!
|
||
|
res.reset();
|
||
|
met.nrej++;
|
||
|
return false;
|
||
|
}
|
||
|
res.reverse();
|
||
|
assert(Edit::repOk(ned, (*rd_)));
|
||
|
assert_eq(score.score(), escore);
|
||
|
assert_leq(gaps, rdgap_ + rfgap_);
|
||
|
off = col;
|
||
|
assert_lt(col + (size_t)rfi_, (size_t)rff_);
|
||
|
score.gaps_ = gaps;
|
||
|
res.alres.setScore(score);
|
||
|
res.alres.setShape(
|
||
|
refidx_, // ref id
|
||
|
off + rfi_ + rect_->refl, // 0-based ref offset
|
||
|
reflen_, // reference length
|
||
|
fw_, // aligned to Watson?
|
||
|
rdf_ - rdi_, // read length
|
||
|
0, // read ID
|
||
|
true, // pretrim soft?
|
||
|
0, // pretrim 5' end
|
||
|
0, // pretrim 3' end
|
||
|
true, // alignment trim soft?
|
||
|
fw_ ? trimBeg : trimEnd, // alignment trim 5' end
|
||
|
fw_ ? trimEnd : trimBeg); // alignment trim 3' end
|
||
|
size_t refns = 0;
|
||
|
for(size_t i = col; i <= origCol; i++) {
|
||
|
if((int)rf_[rfi_+i] > 15) {
|
||
|
refns++;
|
||
|
}
|
||
|
}
|
||
|
res.alres.setRefNs(refns);
|
||
|
assert(Edit::repOk(ned, (*rd_), true, trimBeg, trimEnd));
|
||
|
assert(res.repOk());
|
||
|
#ifndef NDEBUG
|
||
|
size_t gapsCheck = 0;
|
||
|
for(size_t i = 0; i < ned.size(); i++) {
|
||
|
if(ned[i].isGap()) gapsCheck++;
|
||
|
}
|
||
|
assert_eq(gaps, gapsCheck);
|
||
|
BTDnaString refstr;
|
||
|
for(size_t i = col; i <= origCol; i++) {
|
||
|
refstr.append(firsts5[(int)rf_[rfi_+i]]);
|
||
|
}
|
||
|
BTDnaString editstr;
|
||
|
Edit::toRef((*rd_), ned, editstr, true, trimBeg, trimEnd);
|
||
|
if(refstr != editstr) {
|
||
|
cerr << "Decoded nucleotides and edits don't match reference:" << endl;
|
||
|
cerr << " score: " << score.score()
|
||
|
<< " (" << gaps << " gaps)" << endl;
|
||
|
cerr << " edits: ";
|
||
|
Edit::print(cerr, ned);
|
||
|
cerr << endl;
|
||
|
cerr << " decoded nucs: " << (*rd_) << endl;
|
||
|
cerr << " edited nucs: " << editstr << endl;
|
||
|
cerr << " reference nucs: " << refstr << endl;
|
||
|
assert(0);
|
||
|
}
|
||
|
#endif
|
||
|
met.btsucc++; // DP backtraces succeeded
|
||
|
return true;
|
||
|
}
|