hisat-3n/aligner_sw.cpp

3215 lines
102 KiB
C++
Raw Normal View History

2025-01-18 13:09:52 +00:00
/*
* Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
*
* This file is part of Bowtie 2.
*
* Bowtie 2 is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Bowtie 2 is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
*/
#include <limits>
// -- BTL remove --
//#include <stdlib.h>
//#include <sys/time.h>
// -- --
#include "aligner_sw.h"
#include "aligner_result.h"
#include "search_globals.h"
#include "scoring.h"
#include "mask.h"
/**
* Initialize with a new read.
*/
void SwAligner::initRead(
const BTDnaString& rdfw, // forward read sequence
const BTDnaString& rdrc, // revcomp read sequence
const BTString& qufw, // forward read qualities
const BTString& qurc, // reverse read qualities
size_t rdi, // offset of first read char to align
size_t rdf, // offset of last read char to align
const Scoring& sc) // scoring scheme
{
assert_gt(rdf, rdi);
int nceil = sc.nCeil.f<int>((double)rdfw.length());
rdfw_ = &rdfw; // read sequence
rdrc_ = &rdrc; // read sequence
qufw_ = &qufw; // read qualities
qurc_ = &qurc; // read qualities
rdi_ = rdi; // offset of first read char to align
rdf_ = rdf; // offset of last read char to align
sc_ = &sc; // scoring scheme
nceil_ = nceil; // max # Ns allowed in ref portion of aln
readSse16_ = false; // true -> sse16 from now on for this read
initedRead_ = true;
#ifndef NO_SSE
sseU8fwBuilt_ = false; // built fw query profile, 8-bit score
sseU8rcBuilt_ = false; // built rc query profile, 8-bit score
sseI16fwBuilt_ = false; // built fw query profile, 16-bit score
sseI16rcBuilt_ = false; // built rc query profile, 16-bit score
#endif
}
/**
* Initialize with a new alignment problem.
*/
void SwAligner::initRef(
bool fw, // whether to forward or revcomp read is aligning
TRefId refidx, // id of reference aligned against
const DPRect& rect, // DP rectangle
char *rf, // reference sequence
size_t rfi, // offset of first reference char to align to
size_t rff, // offset of last reference char to align to
TRefOff reflen, // length of reference sequence
const Scoring& sc, // scoring scheme
TAlScore minsc, // minimum score
bool enable8, // use 8-bit SSE if possible?
size_t cminlen, // minimum length for using checkpointing scheme
size_t cpow2, // interval b/t checkpointed diags; 1 << this
bool doTri, // triangular mini-fills?
bool extend) // is this a seed extension?
{
size_t readGaps = sc.maxReadGaps(minsc, rdfw_->length());
size_t refGaps = sc.maxRefGaps(minsc, rdfw_->length());
assert_geq(readGaps, 0);
assert_geq(refGaps, 0);
assert_gt(rff, rfi);
rdgap_ = readGaps; // max # gaps in read
rfgap_ = refGaps; // max # gaps in reference
state_ = STATE_INITED;
fw_ = fw; // orientation
rd_ = fw ? rdfw_ : rdrc_; // read sequence
qu_ = fw ? qufw_ : qurc_; // quality sequence
refidx_ = refidx; // id of reference aligned against
rf_ = rf; // reference sequence
rfi_ = rfi; // offset of first reference char to align to
rff_ = rff; // offset of last reference char to align to
reflen_ = reflen; // length of entire reference sequence
rect_ = &rect; // DP rectangle
minsc_ = minsc; // minimum score
cural_ = 0; // idx of next alignment to give out
initedRef_ = true; // indicate we've initialized the ref portion
enable8_ = enable8; // use 8-bit SSE if possible?
extend_ = extend; // true iff this is a seed extension
cperMinlen_ = cminlen; // reads shorter than this won't use checkpointer
cperPerPow2_ = cpow2; // interval b/t checkpointed diags; 1 << this
cperEf_ = true; // whether to checkpoint H, E, and F
cperTri_ = doTri; // triangular mini-fills?
bter_.initRef(
fw_ ? rdfw_->buf() : // in: read sequence
rdrc_->buf(),
fw_ ? qufw_->buf() : // in: quality sequence
qurc_->buf(),
// daehwan
// rd_->length(), // in: read sequence length
rdf_ - rdi_,
rf_ + rfi_, // in: reference sequence
rff_ - rfi_, // in: in-rectangle reference sequence length
reflen, // in: total reference sequence length
refidx_, // in: reference id
rfi_, // in: reference offset
fw_, // in: orientation
rect_, // in: DP rectangle
&cper_, // in: checkpointer
*sc_, // in: scoring scheme
nceil_); // in: N ceiling
}
/**
* Given a read, an alignment orientation, a range of characters in a referece
* sequence, and a bit-encoded version of the reference, set up and execute the
* corresponding dynamic programming problem.
*
* The caller has already narrowed down the relevant portion of the reference
* using, e.g., the location of a seed hit, or the range of possible fragment
* lengths if we're searching for the opposite mate in a pair.
*/
void SwAligner::initRef(
bool fw, // whether to forward or revcomp read is aligning
TRefId refidx, // reference aligned against
const DPRect& rect, // DP rectangle
const BitPairReference& refs, // Reference strings
TRefOff reflen, // length of reference sequence
const Scoring& sc, // scoring scheme
TAlScore minsc, // minimum score
bool enable8, // use 8-bit SSE if possible?
size_t cminlen, // minimum length for using checkpointing scheme
size_t cpow2, // interval b/t checkpointed diags; 1 << this
bool doTri, // triangular mini-fills?
bool extend, // true iff this is a seed extension
size_t upto, // count the number of Ns up to this offset
size_t& nsUpto) // output: the number of Ns up to 'upto'
{
TRefOff rfi = rect.refl;
TRefOff rff = rect.refr + 1;
assert_gt(rff, rfi);
// Capture an extra reference character outside the rectangle so that we
// can check matches in the next column over to the right
rff++;
// rflen = full length of the reference substring to consider, including
// overhang off the boundaries of the reference sequence
const size_t rflen = (size_t)(rff - rfi);
// Figure the number of Ns we're going to add to either side
size_t leftNs =
(rfi >= 0 ? 0 : (size_t)std::abs(static_cast<long>(rfi)));
leftNs = min(leftNs, rflen);
size_t rightNs =
(rff <= (TRefOff)reflen ? 0 : (size_t)std::abs(static_cast<long>(rff - reflen)));
rightNs = min(rightNs, rflen);
// rflenInner = length of just the portion that doesn't overhang ref ends
assert_geq(rflen, leftNs + rightNs);
const size_t rflenInner = rflen - (leftNs + rightNs);
#ifndef NDEBUG
bool haveRfbuf2 = false;
EList<char> rfbuf2(rflen);
// This is really slow, so only do it some of the time
if((rand() % 10) == 0) {
TRefOff rfii = rfi;
for(size_t i = 0; i < rflen; i++) {
if(rfii < 0 || (TRefOff)rfii >= reflen) {
rfbuf2.push_back(4);
} else {
rfbuf2.push_back(refs.getBase(refidx, (uint32_t)rfii));
}
rfii++;
}
haveRfbuf2 = true;
}
#endif
// rfbuf_ = uint32_t list large enough to accommodate both the reference
// sequence and any Ns we might add to either side.
rfwbuf_.resize((rflen + 16) / 4);
int offset = refs.getStretch(
rfwbuf_.ptr(), // buffer to store words in
refidx, // which reference
(rfi < 0) ? 0 : (size_t)rfi, // starting offset (can't be < 0)
rflenInner // length to grab (exclude overhang)
ASSERT_ONLY(, tmp_destU32_));// for BitPairReference::getStretch()
assert_leq(offset, 16);
rf_ = (char*)rfwbuf_.ptr() + offset;
// Shift ref chars away from 0 so we can stick Ns at the beginning
if(leftNs > 0) {
// Slide everyone down
for(size_t i = rflenInner; i > 0; i--) {
rf_[i+leftNs-1] = rf_[i-1];
}
// Add Ns
for(size_t i = 0; i < leftNs; i++) {
rf_[i] = 4;
}
}
if(rightNs > 0) {
// Add Ns to the end
for(size_t i = 0; i < rightNs; i++) {
rf_[i + leftNs + rflenInner] = 4;
}
}
#ifndef NDEBUG
// Sanity check reference characters
for(size_t i = 0; i < rflen; i++) {
assert(!haveRfbuf2 || rf_[i] == rfbuf2[i]);
assert_range(0, 4, (int)rf_[i]);
}
#endif
// Count Ns and convert reference characters into A/C/G/T masks. Ambiguous
// nucleotides (IUPAC codes) have more than one mask bit set. If a
// reference scanner was provided, use it to opportunistically resolve seed
// hits.
nsUpto = 0;
for(size_t i = 0; i < rflen; i++) {
// rf_[i] gets mask version of refence char, with N=16
if(i < upto && rf_[i] > 3) {
nsUpto++;
}
rf_[i] = (1 << rf_[i]);
}
// Correct for having captured an extra reference character
rff--;
initRef(
fw, // whether to forward or revcomp read is aligning
refidx, // id of reference aligned against
rect, // DP rectangle
rf_, // reference sequence, wrapped up in BTString object
0, // use the whole thing
(size_t)(rff - rfi), // ditto
reflen, // reference length
sc, // scoring scheme
minsc, // minimum score
enable8, // use 8-bit SSE if possible?
cminlen, // minimum length for using checkpointing scheme
cpow2, // interval b/t checkpointed diags; 1 << this
doTri, // triangular mini-fills?
extend); // true iff this is a seed extension
}
/**
* Given a read, an alignment orientation, a range of characters in a referece
* sequence, and a bit-encoded version of the reference, set up and execute the
* corresponding ungapped alignment problem. There can only be one solution.
*
* The caller has already narrowed down the relevant portion of the reference
* using, e.g., the location of a seed hit, or the range of possible fragment
* lengths if we're searching for the opposite mate in a pair.
*/
int SwAligner::ungappedAlign(
const BTDnaString& rd, // read sequence (could be RC)
const BTString& qu, // qual sequence (could be rev)
const Coord& coord, // coordinate aligned to
const BitPairReference& refs, // Reference strings
size_t reflen, // length of reference sequence
const Scoring& sc, // scoring scheme
bool ohang, // allow overhang?
TAlScore minsc, // minimum score
SwResult& res) // put alignment result here
{
const size_t len = rd.length();
int nceil = sc.nCeil.f<int>((double)len);
int ns = 0;
TRefOff rfi = coord.off();
TRefOff rff = rfi + (TRefOff)len;
TRefId refidx = coord.ref();
assert_gt(rff, rfi);
// Figure the number of Ns we're going to add to either side
size_t leftNs = 0;
if(rfi < 0) {
if(ohang) {
leftNs = (size_t)(-rfi);
} else {
return 0;
}
}
size_t rightNs = 0;
if(rff > (TRefOff)reflen) {
if(ohang) {
rightNs = (size_t)(rff - (TRefOff)reflen);
} else {
return 0;
}
}
if((leftNs + rightNs) > (size_t)nceil) {
return 0;
}
// rflenInner = length of just the portion that doesn't overhang ref ends
assert_geq(len, leftNs + rightNs);
const size_t rflenInner = len - (leftNs + rightNs);
#ifndef NDEBUG
bool haveRfbuf2 = false;
EList<char> rfbuf2(len);
// This is really slow, so only do it some of the time
if((rand() % 10) == 0) {
TRefOff rfii = rfi;
for(size_t i = 0; i < len; i++) {
if(rfii < 0 || (size_t)rfii >= reflen) {
rfbuf2.push_back(4);
} else {
rfbuf2.push_back(refs.getBase(refidx, (uint32_t)rfii));
}
rfii++;
}
haveRfbuf2 = true;
}
#endif
// rfbuf_ = uint32_t list large enough to accommodate both the reference
// sequence and any Ns we might add to either side.
rfwbuf_.resize((len + 16) / 4);
int offset = refs.getStretch(
rfwbuf_.ptr(), // buffer to store words in
refidx, // which reference
(rfi < 0) ? 0 : (size_t)rfi, // starting offset (can't be < 0)
rflenInner // length to grab (exclude overhang)
ASSERT_ONLY(, tmp_destU32_));// for BitPairReference::getStretch()
assert_leq(offset, 16);
rf_ = (char*)rfwbuf_.ptr() + offset;
// Shift ref chars away from 0 so we can stick Ns at the beginning
if(leftNs > 0) {
// Slide everyone down
for(size_t i = rflenInner; i > 0; i--) {
rf_[i+leftNs-1] = rf_[i-1];
}
// Add Ns
for(size_t i = 0; i < leftNs; i++) {
rf_[i] = 4;
}
}
if(rightNs > 0) {
// Add Ns to the end
for(size_t i = 0; i < rightNs; i++) {
rf_[i + leftNs + rflenInner] = 4;
}
}
#ifndef NDEBUG
// Sanity check reference characters
for(size_t i = 0; i < len; i++) {
assert(!haveRfbuf2 || rf_[i] == rfbuf2[i]);
assert_range(0, 4, (int)rf_[i]);
}
#endif
// Count Ns and convert reference characters into A/C/G/T masks. Ambiguous
// nucleotides (IUPAC codes) have more than one mask bit set. If a
// reference scanner was provided, use it to opportunistically resolve seed
// hits.
TAlScore score = 0;
res.alres.reset();
size_t rowi = 0;
size_t rowf = len-1;
if(sc.monotone) {
for(size_t i = 0; i < len; i++) {
// rf_[i] gets mask version of refence char, with N=16
assert_geq(qu[i], 33);
score += sc.score(rd[i], (int)(1 << rf_[i]), qu[i] - 33, ns);
assert_leq(score, 0);
if(score < minsc || ns > nceil) {
// Fell below threshold
return 0;
}
}
// Got a result! Fill in the rest of the result object.
} else {
// Definitely ways to short-circuit this. E.g. if diff between cur
// score and minsc can't be met by matches.
TAlScore floorsc = 0;
TAlScore scoreMax = floorsc;
size_t lastfloor = 0;
rowi = MAX_SIZE_T;
size_t sols = 0;
for(size_t i = 0; i < len; i++) {
score += sc.score(rd[i], (int)(1 << rf_[i]), qu[i] - 33, ns);
if(score >= minsc && score >= scoreMax) {
scoreMax = score;
rowf = i;
if(rowi != lastfloor) {
rowi = lastfloor;
sols++;
}
}
if(score <= floorsc) {
score = floorsc;
lastfloor = i+1;
}
}
if(ns > nceil || scoreMax < minsc) {
// Too many Ns
return 0;
}
if(sols > 1) {
// >1 distinct solution in this diag; defer to DP aligner
return -1;
}
score = scoreMax;
// Got a result! Fill in the rest of the result object.
}
// Now fill in the edits
res.alres.setScore(AlnScore(score, ns, 0));
assert_geq(rowf, rowi);
EList<Edit>& ned = res.alres.ned();
size_t refns = 0;
ASSERT_ONLY(BTDnaString refstr);
for(size_t i = rowi; i <= rowf; i++) {
ASSERT_ONLY(refstr.append((int)rf_[i]));
if(rf_[i] > 3 || rd[i] != rf_[i]) {
// Add edit
Edit e((int)i,
mask2dna[1 << (int)rf_[i]],
"ACGTN"[(int)rd[i]],
EDIT_TYPE_MM);
ned.push_back(e);
if(rf_[i] > 3) {
refns++;
}
}
}
assert(Edit::repOk(ned, rd));
bool fw = coord.fw();
assert_leq(rowf, len-1);
size_t trimEnd = (len-1) - rowf;
res.alres.setShape(
coord.ref(), // ref id
coord.off()+rowi, // 0-based ref offset
reflen, // length of reference sequence aligned to
fw, // aligned to Watson?
len, // read length
0, // read ID
true, // pretrim soft?
0, // pretrim 5' end
0, // pretrim 3' end
true, // alignment trim soft?
fw ? rowi : trimEnd, // alignment trim 5' end
fw ? trimEnd : rowi); // alignment trim 3' end
res.alres.setRefNs(refns);
assert(res.repOk());
#ifndef NDEBUG
BTDnaString editstr;
Edit::toRef(rd, ned, editstr, true, rowi, trimEnd);
if(refstr != editstr) {
cerr << "Decoded nucleotides and edits don't match reference:" << endl;
cerr << " score: " << res.alres.score().score() << endl;
cerr << " edits: ";
Edit::print(cerr, ned);
cerr << endl;
cerr << " decoded nucs: " << rd << endl;
cerr << " edited nucs: " << editstr << endl;
cerr << " reference nucs: " << refstr << endl;
assert(0);
}
#endif
if(!fw) {
// All edits are currently w/r/t upstream end; if read aligned to Crick
// strand, invert them to be w/r/t 5' end instead.
res.alres.invertEdits();
}
return 1;
}
/**
* Align read 'rd' to reference using read & reference information given
* last time init() was called.
*/
bool SwAligner::align(
RandomSource& rnd, // source of pseudo-randoms
TAlScore& best) // best alignment score observed in DP matrix
{
assert(initedRef() && initedRead());
assert_eq(STATE_INITED, state_);
state_ = STATE_ALIGNED;
// Reset solutions lists
btncand_.clear();
btncanddone_.clear();
btncanddoneSucc_ = btncanddoneFail_ = 0;
best = std::numeric_limits<TAlScore>::min();
sse8succ_ = sse16succ_ = false;
int flag = 0;
size_t rdlen = rdf_ - rdi_;
bool checkpointed = rdlen >= cperMinlen_;
bool gathered = false; // Did gathering happen along with alignment?
if(sc_->monotone) {
// End-to-end
if(enable8_ && !readSse16_ && minsc_ >= -254) {
// 8-bit end-to-end
if(checkpointed) {
best = alignGatherEE8(flag, false);
if(flag == 0) {
gathered = true;
}
} else {
best = alignNucleotidesEnd2EndSseU8(flag, false);
#ifndef NDEBUG
int flagtmp = 0;
TAlScore besttmp = alignGatherEE8(flagtmp, true); // debug
assert_eq(flagtmp, flag);
assert_eq(besttmp, best);
#endif
}
sse8succ_ = (flag == 0);
#ifndef NDEBUG
{
int flag2 = 0;
TAlScore best2 = alignNucleotidesEnd2EndSseI16(flag2, true);
{
int flagtmp = 0;
TAlScore besttmp = alignGatherEE16(flagtmp, true);
assert_eq(flagtmp, flag2);
assert(flag2 != 0 || best2 == besttmp);
}
assert(flag < 0 || best == best2);
sse16succ_ = (flag2 == 0);
}
#endif /*ndef NDEBUG*/
} else {
// 16-bit end-to-end
if(checkpointed) {
best = alignGatherEE16(flag, false);
if(flag == 0) {
gathered = true;
}
} else {
best = alignNucleotidesEnd2EndSseI16(flag, false);
#ifndef NDEBUG
int flagtmp = 0;
TAlScore besttmp = alignGatherEE16(flagtmp, true);
assert_eq(flagtmp, flag);
assert_eq(besttmp, best);
#endif
}
sse16succ_ = (flag == 0);
}
} else {
// Local
flag = -2;
if(enable8_ && !readSse16_) {
// 8-bit local
if(checkpointed) {
best = alignGatherLoc8(flag, false);
if(flag == 0) {
gathered = true;
}
} else {
best = alignNucleotidesLocalSseU8(flag, false);
#ifndef NDEBUG
int flagtmp = 0;
TAlScore besttmp = alignGatherLoc8(flagtmp, true);
assert_eq(flag, flagtmp);
assert_eq(best, besttmp);
#endif
}
}
if(flag == -2) {
// 16-bit local
flag = 0;
if(checkpointed) {
best = alignNucleotidesLocalSseI16(flag, false);
best = alignGatherLoc16(flag, false);
if(flag == 0) {
gathered = true;
}
} else {
best = alignNucleotidesLocalSseI16(flag, false);
#ifndef NDEBUG
int flagtmp = 0;
TAlScore besttmp = alignGatherLoc16(flagtmp, true);
assert_eq(flag, flagtmp);
assert_eq(best, besttmp);
#endif
}
sse16succ_ = (flag == 0);
} else {
sse8succ_ = (flag == 0);
#ifndef NDEBUG
int flag2 = 0;
TAlScore best2 = alignNucleotidesLocalSseI16(flag2, true);
{
int flagtmp = 0;
TAlScore besttmp = alignGatherLoc16(flagtmp, true);
assert_eq(flag2, flagtmp);
assert(flag2 != 0 || best2 == besttmp);
}
assert(flag2 < 0 || best == best2);
sse16succ_ = (flag2 == 0);
#endif /*ndef NDEBUG*/
}
}
#ifndef NDEBUG
if(!checkpointed && (rand() & 15) == 0 && sse8succ_ && sse16succ_) {
SSEData& d8 = fw_ ? sseU8fw_ : sseU8rc_;
SSEData& d16 = fw_ ? sseI16fw_ : sseI16rc_;
assert_eq(d8.mat_.nrow(), d16.mat_.nrow());
assert_eq(d8.mat_.ncol(), d16.mat_.ncol());
for(size_t i = 0; i < d8.mat_.nrow(); i++) {
for(size_t j = 0; j < colstop_; j++) {
int h8 = d8.mat_.helt(i, j);
int h16 = d16.mat_.helt(i, j);
int e8 = d8.mat_.eelt(i, j);
int e16 = d16.mat_.eelt(i, j);
int f8 = d8.mat_.felt(i, j);
int f16 = d16.mat_.felt(i, j);
TAlScore h8s =
(sc_->monotone ? (h8 - 0xff ) : h8);
TAlScore h16s =
(sc_->monotone ? (h16 - 0x7fff) : (h16 + 0x8000));
TAlScore e8s =
(sc_->monotone ? (e8 - 0xff ) : e8);
TAlScore e16s =
(sc_->monotone ? (e16 - 0x7fff) : (e16 + 0x8000));
TAlScore f8s =
(sc_->monotone ? (f8 - 0xff ) : f8);
TAlScore f16s =
(sc_->monotone ? (f16 - 0x7fff) : (f16 + 0x8000));
if(h8s < minsc_) {
h8s = minsc_ - 1;
}
if(h16s < minsc_) {
h16s = minsc_ - 1;
}
if(e8s < minsc_) {
e8s = minsc_ - 1;
}
if(e16s < minsc_) {
e16s = minsc_ - 1;
}
if(f8s < minsc_) {
f8s = minsc_ - 1;
}
if(f16s < minsc_) {
f16s = minsc_ - 1;
}
if((h8 != 0 || (int16_t)h16 != (int16_t)0x8000) && h8 > 0) {
assert_eq(h8s, h16s);
}
if((e8 != 0 || (int16_t)e16 != (int16_t)0x8000) && e8 > 0) {
assert_eq(e8s, e16s);
}
if((f8 != 0 || (int16_t)f16 != (int16_t)0x8000) && f8 > 0) {
assert_eq(f8s, f16s);
}
}
}
}
#endif
assert(repOk());
cural_ = 0;
if(best == MIN_I64 || best < minsc_) {
return false;
}
if(!gathered) {
// Look for solutions using SSE matrix
assert(sse8succ_ || sse16succ_);
if(sc_->monotone) {
if(sse8succ_) {
gatherCellsNucleotidesEnd2EndSseU8(best);
#ifndef NDEBUG
if(sse16succ_) {
cand_tmp_ = btncand_;
gatherCellsNucleotidesEnd2EndSseI16(best);
cand_tmp_.sort();
btncand_.sort();
assert(cand_tmp_ == btncand_);
}
#endif /*ndef NDEBUG*/
} else {
gatherCellsNucleotidesEnd2EndSseI16(best);
}
} else {
if(sse8succ_) {
gatherCellsNucleotidesLocalSseU8(best);
#ifndef NDEBUG
if(sse16succ_) {
cand_tmp_ = btncand_;
gatherCellsNucleotidesLocalSseI16(best);
cand_tmp_.sort();
btncand_.sort();
assert(cand_tmp_ == btncand_);
}
#endif /*ndef NDEBUG*/
} else {
gatherCellsNucleotidesLocalSseI16(best);
}
}
}
if(!btncand_.empty()) {
btncand_.sort();
}
return !btncand_.empty();
}
/**
* Populate the given SwResult with information about the "next best"
* alignment if there is one. If there isn't one, false is returned. Note
* that false might be returned even though a call to done() would have
* returned false.
*/
bool SwAligner::nextAlignment(
SwResult& res,
TAlScore minsc,
RandomSource& rnd)
{
assert(initedRead() && initedRef());
assert_eq(STATE_ALIGNED, state_);
assert(repOk());
if(done()) {
res.reset();
return false;
}
assert(!done());
size_t off = 0, nbts = 0;
assert_lt(cural_, btncand_.size());
assert(res.repOk());
// For each candidate cell that we should try to backtrack from...
const size_t candsz = btncand_.size();
size_t SQ = dpRows() >> 4;
if(SQ == 0) SQ = 1;
size_t rdlen = rdf_ - rdi_;
bool checkpointed = rdlen >= cperMinlen_;
while(cural_ < candsz) {
// Doing 'continue' anywhere in here simply causes us to move on to the
// next candidate
if(btncand_[cural_].score < minsc) {
btncand_[cural_].fate = BT_CAND_FATE_FILT_SCORE;
nbtfiltsc_++; cural_++; continue;
}
nbts = 0;
assert(sse8succ_ || sse16succ_);
size_t row = btncand_[cural_].row;
size_t col = btncand_[cural_].col;
assert_lt(row, dpRows());
assert_lt((TRefOff)col, rff_-rfi_);
if(sse16succ_) {
SSEData& d = fw_ ? sseI16fw_ : sseI16rc_;
if(!checkpointed && d.mat_.reset_[row] && d.mat_.reportedThrough(row, col)) {
// Skipping this candidate because a previous candidate already
// moved through this cell
btncand_[cural_].fate = BT_CAND_FATE_FILT_START;
//cerr << " skipped becuase starting cell was covered" << endl;
nbtfiltst_++; cural_++; continue;
}
} else if(sse8succ_) {
SSEData& d = fw_ ? sseU8fw_ : sseU8rc_;
if(!checkpointed && d.mat_.reset_[row] && d.mat_.reportedThrough(row, col)) {
// Skipping this candidate because a previous candidate already
// moved through this cell
btncand_[cural_].fate = BT_CAND_FATE_FILT_START;
//cerr << " skipped becuase starting cell was covered" << endl;
nbtfiltst_++; cural_++; continue;
}
}
if(sc_->monotone) {
bool ret = false;
if(sse8succ_) {
uint32_t reseed = rnd.nextU32() + 1;
rnd.init(reseed);
res.reset();
if(checkpointed) {
size_t maxiter = MAX_SIZE_T;
size_t niter = 0;
ret = backtrace(
btncand_[cural_].score, // in: expected score
true, // in: use mini-fill?
true, // in: use checkpoints?
res, // out: store results (edits and scores) here
off, // out: store diagonal projection of origin
row, // start in this rectangle row
col, // start in this rectangle column
maxiter, // max # extensions to try
niter, // # extensions tried
rnd); // random gen, to choose among equal paths
} else {
ret = backtraceNucleotidesEnd2EndSseU8(
btncand_[cural_].score, // in: expected score
res, // out: store results (edits and scores) here
off, // out: store diagonal projection of origin
nbts, // out: # backtracks
row, // start in this rectangle row
col, // start in this rectangle column
rnd); // random gen, to choose among equal paths
}
#ifndef NDEBUG
// if(...) statement here should check not whether the primary
// alignment was checkpointed, but whether a checkpointed
// alignment was done at all.
if(!checkpointed) {
SwResult res2;
res2.alres = res.alres; res2.alres.reset();
size_t maxiter2 = MAX_SIZE_T;
size_t niter2 = 0;
bool ret2 = backtrace(
btncand_[cural_].score, // in: expected score
true, // in: use mini-fill?
true, // in: use checkpoints?
res2, // out: store results (edits and scores) here
off, // out: store diagonal projection of origin
row, // start in this rectangle row
col, // start in this rectangle column
maxiter2, // max # extensions to try
niter2, // # extensions tried
rnd); // random gen, to choose among equal paths
// After the first alignment, there's no guarantee we'll
// get the same answer from both backtrackers because of
// differences in how they handle marking cells as
// reported-through.
assert(cural_ > 0 || !ret || ret == ret2);
assert(cural_ > 0 || !ret || res.alres == res2.alres);
}
if(sse16succ_ && !checkpointed) {
SwResult res2;
res2.alres = res.alres; res2.alres.reset();
size_t off2, nbts2 = 0;
rnd.init(reseed);
bool ret2 = backtraceNucleotidesEnd2EndSseI16(
btncand_[cural_].score, // in: expected score
res2, // out: store results (edits and scores) here
off2, // out: store diagonal projection of origin
nbts2, // out: # backtracks
row, // start in this rectangle row
col, // start in this rectangle column
rnd); // random gen, to choose among equal paths
assert_eq(ret, ret2);
assert_eq(nbts, nbts2);
assert(!ret || res2.alres.score() == res.alres.score());
#if 0
if(!checkpointed && (rand() & 15) == 0) {
// Check that same cells are reported through
SSEData& d8 = fw_ ? sseU8fw_ : sseU8rc_;
SSEData& d16 = fw_ ? sseI16fw_ : sseI16rc_;
for(size_t i = d8.mat_.nrow(); i > 0; i--) {
for(size_t j = 0; j < d8.mat_.ncol(); j++) {
assert_eq(d8.mat_.reportedThrough(i-1, j),
d16.mat_.reportedThrough(i-1, j));
}
}
}
#endif
}
#endif
rnd.init(reseed+1); // debug/release pseudo-randoms in lock step
} else if(sse16succ_) {
uint32_t reseed = rnd.nextU32() + 1;
res.reset();
if(checkpointed) {
size_t maxiter = MAX_SIZE_T;
size_t niter = 0;
ret = backtrace(
btncand_[cural_].score, // in: expected score
true, // in: use mini-fill?
true, // in: use checkpoints?
res, // out: store results (edits and scores) here
off, // out: store diagonal projection of origin
row, // start in this rectangle row
col, // start in this rectangle column
maxiter, // max # extensions to try
niter, // # extensions tried
rnd); // random gen, to choose among equal paths
} else {
ret = backtraceNucleotidesEnd2EndSseI16(
btncand_[cural_].score, // in: expected score
res, // out: store results (edits and scores) here
off, // out: store diagonal projection of origin
nbts, // out: # backtracks
row, // start in this rectangle row
col, // start in this rectangle column
rnd); // random gen, to choose among equal paths
}
#ifndef NDEBUG
// if(...) statement here should check not whether the primary
// alignment was checkpointed, but whether a checkpointed
// alignment was done at all.
if(!checkpointed) {
SwResult res2;
size_t maxiter2 = MAX_SIZE_T;
size_t niter2 = 0;
bool ret2 = backtrace(
btncand_[cural_].score, // in: expected score
true, // in: use mini-fill?
true, // in: use checkpoints?
res2, // out: store results (edits and scores) here
off, // out: store diagonal projection of origin
row, // start in this rectangle row
col, // start in this rectangle column
maxiter2, // max # extensions to try
niter2, // # extensions tried
rnd); // random gen, to choose among equal paths
// After the first alignment, there's no guarantee we'll
// get the same answer from both backtrackers because of
// differences in how they handle marking cells as
// reported-through.
assert(cural_ > 0 || !ret || ret == ret2);
assert(cural_ > 0 || !ret || res.alres == res2.alres);
}
#endif
rnd.init(reseed); // debug/release pseudo-randoms in lock step
}
if(ret) {
btncand_[cural_].fate = BT_CAND_FATE_SUCCEEDED;
break;
} else {
btncand_[cural_].fate = BT_CAND_FATE_FAILED;
}
} else {
// Local alignment
// Check if this solution is "dominated" by a prior one.
// Domination is a heuristic designed to eliminate the vast
// majority of valid-but-redundant candidates lying in the
// "penumbra" of a high-scoring alignment.
bool dom = false;
{
size_t donesz = btncanddone_.size();
const size_t col = btncand_[cural_].col;
const size_t row = btncand_[cural_].row;
for(size_t i = 0; i < donesz; i++) {
assert_gt(btncanddone_[i].fate, 0);
size_t colhi = col, rowhi = row;
size_t rowlo = btncanddone_[i].row;
size_t collo = btncanddone_[i].col;
if(colhi < collo) swap(colhi, collo);
if(rowhi < rowlo) swap(rowhi, rowlo);
if(colhi - collo <= SQ && rowhi - rowlo <= SQ) {
// Skipping this candidate because it's "dominated" by
// a previous candidate
dom = true;
break;
}
}
}
if(dom) {
btncand_[cural_].fate = BT_CAND_FATE_FILT_DOMINATED;
nbtfiltdo_++;
cural_++;
continue;
}
bool ret = false;
if(sse8succ_) {
uint32_t reseed = rnd.nextU32() + 1;
res.reset();
rnd.init(reseed);
if(checkpointed) {
size_t maxiter = MAX_SIZE_T;
size_t niter = 0;
ret = backtrace(
btncand_[cural_].score, // in: expected score
true, // in: use mini-fill?
true, // in: use checkpoints?
res, // out: store results (edits and scores) here
off, // out: store diagonal projection of origin
row, // start in this rectangle row
col, // start in this rectangle column
maxiter, // max # extensions to try
niter, // # extensions tried
rnd); // random gen, to choose among equal paths
} else {
ret = backtraceNucleotidesLocalSseU8(
btncand_[cural_].score, // in: expected score
res, // out: store results (edits and scores) here
off, // out: store diagonal projection of origin
nbts, // out: # backtracks
row, // start in this rectangle row
col, // start in this rectangle column
rnd); // random gen, to choose among equal paths
}
#ifndef NDEBUG
// if(...) statement here should check not whether the primary
// alignment was checkpointed, but whether a checkpointed
// alignment was done at all.
if(!checkpointed) {
SwResult res2;
size_t maxiter2 = MAX_SIZE_T;
size_t niter2 = 0;
bool ret2 = backtrace(
btncand_[cural_].score, // in: expected score
true, // in: use mini-fill?
true, // in: use checkpoints?
res2, // out: store results (edits and scores) here
off, // out: store diagonal projection of origin
row, // start in this rectangle row
col, // start in this rectangle column
maxiter2, // max # extensions to try
niter2, // # extensions tried
rnd); // random gen, to choose among equal paths
// After the first alignment, there's no guarantee we'll
// get the same answer from both backtrackers because of
// differences in how they handle marking cells as
// reported-through.
assert(cural_ > 0 || !ret || ret == ret2);
assert(cural_ > 0 || !ret || res.alres == res2.alres);
}
if(!checkpointed && sse16succ_) {
SwResult res2;
size_t off2, nbts2 = 0;
rnd.init(reseed); // same b/t backtrace calls
bool ret2 = backtraceNucleotidesLocalSseI16(
btncand_[cural_].score, // in: expected score
res2, // out: store results (edits and scores) here
off2, // out: store diagonal projection of origin
nbts2, // out: # backtracks
row, // start in this rectangle row
col, // start in this rectangle column
rnd); // random gen, to choose among equal paths
assert_eq(ret, ret2);
assert_eq(nbts, nbts2);
assert(!ret || res2.alres.score() == res.alres.score());
#if 0
if(!checkpointed && (rand() & 15) == 0) {
// Check that same cells are reported through
SSEData& d8 = fw_ ? sseU8fw_ : sseU8rc_;
SSEData& d16 = fw_ ? sseI16fw_ : sseI16rc_;
for(size_t i = d8.mat_.nrow(); i > 0; i--) {
for(size_t j = 0; j < d8.mat_.ncol(); j++) {
assert_eq(d8.mat_.reportedThrough(i-1, j),
d16.mat_.reportedThrough(i-1, j));
}
}
}
#endif
}
#endif
rnd.init(reseed+1); // debug/release pseudo-randoms in lock step
} else if(sse16succ_) {
uint32_t reseed = rnd.nextU32() + 1;
res.reset();
if(checkpointed) {
size_t maxiter = MAX_SIZE_T;
size_t niter = 0;
ret = backtrace(
btncand_[cural_].score, // in: expected score
true, // in: use mini-fill?
true, // in: use checkpoints?
res, // out: store results (edits and scores) here
off, // out: store diagonal projection of origin
row, // start in this rectangle row
col, // start in this rectangle column
maxiter, // max # extensions to try
niter, // # extensions tried
rnd); // random gen, to choose among equal paths
} else {
ret = backtraceNucleotidesLocalSseI16(
btncand_[cural_].score, // in: expected score
res, // out: store results (edits and scores) here
off, // out: store diagonal projection of origin
nbts, // out: # backtracks
row, // start in this rectangle row
col, // start in this rectangle column
rnd); // random gen, to choose among equal paths
}
#ifndef NDEBUG
// if(...) statement here should check not whether the primary
// alignment was checkpointed, but whether a checkpointed
// alignment was done at all.
if(!checkpointed) {
SwResult res2;
size_t maxiter2 = MAX_SIZE_T;
size_t niter2 = 0;
bool ret2 = backtrace(
btncand_[cural_].score, // in: expected score
true, // in: use mini-fill?
true, // in: use checkpoints?
res2, // out: store results (edits and scores) here
off, // out: store diagonal projection of origin
row, // start in this rectangle row
col, // start in this rectangle column
maxiter2, // max # extensions to try
niter2, // # extensions tried
rnd); // random gen, to choose among equal paths
// After the first alignment, there's no guarantee we'll
// get the same answer from both backtrackers because of
// differences in how they handle marking cells as
// reported-through.
assert(cural_ > 0 || !ret || ret == ret2);
assert(cural_ > 0 || !ret || res.alres == res2.alres);
}
#endif
rnd.init(reseed); // same b/t backtrace calls
}
if(ret) {
btncand_[cural_].fate = BT_CAND_FATE_SUCCEEDED;
btncanddone_.push_back(btncand_[cural_]);
btncanddoneSucc_++;
assert(res.repOk());
break;
} else {
btncand_[cural_].fate = BT_CAND_FATE_FAILED;
btncanddone_.push_back(btncand_[cural_]);
btncanddoneFail_++;
}
}
cural_++;
} // while(cural_ < btncand_.size())
if(cural_ == btncand_.size()) {
assert(res.repOk());
return false;
}
assert(!res.alres.empty());
assert(res.repOk());
if(!fw_) {
// All edits are currently w/r/t upstream end; if read aligned
// to Crick strand, we need to invert them so that they're
// w/r/t the read's 5' end instead.
res.alres.invertEdits();
}
cural_++;
assert(res.repOk());
return true;
}
#ifdef MAIN_ALIGNER_SW
#include <sstream>
#include <utility>
#include <getopt.h>
#include "scoring.h"
#include "aligner_seed_policy.h"
int gGapBarrier;
int gSnpPhred;
static int bonusMatchType; // how to reward matches
static int bonusMatch; // constant if match bonus is a constant
static int penMmcType; // how to penalize mismatches
static int penMmc; // constant if mm pelanty is a constant
static int penNType; // how to penalize Ns in the read
static int penN; // constant if N pelanty is a constant
static bool nPairCat; // true -> concatenate mates before N filter
static int penRdExConst; // constant coeff for cost of gap in read
static int penRfExConst; // constant coeff for cost of gap in ref
static int penRdExLinear; // linear coeff for cost of gap in read
static int penRfExLinear; // linear coeff for cost of gap in ref
static float costMinConst; // constant coeff for min score w/r/t read len
static float costMinLinear; // linear coeff for min score w/r/t read len
static float costFloorConst; // constant coeff for score floor w/r/t read len
static float costFloorLinear;// linear coeff for score floor w/r/t read len
static float nCeilConst; // constant coeff for N ceiling w/r/t read len
static float nCeilLinear; // linear coeff for N ceiling w/r/t read len
static bool nCatPair; // concat mates before applying N filter?
static int multiseedMms; // mismatches permitted in a multiseed seed
static int multiseedLen; // length of multiseed seeds
static int multiseedIvalType;
static float multiseedIvalA;
static float multiseedIvalB;
static float posmin;
static float posfrac;
static float rowmult;
enum {
ARG_TESTS = 256
};
static const char *short_opts = "s:m:r:d:i:";
static struct option long_opts[] = {
{(char*)"snppen", required_argument, 0, 's'},
{(char*)"misspen", required_argument, 0, 'm'},
{(char*)"seed", required_argument, 0, 'r'},
{(char*)"align-policy", no_argument, 0, 'A'},
{(char*)"test", no_argument, 0, ARG_TESTS},
};
static void printUsage(ostream& os) {
os << "Usage: aligner_sw <read-seq> <ref-nuc-seq> [options]*" << endl;
os << "Options:" << endl;
os << " -s/--snppen <int> penalty incurred by SNP; used for decoding"
<< endl;
os << " -m/--misspen <int> quality to use for read chars" << endl;
os << " -r/-seed <int> seed for pseudo-random generator" << endl;
}
/**
* Parse a T from a string 's'
*/
template<typename T>
T parse(const char *s) {
T tmp;
stringstream ss(s);
ss >> tmp;
return tmp;
}
static EList<bool> stbuf, enbuf;
static BTDnaString btread;
static BTString btqual;
static BTString btref;
static BTString btref2;
static BTDnaString readrc;
static BTString qualrc;
/**
* Helper function for running a case consisting of a read (sequence
* and quality), a reference string, and an offset that anchors the 0th
* character of the read to a reference position.
*/
static void doTestCase(
SwAligner& al,
const BTDnaString& read,
const BTString& qual,
const BTString& refin,
TRefOff off,
EList<bool> *en,
const Scoring& sc,
TAlScore minsc,
SwResult& res,
bool nsInclusive,
bool filterns,
uint32_t seed)
{
RandomSource rnd(seed);
btref2 = refin;
assert_eq(read.length(), qual.length());
size_t nrow = read.length();
TRefOff rfi, rff;
// Calculate the largest possible number of read and reference gaps given
// 'minsc' and 'pens'
size_t maxgaps;
size_t padi, padf;
{
int readGaps = sc.maxReadGaps(minsc, read.length());
int refGaps = sc.maxRefGaps(minsc, read.length());
assert_geq(readGaps, 0);
assert_geq(refGaps, 0);
int maxGaps = max(readGaps, refGaps);
padi = 2 * maxGaps;
padf = maxGaps;
maxgaps = (size_t)maxGaps;
}
size_t nceil = (size_t)sc.nCeil.f((double)read.length());
size_t width = 1 + padi + padf;
rfi = off;
off = 0;
// Pad the beginning of the reference with Ns if necessary
if(rfi < padi) {
size_t beginpad = (size_t)(padi - rfi);
for(size_t i = 0; i < beginpad; i++) {
btref2.insert('N', 0);
off--;
}
rfi = 0;
} else {
rfi -= padi;
}
assert_geq(rfi, 0);
// Pad the end of the reference with Ns if necessary
while(rfi + nrow + padi + padf > btref2.length()) {
btref2.append('N');
}
rff = rfi + nrow + padi + padf;
// Convert reference string to masks
for(size_t i = 0; i < btref2.length(); i++) {
if(toupper(btref2[i]) == 'N' && !nsInclusive) {
btref2.set(16, i);
} else {
int num = 0;
int alts[] = {4, 4, 4, 4};
decodeNuc(toupper(btref2[i]), num, alts);
assert_leq(num, 4);
assert_gt(num, 0);
btref2.set(0, i);
for(int j = 0; j < num; j++) {
btref2.set(btref2[i] | (1 << alts[j]), i);
}
}
}
bool fw = true;
uint32_t refidx = 0;
size_t solwidth = width;
if(maxgaps >= solwidth) {
solwidth = 0;
} else {
solwidth -= maxgaps;
}
if(en == NULL) {
enbuf.resize(solwidth);
enbuf.fill(true);
en = &enbuf;
}
assert_geq(rfi, 0);
assert_gt(rff, rfi);
readrc = read;
qualrc = qual;
al.initRead(
read, // read sequence
readrc,
qual, // read qualities
qualrc,
0, // offset of first character within 'read' to consider
read.length(), // offset of last char (exclusive) in 'read' to consider
floorsc); // local-alignment score floor
al.initRef(
fw, // 'read' is forward version of read?
refidx, // id of reference aligned to
off, // offset of upstream ref char aligned against
btref2.wbuf(), // reference sequence (masks)
rfi, // offset of first char in 'ref' to consider
rff, // offset of last char (exclusive) in 'ref' to consider
width, // # bands to do (width of parallelogram)
solwidth, // # rightmost cols where solns can end
sc, // scoring scheme
minsc, // minimum score for valid alignment
maxgaps, // max of max # read gaps, ref gaps
0, // amount to truncate on left-hand side
en); // mask indicating which columns we can end in
if(filterns) {
al.filter((int)nceil);
}
al.align(rnd);
}
/**
* Another interface for running a case.
*/
static void doTestCase2(
SwAligner& al,
const char *read,
const char *qual,
const char *refin,
TRefOff off,
const Scoring& sc,
float costMinConst,
float costMinLinear,
SwResult& res,
bool nsInclusive = false,
bool filterns = false,
uint32_t seed = 0)
{
btread.install(read, true);
TAlScore minsc = (TAlScore)(Scoring::linearFunc(
btread.length(),
costMinConst,
costMinLinear));
TAlScore floorsc = (TAlScore)(Scoring::linearFunc(
btread.length(),
costFloorConst,
costFloorLinear));
btqual.install(qual);
btref.install(refin);
doTestCase(
al,
btread,
btqual,
btref,
off,
NULL,
sc,
minsc,
floorsc,
res,
nsInclusive,
filterns,
seed
);
}
/**
* Another interface for running a case.
*/
static void doTestCase3(
SwAligner& al,
const char *read,
const char *qual,
const char *refin,
TRefOff off,
Scoring& sc,
float costMinConst,
float costMinLinear,
float nCeilConst,
float nCeilLinear,
SwResult& res,
bool nsInclusive = false,
bool filterns = false,
uint32_t seed = 0)
{
btread.install(read, true);
// Calculate the penalty ceiling for the read
TAlScore minsc = (TAlScore)(Scoring::linearFunc(
btread.length(),
costMinConst,
costMinLinear));
TAlScore floorsc = (TAlScore)(Scoring::linearFunc(
btread.length(),
costFloorConst,
costFloorLinear));
btqual.install(qual);
btref.install(refin);
sc.nCeil.setType(SIMPLE_FUNC_LINEAR);
sc.nCeil.setConst(costMinConst);
sc.nCeil.setCoeff(costMinLinear);
doTestCase(
al,
btread,
btqual,
btref,
off,
NULL,
sc,
minsc,
floorsc,
res,
nsInclusive,
filterns,
seed
);
}
/**
* Another interface for running a case. Like doTestCase3 but caller specifies
* st_ and en_ lists.
*/
static void doTestCase4(
SwAligner& al,
const char *read,
const char *qual,
const char *refin,
TRefOff off,
EList<bool>& en,
Scoring& sc,
float costMinConst,
float costMinLinear,
float nCeilConst,
float nCeilLinear,
SwResult& res,
bool nsInclusive = false,
bool filterns = false,
uint32_t seed = 0)
{
btread.install(read, true);
// Calculate the penalty ceiling for the read
TAlScore minsc = (TAlScore)(Scoring::linearFunc(
btread.length(),
costMinConst,
costMinLinear));
TAlScore floorsc = (TAlScore)(Scoring::linearFunc(
btread.length(),
costFloorConst,
costFloorLinear));
btqual.install(qual);
btref.install(refin);
sc.nCeil.setType(SIMPLE_FUNC_LINEAR);
sc.nCeil.setConst(costMinConst);
sc.nCeil.setCoeff(costMinLinear);
doTestCase(
al,
btread,
btqual,
btref,
off,
&en,
sc,
minsc,
floorsc,
res,
nsInclusive,
filterns,
seed
);
}
/**
* Do a set of unit tests.
*/
static void doTests() {
bonusMatchType = DEFAULT_MATCH_BONUS_TYPE;
bonusMatch = DEFAULT_MATCH_BONUS;
penMmcType = DEFAULT_MM_PENALTY_TYPE;
penMmc = DEFAULT_MM_PENALTY;
penSnp = DEFAULT_SNP_PENALTY;
penNType = DEFAULT_N_PENALTY_TYPE;
penN = DEFAULT_N_PENALTY;
nPairCat = DEFAULT_N_CAT_PAIR;
penRdExConst = DEFAULT_READ_GAP_CONST;
penRfExConst = DEFAULT_REF_GAP_CONST;
penRdExLinear = DEFAULT_READ_GAP_LINEAR;
penRfExLinear = DEFAULT_REF_GAP_LINEAR;
costMinConst = DEFAULT_MIN_CONST;
costMinLinear = DEFAULT_MIN_LINEAR;
costFloorConst = DEFAULT_FLOOR_CONST;
costFloorLinear = DEFAULT_FLOOR_LINEAR;
nCeilConst = 1.0f; // constant factor in N ceil w/r/t read len
nCeilLinear = 0.1f; // coeff of linear term in N ceil w/r/t read len
multiseedMms = DEFAULT_SEEDMMS;
multiseedLen = DEFAULT_SEEDLEN;
// Set up penalities
Scoring sc(
bonusMatch,
penMmcType, // how to penalize mismatches
30, // constant if mm pelanty is a constant
30, // penalty for decoded SNP
costMinConst, // constant factor in N ceiling w/r/t read length
costMinLinear, // coeff of linear term in N ceiling w/r/t read length
costFloorConst, // constant factor in N ceiling w/r/t read length
costFloorLinear, // coeff of linear term in N ceiling w/r/t read length
nCeilConst, // constant factor in N ceiling w/r/t read length
nCeilLinear, // coeff of linear term in N ceiling w/r/t read length
penNType, // how to penalize Ns in the read
penN, // constant if N pelanty is a constant
nPairCat, // true -> concatenate mates before N filtering
25, // constant coeff for cost of gap in read
25, // constant coeff for cost of gap in ref
15, // linear coeff for cost of gap in read
15, // linear coeff for cost of gap in ref
1, // # rows at top/bot can only be entered diagonally
-1, // min row idx to backtrace from; -1 = no limit
false // sort results first by row then by score?
);
// Set up alternative penalities
Scoring sc2(
bonusMatch,
COST_MODEL_QUAL, // how to penalize mismatches
30, // constant if mm pelanty is a constant
30, // penalty for decoded SNP
costMinConst, // constant factor in N ceiling w/r/t read length
costMinLinear, // coeff of linear term in N ceiling w/r/t read length
costFloorConst, // constant factor in N ceiling w/r/t read length
costFloorLinear, // coeff of linear term in N ceiling w/r/t read length
1.0f, // constant factor in N ceiling w/r/t read length
1.0f, // coeff of linear term in N ceiling w/r/t read length
penNType, // how to penalize Ns in the read
penN, // constant if N pelanty is a constant
nPairCat, // true -> concatenate mates before N filtering
25, // constant coeff for cost of gap in read
25, // constant coeff for cost of gap in ref
15, // linear coeff for cost of gap in read
15, // linear coeff for cost of gap in ref
1, // # rows at top/bot can only be entered diagonally
-1, // min row idx to backtrace from; -1 = no limit
false // sort results first by row then by score?
);
SwResult res;
//
// Basic nucleotide-space tests
//
cerr << "Running tests..." << endl;
int tests = 1;
bool nIncl = false;
bool nfilter = false;
SwAligner al;
RandomSource rnd(73);
for(int i = 0; i < 3; i++) {
cerr << " Test " << tests++ << " (nuc space, offset "
<< (i*4) << ", exact)...";
sc.rdGapConst = 40;
sc.rfGapConst = 40;
sc.rdGapLinear = 15;
sc.rfGapLinear = 15;
// A C G T A C G T
// H E F H E F H E F H E F H E F H E F H E F H E F
// A 0 lo lo -30 lo lo -30 lo lo -30 lo lo 0 lo lo -30 lo lo-30 lo lo-30 lo lo
// C -30 lo -55 0 -85 -85 -55 -55 -85
// G -30 lo -70 -55 -85 -55 0 -100-100
// T -30 lo -85 -60 -85 -70 -55-100 -55
// A 0 lo -85 -55 -55 -85 -70 -70 -70
// C -30 lo -55 0 -85-100 -55 -55 -85
// G -30 lo -70 -55 -85 -55 0 -100-100
// T -30 lo -85 -60 -85 -70 -55-100 -55
doTestCase2(
al,
"ACGTACGT", // read
"IIIIIIII", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc, // scoring scheme
-30.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), 0);
assert_eq(res.alres.score().ns(), 0);
assert(res.alres.ned().empty());
assert(res.alres.aed().empty());
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
assert(al.done());
res.reset();
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", 1mm allowed by minsc)...";
sc.setMmPen(COST_MODEL_CONSTANT, 30);
//sc.setMatchBonus(10);
doTestCase2(
al,
"ACGTTCGT", // read
"IIIIIIII", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc, // scoring scheme
-30.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), -30);
assert_eq(res.alres.score().ns(), 0);
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
assert(al.done());
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", 1mm allowed by minsc, check qual 1)...";
doTestCase2(
al,
"ACGTTCGT", // read
"ABCDEFGH", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc2, // scoring scheme
-40.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
size_t lo, hi;
if(i == 0) {
lo = 0; hi = 1;
} else if(i == 1) {
lo = 1; hi = 2;
} else {
lo = 2; hi = 3;
}
for(size_t j = lo; j < hi; j++) {
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(j*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), -36);
assert_eq(res.alres.score().ns(), 0);
res.reset();
}
al.nextAlignment(res, rnd);
assert(res.empty());
res.reset();
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", 1mm allowed by minsc, check qual 2)...";
doTestCase2(
al,
"ACGAACGT", // read
"ABCDEFGH", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc2, // scoring scheme
-40.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), -35);
assert_eq(res.alres.score().ns(), 0);
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
res.reset();
assert(res.empty());
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", 1mm allowed by minsc, check qual )...";
assert(res.empty());
doTestCase2(
al,
"TCGTACGT", // read
"ABCDEFGH", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc2, // scoring scheme
-40.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), -32);
assert_eq(res.alres.score().ns(), 0);
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
res.reset();
assert(res.empty());
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", 1mm at the beginning, allowed by minsc)...";
doTestCase2(
al,
"CCGTACGT", // read
"IIIIIIII", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc, // scoring scheme
-30.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), -30);
assert_eq(res.alres.score().ns(), 0);
assert_eq(1, res.alres.ned().size());
assert_eq(0, res.alres.aed().size());
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
res.reset();
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", 1 n in read, allowed)...";
doTestCase3(
al,
"ACGTNCGT", // read
"IIIIIIII", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc, // scoring scheme
-30.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
1.0f, // allow 1 N
0.0f, // allow 1 N
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), -1);
assert_eq(res.alres.score().ns(), 1);
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
res.reset();
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", 2 n in read, allowed)...";
doTestCase3(
al,
"ACGNNCGT", // read
"IIIIIIII", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc, // scoring scheme
-30.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
2.0f, // const coeff for N ceiling
0.0f, // linear coeff for N ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), -2);
assert_eq(res.alres.score().ns(), 2);
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
res.reset();
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", 2 n in read, 1 at beginning, allowed)...";
doTestCase2(
al,
"NCGTNCGT", // read
"IIIIIIII", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc, // scoring scheme
-30.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), -2);
assert_eq(res.alres.score().ns(), 2);
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
res.reset();
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", 1 n in ref, allowed)...";
doTestCase2(
al,
"ACGTACGT", // read
"IIIIIIII", // qual
"ACGTNCGTACGTANGT", // ref in
i*4, // off
sc, // scoring scheme
-30.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), -1);
assert_eq(res.alres.score().ns(), 1);
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
res.reset();
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", 1mm disallowed by minsc)...";
doTestCase2(
al,
"ACGTTCGT", // read
"IIIIIIII", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc, // scoring scheme
-10.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
al.nextAlignment(res, rnd);
assert(res.empty());
assert(al.done());
// Read gap with equal read and ref gap penalties
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", read gap allowed by minsc)...";
assert(res.empty());
sc.rfGapConst = 25;
sc.rdGapConst = 25;
sc.rfGapLinear = 15;
sc.rdGapLinear = 15;
doTestCase2(
al,
"ACGTCGT", // read
"IIIIIII", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc, // scoring scheme
-40.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 1);
assert_eq(res.alres.score().score(), -40);
assert_eq(res.alres.score().ns(), 0);
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
res.reset();
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", read gap disallowed by minsc)...";
sc.rfGapConst = 25;
sc.rdGapConst = 25;
sc.rfGapLinear = 15;
sc.rdGapLinear = 15;
doTestCase2(
al,
"ACGTCGT", // read
"IIIIIII", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc, // scoring scheme
-30.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
al.nextAlignment(res, rnd);
assert(res.empty());
assert(al.done());
res.reset();
cerr << "PASSED" << endl;
// Ref gap with equal read and ref gap penalties
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", ref gap allowed by minsc)...";
doTestCase2(
al,
"ACGTAACGT", // read
"IIIIIIIII", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc, // scoring scheme
-40.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 1);
assert_eq(res.alres.score().score(), -40);
assert_eq(res.alres.score().ns(), 0);
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
res.reset();
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", read gap disallowed by gap barrier)...";
sc.rfGapConst = 25;
sc.rdGapConst = 25;
sc.rfGapLinear = 15;
sc.rdGapLinear = 15;
sc.gapbar = 4;
doTestCase2(
al,
"ACGTCGT", // read
"IIIIIII", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc, // scoring scheme
-40.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
sc.gapbar = 1;
al.nextAlignment(res, rnd);
assert(res.empty());
assert(al.done());
res.reset();
cerr << "PASSED" << endl;
// Ref gap with equal read and ref gap penalties
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", ref gap allowed by minsc, gapbar=3)...";
sc.gapbar = 3;
doTestCase2(
al,
"ACGTAACGT", // read
"IIIIIIIII", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc, // scoring scheme
-40.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
sc.gapbar = 1;
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 1);
assert_eq(res.alres.score().score(), -40);
assert_eq(res.alres.score().ns(), 0);
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
res.reset();
cerr << "PASSED" << endl;
// Ref gap with equal read and ref gap penalties
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", ref gap allowed by minsc, gapbar=4)...";
sc.gapbar = 4;
doTestCase2(
al,
"ACGTAACGT", // read
"IIIIIIIII", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc, // scoring scheme
-40.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
sc.gapbar = 1;
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 1);
assert_eq(res.alres.score().score(), -40);
assert_eq(res.alres.score().ns(), 0);
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
res.reset();
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", ref gap disallowed by minsc)...";
doTestCase2(
al,
"ACGTAACGT", // read
"IIIIIIIII", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc, // scoring scheme
-30.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
al.nextAlignment(res, rnd);
assert(res.empty());
res.reset();
assert(al.done());
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", ref gap disallowed by gap barrier)...";
sc.gapbar = 5;
doTestCase2(
al,
"ACGTAACGT", // read
"IIIIIIIII", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc, // scoring scheme
-40.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
sc.gapbar = 1;
al.nextAlignment(res, rnd);
assert(res.empty());
res.reset();
assert(al.done());
cerr << "PASSED" << endl;
// Read gap with one read gap and zero ref gaps allowed
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", 1 read gap, ref gaps disallowed by minsc)...";
sc.rfGapConst = 35;
sc.rdGapConst = 25;
sc.rfGapLinear = 20;
sc.rdGapLinear = 10;
doTestCase2(
al,
"ACGTCGT", // read
"IIIIIII", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc, // scoring scheme
-40.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 1);
assert_eq(res.alres.score().score(), -35);
assert_eq(res.alres.score().ns(), 0);
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
res.reset();
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", gaps disallowed by minsc)...";
sc.rfGapConst = 25;
sc.rdGapConst = 25;
sc.rfGapLinear = 10;
sc.rdGapLinear = 10;
doTestCase2(
al,
"ACGTCGT", // read
"IIIIIII", // qual
"ACGTACGTACGTACGT", // ref
i*4, // off
sc, // scoring scheme
-30.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
al.nextAlignment(res, rnd);
assert(res.empty());
res.reset();
assert(res.empty());
cerr << "PASSED" << endl;
// Ref gap with one ref gap and zero read gaps allowed
sc.rfGapConst = 25;
sc.rdGapConst = 35;
sc.rfGapLinear = 12;
sc.rdGapLinear = 22;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", 1 ref gap, read gaps disallowed by minsc)...";
assert(res.empty());
doTestCase2(
al,
"ACGTAACGT",
"IIIIIIIII",
"ACGTACGTACGTACGT",
i*4, // off
sc, // scoring scheme
-40.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 1);
assert_eq(res.alres.score().score(), -37);
assert_eq(res.alres.score().ns(), 0);
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
res.reset();
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", gaps disallowed by minsc)...";
doTestCase2(
al,
"ACGTAACGT",
"IIIIIIIII",
"ACGTACGTACGTACGT",
i*4, // off
sc, // scoring scheme
-30.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
al.nextAlignment(res, rnd);
assert(res.empty());
assert(al.done());
cerr << "PASSED" << endl;
// Read gap with one read gap and two ref gaps allowed
sc.rfGapConst = 20;
sc.rdGapConst = 25;
sc.rfGapLinear = 10;
sc.rdGapLinear = 15;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", 1 read gap, 2 ref gaps allowed by minsc)...";
doTestCase2(
al,
"ACGTCGT",
"IIIIIII",
"ACGTACGTACGTACGT",
i*4, // off
sc, // scoring scheme
-40.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 1);
assert_eq(res.alres.score().score(), -40);
assert_eq(res.alres.score().ns(), 0);
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
res.reset();
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", gaps disallowed by minsc)...";
doTestCase2(
al,
"ACGTCGT",
"IIIIIII",
"ACGTACGTACGTACGT",
i*4, // off
sc, // scoring scheme
-30.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
al.nextAlignment(res, rnd);
assert(res.empty());
assert(al.done());
cerr << "PASSED" << endl;
// Ref gap with one ref gap and two read gaps allowed
sc.rfGapConst = 25;
sc.rdGapConst = 11; // if this were 10, we'd have ties
sc.rfGapLinear = 15;
sc.rdGapLinear = 10;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", 1 ref gap, 2 read gaps allowed by minsc)...";
doTestCase2(
al,
"ACGTAACGT",
"IIIIIIIII",
"ACGTACGTACGTACGT",
i*4, // off
sc, // scoring scheme
-40.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 1);
assert_eq(res.alres.score().score(), -40);
assert_eq(res.alres.score().ns(), 0);
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
res.reset();
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4) << ", gaps disallowed by minsc)...";
doTestCase2(
al,
"ACGTAACGT",
"IIIIIIIII",
"ACGTACGTACGTACGT",
i*4, // off
sc, // scoring scheme
-30.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
al.nextAlignment(res, rnd);
assert(res.empty());
res.reset();
assert(al.done());
cerr << "PASSED" << endl;
// Read gap with two read gaps and two ref gaps allowed
sc.rfGapConst = 15;
sc.rdGapConst = 15;
sc.rfGapLinear = 10;
sc.rdGapLinear = 10;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", 2 ref gaps, 2 read gaps allowed by minsc)...";
doTestCase3(
al,
"ACGTCGT",
"IIIIIII",
"ACGTACGTACGTACGT",
i*4, // off
sc, // scoring scheme
-40.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
1.0, // const coeff for N ceiling
0.0, // linear coeff for N ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
true); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
if(!res.empty()) {
//al.printResultStacked(res, cerr); cerr << endl;
}
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 1);
assert_eq(res.alres.score().score(), -25);
assert_eq(res.alres.score().ns(), 0);
res.reset();
al.nextAlignment(res, rnd);
// The following alignment is possible when i == 2:
// ACGTACGTACGTACGTN
// A x
// C x
// G x
// T x
// C x
// G x
// T x
assert(i == 2 || res.empty());
res.reset();
cerr << "PASSED" << endl;
sc.rfGapConst = 10;
sc.rdGapConst = 10;
sc.rfGapLinear = 10;
sc.rdGapLinear = 10;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", 1 ref gap, 1 read gap allowed by minsc)...";
doTestCase2(
al,
"ACGTCGT",
"IIIIIII",
"ACGTACGTACGTACGT",
i*4, // off
sc, // scoring scheme
-30.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 1);
assert_eq(res.alres.score().score(), -20);
assert_eq(res.alres.score().ns(), 0);
res.reset();
cerr << "PASSED" << endl;
// Ref gap with two ref gaps and zero read gaps allowed
sc.rfGapConst = 15;
sc.rdGapConst = 15;
sc.rfGapLinear = 5;
sc.rdGapLinear = 5;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", 2 ref gaps, 2 read gaps allowed by minsc)...";
// Careful: it might be possible for the read to align with overhang
// instead of with a gap
doTestCase3(
al,
"ACGTAACGT",
"IIIIIIIII",
"ACGTACGTACGTACGT",
i*4, // off
sc, // scoring scheme
-35.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
1.0f, // needed to avoid overhang alignments
0.0f, // needed to avoid overhang alignments
res, // result
nIncl, // Ns inclusive (not mismatches)
true); // filter Ns
if(i == 0) {
lo = 0; hi = 1;
} else if(i == 1) {
lo = 1; hi = 2;
} else {
lo = 2; hi = 3;
}
for(size_t j = lo; j < hi; j++) {
al.nextAlignment(res, rnd);
assert(!res.empty());
//al.printResultStacked(res, cerr); cerr << endl;
assert(res.alres.refoff() == 0 ||
res.alres.refoff() == 4 ||
res.alres.refoff() == 8);
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 1);
assert_eq(res.alres.score().score(), -20);
assert_eq(res.alres.score().ns(), 0);
res.reset();
}
al.nextAlignment(res, rnd);
//assert(res.empty());
//res.reset();
cerr << "PASSED" << endl;
sc.rfGapConst = 25;
sc.rdGapConst = 25;
sc.rfGapLinear = 4;
sc.rdGapLinear = 4;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", 1 ref gap, 1 read gap allowed by minsc)...";
doTestCase2(
al,
"ACGTAACGT",
"IIIIIIIII",
"ACGTACGTACGTACGT",
i*4, // off
sc, // scoring scheme
-30.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 1);
assert_eq(res.alres.score().score(), -29);
assert_eq(res.alres.score().ns(), 0);
res.reset();
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", short read)...";
doTestCase2(
al,
"A",
"I",
"AAAAAAAAAAAA",
i*4, // off
sc, // scoring scheme
-30.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), 0);
assert_eq(res.alres.score().ns(), 0);
res.reset();
cerr << "PASSED" << endl;
if(i == 0) {
cerr << " Test " << tests++
<< " (nuc space, offset 0, short read & ref)...";
doTestCase2(
al,
"A",
"I",
"A",
0, // off
sc, // scoring scheme
-30.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), 0);
assert_eq(res.alres.score().ns(), 0);
res.reset();
cerr << "PASSED" << endl;
}
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", short read, many allowed gaps)...";
doTestCase2(
al,
"A",
"I",
"AAAAAAAAAAAA",
i*4, // off
sc, // scoring scheme
-150.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), 0);
assert_eq(res.alres.score().ns(), 0);
res.reset();
cerr << "PASSED" << endl;
if(i == 0) {
cerr << " Test " << tests++
<< " (nuc space, offset 0, short read & ref, "
<< "many allowed gaps)...";
doTestCase2(
al,
"A",
"I",
"A",
0, // off
sc, // scoring scheme
-150.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), 0);
assert_eq(res.alres.score().ns(), 0);
res.reset();
cerr << "PASSED" << endl;
}
}
// A test case where a valid alignment with a worse score should be
// accepted over a valid alignment with a better score but too many
// Ns
cerr << " Test " << tests++ << " (N ceiling 1)...";
sc.mmcostType = COST_MODEL_CONSTANT;
sc.mmcost = 10;
sc.snp = 30;
sc.nCeilConst = 0.0f;
sc.nCeilLinear = 0.0f;
sc.rfGapConst = 10;
sc.rdGapLinear = 10;
sc.rfGapConst = 10;
sc.rfGapLinear = 10;
sc.setNPen(COST_MODEL_CONSTANT, 2);
sc.gapbar = 1;
// No Ns allowed, so this hit should be filtered
doTestCase2(
al,
"ACGTACGT", // read seq
"IIIIIIII", // read quals
"NCGTACGT", // ref seq
0, // offset
sc, // scoring scheme
-25.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
false, // ns are in inclusive
true, // nfilter
0);
al.nextAlignment(res, rnd);
assert(res.empty());
cerr << "PASSED" << endl;
res.reset();
// 1 N allowed, so this hit should stand
cerr << " Test " << tests++ << " (N ceiling 2)...";
doTestCase3(
al,
"ACGTACGT", // read seq
"IIIIIIII", // read quals
"NCGTACGT", // ref seq
0, // offset
sc, // scoring scheme
-25.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
1.0f, // constant coefficient for # Ns allowed
0.0f, // linear coefficient for # Ns allowed
res, // result
false, // ns are in inclusive
false, // nfilter - NOTE: FILTER OFF
0);
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(0, res.alres.score().gaps());
assert_eq(-2, res.alres.score().score());
assert_eq(1, res.alres.score().ns());
cerr << "PASSED" << endl;
res.reset();
// 1 N allowed, but we set st_ such that this hit should not stand
for(size_t i = 0; i < 2; i++) {
cerr << " Test " << tests++ << " (N ceiling 2 with st_ override)...";
EList<bool> en;
en.resize(3); en.fill(true);
if(i == 1) {
en[1] = false;
}
sc.rfGapConst = 10;
sc.rdGapLinear = 10;
sc.rfGapConst = 10;
sc.rfGapLinear = 10;
doTestCase4(
al,
"ACGTACGT", // read seq
"IIIIIIII", // read quals
"NCGTACGT", // ref seq
0, // offset
en, // rectangle columns where solution can end
sc, // scoring scheme
-25.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
1.0f, // constant coefficient for # Ns allowed
0.0f, // linear coefficient for # Ns allowed
res, // result
false, // ns are in inclusive
false, // nfilter - NOTE: FILTER OFF
0);
al.nextAlignment(res, rnd);
if(i > 0) {
assert(res.empty());
} else {
assert(!res.empty());
}
cerr << "PASSED" << endl;
res.reset();
}
// No Ns allowed, so this hit should be filtered
cerr << " Test " << tests++ << " (N ceiling 3)...";
sc.nCeilConst = 1.0f;
sc.nCeilLinear = 0.0f;
doTestCase2(
al,
"ACGTACGT", // read seq
"IIIIIIII", // read quals
"NCGTACGT", // ref seq
0, // offset
sc, // scoring scheme
-25.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
false, // ns are in inclusive
true, // nfilter - NOTE: FILTER ON
0);
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(0, res.alres.score().gaps());
assert_eq(-2, res.alres.score().score());
assert_eq(1, res.alres.score().ns());
cerr << "PASSED" << endl;
res.reset();
// No Ns allowed, so this hit should be filtered
cerr << " Test " << tests++ << " (redundant alignment elimination 1)...";
sc.nCeilConst = 1.0f;
sc.nCeilLinear = 0.0f;
sc.rfGapConst = 25;
sc.rdGapLinear = 15;
sc.rfGapConst = 25;
sc.rfGapLinear = 15;
doTestCase2(
al,
// 1 2 3 4
// 01234567890123456789012345678901234567890123456
"AGGCTATGCCTCTGACGCGATATCGGCGCCCACTTCAGAGCTAACCG",
"IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII",
"TTTTTTTTAGGCTATGCCTCTGACGCGATATCGGCGCCCACTTCAGAGCTAACCGTTTTTTT",
// 01234567890123456789012345678901234567890123456789012345678901
// 1 2 3 4 5 6
8, // offset
sc, // scoring scheme
-25.0f, // const coeff for cost ceiling
-5.0f, // linear coeff for cost ceiling
res, // result
false, // ns are in inclusive
true, // nfilter - NOTE: FILTER ON
0);
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(8, res.alres.refoff());
assert_eq(47, res.alres.refExtent());
assert_eq(0, res.alres.score().gaps());
assert_eq(0, res.alres.score().score());
assert_eq(0, res.alres.score().ns());
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
assert(al.done());
cerr << "PASSED" << endl;
res.reset();
}
/**
* Do a set of unit tests for local alignment.
*/
static void doLocalTests() {
bonusMatchType = DEFAULT_MATCH_BONUS_TYPE;
bonusMatch = DEFAULT_MATCH_BONUS_LOCAL;
penMmcType = DEFAULT_MM_PENALTY_TYPE;
penMmc = DEFAULT_MM_PENALTY;
penSnp = DEFAULT_SNP_PENALTY;
penNType = DEFAULT_N_PENALTY_TYPE;
penN = DEFAULT_N_PENALTY;
nPairCat = DEFAULT_N_CAT_PAIR;
penRdExConst = DEFAULT_READ_GAP_CONST;
penRfExConst = DEFAULT_REF_GAP_CONST;
penRdExLinear = DEFAULT_READ_GAP_LINEAR;
penRfExLinear = DEFAULT_REF_GAP_LINEAR;
costMinConst = DEFAULT_MIN_CONST_LOCAL;
costMinLinear = DEFAULT_MIN_LINEAR_LOCAL;
costFloorConst = DEFAULT_FLOOR_CONST_LOCAL;
costFloorLinear = DEFAULT_FLOOR_LINEAR_LOCAL;
nCeilConst = 1.0f; // constant factor in N ceil w/r/t read len
nCeilLinear = 0.1f; // coeff of linear term in N ceil w/r/t read len
multiseedMms = DEFAULT_SEEDMMS;
multiseedLen = DEFAULT_SEEDLEN;
// Set up penalities
Scoring sc(
10,
penMmcType, // how to penalize mismatches
30, // constant if mm pelanty is a constant
penSnp, // penalty for decoded SNP
costMinConst, // constant factor in N ceiling w/r/t read length
costMinLinear, // coeff of linear term in N ceiling w/r/t read length
costFloorConst, // constant factor in N ceiling w/r/t read length
costFloorLinear, // coeff of linear term in N ceiling w/r/t read length
nCeilConst, // constant factor in N ceiling w/r/t read length
nCeilLinear, // coeff of linear term in N ceiling w/r/t read length
penNType, // how to penalize Ns in the read
penN, // constant if N pelanty is a constant
nPairCat, // true -> concatenate mates before N filtering
25, // constant coeff for cost of gap in read
25, // constant coeff for cost of gap in ref
15, // linear coeff for cost of gap in read
15, // linear coeff for cost of gap in ref
1, // # rows at top/bot can only be entered diagonally
-1, // min row idx to backtrace from; -1 = no limit
false // sort results first by row then by score?
);
SwResult res;
//
// Basic nucleotide-space tests
//
cerr << "Running local tests..." << endl;
int tests = 1;
bool nIncl = false;
bool nfilter = false;
SwAligner al;
RandomSource rnd(73);
for(int i = 0; i < 3; i++) {
cerr << " Test " << tests++ << " (short nuc space, offset "
<< (i*4) << ", exact)...";
sc.rdGapConst = 40;
sc.rfGapConst = 40;
doTestCase2(
al,
"ACGT", // read
"IIII", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc, // scoring scheme
0.0f, // const coeff for cost ceiling
8.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(4, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), 40);
assert_eq(res.alres.score().ns(), 0);
assert(res.alres.ned().empty());
assert(res.alres.aed().empty());
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
assert(al.done());
res.reset();
cerr << "PASSED" << endl;
// 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5
// A C G T A C G T A C G T A C G T
// 0 C
// 1 C x
// 2 G x
// 3 T x
cerr << " Test " << tests++ << " (short nuc space, offset "
<< (i*4) << ", 1mm)...";
sc.rdGapConst = 40;
sc.rfGapConst = 40;
doTestCase2(
al,
"CCGT", // read
"IIII", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc, // scoring scheme
0.0f, // const coeff for cost ceiling
7.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4+1, res.alres.refoff());
assert_eq(3, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), 30);
assert_eq(res.alres.score().ns(), 0);
assert(res.alres.ned().empty());
assert(res.alres.aed().empty());
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
assert(al.done());
res.reset();
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (short nuc space, offset "
<< (i*4) << ", 1mm)...";
sc.rdGapConst = 40;
sc.rfGapConst = 40;
doTestCase2(
al,
"ACGA", // read
"IIII", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc, // scoring scheme
0.0f, // const coeff for cost ceiling
7.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(3, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), 30);
assert_eq(res.alres.score().ns(), 0);
assert(res.alres.ned().empty());
assert(res.alres.aed().empty());
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
assert(al.done());
res.reset();
cerr << "PASSED" << endl;
if(i == 0) {
cerr << " Test " << tests++ << " (short nuc space, offset "
<< (i*4) << ", 1mm, match bonus=20)...";
sc.rdGapConst = 40;
sc.rfGapConst = 40;
sc.setMatchBonus(20);
doTestCase2(
al,
"TTGT", // read
"IIII", // qual
"TTGA", // ref in
i*4, // off
sc, // scoring scheme
25.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(3, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), 60);
assert_eq(res.alres.score().ns(), 0);
assert(res.alres.ned().empty());
assert(res.alres.aed().empty());
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
assert(al.done());
res.reset();
sc.setMatchBonus(10);
cerr << "PASSED" << endl;
}
cerr << " Test " << tests++ << " (nuc space, offset "
<< (i*4) << ", exact)...";
sc.rdGapConst = 40;
sc.rfGapConst = 40;
doTestCase2(
al,
"ACGTACGT", // read
"IIIIIIII", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc, // scoring scheme
0.0f, // const coeff for cost ceiling
8.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), 80);
assert_eq(res.alres.score().ns(), 0);
assert(res.alres.ned().empty());
assert(res.alres.aed().empty());
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
assert(al.done());
res.reset();
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (long nuc space, offset "
<< (i*8) << ", exact)...";
sc.rdGapConst = 40;
sc.rfGapConst = 40;
doTestCase2(
al,
"ACGTACGTACGTACGTACGTA", // read
"IIIIIIIIIIIIIIIIIIIII", // qual
"ACGTACGTACGTACGTACGTACGTACGTACGTACGTA", // ref in
// ACGTACGTACGTACGTACGT
// ACGTACGTACGTACGTACGT
// ACGTACGTACGTACGTACGT
i*8, // off
sc, // scoring scheme
0.0f, // const coeff for cost ceiling
8.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*8, res.alres.refoff());
assert_eq(21, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), 210);
assert_eq(res.alres.score().ns(), 0);
assert(res.alres.ned().empty());
assert(res.alres.aed().empty());
res.reset();
al.nextAlignment(res, rnd);
//assert(res.empty());
//assert(al.done());
res.reset();
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", 1mm allowed by minsc)...";
doTestCase2(
al,
"ACGTTCGT", // read
"IIIIIIII", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc, // scoring scheme
0.0f, // const coeff for cost ceiling
5.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), 40);
assert_eq(res.alres.score().ns(), 0);
res.reset();
al.nextAlignment(res, rnd);
//assert(res.empty());
//assert(al.done());
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (long nuc space, offset "
<< (i*8) << ", 6mm allowed by minsc)...";
sc.rdGapConst = 50;
sc.rfGapConst = 50;
sc.rdGapLinear = 45;
sc.rfGapLinear = 45;
doTestCase2(
al,
"ACGTACGATGCATCGTACGTA", // read
"IIIIIIIIIIIIIIIIIIIII", // qual
"ACGTACGTACGTACGTACGTACGTACGTACGTACGTA", // ref in
// ACGTACGTACGTACGTACGT
// ACGTACGTACGTACGTACGT
// ACGTACGTACGTACGTACGT
i*8, // off
sc, // scoring scheme
0.0f, // const coeff for cost ceiling
1.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*8 + 13, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), 80);
assert_eq(res.alres.score().ns(), 0);
assert(res.alres.ned().empty());
assert(res.alres.aed().empty());
res.reset();
al.nextAlignment(res, rnd);
res.reset();
cerr << "PASSED" << endl;
}
}
int main(int argc, char **argv) {
int option_index = 0;
int next_option;
unsigned seed = 0;
gGapBarrier = 1;
gSnpPhred = 30;
bool nsInclusive = false;
bool nfilter = false;
bonusMatchType = DEFAULT_MATCH_BONUS_TYPE;
bonusMatch = DEFAULT_MATCH_BONUS;
penMmcType = DEFAULT_MM_PENALTY_TYPE;
penMmc = DEFAULT_MM_PENALTY;
penSnp = DEFAULT_SNP_PENALTY;
penNType = DEFAULT_N_PENALTY_TYPE;
penN = DEFAULT_N_PENALTY;
penRdExConst = DEFAULT_READ_GAP_CONST;
penRfExConst = DEFAULT_REF_GAP_CONST;
penRdExLinear = DEFAULT_READ_GAP_LINEAR;
penRfExLinear = DEFAULT_REF_GAP_LINEAR;
costMinConst = DEFAULT_MIN_CONST;
costMinLinear = DEFAULT_MIN_LINEAR;
costFloorConst = DEFAULT_FLOOR_CONST;
costFloorLinear = DEFAULT_FLOOR_LINEAR;
nCeilConst = 1.0f; // constant factor in N ceiling w/r/t read length
nCeilLinear = 1.0f; // coeff of linear term in N ceiling w/r/t read length
nCatPair = false;
multiseedMms = DEFAULT_SEEDMMS;
multiseedLen = DEFAULT_SEEDLEN;
multiseedIvalType = DEFAULT_IVAL;
multiseedIvalA = DEFAULT_IVAL_A;
multiseedIvalB = DEFAULT_IVAL_B;
mhits = 1;
do {
next_option = getopt_long(argc, argv, short_opts, long_opts, &option_index);
switch (next_option) {
case 's': gSnpPhred = parse<int>(optarg); break;
case 'r': seed = parse<unsigned>(optarg); break;
case ARG_TESTS: {
doTests();
cout << "PASSED end-to-ends" << endl;
doLocalTests();
cout << "PASSED locals" << endl;
return 0;
}
case 'A': {
bool localAlign = false;
bool noisyHpolymer = false;
bool ignoreQuals = false;
SeedAlignmentPolicy::parseString(
optarg,
localAlign,
noisyHpolymer,
ignoreQuals,
bonusMatchType,
bonusMatch,
penMmcType,
penMmc,
penNType,
penN,
penRdExConst,
penRfExConst,
penRdExLinear,
penRfExLinear,
costMinConst,
costMinLinear,
costFloorConst,
costFloorLinear,
nCeilConst,
nCeilLinear,
nCatPair,
multiseedMms,
multiseedLen,
multiseedIvalType,
multiseedIvalA,
multiseedIvalB,
posmin);
break;
}
case -1: break;
default: {
cerr << "Unknown option: " << (char)next_option << endl;
printUsage(cerr);
exit(1);
}
}
} while(next_option != -1);
srand(seed);
if(argc - optind < 4) {
cerr << "Need at least 4 arguments" << endl;
printUsage(cerr);
exit(1);
}
BTDnaString read;
BTString ref, qual;
// Get read
read.installChars(argv[optind]);
// Get qualities
qual.install(argv[optind+1]);
assert_eq(read.length(), qual.length());
// Get reference
ref.install(argv[optind+2]);
// Get reference offset
size_t off = parse<size_t>(argv[optind+3]);
// Set up penalities
Scoring sc(
false, // local alignment?
false, // bad homopolymer?
bonusMatchType,
bonusMatch,
penMmcType, // how to penalize mismatches
penMmc, // constant if mm pelanty is a constant
costMinConst,
costMinLinear,
costFloorConst,
costFloorLinear,
nCeilConst, // N ceiling constant coefficient
nCeilLinear, // N ceiling linear coefficient
penNType, // how to penalize Ns in the read
penN, // constant if N pelanty is a constant
nCatPair, // true -> concatenate mates before N filtering
penRdExConst, // constant cost of extending a gap in the read
penRfExConst, // constant cost of extending a gap in the reference
penRdExLinear, // coeff of linear term for cost of gap extension in read
penRfExLinear // coeff of linear term for cost of gap extension in ref
);
// Calculate the penalty ceiling for the read
TAlScore minsc = Scoring::linearFunc(
read.length(),
costMinConst,
costMinLinear);
TAlScore floorsc = Scoring::linearFunc(
read.length(),
costFloorConst,
costFloorLinear);
SwResult res;
SwAligner al;
doTestCase(
al,
read,
qual,
ref,
off,
NULL,
sc,
minsc,
res,
nsInclusive,
nfilter,
seed);
}
#endif /*MAIN_ALIGNER_SW*/