/* * Copyright 2011, Ben Langmead * * This file is part of Bowtie 2. * * Bowtie 2 is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Bowtie 2 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>. */ /* * aligner_sw_driver.h * * REDUNDANT SEED HITS * * We say that two seed hits are redundant if they trigger identical * seed-extend dynamic programming problems. Put another way, they both lie on * the same diagonal of the overall read/reference dynamic programming matrix. * Detecting redundant seed hits is simple when the seed hits are ungapped. We * do this after offset resolution but before the offset is converted to genome * coordinates (see uses of the seenDiags1_/seenDiags2_ fields for examples). * * REDUNDANT ALIGNMENTS * * In an unpaired context, we say that two alignments are redundant if they * share any cells in the global DP table. Roughly speaking, this is like * saying that two alignments are redundant if any read character aligns to the * same reference character (same reference sequence, same strand, same offset) * in both alignments. * * In a paired-end context, we say that two paired-end alignments are redundant * if the mate #1s are redundant and the mate #2s are redundant. * * How do we enforce this? In the unpaired context, this is relatively simple: * the cells from each alignment are checked against a set containing all cells * from all previous alignments. Given a new alignment, for each cell in the * new alignment we check whether it is in the set. 
If there is any overlap, * the new alignment is rejected as redundant. Otherwise, the new alignment is * accepted and its cells are added to the set. * * Enforcement in a paired context is a little trickier. Consider the * following approaches: * * 1. Skip anchors that are redundant with any previous anchor or opposite * alignment. This is sufficient to ensure no two concordant alignments * found are redundant. * * 2. Same as scheme 1, but with a "transitive closure" scheme for finding all * concordant pairs in the vicinity of an anchor. Consider the AB/AC * scenario from the previous paragraph. If B is the anchor alignment, we * will find AB but not AC. But under this scheme, once we find AB we then * let B be a new anchor and immediately look for its opposites. Likewise, * if we find any opposite, we make them anchors and continue searching. We * don't stop searching until every opposite is used as an anchor. * * 3. Skip anchors that are redundant with any previous anchor alignment (but * allow anchors that are redundant with previous opposite alignments). * This isn't sufficient to avoid redundant concordant alignments. To avoid * redundant concordants, we need an additional procedure that checks each * new concordant alignment one-by-one against a list of previous concordant * alignments to see if it is redundant. * * We take approach 1. 
*/ #ifndef ALIGNER_SW_DRIVER_H_ #define ALIGNER_SW_DRIVER_H_ #include // -- BTL remove -- #include #include // -- -- #include #include "ds.h" #include "aligner_seed.h" #include "aligner_sw.h" #include "aligner_cache.h" #include "reference.h" #include "group_walk.h" #include "gfm.h" #include "mem_ids.h" #include "aln_sink.h" #include "pe.h" #include "ival_list.h" #include "simple_func.h" #include "random_util.h" #include "dp_framer.h" using namespace std; template struct SeedPos { SeedPos() : fw(false), offidx(0), rdoff(0), seedlen(0) { } SeedPos( bool fw_, index_t offidx_, index_t rdoff_, index_t seedlen_) { init(fw_, offidx_, rdoff_, seedlen_); } void init( bool fw_, index_t offidx_, index_t rdoff_, index_t seedlen_) { fw = fw_; offidx = offidx_; rdoff = rdoff_; seedlen = seedlen_; } bool operator<(const SeedPos& o) const { if(offidx < o.offidx) return true; if(offidx > o.offidx) return false; if(rdoff < o.rdoff) return true; if(rdoff > o.rdoff) return false; if(seedlen < o.seedlen) return true; if(seedlen > o.seedlen) return false; if(fw && !o.fw) return true; if(!fw && o.fw) return false; return false; } bool operator>(const SeedPos& o) const { if(offidx < o.offidx) return false; if(offidx > o.offidx) return true; if(rdoff < o.rdoff) return false; if(rdoff > o.rdoff) return true; if(seedlen < o.seedlen) return false; if(seedlen > o.seedlen) return true; if(fw && !o.fw) return false; if(!fw && o.fw) return true; return false; } bool operator==(const SeedPos& o) const { return fw == o.fw && offidx == o.offidx && rdoff == o.rdoff && seedlen == o.seedlen; } bool fw; index_t offidx; index_t rdoff; index_t seedlen; }; /** * An SATuple along with the associated seed position. 
*/ template struct SATupleAndPos { SATuple sat; // result for this seed hit SeedPos pos; // seed position that yielded the range this was taken from index_t origSz; // size of range this was taken from index_t nlex; // # position we can extend seed hit to left w/o edit index_t nrex; // # position we can extend seed hit to right w/o edit bool operator<(const SATupleAndPos& o) const { if(sat < o.sat) return true; if(sat > o.sat) return false; return pos < o.pos; } bool operator==(const SATupleAndPos& o) const { return sat == o.sat && pos == o.pos; } }; /** * Encapsulates the weighted random sampling scheme we want to use to pick * which seed hit range to sample a row from. */ template class RowSampler { public: RowSampler(int cat = 0) : elim_(cat), masses_(cat) { mass_ = 0.0f; } /** * Initialze sampler with respect to a range of elements in a list of * SATupleAndPos's. */ void init( const EList, 16>& salist, index_t sai, index_t saf, bool lensq, // whether to square the numerator, which = extended length bool szsq) // whether to square denominator, which = { assert_gt(saf, sai); elim_.resize(saf - sai); elim_.fill(false); // Initialize mass mass_ = 0.0f; masses_.resize(saf - sai); for(index_t i = sai; i < saf; i++) { index_t len = salist[i].nlex + salist[i].nrex + 1; // + salist[i].sat.key.len; double num = (double)len; if(lensq) { num *= num; } double denom = (double)salist[i].sat.size(); if(szsq) { denom *= denom; } masses_[i - sai] = num / denom; mass_ += masses_[i - sai]; } } /** * Caller is indicating that the bin at index i is exhausted and we should * exclude it from our sampling from now on. */ void finishedRange(index_t i) { assert_lt(i, masses_.size()); elim_[i] = true; mass_ -= masses_[i]; } /** * Sample randomly from the mass. 
*/ size_t next(RandomSource& rnd) { // Throw the dart double rd = rnd.nextFloat() * mass_; double mass_sofar = 0.0f; size_t sz = masses_.size(); size_t last_unelim = std::numeric_limits::max(); for(size_t i = 0; i < sz; i++) { if(!elim_[i]) { last_unelim = i; mass_sofar += masses_[i]; if(rd < mass_sofar) { // This is the one we hit return i; } } } assert_neq(std::numeric_limits::max(), last_unelim); return last_unelim; } protected: double mass_; // total probability mass to throw darts at EList elim_; // whether the range is eliminated EList masses_; // mass of each range }; /** * Return values from extendSeeds and extendSeedsPaired. */ enum { // All end-to-end and seed hits were examined // The policy does not need us to look any further EXTEND_EXHAUSTED_CANDIDATES = 1, EXTEND_POLICY_FULFILLED, // We stopped because we reached a point where the only remaining // alignments of interest have perfect scores, but we already investigated // perfect alignments EXTEND_PERFECT_SCORE, // We stopped because we ran up against a limit on how much work we should // do for one set of seed ranges, e.g. the limit on number of consecutive // unproductive DP extensions EXTEND_EXCEEDED_SOFT_LIMIT, // We stopped because we ran up against a limit on how much work we should // do for overall before giving up on a mate EXTEND_EXCEEDED_HARD_LIMIT }; /** * Data structure encapsulating a range that's been extended out in two * directions. 
 */
struct ExtendRange {

	// Set all three fields at once.
	void init(size_t off_, size_t len_, size_t sz_) {
		off = off_; len = len_; sz = sz_;
	}

	size_t off; // offset of extended region
	size_t len; // length between extremes of extended region
	size_t sz;  // # of elements in SA range
};

// Driver that turns seed hits into full alignments.  Holds the per-read
// scratch state (SA tuples, group walkers, random choosers, redundancy
// databases) used by eeSaTups(), extend() and prioritizeSATups().
//
// NOTE(review): in this copy of the file the template parameter list after
// `template` and the template arguments on several types (e.g. `PList`,
// `EList`, `GFM`, `SeedResults`) appear to have been stripped.  The code
// tokens below are kept exactly as found; restore the `<...>` lists from the
// upstream header before compiling.
template class SwDriver {

	typedef PList TSAList;

public:

	// 'bytes' is forwarded to pool_, the memory pool that eeSaTups() passes
	// to salistEe_.add().
	SwDriver(size_t bytes) :
		satups_(DP_CAT),
		gws_(DP_CAT),
		seenDiags1_(DP_CAT),
		seenDiags2_(DP_CAT),
		redAnchor_(DP_CAT),
		redMate1_(DP_CAT),
		redMate2_(DP_CAT),
		pool_(bytes, CACHE_PAGE_SZ, DP_CAT),
		salistEe_(DP_CAT),
		gwstate_(GW_CAT) { }

	/**
	 * Given a collection of SeedHits for a single read, extend seed alignments
	 * into full alignments.  Where possible, try to avoid redundant offset
	 * lookups and dynamic programming problems.  Optionally report alignments
	 * to a AlnSinkWrap object as they are discovered.
	 *
	 * The return type is int; the EXTEND_* enum declared earlier in this
	 * header is documented as the set of return values for this function.
	 *
	 * NOTE(review): older wording here said: if 'reportImmediately' is true,
	 * returns true iff a call to mhs->report() returned true (indicating that
	 * the reporting policy is satisfied and we can stop), otherwise returns
	 * false.  That does not match the int return type -- confirm against the
	 * definition.
	 */
	int extendSeeds(
		Read& rd,                    // read to align
		bool mate1,                  // true iff rd is mate #1
		SeedResults& sh,             // seed hits to extend into full alignments
		const GFM& gfmFw,            // BWT
		const GFM* gfmBw,            // BWT'
		const BitPairReference& ref, // Reference strings
		SwAligner& swa,              // dynamic programming aligner
		const Scoring& sc,           // scoring scheme
		int seedmms,                 // # mismatches allowed in seed
		int seedlen,                 // length of seed
		int seedival,                // interval between seeds
		TAlScore& minsc,             // minimum score for anchor
		int nceil,                   // maximum # Ns permitted in ref portion
		size_t maxhalf,              // maximum width on one side of DP table
		bool doUngapped,             // do ungapped alignment
		size_t maxIters,             // stop after this many seed-extend loop iters
		size_t maxUg,                // max # ungapped extends
		size_t maxDp,                // max # DPs
		size_t maxUgStreak,          // stop after streak of this many ungap fails
		size_t maxDpStreak,          // stop after streak of this many dp fails
		bool doExtend,               // do seed extension
		bool enable8,                // use 8-bit SSE where possible
		size_t cminlen,              // use checkpointer if read longer than this
		size_t cpow2,                // interval between diagonals to checkpoint
		bool doTri,                  // triangular mini-fills
		int tighten,                 // -M score tightening mode
		AlignmentCacheIface& ca,     // alignment cache for seed hits
		RandomSource& rnd,           // pseudo-random source
		WalkMetrics& wlm,            // group walk left metrics
		SwMetrics& swmSeed,          // DP metrics for seed-extend
		PerReadMetrics& prm,         // per-read metrics
		AlnSinkWrap* mhs,            // HitSink for multiseed-style aligner
		bool reportImmediately,      // whether to report hits immediately to mhs
		bool& exhaustive);

	/**
	 * Given a collection of SeedHits for a read pair, extend seed
	 * alignments into full alignments and then look for the opposite
	 * mate using dynamic programming.  Where possible, try to avoid
	 * redundant offset lookups.  Optionally report alignments to a
	 * AlnSinkWrap object as they are discovered.
	 *
	 * The return type is int; the EXTEND_* enum declared earlier in this
	 * header is documented as the set of return values for this function.
	 *
	 * NOTE(review): older wording here said: if 'reportImmediately' is true,
	 * returns true iff a call to mhs->report() returned true (indicating that
	 * the reporting policy is satisfied and we can stop), otherwise returns
	 * false.  That does not match the int return type, and the sink parameter
	 * here is named 'msink' -- confirm against the definition.
	 */
	int extendSeedsPaired(
		Read& rd,                    // mate to align as anchor
		Read& ord,                   // mate to align as opposite
		bool anchor1,                // true iff anchor mate is mate1
		bool oppFilt,                // true iff opposite mate was filtered out
		SeedResults& sh,             // seed hits for anchor
		const GFM& gfmFw,            // BWT
		const GFM* gfmBw,            // BWT'
		const BitPairReference& ref, // Reference strings
		SwAligner& swa,              // dyn programming aligner for anchor
		SwAligner& swao,             // dyn programming aligner for opposite
		const Scoring& sc,           // scoring scheme
		const PairedEndPolicy& pepol,// paired-end policy
		int seedmms,                 // # mismatches allowed in seed
		int seedlen,                 // length of seed
		int seedival,                // interval between seeds
		TAlScore& minsc,             // minimum score for anchor
		TAlScore& ominsc,            // minimum score for opposite
		int nceil,                   // max # Ns permitted in ref for anchor
		int onceil,                  // max # Ns permitted in ref for opposite
		bool nofw,                   // don't align forward read
		bool norc,                   // don't align revcomp read
		size_t maxhalf,              // maximum width on one side of DP table
		bool doUngapped,             // do ungapped alignment
		size_t maxIters,             // stop after this many seed-extend loop iters
		size_t maxUg,                // max # ungapped extends
		size_t maxDp,                // max # DPs
		size_t maxEeStreak,          // stop after streak of this many end-to-end fails
		size_t maxUgStreak,          // stop after streak of this many ungap fails
		size_t maxDpStreak,          // stop after streak of this many dp fails
		size_t maxMateStreak,        // stop seed range after N mate-find fails
		bool doExtend,               // do seed extension
		bool enable8,                // use 8-bit SSE where possible
		size_t cminlen,              // use checkpointer if read longer than this
		size_t cpow2,                // interval between diagonals to checkpoint
		bool doTri,                  // triangular mini-fills
		int tighten,                 // -M score tightening mode
		AlignmentCacheIface& cs,     // alignment cache for seed hits
		RandomSource& rnd,           // pseudo-random source
		WalkMetrics& wlm,            // group walk left metrics
		SwMetrics& swmSeed,          // DP metrics for seed-extend
		SwMetrics& swmMate,          // DP metrics for mate finding
		PerReadMetrics& prm,         // per-read metrics for anchor
		AlnSinkWrap* msink,          // AlnSink wrapper for multiseed-style aligner
		bool swMateImmediately,      // whether to look for mate immediately
		bool reportImmediately,      // whether to report hits immediately to msink
		bool discord,                // look for discordant alignments?
		bool mixed,                  // look for unpaired as well as paired alns?
		bool& exhaustive);

	/**
	 * Prepare for a new read.  Resets the seen-diagonal lists, the
	 * extended-seed-range lists for both mates and orientations, and the
	 * redundant-alignment databases.  The anchor database is re-initialized
	 * with the longer of the mate lengths that apply (mate1len alone when
	 * 'paired' is false); the per-mate databases are only reset and
	 * re-initialized when 'paired' is true.
	 */
	void nextRead(bool paired, size_t mate1len, size_t mate2len) {
		redAnchor_.reset();
		seenDiags1_.reset();
		seenDiags2_.reset();
		seedExRangeFw_[0].clear(); // mate 1 fw
		seedExRangeFw_[1].clear(); // mate 2 fw
		seedExRangeRc_[0].clear(); // mate 1 rc
		seedExRangeRc_[1].clear(); // mate 2 rc
		size_t maxlen = mate1len;
		if(paired) {
			redMate1_.reset();
			redMate1_.init(mate1len);
			redMate2_.reset();
			redMate2_.init(mate2len);
			if(mate2len > maxlen) {
				maxlen = mate2len;
			}
		}
		redAnchor_.init(maxlen);
	}

protected:

	// Set up satpos_/gws_/rands_/eehits_ for the end-to-end hits in 'sh'.
	// Returns false if an offset slot could not be added to salistEe_.
	bool eeSaTups(
		const Read& rd,              // read
		SeedResults& sh,             // seed hits to extend into full alignments
		const GFM& gfm,              // BWT
		const BitPairReference& ref, // Reference strings
		RandomSource& rnd,           // pseudo-random generator
		WalkMetrics& wlm,            // group walk left metrics
		SwMetrics& swmSeed,          // metrics for seed extensions
		index_t& nelt_out,           // out: # elements total
		index_t maxelts,             // max # elts to report
		bool all);                   // report all hits?

	// Count how far a seed hit can be extended to the left and right of the
	// seed without an edit; results are accumulated into nlex / nrex.
	void extend(
		const Read& rd,       // read
		const GFM& gfmFw,     // Forward Bowtie index
		const GFM* gfmBw,     // Backward Bowtie index
		index_t topf,         // top in fw index
		index_t botf,         // bot in fw index
		index_t topb,         // top in bw index
		index_t botb,         // bot in bw index
		bool fw,              // seed orientation
		index_t off,          // seed offset from 5' end
		index_t len,          // seed length
		PerReadMetrics& prm,  // per-read metrics
		index_t& nlex,        // # positions we can extend to left w/o edit
		index_t& nrex);       // # positions we can extend to right w/o edit

	// Set up satpos_/gws_/rands_ for the seed hits in 'sh', deciding which SA
	// elements to explore and in what order.
	void prioritizeSATups(
		const Read& rd,              // read
		SeedResults& sh,             // seed hits to extend into full alignments
		const GFM& gfmFw,            // BWT
		const GFM* gfmBw,            // BWT'
		const BitPairReference& ref, // Reference strings
		int seedmms,                 // # seed mismatches allowed
		index_t maxelt,              // max elts we'll consider
		bool doExtend,               // extend out seeds
		bool lensq,                  // square extended length
		bool szsq,                   // square SA range size
		index_t nsm,                 // if range as <= nsm elts, it's "small"
		AlignmentCacheIface& ca,     // alignment cache for seed hits
		RandomSource& rnd,           // pseudo-random generator
		WalkMetrics& wlm,            // group walk left metrics
		PerReadMetrics& prm,         // per-read metrics
		index_t& nelt_out,           // out: # elements total
		bool all);                   // report all hits?

	Random1toN rand_;           // random number generators
	EList rands_;               // random number generators
	EList rands2_;              // random number generators
	EList, 16> eehits_;         // holds end-to-end hits
	EList, 16> satpos_;         // holds SATuple, SeedPos pairs
	EList, 16> satpos2_;        // holds SATuple, SeedPos pairs
	EList, 16> satups_;         // holds SATuples to explore elements from
	EList > gws_;               // list of GroupWalks; no particular order
	EList mateStreaks_;         // mate-find fail streaks
	RowSampler rowsamp_;        // row sampler

	// Ranges that we've extended through when extending seed hits
	// (index 0 = mate 1, index 1 = mate 2; see nextRead())
	EList seedExRangeFw_[2];
	EList seedExRangeRc_[2];

	// Data structures encapsulating the diagonals that have already been used
	// to seed alignment for mate 1 and mate 2.
	EIvalMergeListBinned seenDiags1_;
	EIvalMergeListBinned seenDiags2_;

	// For weeding out redundant alignments
	RedundantAlns redAnchor_;  // database of cells used for anchor alignments
	RedundantAlns redMate1_;   // database of cells used for mate 1 alignments
	RedundantAlns redMate2_;   // database of cells used for mate 2 alignments

	// For holding results for anchor (res*_) and opposite (ores*_) mates
	SwResult resGap_;    // temp holder for alignment result
	SwResult oresGap_;   // temp holder for alignment result, opp mate
	SwResult resUngap_;  // temp holder for ungapped alignment result
	SwResult oresUngap_; // temp holder for ungap. aln. opp mate
	SwResult resEe_;     // temp holder for ungapped alignment result
	SwResult oresEe_;    // temp holder for ungap. aln. opp mate

	Pool pool_;              // memory pages backing salistEe_ (see eeSaTups)
	TSAList salistEe_;       // PList for offsets for end-to-end hits
	GroupWalkState gwstate_; // some per-thread state shared by all GroupWalks

	// For AlnRes::matchesRef (only present when assertions are compiled in):
	ASSERT_ONLY(SStringExpandable raw_refbuf_);
	ASSERT_ONLY(SStringExpandable raw_destU32_);
	ASSERT_ONLY(EList raw_matches_);
	ASSERT_ONLY(BTDnaString tmp_rf_);
	ASSERT_ONLY(BTDnaString tmp_rdseq_);
	ASSERT_ONLY(BTString tmp_qseq_);
	ASSERT_ONLY(EList tmp_reflens_);
	ASSERT_ONLY(EList tmp_refoffs_);
};

// Ad-hoc wall-clock timing helpers.  TIMER_START() declares locals and takes
// a first gettimeofday() reading; IF_TIMER_END() takes a second reading,
// computes the elapsed microseconds and expands to an `if` that guards the
// statement following it when more than 300000 usecs have elapsed.  Both
// must be used in the same scope because they share the declared locals.
#define TIMER_START() \
struct timeval tv_i, tv_f; \
struct timezone tz_i, tz_f; \
size_t total_usecs; \
gettimeofday(&tv_i, &tz_i)

#define IF_TIMER_END() \
gettimeofday(&tv_f, &tz_f); \
total_usecs = \
(tv_f.tv_sec - tv_i.tv_sec) * 1000000 + (tv_f.tv_usec - tv_i.tv_usec); \
if(total_usecs > 300000)

/*
 * aligner_sw_driver.cpp
 *
 * Routines that drive the alignment process given a collection of seed hits.
 * This is generally done in a few stages: extendSeeds visits the set of
 * seed-hit BW elements in some order; for each element visited it resolves its
 * reference offset; once the reference offset is known, bounds for a dynamic
 * programming subproblem are established; if these bounds are distinct from
 * the bounds we've already tried, we solve the dynamic programming subproblem
 * and report the hit; if the AlnSinkWrap indicates that we can stop, we
 * return, otherwise we continue on to the next BW element.
 */

/**
 * Given end-to-end alignment results stored in the SeedResults structure, set
 * up all of our state for resolving and keeping track of reference offsets for
 * hits.  Order the list of ranges to examine such that all exact end-to-end
 * alignments are examined before any 1mm end-to-end alignments.
 *
 * Note: there might be a lot of hits and a lot of wide ranges to look for
 * here.  We use 'maxelt' to cap the total number of SA elements queued: a
 * range that would push nelt_out past maxelt is trimmed to a randomly placed
 * sub-range of the right width, which may wrap around the end of the range
 * and therefore be split into two pieces.
 *
 * For every (piece of a) range kept, one entry is appended to each of
 * eehits_, satpos_, rands_ and gws_, and nelt_out grows by the piece's width.
 * nelt_out must be 0 on entry.
 *
 * Returns false if an offset slot could not be added to salistEe_ from pool_
 * (counted in swmSeed.exooms / swmSeed.mm1ooms); returns true otherwise.
 *
 * NOTE(review): template parameter/argument lists (`<...>`) appear to have
 * been stripped from this copy; code tokens are kept exactly as found.
 */
template bool SwDriver::eeSaTups(
	const Read& rd,              // read
	SeedResults& sh,             // seed hits to extend into full alignments
	const GFM& gfm,              // BWT
	const BitPairReference& ref, // Reference strings
	RandomSource& rnd,           // pseudo-random generator
	WalkMetrics& wlm,            // group walk left metrics
	SwMetrics& swmSeed,          // metrics for seed extensions
	index_t& nelt_out,           // out: # elements total
	index_t maxelt,              // max elts we'll consider
	bool all)                    // report all hits?
{
	assert_eq(0, nelt_out);
	gws_.clear();
	rands_.clear();
	satpos_.clear();
	eehits_.clear();
	// First, count up the total number of satpos_, rands_, eehits_, and gws_
	// we're going to use
	index_t nobj = 0;
	if(!sh.exactFwEEHit().empty()) nobj++;
	if(!sh.exactRcEEHit().empty()) nobj++;
	nobj += sh.mm1EEHits().size();
	nobj = min(nobj, maxelt);
	gws_.ensure(nobj);
	rands_.ensure(nobj);
	satpos_.ensure(nobj);
	eehits_.ensure(nobj);
	index_t tot = sh.exactFwEEHit().size() + sh.exactRcEEHit().size();
	bool succ = false;    // have we counted a success for the current category yet?
	bool firstEe = true;  // salistEe_/pool_ are cleared lazily, on first use
	bool done = false;    // set once nelt_out reaches maxelt
	if(tot > 0) {
		bool fwFirst = true;
		// Pick fw / rc to go first in a weighted random fashion: the chance
		// of rc going first is proportional to the size of the rc range
#ifdef BOWTIE_64BIT_INDEX
		index_t rn64 = rnd.nextU64();
		index_t rn = rn64 % (uint64_t)tot;
#else
		index_t rn32 = rnd.nextU32();
		index_t rn = rn32 % (uint32_t)tot;
#endif
		if(rn >= sh.exactFwEEHit().size()) {
			fwFirst = false;
		}
		for(int fwi = 0; fwi < 2 && !done; fwi++) {
			// First iteration handles the orientation chosen above, second
			// iteration handles the other one
			bool fw = ((fwi == 0) == fwFirst);
			EEHit hit = fw ? sh.exactFwEEHit() : sh.exactRcEEHit();
			if(hit.empty()) {
				continue;
			}
			assert(hit.fw == fw);
			if(hit.bot > hit.top) {
				// Possibly adjust bot and width if we would have exceeded maxelt
				index_t tops[2] = { hit.top, 0 };
				index_t bots[2] = { hit.bot, 0 };
				index_t width = hit.bot - hit.top;
				if(nelt_out + width > maxelt) {
					index_t trim = (index_t)((nelt_out + width) - maxelt);
					// Random starting row within the range (this 'rn' shadows
					// the outer one)
#ifdef BOWTIE_64BIT_INDEX
					index_t rn = rnd.nextU64() % width;
#else
					index_t rn = rnd.nextU32() % width;
#endif
					index_t newwidth = width - trim;
					if(hit.top + rn + newwidth > hit.bot) {
						// Two pieces: [top+rn, bot) plus a wrapped-around
						// remainder starting at top
						tops[0] = hit.top + rn;
						bots[0] = hit.bot;
						tops[1] = hit.top;
						bots[1] = hit.top + newwidth - (bots[0] - tops[0]);
					} else {
						// One piece
						tops[0] = hit.top + rn;
						bots[0] = tops[0] + newwidth;
					}
					assert_leq(bots[0], hit.bot);
					assert_leq(bots[1], hit.bot);
					assert_geq(bots[0], tops[0]);
					assert_geq(bots[1], tops[1]);
					assert_eq(newwidth, (bots[0] - tops[0]) + (bots[1] - tops[1]));
				}
				for(int i = 0; i < 2 && !done; i++) {
					// An empty piece means there are no more pieces
					if(bots[i] <= tops[i]) break;
					// 'width' here shadows the full-range width above
					index_t width = bots[i] - tops[i];
					index_t top = tops[i];
					// Update metrics for exact end-to-end ranges
					swmSeed.exranges++;
					swmSeed.exrows += width;
					if(!succ) {
						swmSeed.exsucc++;
						succ = true;
					}
					// Clear list where resolved offsets are stored
					if(firstEe) {
						salistEe_.clear();
						pool_.clear();
						firstEe = false;
					}
					// We have to be careful not to allocate excessive amounts of memory here
					// Slice 'o' covers the 'width' slots appended just below
					TSlice o(salistEe_, (index_t)salistEe_.size(), width);
					for(index_t i = 0; i < width; i++) {
						// Each slot starts out as OFF_MASK
						if(!salistEe_.add(pool_, (index_t)OFF_MASK)) {
							swmSeed.exooms++;
							return false;
						}
					}
					assert(!done);
					eehits_.push_back(hit);
					satpos_.expand();
					satpos_.back().sat.init(SAKey(), top, (index_t)OFF_MASK, o);
					satpos_.back().sat.key.seq = MAX_U64;
					// End-to-end hit: key length and seed length are the
					// whole read, at read offset 0
					satpos_.back().sat.key.len = (index_t)rd.length();
					satpos_.back().pos.init(fw, 0, 0, (index_t)rd.length());
					satpos_.back().origSz = width;
					rands_.expand();
					rands_.back().init(width, all);
					gws_.expand();
					SARangeWithOffs sa;
					sa.topf = satpos_.back().sat.topf;
					sa.len = satpos_.back().sat.key.len;
					sa.offs = satpos_.back().sat.offs;
					gws_.back().init(
						gfm,  // forward Bowtie index
						ref,  // reference sequences
						sa,   // SATuple
						rnd,  // pseudo-random generator
						wlm); // metrics
					assert(gws_.back().repOk(sa));
					nelt_out += width;
					if(nelt_out >= maxelt) {
						done = true;
					}
				}
			}
		}
	}
	// Success flag is tracked separately for the 1-mismatch category
	succ = false;
	if(!done && !sh.mm1EEHits().empty()) {
		sh.sort1mmEe(rnd);
		index_t sz = sh.mm1EEHits().size();
		for(index_t i = 0; i < sz && !done; i++) {
			EEHit hit = sh.mm1EEHits()[i];
			assert(hit.repOk(rd));
			assert(!hit.empty());
			// Possibly adjust bot and width if we would have exceeded maxelt
			// (same trimming scheme as for the exact hits above)
			index_t tops[2] = { hit.top, 0 };
			index_t bots[2] = { hit.bot, 0 };
			index_t width = hit.bot - hit.top;
			if(nelt_out + width > maxelt) {
				index_t trim = (index_t)((nelt_out + width) - maxelt);
#ifdef BOWTIE_64BIT_INDEX
				index_t rn = rnd.nextU64() % width;
#else
				index_t rn = rnd.nextU32() % width;
#endif
				index_t newwidth = width - trim;
				if(hit.top + rn + newwidth > hit.bot) {
					// Two pieces
					tops[0] = hit.top + rn;
					bots[0] = hit.bot;
					tops[1] = hit.top;
					bots[1] = hit.top + newwidth - (bots[0] - tops[0]);
				} else {
					// One piece
					tops[0] = hit.top + rn;
					bots[0] = tops[0] + newwidth;
				}
				assert_leq(bots[0], hit.bot);
				assert_leq(bots[1], hit.bot);
				assert_geq(bots[0], tops[0]);
				assert_geq(bots[1], tops[1]);
				assert_eq(newwidth, (bots[0] - tops[0]) + (bots[1] - tops[1]));
			}
			// This 'i' shadows the hit index of the enclosing loop
			for(int i = 0; i < 2 && !done; i++) {
				if(bots[i] <= tops[i]) break;
				index_t width = bots[i] - tops[i];
				index_t top = tops[i];
				// Update metrics for 1-mismatch end-to-end ranges
				swmSeed.mm1ranges++;
				swmSeed.mm1rows += width;
				if(!succ) {
					swmSeed.mm1succ++;
					succ = true;
				}
				// Clear list where resolved offsets are stored
				if(firstEe) {
					salistEe_.clear();
					pool_.clear();
					firstEe = false;
				}
				TSlice o(salistEe_, (index_t)salistEe_.size(), width);
				for(size_t i = 0; i < width; i++) {
					if(!salistEe_.add(pool_, (index_t)OFF_MASK)) {
						swmSeed.mm1ooms++;
						return false;
					}
				}
				eehits_.push_back(hit);
				satpos_.expand();
				satpos_.back().sat.init(SAKey(), top, (index_t)OFF_MASK, o);
				satpos_.back().sat.key.seq = MAX_U64;
				satpos_.back().sat.key.len = (index_t)rd.length();
				satpos_.back().pos.init(hit.fw, 0, 0, (index_t)rd.length());
				satpos_.back().origSz = width;
				rands_.expand();
				rands_.back().init(width, all);
				gws_.expand();
				SARangeWithOffs sa;
				sa.topf = satpos_.back().sat.topf;
				sa.len = satpos_.back().sat.key.len;
				sa.offs = satpos_.back().sat.offs;
				gws_.back().init(
					gfm,  // forward Bowtie index
					ref,  // reference sequences
					sa,   // SATuple
					rnd,  // pseudo-random generator
					wlm); // metrics
				assert(gws_.back().repOk(sa));
				nelt_out += width;
				if(nelt_out >= maxelt) {
					done = true;
				}
			}
		}
	}
	return true;
}

/**
 * Extend a seed hit out on either side.  Requires that we know the seed hit's
 * offset into the read and orientation.  Also requires that we know top/bot
 * for the seed hit in both the forward and (if we want to extend to the right)
 * reverse index.
 *
 * The left extension walks the forward index (gfmFw) and adds to 'nlex'; the
 * right extension walks the backward index and adds to 'nrex', and is skipped
 * when gfmBw is NULL.  Both counters are incremented in place (the caller is
 * expected to have initialized them) and each walk stops once its counter
 * reaches 255.  A walk also stops when the BW range would shrink, when more
 * than one character could extend the range, or when the extending character
 * disagrees with the read character (read characters > 3 are not treated as
 * disagreements).
 *
 * NOTE(review): template parameter/argument lists (`<...>`) appear to have
 * been stripped from this copy, and part of each loop body below is missing
 * (see the notes in the loops).  Code tokens are kept exactly as found.
 */
template void SwDriver::extend(
	const Read& rd,       // read
	const GFM& gfmFw,     // Forward Bowtie index
	const GFM* gfmBw,     // Backward Bowtie index
	index_t topf,         // top in fw index
	index_t botf,         // bot in fw index
	index_t topb,         // top in bw index
	index_t botb,         // bot in bw index
	bool fw,              // seed orientation
	index_t off,          // seed offset from 5' end
	index_t len,          // seed length
	PerReadMetrics& prm,  // per-read metrics
	index_t& nlex,        // # positions we can extend to left w/o edit
	index_t& nrex)        // # positions we can extend to right w/o edit
{
	index_t t[4], b[4];   // per-character tops/bots in the index being walked
	index_t tp[4], bp[4]; // per-character tops/bots in the opposite index
	SideLocus tloc, bloc;
	index_t rdlen = (index_t)rd.length();
	// Number of read positions available to the left of the seed
	index_t lim = fw ? off : rdlen - len - off;
	// We're about to add onto the beginning, so reverse it
#ifndef NDEBUG
	if(false) {
		// TODO: This will sometimes fail even when the extension is legitimate
		// This is because contains() comes in from one extreme or the other,
		// whereas we started from the inside and worked outwards.  This
		// affects which Ns are OK and which are not OK.

		// Have to do both because whether we can get through an N depends on
		// which direction we're coming in
		bool fwContains = gfmFw.contains(tmp_rdseq_);
		tmp_rdseq_.reverse();
		bool bwContains = gfmBw != NULL && gfmBw->contains(tmp_rdseq_);
		tmp_rdseq_.reverse();
		assert(fwContains || bwContains);
	}
#endif
	ASSERT_ONLY(tmp_rdseq_.reverse());
	if(lim > 0) {
		const GFM *gfm = &gfmFw;
		assert(gfm != NULL);
		// Extend left using forward index
		const BTDnaString& seq = fw ? rd.patFw : rd.patRc;
		// See what we get by extending
		index_t top = topf, bot = botf;
		t[0] = t[1] = t[2] = t[3] = 0;
		b[0] = b[1] = b[2] = b[3] = 0;
		tp[0] = tp[1] = tp[2] = tp[3] = topb;
		bp[0] = bp[1] = bp[2] = bp[3] = botb;
		// These shadow the function-level tloc/bloc declared above
		SideLocus tloc, bloc;
		INIT_LOCS(top, bot, tloc, bloc, *gfm);
		for(index_t ii = 0; ii < lim; ii++) {
			// Starting to left of seed (
			// NOTE(review): source text appears to be missing at this point.
			// 'rdc' (the read character, presumably fetched from 'seq') is
			// used below but its declaration is not visible, and the `if`
			// that pairs with the `} else {` below (whose branch asserts
			// bot == top+1) is also absent.  Restore from upstream.
			mapBiLFEx(tloc, bloc, t, b, tp, bp);
			SANITY_CHECK_4TUP(t, b, tp, bp);
			int nonz = -1;       // the single character with a non-empty range
			bool abort = false;  // set if more than one character is possible
			size_t origSz = bot - top;
			for(int j = 0; j < 4; j++) {
				if(b[j] > t[j]) {
					if(nonz >= 0) {
						abort = true;
						break;
					}
					nonz = j;
					top = t[j]; bot = b[j];
				}
			}
			assert_leq(bot - top, origSz);
			// Stop if ambiguous, if the character differs from the read's,
			// or if the range got smaller
			if(abort || (nonz != rdc && rdc <= 3) || bot - top < origSz) {
				break;
			}
			} else {
				// Range has a single row: follow it with a plain LF step
				assert_eq(bot, top+1);
				prm.nSdFmops++;
				int c = gfm->mapLF1(top, tloc);
				if(c != rdc && rdc <= 3) {
					break;
				}
				bot = top + 1;
			}
			ASSERT_ONLY(tmp_rdseq_.append(rdc));
			// Cap the extension count at 255
			if(++nlex == 255) {
				break;
			}
			INIT_LOCS(top, bot, tloc, bloc, *gfm);
		}
	}
	// We're about to add onto the end, so re-reverse
	ASSERT_ONLY(tmp_rdseq_.reverse());
	// Number of read positions available to the right of the seed
	lim = fw ? rdlen - len - off : off;
	if(lim > 0 && gfmBw != NULL) {
		const GFM *gfm = gfmBw;
		assert(gfm != NULL);
		// Extend right using backward index
		const BTDnaString& seq = fw ? rd.patFw : rd.patRc;
		// See what we get by extending
		index_t top = topb, bot = botb;
		t[0] = t[1] = t[2] = t[3] = 0;
		b[0] = b[1] = b[2] = b[3] = 0;
		tp[0] = tp[1] = tp[2] = tp[3] = topf;
		bp[0] = bp[1] = bp[2] = bp[3] = botf;
		INIT_LOCS(top, bot, tloc, bloc, *gfm);
		for(index_t ii = 0; ii < lim; ii++) {
			// Starting to right of seed (
			// NOTE(review): source text appears to be missing at this point,
			// exactly as in the left-extension loop above ('rdc' declaration
			// and the `if` matching the `} else {` below).  Restore from
			// upstream.
			mapBiLFEx(tloc, bloc, t, b, tp, bp);
			SANITY_CHECK_4TUP(t, b, tp, bp);
			int nonz = -1;
			bool abort = false;
			size_t origSz = bot - top;
			for(int j = 0; j < 4; j++) {
				if(b[j] > t[j]) {
					if(nonz >= 0) {
						abort = true;
						break;
					}
					nonz = j;
					top = t[j]; bot = b[j];
				}
			}
			assert_leq(bot - top, origSz);
			if(abort || (nonz != rdc && rdc <= 3) || bot - top < origSz) {
				break;
			}
			} else {
				assert_eq(bot, top+1);
				prm.nSdFmops++;
				int c = gfm->mapLF1(top, tloc);
				if(c != rdc && rdc <= 3) {
					break;
				}
				bot = top + 1;
			}
			ASSERT_ONLY(tmp_rdseq_.append(rdc));
			// Cap the extension count at 255
			if(++nrex == 255) {
				break;
			}
			INIT_LOCS(top, bot, tloc, bloc, *gfm);
		}
	}
#ifndef NDEBUG
	if(false) {
		// TODO: This will sometimes fail even when the extension is legitimate
		// This is because contains() comes in from one extreme or the other,
		// whereas we started from the inside and worked outwards.  This
		// affects which Ns are OK and which are not OK.

		// Have to do both because whether we can get through an N depends on
		// which direction we're coming in
		bool fwContains = gfmFw.contains(tmp_rdseq_);
		tmp_rdseq_.reverse();
		bool bwContains = gfmBw != NULL && gfmBw->contains(tmp_rdseq_);
		tmp_rdseq_.reverse();
		assert(fwContains || bwContains);
	}
#endif
	assert_lt(nlex, rdlen);
	assert_lt(nrex, rdlen);
	return;
}

/**
 * Given seed results, set up all of our state for resolving and keeping
 * track of reference offsets for hits.
*/ template void SwDriver::prioritizeSATups( const Read& read, // read SeedResults& sh, // seed hits to extend into full alignments const GFM& gfmFw, // BWT const GFM* gfmBw, // BWT const BitPairReference& ref, // Reference strings int seedmms, // # mismatches allowed in seed index_t maxelt, // max elts we'll consider bool doExtend, // do extension of seed hits? bool lensq, // square length in weight calculation bool szsq, // square range size in weight calculation index_t nsm, // if range as <= nsm elts, it's "small" AlignmentCacheIface& ca, // alignment cache for seed hits RandomSource& rnd, // pseudo-random generator WalkMetrics& wlm, // group walk left metrics PerReadMetrics& prm, // per-read metrics index_t& nelt_out, // out: # elements total bool all) // report all hits? { const index_t nonz = sh.nonzeroOffsets(); // non-zero positions const int matei = (read.mate <= 1 ? 0 : 1); satups_.clear(); gws_.clear(); rands_.clear(); rands2_.clear(); satpos_.clear(); satpos2_.clear(); index_t nrange = 0, nelt = 0, nsmall = 0, nsmall_elts = 0; bool keepWhole = false; EList, 16>& satpos = keepWhole ? satpos_ : satpos2_; for(index_t i = 0; i < nonz; i++) { bool fw = true; index_t offidx = 0, rdoff = 0, seedlen = 0; QVal qv = sh.hitsByRank(i, offidx, rdoff, fw, seedlen); assert(qv.valid()); assert(!qv.empty()); assert(qv.repOk(ca.current())); ca.queryQval(qv, satups_, nrange, nelt); for(size_t j = 0; j < satups_.size(); j++) { const index_t sz = satups_[j].size(); // Check whether this hit occurs inside the extended boundaries of // another hit we already processed for this read. if(seedmms == 0) { // See if we're covered by a previous extended seed hit EList& range = fw ? 
seedExRangeFw_[matei] : seedExRangeRc_[matei]; bool skip = false; for(index_t k = 0; k < range.size(); k++) { index_t p5 = range[k].off; index_t len = range[k].len; if(p5 <= rdoff && p5 + len >= (rdoff + seedlen)) { if(sz <= range[k].sz) { skip = true; break; } } } if(skip) { assert_gt(nrange, 0); nrange--; assert_geq(nelt, sz); nelt -= sz; continue; // Skip this seed } } satpos.expand(); satpos.back().sat = satups_[j]; satpos.back().origSz = sz; satpos.back().pos.init(fw, offidx, rdoff, seedlen); if(sz <= nsm) { nsmall++; nsmall_elts += sz; } satpos.back().nlex = satpos.back().nrex = 0; #ifndef NDEBUG tmp_rdseq_.clear(); uint64_t key = satpos.back().sat.key.seq; for(size_t k = 0; k < seedlen; k++) { int c = (int)(key & 3); tmp_rdseq_.append(c); key >>= 2; } tmp_rdseq_.reverse(); #endif index_t nlex = 0, nrex = 0; if(doExtend) { extend( read, gfmFw, gfmBw, satpos.back().sat.topf, (index_t)(satpos.back().sat.topf + sz), satpos.back().sat.topb, (index_t)(satpos.back().sat.topb + sz), fw, rdoff, seedlen, prm, nlex, nrex); } satpos.back().nlex = nlex; satpos.back().nrex = nrex; if(seedmms == 0 && (nlex > 0 || nrex > 0)) { assert_geq(rdoff, (fw ? nlex : nrex)); index_t p5 = rdoff - (fw ? nlex : nrex); EList& range = fw ? 
seedExRangeFw_[matei] : seedExRangeRc_[matei]; range.expand(); range.back().off = p5; range.back().len = seedlen + nlex + nrex; range.back().sz = sz; } } satups_.clear(); } assert_leq(nsmall, nrange); nelt_out = nelt; // return the total number of elements assert_eq(nrange, satpos.size()); satpos.sort(); if(keepWhole) { gws_.ensure(nrange); rands_.ensure(nrange); for(index_t i = 0; i < nrange; i++) { gws_.expand(); SARangeWithOffs sa; sa.topf = satpos_.back().sat.topf; sa.len = satpos_.back().sat.key.len; sa.offs = satpos_.back().sat.offs; gws_.back().init( gfmFw, // forward Bowtie index ref, // reference sequences sa, // SA tuples: ref hit, salist range rnd, // pseudo-random generator wlm); // metrics assert(gws_.back().initialized()); rands_.expand(); rands_.back().init(satpos_[i].sat.size(), all); } return; } // Resize satups_ list so that ranges having elements that we might // possibly explore are present satpos_.ensure(min(maxelt, nelt)); gws_.ensure(min(maxelt, nelt)); rands_.ensure(min(maxelt, nelt)); rands2_.ensure(min(maxelt, nelt)); size_t nlarge_elts = nelt - nsmall_elts; if(maxelt < nelt) { size_t diff = nelt - maxelt; if(diff >= nlarge_elts) { nlarge_elts = 0; } else { nlarge_elts -= diff; } } index_t nelt_added = 0; // Now we have a collection of ranges in satpos2_. Now we want to decide // how we explore elements from them. The basic idea is that: for very // small guys, where "very small" means that the size of the range is less // than or equal to the parameter 'nsz', we explore them in their entirety // right away. For the rest, we want to select in a way that is (a) // random, and (b) weighted toward examining elements from the smaller // ranges more frequently (and first). // // 1. 
do the smalls for(index_t j = 0; j < nsmall && nelt_added < maxelt; j++) { satpos_.expand(); satpos_.back() = satpos2_[j]; gws_.expand(); SARangeWithOffs sa; sa.topf = satpos_.back().sat.topf; sa.len = satpos_.back().sat.key.len; sa.offs = satpos_.back().sat.offs; gws_.back().init( gfmFw, // forward Bowtie index ref, // reference sequences sa, // SA tuples: ref hit, salist range rnd, // pseudo-random generator wlm); // metrics assert(gws_.back().initialized()); rands_.expand(); rands_.back().init(satpos_.back().sat.size(), all); nelt_added += satpos_.back().sat.size(); #ifndef NDEBUG for(size_t k = 0; k < satpos_.size()-1; k++) { assert(!(satpos_[k] == satpos_.back())); } #endif } if(nelt_added >= maxelt || nsmall == satpos2_.size()) { nelt_out = nelt_added; return; } // 2. do the non-smalls // Initialize the row sampler rowsamp_.init(satpos2_, nsmall, satpos2_.size(), lensq, szsq); // Initialize the random choosers rands2_.resize(satpos2_.size()); for(index_t j = 0; j < satpos2_.size(); j++) { rands2_[j].reset(); } while(nelt_added < maxelt && nelt_added < nelt) { // Pick a non-small range to sample from index_t ri = rowsamp_.next(rnd) + nsmall; assert_geq(ri, nsmall); assert_lt(ri, satpos2_.size()); // Initialize random element chooser for that range if(!rands2_[ri].inited()) { rands2_[ri].init(satpos2_[ri].sat.size(), all); assert(!rands2_[ri].done()); } assert(!rands2_[ri].done()); // Choose an element from the range uint32_t r = rands2_[ri].next(rnd); if(rands2_[ri].done()) { // Tell the row sampler this range is done rowsamp_.finishedRange(ri - nsmall); } // Add the element to the satpos_ list SATuple sat; TSlice o; o.init(satpos2_[ri].sat.offs, r, r+1); sat.init(satpos2_[ri].sat.key, (index_t)(satpos2_[ri].sat.topf + r), (index_t)OFF_MASK, o); satpos_.expand(); satpos_.back().sat = sat; satpos_.back().origSz = satpos2_[ri].origSz; satpos_.back().pos = satpos2_[ri].pos; // Initialize GroupWalk object gws_.expand(); SARangeWithOffs sa; sa.topf = sat.topf; 
sa.len = sat.key.len; sa.offs = sat.offs; gws_.back().init( gfmFw, // forward Bowtie index ref, // reference sequences sa, // SA tuples: ref hit, salist range rnd, // pseudo-random generator wlm); // metrics assert(gws_.back().initialized()); // Initialize random selector rands_.expand(); rands_.back().init(1, all); nelt_added++; } nelt_out = nelt_added; return; } enum { FOUND_NONE = 0, FOUND_EE, FOUND_UNGAPPED, }; /** * Given a collection of SeedHits for a single read, extend seed alignments * into full alignments. Where possible, try to avoid redundant offset lookups * and dynamic programming wherever possible. Optionally report alignments to * a AlnSinkWrap object as they are discovered. * * If 'reportImmediately' is true, returns true iff a call to msink->report() * returned true (indicating that the reporting policy is satisfied and we can * stop). Otherwise, returns false. */ template int SwDriver::extendSeeds( Read& rd, // read to align bool mate1, // true iff rd is mate #1 SeedResults& sh, // seed hits to extend into full alignments const GFM& gfmFw, // BWT const GFM* gfmBw, // BWT' const BitPairReference& ref, // Reference strings SwAligner& swa, // dynamic programming aligner const Scoring& sc, // scoring scheme int seedmms, // # mismatches allowed in seed int seedlen, // length of seed int seedival, // interval between seeds TAlScore& minsc, // minimum score for anchor int nceil, // maximum # Ns permitted in reference portion size_t maxhalf, // max width in either direction for DP tables bool doUngapped, // do ungapped alignment size_t maxIters, // stop after this many seed-extend loop iters size_t maxUg, // stop after this many ungaps size_t maxDp, // stop after this many dps size_t maxUgStreak, // stop after streak of this many ungap fails size_t maxDpStreak, // stop after streak of this many dp fails bool doExtend, // do seed extension bool enable8, // use 8-bit SSE where possible size_t cminlen, // use checkpointer if read longer than this size_t 
cpow2, // interval between diagonals to checkpoint bool doTri, // triangular mini-fills? int tighten, // -M score tightening mode AlignmentCacheIface& ca, // alignment cache for seed hits RandomSource& rnd, // pseudo-random source WalkMetrics& wlm, // group walk left metrics SwMetrics& swmSeed, // DP metrics for seed-extend PerReadMetrics& prm, // per-read metrics AlnSinkWrap* msink, // AlnSink wrapper for multiseed-style aligner bool reportImmediately, // whether to report hits immediately to msink bool& exhaustive) // set to true iff we searched all seeds exhaustively { bool all = msink->allHits(); // typedef std::pair UPair; assert(!reportImmediately || msink != NULL); assert(!reportImmediately || !msink->maxed()); assert_geq(nceil, 0); assert_leq((size_t)nceil, rd.length()); // Calculate the largest possible number of read and reference gaps const index_t rdlen = (index_t)rd.length(); TAlScore perfectScore = sc.perfectScore(rdlen); DynProgFramer dpframe(!gReportOverhangs); swa.reset(); // Initialize a set of GroupWalks, one for each seed. Also, intialize the // accompanying lists of reference seed hits (satups*) const index_t nsm = 5; const index_t nonz = sh.nonzeroOffsets(); // non-zero positions index_t eeHits = sh.numE2eHits(); bool eeMode = eeHits > 0; bool firstEe = true; bool firstExtend = true; // Reset all the counters related to streaks prm.nEeFail = 0; prm.nUgFail = 0; prm.nDpFail = 0; index_t nelt = 0, neltLeft = 0; index_t rows = rdlen; index_t eltsDone = 0; // cerr << "===" << endl; while(true) { if(eeMode) { if(firstEe) { firstEe = false; eeMode = eeSaTups( rd, // read sh, // seed hits to extend into full alignments gfmFw, // BWT ref, // Reference strings rnd, // pseudo-random generator wlm, // group walk left metrics swmSeed, // seed-extend metrics nelt, // out: # elements total maxIters, // max # to report all); // report all hits? 
assert_eq(gws_.size(), rands_.size()); assert_eq(gws_.size(), satpos_.size()); } else { eeMode = false; } } if(!eeMode) { if(nonz == 0) { return EXTEND_EXHAUSTED_CANDIDATES; // No seed hits! Bail. } if(minsc == perfectScore) { return EXTEND_PERFECT_SCORE; // Already found all perfect hits! } if(firstExtend) { nelt = 0; prioritizeSATups( rd, // read sh, // seed hits to extend into full alignments gfmFw, // BWT gfmBw, // BWT' ref, // Reference strings seedmms, // # seed mismatches allowed maxIters, // max rows to consider per position doExtend, // extend out seeds true, // square extended length true, // square SA range size nsm, // smallness threshold ca, // alignment cache for seed hits rnd, // pseudo-random generator wlm, // group walk left metrics prm, // per-read metrics nelt, // out: # elements total all); // report all hits? assert_eq(gws_.size(), rands_.size()); assert_eq(gws_.size(), satpos_.size()); neltLeft = nelt; firstExtend = false; } if(neltLeft == 0) { // Finished examining gapped candidates break; } } for(size_t i = 0; i < gws_.size(); i++) { if(eeMode && eehits_[i].score < minsc) { return EXTEND_PERFECT_SCORE; } bool is_small = satpos_[i].sat.size() < nsm; bool fw = satpos_[i].pos.fw; index_t rdoff = satpos_[i].pos.rdoff; index_t seedhitlen = satpos_[i].pos.seedlen; if(!fw) { // 'rdoff' and 'offidx' are with respect to the 5' end of // the read. Here we convert rdoff to be with respect to // the upstream (3') end of ther read. rdoff = (index_t)(rdlen - rdoff - seedhitlen); } bool first = true; // If the range is small, investigate all elements now. If the // range is large, just investigate one and move on - we might come // back to this range later. 
index_t riter = 0; while(!rands_[i].done() && (first || is_small || eeMode)) { assert(!gws_[i].done()); riter++; if(minsc == perfectScore) { if(!eeMode || eehits_[i].score < perfectScore) { return EXTEND_PERFECT_SCORE; } } else if(eeMode && eehits_[i].score < minsc) { break; } if(prm.nExDps >= maxDp || prm.nMateDps >= maxDp) { return EXTEND_EXCEEDED_HARD_LIMIT; } if(prm.nExUgs >= maxUg || prm.nMateUgs >= maxUg) { return EXTEND_EXCEEDED_HARD_LIMIT; } if(prm.nExIters >= maxIters) { return EXTEND_EXCEEDED_HARD_LIMIT; } prm.nExIters++; first = false; // Resolve next element offset WalkResult wr; uint32_t elt = rands_[i].next(rnd); //cerr << "elt=" << elt << endl; SARangeWithOffs sa; sa.topf = satpos_[i].sat.topf; sa.len = satpos_[i].sat.key.len; sa.offs = satpos_[i].sat.offs; gws_[i].advanceElement((index_t)elt, gfmFw, ref, sa, gwstate_, wr, wlm, prm); eltsDone++; if(!eeMode) { assert_gt(neltLeft, 0); neltLeft--; } assert_neq((index_t)OFF_MASK, wr.toff); index_t tidx = 0, toff = 0, tlen = 0; bool straddled = false; gfmFw.joinedToTextOff( wr.elt.len, wr.toff, tidx, toff, tlen, eeMode, // reject straddlers? straddled); // did it straddle? 
if(tidx == (index_t)OFF_MASK) { // The seed hit straddled a reference boundary so the seed hit // isn't valid continue; } #ifndef NDEBUG if(!eeMode && !straddled) { // Check that seed hit matches reference uint64_t key = satpos_[i].sat.key.seq; for(index_t k = 0; k < wr.elt.len; k++) { int c = ref.getBase(tidx, toff + wr.elt.len - k - 1); assert_leq(c, 3); int ck = (int)(key & 3); key >>= 2; assert_eq(c, ck); } } #endif // Find offset of alignment's upstream base assuming net gaps=0 // between beginning of read and beginning of seed hit int64_t refoff = (int64_t)toff - rdoff; // Coordinate of the seed hit w/r/t the pasted reference string Coord refcoord(tidx, refoff, fw); if(seenDiags1_.locusPresent(refcoord)) { // Already handled alignments seeded on this diagonal prm.nRedundants++; swmSeed.rshit++; continue; } // Now that we have a seed hit, there are many issues to solve // before we have a completely framed dynamic programming problem. // They include: // // 1. Setting reference offsets on either side of the seed hit, // accounting for where the seed occurs in the read // 2. Adjusting the width of the banded dynamic programming problem // and adjusting reference bounds to allow for gaps in the // alignment // 3. Accounting for the edges of the reference, which can impact // the width of the DP problem and reference bounds. // 4. Perhaps filtering the problem down to a smaller problem based // on what DPs we've already solved for this read // // We do #1 here, since it is simple and we have all the seed-hit // information here. #2 and #3 are handled in the DynProgFramer. 
int readGaps = 0, refGaps = 0; bool ungapped = false; if(!eeMode) { readGaps = sc.maxReadGaps(minsc, rdlen); refGaps = sc.maxRefGaps(minsc, rdlen); ungapped = (readGaps == 0 && refGaps == 0); } int state = FOUND_NONE; bool found = false; if(eeMode) { resEe_.reset(); resEe_.alres.reset(); const EEHit& h = eehits_[i]; assert_leq(h.score, perfectScore); resEe_.alres.setScore(AlnScore(h.score, h.ns(), 0)); resEe_.alres.setShape( refcoord.ref(), // ref id refcoord.off(), // 0-based ref offset tlen, // length of reference fw, // aligned to Watson? rdlen, // read length true, // pretrim soft? 0, // pretrim 5' end 0, // pretrim 3' end true, // alignment trim soft? 0, // alignment trim 5' end 0); // alignment trim 3' end resEe_.alres.setRefNs(h.refns()); if(h.mms() > 0) { assert_eq(1, h.mms()); assert_lt(h.e1.pos, rd.length()); resEe_.alres.ned().push_back(h.e1); } assert(resEe_.repOk(rd)); state = FOUND_EE; found = true; Interval refival(refcoord, 1); seenDiags1_.add(refival); } else if(doUngapped && ungapped) { resUngap_.reset(); int al = swa.ungappedAlign( fw ? rd.patFw : rd.patRc, fw ? 
rd.qual : rd.qualRev, refcoord, ref, tlen, sc, gReportOverhangs, minsc, resUngap_); Interval refival(refcoord, 1); seenDiags1_.add(refival); prm.nExUgs++; if(al == 0) { prm.nExUgFails++; prm.nUgFail++; if(prm.nUgFail >= maxUgStreak) { return EXTEND_EXCEEDED_SOFT_LIMIT; } swmSeed.ungapfail++; continue; } else if(al == -1) { prm.nExUgFails++; prm.nUgFail++; // count this as failure if(prm.nUgFail >= maxUgStreak) { return EXTEND_EXCEEDED_SOFT_LIMIT; } swmSeed.ungapnodec++; } else { prm.nExUgSuccs++; prm.nUgLastSucc = prm.nExUgs-1; if(prm.nUgFail > prm.nUgFailStreak) { prm.nUgFailStreak = prm.nUgFail; } prm.nUgFail = 0; found = true; state = FOUND_UNGAPPED; swmSeed.ungapsucc++; } } int64_t pastedRefoff = (int64_t)wr.toff - rdoff; DPRect rect; if(state == FOUND_NONE) { found = dpframe.frameSeedExtensionRect( refoff, // ref offset implied by seed hit assuming no gaps rows, // length of read sequence used in DP table tlen, // length of reference readGaps, // max # of read gaps permitted in opp mate alignment refGaps, // max # of ref gaps permitted in opp mate alignment (size_t)nceil, // # Ns permitted maxhalf, // max width in either direction rect); // DP rectangle assert(rect.repOk()); // Add the seed diagonal at least seenDiags1_.add(Interval(refcoord, 1)); if(!found) { continue; } } int64_t leftShift = refoff - rect.refl; size_t nwindow = 0; if(toff >= rect.refl) { nwindow = (size_t)(toff - rect.refl); } // NOTE: We might be taking off more than we should because the // pasted string omits non-A/C/G/T characters, but we included them // when calculating leftShift. We'll account for this later. 
pastedRefoff -= leftShift; size_t nsInLeftShift = 0; if(state == FOUND_NONE) { if(!swa.initedRead()) { // Initialize the aligner with a new read swa.initRead( rd.patFw, // fw version of query rd.patRc, // rc version of query rd.qual, // fw version of qualities rd.qualRev,// rc version of qualities 0, // off of first char in 'rd' to consider rdlen, // off of last char (excl) in 'rd' to consider sc); // scoring scheme } swa.initRef( fw, // whether to align forward or revcomp read tidx, // reference aligned against rect, // DP rectangle ref, // Reference strings tlen, // length of reference sequence sc, // scoring scheme minsc, // minimum score permitted enable8, // use 8-bit SSE if possible? cminlen, // minimum length for using checkpointing scheme cpow2, // interval b/t checkpointed diags; 1 << this doTri, // triangular mini-fills? true, // this is a seed extension - not finding a mate nwindow, nsInLeftShift); // Because of how we framed the problem, we can say that we've // exhaustively scored the seed diagonal as well as maxgaps // diagonals on either side Interval refival(tidx, 0, fw, 0); rect.initIval(refival); seenDiags1_.add(refival); // Now fill the dynamic programming matrix and return true iff // there is at least one valid alignment TAlScore bestCell = std::numeric_limits::min(); found = swa.align(rnd, bestCell); swmSeed.tallyGappedDp(readGaps, refGaps); prm.nExDps++; if(!found) { prm.nExDpFails++; prm.nDpFail++; if(prm.nDpFail >= maxDpStreak) { return EXTEND_EXCEEDED_SOFT_LIMIT; } if(bestCell > std::numeric_limits::min() && bestCell > prm.bestLtMinscMate1) { prm.bestLtMinscMate1 = bestCell; } continue; // Look for more anchor alignments } else { prm.nExDpSuccs++; prm.nDpLastSucc = prm.nExDps-1; if(prm.nDpFail > prm.nDpFailStreak) { prm.nDpFailStreak = prm.nDpFail; } prm.nDpFail = 0; } } bool firstInner = true; while(true) { assert(found); SwResult *res = NULL; if(state == FOUND_EE) { if(!firstInner) { break; } res = &resEe_; } else if(state == 
FOUND_UNGAPPED) { if(!firstInner) { break; } res = &resUngap_; } else { resGap_.reset(); assert(resGap_.empty()); if(swa.done()) { break; } swa.nextAlignment(resGap_, minsc, rnd); found = !resGap_.empty(); if(!found) { break; } res = &resGap_; } assert(res != NULL); firstInner = false; assert(res->alres.matchesRef( rd, ref, tmp_rf_, tmp_rdseq_, tmp_qseq_, raw_refbuf_, raw_destU32_, raw_matches_, tmp_reflens_, tmp_refoffs_)); Interval refival(tidx, 0, fw, tlen); assert_gt(res->alres.refExtent(), 0); if(gReportOverhangs && !refival.containsIgnoreOrient(res->alres.refival())) { res->alres.clipOutside(true, 0, tlen); if(res->alres.refExtent() == 0) { continue; } } assert(gReportOverhangs || refival.containsIgnoreOrient(res->alres.refival())); // Did the alignment fall entirely outside the reference? if(!refival.overlapsIgnoreOrient(res->alres.refival())) { continue; } // Is this alignment redundant with one we've seen previously? if(redAnchor_.overlap(res->alres)) { // Redundant with an alignment we found already continue; } redAnchor_.add(res->alres); // Annotate the AlnRes object with some key parameters // that were used to obtain the alignment. res->alres.setParams( seedmms, // # mismatches allowed in seed seedlen, // length of seed seedival, // interval between seeds minsc); // minimum score for valid alignment if(reportImmediately) { assert(msink != NULL); assert(res->repOk()); // Check that alignment accurately reflects the // reference characters aligned to assert(res->alres.matchesRef( rd, ref, tmp_rf_, tmp_rdseq_, tmp_qseq_, raw_refbuf_, raw_destU32_, raw_matches_, tmp_reflens_, tmp_refoffs_)); // Report an unpaired alignment assert(!msink->maxed()); if(msink->report( 0, mate1 ? &res->alres : NULL, mate1 ? NULL : &res->alres)) { // Short-circuited because a limit, e.g. 
-k, -m or // -M, was exceeded return EXTEND_POLICY_FULFILLED; } if(tighten > 0 && msink->Mmode() && msink->hasSecondBestUnp1()) { if(tighten == 1) { if(msink->bestUnp1() >= minsc) { minsc = msink->bestUnp1(); if(minsc < perfectScore && msink->bestUnp1() == msink->secondBestUnp1()) { minsc++; } } } else if(tighten == 2) { if(msink->secondBestUnp1() >= minsc) { minsc = msink->secondBestUnp1(); if(minsc < perfectScore) { minsc++; } } } else { TAlScore diff = msink->bestUnp1() - msink->secondBestUnp1(); TAlScore bot = msink->secondBestUnp1() + ((diff*3)/4); if(bot >= minsc) { minsc = bot; if(minsc < perfectScore) { minsc++; } } } assert_leq(minsc, perfectScore); } } } // At this point we know that we aren't bailing, and will // continue to resolve seed hits. } // while(!gws_[i].done()) } } // Short-circuited because a limit, e.g. -k, -m or -M, was exceeded return EXTEND_EXHAUSTED_CANDIDATES; } /** * Given a collection of SeedHits for both mates in a read pair, extend seed * alignments into full alignments and then look for the opposite mate using * dynamic programming. Where possible, try to avoid redundant offset lookups. * Optionally report alignments to a AlnSinkWrap object as they are discovered. * * If 'reportImmediately' is true, returns true iff a call to * msink->report() returned true (indicating that the reporting * policy is satisfied and we can stop). Otherwise, returns false. * * REDUNDANT SEED HITS * * See notes at top of aligner_sw_driver.h. * * REDUNDANT ALIGNMENTS * * See notes at top of aligner_sw_driver.h. * * MIXING PAIRED AND UNPAIRED ALIGNMENTS * * There are distinct paired-end alignment modes for the cases where (a) the * user does or does not want to see unpaired alignments for individual mates * when there are no reportable paired-end alignments involving both mates, and * (b) the user does or does not want to see discordant paired-end alignments. 
* The modes have implications for this function and for the AlnSinkWrap, since * they affect when we're "done." Also, whether the user has asked us to report * discordant alignments affects whether and how much searching for unpaired * alignments we must do (i.e. if there are no paired-end alignments, we must * at least do -m 1 for both mates). * * Mode 1: Just concordant paired-end. Print only concordant paired-end * alignments. As soon as any limits (-k/-m/-M) are reached, stop. * * Mode 2: Concordant and discordant paired-end. If -k/-m/-M limits are * reached for paired-end alignments, stop. Otherwise, if no paired-end * alignments are found, align both mates in an unpaired -m 1 fashion. If * there is exactly one unpaired alignment for each mate, report the * combination as a discordant alignment. * * Mode 3: Concordant paired-end if possible, otherwise unpaired. If -k/-M * limit is reached for paired-end alignments, stop. If -m limit is reached for * paired-end alignments or no paired-end alignments are found, align both * mates in an unpaired fashion. All the same settings governing validity and * reportability in paired-end mode apply here too (-k/-m/-M/etc). * * Mode 4: Concordant or discordant paired-end if possible, otherwise unpaired. * If -k/-M limit is reached for paired-end alignments, stop. If -m limit is * reached for paired-end alignments or no paired-end alignments are found, * align both mates in an unpaired fashion. If the -m limit was reached, there * is no need to search for a discordant alignment, and unpaired alignment can * proceed as in Mode 3. If no paired-end alignments were found, then unpaired * alignment proceeds as in Mode 3 but with this caveat: alignment must be at * least as thorough as dictated by -m 1 up until the point where * * Print paired-end alignments when there are reportable paired-end * alignments, otherwise report reportable unpaired alignments. If -k limit is * reached for paired-end alignments, stop.
If -m/-M limit is reached for * paired-end alignments, stop searching for paired-end alignments and look * only for unpaired alignments. If searching only for unpaired alignments, * respect -k/-m/-M limits separately for both mates. * * The return value from the AlnSinkWrap's report member function must be * specific enough to distinguish between: * * 1. Stop searching for paired-end alignments * 2. Stop searching for alignments for unpaired alignments for mate #1 * 3. Stop searching for alignments for unpaired alignments for mate #2 * 4. Stop searching for any alignments * * Note that in Mode 2, options affecting validity and reportability of * alignments apply . E.g. if -m 1 is specified * * WORKFLOW * * Our general approach to finding paired and unpaired alignments here * is as follows: * * - For mate in mate1, mate2: * - For each seed hit in mate: * - Try to extend it into a full alignment; if we can't, continue * to the next seed hit * - Look for alignment for opposite mate; if we can't find one, * - * - * */ template int SwDriver::extendSeedsPaired( Read& rd, // mate to align as anchor Read& ord, // mate to align as opposite bool anchor1, // true iff anchor mate is mate1 bool oppFilt, // true iff opposite mate was filtered out SeedResults& sh, // seed hits for anchor const GFM& gfmFw, // BWT const GFM* gfmBw, // BWT' const BitPairReference& ref, // Reference strings SwAligner& swa, // dynamic programming aligner for anchor SwAligner& oswa, // dynamic programming aligner for opposite const Scoring& sc, // scoring scheme const PairedEndPolicy& pepol,// paired-end policy int seedmms, // # mismatches allowed in seed int seedlen, // length of seed int seedival, // interval between seeds TAlScore& minsc, // minimum score for valid anchor aln TAlScore& ominsc, // minimum score for valid opposite aln int nceil, // max # Ns permitted in ref for anchor int onceil, // max # Ns permitted in ref for opposite bool nofw, // don't align forward read bool norc, // don't align 
revcomp read size_t maxhalf, // max width in either direction for DP tables bool doUngapped, // do ungapped alignment size_t maxIters, // stop after this many seed-extend loop iters size_t maxUg, // stop after this many ungaps size_t maxDp, // stop after this many dps size_t maxEeStreak, // stop after streak of this many end-to-end fails size_t maxUgStreak, // stop after streak of this many ungap fails size_t maxDpStreak, // stop after streak of this many dp fails size_t maxMateStreak, // stop seed range after N mate-find fails bool doExtend, // do seed extension bool enable8, // use 8-bit SSE where possible size_t cminlen, // use checkpointer if read longer than this size_t cpow2, // interval between diagonals to checkpoint bool doTri, // triangular mini-fills? int tighten, // -M score tightening mode AlignmentCacheIface& ca, // alignment cache for seed hits RandomSource& rnd, // pseudo-random source WalkMetrics& wlm, // group walk left metrics SwMetrics& swmSeed, // DP metrics for seed-extend SwMetrics& swmMate, // DP metrics for mate finidng PerReadMetrics& prm, // per-read metrics AlnSinkWrap* msink, // AlnSink wrapper for multiseed-style aligner bool swMateImmediately, // whether to look for mate immediately bool reportImmediately, // whether to report hits immediately to msink bool discord, // look for discordant alignments? bool mixed, // look for unpaired as well as paired alns? 
bool& exhaustive) { bool all = msink->allHits(); // typedef std::pair U32Pair; assert(!reportImmediately || msink != NULL); assert(!reportImmediately || !msink->maxed()); assert(!msink->state().doneWithMate(anchor1)); assert_geq(nceil, 0); assert_geq(onceil, 0); assert_leq((size_t)nceil, rd.length()); assert_leq((size_t)onceil, ord.length()); const index_t rdlen = rd.length(); const index_t ordlen = ord.length(); const TAlScore perfectScore = sc.perfectScore(rdlen); const TAlScore operfectScore = sc.perfectScore(ordlen); assert_leq(minsc, perfectScore); assert(oppFilt || ominsc <= operfectScore); TAlScore bestPairScore = perfectScore + operfectScore; if(tighten > 0 && msink->Mmode() && msink->hasSecondBestPair()) { // Paired-end alignments should have at least this score from now TAlScore ps; if(tighten == 1) { ps = msink->bestPair(); } else if(tighten == 2) { ps = msink->secondBestPair(); } else { TAlScore diff = msink->bestPair() - msink->secondBestPair(); ps = msink->secondBestPair() + (diff * 3)/4; } if(tighten == 1 && ps < bestPairScore && msink->bestPair() == msink->secondBestPair()) { ps++; } if(tighten >= 2 && ps < bestPairScore) { ps++; } // Anchor mate must have score at least 'ps' minus the best possible // score for the opposite mate. TAlScore nc = ps - operfectScore; if(nc > minsc) { minsc = nc; } assert_leq(minsc, perfectScore); } DynProgFramer dpframe(!gReportOverhangs); swa.reset(); oswa.reset(); // Initialize a set of GroupWalks, one for each seed. 
Also, intialize the // accompanying lists of reference seed hits (satups*) const index_t nsm = 5; const index_t nonz = sh.nonzeroOffsets(); // non-zero positions index_t eeHits = sh.numE2eHits(); bool eeMode = eeHits > 0; bool firstEe = true; bool firstExtend = true; // Reset all the counters related to streaks prm.nEeFail = 0; prm.nUgFail = 0; prm.nDpFail = 0; index_t nelt = 0, neltLeft = 0; const index_t rows = rdlen; const index_t orows = ordlen; index_t eltsDone = 0; while(true) { if(eeMode) { if(firstEe) { firstEe = false; eeMode = eeSaTups( rd, // read sh, // seed hits to extend into full alignments gfmFw, // BWT ref, // Reference strings rnd, // pseudo-random generator wlm, // group walk left metrics swmSeed, // seed-extend metrics nelt, // out: # elements total maxIters, // max elts to report all); // report all hits assert_eq(gws_.size(), rands_.size()); assert_eq(gws_.size(), satpos_.size()); neltLeft = nelt; // Initialize list that contains the mate-finding failure // streak for each range mateStreaks_.resize(gws_.size()); mateStreaks_.fill(0); } else { eeMode = false; } } if(!eeMode) { if(nonz == 0) { // No seed hits! Bail. return EXTEND_EXHAUSTED_CANDIDATES; } if(msink->Mmode() && minsc == perfectScore) { // Already found all perfect hits! return EXTEND_PERFECT_SCORE; } if(firstExtend) { nelt = 0; prioritizeSATups( rd, // read sh, // seed hits to extend into full alignments gfmFw, // BWT gfmBw, // BWT' ref, // Reference strings seedmms, // # seed mismatches allowed maxIters, // max rows to consider per position doExtend, // extend out seeds true, // square extended length true, // square SA range size nsm, // smallness threshold ca, // alignment cache for seed hits rnd, // pseudo-random generator wlm, // group walk left metrics prm, // per-read metrics nelt, // out: # elements total all); // report all hits? 
assert_eq(gws_.size(), rands_.size()); assert_eq(gws_.size(), satpos_.size()); neltLeft = nelt; firstExtend = false; mateStreaks_.resize(gws_.size()); mateStreaks_.fill(0); } if(neltLeft == 0) { // Finished examining gapped candidates break; } } for(index_t i = 0; i < gws_.size(); i++) { if(eeMode && eehits_[i].score < minsc) { return EXTEND_PERFECT_SCORE; } bool is_small = satpos_[i].sat.size() < nsm; bool fw = satpos_[i].pos.fw; index_t rdoff = satpos_[i].pos.rdoff; index_t seedhitlen = satpos_[i].pos.seedlen; if(!fw) { // 'rdoff' and 'offidx' are with respect to the 5' end of // the read. Here we convert rdoff to be with respect to // the upstream (3') end of ther read. rdoff = (index_t)(rdlen - rdoff - seedhitlen); } bool first = true; // If the range is small, investigate all elements now. If the // range is large, just investigate one and move on - we might come // back to this range later. while(!rands_[i].done() && (first || is_small || eeMode)) { if(minsc == perfectScore) { if(!eeMode || eehits_[i].score < perfectScore) { return EXTEND_PERFECT_SCORE; } } else if(eeMode && eehits_[i].score < minsc) { break; } if(prm.nExDps >= maxDp || prm.nMateDps >= maxDp) { return EXTEND_EXCEEDED_HARD_LIMIT; } if(prm.nExUgs >= maxUg || prm.nMateUgs >= maxUg) { return EXTEND_EXCEEDED_HARD_LIMIT; } if(prm.nExIters >= maxIters) { return EXTEND_EXCEEDED_HARD_LIMIT; } if(eeMode && prm.nEeFail >= maxEeStreak) { return EXTEND_EXCEEDED_SOFT_LIMIT; } if(!eeMode && prm.nDpFail >= maxDpStreak) { return EXTEND_EXCEEDED_SOFT_LIMIT; } if(!eeMode && prm.nUgFail >= maxUgStreak) { return EXTEND_EXCEEDED_SOFT_LIMIT; } if(mateStreaks_[i] >= maxMateStreak) { // Don't try this seed range anymore rands_[i].setDone(); assert(rands_[i].done()); break; } prm.nExIters++; first = false; assert(!gws_[i].done()); // Resolve next element offset WalkResult wr; uint32_t elt = rands_[i].next(rnd); SARangeWithOffs sa; sa.topf = satpos_[i].sat.topf; sa.len = satpos_[i].sat.key.len; sa.offs = 
satpos_[i].sat.offs; gws_[i].advanceElement((index_t)elt, gfmFw, ref, sa, gwstate_, wr, wlm, prm); eltsDone++; assert_gt(neltLeft, 0); neltLeft--; assert_neq((index_t)OFF_MASK, wr.toff); index_t tidx = 0, toff = 0, tlen = 0; bool straddled = false; gfmFw.joinedToTextOff( wr.elt.len, wr.toff, tidx, toff, tlen, eeMode, // reject straddlers? straddled); // straddled? if(tidx == (index_t)OFF_MASK) { // The seed hit straddled a reference boundary so the seed hit // isn't valid continue; } #ifndef NDEBUG if(!eeMode && !straddled) { // Check that seed hit matches reference uint64_t key = satpos_[i].sat.key.seq; for(index_t k = 0; k < wr.elt.len; k++) { int c = ref.getBase(tidx, toff + wr.elt.len - k - 1); assert_leq(c, 3); int ck = (int)(key & 3); key >>= 2; assert_eq(c, ck); } } #endif // Find offset of alignment's upstream base assuming net gaps=0 // between beginning of read and beginning of seed hit int64_t refoff = (int64_t)toff - rdoff; EIvalMergeListBinned& seenDiags = anchor1 ? seenDiags1_ : seenDiags2_; // Coordinate of the seed hit w/r/t the pasted reference string Coord refcoord(tidx, refoff, fw); if(seenDiags.locusPresent(refcoord)) { // Already handled alignments seeded on this diagonal prm.nRedundants++; swmSeed.rshit++; continue; } // Now that we have a seed hit, there are many issues to solve // before we have a completely framed dynamic programming problem. // They include: // // 1. Setting reference offsets on either side of the seed hit, // accounting for where the seed occurs in the read // 2. Adjusting the width of the banded dynamic programming problem // and adjusting reference bounds to allow for gaps in the // alignment // 3. Accounting for the edges of the reference, which can impact // the width of the DP problem and reference bounds. // 4. Perhaps filtering the problem down to a smaller problem based // on what DPs we've already solved for this read // // We do #1 here, since it is simple and we have all the seed-hit // information here. 
#2 and #3 are handled in the DynProgFramer. int readGaps = 0, refGaps = 0; bool ungapped = false; if(!eeMode) { readGaps = sc.maxReadGaps(minsc, rdlen); refGaps = sc.maxRefGaps(minsc, rdlen); ungapped = (readGaps == 0 && refGaps == 0); } int state = FOUND_NONE; bool found = false; // In unpaired mode, a seed extension is successful if it // results in a full alignment that meets the minimum score // threshold. In paired-end mode, a seed extension is // successful if it results in a *full paired-end* alignment // that meets the minimum score threshold. if(eeMode) { resEe_.reset(); resEe_.alres.reset(); const EEHit& h = eehits_[i]; assert_leq(h.score, perfectScore); resEe_.alres.setScore(AlnScore(h.score, h.ns(), 0)); resEe_.alres.setShape( refcoord.ref(), // ref id refcoord.off(), // 0-based ref offset tlen, // reference length fw, // aligned to Watson? rdlen, // read length true, // pretrim soft? 0, // pretrim 5' end 0, // pretrim 3' end true, // alignment trim soft? 0, // alignment trim 5' end 0); // alignment trim 3' end resEe_.alres.setRefNs(h.refns()); if(h.mms() > 0) { assert_eq(1, h.mms()); assert_lt(h.e1.pos, rd.length()); resEe_.alres.ned().push_back(h.e1); } assert(resEe_.repOk(rd)); state = FOUND_EE; found = true; Interval refival(refcoord, 1); seenDiags.add(refival); prm.nExEes++; prm.nEeFail++; // say it's failed until proven successful prm.nExEeFails++; } else if(doUngapped && ungapped) { resUngap_.reset(); int al = swa.ungappedAlign( fw ? rd.patFw : rd.patRc, fw ? 
rd.qual : rd.qualRev, refcoord, ref, tlen, sc, gReportOverhangs, minsc, // minimum resUngap_); Interval refival(refcoord, 1); seenDiags.add(refival); prm.nExUgs++; prm.nUgFail++; // say it's failed until proven successful prm.nExUgFails++; if(al == 0) { swmSeed.ungapfail++; continue; } else if(al == -1) { swmSeed.ungapnodec++; } else { found = true; state = FOUND_UNGAPPED; swmSeed.ungapsucc++; } } int64_t pastedRefoff = (int64_t)wr.toff - rdoff; DPRect rect; if(state == FOUND_NONE) { found = dpframe.frameSeedExtensionRect( refoff, // ref offset implied by seed hit assuming no gaps rows, // length of read sequence used in DP table tlen, // length of reference readGaps, // max # of read gaps permitted in opp mate alignment refGaps, // max # of ref gaps permitted in opp mate alignment (size_t)nceil, // # Ns permitted maxhalf, // max width in either direction rect); // DP rectangle assert(rect.repOk()); // Add the seed diagonal at least seenDiags.add(Interval(refcoord, 1)); if(!found) { continue; } } int64_t leftShift = refoff - rect.refl; size_t nwindow = 0; if(toff >= rect.refl) { nwindow = (size_t)(toff - rect.refl); } // NOTE: We might be taking off more than we should because the // pasted string omits non-A/C/G/T characters, but we included them // when calculating leftShift. We'll account for this later. 
pastedRefoff -= leftShift; size_t nsInLeftShift = 0; if(state == FOUND_NONE) { if(!swa.initedRead()) { // Initialize the aligner with a new read swa.initRead( rd.patFw, // fw version of query rd.patRc, // rc version of query rd.qual, // fw version of qualities rd.qualRev,// rc version of qualities 0, // off of first char in 'rd' to consider rdlen, // off of last char (excl) in 'rd' to consider sc); // scoring scheme } swa.initRef( fw, // whether to align forward or revcomp read tidx, // reference aligned against rect, // DP rectangle ref, // Reference strings tlen, // length of reference sequence sc, // scoring scheme minsc, // minimum score permitted enable8, // use 8-bit SSE if possible? cminlen, // minimum length for using checkpointing scheme cpow2, // interval b/t checkpointed diags; 1 << this doTri, // triangular mini-fills? true, // this is a seed extension - not finding a mate nwindow, nsInLeftShift); // Because of how we framed the problem, we can say that we've // exhaustively scored the seed diagonal as well as maxgaps // diagonals on either side Interval refival(tidx, 0, fw, 0); rect.initIval(refival); seenDiags.add(refival); // Now fill the dynamic programming matrix and return true iff // there is at least one valid alignment TAlScore bestCell = std::numeric_limits::min(); found = swa.align(rnd, bestCell); swmSeed.tallyGappedDp(readGaps, refGaps); prm.nExDps++; prm.nDpFail++; // failed until proven successful prm.nExDpFails++; // failed until proven successful if(!found) { TAlScore bestLast = anchor1 ? 
prm.bestLtMinscMate1 : prm.bestLtMinscMate2; if(bestCell > std::numeric_limits::min() && bestCell > bestLast) { if(anchor1) { prm.bestLtMinscMate1 = bestCell; } else { prm.bestLtMinscMate2 = bestCell; } } continue; // Look for more anchor alignments } } bool firstInner = true; bool foundConcordant = false; while(true) { assert(found); SwResult *res = NULL; if(state == FOUND_EE) { if(!firstInner) { break; } res = &resEe_; assert(res->repOk(rd)); } else if(state == FOUND_UNGAPPED) { if(!firstInner) { break; } res = &resUngap_; assert(res->repOk(rd)); } else { resGap_.reset(); assert(resGap_.empty()); if(swa.done()) { break; } swa.nextAlignment(resGap_, minsc, rnd); found = !resGap_.empty(); if(!found) { break; } res = &resGap_; assert(res->repOk(rd)); } // TODO: If we're just taking anchor alignments out of the // same rectangle, aren't we getting very similar // rectangles for the opposite mate each time? Seems like // we could save some work by detecting this. assert(res != NULL); firstInner = false; assert(res->alres.matchesRef( rd, ref, tmp_rf_, tmp_rdseq_, tmp_qseq_, raw_refbuf_, raw_destU32_, raw_matches_, tmp_reflens_, tmp_refoffs_)); Interval refival(tidx, 0, fw, tlen); assert_gt(res->alres.refExtent(), 0); if(gReportOverhangs && !refival.containsIgnoreOrient(res->alres.refival())) { res->alres.clipOutside(true, 0, tlen); if(res->alres.refExtent() == 0) { continue; } } assert(gReportOverhangs || refival.containsIgnoreOrient(res->alres.refival())); // Did the alignment fall entirely outside the reference? if(!refival.overlapsIgnoreOrient(res->alres.refival())) { continue; } // Is this alignment redundant with one we've seen previously? if(redAnchor_.overlap(res->alres)) { continue; } redAnchor_.add(res->alres); // Annotate the AlnRes object with some key parameters // that were used to obtain the alignment. 
res->alres.setParams( seedmms, // # mismatches allowed in seed seedlen, // length of seed seedival, // interval between seeds minsc); // minimum score for valid alignment bool foundMate = false; TRefOff off = res->alres.refoff(); if( msink->state().doneWithMate(!anchor1) && !msink->state().doneWithMate( anchor1)) { // We're done with the opposite mate but not with the // anchor mate; don't try to mate up the anchor. swMateImmediately = false; } if(found && swMateImmediately) { assert(!msink->state().doneWithMate(!anchor1)); bool oleft = false, ofw = false; int64_t oll = 0, olr = 0, orl = 0, orr = 0; assert(!msink->state().done()); foundMate = !oppFilt; TAlScore ominsc_cur = ominsc; //bool oungapped = false; int oreadGaps = 0, orefGaps = 0; //int oungappedAlign = -1; // defer if(foundMate) { // Adjust ominsc given the alignment score of the // anchor mate ominsc_cur = ominsc; if(tighten > 0 && msink->Mmode() && msink->hasSecondBestPair()) { // Paired-end alignments should have at least this score from now TAlScore ps; if(tighten == 1) { ps = msink->bestPair(); } else if(tighten == 2) { ps = msink->secondBestPair(); } else { TAlScore diff = msink->bestPair() - msink->secondBestPair(); ps = msink->secondBestPair() + (diff * 3)/4; } if(tighten == 1 && ps < bestPairScore && msink->bestPair() == msink->secondBestPair()) { ps++; } if(tighten >= 2 && ps < bestPairScore) { ps++; } // Anchor mate must have score at least 'ps' minus the best possible // score for the opposite mate. TAlScore nc = ps - res->alres.score().score(); if(nc > ominsc_cur) { ominsc_cur = nc; assert_leq(ominsc_cur, operfectScore); } } oreadGaps = sc.maxReadGaps(ominsc_cur, ordlen); orefGaps = sc.maxRefGaps (ominsc_cur, ordlen); //oungapped = (oreadGaps == 0 && orefGaps == 0); // TODO: Something lighter-weight than DP to scan // for other mate?? //if(oungapped) { // oresUngap_.reset(); // oungappedAlign = oswa.ungappedAlign( // ofw ? ord.patFw : ord.patRc, // ofw ? 
ord.qual : ord.qualRev, // orefcoord, // ref, // otlen, // sc, // gReportOverhangs, // ominsc_cur, // 0, // oresUngap_); //} foundMate = pepol.otherMate( anchor1, // anchor mate is mate #1? fw, // anchor aligned to Watson? off, // offset of anchor mate orows + oreadGaps, // max # columns spanned by alignment tlen, // reference length anchor1 ? rd.length() : ord.length(), // mate 1 len anchor1 ? ord.length() : rd.length(), // mate 2 len oleft, // out: look left for opposite mate? oll, olr, orl, orr, ofw); } DPRect orect; if(foundMate) { foundMate = dpframe.frameFindMateRect( !oleft, // true iff anchor alignment is to the left oll, // leftmost Watson off for LHS of opp aln olr, // rightmost Watson off for LHS of opp aln orl, // leftmost Watson off for RHS of opp aln orr, // rightmost Watson off for RHS of opp aln orows, // length of opposite mate tlen, // length of reference sequence aligned to oreadGaps, // max # of read gaps in opp mate aln orefGaps, // max # of ref gaps in opp mate aln (size_t)onceil, // max # Ns on opp mate maxhalf, // max width in either direction orect); // DP rectangle assert(!foundMate || orect.refr >= orect.refl); } if(foundMate) { oresGap_.reset(); assert(oresGap_.empty()); if(!oswa.initedRead()) { oswa.initRead( ord.patFw, // read to align ord.patRc, // qualities ord.qual, // read to align ord.qualRev,// qualities 0, // off of first char to consider ordlen, // off of last char (ex) to consider sc); // scoring scheme } // Given the boundaries defined by refi and reff, initilize // the SwAligner with the dynamic programming problem that // aligns the read to this reference stretch. size_t onsInLeftShift = 0; assert_geq(orect.refr, orect.refl); oswa.initRef( ofw, // align forward or revcomp read? tidx, // reference aligned against orect, // DP rectangle ref, // Reference strings tlen, // length of reference sequence sc, // scoring scheme ominsc_cur,// min score for valid alignments enable8, // use 8-bit SSE if possible? 
cminlen, // minimum length for using checkpointing scheme cpow2, // interval b/t checkpointed diags; 1 << this doTri, // triangular mini-fills? false, // this is finding a mate - not seed ext 0, // nwindow? onsInLeftShift); // TODO: Can't we add some diagonals to the // opposite mate's seenDiags when we fill in the // opposite mate's DP? Or can we? We might want // to use this again as an anchor - will that still // happen? Also, isn't there a problem with // consistency of the minimum score? Minimum score // here depends in part on the score of the anchor // alignment here, but it won't when the current // opposite becomes the anchor. // Because of how we framed the problem, we can say // that we've exhaustively explored the "core" // diagonals //Interval orefival(tidx, 0, ofw, 0); //orect.initIval(orefival); //oseenDiags.add(orefival); // Now fill the dynamic programming matrix, return true // iff there is at least one valid alignment TAlScore bestCell = std::numeric_limits::min(); foundMate = oswa.align(rnd, bestCell); prm.nMateDps++; swmMate.tallyGappedDp(oreadGaps, orefGaps); if(!foundMate) { TAlScore bestLast = anchor1 ? prm.bestLtMinscMate2 : prm.bestLtMinscMate1; if(bestCell > std::numeric_limits::min() && bestCell > bestLast) { if(anchor1) { prm.bestLtMinscMate2 = bestCell; } else { prm.bestLtMinscMate1 = bestCell; } } } } bool didAnchor = false; do { oresGap_.reset(); assert(oresGap_.empty()); if(foundMate && oswa.done()) { foundMate = false; } else if(foundMate) { oswa.nextAlignment(oresGap_, ominsc_cur, rnd); foundMate = !oresGap_.empty(); assert(!foundMate || oresGap_.alres.matchesRef( ord, ref, tmp_rf_, tmp_rdseq_, tmp_qseq_, raw_refbuf_, raw_destU32_, raw_matches_, tmp_reflens_, tmp_refoffs_)); } if(foundMate) { // Redundant with one we've seen previously? 
if(!redAnchor_.overlap(oresGap_.alres)) { redAnchor_.add(oresGap_.alres); } assert_eq(ofw, oresGap_.alres.fw()); // Annotate the AlnRes object with some key parameters // that were used to obtain the alignment. oresGap_.alres.setParams( seedmms, // # mismatches allowed in seed seedlen, // length of seed seedival, // interval between seeds ominsc); // minimum score for valid alignment assert_gt(oresGap_.alres.refExtent(), 0); if(gReportOverhangs && !refival.containsIgnoreOrient(oresGap_.alres.refival())) { oresGap_.alres.clipOutside(true, 0, tlen); foundMate = oresGap_.alres.refExtent() > 0; } if(foundMate && ((!gReportOverhangs && !refival.containsIgnoreOrient(oresGap_.alres.refival())) || !refival.overlapsIgnoreOrient(oresGap_.alres.refival()))) { foundMate = false; } } ASSERT_ONLY(TRefId refid); TRefOff off1, off2; size_t len1, len2; bool fw1, fw2; int pairCl = PE_ALS_DISCORD; if(foundMate) { ASSERT_ONLY(refid =) res->alres.refid(); assert_eq(refid, oresGap_.alres.refid()); off1 = anchor1 ? off : oresGap_.alres.refoff(); off2 = anchor1 ? oresGap_.alres.refoff() : off; len1 = anchor1 ? res->alres.refExtent() : oresGap_.alres.refExtent(); len2 = anchor1 ? oresGap_.alres.refExtent() : res->alres.refExtent(); fw1 = anchor1 ? res->alres.fw() : oresGap_.alres.fw(); fw2 = anchor1 ? 
oresGap_.alres.fw() : res->alres.fw(); // Check that final mate alignments are consistent with // paired-end fragment constraints pairCl = pepol.peClassifyPair( off1, len1, fw1, off2, len2, fw2); // Instead of trying //foundMate = pairCl != PE_ALS_DISCORD; } if(msink->state().doneConcordant()) { foundMate = false; } if(reportImmediately) { if(foundMate) { // Report pair to the AlnSinkWrap assert(!msink->state().doneConcordant()); assert(msink != NULL); assert(res->repOk()); assert(oresGap_.repOk()); // Report an unpaired alignment assert(!msink->maxed()); assert(!msink->state().done()); bool doneUnpaired = false; //if(mixed || discord) { // Report alignment for mate #1 as an // unpaired alignment. if(!anchor1 || !didAnchor) { if(anchor1) { didAnchor = true; } const AlnRes& r1 = anchor1 ? res->alres : oresGap_.alres; if(!redMate1_.overlap(r1)) { redMate1_.add(r1); if(msink->report(0, &r1, NULL)) { doneUnpaired = true; // Short-circuited } } } // Report alignment for mate #2 as an // unpaired alignment. if(anchor1 || !didAnchor) { if(!anchor1) { didAnchor = true; } const AlnRes& r2 = anchor1 ? oresGap_.alres : res->alres; if(!redMate2_.overlap(r2)) { redMate2_.add(r2); if(msink->report(0, NULL, &r2)) { doneUnpaired = true; // Short-circuited } } } //} // if(mixed || discord) bool donePaired = false; if(pairCl != PE_ALS_DISCORD) { foundConcordant = true; if(msink->report( 0, anchor1 ? &res->alres : &oresGap_.alres, anchor1 ? &oresGap_.alres : &res->alres)) { // Short-circuited because a limit, e.g. 
// -k, -m or -M, was exceeded donePaired = true; } else { if(tighten > 0 && msink->Mmode() && msink->hasSecondBestPair()) { // Paired-end alignments should have at least this score from now TAlScore ps; if(tighten == 1) { ps = msink->bestPair(); } else if(tighten == 2) { ps = msink->secondBestPair(); } else { TAlScore diff = msink->bestPair() - msink->secondBestPair(); ps = msink->secondBestPair() + (diff * 3)/4; } if(tighten == 1 && ps < bestPairScore && msink->bestPair() == msink->secondBestPair()) { ps++; } if(tighten >= 2 && ps < bestPairScore) { ps++; } // Anchor mate must have score at least 'ps' minus the best possible // score for the opposite mate. TAlScore nc = ps - operfectScore; if(nc > minsc) { minsc = nc; assert_leq(minsc, perfectScore); if(minsc > res->alres.score().score()) { // We're done with this anchor break; } } assert_leq(minsc, perfectScore); } } } // if(pairCl != PE_ALS_DISCORD) if(donePaired || doneUnpaired) { return EXTEND_POLICY_FULFILLED; } if(msink->state().doneWithMate(anchor1)) { // We're now done with the mate that we're // currently using as our anchor. We're not // with the read overall. return EXTEND_POLICY_FULFILLED; } } else if((mixed || discord) && !didAnchor) { didAnchor = true; // Report unpaired hit for anchor assert(msink != NULL); assert(res->repOk()); // Check that alignment accurately reflects the // reference characters aligned to assert(res->alres.matchesRef( rd, ref, tmp_rf_, tmp_rdseq_, tmp_qseq_, raw_refbuf_, raw_destU32_, raw_matches_, tmp_reflens_, tmp_refoffs_)); // Report an unpaired alignment assert(!msink->maxed()); assert(!msink->state().done()); // Report alignment for mate #1 as an // unpaired alignment. if(!msink->state().doneUnpaired(anchor1)) { const AlnRes& r = res->alres; RedundantAlns& red = anchor1 ? redMate1_ : redMate2_; const AlnRes* r1 = anchor1 ? &res->alres : NULL; const AlnRes* r2 = anchor1 ? 
NULL : &res->alres; if(!red.overlap(r)) { red.add(r); if(msink->report(0, r1, r2)) { return EXTEND_POLICY_FULFILLED; // Short-circuited } } } if(msink->state().doneWithMate(anchor1)) { // Done with mate, but not read overall return EXTEND_POLICY_FULFILLED; } } } } while(!oresGap_.empty()); } // if(found && swMateImmediately) else if(found) { assert(!msink->state().doneWithMate(anchor1)); // We found an anchor alignment but did not attempt to find // an alignment for the opposite mate (probably because // we're done with it) if(reportImmediately && (mixed || discord)) { // Report unpaired hit for anchor assert(msink != NULL); assert(res->repOk()); // Check that alignment accurately reflects the // reference characters aligned to assert(res->alres.matchesRef( rd, ref, tmp_rf_, tmp_rdseq_, tmp_qseq_, raw_refbuf_, raw_destU32_, raw_matches_, tmp_reflens_, tmp_refoffs_)); // Report an unpaired alignment assert(!msink->maxed()); assert(!msink->state().done()); // Report alignment for mate #1 as an // unpaired alignment. if(!msink->state().doneUnpaired(anchor1)) { const AlnRes& r = res->alres; RedundantAlns& red = anchor1 ? redMate1_ : redMate2_; const AlnRes* r1 = anchor1 ? &res->alres : NULL; const AlnRes* r2 = anchor1 ? NULL : &res->alres; if(!red.overlap(r)) { red.add(r); if(msink->report(0, r1, r2)) { return EXTEND_POLICY_FULFILLED; // Short-circuited } } } if(msink->state().doneWithMate(anchor1)) { // Done with mate, but not read overall return EXTEND_POLICY_FULFILLED; } } } } // while(true) if(foundConcordant) { prm.nMateDpSuccs++; mateStreaks_[i] = 0; // Register this as a success. Now we need to // make the streak variables reflect the // success. 
if(state == FOUND_UNGAPPED) { assert_gt(prm.nUgFail, 0); assert_gt(prm.nExUgFails, 0); prm.nExUgFails--; prm.nExUgSuccs++; prm.nUgLastSucc = prm.nExUgs-1; if(prm.nUgFail > prm.nUgFailStreak) { prm.nUgFailStreak = prm.nUgFail; } prm.nUgFail = 0; } else if(state == FOUND_EE) { assert_gt(prm.nEeFail, 0); assert_gt(prm.nExEeFails, 0); prm.nExEeFails--; prm.nExEeSuccs++; prm.nEeLastSucc = prm.nExEes-1; if(prm.nEeFail > prm.nEeFailStreak) { prm.nEeFailStreak = prm.nEeFail; } prm.nEeFail = 0; } else { assert_gt(prm.nDpFail, 0); assert_gt(prm.nExDpFails, 0); prm.nExDpFails--; prm.nExDpSuccs++; prm.nDpLastSucc = prm.nExDps-1; if(prm.nDpFail > prm.nDpFailStreak) { prm.nDpFailStreak = prm.nDpFail; } prm.nDpFail = 0; } } else { prm.nMateDpFails++; mateStreaks_[i]++; } // At this point we know that we aren't bailing, and will continue to resolve seed hits. } // while(!gw.done()) } // for(size_t i = 0; i < gws_.size(); i++) } return EXTEND_EXHAUSTED_CANDIDATES; } #endif /*ALIGNER_SW_DRIVER_H_*/