/* * Copyright 2011, Ben Langmead * * This file is part of Bowtie 2. * * Bowtie 2 is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Bowtie 2 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Bowtie 2. If not, see . */ #ifndef ALIGNER_SEED_H_ #define ALIGNER_SEED_H_ #include #include #include #include "qual.h" #include "ds.h" #include "sstring.h" #include "alphabet.h" #include "edit.h" #include "read.h" // Threading is necessary to synchronize the classes that dump // intermediate alignment results to files. Otherwise, all data herein // is constant and shared, or per-thread. #include "threading.h" #include "aligner_result.h" #include "aligner_cache.h" #include "scoring.h" #include "mem_ids.h" #include "simple_func.h" #include "btypes.h" /** * A constraint to apply to an alignment zone, or to an overall * alignment. * * The constraint can put both caps and ceilings on the number and * types of edits allowed. */ struct Constraint { Constraint() { init(); } /** * Initialize Constraint to be fully permissive. */ void init() { edits = mms = ins = dels = penalty = editsCeil = mmsCeil = insCeil = delsCeil = penaltyCeil = MAX_I; penFunc.reset(); instantiated = false; } /** * Return true iff penalities and constraints prevent us from * adding any edits. */ bool mustMatch() { assert(instantiated); return (mms == 0 && edits == 0) || penalty == 0 || (mms == 0 && dels == 0 && ins == 0); } /** * Return true iff a mismatch of the given quality is permitted. */ bool canMismatch(int q, const Scoring& cm) { assert(instantiated); return (mms > 0 || edits > 0) && penalty >= cm.mm(q); } /** * Return true iff a mismatch of the given quality is permitted. */ bool canN(int q, const Scoring& cm) { assert(instantiated); return (mms > 0 || edits > 0) && penalty >= cm.n(q); } /** * Return true iff a mismatch of *any* quality (even qual=1) is * permitted. */ bool canMismatch() { assert(instantiated); return (mms > 0 || edits > 0) && penalty > 0; } /** * Return true iff a mismatch of *any* quality (even qual=1) is * permitted. */ bool canN() { assert(instantiated); return (mms > 0 || edits > 0); } /** * Return true iff a deletion of the given extension (0=open, 1=1st * extension, etc) is permitted. */ bool canDelete(int ex, const Scoring& cm) { assert(instantiated); return (dels > 0 && edits > 0) && penalty >= cm.del(ex); } /** * Return true iff a deletion of any extension is permitted. */ bool canDelete() { assert(instantiated); return (dels > 0 || edits > 0) && penalty > 0; } /** * Return true iff an insertion of the given extension (0=open, * 1=1st extension, etc) is permitted. */ bool canInsert(int ex, const Scoring& cm) { assert(instantiated); return (ins > 0 || edits > 0) && penalty >= cm.ins(ex); } /** * Return true iff an insertion of any extension is permitted. */ bool canInsert() { assert(instantiated); return (ins > 0 || edits > 0) && penalty > 0; } /** * Return true iff a gap of any extension is permitted */ bool canGap() { assert(instantiated); return ((ins > 0 || dels > 0) || edits > 0) && penalty > 0; } /** * Charge a mismatch of the given quality. */ void chargeMismatch(int q, const Scoring& cm) { assert(instantiated); if(mms == 0) { assert_gt(edits, 0); edits--; } else mms--; penalty -= cm.mm(q); assert_geq(mms, 0); assert_geq(edits, 0); assert_geq(penalty, 0); } /** * Charge an N mismatch of the given quality. */ void chargeN(int q, const Scoring& cm) { assert(instantiated); if(mms == 0) { assert_gt(edits, 0); edits--; } else mms--; penalty -= cm.n(q); assert_geq(mms, 0); assert_geq(edits, 0); assert_geq(penalty, 0); } /** * Charge a deletion of the given extension. */ void chargeDelete(int ex, const Scoring& cm) { assert(instantiated); dels--; edits--; penalty -= cm.del(ex); assert_geq(dels, 0); assert_geq(edits, 0); assert_geq(penalty, 0); } /** * Charge an insertion of the given extension. */ void chargeInsert(int ex, const Scoring& cm) { assert(instantiated); ins--; edits--; penalty -= cm.ins(ex); assert_geq(ins, 0); assert_geq(edits, 0); assert_geq(penalty, 0); } /** * Once the constrained area is completely explored, call this * function to check whether there were *at least* as many * dissimilarities as required by the constraint. Bounds like this * are helpful to resolve instances where two search roots would * otherwise overlap in what alignments they can find. */ bool acceptable() { assert(instantiated); return edits <= editsCeil && mms <= mmsCeil && ins <= insCeil && dels <= delsCeil && penalty <= penaltyCeil; } /** * Instantiate a constraint w/r/t the read length and the constant * and linear coefficients for the penalty function. */ static int instantiate(size_t rdlen, const SimpleFunc& func) { return func.f((double)rdlen); } /** * Instantiate this constraint w/r/t the read length. */ void instantiate(size_t rdlen) { assert(!instantiated); if(penFunc.initialized()) { penalty = Constraint::instantiate(rdlen, penFunc); } instantiated = true; } int edits; // # edits permitted int mms; // # mismatches permitted int ins; // # insertions permitted int dels; // # deletions permitted int penalty; // penalty total permitted int editsCeil; // <= this many edits can be left at the end int mmsCeil; // <= this many mismatches can be left at the end int insCeil; // <= this many inserts can be left at the end int delsCeil; // <= this many deletions can be left at the end int penaltyCeil;// <= this much leftover penalty can be left at the end SimpleFunc penFunc;// penalty function; function of read len bool instantiated; // whether constraint is instantiated w/r/t read len // // Some static methods for constructing some standard Constraints // /** * Construct a constraint with no edits of any kind allowed. */ static Constraint exact(); /** * Construct a constraint where the only constraint is a total * penalty constraint. */ static Constraint penaltyBased(int pen); /** * Construct a constraint where the only constraint is a total * penalty constraint related to the length of the read. */ static Constraint penaltyFuncBased(const SimpleFunc& func); /** * Construct a constraint where the only constraint is a total * penalty constraint. */ static Constraint mmBased(int mms); /** * Construct a constraint where the only constraint is a total * penalty constraint. */ static Constraint editBased(int edits); }; /** * We divide seed search strategies into three categories: * * 1. A left-to-right search where the left half of the read is * constrained to match exactly and the right half is subject to * some looser constraint (e.g. 1mm or 2mm) * 2. Same as 1, but going right to left with the exact matching half * on the right. * 3. Inside-out search where the center half of the read is * constrained to match exactly, and the extreme quarters of the * read are subject to a looser constraint. */ enum { SEED_TYPE_EXACT = 1, SEED_TYPE_LEFT_TO_RIGHT, SEED_TYPE_RIGHT_TO_LEFT, SEED_TYPE_INSIDE_OUT }; struct InstantiatedSeed; /** * Policy dictating how to size and arrange seeds along the length of * the read, and what constraints to force on the zones of the seed. * We assume that seeds are plopped down at regular intervals from the * 5' to 3' ends, with the first seed flush to the 5' end. * * If the read is shorter than a single seed, one seed is used and it * is shrunk to accommodate the read. */ struct Seed { int len; // length of a seed int type; // dictates anchor portion, direction of search Constraint *overall; // for the overall alignment Seed() { init(0, 0, NULL); } /** * Construct and initialize this seed with given length and type. */ Seed(int ln, int ty, Constraint* oc) { init(ln, ty, oc); } /** * Initialize this seed with given length and type. */ void init(int ln, int ty, Constraint* oc) { len = ln; type = ty; overall = oc; } // If the seed is split into halves, we just use zones[0] and // zones[1]; 0 is the near half and 1 is the far half. If the seed // is split into thirds (i.e. inside-out) then 0 is the center, 1 // is the far portion on the left, and 2 is the far portion on the // right. Constraint zones[3]; /** * Once the constrained seed is completely explored, call this * function to check whether there were *at least* as many * dissimilarities as required by all constraints. Bounds like this * are helpful to resolve instances where two search roots would * otherwise overlap in what alignments they can find. */ bool acceptable() { assert(overall != NULL); return zones[0].acceptable() && zones[1].acceptable() && zones[2].acceptable() && overall->acceptable(); } /** * Given a read, depth and orientation, extract a seed data structure * from the read and fill in the steps & zones arrays. The Seed * contains the sequence and quality values. */ bool instantiate( const Read& read, const BTDnaString& seq, // already-extracted seed sequence const BTString& qual, // already-extracted seed quality sequence const Scoring& pens, int depth, int seedoffidx, int seedtypeidx, bool fw, InstantiatedSeed& si) const; /** * Return a list of Seed objects encapsulating */ static void mmSeeds( int mms, int ln, EList& pols, Constraint& oall) { if(mms == 0) { zeroMmSeeds(ln, pols, oall); } else if(mms == 1) { oneMmSeeds(ln, pols, oall); } else if(mms == 2) { twoMmSeeds(ln, pols, oall); } else throw 1; } static void zeroMmSeeds(int ln, EList&, Constraint&); static void oneMmSeeds (int ln, EList&, Constraint&); static void twoMmSeeds (int ln, EList&, Constraint&); }; /** * An instantiated seed is a seed (perhaps modified to fit the read) * plus all data needed to conduct a search of the seed. */ struct InstantiatedSeed { InstantiatedSeed() : steps(AL_CAT), zones(AL_CAT) { } // Steps map. There are as many steps as there are positions in // the seed. The map is a helpful abstraction because we sometimes // visit seed positions in an irregular order (e.g. inside-out // search). EList steps; // Zones map. For each step, records what constraint to charge an // edit to. The first entry in each pair gives the constraint for // non-insert edits and the second entry in each pair gives the // constraint for insert edits. If the value stored is negative, // this indicates that the zone is "closed out" after this // position, so zone acceptility should be checked. EList > zones; // Nucleotide sequence covering the seed, extracted from read BTDnaString *seq; // Quality sequence covering the seed, extracted from read BTString *qual; // Initial constraints governing zones 0, 1, 2. We precalculate // the effect of Ns on these. Constraint cons[3]; // Overall constraint, tailored to the read length. Constraint overall; // Maximum number of positions that the aligner may advance before // its first step. This lets the aligner know whether it can use // the ftab or not. int maxjump; // Offset of seed from 5' end of read int seedoff; // Id for seed offset; ids are such that the smallest index is the // closest to the 5' end and consecutive ids are adjacent (i.e. // there are no intervening offsets with seeds) int seedoffidx; // Type of seed (left-to-right, etc) int seedtypeidx; // Seed comes from forward-oriented read? bool fw; // Filtered out due to the pattern of Ns present. If true, this // seed should be ignored by searchAllSeeds(). bool nfiltered; // Seed this was instantiated from Seed s; #ifndef NDEBUG /** * Check that InstantiatedSeed is internally consistent. */ bool repOk() const { return true; } #endif }; /** * Simple struct for holding a end-to-end alignments for the read with at most * 2 edits. */ template struct EEHit { EEHit() { reset(); } void reset() { top = bot = 0; fw = false; e1.reset(); e2.reset(); score = MIN_I64; } void init( index_t top_, index_t bot_, const Edit* e1_, const Edit* e2_, bool fw_, int64_t score_) { top = top_; bot = bot_; if(e1_ != NULL) { e1 = *e1_; } else { e1.reset(); } if(e2_ != NULL) { e2 = *e2_; } else { e2.reset(); } fw = fw_; score = score_; } /** * Return number of mismatches in the alignment. */ int mms() const { if (e2.inited()) return 2; else if(e1.inited()) return 1; else return 0; } /** * Return the number of Ns involved in the alignment. */ int ns() const { int ns = 0; if(e1.inited() && e1.hasN()) { ns++; if(e2.inited() && e2.hasN()) { ns++; } } return ns; } /** * Return the number of Ns involved in the alignment. */ int refns() const { int ns = 0; if(e1.inited() && e1.chr == 'N') { ns++; if(e2.inited() && e2.chr == 'N') { ns++; } } return ns; } /** * Return true iff there is no hit. */ bool empty() const { return bot <= top; } /** * Higher score = higher priority. */ bool operator<(const EEHit& o) const { return score > o.score; } /** * Return the size of the alignments SA range.s */ index_t size() const { return bot - top; } #ifndef NDEBUG /** * Check that hit is sane w/r/t read. */ bool repOk(const Read& rd) const { assert_gt(bot, top); if(e1.inited()) { assert_lt(e1.pos, rd.length()); if(e2.inited()) { assert_lt(e2.pos, rd.length()); } } return true; } #endif index_t top; index_t bot; Edit e1; Edit e2; bool fw; int64_t score; }; /** * Data structure for holding all of the seed hits associated with a read. All * the seed hits for a given read are encapsulated in a single QVal object. A * QVal refers to a range of values in the qlist, where each qlist value is a * BW range and a slot to hold the hit's suffix array offset. QVals are kept * in two lists (hitsFw_ and hitsRc_), one for seeds on the forward read strand, * one for seeds on the reverse read strand. The list is indexed by read * offset index (e.g. 0=closest-to-5', 1=second-closest, etc). * * An assumption behind this data structure is that all the seeds are found * first, then downstream analyses try to extend them. In between finding the * seed hits and extending them, the sort() member function is called, which * ranks QVals according to the order they should be extended. Right now the * policy is that QVals with fewer elements (hits) should be tried first. */ template class SeedResults { public: SeedResults() : seqFw_(AL_CAT), seqRc_(AL_CAT), qualFw_(AL_CAT), qualRc_(AL_CAT), hitsFw_(AL_CAT), hitsRc_(AL_CAT), isFw_(AL_CAT), isRc_(AL_CAT), sortedFw_(AL_CAT), sortedRc_(AL_CAT), offIdx2off_(AL_CAT), rankOffs_(AL_CAT), rankFws_(AL_CAT), mm1Hit_(AL_CAT) { clear(); } /** * Set the current read. */ void nextRead(const Read& read) { read_ = &read; } /** * Set the appropriate element of either hitsFw_ or hitsRc_ to the given * QVal. A QVal encapsulates all the BW ranges for reference substrings * that are within some distance of the seed string. */ void add( const QVal& qv, // range of ranges in cache const AlignmentCache& ac, // cache index_t seedIdx, // seed index (from 5' end) bool seedFw) // whether seed is from forward read { assert(qv.repOk(ac)); assert(repOk(&ac)); assert_lt(seedIdx, hitsFw_.size()); assert_gt(numOffs_, 0); // if this fails, probably failed to call reset if(qv.empty()) return; if(seedFw) { assert(!hitsFw_[seedIdx].valid()); hitsFw_[seedIdx] = qv; numEltsFw_ += qv.numElts(); numRangesFw_ += qv.numRanges(); if(qv.numRanges() > 0) nonzFw_++; } else { assert(!hitsRc_[seedIdx].valid()); hitsRc_[seedIdx] = qv; numEltsRc_ += qv.numElts(); numRangesRc_ += qv.numRanges(); if(qv.numRanges() > 0) nonzRc_++; } numElts_ += qv.numElts(); numRanges_ += qv.numRanges(); if(qv.numRanges() > 0) { nonzTot_++; } assert(repOk(&ac)); } /** * Clear buffered seed hits and state. Set the number of seed * offsets and the read. */ void reset( const Read& read, const EList& offIdx2off, size_t numOffs) { assert_gt(numOffs, 0); clearSeeds(); numOffs_ = numOffs; seqFw_.resize(numOffs_); seqRc_.resize(numOffs_); qualFw_.resize(numOffs_); qualRc_.resize(numOffs_); hitsFw_.resize(numOffs_); hitsRc_.resize(numOffs_); isFw_.resize(numOffs_); isRc_.resize(numOffs_); sortedFw_.resize(numOffs_); sortedRc_.resize(numOffs_); offIdx2off_ = offIdx2off; for(size_t i = 0; i < numOffs_; i++) { sortedFw_[i] = sortedRc_[i] = false; hitsFw_[i].reset(); hitsRc_[i].reset(); isFw_[i].clear(); isRc_[i].clear(); } read_ = &read; sorted_ = false; } /** * Clear seed-hit state. */ void clearSeeds() { sortedFw_.clear(); sortedRc_.clear(); rankOffs_.clear(); rankFws_.clear(); offIdx2off_.clear(); hitsFw_.clear(); hitsRc_.clear(); isFw_.clear(); isRc_.clear(); seqFw_.clear(); seqRc_.clear(); nonzTot_ = 0; nonzFw_ = 0; nonzRc_ = 0; numOffs_ = 0; numRanges_ = 0; numElts_ = 0; numRangesFw_ = 0; numEltsFw_ = 0; numRangesRc_ = 0; numEltsRc_ = 0; } /** * Clear seed-hit state and end-to-end alignment state. */ void clear() { clearSeeds(); read_ = NULL; exactFwHit_.reset(); exactRcHit_.reset(); mm1Hit_.clear(); mm1Sorted_ = false; mm1Elt_ = 0; assert(empty()); } /** * Extract key summaries from this SeedResults and put into 'ssum'. */ void toSeedAlSumm(SeedAlSumm& ssum) const { // Number of positions with at least 1 range ssum.nonzTot = nonzTot_; ssum.nonzFw = nonzFw_; ssum.nonzRc = nonzRc_; // Number of ranges ssum.nrangeTot = numRanges_; ssum.nrangeFw = numRangesFw_; ssum.nrangeRc = numRangesRc_; // Number of elements ssum.neltTot = numElts_; ssum.neltFw = numEltsFw_; ssum.neltRc = numEltsRc_; // Other summaries ssum.maxNonzRangeFw = ssum.minNonzRangeFw = 0; ssum.maxNonzRangeRc = ssum.minNonzRangeRc = 0; ssum.maxNonzEltFw = ssum.minNonzEltFw = 0; ssum.maxNonzEltRc = ssum.minNonzEltRc = 0; for(size_t i = 0; i < numOffs_; i++) { if(hitsFw_[i].valid()) { if(ssum.minNonzEltFw == 0 || hitsFw_[i].numElts() < ssum.minNonzEltFw) { ssum.minNonzEltFw = hitsFw_[i].numElts(); } if(ssum.maxNonzEltFw == 0 || hitsFw_[i].numElts() > ssum.maxNonzEltFw) { ssum.maxNonzEltFw = hitsFw_[i].numElts(); } if(ssum.minNonzRangeFw == 0 || hitsFw_[i].numRanges() < ssum.minNonzRangeFw) { ssum.minNonzRangeFw = hitsFw_[i].numRanges(); } if(ssum.maxNonzRangeFw == 0 || hitsFw_[i].numRanges() > ssum.maxNonzRangeFw) { ssum.maxNonzRangeFw = hitsFw_[i].numRanges(); } } if(hitsRc_[i].valid()) { if(ssum.minNonzEltRc == 0 || hitsRc_[i].numElts() < ssum.minNonzEltRc) { ssum.minNonzEltRc = hitsRc_[i].numElts(); } if(ssum.maxNonzEltRc == 0 || hitsRc_[i].numElts() > ssum.maxNonzEltRc) { ssum.maxNonzEltRc = hitsRc_[i].numElts(); } if(ssum.minNonzRangeRc == 0 || hitsRc_[i].numRanges() < ssum.minNonzRangeRc) { ssum.minNonzRangeRc = hitsRc_[i].numRanges(); } if(ssum.maxNonzRangeRc == 0 || hitsRc_[i].numRanges() > ssum.maxNonzRangeRc) { ssum.maxNonzRangeRc = hitsRc_[i].numRanges(); } } } } /** * Return average number of hits per seed. */ float averageHitsPerSeed() const { return (float)numElts_ / (float)nonzTot_; } /** * Return median of all the non-zero per-seed # hits */ float medianHitsPerSeed() const { EList& median = const_cast&>(tmpMedian_); median.clear(); for(size_t i = 0; i < numOffs_; i++) { if(hitsFw_[i].valid() && hitsFw_[i].numElts() > 0) { median.push_back(hitsFw_[i].numElts()); } if(hitsRc_[i].valid() && hitsRc_[i].numElts() > 0) { median.push_back(hitsRc_[i].numElts()); } } if(tmpMedian_.empty()) { return 0.0f; } median.sort(); float med1 = (float)median[tmpMedian_.size() >> 1]; float med2 = med1; if((median.size() & 1) == 0) { med2 = (float)median[(tmpMedian_.size() >> 1) - 1]; } return med1 + med2 * 0.5f; } /** * Return a number that's meant to quantify how hopeful we are that this * set of seed hits will lead to good alignments. */ double uniquenessFactor() const { double result = 0.0; for(size_t i = 0; i < numOffs_; i++) { if(hitsFw_[i].valid()) { size_t nelt = hitsFw_[i].numElts(); result += (1.0 / (double)(nelt * nelt)); } if(hitsRc_[i].valid()) { size_t nelt = hitsRc_[i].numElts(); result += (1.0 / (double)(nelt * nelt)); } } return result; } /** * Return the number of ranges being held. */ index_t numRanges() const { return numRanges_; } /** * Return the number of elements being held. */ index_t numElts() const { return numElts_; } /** * Return the number of ranges being held for seeds on the forward * read strand. */ index_t numRangesFw() const { return numRangesFw_; } /** * Return the number of elements being held for seeds on the * forward read strand. */ index_t numEltsFw() const { return numEltsFw_; } /** * Return the number of ranges being held for seeds on the * reverse-complement read strand. */ index_t numRangesRc() const { return numRangesRc_; } /** * Return the number of elements being held for seeds on the * reverse-complement read strand. */ index_t numEltsRc() const { return numEltsRc_; } /** * Given an offset index, return the offset that has that index. */ index_t idx2off(size_t off) const { return offIdx2off_[off]; } /** * Return true iff there are 0 hits being held. */ bool empty() const { return numRanges() == 0; } /** * Get the QVal representing all the reference hits for the given * orientation and seed offset index. */ const QVal& hitsAtOffIdx(bool fw, size_t seedoffidx) const { assert_lt(seedoffidx, numOffs_); assert(repOk(NULL)); return fw ? hitsFw_[seedoffidx] : hitsRc_[seedoffidx]; } /** * Get the Instantiated seeds for the given orientation and offset. */ EList& instantiatedSeeds(bool fw, size_t seedoffidx) { assert_lt(seedoffidx, numOffs_); assert(repOk(NULL)); return fw ? isFw_[seedoffidx] : isRc_[seedoffidx]; } /** * Return the number of different seed offsets possible. */ index_t numOffs() const { return numOffs_; } /** * Return the read from which seeds were extracted, aligned. */ const Read& read() const { return *read_; } #ifndef NDEBUG /** * Check that this SeedResults is internally consistent. */ bool repOk( const AlignmentCache* ac, bool requireInited = false) const { if(requireInited) { assert(read_ != NULL); } if(numOffs_ > 0) { assert_eq(numOffs_, hitsFw_.size()); assert_eq(numOffs_, hitsRc_.size()); assert_leq(numRanges_, numElts_); assert_leq(nonzTot_, numRanges_); size_t nonzs = 0; for(int fw = 0; fw <= 1; fw++) { const EList >& rrs = (fw ? hitsFw_ : hitsRc_); for(size_t i = 0; i < numOffs_; i++) { if(rrs[i].valid()) { if(rrs[i].numRanges() > 0) nonzs++; if(ac != NULL) { assert(rrs[i].repOk(*ac)); } } } } assert_eq(nonzs, nonzTot_); assert(!sorted_ || nonzTot_ == rankFws_.size()); assert(!sorted_ || nonzTot_ == rankOffs_.size()); } return true; } #endif /** * Populate rankOffs_ and rankFws_ with the list of QVals that need to be * examined for this SeedResults, in order. The order is ascending by * number of elements, so QVals with fewer elements (i.e. seed sequences * that are more unique) will be tried first and QVals with more elements * (i.e. seed sequences */ void rankSeedHits(RandomSource& rnd) { while(rankOffs_.size() < nonzTot_) { index_t minsz = (index_t)0xffffffff; index_t minidx = 0; bool minfw = true; // Rank seed-hit positions in ascending order by number of elements // in all BW ranges bool rb = rnd.nextBool(); assert(rb == 0 || rb == 1); for(int fwi = 0; fwi <= 1; fwi++) { bool fw = (fwi == (rb ? 1 : 0)); EList >& rrs = (fw ? hitsFw_ : hitsRc_); EList& sorted = (fw ? sortedFw_ : sortedRc_); index_t i = (rnd.nextU32() % (index_t)numOffs_); for(index_t ii = 0; ii < numOffs_; ii++) { if(rrs[i].valid() && // valid QVal rrs[i].numElts() > 0 && // non-empty !sorted[i] && // not already sorted rrs[i].numElts() < minsz) // least elts so far? { minsz = rrs[i].numElts(); minidx = i; minfw = (fw == 1); } if((++i) == numOffs_) { i = 0; } } } assert_neq((index_t)0xffffffff, minsz); if(minfw) { sortedFw_[minidx] = true; } else { sortedRc_[minidx] = true; } rankOffs_.push_back(minidx); rankFws_.push_back(minfw); } assert_eq(rankOffs_.size(), rankFws_.size()); sorted_ = true; } /** * Return the number of orientation/offsets into the read that have * at least one seed hit. */ size_t nonzeroOffsets() const { assert(!sorted_ || nonzTot_ == rankFws_.size()); assert(!sorted_ || nonzTot_ == rankOffs_.size()); return nonzTot_; } /** * Return true iff all seeds hit for forward read. */ bool allFwSeedsHit() const { return nonzFw_ == numOffs(); } /** * Return true iff all seeds hit for revcomp read. */ bool allRcSeedsHit() const { return nonzRc_ == numOffs(); } /** * Return the minimum number of edits that an end-to-end alignment of the * fw read could have. Uses knowledge of how many seeds have exact hits * and how the seeds overlap. */ index_t fewestEditsEE(bool fw, int seedlen, int per) const { assert_gt(seedlen, 0); assert_gt(per, 0); index_t nonz = fw ? nonzFw_ : nonzRc_; if(nonz < numOffs()) { int maxdepth = (seedlen + per - 1) / per; int missing = (int)(numOffs() - nonz); return (missing + maxdepth - 1) / maxdepth; } else { // Exact hit is possible (not guaranteed) return 0; } } /** * Return the number of offsets into the forward read that have at * least one seed hit. */ index_t nonzeroOffsetsFw() const { return nonzFw_; } /** * Return the number of offsets into the reverse-complement read * that have at least one seed hit. */ index_t nonzeroOffsetsRc() const { return nonzRc_; } /** * Return a QVal of seed hits of the given rank 'r'. 'offidx' gets the id * of the offset from 5' from which it was extracted (0 for the 5-most * offset, 1 for the next closes to 5', etc). 'off' gets the offset from * the 5' end. 'fw' gets true iff the seed was extracted from the forward * read. */ const QVal& hitsByRank( index_t r, // in index_t& offidx, // out index_t& off, // out bool& fw, // out index_t& seedlen) // out { assert(sorted_); assert_lt(r, nonzTot_); if(rankFws_[r]) { fw = true; offidx = rankOffs_[r]; assert_lt(offidx, offIdx2off_.size()); off = offIdx2off_[offidx]; seedlen = (index_t)seqFw_[rankOffs_[r]].length(); return hitsFw_[rankOffs_[r]]; } else { fw = false; offidx = rankOffs_[r]; assert_lt(offidx, offIdx2off_.size()); off = offIdx2off_[offidx]; seedlen = (index_t)seqRc_[rankOffs_[r]].length(); return hitsRc_[rankOffs_[r]]; } } /** * Return an EList of seed hits of the given rank. */ const BTDnaString& seqByRank(index_t r) { assert(sorted_); assert_lt(r, nonzTot_); return rankFws_[r] ? seqFw_[rankOffs_[r]] : seqRc_[rankOffs_[r]]; } /** * Return an EList of seed hits of the given rank. */ const BTString& qualByRank(index_t r) { assert(sorted_); assert_lt(r, nonzTot_); return rankFws_[r] ? qualFw_[rankOffs_[r]] : qualRc_[rankOffs_[r]]; } /** * Return the list of extracted seed sequences for seeds on either * the forward or reverse strand. */ EList& seqs(bool fw) { return fw ? seqFw_ : seqRc_; } /** * Return the list of extracted quality sequences for seeds on * either the forward or reverse strand. */ EList& quals(bool fw) { return fw ? qualFw_ : qualRc_; } /** * Return exact end-to-end alignment of fw read. */ EEHit exactFwEEHit() const { return exactFwHit_; } /** * Return exact end-to-end alignment of rc read. */ EEHit exactRcEEHit() const { return exactRcHit_; } /** * Return const ref to list of 1-mismatch end-to-end alignments. */ const EList >& mm1EEHits() const { return mm1Hit_; } /** * Sort the end-to-end 1-mismatch alignments, prioritizing by score (higher * score = higher priority). */ void sort1mmEe(RandomSource& rnd) { assert(!mm1Sorted_); mm1Hit_.sort(); size_t streak = 0; for(size_t i = 1; i < mm1Hit_.size(); i++) { if(mm1Hit_[i].score == mm1Hit_[i-1].score) { if(streak == 0) { streak = 1; } streak++; } else { if(streak > 1) { assert_geq(i, streak); mm1Hit_.shufflePortion(i-streak, streak, rnd); } streak = 0; } } if(streak > 1) { mm1Hit_.shufflePortion(mm1Hit_.size() - streak, streak, rnd); } mm1Sorted_ = true; } /** * Add an end-to-end 1-mismatch alignment. */ void add1mmEe( index_t top, index_t bot, const Edit* e1, const Edit* e2, bool fw, int64_t score) { mm1Hit_.expand(); mm1Hit_.back().init(top, bot, e1, e2, fw, score); mm1Elt_ += (bot - top); } /** * Add an end-to-end exact alignment. */ void addExactEeFw( index_t top, index_t bot, const Edit* e1, const Edit* e2, bool fw, int64_t score) { exactFwHit_.init(top, bot, e1, e2, fw, score); } /** * Add an end-to-end exact alignment. */ void addExactEeRc( index_t top, index_t bot, const Edit* e1, const Edit* e2, bool fw, int64_t score) { exactRcHit_.init(top, bot, e1, e2, fw, score); } /** * Clear out the end-to-end exact alignments. */ void clearExactE2eHits() { exactFwHit_.reset(); exactRcHit_.reset(); } /** * Clear out the end-to-end 1-mismatch alignments. */ void clear1mmE2eHits() { mm1Hit_.clear(); // 1-mismatch end-to-end hits mm1Elt_ = 0; // number of 1-mismatch hit rows mm1Sorted_ = false; // true iff we've sorted the mm1Hit_ list } /** * Return the number of distinct exact and 1-mismatch end-to-end hits * found. */ index_t numE2eHits() const { return (index_t)(exactFwHit_.size() + exactRcHit_.size() + mm1Elt_); } /** * Return the number of distinct exact end-to-end hits found. */ index_t numExactE2eHits() const { return (index_t)(exactFwHit_.size() + exactRcHit_.size()); } /** * Return the number of distinct 1-mismatch end-to-end hits found. */ index_t num1mmE2eHits() const { return mm1Elt_; } /** * Return the length of the read that yielded the seed hits. */ index_t readLength() const { assert(read_ != NULL); return read_->length(); } protected: // As seed hits and edits are added they're sorted into these // containers EList seqFw_; // seqs for seeds from forward read EList seqRc_; // seqs for seeds from revcomp read EList qualFw_; // quals for seeds from forward read EList qualRc_; // quals for seeds from revcomp read EList > hitsFw_; // hits for forward read EList > hitsRc_; // hits for revcomp read EList > isFw_; // hits for forward read EList > isRc_; // hits for revcomp read EList sortedFw_; // true iff fw QVal was sorted/ranked EList sortedRc_; // true iff rc QVal was sorted/ranked index_t nonzTot_; // # offsets with non-zero size index_t nonzFw_; // # offsets into fw read with non-0 size index_t nonzRc_; // # offsets into rc read with non-0 size index_t numRanges_; // # ranges added index_t numElts_; // # elements added index_t numRangesFw_; // # ranges added for fw seeds index_t numEltsFw_; // # elements added for fw seeds index_t numRangesRc_; // # ranges added for rc seeds index_t numEltsRc_; // # elements added for rc seeds EList offIdx2off_;// map from offset indexes to offsets from 5' end // When the sort routine is called, the seed hits collected so far // are sorted into another set of containers that allow easy access // to hits from the lowest-ranked offset (the one with the fewest // BW elements) to the greatest-ranked offset. Offsets with 0 hits // are ignored. EList rankOffs_; // sorted offests of seeds to try EList rankFws_; // sorted orientations assoc. with rankOffs_ bool sorted_; // true if sort() called since last reset // These fields set once per read index_t numOffs_; // # different seed offsets possible const Read* read_; // read from which seeds were extracted EEHit exactFwHit_; // end-to-end exact hit for fw read EEHit exactRcHit_; // end-to-end exact hit for rc read EList > mm1Hit_; // 1-mismatch end-to-end hits index_t mm1Elt_; // number of 1-mismatch hit rows bool mm1Sorted_; // true iff we've sorted the mm1Hit_ list EList tmpMedian_; // temporary storage for calculating median }; // Forward decl template class Ebwt; template struct SideLocus; /** * Encapsulates a sumamry of what the searchAllSeeds aligner did. */ struct SeedSearchMetrics { SeedSearchMetrics() : mutex_m() { reset(); } /** * Merge this metrics object with the given object, i.e., sum each * category. This is the only safe way to update a * SeedSearchMetrics object shread by multiple threads. */ void merge(const SeedSearchMetrics& m, bool getLock = false) { ThreadSafe ts(&mutex_m, getLock); seedsearch += m.seedsearch; possearch += m.possearch; intrahit += m.intrahit; interhit += m.interhit; filteredseed += m.filteredseed; ooms += m.ooms; bwops += m.bwops; bweds += m.bweds; bestmin0 += m.bestmin0; bestmin1 += m.bestmin1; bestmin2 += m.bestmin2; } /** * Set all counters to 0. */ void reset() { seedsearch = possearch = intrahit = interhit = filteredseed = ooms = bwops = bweds = bestmin0 = bestmin1 = bestmin2 = 0; } uint64_t seedsearch; // # times we executed strategy in InstantiatedSeed uint64_t possearch; // # offsets where aligner executed >= 1 strategy uint64_t intrahit; // # offsets where current-read cache gave answer uint64_t interhit; // # offsets where across-read cache gave answer uint64_t filteredseed; // # seed instantiations skipped due to Ns uint64_t ooms; // out-of-memory errors uint64_t bwops; // Burrows-Wheeler operations uint64_t bweds; // Burrows-Wheeler edits uint64_t bestmin0; // # times the best min # edits was 0 uint64_t bestmin1; // # times the best min # edits was 1 uint64_t bestmin2; // # times the best min # edits was 2 MUTEX_T mutex_m; }; /** * Given an index and a seeding scheme, searches for seed hits. */ template class SeedAligner { public: /** * Initialize with index. */ SeedAligner() : edits_(AL_CAT), offIdx2off_(AL_CAT) { } /** * Given a read and a few coordinates that describe a substring of the * read (or its reverse complement), fill in 'seq' and 'qual' objects * with the seed sequence and qualities. */ void instantiateSeq( const Read& read, // input read BTDnaString& seq, // output sequence BTString& qual, // output qualities int len, // seed length int depth, // seed's 0-based offset from 5' end bool fw) const; // seed's orientation /** * Iterate through the seeds that cover the read and initiate a * search for each seed. */ std::pair instantiateSeeds( const EList& seeds, // search seeds index_t off, // offset into read to start extracting int per, // interval between seeds const Read& read, // read to align const Scoring& pens, // scoring scheme bool nofw, // don't align forward read bool norc, // don't align revcomp read AlignmentCacheIface& cache, // holds some seed hits from previous reads SeedResults& sr, // holds all the seed hits SeedSearchMetrics& met); // metrics /** * Iterate through the seeds that cover the read and initiate a * search for each seed. */ void searchAllSeeds( const EList& seeds, // search seeds const Ebwt* ebwtFw, // BWT index const Ebwt* ebwtBw, // BWT' index const Read& read, // read to align const Scoring& pens, // scoring scheme AlignmentCacheIface& cache, // local seed alignment cache SeedResults& hits, // holds all the seed hits SeedSearchMetrics& met, // metrics PerReadMetrics& prm); // per-read metrics /** * Sanity-check a partial alignment produced during oneMmSearch. */ bool sanityPartial( const Ebwt* ebwtFw, // BWT index const Ebwt* ebwtBw, // BWT' index const BTDnaString& seq, index_t dep, index_t len, bool do1mm, index_t topfw, index_t botfw, index_t topbw, index_t botbw); /** * Do an exact-matching sweet to establish a lower bound on number of edits * and to find exact alignments. */ size_t exactSweep( const Ebwt& ebwt, // BWT index const Read& read, // read to align const Scoring& sc, // scoring scheme bool nofw, // don't align forward read bool norc, // don't align revcomp read size_t mineMax, // don't care about edit bounds > this size_t& mineFw, // minimum # edits for forward read size_t& mineRc, // minimum # edits for revcomp read bool repex, // report 0mm hits? SeedResults& hits, // holds all the seed hits (and exact hit) SeedSearchMetrics& met); // metrics /** * Search for end-to-end alignments with up to 1 mismatch. */ bool oneMmSearch( const Ebwt* ebwtFw, // BWT index const Ebwt* ebwtBw, // BWT' index const Read& read, // read to align const Scoring& sc, // scoring int64_t minsc, // minimum score bool nofw, // don't align forward read bool norc, // don't align revcomp read bool local, // 1mm hits must be legal local alignments bool repex, // report 0mm hits? bool rep1mm, // report 1mm hits? SeedResults& hits, // holds all the seed hits (and exact hit) SeedSearchMetrics& met); // metrics protected: /** * Report a seed hit found by searchSeedBi(), but first try to extend it out in * either direction as far as possible without hitting any edits. This will * allow us to prioritize the seed hits better later on. Call reportHit() when * we're done, which actually adds the hit to the cache. Returns result from * calling reportHit(). */ bool extendAndReportHit( index_t topf, // top in BWT index_t botf, // bot in BWT index_t topb, // top in BWT' index_t botb, // bot in BWT' index_t len, // length of hit DoublyLinkedList *prevEdit); // previous edit /** * Report a seed hit found by searchSeedBi() by adding it to the cache. Return * false if the hit could not be reported because of, e.g., cache exhaustion. */ bool reportHit( index_t topf, // top in BWT index_t botf, // bot in BWT index_t topb, // top in BWT' index_t botb, // bot in BWT' index_t len, // length of hit DoublyLinkedList *prevEdit); // previous edit /** * Given an instantiated seed (in s_ and other fields), search */ bool searchSeedBi(); /** * Main, recursive implementation of the seed search. */ bool searchSeedBi( int step, // depth into steps_[] array int depth, // recursion depth index_t topf, // top in BWT index_t botf, // bot in BWT index_t topb, // top in BWT' index_t botb, // bot in BWT' SideLocus tloc, // locus for top (perhaps unititialized) SideLocus bloc, // locus for bot (perhaps unititialized) Constraint c0, // constraints to enforce in seed zone 0 Constraint c1, // constraints to enforce in seed zone 1 Constraint c2, // constraints to enforce in seed zone 2 Constraint overall, // overall constraints DoublyLinkedList *prevEdit); // previous edit /** * Get tloc and bloc ready for the next step. */ inline void nextLocsBi( SideLocus& tloc, // top locus SideLocus& bloc, // bot locus index_t topf, // top in BWT index_t botf, // bot in BWT index_t topb, // top in BWT' index_t botb, // bot in BWT' int step); // step to get ready for // Following are set in searchAllSeeds then used by searchSeed() // and other protected members. const Ebwt* ebwtFw_; // forward index (BWT) const Ebwt* ebwtBw_; // backward/mirror index (BWT') const Scoring* sc_; // scoring scheme const InstantiatedSeed* s_; // current instantiated seed const Read* read_; // read whose seeds are currently being aligned // The following are set just before a call to searchSeedBi() const BTDnaString* seq_; // sequence of current seed const BTString* qual_; // quality string for current seed index_t off_; // offset of seed currently being searched bool fw_; // orientation of seed currently being searched EList edits_; // temporary place to sort edits AlignmentCacheIface *ca_; // local alignment cache for seed alignments EList offIdx2off_; // offset idx to read offset map, set up instantiateSeeds() uint64_t bwops_; // Burrows-Wheeler operations uint64_t bwedits_; // Burrows-Wheeler edits BTDnaString tmprfdnastr_; // used in reportHit ASSERT_ONLY(ESet hits_); // Ref hits so far for seed being aligned BTDnaString tmpdnastr_; }; #define INIT_LOCS(top, bot, tloc, bloc, e) { \ if(bot - top == 1) { \ tloc.initFromRow(top, (e).eh(), (e).ebwt()); \ bloc.invalidate(); \ } else { \ SideLocus::initFromTopBot(top, bot, (e).eh(), (e).ebwt(), tloc, bloc); \ assert(bloc.valid()); \ } \ } #define SANITY_CHECK_4TUP(t, b, tp, bp) { \ ASSERT_ONLY(index_t tot = (b[0]-t[0])+(b[1]-t[1])+(b[2]-t[2])+(b[3]-t[3])); \ ASSERT_ONLY(index_t totp = (bp[0]-tp[0])+(bp[1]-tp[1])+(bp[2]-tp[2])+(bp[3]-tp[3])); \ assert_eq(tot, totp); \ } /** * Given a read and a few coordinates that describe a substring of the read (or * its reverse complement), fill in 'seq' and 'qual' objects with the seed * sequence and qualities. * * The seq field is filled with the sequence as it would align to the Watson * reference strand. I.e. if fw is false, then the sequence that appears in * 'seq' is the reverse complement of the raw read substring. */ template void SeedAligner::instantiateSeq( const Read& read, // input read BTDnaString& seq, // output sequence BTString& qual, // output qualities int len, // seed length int depth, // seed's 0-based offset from 5' end bool fw) const // seed's orientation { // Fill in 'seq' and 'qual' int seedlen = len; if((int)read.length() < seedlen) seedlen = (int)read.length(); seq.resize(len); qual.resize(len); // If fw is false, we take characters starting at the 3' end of the // reverse complement of the read. for(int i = 0; i < len; i++) { seq.set(read.patFw.windowGetDna(i, fw, read.color, depth, len), i); qual.set(read.qual.windowGet(i, fw, depth, len), i); } } /** * We assume that all seeds are the same length. * * For each seed, instantiate the seed, retracting if necessary. */ template pair SeedAligner::instantiateSeeds( const EList& seeds, // search seeds index_t off, // offset into read to start extracting int per, // interval between seeds const Read& read, // read to align const Scoring& pens, // scoring scheme bool nofw, // don't align forward read bool norc, // don't align revcomp read AlignmentCacheIface& cache,// holds some seed hits from previous reads SeedResults& sr, // holds all the seed hits SeedSearchMetrics& met) // metrics { assert(!seeds.empty()); assert_gt(read.length(), 0); // Check whether read has too many Ns offIdx2off_.clear(); int len = seeds[0].len; // assume they're all the same length #ifndef NDEBUG for(size_t i = 1; i < seeds.size(); i++) { assert_eq(len, seeds[i].len); } #endif // Calc # seeds within read interval int nseeds = 1; if((int)read.length() - (int)off > len) { nseeds += ((int)read.length() - (int)off - len) / per; } for(int i = 0; i < nseeds; i++) { offIdx2off_.push_back(per * i + (int)off); } pair ret; ret.first = 0; // # seeds that require alignment ret.second = 0; // # seeds that hit in cache with non-empty results sr.reset(read, offIdx2off_, nseeds); assert(sr.repOk(&cache.current(), true)); // require that SeedResult be initialized // For each seed position for(int fwi = 0; fwi < 2; fwi++) { bool fw = (fwi == 0); if((fw && nofw) || (!fw && norc)) { // Skip this orientation b/c user specified --nofw or --norc continue; } // For each seed position for(int i = 0; i < nseeds; i++) { int depth = i * per + (int)off; int seedlen = seeds[0].len; // Extract the seed sequence at this offset // If fw == true, we extract the characters from i*per to // i*(per-1) (exclusive). If fw == false, instantiateSeq( read, sr.seqs(fw)[i], sr.quals(fw)[i], std::min((int)seedlen, (int)read.length()), depth, fw); //QKey qk(sr.seqs(fw)[i] ASSERT_ONLY(, tmpdnastr_)); // For each search strategy EList& iss = sr.instantiatedSeeds(fw, i); for(int j = 0; j < (int)seeds.size(); j++) { iss.expand(); assert_eq(seedlen, seeds[j].len); InstantiatedSeed* is = &iss.back(); if(seeds[j].instantiate( read, sr.seqs(fw)[i], sr.quals(fw)[i], pens, depth, i, j, fw, *is)) { // Can we fill this seed hit in from the cache? ret.first++; } else { // Seed may fail to instantiate if there are Ns // that prevent it from matching met.filteredseed++; iss.pop_back(); } } } } return ret; } /** * We assume that all seeds are the same length. * * For each seed: * * 1. Instantiate all seeds, retracting them if necessary. * 2. Calculate zone boundaries for each seed */ template void SeedAligner::searchAllSeeds( const EList& seeds, // search seeds const Ebwt* ebwtFw, // BWT index const Ebwt* ebwtBw, // BWT' index const Read& read, // read to align const Scoring& pens, // scoring scheme AlignmentCacheIface& cache, // local cache for seed alignments SeedResults& sr, // holds all the seed hits SeedSearchMetrics& met, // metrics PerReadMetrics& prm) // per-read metrics { assert(!seeds.empty()); assert(ebwtFw != NULL); assert(ebwtFw->isInMemory()); assert(sr.repOk(&cache.current())); ebwtFw_ = ebwtFw; ebwtBw_ = ebwtBw; sc_ = &pens; read_ = &read; ca_ = &cache; bwops_ = bwedits_ = 0; uint64_t possearches = 0, seedsearches = 0, intrahits = 0, interhits = 0, ooms = 0; // For each instantiated seed for(int i = 0; i < (int)sr.numOffs(); i++) { size_t off = sr.idx2off(i); for(int fwi = 0; fwi < 2; fwi++) { bool fw = (fwi == 0); assert(sr.repOk(&cache.current())); EList& iss = sr.instantiatedSeeds(fw, i); if(iss.empty()) { // Cache hit in an across-read cache continue; } QVal qv; seq_ = &sr.seqs(fw)[i]; // seed sequence qual_ = &sr.quals(fw)[i]; // seed qualities off_ = off; // seed offset (from 5') fw_ = fw; // seed orientation // Tell the cache that we've started aligning, so the cache can // expect a series of on-the-fly updates int ret = cache.beginAlign(*seq_, *qual_, qv); ASSERT_ONLY(hits_.clear()); if(ret == -1) { // Out of memory when we tried to add key to map ooms++; continue; } bool abort = false; if(ret == 0) { // Not already in cache assert(cache.aligning()); possearches++; for(size_t j = 0; j < iss.size(); j++) { // Set seq_ and qual_ appropriately, using the seed sequences // and qualities already installed in SeedResults assert_eq(fw, iss[j].fw); assert_eq(i, (int)iss[j].seedoffidx); s_ = &iss[j]; // Do the search with respect to seq_, qual_ and s_. if(!searchSeedBi()) { // Memory exhausted during search ooms++; abort = true; break; } seedsearches++; assert(cache.aligning()); } if(!abort) { qv = cache.finishAlign(); } } else { // Already in cache assert_eq(1, ret); assert(qv.valid()); intrahits++; } assert(abort || !cache.aligning()); if(qv.valid()) { sr.add( qv, // range of ranges in cache cache.current(), // cache i, // seed index (from 5' end) fw); // whether seed is from forward read } } } prm.nSeedRanges = sr.numRanges(); prm.nSeedElts = sr.numElts(); prm.nSeedRangesFw = sr.numRangesFw(); prm.nSeedRangesRc = sr.numRangesRc(); prm.nSeedEltsFw = sr.numEltsFw(); prm.nSeedEltsRc = sr.numEltsRc(); prm.seedMedian = (uint64_t)(sr.medianHitsPerSeed() + 0.5); prm.seedMean = (uint64_t)sr.averageHitsPerSeed(); prm.nSdFmops += bwops_; met.seedsearch += seedsearches; met.possearch += possearches; met.intrahit += intrahits; met.interhit += interhits; met.ooms += ooms; met.bwops += bwops_; met.bweds += bwedits_; } template bool SeedAligner::sanityPartial( const Ebwt* ebwtFw, // BWT index const Ebwt* ebwtBw, // BWT' index const BTDnaString& seq, index_t dep, index_t len, bool do1mm, index_t topfw, index_t botfw, index_t topbw, index_t botbw) { tmpdnastr_.clear(); for(size_t i = dep; i < len; i++) { tmpdnastr_.append(seq[i]); } index_t top_fw = 0, bot_fw = 0; ebwtFw->contains(tmpdnastr_, &top_fw, &bot_fw); assert_eq(top_fw, topfw); assert_eq(bot_fw, botfw); if(do1mm && ebwtBw != NULL) { tmpdnastr_.reverse(); index_t top_bw = 0, bot_bw = 0; ebwtBw->contains(tmpdnastr_, &top_bw, &bot_bw); assert_eq(top_bw, topbw); assert_eq(bot_bw, botbw); } return true; } /** * Sweep right-to-left and left-to-right using exact matching. Remember all * the SA ranges encountered along the way. Report exact matches if there are * any. Calculate a lower bound on the number of edits in an end-to-end * alignment. */ template size_t SeedAligner::exactSweep( const Ebwt& ebwt, // BWT index const Read& read, // read to align const Scoring& sc, // scoring scheme bool nofw, // don't align forward read bool norc, // don't align revcomp read size_t mineMax, // don't care about edit bounds > this size_t& mineFw, // minimum # edits for forward read size_t& mineRc, // minimum # edits for revcomp read bool repex, // report 0mm hits? SeedResults& hits, // holds all the seed hits (and exact hit) SeedSearchMetrics& met) // metrics { assert_gt(mineMax, 0); index_t top = 0, bot = 0; SideLocus tloc, bloc; const size_t len = read.length(); size_t nelt = 0; for(int fwi = 0; fwi < 2; fwi++) { bool fw = (fwi == 0); if( fw && nofw) continue; if(!fw && norc) continue; const BTDnaString& seq = fw ? read.patFw : read.patRc; assert(!seq.empty()); int ftabLen = ebwt.eh().ftabChars(); size_t dep = 0; size_t nedit = 0; bool done = false; while(dep < len && !done) { top = bot = 0; size_t left = len - dep; assert_gt(left, 0); bool doFtab = ftabLen > 1 && left >= (size_t)ftabLen; if(doFtab) { // Does N interfere with use of Ftab? for(size_t i = 0; i < (size_t)ftabLen; i++) { int c = seq[len-dep-1-i]; if(c > 3) { doFtab = false; break; } } } if(doFtab) { // Use ftab ebwt.ftabLoHi(seq, len - dep - ftabLen, false, top, bot); dep += (size_t)ftabLen; } else { // Use fchr int c = seq[len-dep-1]; if(c < 4) { top = ebwt.fchr()[c]; bot = ebwt.fchr()[c+1]; } dep++; } if(bot <= top) { nedit++; if(nedit >= mineMax) { if(fw) { mineFw = nedit; } else { mineRc = nedit; } break; } continue; } INIT_LOCS(top, bot, tloc, bloc, ebwt); // Keep going while(dep < len) { int c = seq[len-dep-1]; if(c > 3) { top = bot = 0; } else { if(bloc.valid()) { bwops_ += 2; top = ebwt.mapLF(tloc, c); bot = ebwt.mapLF(bloc, c); } else { bwops_++; top = ebwt.mapLF1(top, tloc, c); if(top == (index_t)OFF_MASK) { top = bot = 0; } else { bot = top+1; } } } if(bot <= top) { nedit++; if(nedit >= mineMax) { if(fw) { mineFw = nedit; } else { mineRc = nedit; } done = true; } break; } INIT_LOCS(top, bot, tloc, bloc, ebwt); dep++; } if(done) { break; } if(dep == len) { // Set the minimum # edits if(fw) { mineFw = nedit; } else { mineRc = nedit; } // Done if(nedit == 0 && bot > top) { if(repex) { // This is an exact hit int64_t score = len * sc.match(); if(fw) { hits.addExactEeFw(top, bot, NULL, NULL, fw, score); assert(ebwt.contains(seq, NULL, NULL)); } else { hits.addExactEeRc(top, bot, NULL, NULL, fw, score); assert(ebwt.contains(seq, NULL, NULL)); } } nelt += (bot - top); } break; } dep++; } } return nelt; } /** * Search for end-to-end exact hit for read. Return true iff one is found. */ template bool SeedAligner::oneMmSearch( const Ebwt* ebwtFw, // BWT index const Ebwt* ebwtBw, // BWT' index const Read& read, // read to align const Scoring& sc, // scoring int64_t minsc, // minimum score bool nofw, // don't align forward read bool norc, // don't align revcomp read bool local, // 1mm hits must be legal local alignments bool repex, // report 0mm hits? bool rep1mm, // report 1mm hits? SeedResults& hits, // holds all the seed hits (and exact hit) SeedSearchMetrics& met) // metrics { assert(!rep1mm || ebwtBw != NULL); const size_t len = read.length(); int nceil = sc.nCeil.f((double)len); size_t ns = read.ns(); if(ns > 1) { // Can't align this with <= 1 mismatches return false; } else if(ns == 1 && !rep1mm) { // Can't align this with 0 mismatches return false; } assert_geq(len, 2); assert(!rep1mm || ebwtBw->eh().ftabChars() == ebwtFw->eh().ftabChars()); #ifndef NDEBUG if(ebwtBw != NULL) { for(int i = 0; i < 4; i++) { assert_eq(ebwtBw->fchr()[i], ebwtFw->fchr()[i]); } } #endif size_t halfFw = len >> 1; size_t halfBw = len >> 1; if((len & 1) != 0) { halfBw++; } assert_geq(halfFw, 1); assert_geq(halfBw, 1); SideLocus tloc, bloc; index_t t[4], b[4]; // dest BW ranges for BWT t[0] = t[1] = t[2] = t[3] = 0; b[0] = b[1] = b[2] = b[3] = 0; index_t tp[4], bp[4]; // dest BW ranges for BWT' tp[0] = tp[1] = tp[2] = tp[3] = 0; bp[0] = bp[1] = bp[2] = bp[3] = 0; index_t top = 0, bot = 0, topp = 0, botp = 0; // Align fw read / rc read bool results = false; for(int fwi = 0; fwi < 2; fwi++) { bool fw = (fwi == 0); if( fw && nofw) continue; if(!fw && norc) continue; // Align going right-to-left, left-to-right int lim = rep1mm ? 2 : 1; for(int ebwtfwi = 0; ebwtfwi < lim; ebwtfwi++) { bool ebwtfw = (ebwtfwi == 0); const Ebwt* ebwt = (ebwtfw ? ebwtFw : ebwtBw); const Ebwt* ebwtp = (ebwtfw ? ebwtBw : ebwtFw); assert(rep1mm || ebwt->fw()); const BTDnaString& seq = (fw ? (ebwtfw ? read.patFw : read.patFwRev) : (ebwtfw ? read.patRc : read.patRcRev)); assert(!seq.empty()); const BTString& qual = (fw ? (ebwtfw ? read.qual : read.qualRev) : (ebwtfw ? read.qualRev : read.qual)); int ftabLen = ebwt->eh().ftabChars(); size_t nea = ebwtfw ? halfFw : halfBw; // Check if there's an N in the near portion bool skip = false; for(size_t dep = 0; dep < nea; dep++) { if(seq[len-dep-1] > 3) { skip = true; break; } } if(skip) { continue; } size_t dep = 0; // Align near half if(ftabLen > 1 && (size_t)ftabLen <= nea) { // Use ftab to jump partway into near half bool rev = !ebwtfw; ebwt->ftabLoHi(seq, len - ftabLen, rev, top, bot); if(rep1mm) { ebwtp->ftabLoHi(seq, len - ftabLen, rev, topp, botp); assert_eq(bot - top, botp - topp); } if(bot - top == 0) { continue; } int c = seq[len - ftabLen]; t[c] = top; b[c] = bot; tp[c] = topp; bp[c] = botp; dep = ftabLen; // initialize tloc, bloc?? } else { // Use fchr to jump in by 1 pos int c = seq[len-1]; assert_range(0, 3, c); top = topp = tp[c] = ebwt->fchr()[c]; bot = botp = bp[c] = ebwt->fchr()[c+1]; if(bot - top == 0) { continue; } dep = 1; // initialize tloc, bloc?? } INIT_LOCS(top, bot, tloc, bloc, *ebwt); assert(sanityPartial(ebwt, ebwtp, seq, len-dep, len, rep1mm, top, bot, topp, botp)); bool do_continue = false; for(; dep < nea; dep++) { assert_lt(dep, len); int rdc = seq[len - dep - 1]; tp[0] = tp[1] = tp[2] = tp[3] = topp; bp[0] = bp[1] = bp[2] = bp[3] = botp; if(bloc.valid()) { bwops_++; t[0] = t[1] = t[2] = t[3] = b[0] = b[1] = b[2] = b[3] = 0; ebwt->mapBiLFEx(tloc, bloc, t, b, tp, bp); SANITY_CHECK_4TUP(t, b, tp, bp); top = t[rdc]; bot = b[rdc]; if(bot <= top) { do_continue = true; break; } topp = tp[rdc]; botp = bp[rdc]; assert(!rep1mm || bot - top == botp - topp); } else { assert_eq(bot, top+1); assert(!rep1mm || botp == topp+1); bwops_++; top = ebwt->mapLF1(top, tloc, rdc); if(top == (index_t)OFF_MASK) { do_continue = true; break; } bot = top + 1; t[rdc] = top; b[rdc] = bot; tp[rdc] = topp; bp[rdc] = botp; assert(!rep1mm || b[rdc] - t[rdc] == bp[rdc] - tp[rdc]); // topp/botp stay the same } INIT_LOCS(top, bot, tloc, bloc, *ebwt); assert(sanityPartial(ebwt, ebwtp, seq, len - dep - 1, len, rep1mm, top, bot, topp, botp)); } if(do_continue) { continue; } // Align far half for(; dep < len; dep++) { int rdc = seq[len-dep-1]; int quc = qual[len-dep-1]; if(rdc > 3 && nceil == 0) { break; } tp[0] = tp[1] = tp[2] = tp[3] = topp; bp[0] = bp[1] = bp[2] = bp[3] = botp; int clo = 0, chi = 3; bool match = true; if(bloc.valid()) { bwops_++; t[0] = t[1] = t[2] = t[3] = b[0] = b[1] = b[2] = b[3] = 0; ebwt->mapBiLFEx(tloc, bloc, t, b, tp, bp); SANITY_CHECK_4TUP(t, b, tp, bp); match = rdc < 4; top = t[rdc]; bot = b[rdc]; topp = tp[rdc]; botp = bp[rdc]; } else { assert_eq(bot, top+1); assert(!rep1mm || botp == topp+1); bwops_++; clo = ebwt->mapLF1(top, tloc); match = (clo == rdc); assert_range(-1, 3, clo); if(clo < 0) { break; // Hit the $ } else { t[clo] = top; b[clo] = bot = top + 1; } bp[clo] = botp; tp[clo] = topp; assert(!rep1mm || bot - top == botp - topp); assert(!rep1mm || b[clo] - t[clo] == bp[clo] - tp[clo]); chi = clo; } //assert(sanityPartial(ebwt, ebwtp, seq, len - dep - 1, len, rep1mm, top, bot, topp, botp)); if(rep1mm && (ns == 0 || rdc > 3)) { for(int j = clo; j <= chi; j++) { if(j == rdc || b[j] == t[j]) { // Either matches read or isn't a possibility continue; } // Potential mismatch - next, try size_t depm = dep + 1; index_t topm = t[j], botm = b[j]; index_t topmp = tp[j], botmp = bp[j]; assert_eq(botm - topm, botmp - topmp); index_t tm[4], bm[4]; // dest BW ranges for BWT tm[0] = t[0]; tm[1] = t[1]; tm[2] = t[2]; tm[3] = t[3]; bm[0] = b[0]; bm[1] = t[1]; bm[2] = b[2]; bm[3] = t[3]; index_t tmp[4], bmp[4]; // dest BW ranges for BWT' tmp[0] = tp[0]; tmp[1] = tp[1]; tmp[2] = tp[2]; tmp[3] = tp[3]; bmp[0] = bp[0]; bmp[1] = tp[1]; bmp[2] = bp[2]; bmp[3] = tp[3]; SideLocus tlocm, blocm; INIT_LOCS(topm, botm, tlocm, blocm, *ebwt); for(; depm < len; depm++) { int rdcm = seq[len - depm - 1]; tmp[0] = tmp[1] = tmp[2] = tmp[3] = topmp; bmp[0] = bmp[1] = bmp[2] = bmp[3] = botmp; if(blocm.valid()) { bwops_++; tm[0] = tm[1] = tm[2] = tm[3] = bm[0] = bm[1] = bm[2] = bm[3] = 0; ebwt->mapBiLFEx(tlocm, blocm, tm, bm, tmp, bmp); SANITY_CHECK_4TUP(tm, bm, tmp, bmp); topm = tm[rdcm]; botm = bm[rdcm]; topmp = tmp[rdcm]; botmp = bmp[rdcm]; if(botm <= topm) { break; } } else { assert_eq(botm, topm+1); assert_eq(botmp, topmp+1); bwops_++; topm = ebwt->mapLF1(topm, tlocm, rdcm); if(topm == (index_t)0xffffffff) { break; } botm = topm + 1; // topp/botp stay the same } INIT_LOCS(topm, botm, tlocm, blocm, *ebwt); } if(depm == len) { // Success; this is a 1MM hit size_t off5p = dep; // offset from 5' end of read size_t offstr = dep; // offset into patFw/patRc if(fw == ebwtfw) { off5p = len - off5p - 1; } if(!ebwtfw) { offstr = len - offstr - 1; } Edit e((uint32_t)off5p, j, rdc, EDIT_TYPE_MM, false); results = true; int64_t score = (len - 1) * sc.match(); // In --local mode, need to double-check that // end-to-end alignment doesn't violate local // alignment principles. Specifically, it // shouldn't to or below 0 anywhere in the middle. int pen = sc.score(rdc, (int)(1 << j), quc - 33); score += pen; bool valid = true; if(local) { int64_t locscore_fw = 0, locscore_bw = 0; for(size_t i = 0; i < len; i++) { if(i == dep) { if(locscore_fw + pen <= 0) { valid = false; break; } locscore_fw += pen; } else { locscore_fw += sc.match(); } if(len-i-1 == dep) { if(locscore_bw + pen <= 0) { valid = false; break; } locscore_bw += pen; } else { locscore_bw += sc.match(); } } } if(valid) { valid = score >= minsc; } if(valid) { #ifndef NDEBUG BTDnaString& rf = tmprfdnastr_; rf.clear(); edits_.clear(); edits_.push_back(e); if(!fw) Edit::invertPoss(edits_, len, false); Edit::toRef(fw ? read.patFw : read.patRc, edits_, rf); if(!fw) Edit::invertPoss(edits_, len, false); assert_eq(len, rf.length()); for(size_t i = 0; i < len; i++) { assert_lt((int)rf[i], 4); } ASSERT_ONLY(index_t toptmp = 0); ASSERT_ONLY(index_t bottmp = 0); assert(ebwtFw->contains(rf, &toptmp, &bottmp)); #endif index_t toprep = ebwtfw ? topm : topmp; index_t botrep = ebwtfw ? botm : botmp; assert_eq(toprep, toptmp); assert_eq(botrep, bottmp); hits.add1mmEe(toprep, botrep, &e, NULL, fw, score); } } } } if(bot > top && match) { assert_lt(rdc, 4); if(dep == len-1) { // Success; this is an exact hit if(ebwtfw && repex) { if(fw) { results = true; int64_t score = len * sc.match(); hits.addExactEeFw( ebwtfw ? top : topp, ebwtfw ? bot : botp, NULL, NULL, fw, score); assert(ebwtFw->contains(seq, NULL, NULL)); } else { results = true; int64_t score = len * sc.match(); hits.addExactEeRc( ebwtfw ? top : topp, ebwtfw ? bot : botp, NULL, NULL, fw, score); assert(ebwtFw->contains(seq, NULL, NULL)); } } break; // End of far loop } else { INIT_LOCS(top, bot, tloc, bloc, *ebwt); assert(sanityPartial(ebwt, ebwtp, seq, len - dep - 1, len, rep1mm, top, bot, topp, botp)); } } else { break; // End of far loop } } // for(; dep < len; dep++) } // for(int ebwtfw = 0; ebwtfw < 2; ebwtfw++) } // for(int fw = 0; fw < 2; fw++) return results; } /** * Wrapper for initial invcation of searchSeed. */ template bool SeedAligner::searchSeedBi() { return searchSeedBi( 0, 0, 0, 0, 0, 0, SideLocus(), SideLocus(), s_->cons[0], s_->cons[1], s_->cons[2], s_->overall, NULL); } /** * Get tloc, bloc ready for the next step. If the new range is under * the ceiling. */ template inline void SeedAligner::nextLocsBi( SideLocus& tloc, // top locus SideLocus& bloc, // bot locus index_t topf, // top in BWT index_t botf, // bot in BWT index_t topb, // top in BWT' index_t botb, // bot in BWT' int step // step to get ready for #if 0 , const SABWOffTrack* prevOt, // previous tracker SABWOffTrack& ot // current tracker #endif ) { assert_gt(botf, 0); assert(ebwtBw_ == NULL || botb > 0); assert_geq(step, 0); // next step can't be first one assert(ebwtBw_ == NULL || botf-topf == botb-topb); if(step == (int)s_->steps.size()) return; // no more steps! // Which direction are we going in next? if(s_->steps[step] > 0) { // Left to right; use BWT' if(botb - topb == 1) { // Already down to 1 row; just init top locus tloc.initFromRow(topb, ebwtBw_->eh(), ebwtBw_->ebwt()); bloc.invalidate(); } else { SideLocus::initFromTopBot( topb, botb, ebwtBw_->eh(), ebwtBw_->ebwt(), tloc, bloc); assert(bloc.valid()); } } else { // Right to left; use BWT if(botf - topf == 1) { // Already down to 1 row; just init top locus tloc.initFromRow(topf, ebwtFw_->eh(), ebwtFw_->ebwt()); bloc.invalidate(); } else { SideLocus::initFromTopBot( topf, botf, ebwtFw_->eh(), ebwtFw_->ebwt(), tloc, bloc); assert(bloc.valid()); } } // Check if we should update the tracker with this refinement #if 0 if(botf-topf <= BW_OFF_TRACK_CEIL) { if(ot.size() == 0 && prevOt != NULL && prevOt->size() > 0) { // Inherit state from the predecessor ot = *prevOt; } bool ltr = s_->steps[step-1] > 0; int adj = abs(s_->steps[step-1])-1; const Ebwt* ebwt = ltr ? ebwtBw_ : ebwtFw_; ot.update( ltr ? topb : topf, // top ltr ? botb : botf, // bot adj, // adj (to be subtracted from offset) ebwt->offs(), // offs array ebwt->eh().offRate(), // offrate (sample = every 1 << offrate elts) NULL // dead ); assert_gt(ot.size(), 0); } #endif assert(botf - topf == 1 || bloc.valid()); assert(botf - topf > 1 || !bloc.valid()); } /** * Report a seed hit found by searchSeedBi(), but first try to extend it out in * either direction as far as possible without hitting any edits. This will * allow us to prioritize the seed hits better later on. Call reportHit() when * we're done, which actually adds the hit to the cache. Returns result from * calling reportHit(). */ template bool SeedAligner::extendAndReportHit( index_t topf, // top in BWT index_t botf, // bot in BWT index_t topb, // top in BWT' index_t botb, // bot in BWT' index_t len, // length of hit DoublyLinkedList *prevEdit) // previous edit { index_t nlex = 0, nrex = 0; index_t t[4], b[4]; index_t tp[4], bp[4]; SideLocus tloc, bloc; if(off_ > 0) { const Ebwt *ebwt = ebwtFw_; assert(ebwt != NULL); // Extend left using forward index const BTDnaString& seq = fw_ ? read_->patFw : read_->patRc; // See what we get by extending index_t top = topf, bot = botf; t[0] = t[1] = t[2] = t[3] = 0; b[0] = b[1] = b[2] = b[3] = 0; tp[0] = tp[1] = tp[2] = tp[3] = topb; bp[0] = bp[1] = bp[2] = bp[3] = botb; SideLocus tloc, bloc; INIT_LOCS(top, bot, tloc, bloc, *ebwt); for(size_t ii = off_; ii > 0; ii--) { size_t i = ii-1; // Get char from read int rdc = seq.get(i); // See what we get by extending if(bloc.valid()) { bwops_++; t[0] = t[1] = t[2] = t[3] = b[0] = b[1] = b[2] = b[3] = 0; ebwt->mapBiLFEx(tloc, bloc, t, b, tp, bp); SANITY_CHECK_4TUP(t, b, tp, bp); int nonz = -1; bool abort = false; for(int j = 0; j < 4; j++) { if(b[i] > t[i]) { if(nonz >= 0) { abort = true; break; } nonz = j; top = t[i]; bot = b[i]; } } if(abort || nonz != rdc) { break; } } else { assert_eq(bot, top+1); bwops_++; int c = ebwt->mapLF1(top, tloc); if(c != rdc) { break; } bot = top + 1; } if(++nlex == 255) { break; } INIT_LOCS(top, bot, tloc, bloc, *ebwt); } } size_t rdlen = read_->length(); size_t nright = rdlen - off_ - len; if(nright > 0 && ebwtBw_ != NULL) { const Ebwt *ebwt = ebwtBw_; assert(ebwt != NULL); // Extend right using backward index const BTDnaString& seq = fw_ ? read_->patFw : read_->patRc; // See what we get by extending index_t top = topb, bot = botb; t[0] = t[1] = t[2] = t[3] = 0; b[0] = b[1] = b[2] = b[3] = 0; tp[0] = tp[1] = tp[2] = tp[3] = topb; bp[0] = bp[1] = bp[2] = bp[3] = botb; INIT_LOCS(top, bot, tloc, bloc, *ebwt); for(size_t i = off_ + len; i < rdlen; i++) { // Get char from read int rdc = seq.get(i); // See what we get by extending if(bloc.valid()) { bwops_++; t[0] = t[1] = t[2] = t[3] = b[0] = b[1] = b[2] = b[3] = 0; ebwt->mapBiLFEx(tloc, bloc, t, b, tp, bp); SANITY_CHECK_4TUP(t, b, tp, bp); int nonz = -1; bool abort = false; for(int j = 0; j < 4; j++) { if(b[i] > t[i]) { if(nonz >= 0) { abort = true; break; } nonz = j; top = t[i]; bot = b[i]; } } if(abort || nonz != rdc) { break; } } else { assert_eq(bot, top+1); bwops_++; int c = ebwt->mapLF1(top, tloc); if(c != rdc) { break; } bot = top + 1; } if(++nrex == 255) { break; } INIT_LOCS(top, bot, tloc, bloc, *ebwt); } } assert_lt(nlex, rdlen); assert_leq(nlex, off_); assert_lt(nrex, rdlen); return reportHit(topf, botf, topb, botb, len, prevEdit); } /** * Report a seed hit found by searchSeedBi() by adding it to the cache. Return * false if the hit could not be reported because of, e.g., cache exhaustion. */ template bool SeedAligner::reportHit( index_t topf, // top in BWT index_t botf, // bot in BWT index_t topb, // top in BWT' index_t botb, // bot in BWT' index_t len, // length of hit DoublyLinkedList *prevEdit) // previous edit { // Add information about the seed hit to AlignmentCache. This // information eventually makes its way back to the SeedResults // object when we call finishAlign(...). BTDnaString& rf = tmprfdnastr_; rf.clear(); edits_.clear(); if(prevEdit != NULL) { prevEdit->toList(edits_); Edit::sort(edits_); assert(Edit::repOk(edits_, *seq_)); Edit::toRef(*seq_, edits_, rf); } else { rf = *seq_; } // Sanity check: shouldn't add the same hit twice. If this // happens, it may be because our zone Constraints are not set up // properly and erroneously return true from acceptable() when they // should return false in some cases. assert_eq(hits_.size(), ca_->curNumRanges()); assert(hits_.insert(rf)); if(!ca_->addOnTheFly(rf, topf, botf, topb, botb)) { return false; } assert_eq(hits_.size(), ca_->curNumRanges()); #ifndef NDEBUG // Sanity check that the topf/botf and topb/botb ranges really // correspond to the reference sequence aligned to { BTDnaString rfr; index_t tpf, btf, tpb, btb; tpf = btf = tpb = btb = 0; assert(ebwtFw_->contains(rf, &tpf, &btf)); if(ebwtBw_ != NULL) { rfr = rf; rfr.reverse(); assert(ebwtBw_->contains(rfr, &tpb, &btb)); assert_eq(tpf, topf); assert_eq(btf, botf); assert_eq(tpb, topb); assert_eq(btb, botb); } } #endif return true; } /** * Given a seed, search. Assumes zone 0 = no backtracking. * * Return a list of Seed hits. * 1. Edits * 2. Bidirectional BWT range(s) on either end */ template bool SeedAligner::searchSeedBi( int step, // depth into steps_[] array int depth, // recursion depth index_t topf, // top in BWT index_t botf, // bot in BWT index_t topb, // top in BWT' index_t botb, // bot in BWT' SideLocus tloc, // locus for top (perhaps unititialized) SideLocus bloc, // locus for bot (perhaps unititialized) Constraint c0, // constraints to enforce in seed zone 0 Constraint c1, // constraints to enforce in seed zone 1 Constraint c2, // constraints to enforce in seed zone 2 Constraint overall, // overall constraints to enforce DoublyLinkedList *prevEdit // previous edit #if 0 , const SABWOffTrack* prevOt // prev off tracker (if tracking started) #endif ) { assert(s_ != NULL); const InstantiatedSeed& s = *s_; assert_gt(s.steps.size(), 0); assert(ebwtBw_ == NULL || ebwtBw_->eh().ftabChars() == ebwtFw_->eh().ftabChars()); #ifndef NDEBUG for(int i = 0; i < 4; i++) { assert(ebwtBw_ == NULL || ebwtBw_->fchr()[i] == ebwtFw_->fchr()[i]); } #endif if(step == (int)s.steps.size()) { // Finished aligning seed assert(c0.acceptable()); assert(c1.acceptable()); assert(c2.acceptable()); if(!reportHit(topf, botf, topb, botb, seq_->length(), prevEdit)) { return false; // Memory exhausted } return true; } #ifndef NDEBUG if(depth > 0) { assert(botf - topf == 1 || bloc.valid()); assert(botf - topf > 1 || !bloc.valid()); } #endif int off; index_t tp[4], bp[4]; // dest BW ranges for "prime" index if(step == 0) { // Just starting assert(prevEdit == NULL); assert(!tloc.valid()); assert(!bloc.valid()); off = s.steps[0]; bool ltr = off > 0; off = abs(off)-1; // Check whether/how far we can jump using ftab or fchr int ftabLen = ebwtFw_->eh().ftabChars(); if(ftabLen > 1 && ftabLen <= s.maxjump) { if(!ltr) { assert_geq(off+1, ftabLen-1); off = off - ftabLen + 1; } ebwtFw_->ftabLoHi(*seq_, off, false, topf, botf); #ifdef NDEBUG if(botf - topf == 0) return true; #endif #ifdef NDEBUG if(ebwtBw_ != NULL) { topb = ebwtBw_->ftabHi(*seq_, off); botb = topb + (botf-topf); } #else if(ebwtBw_ != NULL) { ebwtBw_->ftabLoHi(*seq_, off, false, topb, botb); assert_eq(botf-topf, botb-topb); } if(botf - topf == 0) return true; #endif step += ftabLen; } else if(s.maxjump > 0) { // Use fchr int c = (*seq_)[off]; assert_range(0, 3, c); topf = topb = ebwtFw_->fchr()[c]; botf = botb = ebwtFw_->fchr()[c+1]; if(botf - topf == 0) return true; step++; } else { assert_eq(0, s.maxjump); topf = topb = 0; botf = botb = ebwtFw_->fchr()[4]; } if(step == (int)s.steps.size()) { // Finished aligning seed assert(c0.acceptable()); assert(c1.acceptable()); assert(c2.acceptable()); if(!reportHit(topf, botf, topb, botb, seq_->length(), prevEdit)) { return false; // Memory exhausted } return true; } nextLocsBi(tloc, bloc, topf, botf, topb, botb, step); assert(tloc.valid()); } else assert(prevEdit != NULL); assert(tloc.valid()); assert(botf - topf == 1 || bloc.valid()); assert(botf - topf > 1 || !bloc.valid()); assert_geq(step, 0); index_t t[4], b[4]; // dest BW ranges Constraint* zones[3] = { &c0, &c1, &c2 }; ASSERT_ONLY(index_t lasttot = botf - topf); for(int i = step; i < (int)s.steps.size(); i++) { assert_gt(botf, topf); assert(botf - topf == 1 || bloc.valid()); assert(botf - topf > 1 || !bloc.valid()); assert(ebwtBw_ == NULL || botf-topf == botb-topb); assert(tloc.valid()); off = s.steps[i]; bool ltr = off > 0; const Ebwt* ebwt = ltr ? ebwtBw_ : ebwtFw_; assert(ebwt != NULL); if(ltr) { tp[0] = tp[1] = tp[2] = tp[3] = topf; bp[0] = bp[1] = bp[2] = bp[3] = botf; } else { tp[0] = tp[1] = tp[2] = tp[3] = topb; bp[0] = bp[1] = bp[2] = bp[3] = botb; } t[0] = t[1] = t[2] = t[3] = b[0] = b[1] = b[2] = b[3] = 0; if(bloc.valid()) { // Range delimited by tloc/bloc has size >1. If size == 1, // we use a simpler query (see if(!bloc.valid()) blocks below) bwops_++; ebwt->mapBiLFEx(tloc, bloc, t, b, tp, bp); ASSERT_ONLY(index_t tot = (b[0]-t[0])+(b[1]-t[1])+(b[2]-t[2])+(b[3]-t[3])); ASSERT_ONLY(index_t totp = (bp[0]-tp[0])+(bp[1]-tp[1])+(bp[2]-tp[2])+(bp[3]-tp[3])); assert_eq(tot, totp); assert_leq(tot, lasttot); ASSERT_ONLY(lasttot = tot); } index_t *tf = ltr ? tp : t, *tb = ltr ? t : tp; index_t *bf = ltr ? bp : b, *bb = ltr ? b : bp; off = abs(off)-1; // bool leaveZone = s.zones[i].first < 0; //bool leaveZoneIns = zones_[i].second < 0; Constraint& cons = *zones[abs(s.zones[i].first)]; Constraint& insCons = *zones[abs(s.zones[i].second)]; int c = (*seq_)[off]; assert_range(0, 4, c); int q = (*qual_)[off]; // Is it legal for us to advance on characters other than 'c'? if(!(cons.mustMatch() && !overall.mustMatch()) || c == 4) { // There may be legal edits bool bail = false; if(!bloc.valid()) { // Range delimited by tloc/bloc has size 1 index_t ntop = ltr ? topb : topf; bwops_++; int cc = ebwt->mapLF1(ntop, tloc); assert_range(-1, 3, cc); if(cc < 0) bail = true; else { t[cc] = ntop; b[cc] = ntop+1; } } if(!bail) { if((cons.canMismatch(q, *sc_) && overall.canMismatch(q, *sc_)) || c == 4) { Constraint oldCons = cons, oldOvCons = overall; SideLocus oldTloc = tloc, oldBloc = bloc; if(c != 4) { cons.chargeMismatch(q, *sc_); overall.chargeMismatch(q, *sc_); } // Can leave the zone as-is if(!leaveZone || (cons.acceptable() && overall.acceptable())) { for(int j = 0; j < 4; j++) { if(j == c || b[j] == t[j]) continue; // Potential mismatch nextLocsBi(tloc, bloc, tf[j], bf[j], tb[j], bb[j], i+1); int loff = off; if(!ltr) loff = (int)(s.steps.size() - loff - 1); assert(prevEdit == NULL || prevEdit->next == NULL); Edit edit(off, j, c, EDIT_TYPE_MM, false); DoublyLinkedList editl; editl.payload = edit; if(prevEdit != NULL) { prevEdit->next = &editl; editl.prev = prevEdit; } assert(editl.next == NULL); bwedits_++; if(!searchSeedBi( i+1, // depth into steps_[] array depth+1, // recursion depth tf[j], // top in BWT bf[j], // bot in BWT tb[j], // top in BWT' bb[j], // bot in BWT' tloc, // locus for top (perhaps unititialized) bloc, // locus for bot (perhaps unititialized) c0, // constraints to enforce in seed zone 0 c1, // constraints to enforce in seed zone 1 c2, // constraints to enforce in seed zone 2 overall, // overall constraints to enforce &editl)) // latest edit { return false; } if(prevEdit != NULL) prevEdit->next = NULL; } } else { // Not enough edits to make this path // non-redundant with other seeds } cons = oldCons; overall = oldOvCons; tloc = oldTloc; bloc = oldBloc; } if(cons.canGap() && overall.canGap()) { throw 1; // TODO int delEx = 0; if(cons.canDelete(delEx, *sc_) && overall.canDelete(delEx, *sc_)) { // Try delete } int insEx = 0; if(insCons.canInsert(insEx, *sc_) && overall.canInsert(insEx, *sc_)) { // Try insert } } } // if(!bail) } if(c == 4) { return true; // couldn't handle the N } if(leaveZone && (!cons.acceptable() || !overall.acceptable())) { // Not enough edits to make this path non-redundant with // other seeds return true; } if(!bloc.valid()) { assert(ebwtBw_ == NULL || bp[c] == tp[c]+1); // Range delimited by tloc/bloc has size 1 index_t top = ltr ? topb : topf; bwops_++; t[c] = ebwt->mapLF1(top, tloc, c); if(t[c] == (index_t)OFF_MASK) { return true; } assert_geq(t[c], ebwt->fchr()[c]); assert_lt(t[c], ebwt->fchr()[c+1]); b[c] = t[c]+1; assert_gt(b[c], 0); } assert(ebwtBw_ == NULL || bf[c]-tf[c] == bb[c]-tb[c]); assert_leq(bf[c]-tf[c], lasttot); ASSERT_ONLY(lasttot = bf[c]-tf[c]); if(b[c] == t[c]) { return true; } topf = tf[c]; botf = bf[c]; topb = tb[c]; botb = bb[c]; if(i+1 == (int)s.steps.size()) { // Finished aligning seed assert(c0.acceptable()); assert(c1.acceptable()); assert(c2.acceptable()); if(!reportHit(topf, botf, topb, botb, seq_->length(), prevEdit)) { return false; // Memory exhausted } return true; } nextLocsBi(tloc, bloc, tf[c], bf[c], tb[c], bb[c], i+1); } return true; } #endif /*ALIGNER_SEED_H_*/