hisat-3n/aligner_seed.h
2025-01-18 21:09:52 +08:00

2923 lines
84 KiB
C++

/*
* Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
*
* This file is part of Bowtie 2.
*
* Bowtie 2 is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Bowtie 2 is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef ALIGNER_SEED_H_
#define ALIGNER_SEED_H_
#include <iostream>
#include <utility>
#include <limits>
#include "qual.h"
#include "ds.h"
#include "sstring.h"
#include "alphabet.h"
#include "edit.h"
#include "read.h"
// Threading is necessary to synchronize the classes that dump
// intermediate alignment results to files. Otherwise, all data herein
// is constant and shared, or per-thread.
#include "threading.h"
#include "aligner_result.h"
#include "aligner_cache.h"
#include "scoring.h"
#include "mem_ids.h"
#include "simple_func.h"
#include "btypes.h"
/**
* A constraint to apply to an alignment zone, or to an overall
* alignment.
*
* The constraint can put both caps and ceilings on the number and
* types of edits allowed.
*/
struct Constraint {

    Constraint() { init(); }

    /**
     * Initialize Constraint to be fully permissive: every budget and
     * every ceiling is set to MAX_I, the penalty function is reset and
     * the constraint is marked as not yet instantiated.
     */
    void init() {
        edits = mms = ins = dels = penalty = editsCeil = mmsCeil =
        insCeil = delsCeil = penaltyCeil = MAX_I;
        penFunc.reset();
        instantiated = false;
    }

    /**
     * Return true iff penalties and constraints prevent us from
     * adding any edits.
     */
    bool mustMatch() const {
        assert(instantiated);
        return (mms == 0 && edits == 0) ||
               penalty == 0 ||
               (mms == 0 && dels == 0 && ins == 0);
    }

    /**
     * Return true iff a mismatch of the given quality is permitted,
     * i.e. there is mismatch or edit budget left and the remaining
     * penalty budget covers cm.mm(q).
     */
    bool canMismatch(int q, const Scoring& cm) const {
        assert(instantiated);
        return (mms > 0 || edits > 0) &&
               penalty >= cm.mm(q);
    }

    /**
     * Return true iff an N mismatch of the given quality is permitted,
     * i.e. there is mismatch or edit budget left and the remaining
     * penalty budget covers cm.n(q).
     */
    bool canN(int q, const Scoring& cm) const {
        assert(instantiated);
        return (mms > 0 || edits > 0) &&
               penalty >= cm.n(q);
    }

    /**
     * Return true iff a mismatch of *any* quality (even qual=1) is
     * permitted.
     */
    bool canMismatch() const {
        assert(instantiated);
        return (mms > 0 || edits > 0) && penalty > 0;
    }

    /**
     * Return true iff an N mismatch of *any* quality is permitted.
     * Unlike canMismatch(), the remaining penalty budget is not
     * consulted.
     */
    bool canN() const {
        assert(instantiated);
        return (mms > 0 || edits > 0);
    }

    /**
     * Return true iff a deletion of the given extension (0=open, 1=1st
     * extension, etc) is permitted.
     */
    bool canDelete(int ex, const Scoring& cm) const {
        assert(instantiated);
        // NOTE(review): this overload requires BOTH budgets (&&), which
        // matches chargeDelete() decrementing both dels and edits, but
        // differs from canDelete() and canInsert(ex, cm), which use ||.
        // Behavior is left unchanged; confirm which form is intended.
        return (dels > 0 && edits > 0) &&
               penalty >= cm.del(ex);
    }

    /**
     * Return true iff a deletion of any extension is permitted.
     */
    bool canDelete() const {
        assert(instantiated);
        return (dels > 0 || edits > 0) &&
               penalty > 0;
    }

    /**
     * Return true iff an insertion of the given extension (0=open,
     * 1=1st extension, etc) is permitted.
     */
    bool canInsert(int ex, const Scoring& cm) const {
        assert(instantiated);
        return (ins > 0 || edits > 0) &&
               penalty >= cm.ins(ex);
    }

    /**
     * Return true iff an insertion of any extension is permitted.
     */
    bool canInsert() const {
        assert(instantiated);
        return (ins > 0 || edits > 0) &&
               penalty > 0;
    }

    /**
     * Return true iff a gap of any extension is permitted.
     */
    bool canGap() const {
        assert(instantiated);
        return ((ins > 0 || dels > 0) || edits > 0) && penalty > 0;
    }

    /**
     * Charge a mismatch of the given quality.  The mismatch budget is
     * consumed first; once it is exhausted the generic edit budget is
     * charged instead.  The penalty budget is reduced by cm.mm(q).
     */
    void chargeMismatch(int q, const Scoring& cm) {
        assert(instantiated);
        if(mms == 0) { assert_gt(edits, 0); edits--; }
        else mms--;
        penalty -= cm.mm(q);
        assert_geq(mms, 0);
        assert_geq(edits, 0);
        assert_geq(penalty, 0);
    }

    /**
     * Charge an N mismatch of the given quality.  Budgets are consumed
     * as in chargeMismatch(), but the penalty is cm.n(q).
     */
    void chargeN(int q, const Scoring& cm) {
        assert(instantiated);
        if(mms == 0) { assert_gt(edits, 0); edits--; }
        else mms--;
        penalty -= cm.n(q);
        assert_geq(mms, 0);
        assert_geq(edits, 0);
        assert_geq(penalty, 0);
    }

    /**
     * Charge a deletion of the given extension.  Both the deletion
     * budget and the generic edit budget are decremented.
     */
    void chargeDelete(int ex, const Scoring& cm) {
        assert(instantiated);
        dels--;
        edits--;
        penalty -= cm.del(ex);
        assert_geq(dels, 0);
        assert_geq(edits, 0);
        assert_geq(penalty, 0);
    }

    /**
     * Charge an insertion of the given extension.  Both the insertion
     * budget and the generic edit budget are decremented.
     */
    void chargeInsert(int ex, const Scoring& cm) {
        assert(instantiated);
        ins--;
        edits--;
        penalty -= cm.ins(ex);
        assert_geq(ins, 0);
        assert_geq(edits, 0);
        assert_geq(penalty, 0);
    }

    /**
     * Once the constrained area is completely explored, call this
     * function to check whether there were *at least* as many
     * dissimilarities as required by the constraint.  Bounds like this
     * are helpful to resolve instances where two search roots would
     * otherwise overlap in what alignments they can find.
     *
     * Concretely: every leftover budget must be no greater than its
     * corresponding ceiling.
     */
    bool acceptable() const {
        assert(instantiated);
        return edits   <= editsCeil &&
               mms     <= mmsCeil   &&
               ins     <= insCeil   &&
               dels    <= delsCeil  &&
               penalty <= penaltyCeil;
    }

    /**
     * Evaluate the given penalty function at the read length and
     * return the result as an int.
     */
    static int instantiate(size_t rdlen, const SimpleFunc& func) {
        return func.f<int>((double)rdlen);
    }

    /**
     * Instantiate this constraint w/r/t the read length.  If a penalty
     * function was set, the penalty budget is recomputed from it;
     * otherwise the existing penalty budget is kept.
     */
    void instantiate(size_t rdlen) {
        assert(!instantiated);
        if(penFunc.initialized()) {
            penalty = Constraint::instantiate(rdlen, penFunc);
        }
        instantiated = true;
    }

    int edits;      // # edits permitted
    int mms;        // # mismatches permitted
    int ins;        // # insertions permitted
    int dels;       // # deletions permitted
    int penalty;    // penalty total permitted
    int editsCeil;  // <= this many edits can be left at the end
    int mmsCeil;    // <= this many mismatches can be left at the end
    int insCeil;    // <= this many inserts can be left at the end
    int delsCeil;   // <= this many deletions can be left at the end
    int penaltyCeil;// <= this much leftover penalty can be left at the end
    SimpleFunc penFunc;// penalty function; function of read len
    bool instantiated; // whether constraint is instantiated w/r/t read len

    //
    // Some static methods for constructing some standard Constraints
    //

    /**
     * Construct a constraint with no edits of any kind allowed.
     */
    static Constraint exact();

    /**
     * Construct a constraint where the only constraint is a total
     * penalty constraint.
     */
    static Constraint penaltyBased(int pen);

    /**
     * Construct a constraint where the only constraint is a total
     * penalty constraint related to the length of the read.
     */
    static Constraint penaltyFuncBased(const SimpleFunc& func);

    /**
     * Construct a constraint where the only constraint is a limit on
     * the number of mismatches (presumably; defined elsewhere).
     */
    static Constraint mmBased(int mms);

    /**
     * Construct a constraint where the only constraint is a limit on
     * the total number of edits (presumably; defined elsewhere).
     */
    static Constraint editBased(int edits);
};
/**
* We divide seed search strategies into three categories:
*
* 1. A left-to-right search where the left half of the read is
* constrained to match exactly and the right half is subject to
* some looser constraint (e.g. 1mm or 2mm)
* 2. Same as 1, but going right to left with the exact matching half
* on the right.
* 3. Inside-out search where the center half of the read is
* constrained to match exactly, and the extreme quarters of the
* read are subject to a looser constraint.
*/
enum {
    // Seed type identifiers; values start at 1 and are consecutive.
    SEED_TYPE_EXACT         = 1, // presumably: whole seed must match exactly
    SEED_TYPE_LEFT_TO_RIGHT = 2, // exact-matching half on the left
    SEED_TYPE_RIGHT_TO_LEFT = 3, // exact-matching half on the right
    SEED_TYPE_INSIDE_OUT    = 4  // exact-matching center, looser extremes
};
struct InstantiatedSeed;
/**
* Policy dictating how to size and arrange seeds along the length of
* the read, and what constraints to force on the zones of the seed.
* We assume that seeds are plopped down at regular intervals from the
* 5' to 3' ends, with the first seed flush to the 5' end.
*
* If the read is shorter than a single seed, one seed is used and it
* is shrunk to accommodate the read.
*/
struct Seed {
int len; // length of a seed
int type; // dictates anchor portion, direction of search
Constraint *overall; // for the overall alignment
Seed() { init(0, 0, NULL); }
/**
* Construct and initialize this seed with given length and type.
*/
Seed(int ln, int ty, Constraint* oc) {
init(ln, ty, oc);
}
/**
* Initialize this seed with given length and type.
*/
void init(int ln, int ty, Constraint* oc) {
len = ln;
type = ty;
overall = oc;
}
// If the seed is split into halves, we just use zones[0] and
// zones[1]; 0 is the near half and 1 is the far half. If the seed
// is split into thirds (i.e. inside-out) then 0 is the center, 1
// is the far portion on the left, and 2 is the far portion on the
// right.
Constraint zones[3];
/**
* Once the constrained seed is completely explored, call this
* function to check whether there were *at least* as many
* dissimilarities as required by all constraints. Bounds like this
* are helpful to resolve instances where two search roots would
* otherwise overlap in what alignments they can find.
*/
bool acceptable() {
assert(overall != NULL);
return zones[0].acceptable() &&
zones[1].acceptable() &&
zones[2].acceptable() &&
overall->acceptable();
}
/**
* Given a read, depth and orientation, extract a seed data structure
* from the read and fill in the steps & zones arrays. The Seed
* contains the sequence and quality values.
*/
bool instantiate(
const Read& read,
const BTDnaString& seq, // already-extracted seed sequence
const BTString& qual, // already-extracted seed quality sequence
const Scoring& pens,
int depth,
int seedoffidx,
int seedtypeidx,
bool fw,
InstantiatedSeed& si) const;
/**
* Return a list of Seed objects encapsulating
*/
static void mmSeeds(
int mms,
int ln,
EList<Seed>& pols,
Constraint& oall)
{
if(mms == 0) {
zeroMmSeeds(ln, pols, oall);
} else if(mms == 1) {
oneMmSeeds(ln, pols, oall);
} else if(mms == 2) {
twoMmSeeds(ln, pols, oall);
} else throw 1;
}
static void zeroMmSeeds(int ln, EList<Seed>&, Constraint&);
static void oneMmSeeds (int ln, EList<Seed>&, Constraint&);
static void twoMmSeeds (int ln, EList<Seed>&, Constraint&);
};
/**
* An instantiated seed is a seed (perhaps modified to fit the read)
* plus all data needed to conduct a search of the seed.
*/
struct InstantiatedSeed {
InstantiatedSeed() : steps(AL_CAT), zones(AL_CAT) { }
// Steps map. There are as many steps as there are positions in
// the seed. The map is a helpful abstraction because we sometimes
// visit seed positions in an irregular order (e.g. inside-out
// search).
EList<int> steps;
// Zones map. For each step, records what constraint to charge an
// edit to. The first entry in each pair gives the constraint for
// non-insert edits and the second entry in each pair gives the
// constraint for insert edits. If the value stored is negative,
// this indicates that the zone is "closed out" after this
// position, so zone acceptility should be checked.
EList<pair<int, int> > zones;
// Nucleotide sequence covering the seed, extracted from read
BTDnaString *seq;
// Quality sequence covering the seed, extracted from read
BTString *qual;
// Initial constraints governing zones 0, 1, 2. We precalculate
// the effect of Ns on these.
Constraint cons[3];
// Overall constraint, tailored to the read length.
Constraint overall;
// Maximum number of positions that the aligner may advance before
// its first step. This lets the aligner know whether it can use
// the ftab or not.
int maxjump;
// Offset of seed from 5' end of read
int seedoff;
// Id for seed offset; ids are such that the smallest index is the
// closest to the 5' end and consecutive ids are adjacent (i.e.
// there are no intervening offsets with seeds)
int seedoffidx;
// Type of seed (left-to-right, etc)
int seedtypeidx;
// Seed comes from forward-oriented read?
bool fw;
// Filtered out due to the pattern of Ns present. If true, this
// seed should be ignored by searchAllSeeds().
bool nfiltered;
// Seed this was instantiated from
Seed s;
#ifndef NDEBUG
/**
* Check that InstantiatedSeed is internally consistent.
*/
bool repOk() const {
return true;
}
#endif
};
/**
* Simple struct for holding an end-to-end alignment of the read with at most
* 2 edits.
*/
template <typename index_t>
struct EEHit {

    EEHit() { reset(); }

    /**
     * Reset to the "no hit" state: empty range, both edits cleared and
     * the score set to MIN_I64.
     */
    void reset() {
        top = bot = 0;
        fw = false;
        e1.reset();
        e2.reset();
        score = MIN_I64;
    }

    /**
     * Initialize the hit.  'top_'/'bot_' delimit the range, 'e1_' and
     * 'e2_' are optional edits (a NULL pointer clears the corresponding
     * edit), 'fw_' is the orientation and 'score_' the alignment score.
     */
    void init(
        index_t top_,
        index_t bot_,
        const Edit* e1_,
        const Edit* e2_,
        bool fw_,
        int64_t score_)
    {
        top = top_; bot = bot_;
        if(e1_ != NULL) {
            e1 = *e1_;
        } else {
            e1.reset();
        }
        if(e2_ != NULL) {
            e2 = *e2_;
        } else {
            e2.reset();
        }
        fw = fw_;
        score = score_;
    }

    /**
     * Return number of mismatches in the alignment.
     */
    int mms() const {
        if     (e2.inited()) return 2;
        else if(e1.inited()) return 1;
        else                 return 0;
    }

    /**
     * Return the number of edits in the alignment for which
     * Edit::hasN() is true.  Each edit is examined independently, so
     * an N in the second edit is counted even when the first edit
     * does not involve an N.
     */
    int ns() const {
        int ns = 0;
        if(e1.inited() && e1.hasN()) {
            ns++;
        }
        if(e2.inited() && e2.hasN()) {
            ns++;
        }
        return ns;
    }

    /**
     * Return the number of edits whose 'chr' field is 'N' (presumably
     * the reference character).  Each edit is examined independently.
     */
    int refns() const {
        int ns = 0;
        if(e1.inited() && e1.chr == 'N') {
            ns++;
        }
        if(e2.inited() && e2.chr == 'N') {
            ns++;
        }
        return ns;
    }

    /**
     * Return true iff there is no hit.
     */
    bool empty() const {
        return bot <= top;
    }

    /**
     * Higher score = higher priority, so a hit with a higher score
     * orders before one with a lower score.
     */
    bool operator<(const EEHit& o) const {
        return score > o.score;
    }

    /**
     * Return the size of the alignment's SA range.
     */
    index_t size() const { return bot - top; }

#ifndef NDEBUG
    /**
     * Check that hit is sane w/r/t read: the range is non-empty and
     * any edit position lies within the read.
     */
    bool repOk(const Read& rd) const {
        assert_gt(bot, top);
        if(e1.inited()) {
            assert_lt(e1.pos, rd.length());
            if(e2.inited()) {
                assert_lt(e2.pos, rd.length());
            }
        }
        return true;
    }
#endif

    index_t top;   // top of the range
    index_t bot;   // bottom of the range (exclusive; size is bot - top)
    Edit    e1;    // first edit, if any
    Edit    e2;    // second edit, if any
    bool    fw;    // orientation flag supplied to init()
    int64_t score; // alignment score; MIN_I64 when reset
};
/**
* Data structure for holding all of the seed hits associated with a read. All
* the seed hits for a given read are encapsulated in a single QVal object. A
* QVal refers to a range of values in the qlist, where each qlist value is a
* BW range and a slot to hold the hit's suffix array offset. QVals are kept
* in two lists (hitsFw_ and hitsRc_), one for seeds on the forward read strand,
* one for seeds on the reverse read strand. The list is indexed by read
* offset index (e.g. 0=closest-to-5', 1=second-closest, etc).
*
* An assumption behind this data structure is that all the seeds are found
* first, then downstream analyses try to extend them. In between finding the
* seed hits and extending them, the sort() member function is called, which
* ranks QVals according to the order they should be extended. Right now the
* policy is that QVals with fewer elements (hits) should be tried first.
*/
template <typename index_t>
class SeedResults {
public:
SeedResults() :
seqFw_(AL_CAT),
seqRc_(AL_CAT),
qualFw_(AL_CAT),
qualRc_(AL_CAT),
hitsFw_(AL_CAT),
hitsRc_(AL_CAT),
isFw_(AL_CAT),
isRc_(AL_CAT),
sortedFw_(AL_CAT),
sortedRc_(AL_CAT),
offIdx2off_(AL_CAT),
rankOffs_(AL_CAT),
rankFws_(AL_CAT),
mm1Hit_(AL_CAT)
{
clear();
}
/**
* Set the current read.
*/
void nextRead(const Read& read) {
read_ = &read;
}
/**
* Set the appropriate element of either hitsFw_ or hitsRc_ to the given
* QVal. A QVal encapsulates all the BW ranges for reference substrings
* that are within some distance of the seed string.
*/
void add(
const QVal<index_t>& qv, // range of ranges in cache
const AlignmentCache<index_t>& ac, // cache
index_t seedIdx, // seed index (from 5' end)
bool seedFw) // whether seed is from forward read
{
assert(qv.repOk(ac));
assert(repOk(&ac));
assert_lt(seedIdx, hitsFw_.size());
assert_gt(numOffs_, 0); // if this fails, probably failed to call reset
if(qv.empty()) return;
if(seedFw) {
assert(!hitsFw_[seedIdx].valid());
hitsFw_[seedIdx] = qv;
numEltsFw_ += qv.numElts();
numRangesFw_ += qv.numRanges();
if(qv.numRanges() > 0) nonzFw_++;
} else {
assert(!hitsRc_[seedIdx].valid());
hitsRc_[seedIdx] = qv;
numEltsRc_ += qv.numElts();
numRangesRc_ += qv.numRanges();
if(qv.numRanges() > 0) nonzRc_++;
}
numElts_ += qv.numElts();
numRanges_ += qv.numRanges();
if(qv.numRanges() > 0) {
nonzTot_++;
}
assert(repOk(&ac));
}
/**
* Clear buffered seed hits and state. Set the number of seed
* offsets and the read.
*/
void reset(
const Read& read,
const EList<index_t>& offIdx2off,
size_t numOffs)
{
assert_gt(numOffs, 0);
clearSeeds();
numOffs_ = numOffs;
seqFw_.resize(numOffs_);
seqRc_.resize(numOffs_);
qualFw_.resize(numOffs_);
qualRc_.resize(numOffs_);
hitsFw_.resize(numOffs_);
hitsRc_.resize(numOffs_);
isFw_.resize(numOffs_);
isRc_.resize(numOffs_);
sortedFw_.resize(numOffs_);
sortedRc_.resize(numOffs_);
offIdx2off_ = offIdx2off;
for(size_t i = 0; i < numOffs_; i++) {
sortedFw_[i] = sortedRc_[i] = false;
hitsFw_[i].reset();
hitsRc_[i].reset();
isFw_[i].clear();
isRc_[i].clear();
}
read_ = &read;
sorted_ = false;
}
/**
* Clear seed-hit state.
*/
void clearSeeds() {
sortedFw_.clear();
sortedRc_.clear();
rankOffs_.clear();
rankFws_.clear();
offIdx2off_.clear();
hitsFw_.clear();
hitsRc_.clear();
isFw_.clear();
isRc_.clear();
seqFw_.clear();
seqRc_.clear();
nonzTot_ = 0;
nonzFw_ = 0;
nonzRc_ = 0;
numOffs_ = 0;
numRanges_ = 0;
numElts_ = 0;
numRangesFw_ = 0;
numEltsFw_ = 0;
numRangesRc_ = 0;
numEltsRc_ = 0;
}
/**
* Clear seed-hit state and end-to-end alignment state.
*/
void clear() {
clearSeeds();
read_ = NULL;
exactFwHit_.reset();
exactRcHit_.reset();
mm1Hit_.clear();
mm1Sorted_ = false;
mm1Elt_ = 0;
assert(empty());
}
/**
* Extract key summaries from this SeedResults and put into 'ssum'.
*/
void toSeedAlSumm(SeedAlSumm& ssum) const {
// Number of positions with at least 1 range
ssum.nonzTot = nonzTot_;
ssum.nonzFw = nonzFw_;
ssum.nonzRc = nonzRc_;
// Number of ranges
ssum.nrangeTot = numRanges_;
ssum.nrangeFw = numRangesFw_;
ssum.nrangeRc = numRangesRc_;
// Number of elements
ssum.neltTot = numElts_;
ssum.neltFw = numEltsFw_;
ssum.neltRc = numEltsRc_;
// Other summaries
ssum.maxNonzRangeFw = ssum.minNonzRangeFw = 0;
ssum.maxNonzRangeRc = ssum.minNonzRangeRc = 0;
ssum.maxNonzEltFw = ssum.minNonzEltFw = 0;
ssum.maxNonzEltRc = ssum.minNonzEltRc = 0;
for(size_t i = 0; i < numOffs_; i++) {
if(hitsFw_[i].valid()) {
if(ssum.minNonzEltFw == 0 || hitsFw_[i].numElts() < ssum.minNonzEltFw) {
ssum.minNonzEltFw = hitsFw_[i].numElts();
}
if(ssum.maxNonzEltFw == 0 || hitsFw_[i].numElts() > ssum.maxNonzEltFw) {
ssum.maxNonzEltFw = hitsFw_[i].numElts();
}
if(ssum.minNonzRangeFw == 0 || hitsFw_[i].numRanges() < ssum.minNonzRangeFw) {
ssum.minNonzRangeFw = hitsFw_[i].numRanges();
}
if(ssum.maxNonzRangeFw == 0 || hitsFw_[i].numRanges() > ssum.maxNonzRangeFw) {
ssum.maxNonzRangeFw = hitsFw_[i].numRanges();
}
}
if(hitsRc_[i].valid()) {
if(ssum.minNonzEltRc == 0 || hitsRc_[i].numElts() < ssum.minNonzEltRc) {
ssum.minNonzEltRc = hitsRc_[i].numElts();
}
if(ssum.maxNonzEltRc == 0 || hitsRc_[i].numElts() > ssum.maxNonzEltRc) {
ssum.maxNonzEltRc = hitsRc_[i].numElts();
}
if(ssum.minNonzRangeRc == 0 || hitsRc_[i].numRanges() < ssum.minNonzRangeRc) {
ssum.minNonzRangeRc = hitsRc_[i].numRanges();
}
if(ssum.maxNonzRangeRc == 0 || hitsRc_[i].numRanges() > ssum.maxNonzRangeRc) {
ssum.maxNonzRangeRc = hitsRc_[i].numRanges();
}
}
}
}
/**
* Return average number of hits per seed.
*/
float averageHitsPerSeed() const {
return (float)numElts_ / (float)nonzTot_;
}
/**
* Return median of all the non-zero per-seed # hits
*/
float medianHitsPerSeed() const {
EList<size_t>& median = const_cast<EList<size_t>&>(tmpMedian_);
median.clear();
for(size_t i = 0; i < numOffs_; i++) {
if(hitsFw_[i].valid() && hitsFw_[i].numElts() > 0) {
median.push_back(hitsFw_[i].numElts());
}
if(hitsRc_[i].valid() && hitsRc_[i].numElts() > 0) {
median.push_back(hitsRc_[i].numElts());
}
}
if(tmpMedian_.empty()) {
return 0.0f;
}
median.sort();
float med1 = (float)median[tmpMedian_.size() >> 1];
float med2 = med1;
if((median.size() & 1) == 0) {
med2 = (float)median[(tmpMedian_.size() >> 1) - 1];
}
return med1 + med2 * 0.5f;
}
/**
* Return a number that's meant to quantify how hopeful we are that this
* set of seed hits will lead to good alignments.
*/
double uniquenessFactor() const {
double result = 0.0;
for(size_t i = 0; i < numOffs_; i++) {
if(hitsFw_[i].valid()) {
size_t nelt = hitsFw_[i].numElts();
result += (1.0 / (double)(nelt * nelt));
}
if(hitsRc_[i].valid()) {
size_t nelt = hitsRc_[i].numElts();
result += (1.0 / (double)(nelt * nelt));
}
}
return result;
}
/**
* Return the number of ranges being held.
*/
index_t numRanges() const { return numRanges_; }
/**
* Return the number of elements being held.
*/
index_t numElts() const { return numElts_; }
/**
* Return the number of ranges being held for seeds on the forward
* read strand.
*/
index_t numRangesFw() const { return numRangesFw_; }
/**
* Return the number of elements being held for seeds on the
* forward read strand.
*/
index_t numEltsFw() const { return numEltsFw_; }
/**
* Return the number of ranges being held for seeds on the
* reverse-complement read strand.
*/
index_t numRangesRc() const { return numRangesRc_; }
/**
* Return the number of elements being held for seeds on the
* reverse-complement read strand.
*/
index_t numEltsRc() const { return numEltsRc_; }
/**
* Given an offset index, return the offset that has that index.
*/
index_t idx2off(size_t off) const {
return offIdx2off_[off];
}
/**
* Return true iff there are 0 hits being held.
*/
bool empty() const { return numRanges() == 0; }
/**
* Get the QVal representing all the reference hits for the given
* orientation and seed offset index.
*/
const QVal<index_t>& hitsAtOffIdx(bool fw, size_t seedoffidx) const {
assert_lt(seedoffidx, numOffs_);
assert(repOk(NULL));
return fw ? hitsFw_[seedoffidx] : hitsRc_[seedoffidx];
}
/**
* Get the Instantiated seeds for the given orientation and offset.
*/
EList<InstantiatedSeed>& instantiatedSeeds(bool fw, size_t seedoffidx) {
assert_lt(seedoffidx, numOffs_);
assert(repOk(NULL));
return fw ? isFw_[seedoffidx] : isRc_[seedoffidx];
}
/**
* Return the number of different seed offsets possible.
*/
index_t numOffs() const { return numOffs_; }
/**
* Return the read from which seeds were extracted, aligned.
*/
const Read& read() const { return *read_; }
#ifndef NDEBUG
/**
* Check that this SeedResults is internally consistent.
*/
bool repOk(
const AlignmentCache<index_t>* ac,
bool requireInited = false) const
{
if(requireInited) {
assert(read_ != NULL);
}
if(numOffs_ > 0) {
assert_eq(numOffs_, hitsFw_.size());
assert_eq(numOffs_, hitsRc_.size());
assert_leq(numRanges_, numElts_);
assert_leq(nonzTot_, numRanges_);
size_t nonzs = 0;
for(int fw = 0; fw <= 1; fw++) {
const EList<QVal<index_t> >& rrs = (fw ? hitsFw_ : hitsRc_);
for(size_t i = 0; i < numOffs_; i++) {
if(rrs[i].valid()) {
if(rrs[i].numRanges() > 0) nonzs++;
if(ac != NULL) {
assert(rrs[i].repOk(*ac));
}
}
}
}
assert_eq(nonzs, nonzTot_);
assert(!sorted_ || nonzTot_ == rankFws_.size());
assert(!sorted_ || nonzTot_ == rankOffs_.size());
}
return true;
}
#endif
/**
* Populate rankOffs_ and rankFws_ with the list of QVals that need to be
* examined for this SeedResults, in order. The order is ascending by
* number of elements, so QVals with fewer elements (i.e. seed sequences
* that are more unique) will be tried first and QVals with more elements
* (i.e. seed sequences
*/
void rankSeedHits(RandomSource& rnd) {
while(rankOffs_.size() < nonzTot_) {
index_t minsz = (index_t)0xffffffff;
index_t minidx = 0;
bool minfw = true;
// Rank seed-hit positions in ascending order by number of elements
// in all BW ranges
bool rb = rnd.nextBool();
assert(rb == 0 || rb == 1);
for(int fwi = 0; fwi <= 1; fwi++) {
bool fw = (fwi == (rb ? 1 : 0));
EList<QVal<index_t> >& rrs = (fw ? hitsFw_ : hitsRc_);
EList<bool>& sorted = (fw ? sortedFw_ : sortedRc_);
index_t i = (rnd.nextU32() % (index_t)numOffs_);
for(index_t ii = 0; ii < numOffs_; ii++) {
if(rrs[i].valid() && // valid QVal
rrs[i].numElts() > 0 && // non-empty
!sorted[i] && // not already sorted
rrs[i].numElts() < minsz) // least elts so far?
{
minsz = rrs[i].numElts();
minidx = i;
minfw = (fw == 1);
}
if((++i) == numOffs_) {
i = 0;
}
}
}
assert_neq((index_t)0xffffffff, minsz);
if(minfw) {
sortedFw_[minidx] = true;
} else {
sortedRc_[minidx] = true;
}
rankOffs_.push_back(minidx);
rankFws_.push_back(minfw);
}
assert_eq(rankOffs_.size(), rankFws_.size());
sorted_ = true;
}
/**
* Return the number of orientation/offsets into the read that have
* at least one seed hit.
*/
size_t nonzeroOffsets() const {
assert(!sorted_ || nonzTot_ == rankFws_.size());
assert(!sorted_ || nonzTot_ == rankOffs_.size());
return nonzTot_;
}
/**
* Return true iff all seeds hit for forward read.
*/
bool allFwSeedsHit() const {
return nonzFw_ == numOffs();
}
/**
* Return true iff all seeds hit for revcomp read.
*/
bool allRcSeedsHit() const {
return nonzRc_ == numOffs();
}
/**
* Return the minimum number of edits that an end-to-end alignment of the
* fw read could have. Uses knowledge of how many seeds have exact hits
* and how the seeds overlap.
*/
index_t fewestEditsEE(bool fw, int seedlen, int per) const {
assert_gt(seedlen, 0);
assert_gt(per, 0);
index_t nonz = fw ? nonzFw_ : nonzRc_;
if(nonz < numOffs()) {
int maxdepth = (seedlen + per - 1) / per;
int missing = (int)(numOffs() - nonz);
return (missing + maxdepth - 1) / maxdepth;
} else {
// Exact hit is possible (not guaranteed)
return 0;
}
}
/**
* Return the number of offsets into the forward read that have at
* least one seed hit.
*/
index_t nonzeroOffsetsFw() const {
return nonzFw_;
}
/**
* Return the number of offsets into the reverse-complement read
* that have at least one seed hit.
*/
index_t nonzeroOffsetsRc() const {
return nonzRc_;
}
/**
* Return a QVal of seed hits of the given rank 'r'. 'offidx' gets the id
* of the offset from 5' from which it was extracted (0 for the 5-most
* offset, 1 for the next closes to 5', etc). 'off' gets the offset from
* the 5' end. 'fw' gets true iff the seed was extracted from the forward
* read.
*/
const QVal<index_t>& hitsByRank(
index_t r, // in
index_t& offidx, // out
index_t& off, // out
bool& fw, // out
index_t& seedlen) // out
{
assert(sorted_);
assert_lt(r, nonzTot_);
if(rankFws_[r]) {
fw = true;
offidx = rankOffs_[r];
assert_lt(offidx, offIdx2off_.size());
off = offIdx2off_[offidx];
seedlen = (index_t)seqFw_[rankOffs_[r]].length();
return hitsFw_[rankOffs_[r]];
} else {
fw = false;
offidx = rankOffs_[r];
assert_lt(offidx, offIdx2off_.size());
off = offIdx2off_[offidx];
seedlen = (index_t)seqRc_[rankOffs_[r]].length();
return hitsRc_[rankOffs_[r]];
}
}
/**
* Return an EList of seed hits of the given rank.
*/
const BTDnaString& seqByRank(index_t r) {
assert(sorted_);
assert_lt(r, nonzTot_);
return rankFws_[r] ? seqFw_[rankOffs_[r]] : seqRc_[rankOffs_[r]];
}
/**
* Return an EList of seed hits of the given rank.
*/
const BTString& qualByRank(index_t r) {
assert(sorted_);
assert_lt(r, nonzTot_);
return rankFws_[r] ? qualFw_[rankOffs_[r]] : qualRc_[rankOffs_[r]];
}
/**
* Return the list of extracted seed sequences for seeds on either
* the forward or reverse strand.
*/
EList<BTDnaString>& seqs(bool fw) { return fw ? seqFw_ : seqRc_; }
/**
* Return the list of extracted quality sequences for seeds on
* either the forward or reverse strand.
*/
EList<BTString>& quals(bool fw) { return fw ? qualFw_ : qualRc_; }
/**
* Return exact end-to-end alignment of fw read.
*/
EEHit<index_t> exactFwEEHit() const { return exactFwHit_; }
/**
* Return exact end-to-end alignment of rc read.
*/
EEHit<index_t> exactRcEEHit() const { return exactRcHit_; }
/**
* Return const ref to list of 1-mismatch end-to-end alignments.
*/
const EList<EEHit<index_t> >& mm1EEHits() const { return mm1Hit_; }
/**
* Sort the end-to-end 1-mismatch alignments, prioritizing by score (higher
* score = higher priority).
*/
void sort1mmEe(RandomSource& rnd) {
assert(!mm1Sorted_);
mm1Hit_.sort();
size_t streak = 0;
for(size_t i = 1; i < mm1Hit_.size(); i++) {
if(mm1Hit_[i].score == mm1Hit_[i-1].score) {
if(streak == 0) { streak = 1; }
streak++;
} else {
if(streak > 1) {
assert_geq(i, streak);
mm1Hit_.shufflePortion(i-streak, streak, rnd);
}
streak = 0;
}
}
if(streak > 1) {
mm1Hit_.shufflePortion(mm1Hit_.size() - streak, streak, rnd);
}
mm1Sorted_ = true;
}
/**
* Add an end-to-end 1-mismatch alignment.
*/
void add1mmEe(
index_t top,
index_t bot,
const Edit* e1,
const Edit* e2,
bool fw,
int64_t score)
{
mm1Hit_.expand();
mm1Hit_.back().init(top, bot, e1, e2, fw, score);
mm1Elt_ += (bot - top);
}
/**
* Add an end-to-end exact alignment.
*/
void addExactEeFw(
index_t top,
index_t bot,
const Edit* e1,
const Edit* e2,
bool fw,
int64_t score)
{
exactFwHit_.init(top, bot, e1, e2, fw, score);
}
/**
* Add an end-to-end exact alignment.
*/
void addExactEeRc(
index_t top,
index_t bot,
const Edit* e1,
const Edit* e2,
bool fw,
int64_t score)
{
exactRcHit_.init(top, bot, e1, e2, fw, score);
}
/**
* Clear out the end-to-end exact alignments.
*/
void clearExactE2eHits() {
exactFwHit_.reset();
exactRcHit_.reset();
}
/**
* Clear out the end-to-end 1-mismatch alignments.
*/
void clear1mmE2eHits() {
mm1Hit_.clear(); // 1-mismatch end-to-end hits
mm1Elt_ = 0; // number of 1-mismatch hit rows
mm1Sorted_ = false; // true iff we've sorted the mm1Hit_ list
}
/**
* Return the number of distinct exact and 1-mismatch end-to-end hits
* found.
*/
index_t numE2eHits() const {
return (index_t)(exactFwHit_.size() + exactRcHit_.size() + mm1Elt_);
}
/**
* Return the number of distinct exact end-to-end hits found.
*/
index_t numExactE2eHits() const {
return (index_t)(exactFwHit_.size() + exactRcHit_.size());
}
/**
* Return the number of distinct 1-mismatch end-to-end hits found.
*/
index_t num1mmE2eHits() const {
return mm1Elt_;
}
/**
* Return the length of the read that yielded the seed hits.
*/
index_t readLength() const {
assert(read_ != NULL);
return read_->length();
}
protected:
// As seed hits and edits are added they're sorted into these
// containers
EList<BTDnaString> seqFw_; // seqs for seeds from forward read
EList<BTDnaString> seqRc_; // seqs for seeds from revcomp read
EList<BTString> qualFw_; // quals for seeds from forward read
EList<BTString> qualRc_; // quals for seeds from revcomp read
EList<QVal<index_t> > hitsFw_; // hits for forward read
EList<QVal<index_t> > hitsRc_; // hits for revcomp read
EList<EList<InstantiatedSeed> > isFw_; // hits for forward read
EList<EList<InstantiatedSeed> > isRc_; // hits for revcomp read
EList<bool> sortedFw_; // true iff fw QVal was sorted/ranked
EList<bool> sortedRc_; // true iff rc QVal was sorted/ranked
index_t nonzTot_; // # offsets with non-zero size
index_t nonzFw_; // # offsets into fw read with non-0 size
index_t nonzRc_; // # offsets into rc read with non-0 size
index_t numRanges_; // # ranges added
index_t numElts_; // # elements added
index_t numRangesFw_; // # ranges added for fw seeds
index_t numEltsFw_; // # elements added for fw seeds
index_t numRangesRc_; // # ranges added for rc seeds
index_t numEltsRc_; // # elements added for rc seeds
EList<index_t> offIdx2off_;// map from offset indexes to offsets from 5' end
// When the sort routine is called, the seed hits collected so far
// are sorted into another set of containers that allow easy access
// to hits from the lowest-ranked offset (the one with the fewest
// BW elements) to the greatest-ranked offset. Offsets with 0 hits
// are ignored.
EList<index_t> rankOffs_; // sorted offests of seeds to try
EList<bool> rankFws_; // sorted orientations assoc. with rankOffs_
bool sorted_; // true if sort() called since last reset
// These fields set once per read
index_t numOffs_; // # different seed offsets possible
const Read* read_; // read from which seeds were extracted
EEHit<index_t> exactFwHit_; // end-to-end exact hit for fw read
EEHit<index_t> exactRcHit_; // end-to-end exact hit for rc read
EList<EEHit<index_t> > mm1Hit_; // 1-mismatch end-to-end hits
index_t mm1Elt_; // number of 1-mismatch hit rows
bool mm1Sorted_; // true iff we've sorted the mm1Hit_ list
EList<size_t> tmpMedian_; // temporary storage for calculating median
};
// Forward decl
template <typename index_t> class Ebwt;
template <typename index_t> struct SideLocus;
/**
* Encapsulates a summary of what the searchAllSeeds aligner did.
*/
struct SeedSearchMetrics {
SeedSearchMetrics() : mutex_m() {
reset();
}
/**
* Merge this metrics object with the given object, i.e., sum each
* category. This is the only safe way to update a
* SeedSearchMetrics object shread by multiple threads.
*/
void merge(const SeedSearchMetrics& m, bool getLock = false) {
ThreadSafe ts(&mutex_m, getLock);
seedsearch += m.seedsearch;
possearch += m.possearch;
intrahit += m.intrahit;
interhit += m.interhit;
filteredseed += m.filteredseed;
ooms += m.ooms;
bwops += m.bwops;
bweds += m.bweds;
bestmin0 += m.bestmin0;
bestmin1 += m.bestmin1;
bestmin2 += m.bestmin2;
}
/**
* Set all counters to 0.
*/
void reset() {
seedsearch =
possearch =
intrahit =
interhit =
filteredseed =
ooms =
bwops =
bweds =
bestmin0 =
bestmin1 =
bestmin2 = 0;
}
uint64_t seedsearch; // # times we executed strategy in InstantiatedSeed
uint64_t possearch; // # offsets where aligner executed >= 1 strategy
uint64_t intrahit; // # offsets where current-read cache gave answer
uint64_t interhit; // # offsets where across-read cache gave answer
uint64_t filteredseed; // # seed instantiations skipped due to Ns
uint64_t ooms; // out-of-memory errors
uint64_t bwops; // Burrows-Wheeler operations
uint64_t bweds; // Burrows-Wheeler edits
uint64_t bestmin0; // # times the best min # edits was 0
uint64_t bestmin1; // # times the best min # edits was 1
uint64_t bestmin2; // # times the best min # edits was 2
MUTEX_T mutex_m;
};
/**
* Given an index and a seeding scheme, searches for seed hits.
*/
template <typename index_t>
class SeedAligner {
public:
/**
* Initialize with index.
*/
SeedAligner() : edits_(AL_CAT), offIdx2off_(AL_CAT) { }
/**
* Given a read and a few coordinates that describe a substring of the
* read (or its reverse complement), fill in 'seq' and 'qual' objects
* with the seed sequence and qualities.
*/
void instantiateSeq(
const Read& read, // input read
BTDnaString& seq, // output sequence
BTString& qual, // output qualities
int len, // seed length
int depth, // seed's 0-based offset from 5' end
bool fw) const; // seed's orientation
/**
* Iterate through the seeds that cover the read and initiate a
* search for each seed.
*/
std::pair<int, int> instantiateSeeds(
const EList<Seed>& seeds, // search seeds
index_t off, // offset into read to start extracting
int per, // interval between seeds
const Read& read, // read to align
const Scoring& pens, // scoring scheme
bool nofw, // don't align forward read
bool norc, // don't align revcomp read
AlignmentCacheIface<index_t>& cache, // holds some seed hits from previous reads
SeedResults<index_t>& sr, // holds all the seed hits
SeedSearchMetrics& met); // metrics
/**
* Iterate through the seeds that cover the read and initiate a
* search for each seed.
*/
void searchAllSeeds(
const EList<Seed>& seeds, // search seeds
const Ebwt<index_t>* ebwtFw, // BWT index
const Ebwt<index_t>* ebwtBw, // BWT' index
const Read& read, // read to align
const Scoring& pens, // scoring scheme
AlignmentCacheIface<index_t>& cache, // local seed alignment cache
SeedResults<index_t>& hits, // holds all the seed hits
SeedSearchMetrics& met, // metrics
PerReadMetrics& prm); // per-read metrics
/**
* Sanity-check a partial alignment produced during oneMmSearch.
*/
bool sanityPartial(
const Ebwt<index_t>* ebwtFw, // BWT index
const Ebwt<index_t>* ebwtBw, // BWT' index
const BTDnaString& seq,
index_t dep,
index_t len,
bool do1mm,
index_t topfw,
index_t botfw,
index_t topbw,
index_t botbw);
/**
* Do an exact-matching sweet to establish a lower bound on number of edits
* and to find exact alignments.
*/
size_t exactSweep(
const Ebwt<index_t>& ebwt, // BWT index
const Read& read, // read to align
const Scoring& sc, // scoring scheme
bool nofw, // don't align forward read
bool norc, // don't align revcomp read
size_t mineMax, // don't care about edit bounds > this
size_t& mineFw, // minimum # edits for forward read
size_t& mineRc, // minimum # edits for revcomp read
bool repex, // report 0mm hits?
SeedResults<index_t>& hits, // holds all the seed hits (and exact hit)
SeedSearchMetrics& met); // metrics
/**
* Search for end-to-end alignments with up to 1 mismatch.
*/
bool oneMmSearch(
const Ebwt<index_t>* ebwtFw, // BWT index
const Ebwt<index_t>* ebwtBw, // BWT' index
const Read& read, // read to align
const Scoring& sc, // scoring
int64_t minsc, // minimum score
bool nofw, // don't align forward read
bool norc, // don't align revcomp read
bool local, // 1mm hits must be legal local alignments
bool repex, // report 0mm hits?
bool rep1mm, // report 1mm hits?
SeedResults<index_t>& hits, // holds all the seed hits (and exact hit)
SeedSearchMetrics& met); // metrics
protected:
/**
* Report a seed hit found by searchSeedBi(), but first try to extend it out in
* either direction as far as possible without hitting any edits. This will
* allow us to prioritize the seed hits better later on. Call reportHit() when
* we're done, which actually adds the hit to the cache. Returns result from
* calling reportHit().
*/
bool extendAndReportHit(
index_t topf, // top in BWT
index_t botf, // bot in BWT
index_t topb, // top in BWT'
index_t botb, // bot in BWT'
index_t len, // length of hit
DoublyLinkedList<Edit> *prevEdit); // previous edit
/**
* Report a seed hit found by searchSeedBi() by adding it to the cache. Return
* false if the hit could not be reported because of, e.g., cache exhaustion.
*/
bool reportHit(
index_t topf, // top in BWT
index_t botf, // bot in BWT
index_t topb, // top in BWT'
index_t botb, // bot in BWT'
index_t len, // length of hit
DoublyLinkedList<Edit> *prevEdit); // previous edit
/**
* Given an instantiated seed (in s_ and other fields), search
*/
bool searchSeedBi();
/**
* Main, recursive implementation of the seed search.
*/
bool searchSeedBi(
int step, // depth into steps_[] array
int depth, // recursion depth
index_t topf, // top in BWT
index_t botf, // bot in BWT
index_t topb, // top in BWT'
index_t botb, // bot in BWT'
SideLocus<index_t> tloc, // locus for top (perhaps unititialized)
SideLocus<index_t> bloc, // locus for bot (perhaps unititialized)
Constraint c0, // constraints to enforce in seed zone 0
Constraint c1, // constraints to enforce in seed zone 1
Constraint c2, // constraints to enforce in seed zone 2
Constraint overall, // overall constraints
DoublyLinkedList<Edit> *prevEdit); // previous edit
/**
* Get tloc and bloc ready for the next step.
*/
inline void nextLocsBi(
SideLocus<index_t>& tloc, // top locus
SideLocus<index_t>& bloc, // bot locus
index_t topf, // top in BWT
index_t botf, // bot in BWT
index_t topb, // top in BWT'
index_t botb, // bot in BWT'
int step); // step to get ready for
// Following are set in searchAllSeeds then used by searchSeed()
// and other protected members.
const Ebwt<index_t>* ebwtFw_; // forward index (BWT)
const Ebwt<index_t>* ebwtBw_; // backward/mirror index (BWT')
const Scoring* sc_; // scoring scheme
const InstantiatedSeed* s_; // current instantiated seed
const Read* read_; // read whose seeds are currently being aligned
// The following are set just before a call to searchSeedBi()
const BTDnaString* seq_; // sequence of current seed
const BTString* qual_; // quality string for current seed
index_t off_; // offset of seed currently being searched
bool fw_; // orientation of seed currently being searched
EList<Edit> edits_; // temporary place to sort edits
AlignmentCacheIface<index_t> *ca_; // local alignment cache for seed alignments
EList<index_t> offIdx2off_; // offset idx to read offset map, set up instantiateSeeds()
uint64_t bwops_; // Burrows-Wheeler operations
uint64_t bwedits_; // Burrows-Wheeler edits
BTDnaString tmprfdnastr_; // used in reportHit
ASSERT_ONLY(ESet<BTDnaString> hits_); // Ref hits so far for seed being aligned
BTDnaString tmpdnastr_;
};
/**
 * Prepare the pair of side loci (tloc, bloc) for the BW range [top, bot) of
 * index 'e'.  When the range holds exactly one row only 'tloc' is set up and
 * 'bloc' is invalidated; otherwise both loci are initialized.
 *
 * Every argument is parenthesized in the expansion so that expressions such
 * as "a + b" can be passed safely.  Arguments are evaluated more than once,
 * so they must not have side effects.  The expansion refers to the name
 * 'index_t', which must be in scope at the point of use.
 */
#define INIT_LOCS(top, bot, tloc, bloc, e) { \
    if((bot) - (top) == 1) { \
        (tloc).initFromRow((top), (e).eh(), (e).ebwt()); \
        (bloc).invalidate(); \
    } else { \
        SideLocus<index_t>::initFromTopBot((top), (bot), (e).eh(), (e).ebwt(), (tloc), (bloc)); \
        assert((bloc).valid()); \
    } \
}
/**
 * Debug-only check that the four per-character ranges [t[c], b[c]) cover the
 * same total number of rows as the four ranges [tp[c], bp[c]).
 *
 * Arguments are parenthesized in the expansion and evaluated several times;
 * pass plain array names or pointers.  The expansion refers to the name
 * 'index_t', which must be in scope at the point of use.
 */
#define SANITY_CHECK_4TUP(t, b, tp, bp) { \
    ASSERT_ONLY(index_t tot = ((b)[0]-(t)[0])+((b)[1]-(t)[1])+((b)[2]-(t)[2])+((b)[3]-(t)[3])); \
    ASSERT_ONLY(index_t totp = ((bp)[0]-(tp)[0])+((bp)[1]-(tp)[1])+((bp)[2]-(tp)[2])+((bp)[3]-(tp)[3])); \
    assert_eq(tot, totp); \
}
/**
 * Given a read and a few coordinates that describe a substring of the read (or
 * its reverse complement), fill in 'seq' and 'qual' objects with the seed
 * sequence and qualities.
 *
 * The seq field is filled with the sequence as it would align to the Watson
 * reference strand.  I.e. if fw is false, then the sequence that appears in
 * 'seq' is the reverse complement of the raw read substring.
 *
 * Both outputs are resized to exactly 'len' entries.  No clamping of 'len' to
 * the read length happens here (the previous version computed a clamped value
 * but never used it).
 * NOTE(review): the window helpers presumably require the window
 * [depth, depth+len) to lie inside the read; instantiateSeeds() already passes
 * min(seed length, read length) -- confirm for other callers.
 */
template <typename index_t>
void SeedAligner<index_t>::instantiateSeq(
    const Read& read, // input read
    BTDnaString& seq, // output sequence
    BTString& qual,   // output qualities
    int len,          // seed length
    int depth,        // seed's 0-based offset from 5' end
    bool fw) const    // seed's orientation
{
    seq.resize(len);
    qual.resize(len);
    // Element i of the outputs is element i of the (depth, len) window of the
    // read, taken in the orientation selected by 'fw'.
    for(int i = 0; i < len; i++) {
        seq.set(read.patFw.windowGetDna(i, fw, read.color, depth, len), i);
        qual.set(read.qual.windowGet(i, fw, depth, len), i);
    }
}
/**
 * Lay seeds across the read and instantiate every search strategy at every
 * seed offset, for each orientation that is not disabled.
 *
 * All seeds are assumed to have the same length (checked in debug builds).
 * Seed offsets are off, off+per, off+2*per, ...; there is always at least
 * one.  The offsets are recorded in offIdx2off_ and 'sr' is reset for this
 * read before any seed is instantiated.
 *
 * Returns a pair whose first member is the number of seeds that were
 * successfully instantiated.  The second member is initialized to 0 and is
 * never changed by this function.  Seeds that fail to instantiate are counted
 * in met.filteredseed and removed again from the per-offset list.
 */
template <typename index_t>
pair<int, int> SeedAligner<index_t>::instantiateSeeds(
    const EList<Seed>& seeds,  // search seeds
    index_t off,               // offset into read to start extracting
    int per,                   // interval between seeds
    const Read& read,          // read to align
    const Scoring& pens,       // scoring scheme
    bool nofw,                 // don't align forward read
    bool norc,                 // don't align revcomp read
    AlignmentCacheIface<index_t>& cache,// holds some seed hits from previous reads
    SeedResults<index_t>& sr,  // holds all the seed hits
    SeedSearchMetrics& met)    // metrics
{
    assert(!seeds.empty());
    assert_gt(read.length(), 0);
    offIdx2off_.clear();
    // The first seed's length stands in for all of them
    const int len = seeds[0].len;
#ifndef NDEBUG
    for(size_t k = 1; k < seeds.size(); k++) {
        assert_eq(len, seeds[k].len);
    }
#endif
    const int rdlen = (int)read.length();
    const int start = (int)off;
    // One seed always; more if the read extends past the first seed
    int nseeds = 1;
    if(rdlen - start > len) {
        nseeds += (rdlen - start - len) / per;
    }
    for(int k = 0; k < nseeds; k++) {
        offIdx2off_.push_back(per * k + start);
    }
    pair<int, int> ret(0, 0);
    sr.reset(read, offIdx2off_, nseeds);
    assert(sr.repOk(&cache.current(), true)); // require that SeedResult be initialized
    const int nstrategies = (int)seeds.size();
    // Orientation 0 is the forward read, orientation 1 the reverse complement
    for(int ori = 0; ori < 2; ori++) {
        const bool fw = (ori == 0);
        const bool disabled = fw ? nofw : norc;
        if(disabled) {
            continue;
        }
        for(int i = 0; i < nseeds; i++) {
            const int depth = i * per + start;
            // Pull the seed's bases and qualities out of the read
            instantiateSeq(
                read,
                sr.seqs(fw)[i],
                sr.quals(fw)[i],
                std::min<int>(len, rdlen),
                depth,
                fw);
            EList<InstantiatedSeed>& iss = sr.instantiatedSeeds(fw, i);
            for(int j = 0; j < nstrategies; j++) {
                // Tentatively append a slot; it is dropped if instantiation fails
                iss.expand();
                assert_eq(len, seeds[j].len);
                InstantiatedSeed& slot = iss.back();
                const bool ok = seeds[j].instantiate(
                    read,
                    sr.seqs(fw)[i],
                    sr.quals(fw)[i],
                    pens,
                    depth,
                    i,
                    j,
                    fw,
                    slot);
                if(!ok) {
                    // Seed may fail to instantiate if there are Ns that
                    // prevent it from matching
                    met.filteredseed++;
                    iss.pop_back();
                    continue;
                }
                ret.first++;
            }
        }
    }
    return ret;
}
/**
 * Run the seed search for every instantiated seed stored in 'sr'.
 *
 * For each seed offset and each orientation that has at least one
 * instantiated seed:
 *
 * 1. Tell the cache that alignment of this seed sequence is beginning.
 * 2. If the cache does not already hold an answer, run searchSeedBi() once per
 *    instantiated seed and then finish the cache entry.
 * 3. If a valid cache value results, add it to 'sr'.
 *
 * Afterwards the per-read metrics in 'prm' are filled from 'sr' and the
 * counters gathered here are added to 'met'.  The local across-read-hit
 * counter is never incremented in this function, so met.interhit is unchanged.
 */
template <typename index_t>
void SeedAligner<index_t>::searchAllSeeds(
    const EList<Seed>& seeds,     // search seeds
    const Ebwt<index_t>* ebwtFw,  // BWT index
    const Ebwt<index_t>* ebwtBw,  // BWT' index
    const Read& read,             // read to align
    const Scoring& pens,          // scoring scheme
    AlignmentCacheIface<index_t>& cache, // local cache for seed alignments
    SeedResults<index_t>& sr,     // holds all the seed hits
    SeedSearchMetrics& met,       // metrics
    PerReadMetrics& prm)          // per-read metrics
{
    assert(!seeds.empty());
    assert(ebwtFw != NULL);
    assert(ebwtFw->isInMemory());
    assert(sr.repOk(&cache.current()));
    // Publish the inputs to the members read by the search helpers
    ebwtFw_ = ebwtFw;
    ebwtBw_ = ebwtBw;
    sc_ = &pens;
    read_ = &read;
    ca_ = &cache;
    bwops_ = 0;
    bwedits_ = 0;
    uint64_t nPosSearch = 0;
    uint64_t nSeedSearch = 0;
    uint64_t nIntraHit = 0;
    uint64_t nInterHit = 0;
    uint64_t nOom = 0;
    for(int i = 0; i < (int)sr.numOffs(); i++) {
        const size_t off = sr.idx2off(i);
        for(int ori = 0; ori < 2; ori++) {
            const bool fw = (ori == 0);
            assert(sr.repOk(&cache.current()));
            EList<InstantiatedSeed>& iss = sr.instantiatedSeeds(fw, i);
            if(iss.empty()) {
                // Nothing was instantiated for this offset/orientation
                continue;
            }
            QVal<index_t> qv;
            seq_ = &sr.seqs(fw)[i];   // seed sequence
            qual_ = &sr.quals(fw)[i]; // seed qualities
            off_ = off;               // seed offset (from 5')
            fw_ = fw;                 // seed orientation
            // Announce the start of alignment so the cache can accept
            // on-the-fly updates for this seed sequence
            const int status = cache.beginAlign(*seq_, *qual_, qv);
            ASSERT_ONLY(hits_.clear());
            if(status == -1) {
                // Out of memory when we tried to add key to map
                nOom++;
                continue;
            }
            bool abort = false;
            if(status != 0) {
                // Already in cache
                assert_eq(1, status);
                assert(qv.valid());
                nIntraHit++;
            } else {
                // Not already in cache; search every instantiated seed
                assert(cache.aligning());
                nPosSearch++;
                for(size_t j = 0; j < iss.size() && !abort; j++) {
                    assert_eq(fw, iss[j].fw);
                    assert_eq(i, (int)iss[j].seedoffidx);
                    s_ = &iss[j];
                    // Search with respect to seq_, qual_ and s_
                    if(searchSeedBi()) {
                        nSeedSearch++;
                        assert(cache.aligning());
                    } else {
                        // Memory exhausted during search
                        nOom++;
                        abort = true;
                    }
                }
                if(!abort) {
                    qv = cache.finishAlign();
                }
            }
            assert(abort || !cache.aligning());
            if(qv.valid()) {
                sr.add(
                    qv,              // range of ranges in cache
                    cache.current(), // cache
                    i,               // seed index (from 5' end)
                    fw);             // whether seed is from forward read
            }
        }
    }
    // Per-read summary
    prm.nSeedRanges = sr.numRanges();
    prm.nSeedElts = sr.numElts();
    prm.nSeedRangesFw = sr.numRangesFw();
    prm.nSeedRangesRc = sr.numRangesRc();
    prm.nSeedEltsFw = sr.numEltsFw();
    prm.nSeedEltsRc = sr.numEltsRc();
    prm.seedMedian = (uint64_t)(sr.medianHitsPerSeed() + 0.5);
    prm.seedMean = (uint64_t)sr.averageHitsPerSeed();
    prm.nSdFmops += bwops_;
    // Global summary
    met.seedsearch += nSeedSearch;
    met.possearch += nPosSearch;
    met.intrahit += nIntraHit;
    met.interhit += nInterHit;
    met.ooms += nOom;
    met.bwops += bwops_;
    met.bweds += bwedits_;
}
/**
 * Sanity-check a partial alignment produced during oneMmSearch.
 *
 * Looks up seq[dep, len) in the forward index and asserts that the resulting
 * range equals [topfw, botfw).  When 'do1mm' is set and a mirror index is
 * supplied, the reversed substring is also looked up in the mirror index and
 * compared against [topbw, botbw).  Always returns true so that the call can
 * be wrapped in assert(); mismatches are reported via the inner assertions.
 */
template <typename index_t>
bool SeedAligner<index_t>::sanityPartial(
    const Ebwt<index_t>* ebwtFw, // BWT index
    const Ebwt<index_t>* ebwtBw, // BWT' index
    const BTDnaString& seq,
    index_t dep,
    index_t len,
    bool do1mm,
    index_t topfw,
    index_t botfw,
    index_t topbw,
    index_t botbw)
{
    // Copy the substring under test into the scratch buffer
    tmpdnastr_.clear();
    size_t pos = dep;
    while(pos < len) {
        tmpdnastr_.append(seq[pos]);
        pos++;
    }
    // Forward index
    index_t fwTop = 0, fwBot = 0;
    ebwtFw->contains(tmpdnastr_, &fwTop, &fwBot);
    assert_eq(fwTop, topfw);
    assert_eq(fwBot, botfw);
    if(!do1mm || ebwtBw == NULL) {
        return true;
    }
    // Mirror index sees the substring reversed
    tmpdnastr_.reverse();
    index_t bwTop = 0, bwBot = 0;
    ebwtBw->contains(tmpdnastr_, &bwTop, &bwBot);
    assert_eq(bwTop, topbw);
    assert_eq(bwBot, botbw);
    return true;
}
/**
 * Sweep each permitted orientation of the read with exact matching, walking
 * from the last character of the sequence toward the first (position
 * len-dep-1 for depth 'dep').  Report exact end-to-end matches if there are
 * any and calculate a lower bound on the number of edits in an end-to-end
 * alignment.
 *
 * Each time the current BW range becomes empty, the edit count for that
 * orientation is incremented and matching restarts from scratch at the next
 * position.  The sweep of an orientation stops early once the edit count
 * reaches 'mineMax'.
 *
 * mineFw / mineRc receive the edit count of the forward / revcomp sweep when
 * that sweep either hits 'mineMax' or reaches the end of the read inside the
 * loop body.
 * NOTE(review): if a mismatch is charged at the very last position and the
 * count is still below 'mineMax', the outer loop ends via its 'dep < len'
 * condition and mineFw / mineRc is left untouched -- callers appear to need
 * to pre-initialize them; confirm.
 *
 * When an orientation matches end-to-end with zero edits, the final range is
 * added to 'hits' (only if 'repex') and its size is added to the return value.
 *
 * 'met' is not used by this function.
 *
 * Returns the total number of BW elements in exact end-to-end hits.
 */
template <typename index_t>
size_t SeedAligner<index_t>::exactSweep(
const Ebwt<index_t>& ebwt, // BWT index
const Read& read, // read to align
const Scoring& sc, // scoring scheme
bool nofw, // don't align forward read
bool norc, // don't align revcomp read
size_t mineMax, // don't care about edit bounds > this
size_t& mineFw, // minimum # edits for forward read
size_t& mineRc, // minimum # edits for revcomp read
bool repex, // report 0mm hits?
SeedResults<index_t>& hits, // holds all the seed hits (and exact hit)
SeedSearchMetrics& met) // metrics
{
assert_gt(mineMax, 0);
index_t top = 0, bot = 0;
SideLocus<index_t> tloc, bloc;
const size_t len = read.length();
// Running total of exact end-to-end elements over both orientations
size_t nelt = 0;
for(int fwi = 0; fwi < 2; fwi++) {
bool fw = (fwi == 0);
if( fw && nofw) continue;
if(!fw && norc) continue;
const BTDnaString& seq = fw ? read.patFw : read.patRc;
assert(!seq.empty());
int ftabLen = ebwt.eh().ftabChars();
// dep = number of characters consumed so far, counted from the end
size_t dep = 0;
// nedit = number of times the range went empty in this orientation
size_t nedit = 0;
bool done = false;
while(dep < len && !done) {
// (Re)start an exact match at the current depth
top = bot = 0;
size_t left = len - dep;
assert_gt(left, 0);
bool doFtab = ftabLen > 1 && left >= (size_t)ftabLen;
if(doFtab) {
// Does N interfere with use of Ftab?
for(size_t i = 0; i < (size_t)ftabLen; i++) {
int c = seq[len-dep-1-i];
if(c > 3) {
doFtab = false;
break;
}
}
}
if(doFtab) {
// Use ftab
ebwt.ftabLoHi(seq, len - dep - ftabLen, false, top, bot);
dep += (size_t)ftabLen;
} else {
// Use fchr
int c = seq[len-dep-1];
if(c < 4) {
top = ebwt.fchr()[c];
bot = ebwt.fchr()[c+1];
}
dep++;
}
if(bot <= top) {
// Initial jump found nothing; charge an edit and restart further in
nedit++;
if(nedit >= mineMax) {
if(fw) { mineFw = nedit; } else { mineRc = nedit; }
break;
}
continue;
}
INIT_LOCS(top, bot, tloc, bloc, ebwt);
// Keep going
while(dep < len) {
int c = seq[len-dep-1];
if(c > 3) {
// Non-ACGT character can never match
top = bot = 0;
} else {
if(bloc.valid()) {
// Range has more than one row: map both boundaries
bwops_ += 2;
top = ebwt.mapLF(tloc, c);
bot = ebwt.mapLF(bloc, c);
} else {
// Range is a single row
bwops_++;
top = ebwt.mapLF1(top, tloc, c);
if(top == (index_t)OFF_MASK) {
top = bot = 0;
} else {
bot = top+1;
}
}
}
if(bot <= top) {
// Range went empty at this position; 'dep' is not advanced here, the
// outer loop's trailing dep++ skips over the offending character
nedit++;
if(nedit >= mineMax) {
if(fw) { mineFw = nedit; } else { mineRc = nedit; }
done = true;
}
break;
}
INIT_LOCS(top, bot, tloc, bloc, ebwt);
dep++;
}
if(done) {
break;
}
if(dep == len) {
// Set the minimum # edits
if(fw) { mineFw = nedit; } else { mineRc = nedit; }
// Done
if(nedit == 0 && bot > top) {
if(repex) {
// This is an exact hit
int64_t score = len * sc.match();
if(fw) {
hits.addExactEeFw(top, bot, NULL, NULL, fw, score);
assert(ebwt.contains(seq, NULL, NULL));
} else {
hits.addExactEeRc(top, bot, NULL, NULL, fw, score);
assert(ebwt.contains(seq, NULL, NULL));
}
}
nelt += (bot - top);
}
break;
}
// Step past the character that emptied the range
dep++;
}
}
return nelt;
}
/**
 * Search for end-to-end alignments of the read with at most one mismatch.
 *
 * For each permitted read orientation the read is matched right-to-left
 * against the forward index and, when 'rep1mm' is set, also against the mirror
 * index (using the reversed sequence).  The "near" half of the sequence must
 * match exactly; in the "far" half each position is additionally tried as the
 * single mismatch, and the remainder is then extended exactly.
 *
 * Reads with more than one N, or with exactly one N when 'rep1mm' is false,
 * are rejected immediately (returns false).
 *
 * Exact end-to-end hits are added to 'hits' only from the forward-index pass
 * and only if 'repex' is set.  1-mismatch hits are added to 'hits' only if
 * their score is at least 'minsc' and, when 'local' is set, the running score
 * never drops to 0 or below at the mismatch when scanned from either end.
 *
 * Returns true iff an exact hit was reported or some 1-mismatch path reached
 * the end of the read.  Note that the return flag is set for a 1-mismatch path
 * before the 'local' / 'minsc' filters are applied.
 *
 * 'met' is not used by this function.
 */
template <typename index_t>
bool SeedAligner<index_t>::oneMmSearch(
    const Ebwt<index_t>* ebwtFw, // BWT index
    const Ebwt<index_t>* ebwtBw, // BWT' index
    const Read& read,            // read to align
    const Scoring& sc,           // scoring
    int64_t minsc,               // minimum score
    bool nofw,                   // don't align forward read
    bool norc,                   // don't align revcomp read
    bool local,                  // 1mm hits must be legal local alignments
    bool repex,                  // report 0mm hits?
    bool rep1mm,                 // report 1mm hits?
    SeedResults<index_t>& hits,  // holds all the seed hits (and exact hit)
    SeedSearchMetrics& met)      // metrics
{
    assert(!rep1mm || ebwtBw != NULL);
    const size_t len = read.length();
    int nceil = sc.nCeil.f<int>((double)len);
    size_t ns = read.ns();
    if(ns > 1) {
        // Can't align this with <= 1 mismatches
        return false;
    } else if(ns == 1 && !rep1mm) {
        // Can't align this with 0 mismatches
        return false;
    }
    assert_geq(len, 2);
    assert(!rep1mm || ebwtBw->eh().ftabChars() == ebwtFw->eh().ftabChars());
#ifndef NDEBUG
    if(ebwtBw != NULL) {
        for(int i = 0; i < 4; i++) {
            assert_eq(ebwtBw->fchr()[i], ebwtFw->fchr()[i]);
        }
    }
#endif
    // Near-half lengths; the mirror pass takes the extra character of an
    // odd-length read so that the two far halves together cover every position
    size_t halfFw = len >> 1;
    size_t halfBw = len >> 1;
    if((len & 1) != 0) {
        halfBw++;
    }
    assert_geq(halfFw, 1);
    assert_geq(halfBw, 1);
    SideLocus<index_t> tloc, bloc;
    index_t t[4], b[4];   // dest BW ranges for BWT
    t[0] = t[1] = t[2] = t[3] = 0;
    b[0] = b[1] = b[2] = b[3] = 0;
    index_t tp[4], bp[4]; // dest BW ranges for BWT'
    tp[0] = tp[1] = tp[2] = tp[3] = 0;
    bp[0] = bp[1] = bp[2] = bp[3] = 0;
    index_t top = 0, bot = 0, topp = 0, botp = 0;
    // Align fw read / rc read
    bool results = false;
    for(int fwi = 0; fwi < 2; fwi++) {
        bool fw = (fwi == 0);
        if( fw && nofw) continue;
        if(!fw && norc) continue;
        // Align going right-to-left, left-to-right
        int lim = rep1mm ? 2 : 1;
        for(int ebwtfwi = 0; ebwtfwi < lim; ebwtfwi++) {
            bool ebwtfw = (ebwtfwi == 0);
            // 'ebwt' is the index being walked, 'ebwtp' its counterpart
            const Ebwt<index_t>* ebwt  = (ebwtfw ? ebwtFw : ebwtBw);
            const Ebwt<index_t>* ebwtp = (ebwtfw ? ebwtBw : ebwtFw);
            assert(rep1mm || ebwt->fw());
            const BTDnaString& seq =
                (fw ? (ebwtfw ? read.patFw : read.patFwRev) :
                      (ebwtfw ? read.patRc : read.patRcRev));
            assert(!seq.empty());
            const BTString& qual =
                (fw ? (ebwtfw ? read.qual    : read.qualRev) :
                      (ebwtfw ? read.qualRev : read.qual));
            int ftabLen = ebwt->eh().ftabChars();
            size_t nea = ebwtfw ? halfFw : halfBw;
            // Check if there's an N in the near portion
            bool skip = false;
            for(size_t dep = 0; dep < nea; dep++) {
                if(seq[len-dep-1] > 3) {
                    skip = true;
                    break;
                }
            }
            if(skip) {
                continue;
            }
            size_t dep = 0;
            // Align near half
            if(ftabLen > 1 && (size_t)ftabLen <= nea) {
                // Use ftab to jump partway into near half
                bool rev = !ebwtfw;
                ebwt->ftabLoHi(seq, len - ftabLen, rev, top, bot);
                if(rep1mm) {
                    ebwtp->ftabLoHi(seq, len - ftabLen, rev, topp, botp);
                    assert_eq(bot - top, botp - topp);
                }
                if(bot - top == 0) {
                    continue;
                }
                int c = seq[len - ftabLen];
                t[c] = top; b[c] = bot;
                tp[c] = topp; bp[c] = botp;
                dep = ftabLen;
            } else {
                // Use fchr to jump in by 1 pos
                int c = seq[len-1];
                assert_range(0, 3, c);
                top = topp = tp[c] = ebwt->fchr()[c];
                bot = botp = bp[c] = ebwt->fchr()[c+1];
                if(bot - top == 0) {
                    continue;
                }
                dep = 1;
            }
            INIT_LOCS(top, bot, tloc, bloc, *ebwt);
            assert(sanityPartial(ebwt, ebwtp, seq, len-dep, len, rep1mm, top, bot, topp, botp));
            // Rest of the near half: exact matching only
            bool do_continue = false;
            for(; dep < nea; dep++) {
                assert_lt(dep, len);
                int rdc = seq[len - dep - 1];
                tp[0] = tp[1] = tp[2] = tp[3] = topp;
                bp[0] = bp[1] = bp[2] = bp[3] = botp;
                if(bloc.valid()) {
                    bwops_++;
                    t[0] = t[1] = t[2] = t[3] = b[0] = b[1] = b[2] = b[3] = 0;
                    ebwt->mapBiLFEx(tloc, bloc, t, b, tp, bp);
                    SANITY_CHECK_4TUP(t, b, tp, bp);
                    top = t[rdc]; bot = b[rdc];
                    if(bot <= top) {
                        do_continue = true;
                        break;
                    }
                    topp = tp[rdc]; botp = bp[rdc];
                    assert(!rep1mm || bot - top == botp - topp);
                } else {
                    assert_eq(bot, top+1);
                    assert(!rep1mm || botp == topp+1);
                    bwops_++;
                    top = ebwt->mapLF1(top, tloc, rdc);
                    if(top == (index_t)OFF_MASK) {
                        do_continue = true;
                        break;
                    }
                    bot = top + 1;
                    t[rdc] = top; b[rdc] = bot;
                    tp[rdc] = topp; bp[rdc] = botp;
                    assert(!rep1mm || b[rdc] - t[rdc] == bp[rdc] - tp[rdc]);
                    // topp/botp stay the same
                }
                INIT_LOCS(top, bot, tloc, bloc, *ebwt);
                assert(sanityPartial(ebwt, ebwtp, seq, len - dep - 1, len, rep1mm, top, bot, topp, botp));
            }
            if(do_continue) {
                continue;
            }
            // Align far half
            for(; dep < len; dep++) {
                int rdc = seq[len-dep-1];
                int quc = qual[len-dep-1];
                if(rdc > 3 && nceil == 0) {
                    break;
                }
                tp[0] = tp[1] = tp[2] = tp[3] = topp;
                bp[0] = bp[1] = bp[2] = bp[3] = botp;
                // [clo, chi] is the set of reference characters to consider
                // as the mismatched character at this position
                int clo = 0, chi = 3;
                bool match = true;
                if(bloc.valid()) {
                    bwops_++;
                    t[0] = t[1] = t[2] = t[3] = b[0] = b[1] = b[2] = b[3] = 0;
                    ebwt->mapBiLFEx(tloc, bloc, t, b, tp, bp);
                    SANITY_CHECK_4TUP(t, b, tp, bp);
                    match = rdc < 4;
                    top = t[rdc]; bot = b[rdc];
                    topp = tp[rdc]; botp = bp[rdc];
                } else {
                    assert_eq(bot, top+1);
                    assert(!rep1mm || botp == topp+1);
                    bwops_++;
                    clo = ebwt->mapLF1(top, tloc);
                    match = (clo == rdc);
                    assert_range(-1, 3, clo);
                    if(clo < 0) {
                        break; // Hit the $
                    } else {
                        t[clo] = top;
                        b[clo] = bot = top + 1;
                    }
                    bp[clo] = botp;
                    tp[clo] = topp;
                    assert(!rep1mm || bot - top == botp - topp);
                    assert(!rep1mm || b[clo] - t[clo] == bp[clo] - tp[clo]);
                    chi = clo;
                }
                if(rep1mm && (ns == 0 || rdc > 3)) {
                    for(int j = clo; j <= chi; j++) {
                        if(j == rdc || b[j] == t[j]) {
                            // Either matches read or isn't a possibility
                            continue;
                        }
                        // Potential mismatch - next, try to extend the rest
                        // of the read exactly
                        size_t depm = dep + 1;
                        index_t topm = t[j], botm = b[j];
                        index_t topmp = tp[j], botmp = bp[j];
                        assert_eq(botm - topm, botmp - topmp);
                        index_t tm[4], bm[4];   // dest BW ranges for BWT
                        tm[0] = t[0]; tm[1] = t[1];
                        tm[2] = t[2]; tm[3] = t[3];
                        bm[0] = b[0]; bm[1] = b[1];
                        bm[2] = b[2]; bm[3] = b[3];
                        index_t tmp[4], bmp[4]; // dest BW ranges for BWT'
                        tmp[0] = tp[0]; tmp[1] = tp[1];
                        tmp[2] = tp[2]; tmp[3] = tp[3];
                        bmp[0] = bp[0]; bmp[1] = bp[1];
                        bmp[2] = bp[2]; bmp[3] = bp[3];
                        SideLocus<index_t> tlocm, blocm;
                        INIT_LOCS(topm, botm, tlocm, blocm, *ebwt);
                        for(; depm < len; depm++) {
                            int rdcm = seq[len - depm - 1];
                            tmp[0] = tmp[1] = tmp[2] = tmp[3] = topmp;
                            bmp[0] = bmp[1] = bmp[2] = bmp[3] = botmp;
                            if(blocm.valid()) {
                                bwops_++;
                                tm[0] = tm[1] = tm[2] = tm[3] =
                                bm[0] = bm[1] = bm[2] = bm[3] = 0;
                                ebwt->mapBiLFEx(tlocm, blocm, tm, bm, tmp, bmp);
                                SANITY_CHECK_4TUP(tm, bm, tmp, bmp);
                                topm = tm[rdcm]; botm = bm[rdcm];
                                topmp = tmp[rdcm]; botmp = bmp[rdcm];
                                if(botm <= topm) {
                                    break;
                                }
                            } else {
                                assert_eq(botm, topm+1);
                                assert_eq(botmp, topmp+1);
                                bwops_++;
                                topm = ebwt->mapLF1(topm, tlocm, rdcm);
                                // Same "no match" sentinel as the other
                                // mapLF1 call sites; a literal 0xffffffff
                                // would miss it for wider index types
                                if(topm == (index_t)OFF_MASK) {
                                    break;
                                }
                                botm = topm + 1;
                                // topmp/botmp stay the same
                            }
                            INIT_LOCS(topm, botm, tlocm, blocm, *ebwt);
                        }
                        if(depm == len) {
                            // Success; this is a 1MM hit
                            size_t off5p = dep; // offset from 5' end of read
                            if(fw == ebwtfw) {
                                off5p = len - off5p - 1;
                            }
                            Edit e((uint32_t)off5p, j, rdc, EDIT_TYPE_MM, false);
                            results = true;
                            int64_t score = (len - 1) * sc.match();
                            // In --local mode, need to double-check that
                            // end-to-end alignment doesn't violate local
                            // alignment principles.  Specifically, it
                            // shouldn't dip to or below 0 anywhere in the
                            // middle.
                            int pen = sc.score(rdc, (int)(1 << j), quc - 33);
                            score += pen;
                            bool valid = true;
                            if(local) {
                                int64_t locscore_fw = 0, locscore_bw = 0;
                                for(size_t i = 0; i < len; i++) {
                                    if(i == dep) {
                                        if(locscore_fw + pen <= 0) {
                                            valid = false;
                                            break;
                                        }
                                        locscore_fw += pen;
                                    } else {
                                        locscore_fw += sc.match();
                                    }
                                    if(len-i-1 == dep) {
                                        if(locscore_bw + pen <= 0) {
                                            valid = false;
                                            break;
                                        }
                                        locscore_bw += pen;
                                    } else {
                                        locscore_bw += sc.match();
                                    }
                                }
                            }
                            if(valid) {
                                valid = score >= minsc;
                            }
                            if(valid) {
#ifndef NDEBUG
                                BTDnaString& rf = tmprfdnastr_;
                                rf.clear();
                                edits_.clear();
                                edits_.push_back(e);
                                if(!fw) Edit::invertPoss(edits_, len, false);
                                Edit::toRef(fw ? read.patFw : read.patRc, edits_, rf);
                                if(!fw) Edit::invertPoss(edits_, len, false);
                                assert_eq(len, rf.length());
                                for(size_t i = 0; i < len; i++) {
                                    assert_lt((int)rf[i], 4);
                                }
                                ASSERT_ONLY(index_t toptmp = 0);
                                ASSERT_ONLY(index_t bottmp = 0);
                                assert(ebwtFw->contains(rf, &toptmp, &bottmp));
#endif
                                // Always report the range in forward-index
                                // coordinates
                                index_t toprep = ebwtfw ? topm : topmp;
                                index_t botrep = ebwtfw ? botm : botmp;
                                assert_eq(toprep, toptmp);
                                assert_eq(botrep, bottmp);
                                hits.add1mmEe(toprep, botrep, &e, NULL, fw, score);
                            }
                        }
                    }
                }
                if(bot > top && match) {
                    assert_lt(rdc, 4);
                    if(dep == len-1) {
                        // Success; this is an exact hit
                        if(ebwtfw && repex) {
                            if(fw) {
                                results = true;
                                int64_t score = len * sc.match();
                                hits.addExactEeFw(
                                    ebwtfw ? top : topp,
                                    ebwtfw ? bot : botp,
                                    NULL, NULL, fw, score);
                                assert(ebwtFw->contains(seq, NULL, NULL));
                            } else {
                                results = true;
                                int64_t score = len * sc.match();
                                hits.addExactEeRc(
                                    ebwtfw ? top : topp,
                                    ebwtfw ? bot : botp,
                                    NULL, NULL, fw, score);
                                assert(ebwtFw->contains(seq, NULL, NULL));
                            }
                        }
                        break; // End of far loop
                    } else {
                        INIT_LOCS(top, bot, tloc, bloc, *ebwt);
                        assert(sanityPartial(ebwt, ebwtp, seq, len - dep - 1, len, rep1mm, top, bot, topp, botp));
                    }
                } else {
                    break; // End of far loop
                }
            } // for(; dep < len; dep++)
        } // for(int ebwtfwi = 0; ebwtfwi < lim; ebwtfwi++)
    } // for(int fwi = 0; fwi < 2; fwi++)
    return results;
}
/**
 * Wrapper for the initial invocation of the recursive searchSeedBi().
 *
 * Starts at step 0 and recursion depth 0 with all four BW range boundaries
 * set to 0, two value-initialized loci, the constraints of the current
 * instantiated seed (s_) and no previous edit.
 */
template <typename index_t>
bool SeedAligner<index_t>::searchSeedBi() {
    const InstantiatedSeed& seed = *s_;
    SideLocus<index_t> tloc = SideLocus<index_t>();
    SideLocus<index_t> bloc = SideLocus<index_t>();
    return searchSeedBi(
        0,             // step
        0,             // depth
        0, 0,          // topf, botf
        0, 0,          // topb, botb
        tloc, bloc,
        seed.cons[0],
        seed.cons[1],
        seed.cons[2],
        seed.overall,
        NULL);         // no previous edit
}
/**
 * Get tloc and bloc ready for the next step of the current seed (s_).
 *
 * Does nothing when 'step' is one past the last step.  Otherwise the sign of
 * s_->steps[step] selects the index: positive means left-to-right, using the
 * mirror index with [topb, botb); otherwise right-to-left, using the forward
 * index with [topf, botf).  A one-row range initializes only 'tloc' and
 * invalidates 'bloc'; a wider range initializes both.
 */
template <typename index_t>
inline void SeedAligner<index_t>::nextLocsBi(
    SideLocus<index_t>& tloc, // top locus
    SideLocus<index_t>& bloc, // bot locus
    index_t topf,             // top in BWT
    index_t botf,             // bot in BWT
    index_t topb,             // top in BWT'
    index_t botb,             // bot in BWT'
    int step)                 // step to get ready for
{
    assert_gt(botf, 0);
    assert(ebwtBw_ == NULL || botb > 0);
    assert_geq(step, 0);
    assert(ebwtBw_ == NULL || botf-topf == botb-topb);
    if(step == (int)s_->steps.size()) {
        return; // no more steps!
    }
    // Pick the index and range that match the direction of the next step
    const bool leftToRight = s_->steps[step] > 0;
    const Ebwt<index_t>* ebwt = leftToRight ? ebwtBw_ : ebwtFw_;
    index_t top = leftToRight ? topb : topf;
    index_t bot = leftToRight ? botb : botf;
    if(bot - top != 1) {
        SideLocus<index_t>::initFromTopBot(
            top, bot, ebwt->eh(), ebwt->ebwt(), tloc, bloc);
        assert(bloc.valid());
    } else {
        // Already down to 1 row; just init top locus
        tloc.initFromRow(top, ebwt->eh(), ebwt->ebwt());
        bloc.invalidate();
    }
    assert(botf - topf == 1 ||  bloc.valid());
    assert(botf - topf >  1 || !bloc.valid());
}
/**
 * Report a seed hit found by searchSeedBi(), but first try to extend it out in
 * either direction as far as possible without hitting any edits.  Call
 * reportHit() when we're done, which actually adds the hit to the cache.
 * Returns result from calling reportHit().
 *
 * Extension proceeds one read character at a time and continues only while
 * the current BW range has exactly one non-empty successor character and that
 * character equals the read character.  Each direction is capped at 255
 * characters.  The left extension walks the forward index; the right
 * extension walks the mirror index and is skipped when no mirror index is set.
 *
 * In this version the extension lengths (nlex / nrex) are only checked by
 * assertions; reportHit() is called with the original, unextended ranges.
 */
template <typename index_t>
bool SeedAligner<index_t>::extendAndReportHit(
    index_t topf,                     // top in BWT
    index_t botf,                     // bot in BWT
    index_t topb,                     // top in BWT'
    index_t botb,                     // bot in BWT'
    index_t len,                      // length of hit
    DoublyLinkedList<Edit> *prevEdit) // previous edit
{
    index_t nlex = 0, nrex = 0; // # chars extended to the left / right
    index_t t[4], b[4];
    index_t tp[4], bp[4];
    SideLocus<index_t> tloc, bloc;
    if(off_ > 0) {
        const Ebwt<index_t> *ebwt = ebwtFw_;
        assert(ebwt != NULL);
        // Extend left using forward index
        const BTDnaString& seq = fw_ ? read_->patFw : read_->patRc;
        index_t top = topf, bot = botf;
        t[0] = t[1] = t[2] = t[3] = 0;
        b[0] = b[1] = b[2] = b[3] = 0;
        tp[0] = tp[1] = tp[2] = tp[3] = topb;
        bp[0] = bp[1] = bp[2] = bp[3] = botb;
        INIT_LOCS(top, bot, tloc, bloc, *ebwt);
        for(size_t ii = off_; ii > 0; ii--) {
            size_t i = ii-1;
            // Get char from read
            int rdc = seq.get(i);
            // See what we get by extending
            if(bloc.valid()) {
                bwops_++;
                t[0] = t[1] = t[2] = t[3] =
                b[0] = b[1] = b[2] = b[3] = 0;
                ebwt->mapBiLFEx(tloc, bloc, t, b, tp, bp);
                SANITY_CHECK_4TUP(t, b, tp, bp);
                // Find the single character with a non-empty range, if any.
                // The arrays have 4 entries and must be indexed by the
                // character j, never by the read position i.
                int nonz = -1;
                bool abort = false;
                for(int j = 0; j < 4; j++) {
                    if(b[j] > t[j]) {
                        if(nonz >= 0) {
                            // More than one possible character
                            abort = true;
                            break;
                        }
                        nonz = j;
                        top = t[j]; bot = b[j];
                    }
                }
                if(abort || nonz != rdc) {
                    break;
                }
            } else {
                assert_eq(bot, top+1);
                bwops_++;
                int c = ebwt->mapLF1(top, tloc);
                if(c != rdc) {
                    break;
                }
                bot = top + 1;
            }
            if(++nlex == 255) {
                break;
            }
            INIT_LOCS(top, bot, tloc, bloc, *ebwt);
        }
    }
    size_t rdlen = read_->length();
    size_t nright = rdlen - off_ - len;
    if(nright > 0 && ebwtBw_ != NULL) {
        const Ebwt<index_t> *ebwt = ebwtBw_;
        assert(ebwt != NULL);
        // Extend right using backward index
        const BTDnaString& seq = fw_ ? read_->patFw : read_->patRc;
        index_t top = topb, bot = botb;
        t[0] = t[1] = t[2] = t[3] = 0;
        b[0] = b[1] = b[2] = b[3] = 0;
        // NOTE(review): other mapBiLFEx call sites seed tp/bp with the range
        // in the *other* index (here that would be topf/botf) -- confirm
        // whether topb/botb is intended.
        tp[0] = tp[1] = tp[2] = tp[3] = topb;
        bp[0] = bp[1] = bp[2] = bp[3] = botb;
        INIT_LOCS(top, bot, tloc, bloc, *ebwt);
        for(size_t i = off_ + len; i < rdlen; i++) {
            // Get char from read
            int rdc = seq.get(i);
            // See what we get by extending
            if(bloc.valid()) {
                bwops_++;
                t[0] = t[1] = t[2] = t[3] =
                b[0] = b[1] = b[2] = b[3] = 0;
                ebwt->mapBiLFEx(tloc, bloc, t, b, tp, bp);
                SANITY_CHECK_4TUP(t, b, tp, bp);
                // Same single-successor test as for the left extension
                int nonz = -1;
                bool abort = false;
                for(int j = 0; j < 4; j++) {
                    if(b[j] > t[j]) {
                        if(nonz >= 0) {
                            abort = true;
                            break;
                        }
                        nonz = j;
                        top = t[j]; bot = b[j];
                    }
                }
                if(abort || nonz != rdc) {
                    break;
                }
            } else {
                assert_eq(bot, top+1);
                bwops_++;
                int c = ebwt->mapLF1(top, tloc);
                if(c != rdc) {
                    break;
                }
                bot = top + 1;
            }
            if(++nrex == 255) {
                break;
            }
            INIT_LOCS(top, bot, tloc, bloc, *ebwt);
        }
    }
    assert_lt(nlex, rdlen);
    assert_leq(nlex, off_);
    assert_lt(nrex, rdlen);
    return reportHit(topf, botf, topb, botb, len, prevEdit);
}
/**
 * Add a seed hit found by searchSeedBi() to the alignment cache.
 *
 * The reference-side string for the hit is assembled in the scratch buffer
 * tmprfdnastr_: it is a copy of *seq_ when there are no edits, otherwise it
 * is produced by Edit::toRef() from *seq_ and the sorted edit list reachable
 * from prevEdit.  That string, together with the four range boundaries, is
 * handed to ca_->addOnTheFly().
 *
 * @param topf     top of the range in the forward index (BWT)
 * @param botf     bottom of the range in the forward index (BWT)
 * @param topb     top of the range in the backward index (BWT')
 * @param botb     bottom of the range in the backward index (BWT')
 * @param len      length of the hit (not used in this function body)
 * @param prevEdit most recent edit on the path to this hit, or NULL if the
 *                 hit has no edits
 * @return false if the hit could not be reported because of, e.g., cache
 *         exhaustion (addOnTheFly() returned false); true otherwise
 */
template <typename index_t>
bool SeedAligner<index_t>::reportHit(
    index_t topf,                      // top in BWT
    index_t botf,                      // bot in BWT
    index_t topb,                      // top in BWT'
    index_t botb,                      // bot in BWT'
    index_t len,                       // length of hit
    DoublyLinkedList<Edit> *prevEdit)  // previous edit
{
    // The hit recorded here ends up in the AlignmentCache and eventually
    // flows back to the SeedResults object via finishAlign(...).
    BTDnaString& refSeq = tmprfdnastr_;
    refSeq.clear();
    edits_.clear();
    if(prevEdit == NULL) {
        // No edits on this path: the reference string is the seed itself
        refSeq = *seq_;
    } else {
        // Gather the edits, put them in sorted order, and apply them
        prevEdit->toList(edits_);
        Edit::sort(edits_);
        assert(Edit::repOk(edits_, *seq_));
        Edit::toRef(*seq_, edits_, refSeq);
    }
    // Debug-only duplicate detection.  Seeing the same hit twice usually
    // means the zone Constraints are mis-configured, so that acceptable()
    // returns true in cases where it ought to return false.
    assert_eq(hits_.size(), ca_->curNumRanges());
    assert(hits_.insert(refSeq));
    const bool cached = ca_->addOnTheFly(refSeq, topf, botf, topb, botb);
    if(!cached) {
        return false;
    }
    assert_eq(hits_.size(), ca_->curNumRanges());
#ifndef NDEBUG
    // Debug-only cross-check: look the reference string up in the index
    // (and its reverse in the backward index, if present) and make sure the
    // ranges found agree with the ones we were asked to report.
    {
        BTDnaString revRef;
        index_t fwTop = 0, fwBot = 0, bwTop = 0, bwBot = 0;
        assert(ebwtFw_->contains(refSeq, &fwTop, &fwBot));
        if(ebwtBw_ != NULL) {
            revRef = refSeq;
            revRef.reverse();
            assert(ebwtBw_->contains(revRef, &bwTop, &bwBot));
            assert_eq(fwTop, topf);
            assert_eq(fwBot, botf);
            assert_eq(bwTop, topb);
            assert_eq(bwBot, botb);
        }
    }
#endif
    return true;
}
/**
* Given a seed, search. Assumes zone 0 = no backtracking.
*
* Recursively walks the steps of the instantiated seed *s_, starting at
* index 'step', narrowing the forward range [topf, botf) and the backward
* range [topb, botb) one seed position at a time. At each position it first
* tries every allowed mismatch (recursing with a new Edit appended to the
* edit chain) and then continues along the exact-match path. Every time the
* last step is consumed, the hit is handed to reportHit(), i.e. each hit
* consists of
* 1. Edits
* 2. Bidirectional BWT range(s) on either end
*
* The constraints are passed by value, so charges made inside a recursive
* call do not leak back into the caller's copies.
*
* Returns false only when reportHit() fails (here or in a recursive call);
* returns true otherwise, including when the search dead-ends without a hit.
*/
template <typename index_t>
bool SeedAligner<index_t>::searchSeedBi(
int step, // depth into steps_[] array
int depth, // recursion depth
index_t topf, // top in BWT
index_t botf, // bot in BWT
index_t topb, // top in BWT'
index_t botb, // bot in BWT'
SideLocus<index_t> tloc, // locus for top (perhaps uninitialized)
SideLocus<index_t> bloc, // locus for bot (perhaps uninitialized)
Constraint c0, // constraints to enforce in seed zone 0
Constraint c1, // constraints to enforce in seed zone 1
Constraint c2, // constraints to enforce in seed zone 2
Constraint overall, // overall constraints to enforce
DoublyLinkedList<Edit> *prevEdit // previous edit
#if 0
, const SABWOffTrack* prevOt // prev off tracker (if tracking started)
#endif
)
{
assert(s_ != NULL);
const InstantiatedSeed& s = *s_;
assert_gt(s.steps.size(), 0);
assert(ebwtBw_ == NULL || ebwtBw_->eh().ftabChars() == ebwtFw_->eh().ftabChars());
#ifndef NDEBUG
for(int i = 0; i < 4; i++) {
assert(ebwtBw_ == NULL || ebwtBw_->fchr()[i] == ebwtFw_->fchr()[i]);
}
#endif
// All steps already consumed (e.g. the caller recursed past the last step)
if(step == (int)s.steps.size()) {
// Finished aligning seed
assert(c0.acceptable());
assert(c1.acceptable());
assert(c2.acceptable());
if(!reportHit(topf, botf, topb, botb, seq_->length(), prevEdit)) {
return false; // Memory exhausted
}
return true;
}
#ifndef NDEBUG
// In recursive calls the loci must already agree with the range size:
// bloc is valid iff the range holds more than one element
if(depth > 0) {
assert(botf - topf == 1 || bloc.valid());
assert(botf - topf > 1 || !bloc.valid());
}
#endif
int off;
index_t tp[4], bp[4]; // dest BW ranges for "prime" index
if(step == 0) {
// Just starting
assert(prevEdit == NULL);
assert(!tloc.valid());
assert(!bloc.valid());
// The sign of a step encodes direction (positive = left-to-right);
// abs(step)-1 is the offset into the seed sequence
off = s.steps[0];
bool ltr = off > 0;
off = abs(off)-1;
// Check whether/how far we can jump using ftab or fchr
int ftabLen = ebwtFw_->eh().ftabChars();
if(ftabLen > 1 && ftabLen <= s.maxjump) {
// Jump ftabLen characters at once using the ftab lookup table
if(!ltr) {
// Moving right-to-left: shift 'off' to the leftmost character of
// the ftabLen-long window that ends at the current offset
assert_geq(off+1, ftabLen-1);
off = off - ftabLen + 1;
}
ebwtFw_->ftabLoHi(*seq_, off, false, topf, botf);
// Release builds bail out on an empty range before touching the
// backward index; debug builds defer the check until after the
// forward/backward size assertion below
#ifdef NDEBUG
if(botf - topf == 0) return true;
#endif
#ifdef NDEBUG
// Release build: only look up the top of the backward range and
// derive its bottom from the size of the forward range
if(ebwtBw_ != NULL) {
topb = ebwtBw_->ftabHi(*seq_, off);
botb = topb + (botf-topf);
}
#else
// Debug build: look up both ends of the backward range and check
// that it has the same size as the forward range
if(ebwtBw_ != NULL) {
ebwtBw_->ftabLoHi(*seq_, off, false, topb, botb);
assert_eq(botf-topf, botb-topb);
}
if(botf - topf == 0) return true;
#endif
step += ftabLen;
} else if(s.maxjump > 0) {
// Use fchr
int c = (*seq_)[off];
assert_range(0, 3, c);
topf = topb = ebwtFw_->fchr()[c];
botf = botb = ebwtFw_->fchr()[c+1];
if(botf - topf == 0) return true;
step++;
} else {
// No jump allowed: start from the range [0, fchr()[4]) without
// consuming any step
assert_eq(0, s.maxjump);
topf = topb = 0;
botf = botb = ebwtFw_->fchr()[4];
}
// The initial jump may already have consumed every step
if(step == (int)s.steps.size()) {
// Finished aligning seed
assert(c0.acceptable());
assert(c1.acceptable());
assert(c2.acceptable());
if(!reportHit(topf, botf, topb, botb, seq_->length(), prevEdit)) {
return false; // Memory exhausted
}
return true;
}
// Set up tloc/bloc for the next step
nextLocsBi(tloc, bloc, topf, botf, topb, botb, step);
assert(tloc.valid());
} else assert(prevEdit != NULL); // step > 0 is only expected after an edit
assert(tloc.valid());
assert(botf - topf == 1 || bloc.valid());
assert(botf - topf > 1 || !bloc.valid());
assert_geq(step, 0);
index_t t[4], b[4]; // dest BW ranges
// Zone constraints indexed by |zone id| taken from s.zones[i]
Constraint* zones[3] = { &c0, &c1, &c2 };
ASSERT_ONLY(index_t lasttot = botf - topf);
// Consume the remaining steps one seed position per iteration
for(int i = step; i < (int)s.steps.size(); i++) {
assert_gt(botf, topf);
assert(botf - topf == 1 || bloc.valid());
assert(botf - topf > 1 || !bloc.valid());
assert(ebwtBw_ == NULL || botf-topf == botb-topb);
assert(tloc.valid());
off = s.steps[i];
bool ltr = off > 0;
// Left-to-right steps walk the backward index, right-to-left steps the
// forward index
const Ebwt<index_t>* ebwt = ltr ? ebwtBw_ : ebwtFw_;
assert(ebwt != NULL);
// Seed the "prime" ranges with the current range of the index that is
// NOT being walked in this step
if(ltr) {
tp[0] = tp[1] = tp[2] = tp[3] = topf;
bp[0] = bp[1] = bp[2] = bp[3] = botf;
} else {
tp[0] = tp[1] = tp[2] = tp[3] = topb;
bp[0] = bp[1] = bp[2] = bp[3] = botb;
}
t[0] = t[1] = t[2] = t[3] = b[0] = b[1] = b[2] = b[3] = 0;
if(bloc.valid()) {
// Range delimited by tloc/bloc has size >1. If size == 1,
// we use a simpler query (see if(!bloc.valid()) blocks below)
bwops_++;
ebwt->mapBiLFEx(tloc, bloc, t, b, tp, bp);
ASSERT_ONLY(index_t tot = (b[0]-t[0])+(b[1]-t[1])+(b[2]-t[2])+(b[3]-t[3]));
ASSERT_ONLY(index_t totp = (bp[0]-tp[0])+(bp[1]-tp[1])+(bp[2]-tp[2])+(bp[3]-tp[3]));
assert_eq(tot, totp);
assert_leq(tot, lasttot);
ASSERT_ONLY(lasttot = tot);
}
// Views of the per-character ranges by index: tf/bf = forward index,
// tb/bb = backward index, regardless of which one was walked
index_t *tf = ltr ? tp : t, *tb = ltr ? t : tp;
index_t *bf = ltr ? bp : b, *bb = ltr ? b : bp;
off = abs(off)-1;
// A negative zone id means this step is flagged as leaving its zone
bool leaveZone = s.zones[i].first < 0;
//bool leaveZoneIns = zones_[i].second < 0;
Constraint& cons = *zones[abs(s.zones[i].first)];
Constraint& insCons = *zones[abs(s.zones[i].second)];
// c: seed character at this offset (0-3, or 4; see "couldn't handle
// the N" below); q: the corresponding quality value
int c = (*seq_)[off]; assert_range(0, 4, c);
int q = (*qual_)[off];
// Is it legal for us to advance on characters other than 'c'?
// NOTE(review): as written this gate is skipped only when the zone must
// match while the overall constraint need not; confirm that this is the
// intended condition.
if(!(cons.mustMatch() && !overall.mustMatch()) || c == 4) {
// There may be legal edits
bool bail = false;
if(!bloc.valid()) {
// Range delimited by tloc/bloc has size 1
index_t ntop = ltr ? topb : topf;
bwops_++;
int cc = ebwt->mapLF1(ntop, tloc);
assert_range(-1, 3, cc);
// A negative result means no character to step on, so there is
// nothing to try here; otherwise record the size-1 range for cc
if(cc < 0) bail = true;
else { t[cc] = ntop; b[cc] = ntop+1; }
}
if(!bail) {
if((cons.canMismatch(q, *sc_) && overall.canMismatch(q, *sc_)) || c == 4) {
// Save state so it can be restored after exploring mismatches
Constraint oldCons = cons, oldOvCons = overall;
SideLocus<index_t> oldTloc = tloc, oldBloc = bloc;
// c == 4 is not charged as a mismatch
if(c != 4) {
cons.chargeMismatch(q, *sc_);
overall.chargeMismatch(q, *sc_);
}
// Can leave the zone as-is
if(!leaveZone || (cons.acceptable() && overall.acceptable())) {
// Try each character other than c that has a non-empty range
for(int j = 0; j < 4; j++) {
if(j == c || b[j] == t[j]) continue;
// Potential mismatch
nextLocsBi(tloc, bloc, tf[j], bf[j], tb[j], bb[j], i+1);
// NOTE(review): loff is computed but never read afterwards
int loff = off;
if(!ltr) loff = (int)(s.steps.size() - loff - 1);
assert(prevEdit == NULL || prevEdit->next == NULL);
// Append a mismatch edit (built from off, j and c) to the edit
// chain; the node lives on this stack frame for the duration of
// the recursive call
Edit edit(off, j, c, EDIT_TYPE_MM, false);
DoublyLinkedList<Edit> editl;
editl.payload = edit;
if(prevEdit != NULL) {
prevEdit->next = &editl;
editl.prev = prevEdit;
}
assert(editl.next == NULL);
bwedits_++;
if(!searchSeedBi(
i+1, // depth into steps_[] array
depth+1, // recursion depth
tf[j], // top in BWT
bf[j], // bot in BWT
tb[j], // top in BWT'
bb[j], // bot in BWT'
tloc, // locus for top (perhaps uninitialized)
bloc, // locus for bot (perhaps uninitialized)
c0, // constraints to enforce in seed zone 0
c1, // constraints to enforce in seed zone 1
c2, // constraints to enforce in seed zone 2
overall, // overall constraints to enforce
&editl)) // latest edit
{
return false;
}
// Unlink the stack-allocated edit node again
if(prevEdit != NULL) prevEdit->next = NULL;
}
} else {
// Not enough edits to make this path
// non-redundant with other seeds
}
// Undo the mismatch charge and restore the loci before continuing
// along the exact-match path
cons = oldCons;
overall = oldOvCons;
tloc = oldTloc;
bloc = oldBloc;
}
if(cons.canGap() && overall.canGap()) {
// Gapped seed extension is not implemented; everything after the
// throw in this block is unreachable placeholder code
throw 1; // TODO
int delEx = 0;
if(cons.canDelete(delEx, *sc_) && overall.canDelete(delEx, *sc_)) {
// Try delete
}
int insEx = 0;
if(insCons.canInsert(insEx, *sc_) && overall.canInsert(insEx, *sc_)) {
// Try insert
}
}
} // if(!bail)
}
// From here on: continue along the exact-match path on character c
if(c == 4) {
return true; // couldn't handle the N
}
if(leaveZone && (!cons.acceptable() || !overall.acceptable())) {
// Not enough edits to make this path non-redundant with
// other seeds
return true;
}
if(!bloc.valid()) {
assert(ebwtBw_ == NULL || bp[c] == tp[c]+1);
// Range delimited by tloc/bloc has size 1
index_t top = ltr ? topb : topf;
bwops_++;
t[c] = ebwt->mapLF1(top, tloc, c);
// OFF_MASK signals that we cannot step on c from this row
if(t[c] == (index_t)OFF_MASK) {
return true;
}
assert_geq(t[c], ebwt->fchr()[c]);
assert_lt(t[c], ebwt->fchr()[c+1]);
b[c] = t[c]+1;
assert_gt(b[c], 0);
}
assert(ebwtBw_ == NULL || bf[c]-tf[c] == bb[c]-tb[c]);
assert_leq(bf[c]-tf[c], lasttot);
ASSERT_ONLY(lasttot = bf[c]-tf[c]);
// Empty range for c: the exact-match path dead-ends here
if(b[c] == t[c]) {
return true;
}
// Adopt the ranges for c as the current ranges
topf = tf[c]; botf = bf[c];
topb = tb[c]; botb = bb[c];
if(i+1 == (int)s.steps.size()) {
// Finished aligning seed
assert(c0.acceptable());
assert(c1.acceptable());
assert(c2.acceptable());
if(!reportHit(topf, botf, topb, botb, seq_->length(), prevEdit)) {
return false; // Memory exhausted
}
return true;
}
// Set up tloc/bloc for the next step
nextLocsBi(tloc, bloc, tf[c], bf[c], tb[c], bb[c], i+1);
}
return true;
}
#endif /*ALIGNER_SEED_H_*/