/*
 * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
 *
 * This file is part of Bowtie 2.
 *
 * Bowtie 2 is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Bowtie 2 is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
 */

#ifndef ALIGNER_CACHE_H_
#define ALIGNER_CACHE_H_

/**
 * CACHING
 *
 * By caching the results of some alignment sub-problems, we hope to
 * enable a "fast path" for read alignment whereby answers are mostly
 * looked up rather than calculated from scratch.  This is particularly
 * effective when the input is sorted or otherwise grouped in a way
 * that brings together reads with (at least some) seed sequences in
 * common.
 *
 * But the cache is also where results are held, regardless of whether
 * the results are maintained & re-used across reads.
 *
 * The cache consists of two linked portions:
 *
 * 1. A multimap from seed strings (i.e. read substrings) to reference strings
 *    that are within some edit distance (roughly speaking).  This is the "seed
 *    multimap".
 *
 *    Key:   Read substring (2-bit-per-base encoded + length)
 *    Value: Set of reference substrings (i.e. keys into the suffix
 *           array multimap).
 *
 * 2. A multimap from reference strings to the corresponding elements of the
 *    suffix array.  Elements are filled in with reference-offset info as it's
 *    calculated.  This is the "suffix array multimap".
 *
 *    Key:   Reference substring (2-bit-per-base encoded + length)
 *    Value: (a) top from BWT, (b) length of range, (c) offset of first
 *           range element in the salist
 *
 * For both multimaps, we use a combo Red-Black tree and EList.  The payload in
 * the Red-Black tree nodes points to a range in the EList.
 */
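
/*
 * Illustrative sketch, not part of the build: how the two multimaps are
 * meant to cooperate for one seed.  'readSub', 'rfseq' and the BWT range
 * values below are hypothetical; the real entry points are
 * AlignmentCache::add, addOnTheFly and queryQval declared below.
 */
#if 0
	AlignmentCache<uint32_t> cache(1 << 20, false);
	QKey qk(readSub);                           // 2-bit-encode the seed (<= 32 nt)
	bool added = false;
	QVal<uint32_t>* qv = cache.add(qk, &added); // node in the seed multimap
	// Seed search finds a similar reference substring 'rfseq' occupying
	// BWT range [topf, botf); record the association:
	cache.addOnTheFly(*qv, SAKey(rfseq), topf, botf, topb, botb);
	// Later, expand the QVal back into concrete SA ranges:
	EList<SATuple<uint32_t>, 16> satups;
	uint32_t nrange = 0, nelt = 0;
	cache.queryQval(*qv, satups, nrange, nelt);
#endif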

#include <iostream>
#include "ds.h"
#include "read.h"
#include "threading.h"
#include "mem_ids.h"
#include "simple_func.h"
#include "btypes.h"

#define CACHE_PAGE_SZ (16 * 1024)

typedef PListSlice<TIndexOffU, CACHE_PAGE_SZ> TSlice;

/**
 * Key for the query multimap: the read substring and its length.
 */
struct QKey {

	/**
	 * Initialize invalid QKey.
	 */
	QKey() { reset(); }

	/**
	 * Initialize QKey from DNA string.
	 */
	QKey(const BTDnaString& s ASSERT_ONLY(, BTDnaString& tmp)) {
		init(s ASSERT_ONLY(, tmp));
	}

	/**
	 * Initialize QKey from DNA string.  Rightmost character is placed in the
	 * least significant bitpair.
	 */
	bool init(
		const BTDnaString& s
		ASSERT_ONLY(, BTDnaString& tmp))
	{
		seq = 0;
		len = (uint32_t)s.length();
		ASSERT_ONLY(tmp.clear());
		if(len > 32) {
			len = 0xffffffff;
			return false; // wasn't cacheable
		} else {
			// Rightmost char of 's' goes in the least significant bitpair
			for(size_t i = 0; i < 32 && i < s.length(); i++) {
				int c = (int)s.get(i);
				assert_range(0, 4, c);
				if(c == 4) {
					len = 0xffffffff;
					return false;
				}
				seq = (seq << 2) | s.get(i);
			}
			ASSERT_ONLY(toString(tmp));
			assert(sstr_eq(tmp, s));
			assert_leq(len, 32);
			return true; // was cacheable
		}
	}

	/**
	 * Convert this key to a DNA string.
	 */
	void toString(BTDnaString& s) {
		s.resize(len);
		uint64_t sq = seq;
		for(int i = (len)-1; i >= 0; i--) {
			s.set((uint32_t)(sq & 3), i);
			sq >>= 2;
		}
	}

	/**
	 * Return true iff the read substring is cacheable.
	 */
	bool cacheable() const { return len != 0xffffffff; }

	/**
	 * Reset to uninitialized state.
	 */
	void reset() { seq = 0; len = 0xffffffff; }

	/**
	 * True -> my key is less than the given key.
	 */
	bool operator<(const QKey& o) const {
		return seq < o.seq || (seq == o.seq && len < o.len);
	}

	/**
	 * True -> my key is greater than the given key.
	 */
	bool operator>(const QKey& o) const {
		return !(*this < o || *this == o);
	}

	/**
	 * True -> my key is equal to the given key.
	 */
	bool operator==(const QKey& o) const {
		return seq == o.seq && len == o.len;
	}

	/**
	 * True -> my key is not equal to the given key.
	 */
	bool operator!=(const QKey& o) const {
		return !(*this == o);
	}

#ifndef NDEBUG
	/**
	 * Check that this is a valid, initialized QKey.
	 */
	bool repOk() const {
		return len != 0xffffffff;
	}
#endif

	uint64_t seq; // sequence
	uint32_t len; // length of sequence
};
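
/*
 * Sketch, not part of the build: QKey packs up to 32 bases at 2 bits each
 * (A=0, C=1, G=2, T=3), rightmost base in the least significant bitpair,
 * so "ACGT" yields seq == 0b00011011 == 0x1b and len == 4.  Assumes a
 * from-chars BTDnaString constructor; 'k' and 'back' are hypothetical.
 */
#if 0
	BTDnaString s("ACGT", true);
	ASSERT_ONLY(BTDnaString tmp);
	QKey k(s ASSERT_ONLY(, tmp));
	assert(k.cacheable());
	assert_eq(0x1b, k.seq);
	assert_eq(4, k.len);
	BTDnaString back;
	k.toString(back); // round-trips to "ACGT"
#endif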

template <typename index_t>
class AlignmentCache;

/**
 * Payload for the query multimap: a range of elements in the reference
 * string list.
 */
template <typename index_t>
class QVal {

public:

	QVal() { reset(); }

	/**
	 * Return the offset of the first reference substring in the qlist.
	 */
	index_t offset() const { return i_; }

	/**
	 * Return the number of reference substrings associated with a read
	 * substring.
	 */
	index_t numRanges() const {
		assert(valid());
		return rangen_;
	}

	/**
	 * Return the number of elements associated with all associated
	 * reference substrings.
	 */
	index_t numElts() const {
		assert(valid());
		return eltn_;
	}

	/**
	 * Return true iff the read substring is not associated with any
	 * reference substrings.
	 */
	bool empty() const {
		assert(valid());
		return numRanges() == 0;
	}

	/**
	 * Return true iff the QVal is valid.
	 */
	bool valid() const { return rangen_ != (index_t)OFF_MASK; }

	/**
	 * Reset to invalid state.
	 */
	void reset() {
		i_ = 0; rangen_ = eltn_ = (index_t)OFF_MASK;
	}

	/**
	 * Initialize QVal.
	 */
	void init(index_t i, index_t ranges, index_t elts) {
		i_ = i; rangen_ = ranges; eltn_ = elts;
	}

	/**
	 * Tally another range with given number of elements.
	 */
	void addRange(index_t numElts) {
		rangen_++;
		eltn_ += numElts;
	}

#ifndef NDEBUG
	/**
	 * Check that this QVal is internally consistent and consistent
	 * with the contents of the given cache.
	 */
	bool repOk(const AlignmentCache<index_t>& ac) const;
#endif

protected:

	index_t i_;      // idx of first elt in qlist
	index_t rangen_; // # ranges (= # associated reference substrings)
	index_t eltn_;   // # elements (total)
};
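
/*
 * Sketch, not part of the build: a QVal records where its reference
 * substrings start in the qlist and tallies ranges/elements as they are
 * added.  The offsets and counts below are made up for illustration.
 */
#if 0
	QVal<uint32_t> qv;
	assert(!qv.valid());
	qv.init(10, 0, 0); // this seed's ranges begin at qlist offset 10
	qv.addRange(5);    // one reference substring with 5 SA elements
	qv.addRange(2);    // another with 2
	assert_eq(2, qv.numRanges());
	assert_eq(7, qv.numElts());
#endif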

/**
 * Key for the suffix array multimap: the reference substring and its
 * length.  Same as QKey so I typedef it.
 */
typedef QKey SAKey;

/**
 * Payload for the suffix array multimap: (a) the top element of the
 * range in BWT, (b) the offset of the first elt in the salist, (c)
 * length of the range.
 */
template <typename index_t>
struct SAVal {

	SAVal() : topf(), topb(), i(), len(OFF_MASK) { }

	/**
	 * Return true iff the SAVal is valid.
	 */
	bool valid() { return len != (index_t)OFF_MASK; }

#ifndef NDEBUG
	/**
	 * Check that this SAVal is internally consistent and consistent
	 * with the contents of the given cache.
	 */
	bool repOk(const AlignmentCache<index_t>& ac) const;
#endif

	/**
	 * Initialize the SAVal.
	 */
	void init(
		index_t tf,
		index_t tb,
		index_t ii,
		index_t ln)
	{
		topf = tf;
		topb = tb;
		i = ii;
		len = ln;
	}

	index_t topf; // top in BWT
	index_t topb; // top in BWT'
	index_t i;    // idx of first elt in salist
	index_t len;  // length of range
};
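
/*
 * Sketch, not part of the build: an SAVal is a window into the salist.
 * After init, salist[i .. i+len) holds this substring's reference
 * offsets, which begin life as 0xffffffff sentinels and are resolved
 * lazily.  All values below are made up.
 */
#if 0
	SAVal<uint32_t> sav;
	assert(!sav.valid());
	sav.init(100, 200, 0, 3); // topf=100, topb=200, salist idx 0, 3 elts
	assert(sav.valid());
#endif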

/**
 * One data structure that encapsulates all of the cached information
 * associated with a particular reference substring.  This is useful
 * for summarizing what info should be added to the cache for a partial
 * alignment.
 */
template <typename index_t>
class SATuple {

public:

	SATuple() { reset(); }

	SATuple(SAKey k, index_t tf, index_t tb, TSlice o) {
		init(k, tf, tb, o);
	}

	void init(SAKey k, index_t tf, index_t tb, TSlice o) {
		key = k; topf = tf; topb = tb; offs = o;
	}

	/**
	 * Initialize this SATuple from a subrange of the SATuple 'src'.
	 */
	void init(const SATuple& src, index_t first, index_t last) {
		assert_neq((index_t)OFF_MASK, src.topb);
		key = src.key;
		topf = (index_t)(src.topf + first);
		topb = (index_t)OFF_MASK; // unknown!
		offs.init(src.offs, first, last);
	}

#ifndef NDEBUG
	/**
	 * Check that this SATuple is internally consistent and that its
	 * PListSlice is consistent with its backing PList.
	 */
	bool repOk() const {
		assert(offs.repOk());
		return true;
	}
#endif

	/**
	 * Function for ordering SATuples.  This is used when prioritizing which to
	 * explore first when extending seed hits into full alignments.  Smaller
	 * ranges get higher priority and we use 'topf' to break ties, though any
	 * way of breaking a tie would be fine.
	 */
	bool operator<(const SATuple& o) const {
		if(offs.size() < o.offs.size()) {
			return true;
		}
		if(offs.size() > o.offs.size()) {
			return false;
		}
		return topf < o.topf;
	}

	bool operator>(const SATuple& o) const {
		if(offs.size() < o.offs.size()) {
			return false;
		}
		if(offs.size() > o.offs.size()) {
			return true;
		}
		return topf > o.topf;
	}

	bool operator==(const SATuple& o) const {
		return key == o.key && topf == o.topf && topb == o.topb && offs == o.offs;
	}

	void reset() { topf = topb = (index_t)OFF_MASK; offs.reset(); }

	/**
	 * Set the length to be at most the original length.
	 */
	void setLength(index_t nlen) {
		assert_leq(nlen, offs.size());
		offs.setLength(nlen);
	}

	/**
	 * Return the number of times this reference substring occurs in the
	 * reference, which is also the size of the 'offs' TSlice.
	 */
	index_t size() const { return (index_t)offs.size(); }

	// bot/length of SA range equals offs.size()
	SAKey key;    // sequence key
	index_t topf; // top in BWT index
	index_t topb; // top in BWT' index
	TSlice offs;  // offsets
};
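
/*
 * Sketch, not part of the build: because operator< orders SATuples by
 * range size (topf breaking ties), sorting a list of seed hits puts the
 * most specific (smallest) BWT ranges first, which is the order in which
 * we want to extend them.  'satups' is a hypothetical local.
 */
#if 0
	EList<SATuple<uint32_t>, 16> satups;
	// ... populated via AlignmentCache::queryQval ...
	satups.sort(); // smallest SA ranges now come first
#endif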

/**
 * Encapsulate the data structures and routines that constitute a
 * particular cache, i.e., a particular stratum of the cache system,
 * which might comprise many strata.
 *
 * Each thread has a "current-read" AlignmentCache which is used to
 * build and store subproblem results as alignment is performed.  When
 * we're finished with a read, we might copy the cached results for
 * that read (and perhaps a bundle of other recently-aligned reads) to
 * a higher-level "across-read" cache.  Higher-level caches may or may
 * not be shared among threads.
 *
 * A cache consists chiefly of two multimaps, each implemented as a
 * Red-Black tree map backed by an EList.  A 'version' counter is
 * incremented every time the cache is cleared.
 */
template <typename index_t>
class AlignmentCache {

	typedef RedBlackNode<QKey, QVal<index_t> >   QNode;
	typedef RedBlackNode<SAKey, SAVal<index_t> > SANode;

	typedef PList<SAKey, CACHE_PAGE_SZ>   TQList;
	typedef PList<index_t, CACHE_PAGE_SZ> TSAList;

public:

	AlignmentCache(
		uint64_t bytes,
		bool shared) :
		pool_(bytes, CACHE_PAGE_SZ, CA_CAT),
		qmap_(CACHE_PAGE_SZ, CA_CAT),
		qlist_(CA_CAT),
		samap_(CACHE_PAGE_SZ, CA_CAT),
		salist_(CA_CAT),
		shared_(shared),
		mutex_m(),
		version_(0)
	{
	}

	/**
	 * Given a QVal, populate the given EList of SATuples with records
	 * describing all of the cached information about the QVal's
	 * reference substrings.
	 */
	template <int S>
	void queryQval(
		const QVal<index_t>& qv,
		EList<SATuple<index_t>, S>& satups,
		index_t& nrange,
		index_t& nelt,
		bool getLock = true)
	{
		ThreadSafe ts(lockPtr(), shared_ && getLock);
		assert(qv.repOk(*this));
		const index_t refi = qv.offset();
		const index_t reff = refi + qv.numRanges();
		// For each reference sequence sufficiently similar to the
		// query sequence in the QKey...
		for(index_t i = refi; i < reff; i++) {
			// Get corresponding SAKey, containing similar reference
			// sequence & length
			SAKey sak = qlist_.get(i);
			// Shouldn't have identical keys in qlist_
			assert(i == refi || qlist_.get(i) != qlist_.get(i-1));
			// Get corresponding SANode
			SANode *n = samap_.lookup(sak);
			assert(n != NULL);
			const SAVal<index_t>& sav = n->payload;
			assert(sav.repOk(*this));
			if(sav.len > 0) {
				nrange++;
				satups.expand();
				satups.back().init(sak, sav.topf, sav.topb, TSlice(salist_, sav.i, sav.len));
				nelt += sav.len;
#ifndef NDEBUG
				// Shouldn't add consecutive identical entries to satups
				if(i > refi) {
					const SATuple<index_t> b1 = satups.back();
					const SATuple<index_t> b2 = satups[satups.size()-2];
					assert(b1.key != b2.key || b1.topf != b2.topf || b1.offs != b2.offs);
				}
#endif
			}
		}
	}

	/**
	 * Return true iff the cache has no entries in it.
	 */
	bool empty() const {
		bool ret = qmap_.empty();
		assert(!ret || qlist_.empty());
		assert(!ret || samap_.empty());
		assert(!ret || salist_.empty());
		return ret;
	}

	/**
	 * Add a new query key ('qk', usually a 2-bit encoded substring of
	 * the read) as the key in a new Red-Black node in the qmap and
	 * return a pointer to the node's QVal.
	 *
	 * The expectation is that the caller is about to set about finding
	 * associated reference substrings, and that there will be future
	 * calls to addOnTheFly to add associations to reference substrings
	 * found.
	 */
	QVal<index_t>* add(
		const QKey& qk,
		bool *added,
		bool getLock = true)
	{
		ThreadSafe ts(lockPtr(), shared_ && getLock);
		assert(qk.cacheable());
		QNode *n = qmap_.add(pool(), qk, added);
		return (n != NULL ? &n->payload : NULL);
	}

	/**
	 * Add a new association between a read sequence ('seq') and a
	 * reference sequence ('')
	 */
	bool addOnTheFly(
		QVal<index_t>& qv, // qval that points to the range of reference substrings
		const SAKey& sak,  // the key holding the reference substring
		index_t topf,      // top range elt in BWT index
		index_t botf,      // bottom range elt in BWT index
		index_t topb,      // top range elt in BWT' index
		index_t botb,      // bottom range elt in BWT' index
		bool getLock = true);

	/**
	 * Clear the cache, i.e. turn it over.  All HitGens referring to
	 * ranges in this cache will become invalid and the corresponding
	 * reads will have to be re-aligned.
	 */
	void clear(bool getLock = true) {
		ThreadSafe ts(lockPtr(), shared_ && getLock);
		pool_.clear();
		qmap_.clear();
		qlist_.clear();
		samap_.clear();
		salist_.clear();
		version_++;
	}

	/**
	 * Return the number of keys in the query multimap.
	 */
	index_t qNumKeys() const { return (index_t)qmap_.size(); }

	/**
	 * Return the number of keys in the suffix array multimap.
	 */
	index_t saNumKeys() const { return (index_t)samap_.size(); }

	/**
	 * Return the number of elements in the reference substring list.
	 */
	index_t qSize() const { return (index_t)qlist_.size(); }

	/**
	 * Return the number of elements in the SA range list.
	 */
	index_t saSize() const { return (index_t)salist_.size(); }

	/**
	 * Return the pool.
	 */
	Pool& pool() { return pool_; }

	/**
	 * Return the lock object.
	 */
	MUTEX_T& lock() {
		return mutex_m;
	}

	/**
	 * Return a const pointer to the lock object.  This allows us to
	 * write const member functions that grab the lock.
	 */
	MUTEX_T* lockPtr() const {
		return const_cast<MUTEX_T*>(&mutex_m);
	}

	/**
	 * Return true iff this cache is shared among threads.
	 */
	bool shared() const { return shared_; }

	/**
	 * Return the current "version" of the cache, i.e. the total number
	 * of times it has turned over since its creation.
	 */
	uint32_t version() const { return version_; }

protected:

	Pool pool_; // dispenses memory pages
	RedBlack<QKey, QVal<index_t> >   qmap_; // map from query substrings to reference substrings
	TQList qlist_; // list of reference substrings
	RedBlack<SAKey, SAVal<index_t> > samap_; // map from reference substrings to SA ranges
	TSAList salist_; // list of SA ranges

	bool shared_;      // true -> this cache is global
	MUTEX_T mutex_m;   // mutex used for synchronization in case the cache is shared
	uint32_t version_; // cache version
};
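
/*
 * Sketch, not part of the build: callers must treat a NULL return from
 * add() (or false from addOnTheFly()) as pool exhaustion, and typically
 * clear() the cache and retry.  'qk' is a hypothetical key.
 */
#if 0
	AlignmentCache<uint32_t> cache(1 << 20, false); // 1 MB pool, not shared
	bool added = false;
	QVal<uint32_t>* qv = cache.add(qk, &added);
	if(qv == NULL) {
		cache.clear();              // version_ bumps; old QVals are invalid
		qv = cache.add(qk, &added); // retry against an empty pool
	}
#endif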

/**
 * Interface used to query and update a pair of caches: one thread-
 * local and unsynchronized, another shared and synchronized.  One or
 * both can be NULL.
 */
template <typename index_t>
class AlignmentCacheIface {

public:

	AlignmentCacheIface(
		AlignmentCache<index_t> *current,
		AlignmentCache<index_t> *local,
		AlignmentCache<index_t> *shared) :
		qk_(),
		qv_(NULL),
		cacheable_(false),
		rangen_(0),
		eltsn_(0),
		current_(current),
		local_(local),
		shared_(shared)
	{
		assert(current_ != NULL);
	}

#if 0
	/**
	 * Query the relevant set of caches, looking for a QVal to go with
	 * the provided QKey.  If the QVal is found in a cache other than
	 * the current-read cache, it is copied into the current-read cache
	 * first and the QVal pointer for the current-read cache is
	 * returned.  This function never returns a pointer from any cache
	 * other than the current-read cache.  If the QVal could not be
	 * found in any cache OR if the QVal was found in a cache other
	 * than the current-read cache but could not be copied into the
	 * current-read cache, NULL is returned.
	 */
	QVal* queryCopy(const QKey& qk, bool getLock = true) {
		assert(qk.cacheable());
		AlignmentCache* caches[3] = { current_, local_, shared_ };
		for(int i = 0; i < 3; i++) {
			if(caches[i] == NULL) continue;
			QVal* qv = caches[i]->query(qk, getLock);
			if(qv != NULL) {
				if(i == 0) return qv;
				if(!current_->copy(qk, *qv, *caches[i], getLock)) {
					// Exhausted memory in the current cache while
					// attempting to copy in the qk
					return NULL;
				}
				QVal* curqv = current_->query(qk, getLock);
				assert(curqv != NULL);
				return curqv;
			}
		}
		return NULL;
	}

	/**
	 * Query the relevant set of caches, looking for a QVal to go with
	 * the provided QKey.  If a QVal is found and 'which' is non-NULL,
	 * *which is set to point to the cache where the QVal was found:
	 * the current-read cache, the local across-read cache, or the
	 * shared across-read cache.
	 */
	inline QVal* query(
		const QKey& qk,
		AlignmentCache** which,
		bool getLock = true)
	{
		assert(qk.cacheable());
		AlignmentCache* caches[3] = { current_, local_, shared_ };
		for(int i = 0; i < 3; i++) {
			if(caches[i] == NULL) continue;
			QVal* qv = caches[i]->query(qk, getLock);
			if(qv != NULL) {
				if(which != NULL) *which = caches[i];
				return qv;
			}
		}
		return NULL;
	}
#endif

	/**
	 * This function is called whenever we start to align a new read or
	 * read substring.  We make a key for it and store the key in qk_.
	 * If the sequence is uncacheable, we don't actually add it to the
	 * map but the corresponding reference substrings are still added
	 * to the qlist_.
	 *
	 * Returns:
	 *  -1 if out of memory
	 *   0 if key was found in cache
	 *   1 if key was not found in cache (and there's enough memory to
	 *     add a new key)
	 */
	int beginAlign(
		const BTDnaString& seq,
		const BTString& qual,
		QVal<index_t>& qv, // out: filled in if we find it in the cache
		bool getLock = true)
	{
		assert(repOk());
		qk_.init(seq ASSERT_ONLY(, tmpdnastr_));
		//if(qk_.cacheable() && (qv_ = current_->query(qk_, getLock)) != NULL) {
		//	// qv_ holds the answer
		//	assert(qv_->valid());
		//	qv = *qv_;
		//	resetRead();
		//	return 1; // found in cache
		//} else
		if(qk_.cacheable()) {
			// Make a QNode for this key and possibly add the QNode to the
			// Red-Black map; but if 'seq' isn't cacheable, just create the
			// QNode (without adding it to the map).
			qv_ = current_->add(qk_, &cacheable_, getLock);
		} else {
			qv_ = &qvbuf_;
		}
		if(qv_ == NULL) {
			resetRead();
			return -1; // Not in memory
		}
		qv_->reset();
		return 0; // Need to search for it
	}
	ASSERT_ONLY(BTDnaString tmpdnastr_);

	/**
	 * Called when the caller is finished aligning a read (and so is
	 * finished adding associated reference strings).  Returns a copy
	 * of the final QVal object and resets the alignment state of the
	 * current-read cache.
	 *
	 * Also, if the alignment is cacheable, it commits it to the next
	 * cache up in the cache hierarchy.
	 */
	QVal<index_t> finishAlign(bool getLock = true) {
		if(!qv_->valid()) {
			qv_->init(0, 0, 0);
		}
		// Copy this pointer because we're about to reset the qv_ field
		// to NULL
		QVal<index_t>* qv = qv_;
		// Commit the contents of the current-read cache to the next
		// cache up in the hierarchy.
		// If qk is cacheable, then it must be in the cache
#if 0
		if(qk_.cacheable()) {
			AlignmentCache* caches[3] = { current_, local_, shared_ };
			ASSERT_ONLY(AlignmentCache* which);
			ASSERT_ONLY(QVal* qv2 = query(qk_, &which, true));
			assert(qv2 == qv);
			assert(which == current_);
			for(int i = 1; i < 3; i++) {
				if(caches[i] != NULL) {
					// Copy this key/value pair to the higher
					// level cache and, if its memory is exhausted,
					// clear the cache and try again.
					caches[i]->clearCopy(qk_, *qv_, *current_, getLock);
					break;
				}
			}
		}
#endif
		// Reset the state in this iface in preparation for the next
		// alignment.
		resetRead();
		assert(repOk());
		return *qv;
	}

	/**
	 * A call to this member indicates that the caller has finished
	 * with the last read (if any) and is ready to work on the next.
	 * This gives the cache a chance to reset some of its state if
	 * necessary.
	 */
	void nextRead() {
		current_->clear();
		resetRead();
		assert(!aligning());
	}

	/**
	 * Return true iff we're in the middle of aligning a sequence.
	 */
	bool aligning() const {
		return qv_ != NULL;
	}

	/**
	 * Clears both the local and shared caches.
	 */
	void clear() {
		if(current_ != NULL) current_->clear();
		if(local_ != NULL) local_->clear();
		if(shared_ != NULL) shared_->clear();
	}

	/**
	 * Add an alignment to the running list of alignments being
	 * compiled for the current read in the local cache.
	 */
	bool addOnTheFly(
		const BTDnaString& rfseq, // reference sequence close to read seq
		index_t topf,             // top in BWT index
		index_t botf,             // bot in BWT index
		index_t topb,             // top in BWT' index
		index_t botb,             // bot in BWT' index
		bool getLock = true)      // true -> lock is not held by caller
	{
		assert(aligning());
		assert(repOk());
		ASSERT_ONLY(BTDnaString tmp);
		SAKey sak(rfseq ASSERT_ONLY(, tmp));
		//assert(sak.cacheable());
		if(current_->addOnTheFly((*qv_), sak, topf, botf, topb, botb, getLock)) {
			rangen_++;
			eltsn_ += (botf-topf);
			return true;
		}
		return false;
	}

	/**
	 * Given a QVal, populate the given EList of SATuples with records
	 * describing all of the cached information about the QVal's
	 * reference substrings.
	 */
	template<int S>
	void queryQval(
		const QVal<index_t>& qv,
		EList<SATuple<index_t>, S>& satups,
		index_t& nrange,
		index_t& nelt,
		bool getLock = true)
	{
		current_->queryQval(qv, satups, nrange, nelt, getLock);
	}

	/**
	 * Return a pointer to the current-read cache object.
	 */
	const AlignmentCache<index_t>* currentCache() const { return current_; }

	index_t curNumRanges() const { return rangen_; }
	index_t curNumElts() const { return eltsn_; }

#ifndef NDEBUG
	/**
	 * Check that AlignmentCacheIface is internally consistent.
	 */
	bool repOk() const {
		assert(current_ != NULL);
		assert_geq(eltsn_, rangen_);
		if(qv_ == NULL) {
			assert_eq(0, rangen_);
			assert_eq(0, eltsn_);
		}
		return true;
	}
#endif

	/**
	 * Return the alignment cache for the current read.
	 */
	const AlignmentCache<index_t>& current() {
		return *current_;
	}

protected:

	/**
	 * Reset fields encoding info about the in-process read.
	 */
	void resetRead() {
		cacheable_ = false;
		rangen_ = eltsn_ = 0;
		qv_ = NULL;
	}

	QKey qk_;             // key representation for current read substring
	QVal<index_t> *qv_;   // pointer to value representation for current read substring
	QVal<index_t> qvbuf_; // buffer for when key is uncacheable but we need a qv
	bool cacheable_;      // true iff the read substring currently being aligned is cacheable

	index_t rangen_; // number of ranges since last alignment job began
	index_t eltsn_;  // number of elements since last alignment job began

	AlignmentCache<index_t> *current_; // cache dedicated to the current read
	AlignmentCache<index_t> *local_;   // local, unsynchronized cache
	AlignmentCache<index_t> *shared_;  // shared, synchronized cache
};
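
/*
 * Sketch, not part of the build: the per-read protocol an aligner follows
 * against the interface.  'iface', 'seq', 'qual' and 'rfseq' (plus the
 * BWT range values) are hypothetical.
 */
#if 0
	iface.nextRead();             // turn over the current-read cache
	QVal<uint32_t> qv;
	int ret = iface.beginAlign(seq, qual, qv);
	if(ret == 0) {
		// Search; for each similar reference substring found:
		iface.addOnTheFly(rfseq, topf, botf, topb, botb);
		qv = iface.finishAlign(); // returns the final QVal, resets state
	} else if(ret == -1) {
		// Out of pool memory; caller may clear() and retry
	}
#endif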

#ifndef NDEBUG
/**
 * Check that this QVal is internally consistent and consistent
 * with the contents of the given cache.
 */
template <typename index_t>
bool QVal<index_t>::repOk(const AlignmentCache<index_t>& ac) const {
	if(rangen_ > 0) {
		assert_lt(i_, ac.qSize());
		assert_leq(i_ + rangen_, ac.qSize());
	}
	assert_geq(eltn_, rangen_);
	return true;
}
#endif

#ifndef NDEBUG
/**
 * Check that this SAVal is internally consistent and consistent
 * with the contents of the given cache.
 */
template <typename index_t>
bool SAVal<index_t>::repOk(const AlignmentCache<index_t>& ac) const {
	assert(len == 0 || i < ac.saSize());
	assert_leq(i + len, ac.saSize());
	return true;
}
#endif

/**
 * Add a new association between a read sequence ('seq') and a
 * reference sequence ('')
 */
template <typename index_t>
bool AlignmentCache<index_t>::addOnTheFly(
	QVal<index_t>& qv, // qval that points to the range of reference substrings
	const SAKey& sak,  // the key holding the reference substring
	index_t topf,      // top range elt in BWT index
	index_t botf,      // bottom range elt in BWT index
	index_t topb,      // top range elt in BWT' index
	index_t botb,      // bottom range elt in BWT' index
	bool getLock)
{
	ThreadSafe ts(lockPtr(), shared_ && getLock);
	bool added = true;
	// If this is the first reference sequence we're associating with
	// the query sequence, initialize the QVal.
	if(!qv.valid()) {
		qv.init((index_t)qlist_.size(), 0, 0);
	}
	qv.addRange(botf-topf); // update tally for # ranges and # elts
	if(!qlist_.add(pool(), sak)) {
		return false; // Exhausted pool memory
	}
#ifndef NDEBUG
	for(index_t i = qv.offset(); i < qlist_.size(); i++) {
		if(i > qv.offset()) {
			assert(qlist_.get(i) != qlist_.get(i-1));
		}
	}
#endif
	assert_eq(qv.offset() + qv.numRanges(), qlist_.size());
	SANode *s = samap_.add(pool(), sak, &added);
	if(s == NULL) {
		return false; // Exhausted pool memory
	}
	assert(s->key.repOk());
	if(added) {
		s->payload.i = (index_t)salist_.size();
		s->payload.len = botf - topf;
		s->payload.topf = topf;
		s->payload.topb = topb;
		for(size_t j = 0; j < (botf-topf); j++) {
			if(!salist_.add(pool(), (index_t)0xffffffff)) {
				// Change the payload's len field
				s->payload.len = (uint32_t)j;
				return false; // Exhausted pool memory
			}
		}
		assert(s->payload.repOk(*this));
	}
	// Now that we know all allocations have succeeded, we can do a few
	// final updates
	return true;
}

#endif /*ALIGNER_CACHE_H_*/