/* * Copyright 2011, Ben Langmead * * This file is part of Bowtie 2. * * Bowtie 2 is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Bowtie 2 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>. */ /* * group_walk.h * * Classes and routines for walking a set of BW ranges backwards from the edge * of a seed hit with the goal of resolving the offset of each row in each * range. Here "offset" means offset into the concatenated string of all * references. The main class is 'GroupWalk' and an important helper is * 'GWState'. * * For each combination of seed offset and orientation, there is an associated * QVal. Each QVal describes a (possibly empty) set of suffix array ranges. * Call these "seed range sets." Each range in the set is "backed" by a range * of the salist, represented as a PListSlice. Such a range is the origin of a * walk. * * When an offset is resolved, it is entered into the salist via the * PListSlice. Note that other routines in this same thread might also be * setting elements of the salist, so routines here should expect that elements * can go from unresolved to resolved at any time. * * What bookkeeping do we have to do as we walk? Before the first step, we * convert the initial QVal into a list of SATuples; the SATuples are our link * to the corresponding ranges in the suffix array. The list of SATuples is * then converted to a list of GWState objects; these keep track of where we * are in our walk (e.g. 
what 'top' and 'bot' are, how many steps have we gone, * etc) as well as how the elements in the current range correspond to elements * from the original range. * * The user asks the GroupWalk to resolve another offset by calling advance(). * advance() can be called in various ways: * * (a) The user can request that the GroupWalk proceed until a * *particular* element is resolved, then return that resolved * element. Other elements may be resolved along the way, but * those results are buffered and may be dispensed in future calls * to advance(). * * (b) The user can request that the GroupWalk select an as-yet- * unreported element at random and proceed until that element * is resolved and report it. Again, other elements may be * resolved along the way but they are buffered. * * (c) The user can request that the GroupWalk resolve elements in a * particular BW range (with a particular offset and orientation) * in an order of its choosing. The GroupWalk in this case * attempts to resolve as many offsets as possible as quickly as * possible, and returns them as soon as they're found. The res_ * buffer is used in this case. * * (d) Like (c) but resolving elements at a particular offset and * orientation instead of at a specific BW range. The res_ buffer * is used in this case, since there's a chance that the * * There are simple ways to heuristically reduce the problem size while * maintaining randomness. For instance, the user can put a ceiling on the * number of elements that we walk from any given seed offset or range. * We can then trim away random subranges to reduce the size of the * problem. There is no need for the caller to do this for us. */ #ifndef GROUP_WALK_H_ #define GROUP_WALK_H_ #include #include #include "ds.h" #include "gfm.h" #include "read.h" #include "reference.h" #include "mem_ids.h" /** * Encapsulate an SA range and an associated list of slots where the resolved * offsets can be placed. 
*/ template class SARangeWithOffs { public: SARangeWithOffs() { reset(); }; SARangeWithOffs( index_t tf, index_t bf, index_t ntf, index_t nbf, const EList >& n_iedge_count, size_t len, const T& o) { init(tf, bf, ntf, nbf, n_iedge_count, len, o); } void init( index_t tf, index_t bf, index_t ntf, index_t nbf, const EList >& n_iedge_count, size_t len_, const T& o) { topf = tf; botf = bf; assert_lt(topf, botf); node_top = ntf; node_bot = nbf; assert_leq(node_bot - node_top, botf - topf); node_iedge_count = n_iedge_count; len = len_, offs = o; } /** * Reset to uninitialized state. */ void reset() { topf = (index_t)INDEX_MAX; } /** * Return true if this is initialized. */ bool inited() const { return topf != (index_t)INDEX_MAX; } /** * Return the number of times this reference substring occurs in the * reference, which is also the size of the 'offs' TSlice. */ size_t size() const { return offs.size(); } index_t topf; // top in GBWT index index_t botf; index_t node_top; // top node index_t node_bot; EList > node_iedge_count; size_t len; // length of the reference sequence involved T offs; // offsets }; /** * A group of per-thread state that can be shared between all the GroupWalks * used in that thread. */ template struct GroupWalkState { GroupWalkState(int cat) : map(cat) { masks[0].setCat(cat); masks[1].setCat(cat); masks[2].setCat(cat); masks[3].setCat(cat); } EList masks[4]; // temporary list for masks; used in GWState EList map; // temporary list of GWState maps }; /** * Encapsulates counters that encode how much work the walk-left logic * has done. */ struct WalkMetrics { WalkMetrics() { reset(); } /** * Sum each across this object and 'm'. This is the only safe way * to update a WalkMetrics shared by many threads. */ void merge(const WalkMetrics& m, bool getLock = false) { ThreadSafe ts(&mutex_m, getLock); bwops += m.bwops; branches += m.branches; resolves += m.resolves; refresolves += m.refresolves; reports += m.reports; } /** * Set all to 0. 
*/ void reset() { bwops = branches = resolves = refresolves = reports = 0; } uint64_t bwops; // Burrows-Wheeler operations uint64_t branches; // BW range branch-offs uint64_t resolves; // # offs resolved with BW walk-left uint64_t refresolves; // # resolutions caused by reference scanning uint64_t reports; // # offs reported (1 can be reported many times) MUTEX_T mutex_m; }; /** * Coordinates for a BW element that the GroupWalk might resolve. */ template struct GWElt { GWElt() { reset(); } /** * Reset GWElt to uninitialized state. */ void reset() { offidx = range = elt = len = (index_t)OFF_MASK; fw = false; } /** * Initialize this WalkResult. */ void init( index_t oi, bool f, index_t r, index_t e, index_t l) { offidx = oi; fw = f; range = r; elt = e; len = l; } /** * Return true iff this GWElt and the given GWElt refer to the same * element. */ bool operator==(const GWElt& o) const { return offidx == o.offidx && fw == o.fw && range == o.range && elt == o.elt && len == o.len; } /** * Return true iff this GWElt and the given GWElt refer to * different elements. */ bool operator!=(const GWElt& o) const { return !(*this == o); } index_t offidx; // seed offset index bool fw; // strand index_t range; // range index_t elt; // element index_t len; // length }; /** * A record encapsulating the result of looking up one BW element in * the Bowtie index. */ template struct WalkResult { WalkResult() { reset(); } /** * Reset GWElt to uninitialized state. */ void reset() { elt.reset(); bwrow = toff = (index_t)OFF_MASK; } /** * Initialize this WalkResult. 
*/ void init( index_t oi, // seed offset index bool f, // strand index_t r, // range index_t e, // element index_t bwr, // BW row index_t len, // length index_t to) // text offset { elt.init(oi, f, r, e, len); bwrow = bwr; toff = to; } GWElt elt; // element resolved index_t bwrow; // SA row resolved index_t toff; // resolved offset from SA sample }; /** * A GW hit encapsulates an SATuple describing a reference substring * in the cache, along with a bool indicating whether each element of * the hit has been reported yet. */ template class GWHit { public: GWHit() : fmap(0, GW_CAT), offidx((index_t)OFF_MASK), fw(false), range((index_t)OFF_MASK), len((index_t)OFF_MASK), reported_(0, GW_CAT), nrep_(0) { assert(repOkBasic()); } /** * Initialize with a new SA range. Resolve the done vector so that * there's one bool per suffix array element. */ void init( SARangeWithOffs& sa, index_t oi, bool f, index_t r) { nrep_ = 0; offidx = oi; fw = f; range = r; len = (index_t)sa.len; reported_.resize(sa.offs.size()); reported_.fill(false); fmap.resize(sa.offs.size()); fmap.fill(make_pair((index_t)OFF_MASK, (index_t)OFF_MASK)); } /** * Clear contents of sat and done. */ void reset() { reported_.clear(); fmap.clear(); nrep_ = 0; offidx = (index_t)OFF_MASK; fw = false; range = (index_t)OFF_MASK; len = (index_t)OFF_MASK; } #ifndef NDEBUG /** * Check that GWHit is internally consistent. If a pointer to an * EList of GWStates is given, we assume that it is the EList * corresponding to this GWHit and check whether the forward and * reverse mappings match up for the as-yet-unresolved elements. 
*/ bool repOk(const SARangeWithOffs& sa) const { assert_eq(reported_.size(), sa.offs.size()); assert_eq(fmap.size(), sa.offs.size()); // Shouldn't be any repeats among as-yet-unresolveds size_t nrep = 0; for(size_t i = 0; i < fmap.size(); i++) { if(reported_[i]) nrep++; if(sa.offs[i] != (index_t)OFF_MASK) { continue; } for(size_t j = i+1; j < fmap.size(); j++) { if(sa.offs[j] != (index_t)OFF_MASK) { continue; } assert(fmap[i] != fmap[j]); } } assert_eq(nrep_, nrep); return true; } /** * Return true iff this GWHit is not obviously corrupt. */ bool repOkBasic() { return true; } #endif /** * Set the ith element to be reported. */ void setReported(index_t i) { assert(!reported_[i]); assert_lt(i, reported_.size()); reported_[i] = true; nrep_++; } /** * Return true iff element i has been reported. */ bool reported(index_t i) const { assert_lt(i, reported_.size()); return reported_[i]; } /** * Return true iff all elements have been reported. */ bool done() const { assert_leq(nrep_, reported_.size()); return nrep_ == reported_.size(); } EList, 16> fmap; // forward map; to GWState & elt index_t offidx; // offset idx bool fw; // orientation index_t range; // original range index index_t len; // length of hit protected: EList reported_; // per-elt bool indicating whether it's been reported index_t nrep_; }; /** * Encapsulates the progress made along a particular path from the original * range. */ template class GWState { public: GWState() : map_(0, GW_CAT) { reset(); assert(repOkBasic()); } /** * Initialize this GWState with new gfm, top, bot, step, and sat. * * We assume map is already set up. * * Returns true iff at least one elt was resolved. */ template pair init( const GFM& gfm, // index to walk left in const BitPairReference& ref, // bitpair-encoded reference SARangeWithOffs& sa, // SA range with offsets EList& sts, // EList of GWStates for range being advanced GWHit& hit, // Corresponding hit structure index_t range, // which range is this? 
bool reportList, // if true, "report" resolved offsets immediately by adding them to 'res' list EList, 16>* res, // EList where resolved offsets should be appended index_t tp, // top of range at this step index_t bt, // bot of range at this step index_t n_tp, // node at top index_t n_bt, // node at bot const EList >& n_iedge_count, index_t st, // # steps taken to get to this step WalkMetrics& met) { assert_gt(bt, tp); assert_gt(n_bt, n_tp); assert_geq(bt - tp, n_bt - n_tp); assert_lt(range, sts.size()); top = tp; bot = bt; node_top = n_tp; node_bot = n_bt; node_iedge_count = n_iedge_count; step = st; assert(!inited_); ASSERT_ONLY(inited_ = true); ASSERT_ONLY(lastStep_ = step-1); return init(gfm, ref, sa, sts, hit, range, reportList, res, met); } /** * Initialize this GWState. * * We assume map is already set up, and that 'step' is equal to the * number of steps taken to get to the new top/bot pair *currently* * in the top and bot fields. * * Returns a pair of numbers, the first being the number of * resolved but unreported offsets found during this advance, the * second being the number of as-yet-unresolved offsets. */ template pair init( const GFM& gfm, // forward Bowtie index const BitPairReference& ref, // bitpair-encoded reference SARangeWithOffs& sa, // SA range with offsets EList& st, // EList of GWStates for advancing range GWHit& hit, // Corresponding hit structure index_t range, // range being inited bool reportList, // report resolutions, adding to 'res' list? EList, 16>* res, // EList to append resolutions WalkMetrics& met) // update these metrics { assert(inited_); assert_eq(step, lastStep_+1); ASSERT_ONLY(lastStep_++); assert_leq((index_t)step, gfm.gh().len()); assert_lt(range, st.size()); pair ret = make_pair(0, 0); index_t trimBegin = 0, trimEnd = 0; bool empty = true; // assume all resolved until proven otherwise // Commit new information, if any, to the PListSlide. Also, // trim and check if we're done. 
assert_eq(node_bot - node_top, map_.size()); ASSERT_ONLY(index_t num_orig_iedges = 0, orig_e = 0); index_t num_iedges = 0, e = 0; for(size_t i = mapi_; i < map_.size(); i++) { bool resolved = (off((index_t)i, sa) != (index_t)OFF_MASK); if(!resolved) { #ifndef NDEBUG while(orig_e < sa.node_iedge_count.size()) { if(map((index_t)i) <= sa.node_iedge_count[orig_e].first) { break; } num_orig_iedges += sa.node_iedge_count[orig_e].second; orig_e++; } #endif while(e < node_iedge_count.size()) { if(i <= node_iedge_count[e].first) { break; } num_iedges += node_iedge_count[e].second; e++; } // Elt not resolved yet; try to resolve it now index_t bwrow = (index_t)(top + i + num_iedges); index_t node = (index_t)(node_top + i); index_t toff = gfm.tryOffset(bwrow, node); ASSERT_ONLY(index_t origBwRow = sa.topf + map((index_t)i) + num_orig_iedges); ASSERT_ONLY(index_t origNode = sa.node_top + map((index_t)i)); assert_eq(bwrow, gfm.walkLeft(origBwRow, step)); if(toff != (index_t)OFF_MASK) { // Yes, toff was resolvable assert_eq(toff, gfm.getOffset(bwrow, node)); met.resolves++; toff += step; assert_eq(toff, gfm.getOffset(origBwRow, origNode)); setOff((index_t)i, toff, sa, met); if(!reportList) ret.first++; #if 0 // used to be #ifndef NDEBUG, but since we no longer require that the reference // string info be included, this is no longer relevant. // Sanity check that the reference characters under this // hit match the seed characters in hit.satup->key.seq. // This is NOT a check that we associated the exact right // text offset with the BW row. This is an important // distinction because when resolved offsets are filled in // via refernce scanning, they are not necessarily the // exact right text offsets to associate with the // respective BW rows but they WILL all be correct w/r/t // the reference sequence underneath, which is what really // matters here. 
index_t tidx = (index_t)OFF_MASK, tof, tlen; bool straddled = false; gfm.joinedToTextOff( hit.len, // length of seed toff, // offset in joined reference string tidx, // reference sequence id tof, // offset in reference coordinates tlen, // length of reference sequence true, // don't reject straddlers straddled); if(tidx != (index_t)OFF_MASK && hit.satup->key.seq != std::numeric_limits::max()) { // key: 2-bit characters packed into a 64-bit word with // the least significant bitpair corresponding to the // rightmost character on the Watson reference strand. uint64_t key = hit.satup->key.seq; for(int64_t j = tof + hit.len-1; j >= tof; j--) { // Get next reference base to the left int c = ref.getBase(tidx, j); assert_range(0, 3, c); // Must equal least significant bitpair of key if(c != (int)(key & 3)) { // Oops; when we jump to the piece of the // reference where the seed hit is, it doesn't // match the seed hit. Before dying, check // whether we have the right spot in the joined // reference string SString jref; gfm.restore(jref); uint64_t key2 = hit.satup->key.seq; for(int64_t k = toff + hit.len-1; k >= toff; k--) { int c = jref[k]; assert_range(0, 3, c); assert_eq(c, (int)(key2 & 3)); key2 >>= 2; } assert(false); } key >>= 2; } } #endif } } // Is the element resolved? We ask this regardless of how it was // resolved (whether this function did it just now, whether it did // it a while ago, or whether some other function outside GroupWalk // did it). 
if(off((index_t)i, sa) != (index_t)OFF_MASK) { if(reportList && !hit.reported(map((index_t)i))) { // Report it index_t toff = off((index_t)i, sa); assert(res != NULL); res->expand(); index_t origBwRow = sa.topf + map((index_t)i); res->back().init( hit.offidx, // offset idx hit.fw, // orientation hit.range, // original range index map((index_t)i), // original element offset origBwRow, // BW row resolved hit.len, // hit length toff); // text offset hit.setReported(map((index_t)i)); met.reports++; } // Offset resolved if(empty) { // Haven't seen a non-empty entry yet, so we // can trim this from the beginning. trimBegin++; } else { trimEnd++; } } else { // Offset not yet resolved ret.second++; trimEnd = 0; empty = false; // Set the forward map in the corresponding GWHit // object to point to the appropriate element of our // range assert_geq(i, mapi_); index_t bmap = map((index_t)i); hit.fmap[bmap].first = range; hit.fmap[bmap].second = (index_t)i; #ifndef NDEBUG for(size_t j = 0; j < bmap; j++) { if(sa.offs[j] == (index_t)OFF_MASK && hit.fmap[j].first == range) { assert_neq(i, hit.fmap[j].second); } } #endif } } // Trim from beginning assert_geq(trimBegin, 0); mapi_ += trimBegin; if(trimBegin > 0) { top += trimBegin; index_t e = 0; for(; e < node_iedge_count.size(); e++) { if(node_iedge_count[e].first >= trimBegin) break; assert_geq(top, node_iedge_count[e].second); top += node_iedge_count[e].second; } if(e > 0) node_iedge_count.erase(0, e); for(e = 0; e < node_iedge_count.size(); e++) { assert_geq(node_iedge_count[e].first, trimBegin); node_iedge_count[e].first -= trimBegin; } } node_top += trimBegin; if(trimEnd > 0) { // Trim from end map_.resize(map_.size() - trimEnd); bot -= trimEnd; index_t node_range = node_bot - node_top; while(node_iedge_count.size() > 0) { if(node_iedge_count.back().first < (node_range - trimEnd)) break; assert_geq(bot, node_iedge_count.back().second); bot -= node_iedge_count.back().second; node_iedge_count.pop_back(); } } node_bot -= 
trimEnd; #ifndef NDEBUG assert_leq(node_top, node_bot); index_t num_nodes = node_bot - node_top; index_t add = 0; for(index_t e = 0; e < node_iedge_count.size(); e++) { assert_lt(node_iedge_count[e].first, num_nodes); add += node_iedge_count[e].second; } assert_eq(bot - top, num_nodes + add); #endif if(empty) { assert(done()); #ifndef NDEBUG // If range is done, all elements from map should be // resolved for(size_t i = mapi_; i < map_.size(); i++) { assert_neq((index_t)OFF_MASK, off((index_t)i, sa)); } // If this range is done, then it should be the case that // all elements in the corresponding GWHit that point to // this range are resolved. for(size_t i = 0; i < hit.fmap.size(); i++) { if(sa.offs[i] == (index_t)OFF_MASK) { assert_neq(range, hit.fmap[i].first); } } #endif return ret; } else { assert(!done()); } // Is there a dollar sign in the middle of the range? tmp_zOffs.clear(); for(index_t i = 0; i < gfm._zOffs.size(); i++) { #ifndef NDEBUG if(i > 0) { assert_lt(gfm._zOffs[i-1], gfm._zOffs[i]); } #endif assert_neq(top, gfm._zOffs[i]); // assert_neq(bot-1, gfm._zOffs[i]); if(gfm._zOffs[i] > top && gfm._zOffs[i] < bot) { tmp_zOffs.push_back(gfm._zOffs[i]); } } // Yes, the dollar sign is in the middle of this range. We // must split it into the two ranges on either side of the // dollar. Let 'bot' and 'top' delimit the portion of the // range prior to the dollar. if(tmp_zOffs.size() > 0) { tmp_gbwt_to_node.clear(); index_t n = 0, e = 0; for(index_t r = 0; r < (bot - top); r++) { tmp_gbwt_to_node.push_back(n); if(e < node_iedge_count.size()) { assert_leq(n, node_iedge_count[e].first); if(n == node_iedge_count[e].first) { for(index_t a = 0; a < node_iedge_count[e].second; a++) { tmp_gbwt_to_node.push_back(n); r++; } e++; } } n++; } assert_eq(bot - top, tmp_gbwt_to_node.size()); for(index_t i = 0; i < tmp_zOffs.size(); i++) { // Note: might be able to do additional trimming off the end. // Create a new range for the portion after the dollar. 
index_t new_top = tmp_zOffs[i] + 1; if(i + 1 < tmp_zOffs.size() && new_top == tmp_zOffs[i+1]) { continue; } assert_leq(new_top - top, tmp_gbwt_to_node.size()); if(new_top - top == tmp_gbwt_to_node.size()) { break; } index_t new_node_top = tmp_gbwt_to_node[new_top - top] + node_top; assert_lt(new_node_top, node_bot); index_t new_bot; if(i + 1 < tmp_zOffs.size()) { new_bot = tmp_zOffs[i+1]; } else { new_bot = bot; } index_t new_node_bot = node_bot; if(new_bot - top < tmp_gbwt_to_node.size()) { new_node_bot = node_top + tmp_gbwt_to_node[new_bot - top]; if(new_bot - top > 0 && tmp_gbwt_to_node[new_bot - top] == tmp_gbwt_to_node[new_bot - top - 1]) { new_node_bot++; } } tmp_node_iedge_count.clear(); if(new_top >= new_bot) continue; for(index_t j = new_top - top; j + 1 < new_bot - top;) { index_t n = tmp_gbwt_to_node[j]; index_t j2 = j + 1; while(j2 < new_bot - top) { if(n != tmp_gbwt_to_node[j2]) { break; } j2++; } if(j + 1 < j2) { tmp_node_iedge_count.expand(); assert_lt(node_top, new_node_top); tmp_node_iedge_count.back().first = n - (new_node_top - node_top); tmp_node_iedge_count.back().second = j2 - j - 1; } j = j2; } st.expand(); st.back().reset(); st.back().initMap(new_node_bot - new_node_top); for(index_t j = new_node_top; j < new_node_bot; j++) { st.back().map_[j - new_node_top] = map(j - node_top + mapi_); } st.back().init( gfm, ref, sa, st, hit, (index_t)st.size()-1, reportList, res, new_top, new_bot, new_node_top, new_node_bot, tmp_node_iedge_count, step, met); } assert_eq((index_t)map_.size(), node_bot - node_top + mapi_); bot = tmp_zOffs[0]; assert_lt(bot - top, tmp_gbwt_to_node.size()); node_bot = tmp_gbwt_to_node[bot - top - 1] + node_top + 1; map_.resize(node_bot - node_top + mapi_); index_t width = node_bot - node_top; for(index_t e = 0; e < node_iedge_count.size(); e++) { if(node_iedge_count[e].first >= node_bot - node_top) { node_iedge_count.resize(e); break; } width += node_iedge_count[e].second; } if(width != bot - top) { assert_eq(width, bot - top 
+ 1); assert_gt(node_iedge_count.size(), 0); assert_gt(node_iedge_count.back().second, 0); node_iedge_count.back().second -= 1; if(node_iedge_count.back().second == 0) { node_iedge_count.resize(node_iedge_count.size()- 1); } } } assert_gt(bot, top); // Prepare SideLocus's for next step if(bot-top > 1) { SideLocus::initFromTopBot(top, bot, gfm.gh(), gfm.gfm(), tloc, bloc); assert(tloc.valid()); assert(tloc.repOk(gfm.gh())); assert(bloc.valid()); assert(bloc.repOk(gfm.gh())); } else { tloc.initFromRow(top, gfm.gh(), gfm.gfm()); assert(tloc.valid()); assert(tloc.repOk(gfm.gh())); bloc.invalidate(); } return ret; } #ifndef NDEBUG /** * Check if this GWP is internally consistent. */ bool repOk( const GFM& gfm, GWHit& hit, index_t range) const { assert(done() || bot > top); assert(doneResolving(hit) || (tloc.valid() && tloc.repOk(gfm.gh()))); assert(doneResolving(hit) || bot == top+1 || (bloc.valid() && bloc.repOk(gfm.gh()))); assert_eq(map_.size()-mapi_, bot-top); // Make sure that 'done' is compatible with whether we have >= // 1 elements left to resolve. int left = 0; for(size_t i = mapi_; i < map_.size(); i++) { ASSERT_ONLY(index_t row = (index_t)(top + i - mapi_)); ASSERT_ONLY(index_t origRow = hit.satup->topf + map(i)); assert(step == 0 || row != origRow); assert_eq(row, gfm.walkLeft(origRow, step)); assert_lt(map_[i], hit.satup->offs.size()); if(off(i, hit) == (index_t)OFF_MASK) left++; } assert(repOkMapRepeats()); assert(repOkMapInclusive(hit, range)); return true; } /** * Return true iff this GWState is not obviously corrupt. */ bool repOkBasic() { assert_geq(bot, top); return true; } /** * Check that the fmap elements pointed to by our map_ include all * of the fmap elements that point to this range. 
*/ bool repOkMapInclusive(GWHit& hit, index_t range) const { for(size_t i = 0; i < hit.fmap.size(); i++) { if(hit.satup->offs[i] == (index_t)OFF_MASK) { if(range == hit.fmap[i].first) { ASSERT_ONLY(bool found = false); for(size_t j = mapi_; j < map_.size(); j++) { if(map(j) == i) { ASSERT_ONLY(found = true); break; } } assert(found); } } } return true; } /** * Check that no two elements in map_ are the same. */ bool repOkMapRepeats() const { for(size_t i = mapi_; i < map_.size(); i++) { for(size_t j = i+1; j < map_.size(); j++) { assert_neq(map_[i], map_[j]); } } return true; } #endif /** * Return the offset currently assigned to the ith element. If it * has not yet been resolved, return 0xffffffff. */ index_t off( index_t i, const SARangeWithOffs& sa) { assert_geq(i, mapi_); assert_lt(i, map_.size()); assert_lt(map_[i], sa.offs.size()); return sa.offs.get(map_[i]); } /** * Return the offset of the element within the original range's * PListSlice that the ith element of this range corresponds to. */ index_t map(index_t i) const { assert_geq(i, mapi_); assert_lt(i, map_.size()); return map_[i]; } /** * Return the offset of the first untrimmed offset in the map. */ index_t mapi() const { return mapi_; } /** * Return number of active elements in the range being tracked by * this GWState. */ index_t size() const { return (index_t)(map_.size() - mapi_); } /** * Return true iff all elements in this leaf range have been * resolved. */ bool done() const { return size() == 0; } /** * Set the PListSlice element that corresponds to the ith element * of 'map' to the specified offset. */ void setOff( index_t i, index_t off, SARangeWithOffs& sa, WalkMetrics& met) { assert_lt(i + mapi_, map_.size()); assert_lt(map_[i + mapi_], sa.offs.size()); size_t saoff = map_[i + mapi_]; sa.offs[saoff] = off; assert_eq(off, sa.offs[saoff]); } /** * Advance this GWState by one step (i.e. one BW operation). 
In * the event of a "split", more elements are added to the EList * 'st', which must have room for at least 3 more elements without * needing another expansion. If an expansion of 'st' is * triggered, this GWState object becomes invalid. * * Returns a pair of numbers, the first being the number of * resolved but unreported offsets found during this advance, the * second being the number of as-yet-unresolved offsets. */ template pair advance( const GFM& gfm, // the forward Bowtie index, for stepping left const BitPairReference& ref, // bitpair-encoded reference SARangeWithOffs& sa, // SA range with offsets GWHit& hit, // the associated GWHit object index_t range, // which range is this? bool reportList, // if true, "report" resolved offsets immediately by adding them to 'res' list EList, 16>* res, // EList where resolved offsets should be appended EList& st, // EList of GWStates for range being advanced GroupWalkState& gws, // temporary storage for masks WalkMetrics& met, PerReadMetrics& prm) { ASSERT_ONLY(index_t origTop = top); ASSERT_ONLY(index_t origBot = bot); assert_geq(step, 0); assert_eq(step, lastStep_); // assert_geq(st.capacity(), st.size() + 4); assert(tloc.valid()); assert(tloc.repOk(gfm.gh())); assert_eq(node_bot-node_top, (index_t)(map_.size()-mapi_)); pair ret = make_pair(0, 0); assert_eq(top, tloc.toBWRow(gfm.gh())); if(bot - top > 1) { bool first = true; ASSERT_ONLY(index_t sum = 0); index_t newtop = 0, newbot = 0; index_t new_node_top = 0, new_node_bot = 0; gws.map.clear(); // Still multiple elements being tracked index_t curtop = top, curbot = bot; index_t cur_node_top = node_top, cur_node_bot = node_bot; for(index_t e = 0; e < node_iedge_count.size() + 1; e++) { if(e >= node_iedge_count.size()) { if(e > 0) { curtop = curbot + node_iedge_count[e-1].second; curbot = bot; if(curtop >= curbot) { assert_eq(curtop, curbot); break; } cur_node_top = cur_node_bot; cur_node_bot = node_bot; } } else { if(e > 0) { curtop = curbot + 
node_iedge_count[e-1].second; assert_lt(node_iedge_count[e-1].first, node_iedge_count[e].first); curbot = curtop + (node_iedge_count[e].first - node_iedge_count[e-1].first); cur_node_top = cur_node_bot; } else { curbot = curtop + node_iedge_count[e].first + 1; } cur_node_bot = node_top + node_iedge_count[e].first + 1; } assert_lt(curtop, curbot); index_t upto[4], in[4]; upto[0] = in[0] = upto[1] = in[1] = upto[2] = in[2] = upto[3] = in[3] = 0; // assert_eq(bot, bloc.toBWRow(gfm.gh())); met.bwops++; prm.nExFmops++; // Assert that there's not a dollar sign in the middle of // this range #ifndef NDEBUG for(index_t i = 0; i < gfm._zOffs.size(); i++) { assert(curbot <= gfm._zOffs[i] || curtop > gfm._zOffs[i]); } #endif SideLocus curtloc, curbloc; SideLocus::initFromTopBot(curtop, curbot, gfm.gh(), gfm.gfm(), curtloc, curbloc); gfm.mapLFRange(curtloc, curbloc, curbot-curtop, upto, in, gws.masks); #ifndef NDEBUG for(int i = 0; i < 4; i++) { assert_eq(curbot-curtop, (index_t)(gws.masks[i].size())); } #endif for(int i = 0; i < 4; i++) { if(in[i] > 0) { // Non-empty range resulted if(first) { // For the first one, first = false; pair range, node_range; backup_node_iedge_count.clear(); SideLocus::initFromTopBot(curtop, curbot, gfm.gh(), gfm.gfm(), curtloc, curbloc); range = gfm.mapGLF(curtloc, curbloc, i, &node_range, &backup_node_iedge_count, cur_node_bot - cur_node_top); newtop = range.first; newbot = range.second; new_node_top = node_range.first; new_node_bot = node_range.second; // Range narrowed so we have to look at the masks for(size_t j = 0; j < gws.masks[i].size(); j++) { assert_lt(j+mapi_+(cur_node_top - node_top), map_.size()); if(gws.masks[i][j]) { gws.map.push_back(map_[j+mapi_+(cur_node_top - node_top)]); assert(gws.map.size() <= 1 || gws.map.back() != gws.map[gws.map.size()-2]); #if 0 // If this element is not yet resolved, // then check that it really is the // expected number of steps to the left // of the corresponding element in the // root range 
assert_lt(gws.map.back(), sa.size()); if(sa.offs[gws.map.back()] == (index_t)OFF_MASK) { assert_eq(newtop + gws.map.size() - 1, gfm.walkLeft(sa.topf + gws.map.back(), step+1)); } #endif } } assert_lt(new_node_top, new_node_bot); if(new_node_bot - new_node_top < gws.map.size()) { assert_eq(curbot - curtop, cur_node_bot - cur_node_top); SideLocus tmptloc, tmpbloc; pair tmp_node_range; index_t j1 = 0, j2 = 0; for(index_t c = 0; c < gws.masks[i].size(); c++) { if(gws.masks[i][c]) { j1 = c; break; } } for(index_t j = 0; j + 1 < gws.map.size(); j++) { for(index_t c = j1 + 1; c < gws.masks[i].size(); c++) { if(gws.masks[i][c]) { j2 = c; break; } } assert_lt(j1, j2); SideLocus::initFromTopBot(curtop + j1, curtop + j2 + 1, gfm.gh(), gfm.gfm(), tmptloc, tmpbloc); gfm.mapGLF(tmptloc, tmpbloc, i, &tmp_node_range); assert_gt(tmp_node_range.second - tmp_node_range.first, 0); if(tmp_node_range.second - tmp_node_range.first == 1) { index_t jmap = gws.map[j]; assert_lt(jmap, sa.offs.size()); sa.offs[jmap] = gws.map[j]; gws.map[j] = (index_t)OFF_MASK; } j1 = j2; j2 = 0; } for(index_t j = 0; j < gws.map.size();) { if(gws.map[j] == (index_t)OFF_MASK) { gws.map.erase(j); } else j++; } #ifndef NDEBUG for(index_t j = 0; j < gws.map.size(); j++) { assert_neq(gws.map[j], (index_t)OFF_MASK); } #endif } assert_eq(new_node_bot - new_node_top, (index_t)(gws.map.size())); } else { // For each beyond the first, create a new // GWState and add it to the GWState list. // NOTE: this can cause the underlying list to // be expanded which in turn might leave 'st' // pointing to bad memory. 
st.expand(); st.back().reset(); tmp_node_iedge_count.clear(); pair range, node_range; SideLocus::initFromTopBot(curtop, curbot, gfm.gh(), gfm.gfm(), curtloc, curbloc); range = gfm.mapGLF(curtloc, curbloc, i, &node_range, &tmp_node_iedge_count, cur_node_bot - cur_node_top); assert_geq(range.second - range.first, node_range.second - node_range.first); index_t ntop = range.first; index_t nbot = range.second; st.back().mapi_ = 0; st.back().map_.clear(); met.branches++; // Range narrowed so we have to look at the masks for(size_t j = 0; j < gws.masks[i].size(); j++) { if(gws.masks[i][j]) st.back().map_.push_back(map_[j+mapi_+(cur_node_top - node_top)]); } assert_lt(node_range.first, node_range.second); if(node_range.second - node_range.first < st.back().map_.size()) { assert_eq(curbot - curtop, cur_node_bot - cur_node_top); SideLocus tmptloc, tmpbloc; pair tmp_node_range; index_t j1 = 0, j2 = 0; for(index_t c = 0; c < gws.masks[i].size(); c++) { if(gws.masks[i][c]) { j1 = c; break; } } for(index_t j = 0; j + 1 < st.back().map_.size(); j++) { for(index_t c = j1 + 1; c < gws.masks[i].size(); c++) { if(gws.masks[i][c]) { j2 = c; break; } } assert_lt(j1, j2); SideLocus::initFromTopBot(curtop + j1, curtop + j2 + 1, gfm.gh(), gfm.gfm(), tmptloc, tmpbloc); gfm.mapGLF(tmptloc, tmpbloc, i, &tmp_node_range); assert_gt(tmp_node_range.second - tmp_node_range.first, 0); if(tmp_node_range.second - tmp_node_range.first == 1) { index_t jmap = st.back().map_[j]; assert_lt(jmap, sa.offs.size()); sa.offs[jmap] = st.back().map_[j]; st.back().map_[j] = (index_t)OFF_MASK; } j1 = j2; j2 = 0; } for(index_t j = 0; j < st.back().map_.size();) { if(st.back().map_[j] == (index_t)OFF_MASK) { st.back().map_.erase(j); } else j++; } #ifndef NDEBUG for(index_t j = 0; j < st.back().map_.size(); j++) { assert_neq(st.back().map_[j], (index_t)OFF_MASK); } #endif } assert_eq(node_range.second - node_range.first, st.back().map_.size()); pair rret = st.back().init( gfm, // forward Bowtie index ref, // 
bitpair-encodede reference sa, // SA range with offsets st, // EList of all GWStates associated with original range hit, // associated GWHit object (index_t)st.size()-1, // range offset reportList, // if true, report hits to 'res' list res, // report hits here if reportList is true ntop, // BW top of new range nbot, // BW bot of new range node_range.first, node_range.second, tmp_node_iedge_count, step+1, // # steps taken to get to this new range met); // update these metrics ret.first += rret.first; ret.second += rret.second; } ASSERT_ONLY(sum += in[i]); } } } mapi_ = 0; // assert_eq(new_node_bot-new_node_top, sum); assert_gt(newbot, newtop); assert(top != newtop || bot != newbot); //assert(!(newtop < top && newbot > top)); top = newtop; bot = newbot; node_top = new_node_top; node_bot = new_node_bot; node_iedge_count = backup_node_iedge_count; backup_node_iedge_count.clear(); if(!gws.map.empty()) { map_ = gws.map; } //assert(repOkMapRepeats()); //assert(repOkMapInclusive(hit, range)); assert_eq(node_bot-node_top, (index_t)map_.size()); } else { // Down to one element assert_eq(bot, top+1); assert_eq(1, map_.size()-mapi_); // Sets top, returns char walked through (which we ignore) ASSERT_ONLY(index_t oldtop = top); met.bwops++; prm.nExFmops++; pair node_range(0, 0); pair range = gfm.mapGLF1(top, tloc, &node_range); top = range.first; assert_neq(top, oldtop); bot = top+1; node_top = node_range.first; node_bot = node_range.second; if(mapi_ > 0) { map_[0] = map_[mapi_]; mapi_ = 0; } map_.resize(1); } assert(top != origTop || bot != origBot); step++; assert_gt(step, 0); assert_leq((index_t)step, gfm.gh().len()); pair rret = init( gfm, // forward GFM index ref, // bitpair-encodede reference sa, // SA range with offsets st, // EList of all GWStates associated with original range hit, // associated GWHit object range, // range offset reportList, // if true, report hits to 'res' list res, // report hits here if reportList is true met); // update these metrics ret.first += 
rret.first; ret.second += rret.second; return ret; } /** * Clear all state in preparation for the next walk. */ void reset() { top = bot = node_top = node_bot = step = mapi_ = 0; ASSERT_ONLY(lastStep_ = -1); ASSERT_ONLY(inited_ = false); tloc.invalidate(); bloc.invalidate(); map_.clear(); node_iedge_count.clear(); backup_node_iedge_count.clear(); tmp_node_iedge_count.clear(); } /** * Resize the map_ field to the given size. */ void initMap(size_t newsz) { mapi_ = 0; map_.resize(newsz); for(size_t i = 0; i < newsz; i++) { map_[i] = (index_t)i; } } /** * Return true iff all rows corresponding to this GWState have been * resolved and reported. */ bool doneReporting(const GWHit& hit) const { for(size_t i = mapi_; i < map_.size(); i++) { if(!hit.reported(map(i))) return false; } return true; } /** * Return true iff all rows corresponding to this GWState have been * resolved (but not necessarily reported). */ bool doneResolving(const SARangeWithOffs& sa) const { for(size_t i = mapi_; i < map_.size(); i++) { if(sa.offs[map((index_t)i)] == (index_t)OFF_MASK) return false; } return true; } SideLocus tloc; // SideLocus for top SideLocus bloc; // SideLocus for bottom index_t top; // top elt of range in GBWT index_t bot; // bot elt of range in GBWT index_t node_top; index_t node_bot; EList > node_iedge_count; int step; // how many steps have we walked to the left so far // temporary EList > backup_node_iedge_count; EList > tmp_node_iedge_count; EList tmp_zOffs; EList tmp_gbwt_to_node; protected: ASSERT_ONLY(bool inited_); ASSERT_ONLY(int lastStep_); EList map_; // which elts in range 'range' we're tracking index_t mapi_; // first untrimmed element of map }; template class GroupWalk2S { public: typedef EList, S> TStateV; GroupWalk2S() : st_(8, GW_CAT) { reset(); } /** * Reset the GroupWalk in preparation for the next SeedResults. */ void reset() { elt_ = rep_ = 0; ASSERT_ONLY(inited_ = false); } /** * Initialize a new group walk w/r/t a QVal object. 
*/ void init( const GFM& gfmFw, // forward Bowtie index for walking left const BitPairReference& ref, // bitpair-encoded reference SARangeWithOffs& sa, // SA range with offsets RandomSource& rnd, // pseudo-random generator for sampling rows WalkMetrics& met) // update metrics here { reset(); #ifndef NDEBUG inited_ = true; #endif // Init GWHit hit_.init(sa, 0, false, 0); // Init corresponding GWState st_.resize(1); st_.back().reset(); assert(st_.back().repOkBasic()); index_t top = sa.topf; index_t bot = sa.botf; index_t node_top = sa.node_top; index_t node_bot = (index_t)(node_top + sa.size()); st_.back().initMap(sa.size()); st_.ensure(4); st_.back().init( gfmFw, // Bowtie index ref, // bitpair-encoded reference sa, // SA range with offsets st_, // EList hit_, // GWHit 0, // range 0 false, // put resolved elements into res_? NULL, // put resolved elements here top, // GBW row at top bot, // GBW row at bot node_top, // node at top node_bot, // node at bot sa.node_iedge_count, 0, // # steps taken met); // update metrics here elt_ += sa.size(); assert(hit_.repOk(sa)); } // // ELEMENT-BASED // /** * Advance the GroupWalk until all elements have been resolved. */ void resolveAll(WalkMetrics& met, PerReadMetrics& prm) { WalkResult res; // ignore results for now for(size_t i = 0; i < elt_; i++) { advanceElement((index_t)i, res, met, prm); } } /** * Advance the GroupWalk until the specified element has been * resolved. */ bool advanceElement( index_t elt, // element within the range const GFM& gfmFw, // forward Bowtie index for walking left const BitPairReference& ref, // bitpair-encoded reference SARangeWithOffs& sa, // SA range with offsets GroupWalkState& gws, // GroupWalk state; scratch space WalkResult& res, // put the result here WalkMetrics& met, // metrics PerReadMetrics& prm) // per-read metrics { assert(inited_); assert(!done()); assert(hit_.repOk(sa)); assert_lt(elt, sa.size()); // elt must fall within range // Until we've resolved our element of interest... 
while(sa.offs[elt] == (index_t)OFF_MASK) { // Get the GWState that contains our element of interest size_t range = hit_.fmap[elt].first; assert_lt(range, st_.size()); st_.ensure(st_[range].node_bot - st_[range].node_top); // st_.ensure(4); GWState& st = st_[range]; assert(!st.doneResolving(sa)); // Returns a pair of numbers, the first being the number of // resolved but unreported offsets found during this advance, the // second being the number of as-yet-unresolved offsets. st.advance( gfmFw, ref, sa, hit_, (index_t)range, false, NULL, st_, gws, met, prm); assert(sa.offs[elt] != (index_t)OFF_MASK || !st_[hit_.fmap[elt].first].doneResolving(sa)); } assert_neq((index_t)OFF_MASK, sa.offs[elt]); // Report it! if(!hit_.reported(elt)) { hit_.setReported(elt); } met.reports++; res.init( 0, // seed offset false, // orientation 0, // range elt, // element sa.topf + elt, // bw row (index_t)sa.len, // length of hit sa.offs[elt]); // resolved text offset rep_++; return true; } /** * Return true iff all elements have been resolved and reported. */ bool done() const { return rep_ == elt_; } #ifndef NDEBUG /** * Check that GroupWalk is internally consistent. */ bool repOk(const SARangeWithOffs& sa) const { assert(hit_.repOk(sa)); assert_leq(rep_, elt_); // This is a lot of work size_t resolved = 0, reported = 0; // For each element const size_t sz = sa.size(); for(size_t m = 0; m < sz; m++) { // Is it resolved? if(sa.offs[m] != (index_t)OFF_MASK) { resolved++; } else { assert(!hit_.reported(m)); } // Is it reported? if(hit_.reported(m)) { reported++; } assert_geq(resolved, reported); } assert_geq(resolved, reported); assert_eq(rep_, reported); assert_eq(elt_, sz); return true; } #endif /** * Return the number of BW elements that we can resolve. */ index_t numElts() const { return elt_; } /** * Return the size occupied by this GroupWalk and all its constituent * objects. 
*/ size_t totalSizeBytes() const { return 2 * sizeof(size_t) + st_.totalSizeBytes() + sizeof(GWHit); } /** * Return the capacity of this GroupWalk and all its constituent objects. */ size_t totalCapacityBytes() const { return 2 * sizeof(size_t) + st_.totalCapacityBytes() + sizeof(GWHit); } #ifndef NDEBUG bool initialized() const { return inited_; } #endif protected: ASSERT_ONLY(bool inited_); // initialized? index_t elt_; // # BW elements under the control of the GropuWalk index_t rep_; // # BW elements reported // For each orientation and seed offset, keep a GWState object that // holds the state of the walk so far. TStateV st_; // For each orientation and seed offset, keep an EList of GWHit. GWHit hit_; }; #endif /*GROUP_WALK_H_*/