/* * Copyright 2011, Ben Langmead * * This file is part of Bowtie 2. * * Bowtie 2 is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Bowtie 2 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Bowtie 2. If not, see . */ #ifndef DIFF_SAMPLE_H_ #define DIFF_SAMPLE_H_ #include #include #include "assert_helpers.h" #include "multikey_qsort.h" #include "timer.h" #include "ds.h" #include "mem_ids.h" #include "ls.h" #include "btypes.h" using namespace std; #ifndef VMSG_NL #define VMSG_NL(...) \ if(this->verbose()) { \ stringstream tmp; \ tmp << __VA_ARGS__ << endl; \ this->verbose(tmp.str()); \ } #endif #ifndef VMSG #define VMSG(...) \ if(this->verbose()) { \ stringstream tmp; \ tmp << __VA_ARGS__; \ this->verbose(tmp.str()); \ } #endif /** * Routines for calculating, sanity-checking, and dispensing difference * cover samples to clients. */ /** * */ struct sampleEntry { uint32_t maxV; uint32_t numSamples; uint32_t samples[128]; }; /// Array of Colbourn and Ling calculated difference covers up to /// r = 16 (maxV = 5953) extern struct sampleEntry clDCs[16]; extern bool clDCs_calced; /// have clDCs been calculated? /** * Check that the given difference cover 'ds' actually covers all * differences for a periodicity of v. */ template static bool dcRepOk(T v, EList& ds) { // diffs[] records all the differences observed AutoArray covered(v, EBWT_CAT); for(T i = 1; i < v; i++) { covered[i] = false; } for(T di = T(); di < ds.size(); di++) { for(T dj = di+1; dj < ds.size(); dj++) { assert_lt(ds[di], ds[dj]); T d1 = (ds[dj] - ds[di]); T d2 = (ds[di] + v - ds[dj]); assert_lt(d1, v); assert_lt(d2, v); covered[d1] = true; covered[d2] = true; } } bool ok = true; for(T i = 1; i < v; i++) { if(covered[i] == false) { ok = false; break; } } return ok; } /** * Return true iff each element of ts (with length 'limit') is greater * than the last. */ template static bool increasing(T* ts, size_t limit) { for(size_t i = 0; i < limit-1; i++) { if(ts[i+1] <= ts[i]) return false; } return true; } /** * Return true iff the given difference cover covers difference 'diff' * mod 'v'. */ template static inline bool hasDifference(T *ds, T d, T v, T diff) { // diffs[] records all the differences observed for(T di = T(); di < d; di++) { for(T dj = di+1; dj < d; dj++) { assert_lt(ds[di], ds[dj]); T d1 = (ds[dj] - ds[di]); T d2 = (ds[di] + v - ds[dj]); assert_lt(d1, v); assert_lt(d2, v); if(d1 == diff || d2 == diff) return true; } } return false; } /** * Exhaustively calculate optimal difference cover samples for v = 4, * 8, 16, 32, 64, 128, 256 and store results in p2DCs[] */ template void calcExhaustiveDC(T i, bool verbose = false, bool sanityCheck = false) { T v = i; AutoArray diffs(v, EBWT_CAT); // v is the target period T ld = (T)ceil(sqrt(v)); // ud is the upper bound on |D| T ud = v / 2; // for all possible |D|s bool ok = true; T *ds = NULL; T d; for(d = ld; d <= ud+1; d++) { // for all possible |D| samples AutoArray ds(d, EBWT_CAT); for(T j = 0; j < d; j++) { ds[j] = j; } assert(increasing(ds, d)); while(true) { // reset diffs[] for(T t = 1; t < v; t++) { diffs[t] = false; } T diffCnt = 0; // diffs[] records all the differences observed for(T di = 0; di < d; di++) { for(T dj = di+1; dj < d; dj++) { assert_lt(ds[di], ds[dj]); T d1 = (ds[dj] - ds[di]); T d2 = (ds[di] + v - ds[dj]); assert_lt(d1, v); assert_lt(d2, v); assert_gt(d1, 0); assert_gt(d2, 0); if(!diffs[d1]) { diffCnt++; diffs[d1] = true; } if(!diffs[d2]) { diffCnt++; diffs[d2] = true; } } } // Do we observe all possible differences (except 0) ok = diffCnt == v-1; if(ok) { // Yes, all differences are covered break; } else { // Advance ds // (Following is commented out because it turns out // it's slow) // Find a missing difference //uint32_t missing = 0xffffffff; //for(uint32_t t = 1; t < v; t++) { // if(diffs[t] == false) { // missing = diffs[t]; // break; // } //} //assert_neq(missing, 0xffffffff); assert(increasing(ds, d)); bool advanced = false; bool keepGoing = false; do { keepGoing = false; for(T bd = d-1; bd > 1; bd--) { T dif = (d-1)-bd; if(ds[bd] < v-1-dif) { ds[bd]++; assert_neq(0, ds[bd]); // Reset subsequent ones for(T bdi = bd+1; bdi < d; bdi++) { assert_eq(0, ds[bdi]); ds[bdi] = ds[bdi-1]+1; assert_gt(ds[bdi], ds[bdi-1]); } assert(increasing(ds, d)); // (Following is commented out because // it turns out it's slow) // See if the new DC has the missing value //if(!hasDifference(ds, d, v, missing)) { // keepGoing = true; // break; //} advanced = true; break; } else { ds[bd] = 0; // keep going } } } while(keepGoing); // No solution for this |D| if(!advanced) break; assert(increasing(ds, d)); } } // next sample assignment if(ok) { break; } } // next |D| assert(ok); cout << "Did exhaustive v=" << v << " |D|=" << d << endl; cout << " "; for(T i = 0; i < d; i++) { cout << ds[i]; if(i < d-1) cout << ","; } cout << endl; } /** * Routune for calculating the elements of clDCs up to r = 16 using the * technique of Colbourn and Ling. * * See http://citeseer.ist.psu.edu/211575.html */ template void calcColbournAndLingDCs(bool verbose = false, bool sanityCheck = false) { for(T r = 0; r < 16; r++) { T maxv = 24*r*r + 36*r + 13; // Corollary 2.3 T numsamp = 6*r + 4; clDCs[r].maxV = maxv; clDCs[r].numSamples = numsamp; memset(clDCs[r].samples, 0, 4 * 128); T i; // clDCs[r].samples[0] = 0; // Fill in the 1^r part of the B series for(i = 1; i < r+1; i++) { clDCs[r].samples[i] = clDCs[r].samples[i-1] + 1; } // Fill in the (r + 1)^1 part clDCs[r].samples[r+1] = clDCs[r].samples[r] + r + 1; // Fill in the (2r + 1)^r part for(i = r+2; i < r+2+r; i++) { clDCs[r].samples[i] = clDCs[r].samples[i-1] + 2*r + 1; } // Fill in the (4r + 3)^(2r + 1) part for(i = r+2+r; i < r+2+r+2*r+1; i++) { clDCs[r].samples[i] = clDCs[r].samples[i-1] + 4*r + 3; } // Fill in the (2r + 2)^(r + 1) part for(i = r+2+r+2*r+1; i < r+2+r+2*r+1+r+1; i++) { clDCs[r].samples[i] = clDCs[r].samples[i-1] + 2*r + 2; } // Fill in the last 1^r part for(i = r+2+r+2*r+1+r+1; i < r+2+r+2*r+1+r+1+r; i++) { clDCs[r].samples[i] = clDCs[r].samples[i-1] + 1; } assert_eq(i, numsamp); assert_lt(i, 128); if(sanityCheck) { // diffs[] records all the differences observed AutoArray diffs(maxv, EBWT_CAT); for(T i = 0; i < numsamp; i++) { for(T j = i+1; j < numsamp; j++) { T d1 = (clDCs[r].samples[j] - clDCs[r].samples[i]); T d2 = (clDCs[r].samples[i] + maxv - clDCs[r].samples[j]); assert_lt(d1, maxv); assert_lt(d2, maxv); diffs[d1] = true; diffs[d2] = true; } } // Should have observed all possible differences (except 0) for(T i = 1; i < maxv; i++) { if(diffs[i] == false) cout << r << ", " << i << endl; assert(diffs[i] == true); } } } clDCs_calced = true; } /** * A precalculated list of difference covers. */ extern uint32_t dc0to64[65][10]; /** * Get a difference cover for the requested periodicity v. */ template static EList getDiffCover( T v, bool verbose = false, bool sanityCheck = false) { assert_gt(v, 2); EList ret; ret.clear(); // Can we look it up in our hardcoded array? if(v <= 64 && dc0to64[v][0] == 0xffffffff) { if(verbose) cout << "v in hardcoded area, but hardcoded entry was all-fs" << endl; return ret; } else if(v <= 64) { ret.push_back(0); for(size_t i = 0; i < 10; i++) { if(dc0to64[v][i] == 0) break; ret.push_back(dc0to64[v][i]); } if(sanityCheck) assert(dcRepOk(v, ret)); return ret; } // Can we look it up in our calcColbournAndLingDCs array? if(!clDCs_calced) { calcColbournAndLingDCs(verbose, sanityCheck); assert(clDCs_calced); } for(size_t i = 0; i < 16; i++) { if(v <= clDCs[i].maxV) { for(size_t j = 0; j < clDCs[i].numSamples; j++) { T s = clDCs[i].samples[j]; if(s >= v) { s %= v; for(size_t k = 0; k < ret.size(); k++) { if(s == ret[k]) break; if(s < ret[k]) { ret.insert(s, k); break; } } } else { ret.push_back(s % v); } } if(sanityCheck) assert(dcRepOk(v, ret)); return ret; } } cerr << "Error: Could not find a difference cover sample for v=" << v << endl; throw 1; } /** * Calculate and return a delta map based on the given difference cover * and periodicity v. */ template static EList getDeltaMap(T v, const EList& dc) { // Declare anchor-map-related items EList amap; size_t amapEnts = 1; amap.resizeExact((size_t)v); amap.fill(0xffffffff); amap[0] = 0; // Print out difference cover (and optionally calculate // anchor map) for(size_t i = 0; i < dc.size(); i++) { for(size_t j = i+1; j < dc.size(); j++) { assert_gt(dc[j], dc[i]); T diffLeft = dc[j] - dc[i]; T diffRight = dc[i] + v - dc[j]; assert_lt(diffLeft, v); assert_lt(diffRight, v); if(amap[diffLeft] == 0xffffffff) { amap[diffLeft] = dc[i]; amapEnts++; } if(amap[diffRight] == 0xffffffff) { amap[diffRight] = dc[j]; amapEnts++; } } } return amap; } /** * Return population count (count of all bits set to 1) of i. */ template static unsigned int popCount(T i) { unsigned int cnt = 0; for(size_t j = 0; j < sizeof(T)*8; j++) { if(i & 1) cnt++; i >>= 1; } return cnt; } /** * Calculate log-base-2 of i */ template static unsigned int myLog2(T i) { assert_eq(1, popCount(i)); // must be power of 2 for(size_t j = 0; j < sizeof(T)*8; j++) { if(i & 1) return (int)j; i >>= 1; } assert(false); return 0xffffffff; } /** * */ template class DifferenceCoverSample { public: DifferenceCoverSample(const TStr& __text, uint32_t __v, bool __verbose = false, bool __sanity = false, ostream& __logger = cout) : _text(__text), _v(__v), _verbose(__verbose), _sanity(__sanity), _ds(getDiffCover(_v, _verbose, _sanity)), _dmap(getDeltaMap(_v, _ds)), _d((uint32_t)_ds.size()), _doffs(), _isaPrime(), _dInv(), _log2v(myLog2(_v)), _vmask(OFF_MASK << _log2v), _logger(__logger) { assert_gt(_d, 0); assert_eq(1, popCount(_v)); // must be power of 2 // Build map from d's to idx's _dInv.resizeExact((size_t)v()); _dInv.fill(0xffffffff); uint32_t lim = (uint32_t)_ds.size(); for(uint32_t i = 0; i < lim; i++) { _dInv[_ds[i]] = i; } } /** * Allocate an amount of memory that simulates the peak memory * usage of the DifferenceCoverSample with the given text and v. * Throws bad_alloc if it's not going to fit in memory. Returns * the approximate number of bytes the Cover takes at all times. */ static size_t simulateAllocs(const TStr& text, uint32_t v) { EList ds(getDiffCover(v, false /*verbose*/, false /*sanity*/)); size_t len = text.length(); size_t sPrimeSz = (len / v) * ds.size(); // sPrime, sPrimeOrder, _isaPrime all exist in memory at // once and that's the peak AutoArray aa(sPrimeSz * 3 + (1024 * 1024 /*out of caution*/), EBWT_CAT); return sPrimeSz * 4; // sPrime array } uint32_t v() const { return _v; } uint32_t log2v() const { return _log2v; } uint32_t vmask() const { return _vmask; } uint32_t modv(TIndexOffU i) const { return (uint32_t)(i & ~_vmask); } TIndexOffU divv(TIndexOffU i) const { return i >> _log2v; } uint32_t d() const { return _d; } bool verbose() const { return _verbose; } bool sanityCheck() const { return _sanity; } const TStr& text() const { return _text; } const EList& ds() const { return _ds; } const EList& dmap() const { return _dmap; } ostream& log() const { return _logger; } void build(int nthreads); uint32_t tieBreakOff(TIndexOffU i, TIndexOffU j) const; int64_t breakTie(TIndexOffU i, TIndexOffU j) const; bool isCovered(TIndexOffU i) const; TIndexOffU rank(TIndexOffU i) const; /** * Print out the suffix array such that every sample offset has its * rank filled in and every non-sample offset is shown as '-'. */ void print(ostream& out) { for(size_t i = 0; i < _text.length(); i++) { if(isCovered(i)) { out << rank(i); } else { out << "-"; } if(i < _text.length()-1) { out << ","; } } out << endl; } private: void doBuiltSanityCheck() const; void buildSPrime(EList& sPrime, size_t padding); bool built() const { return _isaPrime.size() > 0; } void verbose(const string& s) const { if(this->verbose()) { this->log() << s.c_str(); this->log().flush(); } } const TStr& _text; // text to sample uint32_t _v; // periodicity of sample bool _verbose; // bool _sanity; // EList _ds; // samples: idx -> d EList _dmap; // delta map uint32_t _d; // |D| - size of sample EList _doffs; // offsets into sPrime/isaPrime for each d idx EList _isaPrime; // ISA' array EList _dInv; // Map from d -> idx uint32_t _log2v; TIndexOffU _vmask; ostream& _logger; }; /** * Sanity-check the difference cover by first inverting _isaPrime then * checking that each successive suffix really is less than the next. */ template void DifferenceCoverSample::doBuiltSanityCheck() const { uint32_t v = this->v(); assert(built()); VMSG_NL(" Doing sanity check"); TIndexOffU added = 0; EList sorted; sorted.resizeExact(_isaPrime.size()); sorted.fill(OFF_MASK); for(size_t di = 0; di < this->d(); di++) { uint32_t d = _ds[di]; size_t i = 0; for(size_t doi = _doffs[di]; doi < _doffs[di+1]; doi++, i++) { assert_eq(OFF_MASK, sorted[_isaPrime[doi]]); // Maps the offset of the suffix to its rank sorted[_isaPrime[doi]] = (TIndexOffU)(v*i + d); added++; } } assert_eq(added, _isaPrime.size()); #ifndef NDEBUG for(size_t i = 0; i < sorted.size()-1; i++) { assert(sstr_suf_lt(this->text(), sorted[i], this->text(), sorted[i+1], false)); } #endif } /** * Build the s' array by sampling suffixes (suffix offsets, actually) * from t according to the difference-cover sample and pack them into * an array of machine words in the order dictated by the "mu" mapping * described in Burkhardt. * * Also builds _doffs map. */ template void DifferenceCoverSample::buildSPrime( EList& sPrime, size_t padding) { const TStr& t = this->text(); const EList& ds = this->ds(); TIndexOffU tlen = (TIndexOffU)t.length(); uint32_t v = this->v(); uint32_t d = this->d(); assert_gt(v, 2); assert_lt(d, v); // Record where each d section should begin in sPrime TIndexOffU tlenDivV = this->divv(tlen); uint32_t tlenModV = this->modv(tlen); TIndexOffU sPrimeSz = 0; assert(_doffs.empty()); _doffs.resizeExact((size_t)d+1); for(uint32_t di = 0; di < d; di++) { // mu mapping TIndexOffU sz = tlenDivV + ((ds[di] <= tlenModV) ? 1 : 0); assert_geq(sz, 0); _doffs[di] = sPrimeSz; sPrimeSz += sz; } _doffs[d] = sPrimeSz; #ifndef NDEBUG if(tlenDivV > 0) { for(size_t i = 0; i < d; i++) { assert_gt(_doffs[i+1], _doffs[i]); TIndexOffU diff = _doffs[i+1] - _doffs[i]; assert(diff == tlenDivV || diff == tlenDivV+1); } } #endif assert_eq(_doffs.size(), d+1); // Size sPrime appropriately sPrime.resizeExact((size_t)sPrimeSz + padding); sPrime.fill(OFF_MASK); // Slot suffixes from text into sPrime according to the mu // mapping; where the mapping would leave a blank, insert a 0 TIndexOffU added = 0; TIndexOffU i = 0; for(uint64_t ti = 0; ti <= tlen; ti += v) { for(uint32_t di = 0; di < d; di++) { TIndexOffU tti = (TIndexOffU)ti + (TIndexOffU)ds[di]; if(tti > tlen) break; TIndexOffU spi = _doffs[di] + i; assert_lt(spi, _doffs[di+1]); assert_leq(tti, tlen); assert_lt(spi, sPrimeSz); assert_eq(OFF_MASK, sPrime[spi]); sPrime[spi] = tti; added++; } i++; } assert_eq(added, sPrimeSz); } /** * Return true iff suffixes with offsets suf1 and suf2 out of host * string 'host' are identical up to depth 'v'. */ template static inline bool suffixSameUpTo( const TStr& host, TIndexOffU suf1, TIndexOffU suf2, TIndexOffU v) { for(TIndexOffU i = 0; i < v; i++) { bool endSuf1 = suf1+i >= host.length(); bool endSuf2 = suf2+i >= host.length(); if((endSuf1 && !endSuf2) || (!endSuf1 && endSuf2)) return false; if(endSuf1 && endSuf2) return true; if(host[suf1+i] != host[suf2+i]) return false; } return true; } template struct VSortingParam { DifferenceCoverSample* dcs; TIndexOffU* sPrimeArr; size_t sPrimeSz; TIndexOffU* sPrimeOrderArr; size_t depth; const EList* boundaries; size_t* cur; MUTEX_T* mutex; }; template static void VSorting_worker(void *vp) { VSortingParam* param = (VSortingParam*)vp; DifferenceCoverSample* dcs = param->dcs; const TStr& host = dcs->text(); const size_t hlen = host.length(); uint32_t v = dcs->v(); while(true) { size_t cur = 0; { ThreadSafe ts(param->mutex, true); cur = *(param->cur); (*param->cur)++; } if(cur >= param->boundaries->size()) return; size_t begin = (cur == 0 ? 0 : (*param->boundaries)[cur-1]); size_t end = (*param->boundaries)[cur]; assert_leq(begin, end); if(end - begin <= 1) continue; mkeyQSortSuf2( host, hlen, param->sPrimeArr, param->sPrimeSz, param->sPrimeOrderArr, 4, begin, end, param->depth, v); } } /** * Calculates a ranking of all suffixes in the sample and stores them, * packed according to the mu mapping, in _isaPrime. */ template void DifferenceCoverSample::build(int nthreads) { // Local names for relevant types VMSG_NL("Building DifferenceCoverSample"); // Local names for relevant data const TStr& t = this->text(); uint32_t v = this->v(); assert_gt(v, 2); // Build s' EList sPrime; // Need to allocate 2 extra elements at the end of the sPrime and _isaPrime // arrays. One element that's less than all others, and another that acts // as needed padding for the Larsson-Sadakane sorting code. size_t padding = 1; VMSG_NL(" Building sPrime"); buildSPrime(sPrime, padding); size_t sPrimeSz = sPrime.size() - padding; assert_gt(sPrime.size(), padding); assert_leq(sPrime.size(), t.length() + padding + 1); TIndexOffU nextRank = 0; { VMSG_NL(" Building sPrimeOrder"); EList sPrimeOrder; sPrimeOrder.resizeExact(sPrimeSz); for(TIndexOffU i = 0; i < sPrimeSz; i++) { sPrimeOrder[i] = i; } // sPrime now holds suffix-offsets for DC samples. { Timer timer(cout, " V-Sorting samples time: ", this->verbose()); VMSG_NL(" V-Sorting samples"); // Extract backing-store array from sPrime and sPrimeOrder; // the mkeyQSortSuf2 routine works on the array for maximum // efficiency TIndexOffU *sPrimeArr = (TIndexOffU*)sPrime.ptr(); assert_eq(sPrimeArr[0], sPrime[0]); assert_eq(sPrimeArr[sPrimeSz-1], sPrime[sPrimeSz-1]); TIndexOffU *sPrimeOrderArr = (TIndexOffU*)sPrimeOrder.ptr(); assert_eq(sPrimeOrderArr[0], sPrimeOrder[0]); assert_eq(sPrimeOrderArr[sPrimeSz-1], sPrimeOrder[sPrimeSz-1]); // Sort sample suffixes up to the vth character using a // multikey quicksort. Sort time is proportional to the // number of samples times v. It isn't quadratic. // sPrimeOrder is passed in as a swapping partner for // sPrimeArr, i.e., every time the multikey qsort swaps // elements in sPrime, it swaps the same elements in // sPrimeOrder too. This allows us to easily reconstruct // what the sort did. if(nthreads == 1) { mkeyQSortSuf2(t, sPrimeArr, sPrimeSz, sPrimeOrderArr, 4, this->verbose(), this->sanityCheck(), v); } else { int query_depth = 0; int tmp_nthreads = nthreads; while(tmp_nthreads > 0) { query_depth++; tmp_nthreads >>= 1; } EList boundaries; // bucket boundaries for parallelization TIndexOffU *sOrig = NULL; if(this->sanityCheck()) { sOrig = new TIndexOffU[sPrimeSz]; memcpy(sOrig, sPrimeArr, OFF_SIZE * sPrimeSz); } mkeyQSortSuf2(t, sPrimeArr, sPrimeSz, sPrimeOrderArr, 4, this->verbose(), false, query_depth, &boundaries); if(boundaries.size() > 0) { AutoArray threads(nthreads); EList > tparams; size_t cur = 0; MUTEX_T mutex; for(int tid = 0; tid < nthreads; tid++) { // Calculate bucket sizes by doing a binary search for each // suffix and noting where it lands tparams.expand(); tparams.back().dcs = this; tparams.back().sPrimeArr = sPrimeArr; tparams.back().sPrimeSz = sPrimeSz; tparams.back().sPrimeOrderArr = sPrimeOrderArr; tparams.back().depth = query_depth; tparams.back().boundaries = &boundaries; tparams.back().cur = &cur; tparams.back().mutex = &mutex; threads[tid] = new tthread::thread(VSorting_worker, (void*)&tparams.back()); } for (int tid = 0; tid < nthreads; tid++) { threads[tid]->join(); } } if(this->sanityCheck()) { sanityCheckOrderedSufs(t, t.length(), sPrimeArr, sPrimeSz, v); for(size_t i = 0; i < sPrimeSz; i++) { assert_eq(sPrimeArr[i], sOrig[sPrimeOrderArr[i]]); } delete[] sOrig; } } // Make sure sPrime and sPrimeOrder are consistent with // their respective backing-store arrays assert_eq(sPrimeArr[0], sPrime[0]); assert_eq(sPrimeArr[sPrimeSz-1], sPrime[sPrimeSz-1]); assert_eq(sPrimeOrderArr[0], sPrimeOrder[0]); assert_eq(sPrimeOrderArr[sPrimeSz-1], sPrimeOrder[sPrimeSz-1]); } // Now assign the ranking implied by the sorted sPrime/sPrimeOrder // arrays back into sPrime. VMSG_NL(" Allocating rank array"); _isaPrime.resizeExact(sPrime.size()); ASSERT_ONLY(_isaPrime.fill(OFF_MASK)); assert_gt(_isaPrime.size(), 0); { Timer timer(cout, " Ranking v-sort output time: ", this->verbose()); VMSG_NL(" Ranking v-sort output"); for(size_t i = 0; i < sPrimeSz-1; i++) { // Place the appropriate ranking _isaPrime[sPrimeOrder[i]] = nextRank; // If sPrime[i] and sPrime[i+1] are identical up to v, then we // should give the next suffix the same rank if(!suffixSameUpTo(t, sPrime[i], sPrime[i+1], v)) nextRank++; } _isaPrime[sPrimeOrder[sPrimeSz-1]] = nextRank; // finish off #ifndef NDEBUG for(size_t i = 0; i < sPrimeSz; i++) { assert_neq(OFF_MASK, _isaPrime[i]); assert_lt(_isaPrime[i], sPrimeSz); } #endif } // sPrimeOrder is destroyed // All the information we need is now in _isaPrime } _isaPrime[_isaPrime.size()-1] = (TIndexOffU)sPrimeSz; sPrime[sPrime.size()-1] = (TIndexOffU)sPrimeSz; // _isaPrime[_isaPrime.size()-1] and sPrime[sPrime.size()-1] are just // spacer for the Larsson-Sadakane routine to use { Timer timer(cout, " Invoking Larsson-Sadakane on ranks time: ", this->verbose()); VMSG_NL(" Invoking Larsson-Sadakane on ranks"); if(sPrime.size() >= LS_SIZE) { cerr << "Error; sPrime array has so many elements that it can't be converted to a signed array without overflow." << endl; throw 1; } LarssonSadakane ls; ls.suffixsort( (TIndexOff*)_isaPrime.ptr(), (TIndexOff*)sPrime.ptr(), (TIndexOff)sPrimeSz, (TIndexOff)sPrime.size(), 0); } // chop off final character of _isaPrime _isaPrime.resizeExact(sPrimeSz); for(size_t i = 0; i < _isaPrime.size(); i++) { _isaPrime[i]--; } #ifndef NDEBUG for(size_t i = 0; i < sPrimeSz-1; i++) { assert_lt(_isaPrime[i], sPrimeSz); assert(i == 0 || _isaPrime[i] != _isaPrime[i-1]); } #endif VMSG_NL(" Sanity-checking and returning"); if(this->sanityCheck()) doBuiltSanityCheck(); } /** * Return true iff index i within the text is covered by the difference * cover sample. Allow i to be off the end of the text; simplifies * logic elsewhere. */ template bool DifferenceCoverSample::isCovered(TIndexOffU i) const { assert(built()); uint32_t modi = this->modv(i); assert_lt(modi, _dInv.size()); return _dInv[modi] != 0xffffffff; } /** * Given a text offset that's covered, return its lexicographical rank * among the sample suffixes. */ template TIndexOffU DifferenceCoverSample::rank(TIndexOffU i) const { assert(built()); assert_lt(i, this->text().length()); uint32_t imodv = this->modv(i); assert_neq(0xffffffff, _dInv[imodv]); // must be in the sample TIndexOffU ioff = this->divv(i); assert_lt(ioff, _doffs[_dInv[imodv]+1] - _doffs[_dInv[imodv]]); TIndexOffU isaIIdx = _doffs[_dInv[imodv]] + ioff; assert_lt(isaIIdx, _isaPrime.size()); TIndexOffU isaPrimeI = _isaPrime[isaIIdx]; assert_leq(isaPrimeI, _isaPrime.size()); return isaPrimeI; } /** * Return: < 0 if suffix i is lexicographically less than suffix j; > 0 * if suffix j is lexicographically greater. */ template int64_t DifferenceCoverSample::breakTie(TIndexOffU i, TIndexOffU j) const { assert(built()); assert_neq(i, j); assert_lt(i, this->text().length()); assert_lt(j, this->text().length()); uint32_t imodv = this->modv(i); uint32_t jmodv = this->modv(j); assert_neq(0xffffffff, _dInv[imodv]); // must be in the sample assert_neq(0xffffffff, _dInv[jmodv]); // must be in the sample uint32_t dimodv = _dInv[imodv]; uint32_t djmodv = _dInv[jmodv]; TIndexOffU ioff = this->divv(i); TIndexOffU joff = this->divv(j); assert_lt(dimodv+1, _doffs.size()); assert_lt(djmodv+1, _doffs.size()); // assert_lt: expected (32024) < (0) assert_lt(ioff, _doffs[dimodv+1] - _doffs[dimodv]); assert_lt(joff, _doffs[djmodv+1] - _doffs[djmodv]); TIndexOffU isaIIdx = _doffs[dimodv] + ioff; TIndexOffU isaJIdx = _doffs[djmodv] + joff; assert_lt(isaIIdx, _isaPrime.size()); assert_lt(isaJIdx, _isaPrime.size()); assert_neq(isaIIdx, isaJIdx); // ranks must be unique TIndexOffU isaPrimeI = _isaPrime[isaIIdx]; TIndexOffU isaPrimeJ = _isaPrime[isaJIdx]; assert_neq(isaPrimeI, isaPrimeJ); // ranks must be unique assert_leq(isaPrimeI, _isaPrime.size()); assert_leq(isaPrimeJ, _isaPrime.size()); return (int64_t)isaPrimeI - (int64_t)isaPrimeJ; } /** * Given i, j, return the number of additional characters that need to * be compared before the difference cover can break the tie. */ template uint32_t DifferenceCoverSample::tieBreakOff(TIndexOffU i, TIndexOffU j) const { const TStr& t = this->text(); const EList& dmap = this->dmap(); assert(built()); // It's actually convenient to allow this, but we're permitted to // return nonsense in that case if(t[i] != t[j]) return 0xffffffff; //assert_eq(t[i], t[j]); // if they're unequal, there's no tie to break uint32_t v = this->v(); assert_neq(i, j); assert_lt(i, t.length()); assert_lt(j, t.length()); uint32_t imod = this->modv(i); uint32_t jmod = this->modv(j); uint32_t diffLeft = (jmod >= imod)? (jmod - imod) : (jmod + v - imod); uint32_t diffRight = (imod >= jmod)? (imod - jmod) : (imod + v - jmod); assert_lt(diffLeft, dmap.size()); assert_lt(diffRight, dmap.size()); uint32_t destLeft = dmap[diffLeft]; // offset where i needs to be uint32_t destRight = dmap[diffRight]; // offset where i needs to be assert(isCovered(destLeft)); assert(isCovered(destLeft+diffLeft)); assert(isCovered(destRight)); assert(isCovered(destRight+diffRight)); assert_lt(destLeft, v); assert_lt(destRight, v); uint32_t deltaLeft = (destLeft >= imod)? (destLeft - imod) : (destLeft + v - imod); if(deltaLeft == v) deltaLeft = 0; uint32_t deltaRight = (destRight >= jmod)? (destRight - jmod) : (destRight + v - jmod); if(deltaRight == v) deltaRight = 0; assert_lt(deltaLeft, v); assert_lt(deltaRight, v); assert(isCovered(i+deltaLeft)); assert(isCovered(j+deltaLeft)); assert(isCovered(i+deltaRight)); assert(isCovered(j+deltaRight)); return min(deltaLeft, deltaRight); } #endif /*DIFF_SAMPLE_H_*/