1114 lines
40 KiB
C
1114 lines
40 KiB
C
|
/*
|
||
|
* Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
|
||
|
*
|
||
|
* This file is part of Bowtie 2.
|
||
|
*
|
||
|
* Bowtie 2 is free software: you can redistribute it and/or modify
|
||
|
* it under the terms of the GNU General Public License as published by
|
||
|
* the Free Software Foundation, either version 3 of the License, or
|
||
|
* (at your option) any later version.
|
||
|
*
|
||
|
* Bowtie 2 is distributed in the hope that it will be useful,
|
||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
|
* GNU General Public License for more details.
|
||
|
*
|
||
|
* You should have received a copy of the GNU General Public License
|
||
|
* along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
|
||
|
*/
|
||
|
|
||
|
#ifndef BLOCKWISE_SA_H_
|
||
|
#define BLOCKWISE_SA_H_
|
||
|
|
||
|
#include <stdint.h>
|
||
|
#include <stdlib.h>
|
||
|
#include <iostream>
|
||
|
#include <sstream>
|
||
|
#include <stdexcept>
|
||
|
#include "assert_helpers.h"
|
||
|
#include "diff_sample.h"
|
||
|
#include "multikey_qsort.h"
|
||
|
#include "random_source.h"
|
||
|
#include "binary_sa_search.h"
|
||
|
#include "zbox.h"
|
||
|
#include "alphabet.h"
|
||
|
#include "timer.h"
|
||
|
#include "ds.h"
|
||
|
#include "mem_ids.h"
|
||
|
#include "word_io.h"
|
||
|
|
||
|
using namespace std;
|
||
|
|
||
|
// Helpers for printing verbose messages
|
||
|
|
||
|
#ifndef VMSG_NL
|
||
|
#define VMSG_NL(...) \
|
||
|
if(this->verbose()) { \
|
||
|
stringstream tmp; \
|
||
|
tmp << __VA_ARGS__ << endl; \
|
||
|
this->verbose(tmp.str()); \
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
#ifndef VMSG
|
||
|
#define VMSG(...) \
|
||
|
if(this->verbose()) { \
|
||
|
stringstream tmp; \
|
||
|
tmp << __VA_ARGS__; \
|
||
|
this->verbose(tmp.str()); \
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
/**
|
||
|
* Abstract parent class for blockwise suffix-array building schemes.
|
||
|
*/
|
||
|
template<typename TStr>
|
||
|
class BlockwiseSA {
|
||
|
public:
|
||
|
BlockwiseSA(const TStr& __text,
|
||
|
TIndexOffU __bucketSz,
|
||
|
bool __sanityCheck = false,
|
||
|
bool __passMemExc = false,
|
||
|
bool __verbose = false,
|
||
|
ostream& __logger = cout) :
|
||
|
_text(__text),
|
||
|
_bucketSz(max<TIndexOffU>(__bucketSz, 2u)),
|
||
|
_sanityCheck(__sanityCheck),
|
||
|
_passMemExc(__passMemExc),
|
||
|
_verbose(__verbose),
|
||
|
_itrBucket(EBWTB_CAT),
|
||
|
_itrBucketPos(OFF_MASK),
|
||
|
_itrPushedBackSuffix(OFF_MASK),
|
||
|
_logger(__logger)
|
||
|
{ }
|
||
|
|
||
|
virtual ~BlockwiseSA() { }
|
||
|
|
||
|
/**
|
||
|
* Get the next suffix; compute the next bucket if necessary.
|
||
|
*/
|
||
|
virtual TIndexOffU nextSuffix() = 0;
|
||
|
|
||
|
/**
|
||
|
* Return true iff the next call to nextSuffix will succeed.
|
||
|
*/
|
||
|
bool hasMoreSuffixes() {
|
||
|
if(_itrPushedBackSuffix != OFF_MASK) return true;
|
||
|
try {
|
||
|
_itrPushedBackSuffix = nextSuffix();
|
||
|
} catch(out_of_range& e) {
|
||
|
assert_eq(OFF_MASK, _itrPushedBackSuffix);
|
||
|
return false;
|
||
|
}
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Reset the suffix iterator so that the next call to nextSuffix()
|
||
|
* returns the lexicographically-first suffix.
|
||
|
*/
|
||
|
void resetSuffixItr() {
|
||
|
_itrBucket.clear();
|
||
|
_itrBucketPos = OFF_MASK;
|
||
|
_itrPushedBackSuffix = OFF_MASK;
|
||
|
reset();
|
||
|
assert(suffixItrIsReset());
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Returns true iff the next call to nextSuffix() returns the
|
||
|
* lexicographically-first suffix.
|
||
|
*/
|
||
|
bool suffixItrIsReset() {
|
||
|
return _itrBucket.size() == 0 &&
|
||
|
_itrBucketPos == OFF_MASK &&
|
||
|
_itrPushedBackSuffix == OFF_MASK &&
|
||
|
isReset();
|
||
|
}
|
||
|
|
||
|
const TStr& text() const { return _text; }
|
||
|
TIndexOffU bucketSz() const { return _bucketSz; }
|
||
|
bool sanityCheck() const { return _sanityCheck; }
|
||
|
bool verbose() const { return _verbose; }
|
||
|
ostream& log() const { return _logger; }
|
||
|
size_t size() const { return _text.length()+1; }
|
||
|
|
||
|
protected:
|
||
|
/// Reset back to the first block
|
||
|
virtual void reset() = 0;
|
||
|
/// Return true iff reset to the first block
|
||
|
virtual bool isReset() = 0;
|
||
|
|
||
|
/**
|
||
|
* Grab the next block of sorted suffixes. The block is guaranteed
|
||
|
* to have at most _bucketSz elements.
|
||
|
*/
|
||
|
virtual void nextBlock(int cur_block, int tid = 0) = 0;
|
||
|
/// Return true iff more blocks are available
|
||
|
virtual bool hasMoreBlocks() const = 0;
|
||
|
/// Optionally output a verbose message
|
||
|
void verbose(const string& s) const {
|
||
|
if(this->verbose()) {
|
||
|
this->log() << s.c_str();
|
||
|
this->log().flush();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
const TStr& _text; /// original string
|
||
|
const TIndexOffU _bucketSz; /// target maximum bucket size
|
||
|
const bool _sanityCheck; /// whether to perform sanity checks
|
||
|
const bool _passMemExc; /// true -> pass on memory exceptions
|
||
|
const bool _verbose; /// be talkative
|
||
|
EList<TIndexOffU> _itrBucket; /// current bucket
|
||
|
TIndexOffU _itrBucketPos;/// offset into current bucket
|
||
|
TIndexOffU _itrPushedBackSuffix; /// temporary slot for lookahead
|
||
|
ostream& _logger; /// write log messages here
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* Abstract parent class for a blockwise suffix array builder that
|
||
|
* always doles out blocks in lexicographical order.
|
||
|
*/
|
||
|
template<typename TStr>
|
||
|
class InorderBlockwiseSA : public BlockwiseSA<TStr> {
|
||
|
public:
|
||
|
InorderBlockwiseSA(const TStr& __text,
|
||
|
TIndexOffU __bucketSz,
|
||
|
bool __sanityCheck = false,
|
||
|
bool __passMemExc = false,
|
||
|
bool __verbose = false,
|
||
|
ostream& __logger = cout) :
|
||
|
BlockwiseSA<TStr>(__text, __bucketSz, __sanityCheck, __passMemExc, __verbose, __logger)
|
||
|
{ }
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* Build the SA a block at a time according to the scheme outlined in
|
||
|
* Karkkainen's "Fast BWT" paper.
|
||
|
*/
|
||
|
template<typename TStr>
|
||
|
class KarkkainenBlockwiseSA : public InorderBlockwiseSA<TStr> {
|
||
|
public:
|
||
|
typedef DifferenceCoverSample<TStr> TDC;
|
||
|
|
||
|
KarkkainenBlockwiseSA(const TStr& __text,
|
||
|
TIndexOffU __bucketSz,
|
||
|
int __nthreads,
|
||
|
uint32_t __dcV,
|
||
|
uint32_t __seed = 0,
|
||
|
bool __sanityCheck = false,
|
||
|
bool __passMemExc = false,
|
||
|
bool __verbose = false,
|
||
|
string base_fname = "",
|
||
|
ostream& __logger = cout) :
|
||
|
InorderBlockwiseSA<TStr>(__text, __bucketSz, __sanityCheck, __passMemExc, __verbose, __logger),
|
||
|
_sampleSuffs(EBWTB_CAT), _nthreads(__nthreads), _itrBucketIdx(0), _cur(0), _dcV(__dcV), _dc(EBWTB_CAT), _built(false), _base_fname(base_fname), _bigEndian(currentlyBigEndian())
|
||
|
{ _randomSrc.init(__seed); reset(); }
|
||
|
|
||
|
~KarkkainenBlockwiseSA()
|
||
|
{
|
||
|
if(_threads.size() > 0) {
|
||
|
for (size_t tid = 0; tid < _threads.size(); tid++) {
|
||
|
_threads[tid]->join();
|
||
|
delete _threads[tid];
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
/**
|
||
|
* Allocate an amount of memory that simulates the peak memory
|
||
|
* usage of the DifferenceCoverSample with the given text and v.
|
||
|
* Throws bad_alloc if it's not going to fit in memory. Returns
|
||
|
* the approximate number of bytes the Cover takes at all times.
|
||
|
*/
|
||
|
static size_t simulateAllocs(const TStr& text, TIndexOffU bucketSz) {
|
||
|
size_t len = text.length();
|
||
|
// _sampleSuffs and _itrBucket are in memory at the peak
|
||
|
size_t bsz = bucketSz;
|
||
|
size_t sssz = len / max<TIndexOffU>(bucketSz-1, 1);
|
||
|
AutoArray<TIndexOffU> tmp(bsz + sssz + (1024 * 1024 /*out of caution*/), EBWT_CAT);
|
||
|
return bsz;
|
||
|
}
|
||
|
|
||
|
static void nextBlock_Worker(void *vp) {
|
||
|
pair<KarkkainenBlockwiseSA*, int> param = *(pair<KarkkainenBlockwiseSA*, int>*)vp;
|
||
|
KarkkainenBlockwiseSA* sa = param.first;
|
||
|
int tid = param.second;
|
||
|
while(true) {
|
||
|
size_t cur = 0;
|
||
|
{
|
||
|
ThreadSafe ts(&sa->_mutex, sa->_nthreads > 1);
|
||
|
cur = sa->_cur;
|
||
|
if(cur > sa->_sampleSuffs.size()) break;
|
||
|
sa->_cur++;
|
||
|
}
|
||
|
sa->nextBlock((int)cur, tid);
|
||
|
// Write suffixes into a file
|
||
|
std::ostringstream number; number << cur;
|
||
|
const string fname = sa->_base_fname + "." + number.str() + ".sa";
|
||
|
ofstream sa_file(fname.c_str(), ios::binary);
|
||
|
if(!sa_file.good()) {
|
||
|
cerr << "Could not open file for writing a bucket: \"" << fname << "\"" << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
const EList<TIndexOffU>& bucket = sa->_itrBuckets[tid];
|
||
|
writeIndex<TIndexOffU>(sa_file, (TIndexOffU)bucket.size(), sa->_bigEndian);
|
||
|
for(size_t i = 0; i < bucket.size(); i++) {
|
||
|
writeIndex<TIndexOffU>(sa_file, bucket[i], sa->_bigEndian);
|
||
|
}
|
||
|
sa_file.close();
|
||
|
sa->_itrBuckets[tid].clear();
|
||
|
sa->_done[cur] = true;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Get the next suffix; compute the next bucket if necessary.
|
||
|
*/
|
||
|
virtual TIndexOffU nextSuffix() {
|
||
|
// Launch threads if not
|
||
|
if(this->_nthreads > 1) {
|
||
|
if(_threads.size() == 0) {
|
||
|
_done.resize(_sampleSuffs.size() + 1);
|
||
|
_done.fill(false);
|
||
|
_itrBuckets.resize(this->_nthreads);
|
||
|
for(int tid = 0; tid < this->_nthreads; tid++) {
|
||
|
_tparams.expand();
|
||
|
_tparams.back().first = this;
|
||
|
_tparams.back().second = tid;
|
||
|
_threads.push_back(new tthread::thread(nextBlock_Worker, (void*)&_tparams.back()));
|
||
|
}
|
||
|
assert_eq(_threads.size(), (size_t)this->_nthreads);
|
||
|
}
|
||
|
}
|
||
|
if(this->_itrPushedBackSuffix != OFF_MASK) {
|
||
|
TIndexOffU tmp = this->_itrPushedBackSuffix;
|
||
|
this->_itrPushedBackSuffix = OFF_MASK;
|
||
|
return tmp;
|
||
|
}
|
||
|
while(this->_itrBucketPos >= this->_itrBucket.size() ||
|
||
|
this->_itrBucket.size() == 0)
|
||
|
{
|
||
|
if(!hasMoreBlocks()) {
|
||
|
throw out_of_range("No more suffixes");
|
||
|
}
|
||
|
if(this->_nthreads == 1) {
|
||
|
nextBlock((int)_cur);
|
||
|
_cur++;
|
||
|
} else {
|
||
|
while(!_done[this->_itrBucketIdx]) {
|
||
|
#if defined(_TTHREAD_WIN32_)
|
||
|
Sleep(1);
|
||
|
#elif defined(_TTHREAD_POSIX_)
|
||
|
const static timespec ts = {0, 1000000}; // 1 millisecond
|
||
|
nanosleep(&ts, NULL);
|
||
|
#endif
|
||
|
}
|
||
|
// Read suffixes from a file
|
||
|
std::ostringstream number; number << this->_itrBucketIdx;
|
||
|
const string fname = _base_fname + "." + number.str() + ".sa";
|
||
|
ifstream sa_file(fname.c_str(), ios::binary);
|
||
|
if(!sa_file.good()) {
|
||
|
cerr << "Could not open file for reading a bucket: \"" << fname << "\"" << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
size_t numSAs = readIndex<TIndexOffU>(sa_file, _bigEndian);
|
||
|
this->_itrBucket.resizeExact(numSAs);
|
||
|
for(size_t i = 0; i < numSAs; i++) {
|
||
|
this->_itrBucket[i] = readIndex<TIndexOffU>(sa_file, _bigEndian);
|
||
|
}
|
||
|
sa_file.close();
|
||
|
std::remove(fname.c_str());
|
||
|
}
|
||
|
this->_itrBucketIdx++;
|
||
|
this->_itrBucketPos = 0;
|
||
|
}
|
||
|
return this->_itrBucket[this->_itrBucketPos++];
|
||
|
}
|
||
|
|
||
|
/// Defined in blockwise_sa.cpp
|
||
|
virtual void nextBlock(int cur_block, int tid = 0);
|
||
|
|
||
|
/// Defined in blockwise_sa.cpp
|
||
|
virtual void qsort(EList<TIndexOffU>& bucket);
|
||
|
|
||
|
/// Return true iff more blocks are available
|
||
|
virtual bool hasMoreBlocks() const {
|
||
|
return this->_itrBucketIdx <= _sampleSuffs.size();
|
||
|
}
|
||
|
|
||
|
/// Return the difference-cover period
|
||
|
uint32_t dcV() const { return _dcV; }
|
||
|
|
||
|
protected:
|
||
|
|
||
|
/**
|
||
|
* Initialize the state of the blockwise suffix sort. If the
|
||
|
* difference cover sample and the sample set have not yet been
|
||
|
* built, build them. Then reset the block cursor to point to
|
||
|
* the first block.
|
||
|
*/
|
||
|
virtual void reset() {
|
||
|
if(!_built) {
|
||
|
build();
|
||
|
}
|
||
|
assert(_built);
|
||
|
_cur = 0;
|
||
|
}
|
||
|
|
||
|
/// Return true iff we're about to dole out the first bucket
|
||
|
virtual bool isReset() {
|
||
|
return _cur == 0;
|
||
|
}
|
||
|
|
||
|
private:
|
||
|
|
||
|
/**
|
||
|
* Calculate the difference-cover sample and sample suffixes.
|
||
|
*/
|
||
|
void build() {
|
||
|
// Calculate difference-cover sample
|
||
|
assert(_dc.get() == NULL);
|
||
|
if(_dcV != 0) {
|
||
|
_dc.init(new TDC(this->text(), _dcV, this->verbose(), this->sanityCheck()));
|
||
|
_dc.get()->build(this->_nthreads);
|
||
|
}
|
||
|
// Calculate sample suffixes
|
||
|
if(this->bucketSz() <= this->text().length()) {
|
||
|
VMSG_NL("Building samples");
|
||
|
buildSamples();
|
||
|
} else {
|
||
|
VMSG_NL("Skipping building samples since text length " <<
|
||
|
this->text().length() << " is less than bucket size: " <<
|
||
|
this->bucketSz());
|
||
|
}
|
||
|
_built = true;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Calculate the lcp between two suffixes using the difference
|
||
|
* cover as a tie-breaker. If the tie-breaker is employed, then
|
||
|
* the calculated lcp may be an underestimate.
|
||
|
*
|
||
|
* Defined in blockwise_sa.cpp
|
||
|
*/
|
||
|
inline bool tieBreakingLcp(TIndexOffU aOff,
|
||
|
TIndexOffU bOff,
|
||
|
TIndexOffU& lcp,
|
||
|
bool& lcpIsSoft);
|
||
|
|
||
|
/**
|
||
|
* Compare two suffixes using the difference-cover sample.
|
||
|
*/
|
||
|
inline bool suffixCmp(TIndexOffU cmp,
|
||
|
TIndexOffU i,
|
||
|
int64_t& j,
|
||
|
int64_t& k,
|
||
|
bool& kSoft,
|
||
|
const EList<TIndexOffU>& z);
|
||
|
|
||
|
void buildSamples();
|
||
|
|
||
|
EList<TIndexOffU> _sampleSuffs; /// sample suffixes
|
||
|
int _nthreads; /// # of threads
|
||
|
TIndexOffU _itrBucketIdx;
|
||
|
TIndexOffU _cur; /// offset to 1st elt of next block
|
||
|
const uint32_t _dcV; /// difference-cover periodicity
|
||
|
PtrWrap<TDC> _dc; /// queryable difference-cover data
|
||
|
bool _built; /// whether samples/DC have been built
|
||
|
RandomSource _randomSrc; /// source of pseudo-randoms
|
||
|
|
||
|
MUTEX_T _mutex; /// synchronization of output message
|
||
|
string _base_fname; /// base file name for storing SA blocks
|
||
|
bool _bigEndian; /// bigEndian?
|
||
|
EList<tthread::thread*> _threads; /// thread list
|
||
|
EList<pair<KarkkainenBlockwiseSA*, int> > _tparams;
|
||
|
ELList<TIndexOffU> _itrBuckets; /// buckets
|
||
|
EList<bool> _done; /// is a block processed?
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* Qsort the set of suffixes whose offsets are in 'bucket'.
|
||
|
*/
|
||
|
template<typename TStr>
|
||
|
inline void KarkkainenBlockwiseSA<TStr>::qsort(EList<TIndexOffU>& bucket) {
|
||
|
const TStr& t = this->text();
|
||
|
TIndexOffU *s = bucket.ptr();
|
||
|
size_t slen = bucket.size();
|
||
|
TIndexOffU len = (TIndexOffU)t.length();
|
||
|
if(_dc.get() != NULL) {
|
||
|
// Use the difference cover as a tie-breaker if we have it
|
||
|
VMSG_NL(" (Using difference cover)");
|
||
|
// Extract the 'host' array because it's faster to work
|
||
|
// with than the EList<> container
|
||
|
const uint8_t *host = (const uint8_t *)t.buf();
|
||
|
assert(_dc.get() != NULL);
|
||
|
mkeyQSortSufDcU8(t, host, len, s, slen, *_dc.get(), 4,
|
||
|
this->verbose(), this->sanityCheck());
|
||
|
} else {
|
||
|
VMSG_NL(" (Not using difference cover)");
|
||
|
// We don't have a difference cover - just do a normal
|
||
|
// suffix sort
|
||
|
mkeyQSortSuf(t, s, slen, 4,
|
||
|
this->verbose(), this->sanityCheck());
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Qsort the set of suffixes whose offsets are in 'bucket'. This
|
||
|
* specialization for packed strings does not attempt to extract and
|
||
|
* operate directly on the host string; the fact that the string is
|
||
|
* packed means that the array cannot be sorted directly.
|
||
|
*/
|
||
|
template<>
|
||
|
inline void KarkkainenBlockwiseSA<S2bDnaString>::qsort(
|
||
|
EList<TIndexOffU>& bucket)
|
||
|
{
|
||
|
const S2bDnaString& t = this->text();
|
||
|
TIndexOffU *s = bucket.ptr();
|
||
|
size_t slen = bucket.size();
|
||
|
size_t len = t.length();
|
||
|
if(_dc.get() != NULL) {
|
||
|
// Use the difference cover as a tie-breaker if we have it
|
||
|
VMSG_NL(" (Using difference cover)");
|
||
|
// Can't use the text's 'host' array because the backing
|
||
|
// store for the packed string is not one-char-per-elt.
|
||
|
mkeyQSortSufDcU8(t, t, len, s, slen, *_dc.get(), 4,
|
||
|
this->verbose(), this->sanityCheck());
|
||
|
} else {
|
||
|
VMSG_NL(" (Not using difference cover)");
|
||
|
// We don't have a difference cover - just do a normal
|
||
|
// suffix sort
|
||
|
mkeyQSortSuf(t, s, slen, 4,
|
||
|
this->verbose(), this->sanityCheck());
|
||
|
}
|
||
|
}
|
||
|
|
||
|
template<typename TStr>
|
||
|
struct BinarySortingParam {
|
||
|
const TStr* t;
|
||
|
const EList<TIndexOffU>* sampleSuffs;
|
||
|
EList<TIndexOffU> bucketSzs;
|
||
|
EList<TIndexOffU> bucketReps;
|
||
|
size_t begin;
|
||
|
size_t end;
|
||
|
};
|
||
|
|
||
|
template<typename TStr>
|
||
|
static void BinarySorting_worker(void *vp)
|
||
|
{
|
||
|
BinarySortingParam<TStr>* param = (BinarySortingParam<TStr>*)vp;
|
||
|
const TStr& t = *(param->t);
|
||
|
size_t len = t.length();
|
||
|
const EList<TIndexOffU>& sampleSuffs = *(param->sampleSuffs);
|
||
|
EList<TIndexOffU>& bucketSzs = param->bucketSzs;
|
||
|
EList<TIndexOffU>& bucketReps = param->bucketReps;
|
||
|
ASSERT_ONLY(size_t numBuckets = bucketSzs.size());
|
||
|
size_t begin = param->begin;
|
||
|
size_t end = param->end;
|
||
|
// Iterate through every suffix in the text, determine which
|
||
|
// bucket it falls into by doing a binary search across the
|
||
|
// sorted list of samples, and increment a counter associated
|
||
|
// with that bucket. Also, keep one representative for each
|
||
|
// bucket so that we can split it later. We loop in ten
|
||
|
// stretches so that we can print out a helpful progress
|
||
|
// message. (This step can take a long time.)
|
||
|
for(TIndexOffU i = (TIndexOffU)begin; i < end && i < len; i++) {
|
||
|
TIndexOffU r = binarySASearch(t, i, sampleSuffs);
|
||
|
if(r == std::numeric_limits<TIndexOffU>::max()) continue; // r was one of the samples
|
||
|
assert_lt(r, numBuckets);
|
||
|
bucketSzs[r]++;
|
||
|
assert_lt(bucketSzs[r], len);
|
||
|
if(bucketReps[r] == OFF_MASK || (i & 100) == 0) {
|
||
|
bucketReps[r] = i; // clobbers previous one, but that's OK
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Select a set of bucket-delineating sample suffixes such that no
|
||
|
* bucket is greater than the requested upper limit. Some care is
|
||
|
* taken to make each bucket's size close to the limit without
|
||
|
* going over.
|
||
|
*/
|
||
|
template<typename TStr>
|
||
|
void KarkkainenBlockwiseSA<TStr>::buildSamples() {
|
||
|
const TStr& t = this->text();
|
||
|
TIndexOffU bsz = this->bucketSz()-1; // subtract 1 to leave room for sample
|
||
|
size_t len = this->text().length();
|
||
|
// Prepare _sampleSuffs array
|
||
|
_sampleSuffs.clear();
|
||
|
TIndexOffU numSamples = (TIndexOffU)((len/bsz)+1)<<1; // ~len/bsz x 2
|
||
|
assert_gt(numSamples, 0);
|
||
|
VMSG_NL("Reserving space for " << numSamples << " sample suffixes");
|
||
|
if(this->_passMemExc) {
|
||
|
_sampleSuffs.resizeExact(numSamples);
|
||
|
// Randomly generate samples. Allow duplicates for now.
|
||
|
VMSG_NL("Generating random suffixes");
|
||
|
for(size_t i = 0; i < numSamples; i++) {
|
||
|
#ifdef BOWTIE_64BIT_INDEX
|
||
|
_sampleSuffs[i] = (TIndexOffU)(_randomSrc.nextU64() % len);
|
||
|
#else
|
||
|
_sampleSuffs[i] = (TIndexOffU)(_randomSrc.nextU32() % len);
|
||
|
#endif
|
||
|
}
|
||
|
} else {
|
||
|
try {
|
||
|
_sampleSuffs.resizeExact(numSamples);
|
||
|
// Randomly generate samples. Allow duplicates for now.
|
||
|
VMSG_NL("Generating random suffixes");
|
||
|
for(size_t i = 0; i < numSamples; i++) {
|
||
|
#ifdef BOWTIE_64BIT_INDEX
|
||
|
_sampleSuffs[i] = (TIndexOffU)(_randomSrc.nextU64() % len);
|
||
|
#else
|
||
|
_sampleSuffs[i] = (TIndexOffU)(_randomSrc.nextU32() % len);
|
||
|
#endif
|
||
|
}
|
||
|
} catch(bad_alloc &e) {
|
||
|
if(this->_passMemExc) {
|
||
|
throw e; // rethrow immediately
|
||
|
} else {
|
||
|
cerr << "Could not allocate sample suffix container of " << (numSamples * OFF_SIZE) << " bytes." << endl
|
||
|
<< "Please try using a smaller number of blocks by specifying a larger --bmax or" << endl
|
||
|
<< "a smaller --bmaxdivn" << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
// Remove duplicates; very important to do this before the call to
|
||
|
// mkeyQSortSuf so that it doesn't try to calculate lexicographical
|
||
|
// relationships between very long, identical strings, which takes
|
||
|
// an extremely long time in general, and causes the stack to grow
|
||
|
// linearly with the size of the input
|
||
|
{
|
||
|
Timer timer(cout, "QSorting sample offsets, eliminating duplicates time: ", this->verbose());
|
||
|
VMSG_NL("QSorting " << _sampleSuffs.size() << " sample offsets, eliminating duplicates");
|
||
|
_sampleSuffs.sort();
|
||
|
size_t sslen = _sampleSuffs.size();
|
||
|
for(size_t i = 0; i < sslen-1; i++) {
|
||
|
if(_sampleSuffs[i] == _sampleSuffs[i+1]) {
|
||
|
_sampleSuffs.erase(i--);
|
||
|
sslen--;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
// Multikey quicksort the samples
|
||
|
{
|
||
|
Timer timer(cout, " Multikey QSorting samples time: ", this->verbose());
|
||
|
VMSG_NL("Multikey QSorting " << _sampleSuffs.size() << " samples");
|
||
|
this->qsort(_sampleSuffs);
|
||
|
}
|
||
|
// Calculate bucket sizes
|
||
|
VMSG_NL("Calculating bucket sizes");
|
||
|
int limit = 5;
|
||
|
// Iterate until all buckets are less than
|
||
|
while(--limit >= 0) {
|
||
|
TIndexOffU numBuckets = (TIndexOffU)_sampleSuffs.size()+1;
|
||
|
AutoArray<tthread::thread*> threads(this->_nthreads);
|
||
|
EList<BinarySortingParam<TStr> > tparams;
|
||
|
for(int tid = 0; tid < this->_nthreads; tid++) {
|
||
|
// Calculate bucket sizes by doing a binary search for each
|
||
|
// suffix and noting where it lands
|
||
|
tparams.expand();
|
||
|
try {
|
||
|
// Allocate and initialize containers for holding bucket
|
||
|
// sizes and representatives.
|
||
|
tparams.back().bucketSzs.resizeExact(numBuckets);
|
||
|
tparams.back().bucketReps.resizeExact(numBuckets);
|
||
|
tparams.back().bucketSzs.fillZero();
|
||
|
tparams.back().bucketReps.fill(OFF_MASK);
|
||
|
} catch(bad_alloc &e) {
|
||
|
if(this->_passMemExc) {
|
||
|
throw e; // rethrow immediately
|
||
|
} else {
|
||
|
cerr << "Could not allocate sizes, representatives (" << ((numBuckets*8)>>10) << " KB) for blocks." << endl
|
||
|
<< "Please try using a smaller number of blocks by specifying a larger --bmax or a" << endl
|
||
|
<< "smaller --bmaxdivn." << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
}
|
||
|
tparams.back().t = &t;
|
||
|
tparams.back().sampleSuffs = &_sampleSuffs;
|
||
|
tparams.back().begin = (tid == 0 ? 0 : len / this->_nthreads * tid);
|
||
|
tparams.back().end = (tid + 1 == this->_nthreads ? len : len / this->_nthreads * (tid + 1));
|
||
|
if(this->_nthreads == 1) {
|
||
|
BinarySorting_worker<TStr>((void*)&tparams.back());
|
||
|
} else {
|
||
|
threads[tid] = new tthread::thread(BinarySorting_worker<TStr>, (void*)&tparams.back());
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if(this->_nthreads > 1) {
|
||
|
for (int tid = 0; tid < this->_nthreads; tid++) {
|
||
|
threads[tid]->join();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
EList<TIndexOffU>& bucketSzs = tparams[0].bucketSzs;
|
||
|
EList<TIndexOffU>& bucketReps = tparams[0].bucketReps;
|
||
|
for(int tid = 1; tid < this->_nthreads; tid++) {
|
||
|
for(size_t j = 0; j < numBuckets; j++) {
|
||
|
bucketSzs[j] += tparams[tid].bucketSzs[j];
|
||
|
if(bucketReps[j] == OFF_MASK) {
|
||
|
bucketReps[j] = tparams[tid].bucketReps[j];
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
// Check for large buckets and mergeable pairs of small buckets
|
||
|
// and split/merge as necessary
|
||
|
TIndexOff added = 0;
|
||
|
TIndexOff merged = 0;
|
||
|
assert_eq(bucketSzs.size(), numBuckets);
|
||
|
assert_eq(bucketReps.size(), numBuckets);
|
||
|
{
|
||
|
Timer timer(cout, " Splitting and merging time: ", this->verbose());
|
||
|
VMSG_NL("Splitting and merging");
|
||
|
for(TIndexOffU i = 0; i < numBuckets; i++) {
|
||
|
TIndexOffU mergedSz = bsz + 1;
|
||
|
assert(bucketSzs[(size_t)i] == 0 || bucketReps[(size_t)i] != OFF_MASK);
|
||
|
if(i < numBuckets-1) {
|
||
|
mergedSz = bucketSzs[(size_t)i] + bucketSzs[(size_t)i+1] + 1;
|
||
|
}
|
||
|
// Merge?
|
||
|
if(mergedSz <= bsz) {
|
||
|
bucketSzs[(size_t)i+1] += (bucketSzs[(size_t)i]+1);
|
||
|
// The following may look strange, but it's necessary
|
||
|
// to ensure that the merged bucket has a representative
|
||
|
bucketReps[(size_t)i+1] = _sampleSuffs[(size_t)i+added];
|
||
|
_sampleSuffs.erase((size_t)i+added);
|
||
|
bucketSzs.erase((size_t)i);
|
||
|
bucketReps.erase((size_t)i);
|
||
|
i--; // might go to -1 but ++ will overflow back to 0
|
||
|
numBuckets--;
|
||
|
merged++;
|
||
|
assert_eq(numBuckets, _sampleSuffs.size()+1-added);
|
||
|
assert_eq(numBuckets, bucketSzs.size());
|
||
|
}
|
||
|
// Split?
|
||
|
else if(bucketSzs[(size_t)i] > bsz) {
|
||
|
// Add an additional sample from the bucketReps[]
|
||
|
// set accumulated in the binarySASearch loop; this
|
||
|
// effectively splits the bucket
|
||
|
_sampleSuffs.insert(bucketReps[(size_t)i], (TIndexOffU)(i + (added++)));
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
if(added == 0) {
|
||
|
//if(this->verbose()) {
|
||
|
// cout << "Final bucket sizes:" << endl;
|
||
|
// cout << " (begin): " << bucketSzs[0] << " (" << (int)(bsz - bucketSzs[0]) << ")" << endl;
|
||
|
// for(uint32_t i = 1; i < numBuckets; i++) {
|
||
|
// cout << " " << bucketSzs[i] << " (" << (int)(bsz - bucketSzs[i]) << ")" << endl;
|
||
|
// }
|
||
|
//}
|
||
|
break;
|
||
|
}
|
||
|
// Otherwise, continue until no more buckets need to be
|
||
|
// split
|
||
|
VMSG_NL("Split " << added << ", merged " << merged << "; iterating...");
|
||
|
}
|
||
|
// Do *not* force a do-over
|
||
|
// if(limit == 0) {
|
||
|
// VMSG_NL("Iterated too many times; trying again...");
|
||
|
// buildSamples();
|
||
|
// }
|
||
|
VMSG_NL("Avg bucket size: " << ((double)(len-_sampleSuffs.size()) / (_sampleSuffs.size()+1)) << " (target: " << bsz << ")");
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Do a simple LCP calculation on two strings.
|
||
|
*/
|
||
|
template<typename T> inline
|
||
|
static TIndexOffU suffixLcp(const T& t, TIndexOffU aOff, TIndexOffU bOff) {
|
||
|
TIndexOffU c = 0;
|
||
|
size_t len = t.length();
|
||
|
assert_leq(aOff, len);
|
||
|
assert_leq(bOff, len);
|
||
|
while(aOff + c < len && bOff + c < len && t[aOff + c] == t[bOff + c]) c++;
|
||
|
return c;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Calculate the lcp between two suffixes using the difference
|
||
|
* cover as a tie-breaker. If the tie-breaker is employed, then
|
||
|
* the calculated lcp may be an underestimate. If the tie-breaker is
|
||
|
* employed, lcpIsSoft will be set to true (otherwise, false).
|
||
|
*/
|
||
|
template<typename TStr> inline
|
||
|
bool KarkkainenBlockwiseSA<TStr>::tieBreakingLcp(TIndexOffU aOff,
|
||
|
TIndexOffU bOff,
|
||
|
TIndexOffU& lcp,
|
||
|
bool& lcpIsSoft)
|
||
|
{
|
||
|
const TStr& t = this->text();
|
||
|
TIndexOffU c = 0;
|
||
|
TIndexOffU tlen = (TIndexOffU)t.length();
|
||
|
assert_leq(aOff, tlen);
|
||
|
assert_leq(bOff, tlen);
|
||
|
assert(_dc.get() != NULL);
|
||
|
uint32_t dcDist = _dc.get()->tieBreakOff(aOff, bOff);
|
||
|
lcpIsSoft = false; // hard until proven soft
|
||
|
while(c < dcDist && // we haven't hit the tie breaker
|
||
|
c < tlen-aOff && // we haven't fallen off of LHS suffix
|
||
|
c < tlen-bOff && // we haven't fallen off of RHS suffix
|
||
|
t[aOff+c] == t[bOff+c]) // we haven't hit a mismatch
|
||
|
c++;
|
||
|
lcp = c;
|
||
|
if(c == tlen-aOff) {
|
||
|
// Fell off LHS (a), a is greater
|
||
|
return false;
|
||
|
} else if(c == tlen-bOff) {
|
||
|
// Fell off RHS (b), b is greater
|
||
|
return true;
|
||
|
} else if(c == dcDist) {
|
||
|
// Hit a tie-breaker element
|
||
|
lcpIsSoft = true;
|
||
|
assert_neq(dcDist, 0xffffffff);
|
||
|
return _dc.get()->breakTie(aOff+c, bOff+c) < 0;
|
||
|
} else {
|
||
|
assert_neq(t[aOff+c], t[bOff+c]);
|
||
|
return t[aOff+c] < t[bOff+c];
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Lookup a suffix LCP in the given z array; if the element is not
|
||
|
* filled in then calculate it from scratch.
|
||
|
*/
|
||
|
template<typename T>
|
||
|
static TIndexOffU lookupSuffixZ(
|
||
|
const T& t,
|
||
|
TIndexOffU zOff,
|
||
|
TIndexOffU off,
|
||
|
const EList<TIndexOffU>& z)
|
||
|
{
|
||
|
if(zOff < z.size()) {
|
||
|
TIndexOffU ret = z[zOff];
|
||
|
assert_eq(ret, suffixLcp(t, off + zOff, off));
|
||
|
return ret;
|
||
|
}
|
||
|
assert_leq(off + zOff, t.length());
|
||
|
return suffixLcp(t, off + zOff, off);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* true -> i < cmp
|
||
|
* false -> i > cmp
|
||
|
*/
|
||
|
template<typename TStr> inline
|
||
|
bool KarkkainenBlockwiseSA<TStr>::suffixCmp(
|
||
|
TIndexOffU cmp,
|
||
|
TIndexOffU i,
|
||
|
int64_t& j,
|
||
|
int64_t& k,
|
||
|
bool& kSoft,
|
||
|
const EList<TIndexOffU>& z)
|
||
|
{
|
||
|
const TStr& t = this->text();
|
||
|
TIndexOffU len = (TIndexOffU)t.length();
|
||
|
// i is not covered by any previous match
|
||
|
TIndexOffU l;
|
||
|
if((int64_t)i > k) {
|
||
|
k = i; // so that i + lHi == kHi
|
||
|
l = 0; // erase any previous l
|
||
|
kSoft = false;
|
||
|
// To be extended
|
||
|
}
|
||
|
// i is covered by a previous match
|
||
|
else /* i <= k */ {
|
||
|
assert_gt((int64_t)i, j);
|
||
|
TIndexOffU zIdx = (TIndexOffU)(i-j);
|
||
|
assert_leq(zIdx, len-cmp);
|
||
|
if(zIdx < _dcV || _dc.get() == NULL) {
|
||
|
// Go as far as the Z-box says
|
||
|
l = lookupSuffixZ(t, zIdx, cmp, z);
|
||
|
if(i + l > len) {
|
||
|
l = len-i;
|
||
|
}
|
||
|
assert_leq(i + l, len);
|
||
|
// Possibly to be extended
|
||
|
} else {
|
||
|
// But we're past the point of no-more-Z-boxes
|
||
|
bool ret = tieBreakingLcp(i, cmp, l, kSoft);
|
||
|
// Sanity-check tie-breaker
|
||
|
if(this->sanityCheck()) {
|
||
|
if(ret) assert(sstr_suf_lt(t, i, t, cmp, false));
|
||
|
else assert(sstr_suf_gt(t, i, t, cmp, false));
|
||
|
}
|
||
|
j = i;
|
||
|
k = i + l;
|
||
|
if(this->sanityCheck()) {
|
||
|
if(kSoft) { assert_leq(l, suffixLcp(t, i, cmp)); }
|
||
|
else { assert_eq (l, suffixLcp(t, i, cmp)); }
|
||
|
}
|
||
|
return ret;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Z box extends exactly as far as previous match (or there
|
||
|
// is neither a Z box nor a previous match)
|
||
|
if((int64_t)(i + l) == k) {
|
||
|
// Extend
|
||
|
while(l < len-cmp && k < (int64_t)len && t[(size_t)(cmp+l)] == t[(size_t)k]) {
|
||
|
k++; l++;
|
||
|
}
|
||
|
j = i; // update furthest-extending LHS
|
||
|
kSoft = false;
|
||
|
assert_eq(l, suffixLcp(t, i, cmp));
|
||
|
}
|
||
|
// Z box extends further than previous match
|
||
|
else if((int64_t)(i + l) > k) {
|
||
|
l = (TIndexOffU)(k - i); // point to just after previous match
|
||
|
j = i; // update furthest-extending LHS
|
||
|
if(kSoft) {
|
||
|
while(l < len-cmp && k < (int64_t)len && t[(size_t)(cmp+l)] == t[(size_t)k]) {
|
||
|
k++; l++;
|
||
|
}
|
||
|
kSoft = false;
|
||
|
assert_eq(l, suffixLcp(t, i, cmp));
|
||
|
} else assert_eq(l, suffixLcp(t, i, cmp));
|
||
|
}
|
||
|
|
||
|
// Check that calculated lcp matches actual lcp
|
||
|
if(this->sanityCheck()) {
|
||
|
if(!kSoft) {
|
||
|
// l should exactly match lcp
|
||
|
assert_eq(l, suffixLcp(t, i, cmp));
|
||
|
} else {
|
||
|
// l is an underestimate of LCP
|
||
|
assert_leq(l, suffixLcp(t, i, cmp));
|
||
|
}
|
||
|
}
|
||
|
assert_leq(l+i, len);
|
||
|
assert_leq(l, len-cmp);
|
||
|
|
||
|
// i and cmp should not be the same suffix
|
||
|
assert(l != len-cmp || i+l != len);
|
||
|
|
||
|
// Now we're ready to do a comparison on the next char
|
||
|
if(l+i != len && (
|
||
|
l == len-cmp || // departure from paper algorithm:
|
||
|
// falling off pattern implies
|
||
|
// pattern is *greater* in our case
|
||
|
t[i + l] < t[cmp + l]))
|
||
|
{
|
||
|
// Case 2: Text suffix is less than upper sample suffix
|
||
|
#ifndef NDEBUG
|
||
|
if(this->sanityCheck()) {
|
||
|
assert(sstr_suf_lt(t, i, t, cmp, false));
|
||
|
}
|
||
|
#endif
|
||
|
return true; // suffix at i is less than suffix at cmp
|
||
|
}
|
||
|
else {
|
||
|
// Case 3: Text suffix is greater than upper sample suffix
|
||
|
#ifndef NDEBUG
|
||
|
if(this->sanityCheck()) {
|
||
|
assert(sstr_suf_gt(t, i, t, cmp, false));
|
||
|
}
|
||
|
#endif
|
||
|
return false; // suffix at i is less than suffix at cmp
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Retrieve the next block. This is the most performance-critical part
|
||
|
* of the blockwise suffix sorting process.
|
||
|
*/
|
||
|
template<typename TStr>
|
||
|
void KarkkainenBlockwiseSA<TStr>::nextBlock(int cur_block, int tid) {
|
||
|
#ifndef NDEBUG
|
||
|
if(this->_nthreads > 1) {
|
||
|
assert_lt(tid, this->_itrBuckets.size());
|
||
|
}
|
||
|
#endif
|
||
|
EList<TIndexOffU>& bucket = (this->_nthreads > 1 ? this->_itrBuckets[tid] : this->_itrBucket);
|
||
|
{
|
||
|
ThreadSafe ts(&_mutex, this->_nthreads > 1);
|
||
|
VMSG_NL("Getting block " << (cur_block+1) << " of " << _sampleSuffs.size()+1);
|
||
|
}
|
||
|
assert(_built);
|
||
|
assert_gt(_dcV, 3);
|
||
|
assert_leq(cur_block, _sampleSuffs.size());
|
||
|
const TStr& t = this->text();
|
||
|
TIndexOffU len = (TIndexOffU)t.length();
|
||
|
// Set up the bucket
|
||
|
bucket.clear();
|
||
|
TIndexOffU lo = OFF_MASK, hi = OFF_MASK;
|
||
|
if(_sampleSuffs.size() == 0) {
|
||
|
// Special case: if _sampleSuffs is 0, then multikey-quicksort
|
||
|
// everything
|
||
|
{
|
||
|
ThreadSafe ts(&_mutex, this->_nthreads > 1);
|
||
|
VMSG_NL(" No samples; assembling all-inclusive block");
|
||
|
}
|
||
|
assert_eq(0, cur_block);
|
||
|
try {
|
||
|
if(bucket.capacity() < this->bucketSz()) {
|
||
|
bucket.reserveExact(len+1);
|
||
|
}
|
||
|
bucket.resize(len);
|
||
|
for(TIndexOffU i = 0; i < len; i++) {
|
||
|
bucket[i] = i;
|
||
|
}
|
||
|
} catch(bad_alloc &e) {
|
||
|
if(this->_passMemExc) {
|
||
|
throw e; // rethrow immediately
|
||
|
} else {
|
||
|
cerr << "Could not allocate a master suffix-array block of " << ((len+1) * 4) << " bytes" << endl
|
||
|
<< "Please try using a larger number of blocks by specifying a smaller --bmax or" << endl
|
||
|
<< "a larger --bmaxdivn" << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
}
|
||
|
} else {
|
||
|
try {
|
||
|
{
|
||
|
ThreadSafe ts(&_mutex, this->_nthreads > 1);
|
||
|
VMSG_NL(" Reserving size (" << this->bucketSz() << ") for bucket " << (cur_block+1));
|
||
|
}
|
||
|
// BTL: Add a +100 fudge factor; there seem to be instances
|
||
|
// where a bucket ends up having one more elt than bucketSz()
|
||
|
if(bucket.size() < this->bucketSz()+100) {
|
||
|
bucket.reserveExact(this->bucketSz()+100);
|
||
|
}
|
||
|
} catch(bad_alloc &e) {
|
||
|
if(this->_passMemExc) {
|
||
|
throw e; // rethrow immediately
|
||
|
} else {
|
||
|
cerr << "Could not allocate a suffix-array block of " << ((this->bucketSz()+1) * 4) << " bytes" << endl;
|
||
|
cerr << "Please try using a larger number of blocks by specifying a smaller --bmax or" << endl
|
||
|
<< "a larger --bmaxdivn" << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
}
|
||
|
// Select upper and lower bounds from _sampleSuffs[] and
|
||
|
// calculate the Z array up to the difference-cover periodicity
|
||
|
// for both. Be careful about first/last buckets.
|
||
|
EList<TIndexOffU> zLo(EBWTB_CAT), zHi(EBWTB_CAT);
|
||
|
assert_geq(cur_block, 0);
|
||
|
assert_leq((size_t)cur_block, _sampleSuffs.size());
|
||
|
bool first = (cur_block == 0);
|
||
|
bool last = ((size_t)cur_block == _sampleSuffs.size());
|
||
|
try {
|
||
|
// Timer timer(cout, " Calculating Z arrays time: ", this->verbose());
|
||
|
{
|
||
|
ThreadSafe ts(&_mutex, this->_nthreads > 1);
|
||
|
VMSG_NL(" Calculating Z arrays for bucket " << (cur_block+1));
|
||
|
}
|
||
|
if(!last) {
|
||
|
// Not the last bucket
|
||
|
assert_lt(cur_block, _sampleSuffs.size());
|
||
|
hi = _sampleSuffs[cur_block];
|
||
|
zHi.resizeExact(_dcV);
|
||
|
zHi.fillZero();
|
||
|
assert_eq(zHi[0], 0);
|
||
|
calcZ(t, hi, zHi, this->verbose(), this->sanityCheck());
|
||
|
}
|
||
|
if(!first) {
|
||
|
// Not the first bucket
|
||
|
assert_gt(cur_block, 0);
|
||
|
assert_leq(cur_block, _sampleSuffs.size());
|
||
|
lo = _sampleSuffs[cur_block-1];
|
||
|
zLo.resizeExact(_dcV);
|
||
|
zLo.fillZero();
|
||
|
assert_gt(_dcV, 3);
|
||
|
assert_eq(zLo[0], 0);
|
||
|
calcZ(t, lo, zLo, this->verbose(), this->sanityCheck());
|
||
|
}
|
||
|
} catch(bad_alloc &e) {
|
||
|
if(this->_passMemExc) {
|
||
|
throw e; // rethrow immediately
|
||
|
} else {
|
||
|
cerr << "Could not allocate a z-array of " << (_dcV * 4) << " bytes" << endl;
|
||
|
cerr << "Please try using a larger number of blocks by specifying a smaller --bmax or" << endl
|
||
|
<< "a larger --bmaxdivn" << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// This is the most critical loop in the algorithm; this is where
|
||
|
// we iterate over all suffixes in the text and pick out those that
|
||
|
// fall into the current bucket.
|
||
|
//
|
||
|
// This loop is based on the SMALLERSUFFIXES function outlined on
|
||
|
// p7 of the "Fast BWT" paper
|
||
|
//
|
||
|
int64_t kHi = -1, kLo = -1;
|
||
|
int64_t jHi = -1, jLo = -1;
|
||
|
bool kHiSoft = false, kLoSoft = false;
|
||
|
assert_eq(0, bucket.size());
|
||
|
{
|
||
|
// Timer timer(cout, " Block accumulator loop time: ", this->verbose());
|
||
|
{
|
||
|
ThreadSafe ts(&_mutex, this->_nthreads > 1);
|
||
|
VMSG_NL(" Entering block accumulator loop for bucket " << (cur_block+1) << ":");
|
||
|
}
|
||
|
TIndexOffU lenDiv10 = (len + 9) / 10;
|
||
|
for(TIndexOffU iten = 0, ten = 0; iten < len; iten += lenDiv10, ten++) {
|
||
|
TIndexOffU itenNext = iten + lenDiv10;
|
||
|
{
|
||
|
ThreadSafe ts(&_mutex, this->_nthreads > 1);
|
||
|
if(ten > 0) VMSG_NL(" bucket " << (cur_block+1) << ": " << (ten * 10) << "%");
|
||
|
}
|
||
|
for(TIndexOffU i = iten; i < itenNext && i < len; i++) {
|
||
|
assert_lt(jLo, (int64_t)i); assert_lt(jHi, (int64_t)i);
|
||
|
// Advance the upper-bound comparison by one character
|
||
|
if(i == hi || i == lo) continue; // equal to one of the bookends
|
||
|
if(hi != OFF_MASK && !suffixCmp(hi, i, jHi, kHi, kHiSoft, zHi)) {
|
||
|
continue; // not in the bucket
|
||
|
}
|
||
|
if(lo != OFF_MASK && suffixCmp(lo, i, jLo, kLo, kLoSoft, zLo)) {
|
||
|
continue; // not in the bucket
|
||
|
}
|
||
|
// In the bucket! - add it
|
||
|
assert_lt(i, len);
|
||
|
try {
|
||
|
bucket.push_back(i);
|
||
|
} catch(bad_alloc &e) {
|
||
|
cerr << "Could not append element to block of " << ((bucket.size()) * OFF_SIZE) << " bytes" << endl;
|
||
|
if(this->_passMemExc) {
|
||
|
throw e; // rethrow immediately
|
||
|
} else {
|
||
|
cerr << "Please try using a larger number of blocks by specifying a smaller --bmax or" << endl
|
||
|
<< "a larger --bmaxdivn" << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
}
|
||
|
// Not necessarily true; we allow overflowing buckets
|
||
|
// since we can't guarantee that a good set of sample
|
||
|
// suffixes can be found in a reasonable amount of time
|
||
|
//assert_lt(bucket.size(), this->bucketSz());
|
||
|
}
|
||
|
} // end loop over all suffixes of t
|
||
|
{
|
||
|
ThreadSafe ts(&_mutex, this->_nthreads > 1);
|
||
|
VMSG_NL(" bucket " << (cur_block+1) << ": 100%");
|
||
|
}
|
||
|
}
|
||
|
} // end else clause of if(_sampleSuffs.size() == 0)
|
||
|
// Sort the bucket
|
||
|
if(bucket.size() > 0) {
|
||
|
Timer timer(cout, " Sorting block time: ", this->verbose());
|
||
|
{
|
||
|
ThreadSafe ts(&_mutex, this->_nthreads > 1);
|
||
|
VMSG_NL(" Sorting block of length " << bucket.size() << " for bucket " << (cur_block+1));
|
||
|
}
|
||
|
this->qsort(bucket);
|
||
|
}
|
||
|
if(hi != OFF_MASK) {
|
||
|
// Not the final bucket; throw in the sample on the RHS
|
||
|
bucket.push_back(hi);
|
||
|
} else {
|
||
|
// Final bucket; throw in $ suffix
|
||
|
bucket.push_back(len);
|
||
|
}
|
||
|
{
|
||
|
ThreadSafe ts(&_mutex, this->_nthreads > 1);
|
||
|
VMSG_NL("Returning block of " << bucket.size() << " for bucket " << (cur_block+1));
|
||
|
}
|
||
|
}
|
||
|
|
||
|
#endif /*BLOCKWISE_SA_H_*/
|