hisat-3n/gfm.h
2025-01-18 21:09:52 +08:00

6981 lines
252 KiB
C++

/*
* Copyright 2015, Daehwan Kim <infphilo@gmail.com>
*
* This file is part of HISAT 2.
*
* HISAT 2 is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* HISAT 2 is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with HISAT 2. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef GFM_H_
#define GFM_H_
#include <stdint.h>
#include <string.h>
#include <iostream>
#include <fstream>
#include <sstream>
#include <memory>
#include <fcntl.h>
#include <math.h>
#include <errno.h>
#include <set>
#include <stdexcept>
#include <sys/stat.h>
#ifdef BOWTIE_MM
#include <sys/mman.h>
#include <sys/shm.h>
#endif
#include "shmem.h"
#include "alphabet.h"
#include "assert_helpers.h"
#include "bitpack.h"
#include "blockwise_sa.h"
#include "endian_swap.h"
#include "word_io.h"
#include "random_source.h"
#include "ref_read.h"
#include "threading.h"
#include "str_util.h"
#include "mm.h"
#include "timer.h"
#include "reference.h"
#include "search_globals.h"
#include "ds.h"
#include "random_source.h"
#include "mem_ids.h"
#include "btypes.h"
#include "tokenize.h"
#include "repeat.h"
#include "repeat_kmer.h"
#ifdef POPCNT_CAPABILITY
#include "processor_support.h"
#endif
#include "gbwt_graph.h"
using namespace std;
// From ccnt_lut.cpp, automatically generated by gen_lookup_tables.pl
extern uint8_t cCntLUT_4[4][4][256];
extern uint8_t cCntLUT_4_rev[4][4][256];
extern uint8_t cCntBIT[8][256];
extern bool threeN;
// XOR masks used by countInU64(). Entry c is the bitwise complement of the
// 2-bit code c replicated across all 32 bit-pairs of a 64-bit word, so
// (dw ^ c_table[c]) has both bits set in exactly those bit-pairs of dw that
// are equal to c.
static const uint64_t c_table[4] = {
0xffffffffffffffff, // c == 0 (bit-pair 00)
0xaaaaaaaaaaaaaaaa, // c == 1 (bit-pair 01)
0x5555555555555555, // c == 2 (bit-pair 10)
0x0000000000000000 // c == 3 (bit-pair 11)
};
// Emit a verbose-mode message followed by a newline (endl). The macro expands
// in place and goes through this->, so it can only be used inside a member
// function of a class that offers a verbose() predicate and a verbose(...)
// overload accepting the formatted string. The message is only formatted
// when this->verbose() is true.
// NOTE(review): the expansion is a bare if-block rather than do { } while(0);
// avoid using it as the unbraced body of an if that has an else branch.
#ifndef VMSG_NL
#define VMSG_NL(...) \
if(this->verbose()) { \
stringstream tmp; \
tmp << __VA_ARGS__ << endl; \
this->verbose(tmp.str()); \
}
#endif
// Same as VMSG_NL but without the trailing newline: formats the arguments
// into a stringstream and hands the result to this->verbose(...), only when
// this->verbose() is true. Usable only inside member functions (uses this->).
// NOTE(review): the expansion is a bare if-block rather than do { } while(0);
// avoid using it as the unbraced body of an if that has an else branch.
#ifndef VMSG
#define VMSG(...) \
if(this->verbose()) { \
stringstream tmp; \
tmp << __VA_ARGS__; \
this->verbose(tmp.str()); \
}
#endif
/**
* Flags describing the type of index (the index is referred to as an
* "Ebwt" throughout the comments of this file).
*/
enum GFM_FLAGS {
GFM_ENTIRE_REV = 4 // true -> reverse Ebwt is the whole
// concatenated string reversed, rather than
// each stretch reversed
};
/**
 * Index header. Holds the handful of values that define an index (text
 * length, GBWT length, number of nodes, line rate, offset rate, ftab
 * characters, eftab length, reverse flag) together with every size and
 * length that init() derives from them: the line/side geometry of the
 * packed GBWT array and the sizes of the ftab, eftab and sampled-offset
 * tables.
 */
template <typename index_t = uint32_t>
class GFMParams {
public:
    /// Leaves all fields uninitialized; init() must be called before use.
    GFMParams() { }

    /// Construct and immediately initialize; see init() for the parameters.
    GFMParams(
        index_t len,
        index_t gbwtLen,
        index_t numNodes,
        int32_t lineRate,
        int32_t offRate,
        int32_t ftabChars,
        index_t eftabLen,
        bool entireReverse)
    {
        init(len, gbwtLen, numNodes, lineRate, offRate, ftabChars, eftabLen, entireReverse);
    }

    /// Copy by re-running init() on the source object's defining values, so
    /// all derived fields are recomputed rather than copied.
    GFMParams(const GFMParams& gh) {
        init(gh._len, gh._gbwtLen, gh._numNodes, gh._lineRate, gh._offRate,
             gh._ftabChars, gh._eftabLen, gh._entireReverse);
    }

    /**
     * Set the defining values and compute every derived size.
     *
     * @param len           length of the text
     * @param gbwtLen       length of the GBWT; 0 means len + 1
     * @param numNodes      number of nodes; 0 means len + 1
     * @param lineRate      log2 of the line size in bytes
     * @param offRate       log2 of the offset sampling interval
     * @param ftabChars     number of characters used to index the ftab
     * @param eftabLen      number of entries in the eftab
     * @param entireReverse stored as-is; see GFM_ENTIRE_REV
     */
    void init(
        index_t len,
        index_t gbwtLen,
        index_t numNodes,
        int32_t lineRate,
        int32_t offRate,
        int32_t ftabChars,
        index_t eftabLen,
        bool entireReverse)
    {
        // Shifts below start from an index_t-typed 1 so they are carried out
        // at the width of index_t instead of int.
        const index_t one = 1;
        _entireReverse = entireReverse;
        // The index is a linear FM index when the GBWT is exactly one
        // character longer than the text (or its length was left at 0).
        _linearFM = (len + 1 == gbwtLen || gbwtLen == 0);
        _len = len;
        _gbwtLen = (gbwtLen == 0 ? len + 1 : gbwtLen);
        _numNodes = (numNodes == 0 ? len + 1 : numNodes);
        if(_linearFM) {
            // 4 characters per byte
            _sz = (len+3)/4;
            _gbwtSz = _gbwtLen/4 + 1;
        } else {
            // 2 characters per byte
            _sz = (len+1)/2;
            _gbwtSz = _gbwtLen/2 + 1;
        }
        _lineRate = lineRate;
        _origOffRate = offRate;
        _offRate = offRate;
        _offMask = std::numeric_limits<index_t>::max() << _offRate;
        _ftabChars = ftabChars;
        _eftabLen = eftabLen;
        _eftabSz = _eftabLen*sizeof(index_t);
        // 4^ftabChars entries plus one. NOTE: still requires
        // 2*ftabChars to be smaller than the bit width of index_t.
        _ftabLen = (one << (_ftabChars*2))+1;
        _ftabSz = _ftabLen*sizeof(index_t);
        // One sampled offset per 2^offRate nodes, rounded up.
        _offsLen = (_numNodes + (one << _offRate) - 1) >> _offRate;
        _offsSz = _offsLen*sizeof(index_t);
        _lineSz = one << _lineRate;
        _sideSz = _lineSz * 1 /* lines per side */;
        // Each side reserves some index_t-sized words; the rest of the side
        // holds packed GBWT characters.
        if(_linearFM) {
            _sideGbwtSz = _sideSz - (sizeof(index_t) * 4);
            _sideGbwtLen = _sideGbwtSz << 2;
        } else {
            _sideGbwtSz = _sideSz - (sizeof(index_t) * 6);
            _sideGbwtLen = _sideGbwtSz << 1;
        }
        _numSides = (_gbwtSz+(_sideGbwtSz)-1)/(_sideGbwtSz);
        _numLines = _numSides * 1 /* lines per side */;
        _gbwtTotLen = _numSides * _sideSz;
        _gbwtTotSz = _gbwtTotLen;
        assert(repOk());
    }

    index_t len() const { return _len; }
    index_t lenNucs() const { return _len; }
    index_t gbwtLen() const { return _gbwtLen; }
    index_t sz() const { return _sz; }
    index_t gbwtSz() const { return _gbwtSz; }
    int32_t lineRate() const { return _lineRate; }
    int32_t origOffRate() const { return _origOffRate; }
    int32_t offRate() const { return _offRate; }
    index_t offMask() const { return _offMask; }
    int32_t ftabChars() const { return _ftabChars; }
    index_t eftabLen() const { return _eftabLen; }
    index_t eftabSz() const { return _eftabSz; }
    index_t ftabLen() const { return _ftabLen; }
    index_t ftabSz() const { return _ftabSz; }
    index_t offsLen() const { return _offsLen; }
    index_t offsSz() const { return _offsSz; }
    index_t lineSz() const { return _lineSz; }
    index_t sideSz() const { return _sideSz; }
    /// Historical (misspelled) accessor, kept for existing callers.
    index_t sideGbtSz() const { return _sideGbwtSz; }
    /// Correctly spelled equivalent of sideGbtSz().
    index_t sideGbwtSz() const { return _sideGbwtSz; }
    index_t sideGbwtLen() const { return _sideGbwtLen; }
    index_t numSides() const { return _numSides; }
    index_t numLines() const { return _numLines; }
    index_t gbwtTotLen() const { return _gbwtTotLen; }
    index_t gbwtTotSz() const { return _gbwtTotSz; }
    bool entireReverse() const { return _entireReverse; }
    bool linearFM() const { return _linearFM; }
    index_t numNodes() const { return _numNodes; }

    /**
     * Set a new suffix-array sampling rate, which involves updating
     * rate, mask, sample length, and sample size. _origOffRate is left
     * untouched. The sample length is derived from the number of nodes,
     * exactly as in init(), so that an object updated through this method
     * and a copy of it agree on _offsLen.
     */
    void setOffRate(int __offRate) {
        _offRate = __offRate;
        _offMask = std::numeric_limits<index_t>::max() << _offRate;
        _offsLen = (_numNodes + ((index_t)1 << _offRate) - 1) >> _offRate;
        _offsSz = _offsLen * sizeof(index_t);
    }

#ifndef NDEBUG
    /// Check that this GFMParams is internally consistent
    bool repOk() const {
        // assert_gt(_len, 0);
        assert_gt(_lineRate, 3);
        assert_geq(_offRate, 0);
        assert_leq(_ftabChars, 16);
        assert_geq(_ftabChars, 1);
        assert_lt(_lineRate, 32);
        assert_lt(_ftabChars, 32);
        assert_eq(0, _gbwtTotSz % _lineSz);
        return true;
    }
#endif

    /**
     * Pretty-print the header contents to the given output stream.
     */
    void print(ostream& out) const {
        out << "Headers:" << endl
            << " len: " << _len << endl
            << " gbwtLen: " << _gbwtLen << endl
            << " nodes: " << _numNodes << endl
            << " sz: " << _sz << endl
            << " gbwtSz: " << _gbwtSz << endl
            << " lineRate: " << _lineRate << endl
            << " offRate: " << _offRate << endl
            << " offMask: 0x" << hex << _offMask << dec << endl
            << " ftabChars: " << _ftabChars << endl
            << " eftabLen: " << _eftabLen << endl
            << " eftabSz: " << _eftabSz << endl
            << " ftabLen: " << _ftabLen << endl
            << " ftabSz: " << _ftabSz << endl
            << " offsLen: " << _offsLen << endl
            << " offsSz: " << _offsSz << endl
            << " lineSz: " << _lineSz << endl
            << " sideSz: " << _sideSz << endl
            << " sideGbwtSz: " << _sideGbwtSz << endl
            << " sideGbwtLen: " << _sideGbwtLen << endl
            << " numSides: " << _numSides << endl
            << " numLines: " << _numLines << endl
            << " gbwtTotLen: " << _gbwtTotLen << endl
            << " gbwtTotSz: " << _gbwtTotSz << endl
            << " reverse: " << _entireReverse << endl
            << " linearFM: " << (_linearFM ? "Yes" : "No") << endl;
    }

    index_t _len;          // length of the text
    index_t _gbwtLen;      // length of the GBWT
    index_t _sz;           // bytes needed for the packed text
    index_t _gbwtSz;       // bytes needed for the packed GBWT
    int32_t _lineRate;     // log2 of line size in bytes
    int32_t _origOffRate;  // offRate as passed to init()
    int32_t _offRate;      // current offRate (may be raised by setOffRate)
    index_t _offMask;      // all-ones shifted left by _offRate
    int32_t _ftabChars;    // characters used to index the ftab
    index_t _eftabLen;     // entries in the eftab
    index_t _eftabSz;      // bytes in the eftab
    index_t _ftabLen;      // entries in the ftab
    index_t _ftabSz;       // bytes in the ftab
    index_t _offsLen;      // entries in the sampled-offset table
    index_t _offsSz;       // bytes in the sampled-offset table
    index_t _lineSz;       // bytes per line
    index_t _sideSz;       // bytes per side
    index_t _sideGbwtSz;   // bytes of packed GBWT per side
    index_t _sideGbwtLen;  // GBWT characters per side
    index_t _numSides;     // number of sides
    index_t _numLines;     // number of lines
    index_t _gbwtTotLen;   // total bytes over all sides
    index_t _gbwtTotSz;    // same value as _gbwtTotLen
    bool _entireReverse;
    bool _linearFM;
    index_t _numNodes;     // number of nodes
};
/**
 * Exception to throw when a file-related error occurs. Carries the given
 * message (empty by default) and can be caught as std::runtime_error.
 */
class GFMFileOpenException : public std::runtime_error {
public:
    GFMFileOpenException(const std::string& msg = "")
        : std::runtime_error(msg)
    {
    }
};
/**
 * Return the size in bytes of the named file, measured as the distance
 * between the stream positions at its beginning and at its end. Returns 0
 * when the file cannot be opened for reading.
 */
static inline int64_t fileSize(const char* name) {
    std::ifstream in(name, std::ios_base::in | std::ios_base::binary);
    const bool usable = in.is_open() && in.good() && !in.eof();
    if (!usable) {
        return 0;
    }
    in.seekg(0, std::ios_base::beg);
    const std::ifstream::pos_type first = in.tellg();
    in.seekg(0, std::ios_base::end);
    const std::ifstream::pos_type last = in.tellg();
    return static_cast<int64_t>(last - first);
}
/**
* Encapsulates a location in the gbwt text in terms of the side it
* occurs in and its offset within the side.
*/
template <typename index_t = uint32_t>
struct SideLocus {
SideLocus() :
_sideByteOff(0),
_sideNum(0),
_charOff(0),
_by(-1),
_bp(-1) { }
/**
* Construct from row and other relevant information about the Ebwt.
*/
SideLocus(index_t row, const GFMParams<index_t>& ep, const uint8_t* ebwt) {
initFromRow(row, ep, ebwt);
}
/**
* Init two SideLocus objects from a top/bot pair, using the result
* from one call to initFromRow to possibly avoid a second call.
*/
static void initFromTopBot(
index_t top,
index_t bot,
const GFMParams<index_t>& gp,
const uint8_t* gfm,
SideLocus& ltop,
SideLocus& lbot)
{
const index_t sideGbwtLen = gp._sideGbwtLen;
assert_gt(bot, top);
ltop.initFromRow(top, gp, gfm);
index_t spread = bot - top;
// Many cache misses on the following lines
if(ltop._charOff + spread < sideGbwtLen) {
lbot._charOff = ltop._charOff + spread;
lbot._sideNum = ltop._sideNum;
lbot._sideByteOff = ltop._sideByteOff;
lbot._by = lbot._charOff >> 2;
assert_lt(lbot._by, (int)gp._sideGbwtSz);
lbot._bp = lbot._charOff & 0x3;
} else {
lbot.initFromRow(bot, gp, gfm);
}
}
/**
* Calculate SideLocus based on a row and other relevant
* information about the shape of the Ebwt.
*/
void initFromRow(
index_t row,
const GFMParams<index_t>& gp,
const uint8_t* gfm) {
const index_t sideSz = gp._sideSz;
// Side length is hard-coded for now; this allows the compiler
// to do clever things to accelerate / and %.
_sideNum = row / gp._sideGbwtLen;
assert_lt(_sideNum, gp._numSides);
_charOff = row % gp._sideGbwtLen;
_sideByteOff = _sideNum * sideSz;
assert_leq(row, gp._gbwtLen);
assert_leq(_sideByteOff + sideSz, gp._gbwtTotSz);
// Tons of cache misses on the next line
_by = _charOff >> 2; // byte within side
assert_lt(_by, (int)gp._sideGbwtSz);
_bp = _charOff & 0x3; // bit-pair within byte
}
/**
* Init two SideLocus objects from a top/bot pair, using the result
* from one call to initFromRow to possibly avoid a second call.
*/
static void initFromTopBot_bit(
index_t top,
index_t bot,
const GFMParams<index_t>& gp,
const uint8_t* gfm,
SideLocus& ltop,
SideLocus& lbot)
{
const index_t sideGbwtLen = gp._sideGbwtLen;
// assert_gt(bot, top);
ltop.initFromRow_bit(top, gp, gfm);
index_t spread = bot - top;
// Many cache misses on the following lines
if(ltop._charOff + spread < sideGbwtLen) {
lbot._charOff = ltop._charOff + spread;
lbot._sideNum = ltop._sideNum;
lbot._sideByteOff = ltop._sideByteOff;
lbot._by = lbot._charOff >> 3;
assert_lt(lbot._by, (int)gp._sideGbwtSz);
lbot._bp = lbot._charOff & 0x7;
} else {
lbot.initFromRow_bit(bot, gp, gfm);
}
}
/**
* Calculate SideLocus based on a row and other relevant
* information about the shape of the Ebwt.
*/
void initFromRow_bit(
index_t row,
const GFMParams<index_t>& gp,
const uint8_t* gfm) {
const index_t sideSz = gp._sideSz;
// Side length is hard-coded for now; this allows the compiler
// to do clever things to accelerate / and %.
_sideNum = row / gp._sideGbwtLen;
assert_lt(_sideNum, gp._numSides);
_charOff = row % gp._sideGbwtLen;
_sideByteOff = _sideNum * sideSz;
assert_lt(row, gp._gbwtLen);
assert_leq(_sideByteOff + sideSz, gp._gbwtTotSz);
// Tons of cache misses on the next line
_by = _charOff >> 3; // byte within side
assert_lt(_by, (int)gp._sideGbwtSz);
_bp = _charOff & 0x7; // bit-pair within byte
}
/**
* Transform this SideLocus to refer to the next side (i.e. the one
* corresponding to the next side downstream). Set all cursors to
* point to the beginning of the side.
*/
void nextSide(const GFMParams<index_t>& gp) {
assert(valid());
_sideByteOff += gp.sideSz();
_sideNum++;
_by = _bp = _charOff = 0;
assert(valid());
}
/**
* Return true iff this is an initialized SideLocus
*/
bool valid() const {
if(_bp != -1) {
return true;
}
return false;
}
/**
* Convert locus to BW row it corresponds to.
*/
index_t toBWRow(const GFMParams<index_t>& gp) const;
#ifndef NDEBUG
/**
* Check that SideLocus is internally consistent and consistent
* with the (provided) EbwtParams.
*/
bool repOk(const GFMParams<index_t>& gp) const {
ASSERT_ONLY(index_t row = toBWRow(gp));
assert_leq(row, gp._gbwtLen);
assert_range(-1, 3, _bp);
assert_range(0, (int)gp._sideGbwtSz, _by);
return true;
}
#endif
/// Make this look like an invalid SideLocus
void invalidate() {
_bp = -1;
}
/**
* Return a read-only pointer to the beginning of the top side.
*/
const uint8_t *side(const uint8_t* gbwt) const {
return gbwt + _sideByteOff;
}
/**
* Return a read-only pointer to the beginning of the top side.
*/
const uint8_t *next_side(const GFMParams<index_t>& gp, const uint8_t* gbwt) const {
if(_sideByteOff + gp._sideSz < gp._ebwtTotSz) {
return gbwt + _sideByteOff + gp._sideSz;
} else {
return NULL;
}
}
index_t _sideByteOff; // offset of top side within ebwt[]
index_t _sideNum; // index of side
index_t _charOff; // character offset within side
int32_t _by; // byte within side (not adjusted for bw sides)
int32_t _bp; // bitpair within byte (not adjusted for bw sides)
};
/**
 * Convert locus to the BW row it corresponds to: the number of characters
 * in all preceding sides plus the character offset within this side.
 */
template <typename index_t>
inline index_t SideLocus<index_t>::toBWRow(const GFMParams<index_t>& gp) const {
    // Characters per side = bytes per side shifted by 2 (linear FM index)
    // or by 1 (otherwise).
    const int shift = gp.linearFM() ? 2 : 1;
    return _sideNum * (gp._sideGbwtSz << shift) + _charOff;
}
#ifdef POPCNT_CAPABILITY // wrapping of "struct"
struct USE_POPCNT_GENERIC {
#endif
// Standard bit-bashing population count: returns the number of set bits
// (0..64) in x. Bits are summed in pairs, nibbles and bytes, then the byte
// sums are folded into the low byte.
inline static int pop64(uint64_t x) {
    // Lots of cache misses on following lines (>10K)
    x = x - ((x >> 1) & 0x5555555555555555llu);
    x = (x & 0x3333333333333333llu) + ((x >> 2) & 0x3333333333333333llu);
    x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0Fllu;
    x = x + (x >> 8);
    x = x + (x >> 16);
    x = x + (x >> 32);
    // The total can be as large as 64, which needs 7 bits; masking with
    // 0x3F would report 0 for an all-ones word.
    return (int)(x & 0x7Fllu);
}
#ifdef POPCNT_CAPABILITY // wrapping a "struct"
};
#endif
#ifdef POPCNT_CAPABILITY
// Population-count policy that counts the set bits of a 64-bit word with
// the compiler intrinsic __popcnt64 (when USING_MSC_COMPILER is defined) or
// with an inline-assembly popcntq instruction (otherwise). Only compiled
// when POPCNT_CAPABILITY is defined; it exposes the same static pop64()
// name as the bit-bashing variants so it can be used as the Operation
// template argument of countInU64 / countInU64_bits.
struct USE_POPCNT_INSTRUCTION {
inline static int pop64(uint64_t x) {
int64_t count;
#ifdef USING_MSC_COMPILER
count = __popcnt64(x);
#else
// Output operand 'count' is an early-clobber register; 'x' is a register input.
asm ("popcntq %[x],%[count]\n": [count] "=&r" (count): [x] "r" (x));
#endif
return (int)count;
}
};
#endif
/**
 * Count how many of the 32 bit-pairs of dw hold the two-bit value c (0-3).
 * When POPCNT_CAPABILITY is defined, the population count is delegated to
 * the Operation policy's pop64(); otherwise the free function pop64() is
 * used.
 */
#ifdef POPCNT_CAPABILITY
template<typename Operation>
#endif
inline static int countInU64(int c, uint64_t dw) {
    // After the XOR a bit-pair is 11 exactly where dw holds the code c.
    const uint64_t matched = dw ^ c_table[c];
    // Keep a single marker bit (the low bit of the pair) for every pair
    // whose two bits are both set.
    const uint64_t hits = matched & ((matched >> 1) & (0x5555555555555555));
#ifdef POPCNT_CAPABILITY
    const uint64_t cnt = Operation().pop64(hits);
#else
    const uint64_t cnt = pop64(hits);
#endif
    return (int) cnt;
}
#ifdef POPCNT_CAPABILITY // wrapping of "struct"
struct USE_POPCNT_GENERIC_BITS {
// Bit-bashing population count: number of set bits in x
inline static uint64_t pop64(uint64_t x) {
#else
// Bit-bashing population count: number of set bits in x
inline static uint64_t pop6464(uint64_t x) {
#endif
    const uint64_t kPairMask = 0x5555555555555555ULL;
    const uint64_t kNibbleMask = 0x3333333333333333ULL;
    const uint64_t kByteMask = 0x0f0f0f0f0f0f0f0fULL;
    const uint64_t kByteSum = 0x0101010101010101ULL;
    // Per-pair, then per-nibble, then per-byte counts.
    const uint64_t pairs = x - ((x >> 1) & kPairMask);
    const uint64_t nibbles = (pairs & kNibbleMask) + ((pairs >> 2) & kNibbleMask);
    const uint64_t bytes = (nibbles + (nibbles >> 4)) & kByteMask;
    // The multiplication accumulates all byte counts into the top byte.
    return int((bytes * kByteSum) >> 56);
}
#ifdef POPCNT_CAPABILITY // wrapping a "struct"
};
#endif
/**
 * Count the set bits of a 64-bit argument. When POPCNT_CAPABILITY is
 * defined, the count is delegated to the Operation policy's pop64();
 * otherwise the free function pop6464() is used.
 */
#ifdef POPCNT_CAPABILITY
template<typename Operation>
#endif
inline static int countInU64_bits(uint64_t dw) {
#ifdef POPCNT_CAPABILITY
    const uint64_t nset = Operation().pop64(dw);
#else
    const uint64_t nset = pop6464(dw);
#endif
    return (int) nset;
}
// Forward declarations for Ebwt class
class GFMSearchParams;
/**
* Extended Burrows-Wheeler transform data.
*
* An Ebwt may be transferred to and from RAM with calls to
* evictFromMemory() and loadIntoMemory(). By default, a newly-created
* Ebwt is not loaded into memory; if the user would like to use a
* newly-created Ebwt to answer queries, they must first call
* loadIntoMemory().
*/
template <class index_t = uint32_t>
class GFM {
public:
// Member-initializer list shared by the GFM constructors below. It expands
// in the constructor's initializer list and refers to constructor
// parameters by name, so any constructor using it must have parameters
// called overrideOffRate, verbose, passMemExc, sanityCheck and fw.
// EBWT_CAT is passed to the container members; presumably a memory-category
// tag declared in one of the included headers - TODO confirm.
#define GFM_INITS \
_toBigEndian(currentlyBigEndian()), \
_overrideOffRate(overrideOffRate), \
_verbose(verbose), \
_passMemExc(passMemExc), \
_sanity(sanityCheck), \
fw_(fw), \
_in1(NULL), \
_in2(NULL), \
_nPat(0), \
_nFrag(0), \
_plen(EBWT_CAT), \
_rstarts(EBWT_CAT), \
_fchr(EBWT_CAT), \
_ftab(EBWT_CAT), \
_eftab(EBWT_CAT), \
_offs(EBWT_CAT), \
_gfm(EBWT_CAT), \
_useMm(false), \
useShmem_(false), \
_refnames(EBWT_CAT), \
mmFile1_(NULL), \
mmFile2_(NULL), \
_nthreads(1)
/// Default constructor. Does not use GFM_INITS, so members without a
/// default constructor of their own are left uninitialized.
GFM() {}
/// Construct a GFM from the given input file
///
/// 'in' is the index base name; the files "<in>.1.<gfm_ext>" and
/// "<in>.2.<gfm_ext>" are recorded as the primary/secondary index files and
/// "<in>.7.<gfm_ext>" / "<in>.8.<gfm_ext>" are read here for ALT records and
/// ALT names. The steps are:
///  1. unless skipLoading is set (in which case the constructor returns
///     right after recording the file names), and only when repeatdb is
///     NULL, read the index header into _gh and apply any offRate override;
///  2. read ALTs and their names into altdb, optionally dropping splice
///     sites (loadSpliceSites == false);
///  3. read haplotypes if the .7 file has more data (kept only when
///     useHaplotype is set);
///  4. when repeatdb is non-NULL, read repeat groups and their k-mer tables,
///     restricted by readLens if given;
///  5. add mirrored entries for splice sites and deletions, sort the ALTs,
///     and remap the ALT indexes stored in haplotypes.
/// altdb is dereferenced unconditionally and therefore must not be NULL.
GFM(const string& in,
ALTDB<index_t>* altdb,
RepeatDB<index_t>* repeatdb,
EList<size_t>* readLens,
int needEntireReverse,
bool fw,
int32_t overrideOffRate, // = -1,
int32_t offRatePlus, // = -1,
bool useMm, // = false,
bool useShmem, // = false,
bool mmSweep, // = false,
bool loadNames, // = false,
bool loadSASamp, // = true,
bool loadFtab, // = true,
bool loadRstarts, // = true,
bool loadSpliceSites, // = true,
bool verbose, // = false,
bool startVerbose, // = false,
bool passMemExc, // = false,
bool sanityCheck, // = false)
bool useHaplotype, // = false
bool skipLoading = false) :
GFM_INITS
{
// Memory mapping and shared memory are mutually exclusive
assert(!useMm || !useShmem);
#ifdef POPCNT_CAPABILITY
ProcessorSupport ps;
_usePOPCNTinstruction = ps.POPCNTenabled();
#endif
packed_ = false;
_useMm = useMm;
useShmem_ = useShmem;
_in1Str = in + ".1." + gfm_ext;
_in2Str = in + ".2." + gfm_ext;
if(skipLoading) return;
if(repeatdb == NULL) {
readIntoMemory(
fw ? -1 : needEntireReverse, // need REF_READ_REVERSE
loadSASamp, // load the SA sample portion?
loadFtab, // load the ftab & eftab?
loadRstarts, // load the rstarts array?
true, // stop after loading the header portion?
&_gh, // params
mmSweep, // mmSweep
loadNames, // loadNames
startVerbose); // startVerbose
// If the offRate has been overridden, reflect that in the
// _gh._offRate field
if(offRatePlus > 0 && _overrideOffRate == -1) {
_overrideOffRate = _gh._offRate + offRatePlus;
}
// Only an override that makes the sample sparser is applied
if(_overrideOffRate > _gh._offRate) {
_gh.setOffRate(_overrideOffRate);
assert_eq(_overrideOffRate, _gh._offRate);
}
}
// Read ALTs
EList<ALT<index_t> >& alts = altdb->alts();
EList<Haplotype<index_t> >& haplotypes = altdb->haplotypes();
EList<string>& altnames = altdb->altnames();
alts.clear(); altnames.clear();
string in7Str = in + ".7." + gfm_ext;
string in8Str = in + ".8." + gfm_ext;
// open alts
if(verbose || startVerbose) cerr << "Opening \"" << in7Str.c_str() << "\"" << endl;
ifstream in7(in7Str.c_str(), ios::binary);
// NOTE(review): a failure to open is reported but not treated as fatal;
// the reads below proceed on the failed stream.
if(!in7.good()) {
cerr << "Could not open index file " << in7Str.c_str() << endl;
}
// to_alti maps the position of an ALT in the file to its index in 'alts'
// (max() for ALTs that are dropped); it is rewritten after sorting below.
EList<index_t> to_alti;
index_t to_alti_far = 0;
// Leading 32-bit word is read and discarded (presumably an endianness
// marker - TODO confirm)
readI32(in7, this->toBe());
index_t numAlts = readIndex<index_t>(in7, this->toBe());
// open altnames
if(verbose || startVerbose) cerr << "Opening \"" << in8Str.c_str() << "\"" << endl;
ifstream in8(in8Str.c_str(), ios::binary);
// NOTE(review): same non-fatal handling as for the .7 file above
if(!in8.good()) {
cerr << "Could not open index file " << in8Str.c_str() << endl;
}
readI32(in8, this->toBe());
index_t numAltnames = readIndex<index_t>(in8, this->toBe());
assert_eq(numAlts, numAltnames);
if(numAlts > 0) {
// resizeExact followed by clear: presumably reserves capacity while
// leaving the list empty - TODO confirm against EList
alts.resizeExact(numAlts); alts.clear();
to_alti.resizeExact(numAlts); to_alti.clear();
while(!in7.eof() && !in8.eof()) {
alts.expand();
alts.back().read(in7, this->toBe());
to_alti.push_back(to_alti_far);
to_alti_far++;
altnames.expand();
in8 >> altnames.back();
if(!loadSpliceSites) {
if(alts.back().splicesite()) {
// Drop the splice site just read and shrink the expected
// counts so the loop's termination test still works
alts.pop_back();
assert_gt(numAlts, 0);
altnames.pop_back();
assert_gt(numAltnames, 0);
numAlts--;
numAltnames--;
to_alti.back() = std::numeric_limits<index_t>::max();
to_alti_far--;
}
}
if(alts.size() == numAlts) break;
}
}
assert_eq(alts.size(), numAlts);
assert_eq(to_alti_far, numAlts);
assert_eq(alts.size(), altnames.size());
// Check if it hits the end of file, and this routine is needed for backward compatibility
if(in7.peek() != std::ifstream::traits_type::eof()) {
index_t numHaplotypes = readIndex<index_t>(in7, this->toBe());
if(numHaplotypes > 0) {
haplotypes.resizeExact(numHaplotypes);
haplotypes.clear();
while(!in7.eof()) {
haplotypes.expand();
haplotypes.back().read(in7, this->toBe());
Haplotype<index_t>& ht = haplotypes.back();
// Translate file-order ALT indexes into indexes in 'alts'
for(index_t h = 0; h < ht.alts.size(); h++) {
ht.alts[h] = to_alti[ht.alts[h]];
}
if(haplotypes.size() == numHaplotypes) break;
}
}
// Haplotypes still have to be read to advance the stream, but are
// discarded when not requested
if(!useHaplotype) {
haplotypes.nullify();
}
}
// Read repeats
_repeat = false;
if(repeatdb != NULL) {
_repeat = true;
// Number of repeat groups in the index
index_t numRepeatIndex = readIndex<index_t>(in7, this->toBe());
assert_gt(numRepeatIndex, 0);
EList<pair<index_t, index_t> > repeatLens; repeatLens.resizeExact(numRepeatIndex);
for(size_t k = 0; k < numRepeatIndex; k++) {
repeatLens[k].first = readIndex<index_t>(in7, this->toBe());
repeatLens[k].second = readIndex<index_t>(in7, this->toBe());
}
if (readLens != NULL && !readLens->empty()) {
// Load subset of repeat groups.
// Both lists are walked in step: a group is selected when its
// first length is >= the current read length.
// NOTE(review): this walk assumes both lists are in ascending
// order - confirm with the callers.
size_t k = 0;
size_t k2 = 0;
_repeatIncluded.resizeExact(numRepeatIndex);
_repeatIncluded.fillZero();
while(k < numRepeatIndex && k2 < readLens->size()) {
if (repeatLens[k].first >= (*readLens)[k2]) {
_repeatIncluded[k] = true;
k2++;
} else {
k++;
}
}
// at least last repeat group is included
_repeatIncluded[numRepeatIndex - 1] = true;
_repeatLens.clear();
for(size_t i = 0; i < numRepeatIndex; i++) {
if (_repeatIncluded[i]) {
_repeatLens.push_back(repeatLens[i]);
}
}
} else {
// Load all repeat groups
_repeatLens = repeatLens;
_repeatIncluded.resizeExact(numRepeatIndex);
_repeatIncluded.fill(true);
}
repeatdb->read(in7, this->toBe(), _repeatIncluded);
index_t numKmertables = readIndex<index_t>(in7, this->toBe());
// One file position per k-mer table; table k (k > 0) is read starting at
// filePos[k-1], table 0 at the current position, so the entries appear
// to mark where each table ends.
EList<streampos> filePos; filePos.resizeExact(numKmertables);
for(size_t k = 0; k < numKmertables; k++) {
filePos[k] = readIndex<uint64_t>(in7, this->toBe());
}
// NOTE(review): _repeatIncluded has numRepeatIndex entries but is indexed
// up to numKmertables here, and filePos.back() below requires
// numKmertables > 0 - confirm both always hold for valid index files.
for(size_t k = 0; k < numKmertables; k++) {
if(!_repeatIncluded[k])
continue;
if(k > 0) {
in7.seekg(filePos[k-1]);
}
_repeat_kmertables.expand();
_repeat_kmertables.back().read(in7, this->toBe());
}
in7.seekg(filePos.back());
}
in7.close();
in8.close();
// Sort SNPs and Splice Sites based on positions
// First append a mirrored entry for every splice site (left/right swapped,
// named "ssr") and every deletion (positioned at its last base and flagged
// as reversed, same name as the original).
index_t nalts = (index_t)alts.size();
for(index_t s = 0; s < nalts; s++) {
ALT<index_t> alt = alts[s];
if(alt.snp()) altdb->setSNPs(true);
if(alt.exon()) altdb->setExons(true);
if(alt.splicesite()) {
altdb->setSpliceSites(true);
alts.push_back(alt);
alts.back().left = alt.right;
alts.back().right = alt.left;
altnames.push_back("ssr");
} else if(alt.deletion()) {
alts.push_back(alt);
alts.back().pos = alt.pos + alt.len - 1;
alts.back().reversed = true;
string altname = altnames[s];
altnames.push_back(altname);
}
}
// Sorting is only performed when mirrored entries were appended
if(alts.size() > 1 && alts.size() > nalts) {
assert_eq(alts.size(), altnames.size());
// Sort (ALT, original index) pairs so the names can follow their ALTs
EList<pair<ALT<index_t>, index_t> > buf; buf.resize(alts.size());
EList<string> buf2; buf2.resize(alts.size());
for(size_t i = 0; i < alts.size(); i++) {
buf[i].first = alts[i];
buf[i].second = (index_t)i;
buf2[i] = altnames[i];
}
buf.sort();
for(size_t i = 0; i < alts.size(); i++) {
alts[i] = buf[i].first;
altnames[i] = buf2[buf[i].second];
// to_alti is reused: it now maps pre-sort index -> post-sort index
// for the ALTs that were read from the file
if(buf[i].second < numAlts) {
to_alti[buf[i].second] = i;
}
}
}
if(useHaplotype) {
EList<index_t>& haplotype_maxrights = altdb->haplotype_maxrights();
haplotype_maxrights.resizeExact(haplotypes.size());
for(index_t h = 0; h < haplotypes.size(); h++) {
Haplotype<index_t>& ht = haplotypes[h];
// Second remapping pass, through the current contents of to_alti
for(index_t h2 = 0; h2 < ht.alts.size(); h2++) {
ht.alts[h2] = to_alti[ht.alts[h2]];
}
// Running maximum of 'right' over haplotypes 0..h
if(h == 0) {
haplotype_maxrights[h] = ht.right;
} else {
haplotype_maxrights[h] = std::max<index_t>(haplotype_maxrights[h - 1], ht.right);
}
}
}
assert(repeatdb != NULL || repOk());
}
/// Construct a GFM from the given header parameters only. The header _gh is
/// initialized from the joined length of 'szs' together with lineRate,
/// offRate and ftabChars; gbwtLen, numNodes and eftabLen are passed as 0.
/// Unlike the templated constructor below, this overload neither builds
/// nor reads or writes any index file: its body only detects POPCNT support
/// (when compiled in) and records 'packed'. The parameters
/// needEntireReverse, file, dcv, sztot and seed are not used here.
GFM(
bool packed,
int needEntireReverse,
int32_t lineRate,
int32_t offRate,
int32_t ftabChars,
const string& file, // base filename for GFM files
bool fw,
int dcv,
EList<RefRecord>& szs,
index_t sztot,
const RefReadInParams& refparams,
uint32_t seed,
int32_t overrideOffRate = -1,
bool verbose = false,
bool passMemExc = false,
bool sanityCheck = false) :
GFM_INITS,
_gh(
joinedLen(szs),
0,
0,
lineRate,
offRate,
ftabChars,
0,
refparams.reverse == REF_READ_REVERSE)
{
#ifdef POPCNT_CAPABILITY
ProcessorSupport ps;
_usePOPCNTinstruction = ps.POPCNTenabled();
#endif
packed_ = packed;
}
/// Construct a GFM from the given header parameters and string
/// vector, optionally using a blockwise suffix sorter with the
/// given 'bmax' and 'dcv' parameters. The actual build is delegated to
/// initFromVector().
///
/// The constructor opens "<outfile>.1.<gfm_ext>" and "<outfile>.2.<gfm_ext>"
/// for writing, runs the build, then verifies that each file on disk is at
/// least as large as the stream's final write position. When sanityCheck is
/// set, the freshly written index is loaded back into memory and evicted
/// again.
///
/// Throws the int value 1 when an output file cannot be opened or when a
/// written file turns out to be shorter than expected.
template<typename TStr>
GFM(
TStr& s,
bool packed,
int needEntireReverse,
int32_t lineRate,
int32_t offRate,
int32_t ftabChars,
int nthreads,
const string& snpfile,
const string& htfile,
const string& ssfile,
const string& exonfile,
const string& svfile,
const string& repeatfile,
const string& outfile, // base filename for GFM files
bool fw,
bool useBlockwise,
index_t bmax,
index_t bmaxSqrtMult,
index_t bmaxDivN,
int dcv,
EList<FileBuf*>& is,
EList<RefRecord>& szs,
index_t sztot,
const RefReadInParams& refparams,
EList<RefRecord>* parent_szs,
EList<string>* parent_refnames,
uint32_t seed,
int32_t overrideOffRate = -1,
bool verbose = false,
bool passMemExc = false,
bool sanityCheck = false) :
GFM_INITS,
_gh(
joinedLen(szs),
0,
0,
lineRate,
offRate,
ftabChars,
0,
refparams.reverse == REF_READ_REVERSE)
{
assert_gt(nthreads, 0);
_nthreads = nthreads;
#ifdef POPCNT_CAPABILITY
ProcessorSupport ps;
_usePOPCNTinstruction = ps.POPCNTenabled();
#endif
_in1Str = outfile + ".1." + gfm_ext;
_in2Str = outfile + ".2." + gfm_ext;
packed_ = packed;
// Open output files
ofstream fout1(_in1Str.c_str(), ios::binary);
if(!fout1.good()) {
cerr << "Could not open index file for writing: \"" << _in1Str.c_str() << "\"" << endl
<< "Please make sure the directory exists and that permissions allow writing by" << endl
<< "HISAT2." << endl;
throw 1;
}
ofstream fout2(_in2Str.c_str(), ios::binary);
if(!fout2.good()) {
cerr << "Could not open index file for writing: \"" << _in2Str.c_str() << "\"" << endl
<< "Please make sure the directory exists and that permissions allow writing by" << endl
<< "HISAT2." << endl;
throw 1;
}
// Build
initFromVector<TStr>(
s,
snpfile,
htfile,
ssfile,
exonfile,
svfile,
repeatfile,
is,
szs,
sztot,
refparams,
fout1,
fout2,
outfile,
useBlockwise,
bmax,
bmaxSqrtMult,
bmaxDivN,
dcv,
parent_szs,
parent_refnames,
seed,
verbose);
// Close output files
// The write position is captured before closing so it can be compared
// with the size of the file as it exists on disk afterwards.
fout1.flush();
int64_t tellpSz1 = (int64_t)fout1.tellp();
VMSG_NL("Wrote " << fout1.tellp() << " bytes to primary GFM file: " << _in1Str.c_str());
fout1.close();
bool err = false;
if(tellpSz1 > fileSize(_in1Str.c_str())) {
err = true;
cerr << "Index is corrupt: File size for " << _in1Str.c_str() << " should have been " << tellpSz1
<< " but is actually " << fileSize(_in1Str.c_str()) << "." << endl;
}
fout2.flush();
int64_t tellpSz2 = (int64_t)fout2.tellp();
VMSG_NL("Wrote " << fout2.tellp() << " bytes to secondary GFM file: " << _in2Str.c_str());
fout2.close();
if(tellpSz2 > fileSize(_in2Str.c_str())) {
err = true;
cerr << "Index is corrupt: File size for " << _in2Str.c_str() << " should have been " << tellpSz2
<< " but is actually " << fileSize(_in2Str.c_str()) << "." << endl;
}
// Both files are checked before failing so that both problems get reported
if(err) {
cerr << "Please check if there is a problem with the disk or if disk is full." << endl;
throw 1;
}
// Reopen as input streams
VMSG_NL("Re-opening _in1 and _in2 as input streams");
if(_sanity) {
VMSG_NL("Sanity-checking Bt2");
assert(!isInMemory());
readIntoMemory(
fw ? -1 : needEntireReverse, // 1 -> need the reverse to be reverse-of-concat
true, // load SA sample (_offs[])?
true, // load ftab (_ftab[] & _eftab[])?
true, // load r-starts (_rstarts[])?
false, // just load header?
NULL, // Params object to fill
false, // mm sweep?
true, // load names?
false); // verbose startup?
// sanityCheckAll(refparams.reverse);
evictFromMemory();
assert(!isInMemory());
}
VMSG_NL("Returning from GFM constructor");
}
/**
* Static constructor for a pair of forward/reverse indexes for the
* given reference string.
*
* Convenience wrapper: wraps 'str' in a one-element list and forwards
* every other argument unchanged to fromStrings<TStr>(), whose result
* (forward index first, reverse index second) is returned as-is.
*/
template<typename TStr>
static pair<GFM*, GFM*>
fromString(
const char* str,
bool packed,
int reverse,
bool bigEndian,
int32_t lineRate,
int32_t offRate,
int32_t ftabChars,
const string& file,
bool useBlockwise,
index_t bmax,
index_t bmaxSqrtMult,
index_t bmaxDivN,
int dcv,
uint32_t seed,
bool verbose,
bool autoMem,
bool sanity)
{
EList<std::string> strs(EBWT_CAT);
strs.push_back(std::string(str));
return fromStrings<TStr>(
strs,
packed,
reverse,
bigEndian,
lineRate,
offRate,
ftabChars,
file,
useBlockwise,
bmax,
bmaxSqrtMult,
bmaxDivN,
dcv,
seed,
verbose,
autoMem,
sanity);
}
/**
* Static constructor for a pair of forward/reverse indexes for the
* given list of reference strings.
*
* The strings are written into an in-memory FASTA stream (record i is
* named ">i"), scanned once with forward read-in parameters to build the
* forward index and once more with refparams.reverse set to 'reverse' to
* build the second index under the base name file + ".rev". Both indexes
* are allocated with new; the caller owns the returned pointers.
*
* NOTE(review): the two 'new GFM<index_t>(...)' expressions below pass 22
* arguments starting with a temporary TStr(). Neither constructor visible
* in this part of the file accepts that argument list (the templated one
* takes a non-const TStr& plus thread count and several file names), so
* this function would fail to compile if instantiated unless a matching
* overload exists elsewhere - confirm whether it is still in use.
* NOTE(review): std::auto_ptr was removed in C++17.
*/
template<typename TStr>
static pair<GFM*, GFM*>
fromStrings(
const EList<std::string>& strs,
bool packed,
int reverse,
bool bigEndian,
int32_t lineRate,
int32_t offRate,
int32_t ftabChars,
const string& file,
bool useBlockwise,
index_t bmax,
index_t bmaxSqrtMult,
index_t bmaxDivN,
int dcv,
uint32_t seed,
bool verbose,
bool autoMem,
bool sanity)
{
assert(!strs.empty());
EList<FileBuf*> is(EBWT_CAT);
RefReadInParams refparams(false /* color */, REF_READ_FORWARD, false, false);
// Adapt sequence strings to stringstreams open for input
auto_ptr<stringstream> ss(new stringstream());
for(index_t i = 0; i < strs.size(); i++) {
(*ss) << ">" << i << endl << strs[i] << endl;
}
auto_ptr<FileBuf> fb(new FileBuf(ss.get()));
assert(!fb->eof());
assert(fb->get() == '>');
// The reset undoes the get() performed by the assert above; both only
// exist in builds where asserts are enabled
ASSERT_ONLY(fb->reset());
assert(!fb->eof());
// 'is' holds a non-owning pointer; fb keeps ownership of the FileBuf
is.push_back(fb.get());
// Vector for the ordered list of "records" comprising the input
// sequences. A record represents a stretch of unambiguous
// characters in one of the input sequences.
EList<RefRecord> szs(EBWT_CAT);
std::pair<index_t, index_t> sztot;
sztot = BitPairReference::szsFromFasta(is,
file,
bigEndian,
refparams,
szs,
sanity);
// Construct Ebwt from input strings and parameters
GFM<index_t> *gfmFw = new GFM<index_t>(
TStr(),
packed,
-1, // fw
lineRate,
offRate, // suffix-array sampling rate
ftabChars, // number of chars in initial arrow-pair calc
file, // basename for .?.ebwt files
true, // fw?
useBlockwise, // useBlockwise
bmax, // block size for blockwise SA builder
bmaxSqrtMult, // block size as multiplier of sqrt(len)
bmaxDivN, // block size as divisor of len
dcv, // difference-cover period
is, // list of input streams
szs, // list of reference sizes
sztot.first, // total size of all unambiguous ref chars
refparams, // reference read-in parameters
seed, // pseudo-random number generator seed
-1, // override offRate
verbose, // be talkative
autoMem, // pass exceptions up to the toplevel so that we can adjust memory settings automatically
sanity); // verify results and internal consistency
// Second pass over the same input with the requested reverse mode
refparams.reverse = reverse;
szs.clear();
sztot = BitPairReference::szsFromFasta(is,
file,
bigEndian,
refparams,
szs,
sanity);
// Construct Ebwt from input strings and parameters
GFM<index_t> *gfmBw = new GFM<index_t>(
TStr(),
packed,
reverse == REF_READ_REVERSE,
lineRate,
offRate, // suffix-array sampling rate
ftabChars, // number of chars in initial arrow-pair calc
file + ".rev",// basename for .?.ebwt files
false, // fw?
useBlockwise, // useBlockwise
bmax, // block size for blockwise SA builder
bmaxSqrtMult, // block size as multiplier of sqrt(len)
bmaxDivN, // block size as divisor of len
dcv, // difference-cover period
is, // list of input streams
szs, // list of reference sizes
sztot.first, // total size of all unambiguous ref chars
refparams, // reference read-in parameters
seed, // pseudo-random number generator seed
-1, // override offRate
verbose, // be talkative
autoMem, // pass exceptions up to the toplevel so that we can adjust memory settings automatically
sanity); // verify results and internal consistency
return make_pair(gfmFw, gfmBw);
}
/// Return true iff the Ebwt is packed
bool isPacked() { return packed_; }
/**
 * Write the rstarts array given the szs array for the reference.
 *
 * Declaration only; the definition lives elsewhere in this file.
 *
 * @param szs     reference fragment records
 * @param os      output stream the array is written to
 * @param reverse reference-reversal mode; callers in this file pass
 *                refparams.reverse
 */
void szsToDisk(const EList<RefRecord>& szs, ostream& os, int reverse);
bool checkPosToSzs(const EList<RefRecord>& szs, index_t start_idx, index_t pos)
{
assert(szs[start_idx].first);
for(index_t i = start_idx; i < szs.size(); i++) {
if((i != start_idx) && (szs[i].first)) {
// span to next chr
return false;
}
if(pos < szs[i].off) {
return false;
} else {
pos -= szs[i].off;
if(pos < szs[i].len) {
return true;
}
pos -= szs[i].len;
}
}
assert(false);
return false;
}
/**
 * Helper for the constructors above.  Takes a vector of text
 * strings and joins them into a single string with a call to
 * joinToDisk, which does a join (with padding) and writes some of
 * the resulting data directly to disk rather than keep it in
 * memory.  It then reads the optional annotation files (SNPs,
 * haplotypes, splice sites, exons, repeats), writes them to the
 * '.7.' and '.8.' index files, and finally builds the index image
 * into out1/out2, either from a blockwise suffix-array producer (no
 * alternatives loaded) or from a RefGraph/PathGraph pair (alternatives
 * loaded).
 *
 * @param s               receives the joined reference; resized to the
 *                        joined length
 * @param snpfile         SNP list to read; "" to skip
 * @param htfile          haplotype list to read; "" to skip (also
 *                        skipped when no SNPs were loaded, in which case
 *                        one haplotype per SNP is synthesized)
 * @param ssfile          splice-site list to read; "" to skip
 * @param exonfile        exon list to read; "" to skip
 * @param svfile          only triggers a "not implemented" warning
 * @param repeatfile      repeat description, read iff parent_szs != NULL
 * @param is              input streams, forwarded to joinToDisk
 * @param szs             reference fragment records
 * @param sztot           total number of reference characters; must not
 *                        exceed the joined length
 * @param refparams       reference read-in parameters
 * @param out1            primary index output stream
 * @param out2            secondary index output stream
 * @param outfile         basename used for the '.7.' / '.8.' files and
 *                        handed to the builders
 * @param useBlockwise    not referenced by this function body
 * @param bmax            block size, or OFF_MASK to derive it
 * @param bmaxSqrtMult    alternative block-size setting (OFF_MASK = unset)
 * @param bmaxDivN        alternative block-size setting (OFF_MASK = unset)
 * @param dcv             difference-cover period
 * @param parent_szs      non-NULL selects repeat-index mode (_repeat);
 *                        fragment records of the parent reference
 * @param parent_refnames names of the parent reference sequences; must be
 *                        non-NULL in repeat-index mode
 * @param seed            pseudo-random seed for the suffix-array builder
 * @param verbose         be talkative
 *
 * Throws int (1) on I/O, format and unrecoverable memory errors, and
 * rethrows bad_alloc when a retry with a packed representation is
 * possible (!isPacked() && _passMemExc).
 */
template <typename TStr>
void initFromVector(TStr& s,
                    const string& snpfile,
                    const string& htfile,
                    const string& ssfile,
                    const string& exonfile,
                    const string& svfile,
                    const string& repeatfile,
                    EList<FileBuf*>& is,
                    EList<RefRecord>& szs,
                    index_t sztot,
                    const RefReadInParams& refparams,
                    ofstream& out1,
                    ofstream& out2,
                    const string& outfile,
                    bool useBlockwise,
                    index_t bmax,
                    index_t bmaxSqrtMult,
                    index_t bmaxDivN,
                    int dcv,
                    EList<RefRecord>* parent_szs,
                    EList<string>* parent_refnames,
                    uint32_t seed,
                    bool verbose)
{
    // Compose text strings into single string
    VMSG_NL("Calculating joined length");
    index_t jlen;
    jlen = joinedLen(szs);
    _repeat = (parent_szs != NULL);
    assert_geq(jlen, sztot);
    VMSG_NL("Writing header");
    writeFromMemory(true, out1, out2);
    try {
        VMSG_NL("Reserving space for joined string");
        s.resize(jlen);
        VMSG_NL("Joining reference sequences");
        if(refparams.reverse == REF_READ_REVERSE) {
            {
                Timer timer(cerr, " Time to join reference sequences: ", _verbose);
                joinToDisk(is, szs, sztot, refparams, s, out1, out2);
            } {
                Timer timer(cerr, " Time to reverse reference sequence: ", _verbose);
                EList<RefRecord> tmp(EBWT_CAT);
                s.reverse();
                reverseRefRecords(szs, tmp, false, verbose);
                szsToDisk(tmp, out1, refparams.reverse);
            }
        } else {
            Timer timer(cerr, " Time to join reference sequences: ", _verbose);
            joinToDisk(is, szs, sztot, refparams, s, out1, out2);
            szsToDisk(szs, out1, refparams.reverse);
        }
        {
            Timer timer(cerr, " Time to read SNPs and splice sites: ", _verbose);
            _alts.clear();
            _altnames.clear();
            // For every sequence: (offset of its first character in the
            // joined string, index of its first record in szs)
            EList<pair<index_t, index_t> > chr_szs;
            index_t tmp_len = 0;
            for(index_t i = 0; i < szs.size(); i++) {
                if(szs[i].first) {
                    chr_szs.expand();
                    chr_szs.back().first = tmp_len;
                    chr_szs.back().second = i;
                }
                tmp_len += (index_t)szs[i].len;
            }
            // Write SNPs into 7.ht2 and 8.ht2
            string file7 = outfile + ".7." + gfm_ext;
            string file8 = outfile + ".8." + gfm_ext;
            // Open output stream for the '.7.gfm_ext' file which will
            // hold SNPs (except IDs).
            ofstream fout7(file7.c_str(), ios::binary);
            if(!fout7.good()) {
                cerr << "Could not open index file for writing: \"" << file7.c_str() << "\"" << endl
                     << "Please make sure the directory exists and that permissions allow writing by" << endl
                     << "HISAT2." << endl;
                throw 1;
            }
            // Open output stream for the '.8.gfm_ext' file which will
            // hold SNP IDs.
            ofstream fout8(file8.c_str(), ios::binary);
            if(!fout8.good()) {
                cerr << "Could not open index file for writing: \"" << file8.c_str() << "\"" << endl
                     << "Please make sure the directory exists and that permissions allow writing by" << endl
                     << "HISAT2." << endl;
                throw 1;
            }
            writeIndex<int32_t>(fout7, 1, this->toBe()); // endianness sentinel
            writeIndex<int32_t>(fout8, 1, this->toBe()); // endianness sentinel
            // Reference names truncated at the first space; annotation
            // files refer to sequences by these short names
            for(index_t i = 0; i < _refnames.size(); i++) {
                _refnames_nospace.push_back("");
                for(index_t j = 0; j < _refnames[i].size(); j++) {
                    char c = _refnames[i][j];
                    if(c == ' ') break;
                    _refnames_nospace.back().push_back(c);
                }
            }
            map<string, index_t> snpID2num;
            if(snpfile != "") {
                ifstream snp_file(snpfile.c_str(), ios::in);
                if(!snp_file.is_open()) {
                    cerr << "Error: could not open " << snpfile.c_str() << endl;
                    throw 1;
                }
                // Loop on good() rather than !eof(): once a malformed field
                // sets failbit, eofbit is never reached and an eof()-only
                // test would spin forever.
                while(snp_file.good()) {
                    // rs73387790 single 22:20000001-21000000 145 A
                    string snp_id;
                    snp_file >> snp_id;
                    if(snp_id.empty() || snp_id[0] == '#') {
                        string line;
                        getline(snp_file, line);
                        continue;
                    }
                    string type, chr;
                    index_t genome_pos;
                    char snp_ch = '\0';
                    string ins_seq;
                    index_t del_len = 0;
                    snp_file >> type >> chr >> genome_pos;
                    if(type == "single") {
                        snp_file >> snp_ch;
                    } else if(type == "deletion") {
                        snp_file >> del_len;
                    } else if(type == "insertion") {
                        snp_file >> ins_seq;
                    }
                    if(snp_file.fail()) {
                        // Do not act on a half-parsed record
                        cerr << "Warning: stopped reading " << snpfile.c_str()
                             << " at a malformed or truncated record (" << snp_id << ")" << endl;
                        break;
                    }
                    index_t chr_idx = 0;
                    for(; chr_idx < _refnames_nospace.size(); chr_idx++) {
                        if(chr == _refnames_nospace[chr_idx])
                            break;
                    }
                    if(chr_idx >= _refnames_nospace.size()) {
                        continue;
                    }
                    assert_eq(chr_szs.size(), _refnames_nospace.size());
                    assert_lt(chr_idx, chr_szs.size());
                    pair<index_t, index_t> tmp_pair = chr_szs[chr_idx];
                    const index_t sofar_len = tmp_pair.first;
                    const index_t szs_idx = tmp_pair.second;
                    bool involve_Ns = false;
                    index_t pos = genome_pos;
                    index_t add_pos = 0;
                    assert(szs[szs_idx].first);
                    // Translate the sequence-relative position (gaps
                    // included) into an offset within the joined string
                    for(index_t i = szs_idx; i < szs.size(); i++) {
                        if(i != szs_idx && szs[i].first) {
                            break;
                        }
                        if(pos < szs[i].off) {
                            involve_Ns = true;
                            break;
                        } else {
                            pos -= szs[i].off;
                            if(pos == 0) {
                                if(type == "deletion" || type == "insertion") {
                                    involve_Ns = true;
                                    break;
                                }
                            }
                            if(pos < szs[i].len) {
                                break;
                            } else {
                                pos -= szs[i].len;
                                add_pos += szs[i].len;
                            }
                        }
                    }
                    if(involve_Ns) {
                        continue;
                    }
                    pos = sofar_len + add_pos + pos;
                    if(chr_idx + 1 < chr_szs.size()) {
                        if(pos >= chr_szs[chr_idx + 1].first) {
                            continue;
                        }
                    } else {
                        if(pos >= jlen){
                            continue;
                        }
                    }
                    _alts.expand();
                    ALT<index_t>& snp = _alts.back();
                    snp.pos = pos;
                    if(type == "single") {
                        snp.type = ALT_SNP_SGL;
                        snp_ch = toupper(snp_ch);
                        if(snp_ch != 'A' && snp_ch != 'C' && snp_ch != 'G' && snp_ch != 'T') {
                            _alts.pop_back();
                            continue;
                        }
                        uint64_t bp = asc2dna[(int)snp_ch];
                        assert_lt(bp, 4);
                        if((int)bp == s[pos]) {
                            if (!threeN) {
                                cerr << "Warning: single type should have a different base than " << "ACGTN"[(int)s[pos]]
                                     << " (" << snp_id << ") at " << genome_pos << " on " << chr << endl;
                            }
                            _alts.pop_back();
                            continue;
                            // throw 1;
                        }
                        snp.len = 1;
                        snp.seq = bp;
                    } else if(type == "deletion") {
                        snp.type = ALT_SNP_DEL;
                        snp.len = del_len;
                        snp.seq = 0;
                        snp.reversed = false;
                    } else if(type == "insertion") {
                        snp.type = ALT_SNP_INS;
                        snp.len = (index_t)ins_seq.size();
                        // The inserted bases are packed two bits each into snp.seq
                        if(snp.len > sizeof(snp.seq) * 4) {
                            _alts.pop_back();
                            continue;
                        }
                        snp.seq = 0;
                        bool failed = false;
                        for(size_t i = 0; i < ins_seq.size(); i++) {
                            char ch = toupper(ins_seq[i]);
                            if(ch != 'A' && ch != 'C' && ch != 'G' && ch != 'T') {
                                failed = true;
                                break;
                            }
                            uint64_t bp = asc2dna[(int)ch];
                            assert_lt(bp, 4);
                            snp.seq = (snp.seq << 2) | bp;
                        }
                        if(failed) {
                            _alts.pop_back();
                            continue;
                        }
                    } else {
                        cerr << "Error: unknown snp type " << type << endl;
                        throw 1;
                    }
                    _altnames.push_back(snp_id);
                    assert_eq(_alts.size(), _altnames.size());
                    snpID2num[snp_id] = (index_t)_alts.size() - 1;
                }
                snp_file.close();
                assert_eq(_alts.size(), _altnames.size());
            }
            _haplotypes.clear();
            if(_alts.size() > 0 && htfile != "") {
                ifstream ht_file(htfile.c_str(), ios::in);
                if(!ht_file.is_open()) {
                    cerr << "Error: could not open "<< htfile.c_str() << endl;
                    throw 1;
                }
                while(ht_file.good()) {
                    // ht66 A*01:01:01:01 371 533 66,69,72,75,76,77,84,88,90,92,95
                    string ht_id;
                    ht_file >> ht_id;
                    if(ht_id.empty() || ht_id[0] == '#') {
                        string line;
                        getline(ht_file, line);
                        continue;
                    }
                    string chr, alt_list;
                    index_t left, right; // inclusive [left, right]
                    ht_file >> chr >> left >> right >> alt_list;
                    if(ht_file.fail()) {
                        cerr << "Warning: stopped reading " << htfile.c_str()
                             << " at a malformed or truncated record (" << ht_id << ")" << endl;
                        break;
                    }
                    assert_leq(left, right);
                    index_t chr_idx = 0;
                    for(; chr_idx < _refnames_nospace.size(); chr_idx++) {
                        if(chr == _refnames_nospace[chr_idx])
                            break;
                    }
                    if(chr_idx >= _refnames_nospace.size()) {
                        continue;
                    }
                    assert_eq(chr_szs.size(), _refnames_nospace.size());
                    assert_lt(chr_idx, chr_szs.size());
                    pair<index_t, index_t> tmp_pair = chr_szs[chr_idx];
                    const index_t sofar_len = tmp_pair.first;
                    const index_t szs_idx = tmp_pair.second;
                    bool inside_Ns = false;
                    index_t add_pos = 0;
                    assert(szs[szs_idx].first);
                    // Both ends must fall inside the same fragment
                    for(index_t i = szs_idx; i < szs.size(); i++) {
                        if(i != szs_idx && szs[i].first) break;
                        if(left < szs[i].off) {
                            inside_Ns = true;
                            break;
                        } else {
                            left -= szs[i].off;
                            right -= szs[i].off;
                            if(left < szs[i].len) {
                                if(right >= szs[i].len) {
                                    inside_Ns = true;
                                }
                                break;
                            } else {
                                left -= szs[i].len;
                                right -= szs[i].len;
                                add_pos += szs[i].len;
                            }
                        }
                    }
                    if(inside_Ns) {
                        continue;
                    }
                    left = sofar_len + add_pos + left;
                    right = sofar_len + add_pos + right;
                    if(chr_idx + 1 < chr_szs.size()) {
                        if(right >= chr_szs[chr_idx + 1].first) {
                            continue;
                        }
                    } else {
                        if(right >= jlen) {
                            continue;
                        }
                    }
                    _haplotypes.expand();
                    _haplotypes.back().left = left;
                    _haplotypes.back().right = right;
                    EList<string> alts;
                    tokenize(alt_list, ",", alts);
                    assert_gt(alts.size(), 0);
                    _haplotypes.back().alts.clear();
                    for(size_t i = 0; i < alts.size(); i++) {
                        const string& alt = alts[i];
                        if(snpID2num.find(alt) != snpID2num.end()) {
                            _haplotypes.back().alts.push_back(snpID2num[alt]);
                        }
                    }
                    // Drop haplotypes none of whose SNPs were loaded
                    if(_haplotypes.back().alts.size() <= 0) {
                        _haplotypes.pop_back();
                    }
                }
                _haplotypes.sort();
                ht_file.close();
            } else {
                // No haplotype file: one single-alternative haplotype per SNP
                for(index_t a = 0; a < _alts.size(); a++) {
                    const ALT<index_t>& alt = _alts[a];
                    if(!alt.snp()) continue;
                    _haplotypes.expand();
                    _haplotypes.back().left = alt.pos;
                    if(alt.deletion()) {
                        _haplotypes.back().right = alt.pos + alt.len - 1;
                    } else {
                        _haplotypes.back().right = alt.pos;
                    }
                    _haplotypes.back().alts.clear();
                    _haplotypes.back().alts.push_back(a);
                }
            }
            if(ssfile != "") {
                ifstream ss_file(ssfile.c_str(), ios::in);
                if(!ss_file.is_open()) {
                    cerr << "Error: could not open " << ssfile.c_str() << endl;
                    throw 1;
                }
                // Occurrence count of each packed flanking sequence
                map<uint64_t, uint64_t> ss_seq;
                while(ss_file.good()) {
                    // 22 16062315 16062810 +
                    string chr;
                    ss_file >> chr;
                    if(chr.empty() || chr[0] == '#') {
                        string line;
                        getline(ss_file, line);
                        continue;
                    }
                    index_t left, right;
                    char strand;
                    ss_file >> left >> right >> strand;
                    if(ss_file.fail()) {
                        cerr << "Warning: stopped reading " << ssfile.c_str()
                             << " at a malformed or truncated record (" << chr << ")" << endl;
                        break;
                    }
                    // Convert exonic position to intronic position
                    left += 1; right -= 1;
                    if(left >= right) continue;
                    index_t chr_idx = 0;
                    for(; chr_idx < _refnames_nospace.size(); chr_idx++) {
                        if(chr == _refnames_nospace[chr_idx])
                            break;
                    }
                    if(chr_idx >= _refnames_nospace.size()) continue;
                    assert_eq(chr_szs.size(), _refnames_nospace.size());
                    assert_lt(chr_idx, chr_szs.size());
                    pair<index_t, index_t> tmp_pair = chr_szs[chr_idx];
                    const index_t sofar_len = tmp_pair.first;
                    const index_t szs_idx = tmp_pair.second;
                    // check whether ambiguous base is in exon's last and first base
                    if(!checkPosToSzs(szs, szs_idx, left - 1)
                       || !checkPosToSzs(szs, szs_idx, right + 1)) {
                        //cerr << "Skip ss. " << chr << ", " << left - 1 << ", " << right + 1 << endl;
                        continue;
                    }
                    bool inside_Ns = false;
                    index_t add_pos = 0;
                    assert(szs[szs_idx].first);
                    for(index_t i = szs_idx; i < szs.size(); i++) {
                        if(i != szs_idx && szs[i].first) break;
                        if(left < szs[i].off) {
                            inside_Ns = true;
                            break;
                        } else {
                            left -= szs[i].off;
                            right -= szs[i].off;
                            if(left < szs[i].len) {
                                if(right >= szs[i].len) {
                                    inside_Ns = true;
                                }
                                break;
                            } else {
                                left -= szs[i].len;
                                right -= szs[i].len;
                                add_pos += szs[i].len;
                            }
                        }
                    }
                    if(inside_Ns) continue;
                    left = sofar_len + add_pos + left;
                    right = sofar_len + add_pos + right;
                    if(chr_idx + 1 < chr_szs.size()) {
                        if(right >= chr_szs[chr_idx + 1].first) continue;
                    } else {
                        if(right >= jlen) continue;
                    }
                    // Avoid splice sites in repetitive sequences
                    // Otherwise, it will likely explode due to an exponential number of combinations
                    index_t seqlen = 16; assert_leq(seqlen, 16);
                    if(left >= seqlen && right + 1 + seqlen <= s.length()) {
                        // Pack the seqlen bases on either side of the intron
                        // into one 64-bit key
                        uint64_t seq = 0;
                        for(index_t si = left - seqlen; si < left; si++) {
                            seq = seq << 2 | s[si];
                        }
                        for(index_t si = right + 1; si < right + 1 + seqlen; si++) {
                            seq = seq << 2 | s[si];
                        }
                        // Skip an immediate repetition of the previous entry
                        if(_alts.size() > 0) {
                            if(_alts.back().left == left &&
                               _alts.back().right == right) continue;
                        }
                        if(ss_seq.find(seq) == ss_seq.end()) ss_seq[seq] = 1;
                        else ss_seq[seq]++;
                    }
                    _alts.expand();
                    ALT<index_t>& alt = _alts.back();
                    alt.type = ALT_SPLICESITE;
                    alt.left = left;
                    alt.right = right;
                    alt.fw = (strand == '+' ? true : false);
                    alt.excluded = false;
                    _altnames.push_back("ss");
                }
                ss_file.close();
                assert_eq(_alts.size(), _altnames.size());
                // Second pass: exclude splice sites whose flanking
                // sequence was seen more than once
                for(size_t i = 0; i < _alts.size(); i++) {
                    ALT<index_t>& alt = _alts[i];
                    if(!alt.splicesite()) continue;
                    index_t seqlen = 16; assert_leq(seqlen, 16);
                    if(alt.left >= seqlen && alt.right + 1 + seqlen <= s.length()) {
                        uint64_t seq = 0;
                        for(index_t si = alt.left - seqlen; si < alt.left; si++) {
                            seq = seq << 2 | s[si];
                        }
                        for(index_t si = alt.right + 1; si < alt.right + 1 + seqlen; si++) {
                            seq = seq << 2 | s[si];
                        }
                        assert(ss_seq.find(seq) != ss_seq.end());
                        alt.excluded = ss_seq[seq] > 1;
                    }
                }
            }
            if(exonfile != "") {
                ifstream exon_file(exonfile.c_str(), ios::in);
                if(!exon_file.is_open()) {
                    // Report the file that actually failed to open
                    cerr << "Error: could not open " << exonfile.c_str() << endl;
                    throw 1;
                }
                while(exon_file.good()) {
                    // 22 16062156 16062315 +
                    string chr;
                    exon_file >> chr;
                    if(chr.empty() || chr[0] == '#') {
                        string line;
                        getline(exon_file, line);
                        continue;
                    }
                    index_t left, right;
                    char strand;
                    exon_file >> left >> right >> strand;
                    if(exon_file.fail()) {
                        cerr << "Warning: stopped reading " << exonfile.c_str()
                             << " at a malformed or truncated record (" << chr << ")" << endl;
                        break;
                    }
                    // NOTE(review): same +1/-1 adjustment as for splice
                    // sites; confirm it is intended for exon records too
                    left += 1; right -= 1;
                    if(left >= right) continue;
                    index_t chr_idx = 0;
                    for(; chr_idx < _refnames_nospace.size(); chr_idx++) {
                        if(chr == _refnames_nospace[chr_idx])
                            break;
                    }
                    if(chr_idx >= _refnames_nospace.size()) continue;
                    assert_eq(chr_szs.size(), _refnames_nospace.size());
                    assert_lt(chr_idx, chr_szs.size());
                    pair<index_t, index_t> tmp_pair = chr_szs[chr_idx];
                    const index_t sofar_len = tmp_pair.first;
                    const index_t szs_idx = tmp_pair.second;
                    bool inside_Ns = false;
                    index_t add_pos = 0;
                    assert(szs[szs_idx].first);
                    for(index_t i = szs_idx; i < szs.size(); i++) {
                        if(i != szs_idx && szs[i].first) break;
                        if(left < szs[i].off) {
                            inside_Ns = true;
                            break;
                        } else {
                            left -= szs[i].off;
                            right -= szs[i].off;
                            if(left < szs[i].len) {
                                if(right >= szs[i].len) {
                                    inside_Ns = true;
                                }
                                break;
                            } else {
                                left -= szs[i].len;
                                right -= szs[i].len;
                                add_pos += szs[i].len;
                            }
                        }
                    }
                    if(inside_Ns) continue;
                    left = sofar_len + add_pos + left;
                    right = sofar_len + add_pos + right;
                    if(chr_idx + 1 < chr_szs.size()) {
                        if(right >= chr_szs[chr_idx + 1].first) continue;
                    } else {
                        if(right >= jlen) continue;
                    }
                    _alts.expand();
                    ALT<index_t>& alt = _alts.back();
                    alt.type = ALT_EXON;
                    alt.left = left;
                    alt.right = right;
                    alt.fw = (strand == '+' ? true : false);
                    _altnames.push_back("exon");
                }
                exon_file.close();
            }
            // Todo - implement structural variations
            if(svfile != "") {
                cerr << "Warning: SV option is not implemented " << svfile.c_str() << endl;
            }
            // Sort SNPs and Splice Sites based on positions
            if(_alts.size() > 1) {
                assert_eq(_alts.size(), _altnames.size());
                EList<pair<ALT<index_t>, index_t> > buf; buf.resize(_alts.size());
                EList<string> buf2; buf2.resize(_alts.size());
                for(size_t i = 0; i < _alts.size(); i++) {
                    buf[i].first = _alts[i];
                    buf[i].second = (index_t)i;
                    buf2[i] = _altnames[i];
                }
                buf.sort();
                for(size_t i = 0; i < _alts.size(); i++) {
                    _alts[i] = buf[i].first;
                    _altnames[i] = buf2[buf[i].second];
                }
                // buf3 maps an alternative's index before sorting to its
                // index after sorting; used to renumber haplotype members
                EList<index_t> buf3; buf3.resize(_alts.size());
                for(size_t i = 0; i < buf3.size(); i++) {
                    index_t before = buf[i].second;
                    assert_lt(before, buf3.size());
                    buf3[before] = (index_t)i;
                }
                for(size_t h = 0; h < _haplotypes.size(); h++) {
                    EList<index_t, 1>& alts = _haplotypes[h].alts;
                    for(size_t a = 0; a < alts.size(); a++) {
                        index_t before = alts[a];
                        assert_lt(before, buf3.size());
                        alts[a] = buf3[before];
                    }
                }
#ifndef NDEBUG
                for(size_t i = 0; i < _alts.size(); i++) {
                    if(i + 1 < _alts.size()) {
                        assert(_alts[i] < _alts[i+1]);
                    }
                    const ALT<index_t>& alt = _alts[i];
                    if(alt.snp()) {
                        assert(_altnames[i] != "");
                    } else if(alt.splicesite()) {
                        assert(_altnames[i] == "ss");
                    } else if(alt.exon()) {
                        assert(_altnames[i] == "exon");
                    } else {
                        assert(false);
                    }
                }
#endif
            }
            // Alternatives go to '.7.', their names (one per line) to '.8.'
            writeIndex<index_t>(fout7, (index_t)_alts.size(), this->toBe());
            writeIndex<index_t>(fout8, (index_t)_alts.size(), this->toBe());
            for(index_t i = 0; i < _alts.size(); i++) {
                _alts[i].write(fout7, this->toBe());
                fout8 << _altnames[i] << endl;
            }
            writeIndex<index_t>(fout7, (index_t)_haplotypes.size(), this->toBe());
            for(index_t i = 0; i < _haplotypes.size(); i++) {
                _haplotypes[i].write(fout7, this->toBe());
            }
            EList<Repeat<index_t> >& repeats = _repeatdb.repeats();
            if(_repeat) {
                ifstream repeat_file(repeatfile.c_str(), ios::in);
                if(!repeat_file.is_open()) {
                    // Report the file that actually failed to open
                    cerr << "Error: could not open " << repeatfile.c_str() << endl;
                    throw 1;
                }
                if(parent_szs == NULL) {
                    throw 1;
                }
                if(parent_refnames == NULL) {
                    throw 1;
                }
                // Same layout as chr_szs, but for the parent reference
                EList<pair<index_t, index_t> > parent_chr_szs;
                index_t tmp_len = 0;
                for(index_t i = 0; i < parent_szs->size(); i++) {
                    if((*parent_szs)[i].first) {
                        parent_chr_szs.expand();
                        parent_chr_szs.back().first = tmp_len;
                        parent_chr_szs.back().second = i;
                    }
                    tmp_len += (index_t)(*parent_szs)[i].len;
                }
                index_t parent_jlen = joinedLen(*parent_szs);
                string prev_repName = "";
                // An empty allele name (end of file or failed read) ends
                // this loop, so testing eof() alone is sufficient here
                while(!repeat_file.eof()) {
                    // >rep1*0 rep 0 100 470 0
                    // 20_rep:26622650:+ 20_rep:26628088:+ 20_rep:26632508:+ 20_rep:26635636:+
                    // 20_rep:26669936:+ 20_rep:26672654:+ 20_rep:26675373:+ 20_rep:26678095:+
                    string repName, repAlleleName;
                    repeat_file >> repAlleleName;
                    if(repAlleleName.empty()) // Reached the end of file
                        break;
                    if(repAlleleName[0] != '>') {
                        cerr << "Error: the file format is not correct" << endl;
                        throw 1;
                    }
                    repAlleleName = repAlleleName.substr(1); // Remove '>'
                    index_t alleleID = 0;
                    size_t star_pos = repAlleleName.find('*');
                    if(star_pos >= repAlleleName.length()) {
                        repName = repAlleleName;
                    } else {
                        repName = repAlleleName.substr(0, star_pos);
                        string strID = repAlleleName.substr(star_pos + 1);
                        istringstream(strID) >> alleleID;
                    }
                    string refRepName;
                    index_t repPos, repLen;
                    repeat_file >> refRepName >> repPos >> repLen;
                    index_t rep_idx = 0;
                    for(; rep_idx < _refnames_nospace.size(); rep_idx++) {
                        if(refRepName == _refnames_nospace[rep_idx])
                            break;
                    }
                    if(rep_idx >= _refnames_nospace.size()) {
                        cerr << "Error: " << refRepName << " is not found in " << endl;
                        throw 1;
                    }
                    // Start a new repeat whenever the sequence or the
                    // repeat name changes
                    if(repeats.size() == 0 ||
                       repeats.back().repID != rep_idx ||
                       repeats.back().repName != repName) {
                        if(repeats.size() > 0) {
                            repeats.back().positions.sort();
                        }
                        repeats.expand();
                        repeats.back().init(repName, rep_idx, repPos, repLen);
                    }
                    // update repPos and repLen
                    if(repPos < repeats.back().repPos) {
                        repeats.back().repLen += (repeats.back().repPos - repPos);
                        repeats.back().repPos = repPos;
                    }
                    if(repPos + repLen > repeats.back().repPos + repeats.back().repLen) {
                        repeats.back().repLen = repPos + repLen - repeats.back().repPos;
                    }
                    size_t baseOff = 0;
                    if(repeats.size() > 1 && repeats[repeats.size() - 2].repID == rep_idx) {
                        baseOff = repeats[repeats.size() - 2].repPos + repeats[repeats.size() - 2].repLen;
                    }
                    index_t numCoords, numAlts;
                    repeat_file >> numCoords >> numAlts;
                    EList<index_t> snpIDs;
                    EList<string> snpStrIDs;
                    if(numAlts > 0) {
                        string snpStrID;
                        repeat_file >> snpStrID;
                        tokenize(snpStrID, ",", snpStrIDs);
                        if(snpStrIDs.size() != numAlts) {
                            // Report the parsed count (snpIDs is still
                            // empty here) and let the throw, not an
                            // assert, signal the input error
                            cerr << "Error: the number of SNPs (" << snpStrIDs.size() << ", " << snpStrID << ") does not equal to " << numAlts << endl;
                            throw 1;
                        }
                        for(index_t i = 0; i < snpStrIDs.size(); i++) {
                            if(snpID2num.find(snpStrIDs[i]) == snpID2num.end()) {
                                cerr << "Error: " << snpStrIDs[i] << " is not found" << endl;
                                throw 1;
                            }
                            index_t numID = snpID2num[snpStrIDs[i]];
                            snpIDs.push_back(numID);
                        }
                    }
                    EList<RepeatCoord<index_t> >& positions = repeats.back().positions;
                    size_t sofar_numCoords = positions.size();
                    while(positions.size() - sofar_numCoords < numCoords) {
                        string chr_pos;
                        repeat_file >> chr_pos;
                        size_t colon_pos = chr_pos.find(':');
                        // npos + 1 wraps around to 0, so a missing ':' has
                        // to be tested for explicitly
                        if(colon_pos == string::npos || colon_pos + 1 >= chr_pos.length()) {
                            cerr << "Error: : is not found in " << chr_pos << endl;
                            throw 1;
                        }
                        string chr = chr_pos.substr(0, colon_pos);
                        // Drop the trailing ":+" / ":-" strand suffix
                        string strPos = chr_pos.substr(colon_pos + 1, chr_pos.length() - colon_pos - 3);
                        bool repfw = (chr_pos[chr_pos.length() - 1] == '+');
                        index_t pos = 0;
                        istringstream(strPos) >> pos;
                        index_t chr_idx = 0;
                        for(; chr_idx < parent_refnames->size(); chr_idx++) {
                            if(chr == (*parent_refnames)[chr_idx])
                                break;
                        }
                        if(chr_idx >= parent_refnames->size()) {
                            cerr << "Error: " << chr << " is not found in " << endl;
                            throw 1;
                        }
                        assert_eq(parent_chr_szs.size(), parent_refnames->size());
                        assert_lt(chr_idx, parent_chr_szs.size());
                        positions.expand();
                        positions.back().tid = chr_idx;
                        positions.back().toff = pos;
                        positions.back().fw = repfw;
                        positions.back().alleleID = alleleID;
                        pair<index_t, index_t> tmp_pair = parent_chr_szs[chr_idx];
                        const index_t sofar_len = tmp_pair.first;
                        const index_t szs_idx = tmp_pair.second;
                        bool involve_Ns = false;
                        index_t add_pos = 0;
                        assert((*parent_szs)[szs_idx].first);
                        for(index_t i = szs_idx; i < parent_szs->size(); i++) {
                            if(i != szs_idx && (*parent_szs)[i].first) {
                                break;
                            }
                            if(pos < (*parent_szs)[i].off) {
                                involve_Ns = true;
                                break;
                            } else {
                                pos -= (*parent_szs)[i].off;
                                if(pos < (*parent_szs)[i].len) {
                                    break;
                                } else {
                                    pos -= (*parent_szs)[i].len;
                                    add_pos += (*parent_szs)[i].len;
                                }
                            }
                        }
                        if(involve_Ns) {
                            assert(false);
                            throw 1;
                        }
                        pos = sofar_len + add_pos + pos;
                        if(chr_idx + 1 < parent_chr_szs.size()) {
                            if(pos >= parent_chr_szs[chr_idx + 1].first) {
                                assert(false);
                                throw 1;
                            }
                        } else {
                            if(pos >= parent_jlen){
                                assert(false);
                                throw 1;
                            }
                        }
                        positions.back().joinedOff = pos;
                    }
                    repeats.back().alleles.expand();
                    assert_geq(repPos, baseOff);
                    repeats.back().alleles.back().init(repPos - baseOff, repLen);
                }
                if(repeats.size() > 0) {
                    repeats.back().positions.sort();
                }
                repeat_file.close();
                // Sum, per sequence, the end offset of its last repeat
                index_t total_repeat_len = 0;
                for(size_t r = 0; r + 1 < repeats.size(); r++) {
                    if(repeats[r].repID != repeats[r+1].repID) {
                        index_t repeat_len = repeats[r].repPos + repeats[r].repLen;
                        total_repeat_len += repeat_len;
                    }
                }
                // An empty repeat file leaves 'repeats' empty; back() on an
                // empty list must not be evaluated
                if(repeats.size() > 0) {
                    index_t repeat_len = repeats.back().repPos + repeats.back().repLen;
                    total_repeat_len += repeat_len;
                }
                if(total_repeat_len != s.length()) {
                    cerr << "Error: repeat length (" << total_repeat_len;
                    cerr << ") does not match sequence length (" << s.length() << ")" << endl;
                    throw 1;
                }
                // Per entry: (shortest, longest) repeat length seen
                _repeatLens.resizeExact(szs.size());
                for(size_t i = 0; i < _repeatLens.size(); i++) {
                    _repeatLens[i].first = numeric_limits<index_t>::max();
                    _repeatLens[i].second = 0;
                }
                for(size_t i = 0; i < repeats.size(); i++) {
                    index_t id = repeats[i].repID;
                    index_t len = repeats[i].repLen;
                    assert_lt(id, _repeatLens.size());
                    if(_repeatLens[id].first > len) {
                        _repeatLens[id].first = len;
                    }
                    if(_repeatLens[id].second < len) {
                        _repeatLens[id].second = len;
                    }
                }
                writeIndex<index_t>(fout7, _repeatLens.size(), this->toBe());
                for(size_t i = 0; i < _repeatLens.size(); i++) {
                    writeIndex<index_t>(fout7, _repeatLens[i].first, this->toBe());
                    writeIndex<index_t>(fout7, _repeatLens[i].second, this->toBe());
                }
                _repeatdb.write(fout7, this->toBe());
                writeIndex<index_t>(fout7, chr_szs.size(), this->toBe()); // number of repeat indexes
                EList<string> seqs;
                EList<streampos> tableFilePos;
                // Reserve one 64-bit slot per table; the slots are filled
                // in below once the table end positions are known
                streampos filepos = fout7.tellp();
                for(size_t i = 0; i < chr_szs.size(); i++) {
                    writeIndex<uint64_t>(fout7, 0, this->toBe());
                }
                for(size_t i = 0; i < repeats.size(); i++) {
                    const Repeat<index_t>& repeat = repeats[i];
                    assert_lt(repeat.repID, chr_szs.size());
                    index_t template_len = 0;
                    if(repeat.repID + 1 < chr_szs.size()) {
                        template_len = chr_szs[repeat.repID + 1].first - chr_szs[repeat.repID].first;
                    } else {
                        template_len = s.length() - chr_szs[repeat.repID].first;
                    }
                    assert_leq(repeat.repPos + repeat.repLen, template_len);
                    index_t pos = chr_szs[repeat.repID].first + repeat.repPos;
                    assert_leq(pos + repeat.repLen, s.length());
                    seqs.expand();
                    seqs.back().clear();
                    for(index_t j = 0; j < repeat.repLen; j++) {
                        int c = s[pos + j];
                        assert_range(0, 3, c);
                        seqs.back().push_back("ACGT"[c]);
                    }
                    // Last repeat of this sequence: emit its k-mer table
                    if(i + 1 == repeats.size() || repeats[i].repID != repeats[i+1].repID) {
                        const size_t w = RB_Minimizer<string>::default_w, k = RB_Minimizer<string>::default_k;
                        RB_KmerTable kmer_table;
                        kmer_table.build(seqs, w, k);
                        kmer_table.write(fout7, this->toBe());
                        seqs.clear();
                        tableFilePos.push_back(fout7.tellp());
                    }
                }
                assert_eq(tableFilePos.size(), chr_szs.size());
                streampos origpos = fout7.tellp();
                fout7.seekp(filepos);
                for(size_t i = 0; i < tableFilePos.size(); i++) {
                    writeIndex<uint64_t>(fout7, tableFilePos[i], this->toBe());
                }
                fout7.seekp(origpos);
            }
            fout7.close();
            fout8.close();
        }
        // Joined reference sequence now in 's'
    } catch(bad_alloc& e) {
        // If we throw an allocation exception in the try block,
        // that means that the joined version of the reference
        // string itself is too larger to fit in memory.  The only
        // alternatives are to tell the user to give us more memory
        // or to try again with a packed representation of the
        // reference (if we haven't tried that already).
        cerr << "Could not allocate space for a joined string of " << jlen << " elements." << endl;
        if(!isPacked() && _passMemExc) {
            // Pass the exception up so that we can retry using a
            // packed string representation (rethrow the original
            // object instead of a copy)
            throw;
        }
        // There's no point passing this exception on.  The fact
        // that we couldn't allocate the joined string means that
        // --bmax is irrelevant - the user should re-run with
        // ebwt-build-packed
        if(isPacked()) {
            cerr << "Please try running bowtie-build on a computer with more memory." << endl;
        } else {
            cerr << "Please try running bowtie-build in packed mode (-p/--packed) or in automatic" << endl
                 << "mode (-a/--auto), or try again on a computer with more memory." << endl;
        }
        if(sizeof(void*) == 4) {
            cerr << "If this computer has more than 4 GB of memory, try using a 64-bit executable;" << endl
                 << "this executable is 32-bit." << endl;
        }
        throw 1;
    }
    // Succesfully obtained joined reference string
    assert_geq(s.length(), jlen);
    if(bmax != (index_t)OFF_MASK) {
        // VMSG_NL("bmax according to bmax setting: " << bmax);
    }
    else if(bmaxSqrtMult != (index_t)OFF_MASK) {
        // NOTE(review): bmax still holds OFF_MASK on this path, so the
        // product wraps around - confirm whether sqrt(len) was meant to
        // be assigned first
        bmax *= bmaxSqrtMult;
        // VMSG_NL("bmax according to bmaxSqrtMult setting: " << bmax);
    }
    else if(bmaxDivN != (index_t)OFF_MASK) {
        // Compute in index_t so the quotient is not truncated to 32 bits
        // when index_t is wider
        bmax = max<index_t>(jlen / (bmaxDivN * _nthreads), (index_t)1);
        // VMSG_NL("bmax according to bmaxDivN setting: " << bmax);
    }
    else {
        bmax = (uint32_t)sqrt(s.length());
        // VMSG_NL("bmax defaulted to: " << bmax);
    }
    int iter = 0;
    bool first = true;
    // Remember where the index body starts so a retry can overwrite it
    streampos out1pos = out1.tellp();
    streampos out2pos = out2.tellp();
    if(!_repeat) {
        // Look for bmax/dcv parameters that work.
        while(true) {
            if(!first && bmax < 40 && _passMemExc) {
                cerr << "Could not find approrpiate bmax/dcv settings for building this index." << endl;
                if(!isPacked()) {
                    // Throw an exception exception so that we can
                    // retry using a packed string representation
                    throw bad_alloc();
                } else {
                    cerr << "Already tried a packed string representation." << endl;
                }
                cerr << "Please try indexing this reference on a computer with more memory." << endl;
                if(sizeof(void*) == 4) {
                    cerr << "If this computer has more than 4 GB of memory, try using a 64-bit executable;" << endl
                         << "this executable is 32-bit." << endl;
                }
                throw 1;
            }
            if(!first) {
                out1.seekp(out1pos);
                out2.seekp(out2pos);
            }
            if(dcv > 4096) dcv = 4096;
            if((iter % 6) == 5 && dcv < 4096 && dcv != 0) {
                dcv <<= 1; // double difference-cover period
            } else {
                bmax -= (bmax >> 2); // reduce by 25%
            }
            iter++;
            try {
                if(_alts.empty()) {
                    VMSG("Using parameters --bmax " << bmax);
                    if(dcv == 0) {
                        VMSG_NL(" and *no difference cover*");
                    } else {
                        VMSG_NL(" --dcv " << dcv);
                    }
                    {
                        VMSG_NL(" Doing ahead-of-time memory usage test");
                        // Make a quick-and-dirty attempt to force a bad_alloc iff
                        // we would have thrown one eventually as part of
                        // constructing the DifferenceCoverSample
                        dcv <<= 1;
                        index_t sz = (index_t)DifferenceCoverSample<TStr>::simulateAllocs(s, dcv >> 1);
                        if(_nthreads > 1) sz *= (_nthreads + 1);
                        AutoArray<uint8_t> tmp(sz, EBWT_CAT);
                        dcv >>= 1;
                        // Likewise with the KarkkainenBlockwiseSA
                        sz = (index_t)KarkkainenBlockwiseSA<TStr>::simulateAllocs(s, bmax);
                        AutoArray<uint8_t> tmp2(sz, EBWT_CAT);
                        // Now throw in the 'ftab' and 'isaSample' structures
                        // that we'll eventually allocate in buildToDisk
                        AutoArray<index_t> ftab(_gh._ftabLen * 2, EBWT_CAT);
                        AutoArray<uint8_t> side(_gh._sideSz, EBWT_CAT);
                        // Grab another 20 MB out of caution
                        AutoArray<uint32_t> extra(20*1024*1024, EBWT_CAT);
                        // If we made it here without throwing bad_alloc, then we
                        // passed the memory-usage stress test
                        VMSG(" Passed! Constructing with these parameters: --bmax " << bmax << " --dcv " << dcv);
                        if(isPacked()) {
                            VMSG(" --packed");
                        }
                        VMSG_NL("");
                    }
                    VMSG_NL("Constructing suffix-array element generator");
                    KarkkainenBlockwiseSA<TStr> bsa(s, bmax, _nthreads, dcv, seed, _sanity, _passMemExc, _verbose, outfile);
                    assert(bsa.suffixItrIsReset());
                    assert_eq(bsa.size(), s.length()+1);
                    VMSG_NL("Converting suffix-array elements to index image");
                    buildToDisk(bsa, s, out1, out2);
                } else {
                    // The graphs are heap-allocated; release them on every
                    // exit path (early return, exception, normal completion)
                    // so that a bad_alloc retry does not start with the
                    // previous attempt's graphs still allocated.
                    RefGraph<index_t>* graph = NULL;
                    PathGraph<index_t>* pg = NULL;
                    try {
                        graph = new RefGraph<index_t>(
                                                      s,
                                                      szs,
                                                      _alts,
                                                      _haplotypes,
                                                      outfile,
                                                      _nthreads,
                                                      verbose);
                        pg = new PathGraph<index_t>(
                                                    *graph,
                                                    outfile,
                                                    std::numeric_limits<index_t>::max(),
                                                    _nthreads,
                                                    verbose);
                        if(verbose) { cerr << "Generating edges... " << endl; }
                        if(!pg->generateEdges(*graph)) {
                            delete pg; pg = NULL;
                            delete graph; graph = NULL;
                            return;
                        }
                        // Re-initialize GFM parameters to reflect real number of edges (gbwt string)
                        _gh.init(
                                 _gh.len(),
                                 pg->getNumEdges(),
                                 pg->getNumNodes(),
                                 _gh.lineRate(),
                                 _gh.offRate(),
                                 _gh.ftabChars(),
                                 0,
                                 _gh.entireReverse());
                        buildToDisk(*pg, s, out1, out2);
                    } catch(...) {
                        delete pg; pg = NULL;
                        delete graph; graph = NULL;
                        throw;
                    }
                    delete pg; pg = NULL;
                    delete graph; graph = NULL;
                }
                out1.flush(); out2.flush();
                if(out1.fail() || out2.fail()) {
                    cerr << "An error occurred writing the index to disk.  Please check if the disk is full." << endl;
                    throw 1;
                }
                break;
            } catch(bad_alloc& e) {
                if(_passMemExc) {
                    VMSG_NL(" Ran out of memory; automatically trying more memory-economical parameters.");
                } else {
                    cerr << "Out of memory while constructing suffix array.  Please try using a smaller" << endl
                         << "number of blocks by specifying a smaller --bmax or a larger --bmaxdivn" << endl;
                    throw 1;
                }
            }
            first = false;
        }
        assert(repOk());
        // Now write reference sequence names on the end
        assert_eq(this->_refnames.size(), this->_nPat);
        for(index_t i = 0; i < this->_refnames.size(); i++) {
            out1 << this->_refnames[i].c_str() << endl;
        }
        out1 << '\0';
        out1.flush(); out2.flush();
        if(out1.fail() || out2.fail()) {
            cerr << "An error occurred writing the index to disk.  Please check if the disk is full." << endl;
            throw 1;
        }
    }
    VMSG_NL("Returning from initFromVector");
}
/**
 * Return the length that the joined string of the given string
 * list will have.  Note that this is indifferent to how the text
 * fragments correspond to input sequences - it just cares about
 * the lengths of the fragments.
 *
 * The list is only read, so it is taken by const reference and the
 * method itself is const.
 *
 * @param szs fragment records whose 'len' fields are summed
 * @return the sum of all fragment lengths, as index_t
 */
index_t joinedLen(const EList<RefRecord>& szs) const {
    index_t total = 0;
    // size_t index: never narrower than the list's size type
    for(size_t i = 0; i < szs.size(); i++) {
        total += (index_t)szs[i].len;
    }
    return total;
}
/// Destruct an Ebwt
// Resets the wrapped index arrays, then closes the two input FILE
// handles (_in1, _in2) if they are non-NULL.
// NOTE(review): offs() and gfm() are evaluated *after* _offs.reset() and
// _gfm.reset(). If reset() nulls the wrapped pointer (presumably - confirm
// against the wrapper type) the two FREE_SHARED branches below can never
// execute.
~GFM() {
_fchr.reset();
_ftab.reset();
_eftab.reset();
_plen.reset();
_rstarts.reset();
_offs.reset();
_gfm.reset();
// Shared-memory release, only attempted when useShmem_ is set
if(offs() != NULL && useShmem_) {
FREE_SHARED(offs());
}
if(gfm() != NULL && useShmem_) {
FREE_SHARED(gfm());
}
if (_in1 != NULL) fclose(_in1);
if (_in2 != NULL) fclose(_in2);
}
/// Accessors
// Index parameter block (_gh), read-only.
inline const GFMParams<index_t>& gh() const { return _gh; }
// Size of _zOffs, followed by element accessors for _zOffs,
// _zGbwtByteOffs and _zGbwtBpOffs; each asserts that i is in range of
// the list it reads.
index_t numZOffs() const { return _zOffs.size(); }
index_t zOff(index_t i) const { assert_lt(i, _zOffs.size()); return _zOffs[i]; }
index_t zGbwtByteOff(index_t i) const { assert_lt(i, _zGbwtByteOffs.size()); return _zGbwtByteOffs[i]; }
int zGbwtBpOff(index_t i) const { assert_lt(i, _zGbwtBpOffs.size()); return _zGbwtBpOffs[i]; }
// Values of _nPat and _nFrag.
index_t nPat() const { return _nPat; }
index_t nFrag() const { return _nFrag; }
// Raw pointers obtained from the wrapper members via get(); mutable
// overloads first, const overloads below.
inline index_t* fchr() { return _fchr.get(); }
inline index_t* ftab() { return _ftab.get(); }
inline index_t* eftab() { return _eftab.get(); }
inline index_t* offs() { return _offs.get(); }
inline index_t* plen() { return _plen.get(); }
inline index_t* rstarts() { return _rstarts.get(); }
inline uint8_t* gfm() { return _gfm.get(); }
inline const index_t* fchr() const { return _fchr.get(); }
inline const index_t* ftab() const { return _ftab.get(); }
inline const index_t* eftab() const { return _eftab.get(); }
inline const index_t* offs() const { return _offs.get(); }
inline const index_t* plen() const { return _plen.get(); }
inline const index_t* rstarts() const { return _rstarts.get(); }
inline const uint8_t* gfm() const { return _gfm.get(); }
// Loaded alternatives and their names (same order in both lists, as
// maintained by initFromVector).
inline const EList<ALT<index_t> >& alts() const { return _alts; }
inline const EList<string>& altnames() const { return _altnames; }
// Plain flag getters.
bool toBe() const { return _toBigEndian; }
bool verbose() const { return _verbose; }
bool sanityCheck() const { return _sanity; }
// Reference names; returned by non-const reference, so callers may modify them.
EList<string>& refnames() { return _refnames; }
bool fw() const { return fw_; }
bool repeat() const { return _repeat; }
const EList<uint8_t>& getRepeatIncluded() const { return _repeatIncluded; }
#ifdef POPCNT_CAPABILITY
// Public data member, present only in POPCNT_CAPABILITY builds.
// Presumably selects the hardware popcount code path - confirm at the use sites.
bool _usePOPCNTinstruction;
#endif
/**
* Returns true iff the index contains the given string (exactly). The
* given string must contain only unambiguous characters. TODO:
* support skipping of ambiguous characters.
*/
bool contains(
const BTDnaString& str,
index_t *top = NULL,
index_t *bot = NULL) const;
/**
* Returns true iff the index contains the given string (exactly). The
* given string must contain only unambiguous characters. TODO:
* support skipping of ambiguous characters.
*/
bool contains(
const char *str,
index_t *top = NULL,
index_t *bot = NULL) const
{
return contains(BTDnaString(str, true), top, bot);
}
/// Return true iff the index is currently in memory, i.e. gfm() is non-NULL
bool isInMemory() const {
	if(gfm() == NULL) {
		// Not resident: none of the other arrays nor the z-offset lists
		// may be populated either.
		assert(ftab() == NULL);
		assert(eftab() == NULL);
		assert(fchr() == NULL);
		assert(offs() == NULL);
		assert(rstarts() == NULL);
		assert_eq(_zOffs.size(), 0);
		assert_eq(_zGbwtByteOffs.size(), 0);
		assert_eq(_zGbwtBpOffs.size(), 0);
		return false;
	}
	// Resident. Loading _offs, _ftab, _eftab and _rstarts might have been
	// skipped depending on whether this is the reverse index and what
	// algorithm is being used, so only the parameters and fchr are checked.
	assert(_gh.repOk());
	assert(fchr() != NULL);
	return true;
}
/// Return true iff the index is not currently held in memory
bool isEvicted() const {
	const bool resident = isInMemory();
	return !resident;
}
/**
* Load this Ebwt into memory by reading it in from the _in1 and
* _in2 streams.
*/
void loadIntoMemory(
int needEntireReverse,
bool loadSASamp,
bool loadFtab,
bool loadRstarts,
bool loadNames,
bool verbose)
{
readIntoMemory(
needEntireReverse, // require reverse index to be concatenated reference reversed
loadSASamp, // load the SA sample portion?
loadFtab, // load the ftab (_ftab[] and _eftab[])?
loadRstarts, // load the r-starts (_rstarts[])?
false, // stop after loading the header portion?
NULL, // params
false, // mmSweep
loadNames, // loadNames
verbose); // startVerbose
}
/**
 * Free the in-memory index arrays and clear the z-offset lists. The index
 * must currently be in memory (checked by assertion only). plen is
 * deliberately left untouched.
 */
void evictFromMemory() {
assert(isInMemory());
_fchr.free();
_ftab.free();
_eftab.free();
_rstarts.free();
_offs.free(); // might not be under control of APtrWrap
_gfm.free(); // might not be under control of APtrWrap
// Keep plen; it's small and the client may want to seq it
// even when the others are evicted.
//_plen = NULL;
// With the z-offset lists empty, isInMemory()'s "evicted" assertions hold.
_zOffs.clear();
_zGbwtByteOffs.clear();
_zGbwtBpOffs.clear();
}
/**
 * Pack the 'ftabChars' characters of 'seq' that start at offset 'off'
 * into an integer usable as an index into the ftab array, two bits per
 * character. Returns std::numeric_limits<index_t>::max() as soon as a
 * character code greater than 3 is met.
 */
index_t ftabSeqToInt(
	const BTDnaString& seq,
	index_t off,
	bool rev) const
{
	const int nchars = _gh._ftabChars;
	const index_t first = off;
	const index_t last = first + nchars;
	assert_leq(last, seq.length());
	// Characters are added in the order a normal search would consume
	// them: right-to-left for BWT, left-to-right for BWT'. 'rev' flips
	// the direction implied by fw(); the choice is the same for every
	// character, so it is made once.
	const bool leftToRight = (fw() != rev);
	index_t packed = 0;
	for(int k = 0; k < nchars; k++) {
		int c = (leftToRight ? seq[first + k] : seq[last - k - 1]);
		if(c > 3) {
			return std::numeric_limits<index_t>::max();
		}
		assert_range(0, 3, c);
		packed <<= 2;
		packed |= c;
	}
	return packed;
}
/**
 * Non-static facade for the static ftabHi(): supplies this index's own
 * ftab/eftab arrays and lengths. The length used to tell regular entries
 * from extended ones is _len for a linear FM index and _gbwtLen otherwise.
 */
index_t ftabHi(index_t i) const {
	const index_t bwtLen = _gh.linearFM() ? _gh._len : _gh._gbwtLen;
	return GFM<index_t>::ftabHi(
		ftab(),
		eftab(),
		bwtLen,
		_gh._ftabLen,
		_gh._eftabLen,
		i);
}
/**
 * Get the "high interpretation" of ftab entry i. A regular entry (one
 * that is <= gbwtLen) is its own high interpretation. Any larger entry is
 * an extended entry: XOR-ing it with INDEX_MAX gives an eftab pair index,
 * and the high interpretation is the second element of that pair.
 *
 * It's a static member because it's convenient to ask this question
 * before the index is fully initialized.
 */
static index_t ftabHi(
	const index_t *ftab,
	const index_t *eftab,
	index_t gbwtLen,
	index_t ftabLen,
	index_t eftabLen,
	index_t i)
{
	assert_lt(i, ftabLen);
	const index_t entry = ftab[i];
	if(entry > gbwtLen) {
		// Extended entry: look at the second slot of its eftab pair
		const index_t efIdx = entry ^ (index_t)INDEX_MAX;
		assert_lt(efIdx*2+1, eftabLen);
		return eftab[efIdx*2+1];
	}
	return entry;
}
/**
 * Non-static facade for the static ftabLo(): supplies this index's own
 * ftab/eftab arrays and lengths. The length used to tell regular entries
 * from extended ones is _len for a linear FM index and _gbwtLen otherwise.
 */
index_t ftabLo(index_t i) const {
	const index_t bwtLen = _gh.linearFM() ? _gh._len : _gh._gbwtLen;
	return GFM<index_t>::ftabLo(
		ftab(),
		eftab(),
		bwtLen,
		_gh._ftabLen,
		_gh._eftabLen,
		i);
}
/**
 * Get the low bound of the ftab range for the ftabChars characters of
 * 'seq' starting at 'off' (never reversed). Note that ftabSeqToInt()
 * yields its maximum-value sentinel for character codes above 3, and that
 * sentinel is passed straight on as an ftab index here.
 */
index_t ftabLo(const BTDnaString& seq, index_t off) const {
	const index_t key = ftabSeqToInt(seq, off, false);
	return ftabLo(key);
}
/**
 * Get the high bound of the ftab range for the ftabChars characters of
 * 'seq' starting at 'off' (never reversed). Note that ftabSeqToInt()
 * yields its maximum-value sentinel for character codes above 3, and that
 * sentinel is passed straight on as an ftab index here.
 */
index_t ftabHi(const BTDnaString& seq, index_t off) const {
	const index_t key = ftabSeqToInt(seq, off, false);
	return ftabHi(key);
}
/**
* Extract characters from seq starting at offset 'off' and going either
* forward or backward, depending on 'rev'. Order matters when compiling
* the integer that gets looked up in the ftab. Each successive character
* is ORed into the least significant bit-pair, and characters are
* integrated in the direction of the search.
*/
bool
ftabLoHi(
const BTDnaString& seq, // sequence to extract from
index_t off, // offset into seq to begin extracting
bool rev, // reverse while extracting
index_t& top,
index_t& bot) const
{
index_t fi = ftabSeqToInt(seq, off, rev);
if(fi == std::numeric_limits<index_t>::max()) {
return false;
}
top = ftabHi(fi);
bot = ftabLo(fi+1);
assert_geq(bot, top);
return true;
}
/**
 * Get the "low interpretation" of ftab entry i. A regular entry (one
 * that is <= gbwtLen) is its own low interpretation. Any larger entry is
 * an extended entry: XOR-ing it with INDEX_MAX gives an eftab pair index,
 * and the low interpretation is the first element of that pair.
 *
 * It's a static member because it's convenient to ask this question
 * before the index is fully initialized.
 */
static index_t ftabLo(
	const index_t *ftab,
	const index_t *eftab,
	index_t gbwtLen,
	index_t ftabLen,
	index_t eftabLen,
	index_t i)
{
	assert_lt(i, ftabLen);
	const index_t entry = ftab[i];
	if(entry > gbwtLen) {
		// Extended entry: look at the first slot of its eftab pair
		const index_t efIdx = entry ^ (index_t)INDEX_MAX;
		assert_lt(efIdx*2+1, eftabLen);
		return eftab[efIdx*2];
	}
	return entry;
}
/**
 * Try to resolve the reference offset of the BW element 'elt' whose node
 * is 'node'. Returns 0 if 'elt' equals one of the recorded z-offsets,
 * the sampled offset if 'node' is unchanged by masking with _offMask,
 * and INDEX_MAX if the offset cannot be resolved immediately.
 */
index_t tryOffset(index_t elt, index_t node) const {
	assert(offs() != NULL);
	for(index_t z = 0; z < _zOffs.size(); z++) {
		if(_zOffs[z] == elt) return 0;
	}
	if((node & _gh._offMask) != node) {
		// No sample stored for this node
		return (index_t)INDEX_MAX;
	}
	const index_t slot = node >> _gh._offRate;
	assert_lt(slot, _gh._offsLen);
	return offs()[slot];
}
/**
 * Try to resolve the reference offset of the BW element 'elt' such
 * that the offset returned is at the right-hand side of the
 * forward reference substring involved in the hit.
 *
 * Returns INDEX_MAX unchanged when the underlying lookup cannot resolve
 * the offset. When 'fw' is false, a resolved offset is mirrored against
 * _gh._len and moved back by hitlen-1.
 *
 * NOTE(review): this calls tryOffset() with a single argument, but the
 * only other overload visible in this part of the file takes
 * (elt, node) with no default. Confirm that a matching overload exists;
 * otherwise this member cannot compile once it is instantiated.
 */
index_t tryOffset(
index_t elt,
bool fw,
index_t hitlen) const
{
index_t off = tryOffset(elt);
if(off != (index_t)INDEX_MAX && !fw) {
assert_lt(off, _gh._len);
// Mirror the offset against the total length ...
off = _gh._len - off - 1;
assert_geq(off, hitlen-1);
// ... then step back over the rest of the hit.
off -= (hitlen-1);
assert_lt(off, _gh._len);
}
return off;
}
/**
* Walk 'steps' steps to the left and return the row arrived at.
*/
index_t walkLeft(index_t row, index_t steps) const;
/**
* Resolve the reference offset of the BW element 'elt'.
*/
index_t getOffset(index_t row, index_t node = 0) const;
/**
* Resolve the reference offset of the BW element 'elt' such that
* the offset returned is at the right-hand side of the forward
* reference substring involved in the hit.
*/
index_t getOffset(
index_t elt,
bool fw,
index_t hitlen) const;
/**
* When using read() to create an Ebwt, we have to set a couple of
* additional fields in the Ebwt object that aren't part of the
* parameter list and are not stored explicitly in the file. Right
* now, this just involves initializing _zEbwtByteOff and
* _zEbwtBpOff from _zOff.
*/
void postReadInit(const GFMParams<index_t>& gh) {
_zGbwtByteOffs.resizeExact(_zOffs.size());
_zGbwtBpOffs.resizeExact(_zOffs.size());
for(index_t i = 0; i < _zOffs.size(); i++) {
index_t sideNum = _zOffs[i] / gh._sideGbwtLen;
index_t sideCharOff = _zOffs[i] % gh._sideGbwtLen;
index_t sideByteOff = sideNum * gh._sideSz;
_zGbwtByteOffs[i] = sideCharOff >> 2;
assert_lt(_zGbwtByteOffs[i], gh._sideGbwtSz);
_zGbwtBpOffs[i] = sideCharOff & 3;
assert_lt(_zGbwtBpOffs[i], 4);
_zGbwtByteOffs[i] += sideByteOff;
}
assert(repOk(gh)); // Ebwt should be fully initialized now
}
/**
* Given basename of an Ebwt index, read and return its flag.
*/
static int32_t readVersionFlags(const string& instr, int& major, int& minor, string& extra_version);
/**
 * Parse the HISAT2_VERSION string, expected to look like
 * "<n>.<major>.<minor>-<extra>". The second and third numeric fields are
 * stored in major_version and minor_version; whatever follows the dash
 * (up to the first whitespace) is stored in extra_version, which becomes
 * empty when there is no such suffix. An output whose field is missing
 * from the string is left as the caller passed it in.
 */
static void readProgramVersion(int& major_version, int& minor_version, string& extra_version) {
	char extra[256] = {0,};
	int second_version = 0;
	// "%255s" keeps the suffix within 'extra' (255 characters plus the
	// terminating NUL); a bare "%s" would write past the buffer for an
	// overlong suffix.
	sscanf(HISAT2_VERSION, "%d.%d.%d-%255s",
		&second_version,
		&major_version,
		&minor_version,
		extra);
	extra_version = extra;
}
/**
 * Unpack an index version word: bits 16-23 hold the major version, bits
 * 8-15 the minor version, and the low byte a pre-release tag (1 = "alpha",
 * 2 = "beta", anything else = no tag).
 */
static void readIndexVersion(int index_version, int& major_version, int& minor_version, string& extra_version) {
	major_version = (index_version >> 16) & 0xff;
	minor_version = (index_version >> 8) & 0xff;
	switch(index_version & 0xff) {
		case 1:
			extra_version = "alpha";
			break;
		case 2:
			extra_version = "beta";
			break;
		default:
			extra_version = "";
			break;
	}
}
/**
 * Build the index version word for this program: a leading byte of 2
 * (HISAT2), then the major and minor version bytes obtained from
 * readProgramVersion(), then a low byte tagging "alpha" (1) or "beta" (2)
 * builds (0 otherwise).
 */
static int getIndexVersion() {
	int major_version = 0, minor_version = 0;
	string extra_version;
	readProgramVersion(major_version, minor_version, extra_version);
	int tag = 0;
	if(extra_version == "alpha") {
		tag = 0x1;
	} else if(extra_version == "beta") {
		tag = 0x2;
	}
	const int generation = 2; // HISAT2
	return (generation << 24)
	     | ((major_version & 0xff) << 16)
	     | ((minor_version & 0xff) << 8)
	     | tag;
}
/**
* Pretty-print the Ebwt to the given output stream.
*/
void print(ostream& out) const {
print(out, _gh);
}
/**
 * Print the given index parameters to the given output stream.
 *
 * Only the parameter block is written. A per-array dump of the index
 * state (z-offsets, plen, rstarts, fchr, ftab, eftab, offs) used to follow
 * here, but it sat behind an unconditional return and could never run;
 * that unreachable code has been dropped, so the output is unchanged.
 */
void print(ostream& out, const GFMParams<index_t>& gh) const {
	gh.print(out); // print params
}
// Building
template <typename TStr> static TStr join(EList<TStr>& l, uint32_t seed);
template <typename TStr> static void join(EList<FileBuf*>& l, EList<RefRecord>& szs, index_t sztot, const RefReadInParams& refparams, uint32_t seed, TStr& s, bool include_rc = false, bool CGtoTG = false);
template <typename TStr> void joinToDisk(EList<FileBuf*>& l, EList<RefRecord>& szs, index_t sztot, const RefReadInParams& refparams, TStr& ret, ostream& out1, ostream& out2);
template <typename TStr> void buildToDisk(PathGraph<index_t>& gbwt, const TStr& s, ostream& out1, ostream& out2, streampos headerPos = -1);
template <typename TStr> void buildToDisk(InorderBlockwiseSA<TStr>& sa, const TStr& s, ostream& out1, ostream& out2, streampos headerPos = -1);
// I/O
void readIntoMemory(int needEntireRev, bool loadSASamp, bool loadFtab, bool loadRstarts, bool justHeader, GFMParams<index_t> *params, bool mmSweep, bool loadNames, bool startVerbose, bool subIndex = false);
void writeFromMemory(bool justHeader, ostream& out1, ostream& out2) const;
void writeFromMemory(bool justHeader, const string& out1, const string& out2) const;
// Sanity checking
void sanityCheckUpToSide(int upToSide) const;
void sanityCheckAll(int reverse) const;
void restore(SString<char>& s) const;
void checkOrigs(const EList<SString<char> >& os, bool mirror) const;
// Searching and reporting
bool joinedToTextOff(index_t qlen, index_t off, index_t& tidx, index_t& textoff, index_t& tlen, bool rejectStraddle, bool& straddled) const;
bool textOffToJoined(index_t tid, index_t tlen, index_t& off) const;
// Debug-assertion helpers for arrays of four per-nucleotide counts
// (x[0]..x[3]). Each macro expands to several assert_leq statements in a
// row, so use it only as a stand-alone statement, never as the body of an
// unbraced if/else/loop.
//
// WITHIN_BWT_LEN: every count is at most the number of characters in one
// side (_sideGbwtLen).
#define WITHIN_BWT_LEN(x) \
assert_leq(x[0], this->_gh._sideGbwtLen); \
assert_leq(x[1], this->_gh._sideGbwtLen); \
assert_leq(x[2], this->_gh._sideGbwtLen); \
assert_leq(x[3], this->_gh._sideGbwtLen)
// WITHIN_FCHR: the count for nucleotide k does not exceed fchr()[k+1].
#define WITHIN_FCHR(x) \
assert_leq(x[0], this->fchr()[1]); \
assert_leq(x[1], this->fchr()[2]); \
assert_leq(x[2], this->fchr()[3]); \
assert_leq(x[3], this->fchr()[4])
// WITHIN_FCHR_DOLLARA: like WITHIN_FCHR, but the first count is allowed to
// be one larger (presumably for a '$' tallied as an 'A' -- see the callers).
#define WITHIN_FCHR_DOLLARA(x) \
assert_leq(x[0], this->fchr()[1]+1); \
assert_leq(x[1], this->fchr()[2]); \
assert_leq(x[2], this->fchr()[3]); \
assert_leq(x[3], this->fchr()[4])
/**
 * Count all occurrences of character c from the beginning of the side
 * that 'l' points into up to <by,bp>, then add the running count for c
 * stored with the side and fchr()[c].
 *
 * The stored running counts are read as four index_t values located
 * right after the side's _sideGbwtSz payload bytes; when the index is not
 * a linear FM index, two further index_t values sit in between and are
 * skipped.
 *
 * Because '$' is stored as an 'A', the count for c == 0 is reduced by one
 * for every recorded z-offset that lies in this side before the locus.
 */
inline index_t countBt2Side(const SideLocus<index_t>& l, int c) const {
assert_range(0, 3, c);
assert_range(0, (int)this->_gh._sideGbwtSz-1, (int)l._by);
assert_range(0, 3, (int)l._bp);
const uint8_t *side = l.side(this->gfm());
// Occurrences of c within this side, before the locus
index_t cCnt = countUpTo(l, c);
assert_leq(cCnt, l.toBWRow(_gh));
assert_leq(cCnt, this->_gh._sideGbwtLen);
assert_eq(_zGbwtByteOffs.size(), _zGbwtBpOffs.size());
for(index_t i = 0; i < _zGbwtByteOffs.size(); i++) {
index_t zGbwtByteOff = _zGbwtByteOffs[i];
// Only relevant for 'A', and only if this '$' lies between the start of
// the side and the locus byte
if(c == 0 && l._sideByteOff <= zGbwtByteOff && l._sideByteOff + l._by >= zGbwtByteOff) {
// Adjust for the fact that we represented $ with an 'A', but
// shouldn't count it as an 'A' here
int zGbwtBpOff = _zGbwtBpOffs[i];
// Strictly before the locus: an earlier byte, or the same byte and an
// earlier bit-pair
if((l._sideByteOff + l._by > zGbwtByteOff) ||
(l._sideByteOff + l._by == zGbwtByteOff && l._bp > zGbwtBpOff))
{
cCnt--; // Adjust for '$' looking like an 'A'
}
}
}
index_t ret;
// Now factor in the occ[] count at the side break
const uint8_t *acgt8 = side + _gh._sideGbwtSz;
if(!_gh._linearFM) {
// Skip the two index_t values that precede the A/C/G/T counts
acgt8 += (sizeof(index_t) << 1);
}
const index_t *acgt = reinterpret_cast<const index_t*>(acgt8);
assert_leq(acgt[0], this->_gh._numSides * this->_gh._sideGbwtLen); // b/c it's used as padding
assert_lt(acgt[1], this->_gh._gbwtLen);
assert_lt(acgt[2], this->_gh._gbwtLen);
assert_lt(acgt[3], this->_gh._gbwtLen);
ret = acgt[c] + cCnt + this->fchr()[c];
#ifndef NDEBUG
assert_leq(ret, this->fchr()[c+1]); // can't have jumped into next char's section
if(c == 0) {
assert_leq(cCnt, this->_gh._sideGbwtLen);
} else {
assert_leq(ret, this->_gh._gbwtLen);
}
#endif
return ret;
}
/**
 * Count all occurrences of all four nucleotides up to the starting
 * point given by 'l', storing the result in 'cntsUpto', then count
 * nucleotide occurrences within the range of length 'num', storing the
 * result in 'cntsIn'. Also, keep track of the characters occurring within
 * the range by setting 'masks' accordingly (masks[1][10] == true -> 11th
 * character is a 'C', and masks[0][10] == masks[2][10] == masks[3][10]
 * == false).
 *
 * 'masks' must point at four lists; each is resized to 'num' here.
 * 'cntsIn' is only added to (see countBt2SideRange2), so the caller is
 * expected to pass it zeroed -- the final sum assertion relies on that.
 */
inline void countBt2SideRange(
const SideLocus<index_t>& l, // top locus
index_t num, // number of elts in range to tally
index_t* cntsUpto, // A/C/G/T counts up to top
index_t* cntsIn, // A/C/G/T counts within range
EList<bool> *masks) const // masks indicating which range elts = A/C/G/T
{
assert_gt(num, 0);
assert_range(0, (int)this->_gh._sideGbwtSz-1, (int)l._by);
assert_range(0, 3, (int)l._bp);
// Per-nucleotide counts within this side, before the locus
countUpToEx(l, cntsUpto);
WITHIN_FCHR_DOLLARA(cntsUpto);
WITHIN_BWT_LEN(cntsUpto);
const uint8_t *side = l.side(this->gfm());
assert_eq(_zGbwtByteOffs.size(), _zGbwtBpOffs.size());
for(index_t i = 0; i < _zGbwtByteOffs.size(); i++) {
index_t zGbwtByteOff = _zGbwtByteOffs[i];
if(l._sideByteOff <= zGbwtByteOff && l._sideByteOff + l._by >= zGbwtByteOff) {
// Adjust for the fact that we represented $ with an 'A', but
// shouldn't count it as an 'A' here
int zGbwtBpOff = _zGbwtBpOffs[i];
if((l._sideByteOff + l._by > zGbwtByteOff) ||
(l._sideByteOff + l._by == zGbwtByteOff && l._bp > zGbwtBpOff))
{
cntsUpto[0]--; // Adjust for '$' looking like an 'A'
}
}
}
// Now factor in the occ[] count at the side break
const index_t *acgt = reinterpret_cast<const index_t*>(side + _gh._sideGbwtSz);
// For a non-linear index two index_t values precede the A/C/G/T counts
if(!this->_gh.linearFM()) acgt += 2;
assert_leq(acgt[0], this->fchr()[1] + this->_gh.sideGbwtLen());
assert_leq(acgt[1], this->fchr()[2]-this->fchr()[1]);
assert_leq(acgt[2], this->fchr()[3]-this->fchr()[2]);
assert_leq(acgt[3], this->fchr()[4]-this->fchr()[3]);
assert_leq(acgt[0], this->_gh._gbwtLen + this->_gh.sideGbwtLen());
assert_leq(acgt[1], this->_gh._gbwtLen);
assert_leq(acgt[2], this->_gh._gbwtLen);
assert_leq(acgt[3], this->_gh._gbwtLen);
cntsUpto[0] += (acgt[0] + this->fchr()[0]);
cntsUpto[1] += (acgt[1] + this->fchr()[1]);
cntsUpto[2] += (acgt[2] + this->fchr()[2]);
cntsUpto[3] += (acgt[3] + this->fchr()[3]);
masks[0].resize(num);
masks[1].resize(num);
masks[2].resize(num);
masks[3].resize(num);
WITHIN_FCHR_DOLLARA(cntsUpto);
WITHIN_FCHR_DOLLARA(cntsIn);
// 'cntsUpto' is complete now.
// Walk forward until we've tallied the entire 'In' range
index_t nm = 0;
// Rest of this side
nm += countBt2SideRange2(l, true, num - nm, cntsIn, masks, nm);
assert_eq(nm, cntsIn[0] + cntsIn[1] + cntsIn[2] + cntsIn[3]);
assert_leq(nm, num);
// Work on a copy so the caller's locus is not advanced
SideLocus<index_t> lcopy = l;
while(nm < num) {
// Subsequent sides, if necessary
lcopy.nextSide(this->_gh);
nm += countBt2SideRange2(lcopy, false, num - nm, cntsIn, masks, nm);
WITHIN_FCHR_DOLLARA(cntsIn);
assert_leq(nm, num);
assert_eq(nm, cntsIn[0] + cntsIn[1] + cntsIn[2] + cntsIn[3]);
}
assert_eq(num, cntsIn[0] + cntsIn[1] + cntsIn[2] + cntsIn[3]);
WITHIN_FCHR_DOLLARA(cntsIn);
}
/**
 * Count all occurrences of all four nucleotides from the beginning of
 * the side that 'l' points into up to <by,bp>, and add the running counts
 * stored with the side as well as fchr()[0..3]. Results are accumulated
 * into arrs[0..3] (A, C, G, T), which the callers in this file pass in
 * zeroed.
 *
 * The stored running counts are read as the last four index_t values of
 * the side, i.e. at side + _sideSz - 4 * sizeof(index_t).
 *
 * NOTE(review): an earlier version of this comment drew separate forward
 * and reverse sides holding two 4-byte counts each ahead of 56 payload
 * bytes; that layout does not match what the code below reads.
 *
 * Because '$' is stored as an 'A', arrs[0] is reduced by one for every
 * recorded z-offset that lies in this side before the locus.
 */
inline void countBt2SideEx(const SideLocus<index_t>& l, index_t* arrs) const {
assert_range(0, (int)this->_gh._sideGbwtSz-1, (int)l._by);
assert_range(0, 3, (int)l._bp);
// Per-nucleotide counts within this side, before the locus
countUpToEx(l, arrs);
assert_eq(_zGbwtByteOffs.size(), _zGbwtBpOffs.size());
for(index_t i = 0; i < _zGbwtByteOffs.size(); i++) {
index_t zGbwtByteOff = _zGbwtByteOffs[i];
if(l._sideByteOff <= zGbwtByteOff && l._sideByteOff + l._by >= zGbwtByteOff) {
// Adjust for the fact that we represented $ with an 'A', but
// shouldn't count it as an 'A' here
int zGbwtBpOff = _zGbwtBpOffs[i];
if((l._sideByteOff + l._by > zGbwtByteOff) ||
(l._sideByteOff + l._by == zGbwtByteOff && l._bp > zGbwtBpOff))
{
arrs[0]--; // Adjust for '$' looking like an 'A'
}
}
}
WITHIN_FCHR(arrs);
WITHIN_BWT_LEN(arrs);
// Now factor in the occ[] count at the side break
const uint8_t *side = l.side(this->gfm());
// The four running counts occupy the tail of the side
const uint8_t *acgt16 = side + this->_gh._sideSz - sizeof(index_t) * 4;
const index_t *acgt = reinterpret_cast<const index_t*>(acgt16);
assert_leq(acgt[0], this->fchr()[1] + this->_gh.sideGbwtLen());
assert_leq(acgt[1], this->fchr()[2]-this->fchr()[1]);
assert_leq(acgt[2], this->fchr()[3]-this->fchr()[2]);
assert_leq(acgt[3], this->fchr()[4]-this->fchr()[3]);
assert_leq(acgt[0], this->_gh._len + this->_gh.sideGbwtLen());
assert_leq(acgt[1], this->_gh._len);
assert_leq(acgt[2], this->_gh._len);
assert_leq(acgt[3], this->_gh._len);
arrs[0] += (acgt[0] + this->fchr()[0]);
arrs[1] += (acgt[1] + this->fchr()[1]);
arrs[2] += (acgt[2] + this->fchr()[2]);
arrs[3] += (acgt[3] + this->fchr()[3]);
WITHIN_FCHR(arrs);
}
/**
 * Bit-array counterpart of countBt2Side(): take the count that
 * countUpTo_bits() reports for the non-F bit array of the side up to
 * <by,bp> (bp addresses single bits here, 0-7) and add the running count
 * stored with the side, which is the index_t found one index_t past the
 * end of the side's _sideGbwtSz payload bytes.
 */
inline index_t countMSide(const SideLocus<index_t>& l) const {
	assert_range(0, (int)this->_gh._sideGbwtSz-1, (int)l._by);
	assert_range(0, 7, (int)l._bp);
	index_t total = countUpTo_bits(l, false /* F? */);
	const uint8_t *side = l.side(this->gfm());
	const uint8_t *stored = side + _gh._sideGbwtSz + sizeof(index_t);
	total += *reinterpret_cast<const index_t*>(stored);
	assert_leq(total, l.toBWRow(_gh));
	assert_leq(total, this->_gh._numNodes);
	return total;
}
/**
 * Counts the number of occurrences of character 'c' in the given
 * side up to (but not including) the given byte/bitpair (by/bp).
 *
 * Two strategies are used. With POPCNT_CAPABILITY compiled in and
 * _usePOPCNTinstruction set, the whole range is covered with 64-bit
 * words, the final partial word being shifted so that only the wanted
 * bit-pairs remain. Otherwise full 64-bit words are counted first and the
 * remaining bytes and bit-pairs go through the cCntLUT_4 lookup table.
 *
 * This is a performance-critical function. This is the top search-
 * related hit in the time profile.
 *
 * Function gets 11.09% in profile
 */
inline index_t countUpTo(const SideLocus<index_t>& l, int c) const {
// Count occurrences of c in each 64-bit (using bit trickery);
// Someday countInU64() and pop() functions should be
// vectorized/SSE-ized in case that helps.
bool usePOPCNT = false;
index_t cCnt = 0;
const uint8_t *side = l.side(this->gfm());
int i = 0;
#ifdef POPCNT_CAPABILITY
if(_usePOPCNTinstruction) {
usePOPCNT = true;
// Number of bytes touched, counting a partially used last byte
int by = l._by + (l._bp > 0 ? 1 : 0);
for(; i < by; i += 8) {
if(i + 8 < by) {
// A full word that lies entirely before the last touched byte
cCnt += countInU64<USE_POPCNT_INSTRUCTION>(c, *(uint64_t*)&side[i]);
} else {
// Last word: shift out the unwanted bytes (by_shift) and the unwanted
// bit-pairs of the final byte (bp_shift)
index_t by_shift = 8 - (by - i);
index_t bp_shift = (l._bp > 0 ? 4 - l._bp : 0);
index_t shift = (by_shift << 3) + (bp_shift << 1);
uint64_t side_i = *(uint64_t*)&side[i];
side_i = (_toBigEndian ? side_i >> shift : side_i << shift);
index_t cCnt_add = countInU64<USE_POPCNT_INSTRUCTION>(c, side_i);
// The shift brought in shift/2 zero bit-pairs, which match c == 0;
// take them out again
if(c == 0) cCnt_add -= (shift >> 1);
#ifndef NDEBUG
// Cross-check the shifted-word count against the lookup table
index_t cCnt_temp = 0;
for(int j = i; j < l._by; j++) {
cCnt_temp += cCntLUT_4[0][c][side[j]];
}
if(l._bp > 0) {
cCnt_temp += cCntLUT_4[(int)l._bp][c][side[l._by]];
}
assert_eq(cCnt_add, cCnt_temp);
#endif
cCnt += cCnt_add;
break;
}
}
} else {
for(; i + 7 < l._by; i += 8) {
cCnt += countInU64<USE_POPCNT_GENERIC>(c, *(uint64_t*)&side[i]);
}
}
#else
for(; i + 7 < l._by; i += 8) {
cCnt += countInU64(c, *(uint64_t*)&side[i]);
}
#endif
// The POPCNT-instruction path above has already covered the whole range
if(!usePOPCNT) {
// Count occurrences of c in the rest of the side (using LUT)
for(; i < l._by; i++) {
cCnt += cCntLUT_4[0][c][side[i]];
}
// Count occurrences of c in the rest of the byte
if(l._bp > 0) {
cCnt += cCntLUT_4[(int)l._bp][c][side[i]];
}
}
return cCnt;
}
/**
 * Counts the number of occurrences of character 'c' in the given side,
 * walking from the last payload byte down to the given byte/bitpair
 * (by/bp). Whole 64-bit words are counted first, then single bytes via
 * the cCntLUT_4_rev lookup table, and finally the byte at l._by itself
 * (fully when bp == 0, otherwise via table row 4 - bp).
 *
 * NOTE(review): the starting byte is derived from a hard-coded side size
 * of 64 bytes ending in four index_t values, rather than from _gh.
 * Confirm that this matches the side layout in use.
 */
inline index_t countDownTo(const SideLocus<index_t>& l, int c) const {
	// Count occurrences of c in each 64-bit (using bit trickery);
	// Someday countInU64() and pop() functions should be
	// vectorized/SSE-ized in case that helps.
	index_t cCnt = 0;
	const uint8_t *side = l.side(this->gfm());
	int i = 64 - 4 * sizeof(index_t) - 1;
	// A word covers bytes [i-7, i]. It may be counted as a whole only
	// while all eight bytes lie strictly above l._by, i.e. i - 7 > l._by.
	// Testing i + 7 instead would count bytes below the locus twice and
	// could step in front of the side for small l._by.
#ifdef POPCNT_CAPABILITY
	if ( _usePOPCNTinstruction) {
		for(; i - 7 > l._by; i -= 8) {
			cCnt += countInU64<USE_POPCNT_INSTRUCTION>(c, *(uint64_t*)&side[i-7]);
		}
	}
	else {
		for(; i - 7 > l._by; i -= 8) {
			cCnt += countInU64<USE_POPCNT_GENERIC>(c, *(uint64_t*)&side[i-7]);
		}
	}
#else
	for(; i - 7 > l._by; i -= 8) {
		cCnt += countInU64(c, *(uint64_t*)&side[i-7]);
	}
#endif
	// Count occurrences of c in the rest of the side (using LUT)
	for(; i > l._by; i--) {
		cCnt += cCntLUT_4_rev[0][c][side[i]];
	}
	// Count occurrences of c in the byte at l._by
	if(l._bp > 0) {
		cCnt += cCntLUT_4_rev[4-(int)l._bp][c][side[i]];
	} else {
		cCnt += cCntLUT_4_rev[0][c][side[i]];
	}
	return cCnt;
}
/**
 * Bit-pair counting for all four two-bit values (0-3) within a 64-bit
 * argument. For each value k, 'dw' is XOR-ed with c_table[k]; a bit-pair
 * is tallied when both of its bits are set in the result, and the tally
 * is added to arrs[k].
 *
 * Function gets 2.32% in profile
 */
#ifdef POPCNT_CAPABILITY
template<typename Operation>
#endif
inline static void countInU64Ex(uint64_t dw, index_t* arrs) {
	for(int k = 0; k < 4; k++) {
		const uint64_t flipped = dw ^ c_table[k];
		// Bring the high bit of every pair down next to its low bit
		const uint64_t highBits = (flipped >> 1) & (0x5555555555555555llu);
		const uint64_t matches = flipped & highBits;
#ifdef POPCNT_CAPABILITY
		uint64_t tally = Operation().pop64(matches);
#else
		uint64_t tally = pop64(matches);
#endif
		arrs[k] += (uint32_t) tally;
	}
}
/**
 * Counts the number of occurrences of all four nucleotides in the
 * given side up to (but not including) the given byte/bitpair (by/bp).
 * Counts are added to the array: 'a' goes in arrs[0], 'c' in arrs[1],
 * etc.
 */
inline void countUpToEx(const SideLocus<index_t>& l, index_t* arrs) const {
	const uint8_t *side = l.side(this->gfm());
	int pos = 0;
	// Full 64-bit words first, using bit trickery. Note: this does not
	// seem to lend a significant boost to performance in practice; if
	// the word loop were skipped, the byte loop below would simply take
	// up the slack without affecting correctness. Someday the
	// countInU64() and pop() functions should be vectorized/SSE-ized in
	// case that helps.
#ifdef POPCNT_CAPABILITY
	if (_usePOPCNTinstruction) {
		while(pos+7 < l._by) {
			countInU64Ex<USE_POPCNT_INSTRUCTION>(*(uint64_t*)&side[pos], arrs);
			pos += 8;
		}
	}
	else {
		while(pos+7 < l._by) {
			countInU64Ex<USE_POPCNT_GENERIC>(*(uint64_t*)&side[pos], arrs);
			pos += 8;
		}
	}
#else
	while(pos+7 < l._by) {
		countInU64Ex(*(uint64_t*)&side[pos], arrs);
		pos += 8;
	}
#endif
	// Remaining whole bytes of the side (using LUT)
	// Many cache misses on following lines (~20K)
	for(; pos < l._by; pos++) {
		for(int nt = 0; nt < 4; nt++) {
			arrs[nt] += cCntLUT_4[0][nt][side[pos]];
		}
	}
	// Leading bit-pairs of the byte the locus points into
	if(l._bp > 0) {
		for(int nt = 0; nt < 4; nt++) {
			arrs[nt] += cCntLUT_4[(int)l._bp][nt][side[pos]];
		}
	}
}
/**
 * Bit-array analogue of countUpTo(): produces a count over one of the
 * side's bit arrays up to the given byte/bit (by/bp), using
 * countInU64_bits() for 64-bit words and the cCntBIT lookup table for
 * single bytes (presumably a count of set bits -- confirm against those
 * helpers).
 *
 * 'F' selects the array: when true the scan starts at half of the side's
 * _sideGbwtSz payload, otherwise at three quarters of it.
 *
 * This is a performance-critical function. This is the top search-
 * related hit in the time profile.
 */
inline index_t countUpTo_bits(const SideLocus<index_t>& l, bool F) const {
// Count in each 64-bit word (using bit trickery);
// Someday countInU64() and pop() functions should be
// vectorized/SSE-ized in case that helps.
bool usePOPCNT = false;
index_t cCnt = 0;
const uint8_t *side = l.side(this->gfm());
// Move to the start of the selected bit array
if(F) {
side += (_gh._sideGbwtSz >> 1);
} else {
side += (_gh._sideGbwtSz - (_gh._sideGbwtSz >> 2));
}
int i = 0;
#ifdef POPCNT_CAPABILITY
if(_usePOPCNTinstruction) {
usePOPCNT = true;
// Number of bytes touched, counting a partially used last byte
int by = l._by + (l._bp > 0 ? 1 : 0);
for(; i < by; i += 8) {
if(i + 8 < by) {
cCnt += countInU64_bits<USE_POPCNT_INSTRUCTION>(*(uint64_t*)&side[i]);
} else {
// Last word: shift out the unwanted bytes (by_shift) and the unwanted
// bits of the final byte (bp_shift)
index_t by_shift = 8 - (by - i);
index_t bp_shift = (l._bp > 0 ? 8 - l._bp : 0);
index_t shift = (by_shift << 3) + bp_shift;
uint64_t side_i = *(uint64_t*)&side[i];
side_i = (_toBigEndian ? side_i >> shift : side_i << shift);
index_t cCnt_add = countInU64_bits<USE_POPCNT_INSTRUCTION>(side_i);
#ifndef NDEBUG
// Cross-check the shifted-word count against the lookup table
index_t cCnt_temp = 0;
for(int j = i; j < l._by; j++) {
cCnt_temp += cCntBIT[0][side[j]];
}
if(l._bp > 0) {
cCnt_temp += cCntBIT[(int)l._bp][side[l._by]];
}
assert_eq(cCnt_add, cCnt_temp);
#endif
cCnt += cCnt_add;
break;
}
}
} else {
for(; i + 7 < l._by; i += 8) {
cCnt += countInU64_bits<USE_POPCNT_GENERIC_BITS>(*(uint64_t*)&side[i]);
}
}
#else
for(; i + 7 < l._by; i += 8) {
cCnt += countInU64_bits(*(uint64_t*)&side[i]);
}
#endif
// The POPCNT-instruction path above has already covered the whole range
if(!usePOPCNT) {
// Count in the rest of the side (using LUT)
for(; i < l._by; i++) {
cCnt += cCntBIT[0][side[i]];
}
// Count in the rest of the byte
if(l._bp > 0) {
cCnt += cCntBIT[(int)l._bp][side[i]];
}
}
return cCnt;
}
#ifndef NDEBUG
/**
 * Debug-build helper: given a single locus, calculate counts of all four
 * DNA chars up to that locus via countBt2SideEx(). 'arrs' must be zeroed
 * on entry. With sanity checking enabled (and not overridden), each
 * count is cross-checked against an individual mapLF() call.
 */
inline void mapLFEx(
	const SideLocus<index_t>& l,
	index_t *arrs
	ASSERT_ONLY(, bool overrideSanity = false)
	) const
{
	for(int c = 0; c < 4; c++) {
		assert_eq(0, arrs[c]);
	}
	countBt2SideEx(l, arrs);
	if(_sanity && !overrideSanity) {
		// Make sure results match up with individual calls to mapLF;
		// be sure to override sanity-checking in the callee, or we'll
		// have infinite recursion
		for(int c = 0; c < 4; c++) {
			assert_eq(mapLF(l, c, true), arrs[c]);
		}
	}
}
#endif
/**
 * Given top and bot rows, calculate counts of all four DNA chars up to
 * those rows: both rows are turned into loci and the work is handed to
 * the locus-based overload.
 */
inline void mapLFEx(
	index_t top,
	index_t bot,
	index_t *tops,
	index_t *bots
	ASSERT_ONLY(, bool overrideSanity = false)
	) const
{
	SideLocus<index_t> topLocus, botLocus;
	SideLocus<index_t>::initFromTopBot(top, bot, _gh, gfm(), topLocus, botLocus);
	mapLFEx(topLocus, botLocus, tops, bots ASSERT_ONLY(, overrideSanity));
}
/**
 * Given top and bot loci, calculate counts of all four DNA chars up to
 * those loci. Used for more advanced backtracking-search. 'tops' and
 * 'bots' must be zeroed on entry (asserted).
 */
inline void mapLFEx(
	const SideLocus<index_t>& ltop,
	const SideLocus<index_t>& lbot,
	index_t *tops,
	index_t *bots
	ASSERT_ONLY(, bool overrideSanity = false)
	) const
{
	assert(ltop.repOk(this->gh()));
	assert(lbot.repOk(this->gh()));
	for(int c = 0; c < 4; c++) {
		assert_eq(0, tops[c]); assert_eq(0, bots[c]);
	}
	countBt2SideEx(ltop, tops);
	countBt2SideEx(lbot, bots);
#ifndef NDEBUG
	if(_sanity && !overrideSanity) {
		// Make sure results match up with individual calls to mapLF;
		// be sure to override sanity-checking in the callee, or we'll
		// have infinite recursion
		for(int c = 0; c < 4; c++) {
			assert_eq(mapLF(ltop, c, true), tops[c]);
		}
		for(int c = 0; c < 4; c++) {
			assert_eq(mapLF(lbot, c, true), bots[c]);
		}
	}
#endif
}
/**
 * Tallies up to 'num' characters of the given side, starting either at
 * the locus position (l._by/l._bp, when 'startAtLocus' is true) or at the
 * beginning of the side. Each character is added to arrs[c] ('a' goes in
 * arrs[0], 'c' in arrs[1], etc.) and recorded in 'masks': for the k-th
 * character tallied, masks[c][maskOff + k] is set to true and the other
 * three masks at that position to false.
 *
 * Stops early when the end of the side's character data is reached and
 * returns the number of characters actually tallied, so the caller can
 * continue with the next side.
 *
 * Note: '$' is tallied just like an 'A' here.
 */
inline index_t countBt2SideRange2(
const SideLocus<index_t>& l,
bool startAtLocus,
index_t num,
index_t* arrs,
EList<bool> *masks,
index_t maskOff) const
{
assert(!masks[0].empty());
assert_eq(masks[0].size(), masks[1].size());
assert_eq(masks[0].size(), masks[2].size());
assert_eq(masks[0].size(), masks[3].size());
ASSERT_ONLY(index_t myarrs[4] = {0, 0, 0, 0});
index_t nm = 0; // number of nucleotides tallied so far
int iby = 0; // initial byte offset
int ibp = 0; // initial base-pair offset
if(startAtLocus) {
iby = l._by;
ibp = l._bp;
} else {
// Start at beginning
}
int by = iby, bp = ibp;
assert_lt(bp, 4);
// Only half of the payload is scanned when the index is not a linear FM
// index
index_t sideGbwtSz = this->_gh._sideGbwtSz >> (this->_gh.linearFM() ? 0 : 1);
assert_lt(by, (int)sideGbwtSz);
const uint8_t *side = l.side(this->gfm());
while(nm < num) {
// Extract the bit-pair at <by,bp>
int c = (side[by] >> (bp * 2)) & 3;
assert_lt(maskOff + nm, masks[c].size());
// Clear all four masks at this position, then set the matching one below
masks[0][maskOff + nm] = masks[1][maskOff + nm] =
masks[2][maskOff + nm] = masks[3][maskOff + nm] = false;
assert_range(0, 3, c);
// Note: we tally $ just like an A
arrs[c]++; // tally it
ASSERT_ONLY(myarrs[c]++);
masks[c][maskOff + nm] = true; // not dead
nm++;
// Advance to the next bit-pair, moving on to the next byte after four
if(++bp == 4) {
bp = 0;
by++;
assert_leq(by, (int)sideGbwtSz);
if(by == (int)sideGbwtSz) {
// Fell off the end of the side
break;
}
}
}
WITHIN_FCHR_DOLLARA(arrs);
#ifndef NDEBUG
if(_sanity) {
// Make sure results match up with a call to mapLFEx.
index_t tops[4] = {0, 0, 0, 0};
index_t bots[4] = {0, 0, 0, 0};
index_t top = l.toBWRow(gh());
index_t bot = top + nm;
mapLFEx(top, bot, tops, bots, false);
// The 'A' tally may be one higher than the mapLFEx difference
assert(myarrs[0] == (bots[0] - tops[0]) || myarrs[0] == (bots[0] - tops[0])+1);
assert_eq(myarrs[1], bots[1] - tops[1]);
assert_eq(myarrs[2], bots[2] - tops[2]);
assert_eq(myarrs[3], bots[3] - tops[3]);
}
#endif
return nm;
}
/**
 * Return the final character of the row addressed by locus 'l' (i.e.
 * the corresponding character in the BWT transform).  The 'L' in the
 * name stands for 'last', as in the literature.
 */
inline int rowL(const SideLocus<index_t>& l) const {
    // Pull the bit-pair at (l._by, l._bp) out of the locus's side
    const uint8_t *sideBytes = l.side(this->gfm());
    return unpack_2b_from_8b(sideBytes[l._by], l._bp);
}
/**
 * Return the final character in row i (i.e. the i'th character in the
 * BWT transform).  The 'L' in the name stands for 'last', as in the
 * literature.
 */
inline int rowL(index_t i) const {
    // Resolve the row to a locus, then extract its bit-pair
    SideLocus<index_t> rowLocus;
    rowLocus.initFromRow(i, _gh, gfm());
    return unpack_2b_from_8b(
        rowLocus.side(this->gfm())[rowLocus._by], rowLocus._bp);
}
/**
* Given top and bot loci, calculate counts of all four DNA chars up to
* those loci. Used for more advanced backtracking-search.
*/
inline void mapLFRange(
const SideLocus<index_t>& ltop,
const SideLocus<index_t>& lbot,
index_t num, // Number of elts
index_t* cntsUpto, // A/C/G/T counts up to top
index_t* cntsIn, // A/C/G/T counts within range
EList<bool> *masks
ASSERT_ONLY(, bool overrideSanity = false)
) const
{
assert(ltop.repOk(this->gh()));
assert(lbot.repOk(this->gh()));
assert_eq(num, lbot.toBWRow(this->gh()) - ltop.toBWRow(this->gh()));
assert_eq(0, cntsUpto[0]); assert_eq(0, cntsIn[0]);
assert_eq(0, cntsUpto[1]); assert_eq(0, cntsIn[1]);
assert_eq(0, cntsUpto[2]); assert_eq(0, cntsIn[2]);
assert_eq(0, cntsUpto[3]); assert_eq(0, cntsIn[3]);
countBt2SideRange(ltop, num, cntsUpto, cntsIn, masks);
assert_eq(num, cntsIn[0] + cntsIn[1] + cntsIn[2] + cntsIn[3]);
#ifndef NDEBUG
if(_sanity && !overrideSanity) {
// Make sure results match up with individual calls to mapLF;
// be sure to override sanity-checking in the callee, or we'll
// have infinite recursion
index_t tops[4] = {0, 0, 0, 0};
index_t bots[4] = {0, 0, 0, 0};
assert(ltop.repOk(this->gh()));
assert(lbot.repOk(this->gh()));
mapLFEx(ltop, lbot, tops, bots, false);
for(int i = 0; i < 4; i++) {
assert(cntsUpto[i] == tops[i] || tops[i] == bots[i]);
if(i == 0) {
assert(cntsIn[i] == bots[i]-tops[i] ||
cntsIn[i] == bots[i]-tops[i]+1);
} else {
assert_eq(cntsIn[i], bots[i]-tops[i]);
}
}
}
#endif
}
/**
 * Given the locus of row i, return the row that the LF mapping maps i
 * to, stepping on the row's own last character.
 */
inline index_t mapLF(
    const SideLocus<index_t>& l
    ASSERT_ONLY(, bool overrideSanity = false)
    ) const
{
    ASSERT_ONLY(index_t srcrow = l.toBWRow(_gh));
    assert(l.side(this->gfm()) != NULL);
    const int lastChar = rowL(l);
    assert_range(0, 3, lastChar);
    const index_t mapped = countBt2Side(l, lastChar);
    assert_lt(mapped, this->_gh._gbwtLen);
    assert_neq(srcrow, mapped);
#ifndef NDEBUG
    if(!overrideSanity && _sanity) {
        // Cross-check against mapLFEx; sanity checking is overridden in
        // the callee to avoid infinite recursion
        index_t perChar[4] = { 0, 0, 0, 0 };
        mapLFEx(l, perChar, true);
        assert_eq(perChar[lastChar], mapped);
    }
#endif
    return mapped;
}
/**
 * Given the locus of row i and a character c, return the row that the
 * LF mapping maps i to on character c.
 */
inline index_t mapLF(
    const SideLocus<index_t>& l, int c
    ASSERT_ONLY(, bool overrideSanity = false)
    ) const
{
    assert_range(0, 3, c);
    const index_t mapped = countBt2Side(l, c);
    assert_lt(mapped, this->_gh._gbwtLen);
#ifndef NDEBUG
    if(!overrideSanity && _sanity) {
        // Cross-check against mapLFEx; sanity checking is overridden in
        // the callee to avoid infinite recursion
        index_t perChar[4] = { 0, 0, 0, 0 };
        mapLFEx(l, perChar, true);
        assert_eq(perChar[c], mapped);
    }
#endif
    return mapped;
}
/**
 * Given the loci of a top row and a bottom row and a character c, LF-map
 * both on c and return the resulting (top, bot) pair.  The same pair is
 * also stored in *node_range when node_range is non-NULL.
 */
inline pair<index_t, index_t> mapLF(
    SideLocus<index_t>& tloc, SideLocus<index_t>& bloc, int c,
    pair<index_t, index_t>* node_range = NULL
    ASSERT_ONLY(, bool overrideSanity = false)
    ) const
{
    assert_range(0, 3, c);
    const index_t mappedTop = mapLF(tloc, c);
    const index_t mappedBot = mapLF(bloc, c);
    const pair<index_t, index_t> mappedRange(mappedTop, mappedBot);
    if(node_range != NULL) {
        *node_range = mappedRange;
    }
    return mappedRange;
}
/**
 * Graph-aware LF step for a row range on character c.
 *
 * Both loci are first LF-mapped on c via mapLF.  For a linear FM index
 * (gh().linearFM()) the mapped rows are returned as-is, *node_range (if
 * given) receives the same pair and *node_iedges (if given) is cleared.
 *
 * Otherwise the mapped rows are translated with rank_M / select_F and
 * the two index_t values stored at offset _sideGbwtSz of a side (read
 * below as F_loc and M_occ), producing a node range that is written to
 * *node_range and a new row range that is returned.  (0, 0) is returned
 * when top + 1 >= _gbwtLen or top >= bot after the LF step.
 *
 * tloc and bloc are re-initialized along the way, so on return they no
 * longer describe the caller's original rows.  When node_iedges is
 * non-NULL, the node range spans at most k nodes and spans fewer nodes
 * than rows, it is filled by getInEdgeCount.
 */
inline pair<index_t, index_t> mapGLF(
SideLocus<index_t>& tloc, SideLocus<index_t>& bloc, int c,
pair<index_t, index_t>* node_range = NULL,
EList<pair<index_t, index_t> >* node_iedges = NULL,
index_t k = 5
ASSERT_ONLY(, bool overrideSanity = false)
) const
{
assert_lt(c, 4);
assert_geq(c, 0);
// Plain LF step on both ends of the range
index_t top = mapLF(tloc, c);
index_t bot = mapLF(bloc, c);
if(gh().linearFM()) {
// Linear index: rows and nodes coincide, nothing to translate
if(node_range != NULL) {
node_range->first = top; node_range->second = bot;
}
if(node_iedges != NULL) {
node_iedges->clear();
}
return pair<index_t, index_t>(top, bot);
}
// Empty range (or top on the very last row): report (0, 0)
if(top + 1 >= gh()._gbwtLen || top >= bot) {
assert_eq(top, bot);
return pair<index_t, index_t>(0, 0);
}
// Node index of the top row: rank over M at row top + 1, minus one
tloc.initFromRow_bit(top + 1, gh(), gfm());
index_t node_top = rank_M(tloc) - 1;
index_t top_F_loc = 0, top_M_occ = 0;
size_t iter = 0;
// Read the (F_loc, M_occ) pair stored after the side's data, stepping
// back one side (_sideSz bytes) per iteration until M_occ <= node_top
while(true) {
const uint8_t *side = tloc.side(gfm()) + gh()._sideGbwtSz - gh()._sideSz * iter;
top_F_loc = *((index_t*)side);
side += sizeof(index_t);
top_M_occ = *((index_t*)side);
assert_leq(top_M_occ, node_top + 1);
if(top_M_occ <= node_top) break;
iter++;
}
if(top_M_occ > 0) top_F_loc++;
// Continue from the recorded F location and select forward for the
// remaining (node_top + 1 - top_M_occ) set bits, if any
tloc.initFromRow_bit(top_F_loc, gh(), gfm());
if(node_top + 1 > top_M_occ) {
top = select_F(tloc, node_top + 1 - top_M_occ);
} else {
top = top_F_loc;
}
// Same translation for the bottom row; here the rank is taken at row
// bot itself and only the current side's (F_loc, M_occ) pair is read
bloc.initFromRow_bit(bot, gh(), gfm());
index_t node_bot = rank_M(bloc);
const uint8_t *side = bloc.side(gfm()) + gh()._sideGbwtSz;
index_t bot_F_loc = *((index_t*)side);
side += sizeof(index_t);
index_t bot_M_occ = *((index_t*)side);
assert_leq(bot_M_occ, node_bot + 1);
if(bot_M_occ > 0) bot_F_loc++;
bloc.initFromRow_bit(bot_F_loc, gh(), gfm());
if(node_bot + 1 > bot_M_occ) {
bot = select_F(bloc, node_bot + 1 - bot_M_occ);
} else {
bot = bot_F_loc;
}
if(node_range != NULL) {
(*node_range).first = node_top;
(*node_range).second = node_bot;
}
assert_leq(node_bot - node_top, bot - top);
// Only collect in-edge counts for small node ranges (<= k nodes) in
// which at least one node accounts for more than one row
if(node_iedges != NULL && node_bot - node_top <= k && node_bot - node_top < bot - top) {
getInEdgeCount(top, bot, *node_iedges);
}
return pair<index_t, index_t>(top, bot);
}
/**
 * Given top and bot loci, calculate counts of all four DNA chars up to
 * those loci.  Also, update a set of tops and bots for the reverse
 * index/direction using the idea from the bi-directional BWT paper.
 *
 * @param ltop  locus of the top row
 * @param lbot  locus of the bottom row
 * @param tops  out: per-character results for ltop; must arrive zeroed
 * @param bots  out: per-character results for lbot; must arrive zeroed
 * @param topsP in/out: topsP[0] is supplied by the caller; topsP[1..3]
 *              are filled in here
 * @param botsP out: botsP[0..3] are filled in here
 */
inline void mapBiLFEx(
    const SideLocus<index_t>& ltop,
    const SideLocus<index_t>& lbot,
    index_t *tops,
    index_t *bots,
    index_t *topsP, // topsP[0] = top
    index_t *botsP
    ASSERT_ONLY(, bool overrideSanity = false)
    ) const
{
#ifndef NDEBUG
    // Every slot of both output arrays must be zero on entry (mapLFEx
    // enforces the same precondition).  Index with i, not 0, so that
    // slots 1..3 are actually checked.
    for(int i = 0; i < 4; i++) {
        assert_eq(0, tops[i]); assert_eq(0, bots[i]);
    }
#endif
    countBt2SideEx(ltop, tops);
    countBt2SideEx(lbot, bots);
#ifndef NDEBUG
    if(_sanity && !overrideSanity) {
        // Make sure results match up with individual calls to mapLF;
        // be sure to override sanity-checking in the callee, or we'll
        // have infinite recursion
        assert_eq(mapLF(ltop, 0, true), tops[0]);
        assert_eq(mapLF(ltop, 1, true), tops[1]);
        assert_eq(mapLF(ltop, 2, true), tops[2]);
        assert_eq(mapLF(ltop, 3, true), tops[3]);
        assert_eq(mapLF(lbot, 0, true), bots[0]);
        assert_eq(mapLF(lbot, 1, true), bots[1]);
        assert_eq(mapLF(lbot, 2, true), bots[2]);
        assert_eq(mapLF(lbot, 3, true), bots[3]);
    }
#endif
    // bots[0..3] - tops[0..3] = # of ways to extend the suffix with an
    // A, C, G, T.  The reverse-direction ranges are laid out back to
    // back starting at topsP[0]: each character's range begins where
    // the previous character's range ended.
    botsP[0] = topsP[0] + (bots[0] - tops[0]);
    topsP[1] = botsP[0];
    botsP[1] = topsP[1] + (bots[1] - tops[1]);
    topsP[2] = botsP[1];
    botsP[2] = topsP[2] + (bots[2] - tops[2]);
    topsP[3] = botsP[2];
    botsP[3] = topsP[3] + (bots[3] - tops[3]);
}
/**
 * Given a row and its locus, proceed on the given character and return
 * the next row.  Returns (index_t)INDEX_MAX if the row's last character
 * is not c, or if the row is one of the rows listed in _zOffs (i.e. it
 * ends in $).
 */
inline index_t mapLF1(
    index_t row,                 // starting row
    const SideLocus<index_t>& l, // locus for starting row
    int c                        // character to proceed on
    ASSERT_ONLY(, bool overrideSanity = false)
    ) const
{
    const index_t cannotProceed = (index_t)INDEX_MAX;
    if(rowL(l) != c) {
        return cannotProceed;
    }
    for(index_t z = 0; z < _zOffs.size(); z++) {
        if(_zOffs[z] == row) {
            return cannotProceed;
        }
    }
    assert_range(0, 3, c);
    const index_t nextRow = countBt2Side(l, c);
    assert_lt(nextRow, this->_gh._gbwtLen);
#ifndef NDEBUG
    if(!overrideSanity && _sanity) {
        // Cross-check against mapLFEx; sanity checking is overridden in
        // the callee to avoid infinite recursion
        index_t perChar[4] = { 0, 0, 0, 0 };
        mapLFEx(l, perChar, true);
        assert_eq(perChar[c], nextRow);
    }
#endif
    return nextRow;
}
/**
 * Given a row and its locus, set the row to LF(row) and return the
 * character that was in the final column.  Returns -1, leaving the row
 * untouched, if the row is one of the rows listed in _zOffs.
 */
inline int mapLF1(
    index_t& row,                // starting row
    const SideLocus<index_t>& l  // locus for starting row
    ASSERT_ONLY(, bool overrideSanity = false)
    ) const
{
    for(index_t z = 0; z < _zOffs.size(); z++) {
        if(_zOffs[z] == row) {
            return -1;
        }
    }
    const int lastChar = rowL(l);
    assert_range(0, 3, lastChar);
    row = countBt2Side(l, lastChar);
    assert_lt(row, this->_gh._gbwtLen);
#ifndef NDEBUG
    if(!overrideSanity && _sanity) {
        // Cross-check against mapLFEx; sanity checking is overridden in
        // the callee to avoid infinite recursion
        index_t perChar[4] = { 0, 0, 0, 0 };
        mapLFEx(l, perChar, true);
        assert_eq(perChar[lastChar], row);
    }
#endif
    return lastChar;
}
/**
 * Graph-aware LF step for a single row on character c.
 *
 * The row is first stepped with mapLF1(row, l, c).  If that step cannot
 * proceed (mapLF1 returned INDEX_MAX), (0, 0) is returned.  For a linear
 * FM index the result is (top, top + 1), also stored in *node_range when
 * given.  Otherwise the row is translated with rank_M / select_F and the
 * (F_loc, M_occ) values stored at offset _sideGbwtSz of a side; the
 * single-node range (node_top, node_top + 1) is written to *node_range
 * and the corresponding row range is returned.
 *
 * The locus l is re-initialized along the way.
 */
inline pair<index_t, index_t> mapGLF1(
index_t row, // starting row
SideLocus<index_t>& l, // locus for starting row
int c, // character to proceed
pair<index_t, index_t>* node_range = NULL
ASSERT_ONLY(, bool overrideSanity = false)
) const
{
assert_lt(c, 4);
assert_geq(c, 0);
index_t top = mapLF1(row, l, c);
// mapLF1 signals "cannot proceed" with INDEX_MAX; report an empty range
if(top == (index_t)INDEX_MAX) return pair<index_t, index_t>(0, 0);
if(gh().linearFM()) {
if(node_range != NULL) {
node_range->first = top; node_range->second = top + 1;
}
return pair<index_t, index_t>(top, top + 1);
}
index_t bot = top;
// Node index of the row: rank over M at row top + 1, minus one
l.initFromRow_bit(top + 1, gh(), gfm());
index_t node_top = rank_M(l) - 1;
index_t F_loc = 0, M_occ = 0;
size_t iter = 0;
// Read the (F_loc, M_occ) pair stored after the side's data, stepping
// back one side (_sideSz bytes) per iteration until M_occ <= node_top
while(true) {
const uint8_t *side = l.side(gfm()) + gh()._sideGbwtSz - gh()._sideSz * iter;
F_loc = *((index_t*)side);
side += sizeof(index_t);
M_occ = *((index_t*)side);
assert_leq(M_occ, node_top + 1);
if(M_occ <= node_top) break;
iter++;
}
if(M_occ > 0) F_loc++;
// Continue from the recorded F location and select forward for the
// remaining set bits, if any
l.initFromRow_bit(F_loc, gh(), gfm());
if(node_top + 1 > M_occ) {
top = select_F(l, node_top + 1 - M_occ);
} else {
top = F_loc;
}
// The result always covers exactly one node
index_t node_bot = node_top + 1;
if(node_bot + 1 > M_occ) {
// NOTE(review): l2 is only referenced by the disabled #if 0 branch
// below; in the active branch it is constructed and never used.
SideLocus<index_t> l2;
#if 0
l2.initFromRow_bit(top + 1, gh(), gfm());
bot = select_F(l2, 1);
ASSERT_ONLY(index_t bot2 = select_F(l, node_bot + 1 - M_occ));
assert_eq(bot, bot2);
#else
bot = select_F(l, node_bot + 1 - M_occ);
#endif
} else {
bot = F_loc;
}
if(node_range != NULL) {
(*node_range).first = node_top;
(*node_range).second = node_bot;
}
return pair<index_t, index_t>(top, bot);
}
/**
 * Graph-aware LF step for a single row on the row's own last character.
 *
 * If the row is one of the rows listed in _zOffs, (INDEX_MAX, INDEX_MAX)
 * is returned.  Otherwise the row is stepped with mapLF1(row, l), which
 * updates the local copy of row.  For a linear FM index the result is
 * (top, top + 1), also stored in *node_range when given.  Otherwise the
 * row is translated with rank_M / select_F and the (F_loc, M_occ) values
 * stored at offset _sideGbwtSz of a side; the single-node range
 * (node_top, node_top + 1) is written to *node_range and the
 * corresponding row range is returned.
 *
 * The locus l is re-initialized along the way.
 */
inline pair<index_t, index_t> mapGLF1(
index_t row, // starting row
SideLocus<index_t>& l, // locus for starting row
pair<index_t, index_t>* node_range = NULL
ASSERT_ONLY(, bool overrideSanity = false)
) const
{
// Rows listed in _zOffs cannot be stepped from
for(index_t i = 0; i < _zOffs.size(); i++) {
if(row == _zOffs[i]) return pair<index_t, index_t>((index_t)INDEX_MAX, (index_t)INDEX_MAX);
}
// row is a by-value parameter; mapLF1 overwrites this local copy with
// LF(row).  The returned character is not needed here.
mapLF1(row, l);
index_t top = row;
if(top == (index_t)INDEX_MAX) return pair<index_t, index_t>(0, 0);
if(gh().linearFM()) {
if(node_range != NULL) {
node_range->first = top; node_range->second = top + 1;
}
return pair<index_t, index_t>(top, top + 1);
}
index_t bot = top;
// Node index of the row: rank over M at row top + 1, minus one
l.initFromRow_bit(top + 1, gh(), gfm());
index_t node_top = rank_M(l) - 1;
index_t F_loc = 0, M_occ = 0;
size_t iter = 0;
// Read the (F_loc, M_occ) pair stored after the side's data, stepping
// back one side (_sideSz bytes) per iteration until M_occ <= node_top
while(true) {
const uint8_t *side = l.side(gfm()) + gh()._sideGbwtSz - gh()._sideSz * iter;
F_loc = *((index_t*)side);
side += sizeof(index_t);
M_occ = *((index_t*)side);
assert_leq(M_occ, node_top + 1);
if(M_occ <= node_top) break;
iter++;
}
if(M_occ > 0) F_loc++;
// Continue from the recorded F location and select forward for the
// remaining set bits, if any
l.initFromRow_bit(F_loc, gh(), gfm());
if(node_top + 1 > M_occ) {
top = select_F(l, node_top + 1 - M_occ);
} else {
top = F_loc;
}
// The result always covers exactly one node
index_t node_bot = node_top + 1;
if(node_bot + 1 > M_occ) {
// NOTE(review): the disabled branch below refers to l2, which is not
// declared in this overload; it would not compile if re-enabled.
#if 0
l2.initFromRow_bit(top + 1, gh(), gfm());
bot = select_F(l2, 1);
ASSERT_ONLY(index_t bot2 = select_F(l, node_bot + 1 - M_occ));
assert_eq(bot, bot2);
#else
bot = select_F(l, node_bot + 1 - M_occ);
#endif
} else {
bot = F_loc;
}
if(node_range != NULL) {
(*node_range).first = node_top;
(*node_range).second = node_bot;
}
return pair<index_t, index_t>(top, bot);
}
/**
 * Given the locus of row i, return its rank as computed by countMSide.
 */
inline index_t rank_M(
    const SideLocus<index_t>& l
    ASSERT_ONLY(, bool overrideSanity = false)
    ) const
{
    const index_t rank = countMSide(l);
    // A rank can never exceed the number of nodes
    assert_leq(rank, this->_gh._numNodes);
    return rank;
}
/**
 * Starting at locus l, scan forward through the bit array located at
 * byte offset (_sideGbwtSz >> 1) of each side and return the BW row
 * (l.toBWRow) of the position holding the count'th set bit, counting
 * from l's own position.  count must be positive.
 *
 * l is taken by value, so the caller's locus is not modified.
 */
inline index_t select_F(
SideLocus<index_t> l,
index_t count
ASSERT_ONLY(, bool overrideSanity = false)
) const
{
assert_gt(count, 0);
// presumably the F bit-vector of the side -- TODO confirm against the
// side layout written by buildToDisk
const uint8_t *side = l.side(this->gfm()) + (_gh._sideGbwtSz >> 1);
while(true) {
// Bits left in this side from the current position
index_t remainingBitsSide = (_gh._sideGbwtSz << 1) - l._charOff;
assert_gt(remainingBitsSide, 0);
// Never look at more than 'count' bits, nor past the end of the side
index_t minSide = (count < remainingBitsSide ? count : remainingBitsSide);
// NOTE(review): 8-byte load from an arbitrary byte offset; assumes the
// platform tolerates unaligned access -- confirm
uint64_t bits = *(uint64_t*)&side[l._by];
uint8_t advance = 64;
if(l._bp > 0) {
// Drop the bits that precede the current position within the byte
bits >>= l._bp;
advance -= l._bp;
}
if(minSide < advance) {
// Keep only the low minSide bits (shifted to the top of the word)
advance = minSide;
bits <<= (64 - minSide);
}
uint8_t tmp_count = 0;
#ifdef POPCNT_CAPABILITY
if(_usePOPCNTinstruction) {
tmp_count = countInU64_bits<USE_POPCNT_INSTRUCTION>(bits);
} else {
tmp_count = countInU64_bits<USE_POPCNT_GENERIC_BITS>(bits);
}
#else
tmp_count = countInU64_bits(bits);
#endif
assert_leq(tmp_count, count);
count -= tmp_count;
if(count == 0) {
// At most 'count' bits were examined, so reaching zero means every
// examined bit was set and the target is the last of them.
assert_gt(advance, 0);
l._charOff += (advance - 1);
assert_lt(l._charOff, _gh._sideGbwtSz << 1);
l._by = l._charOff >> 3;
l._bp = l._charOff & 0x7;
break;
}
assert_leq(l._charOff + advance, (_gh._sideGbwtSz << 1));
if(l._charOff + advance == (_gh._sideGbwtSz << 1)) {
// Consumed the rest of this side; continue in the next one
l.nextSide(_gh);
side = l.side(this->gfm()) + (_gh._sideGbwtSz >> 1);
} else {
// Step past the examined bits and recompute byte / bit offsets
l._charOff += advance;
l._by = l._charOff >> 3;
l._bp = l._charOff & 0x7;
}
}
return l.toBWRow(_gh);
}
/**
 * Walk rows [top, bot) through the bit array located at byte offset
 * (_sideGbwtSz >> 1) of each side and summarize runs of 0 bits.  The
 * bit at 'top' is expected to be 1 and is not examined further.  For
 * every later row, a 1 bit advances the running node counter, while the
 * first 0 bit after a 1 appends an entry to node_iedges whose .first is
 * the current node counter and whose .second is kept equal to the length
 * of that run of 0 bits.  node_iedges is cleared first.
 */
inline void getInEdgeCount(
    index_t top,
    index_t bot,
    EList<pair<index_t, index_t> >& node_iedges) const
{
    assert_lt(top, bot);
    node_iedges.clear();
    SideLocus<index_t> cur;
    cur.initFromRow_bit(top, _gh, gfm());
    const uint8_t *sideBits = cur.side(this->gfm()) + (_gh._sideGbwtSz >> 1);
    assert_lt(cur._by, (_gh._sideGbwtSz >> 2));
    assert_eq((sideBits[cur._by] >> cur._bp) & 0x1, 0x1);
    index_t nodeCounter = 0; // 1 bits seen after the first row
    index_t zeroRun = 0;     // length of the current run of 0 bits
    for(index_t row = top; row < bot; row++) {
        if(row != top) {
            const bool bitSet = ((sideBits[cur._by] >> cur._bp) & 0x1) == 0x1;
            if(bitSet) {
                nodeCounter++;
                zeroRun = 0;
            } else {
                zeroRun++;
                if(zeroRun == 1) {
                    // First 0 after a 1: open a new entry
                    node_iedges.expand();
                    node_iedges.back().first = nodeCounter;
                }
                node_iedges.back().second = zeroRun;
            }
        }
        // Advance the cursor by one bit, hopping sides when needed
        if(cur._charOff + 1 != (_gh._sideGbwtSz << 1)) {
            cur._charOff++;
            cur._by = cur._charOff >> 3;
            cur._bp = cur._charOff & 0x7;
        } else {
            cur.nextSide(_gh);
            sideBits = cur.side(this->gfm()) + (_gh._sideGbwtSz >> 1);
        }
    }
}
#ifndef NDEBUG
/// Check that in-memory Ebwt is internally consistent with respect
/// to given EbwtParams; assert if not
bool inMemoryRepOk(const GFMParams<index_t>& gh) const {
assert_eq(_zOffs.size(), _zGbwtByteOffs.size());
assert_eq(_zOffs.size(), _zGbwtBpOffs.size());
for(index_t i = 0; i < _zOffs.size(); i++) {
assert_geq(_zGbwtBpOffs[i], 0);
assert_lt(_zGbwtBpOffs[i], 4);
assert_lt(_zGbwtByteOffs[i], gh._gbwtTotSz);
assert_lt(_zOffs[i], gh._gbwtLen);
}
assert_geq(_nFrag, _nPat);
assert_eq(_alts.size(), _altnames.size());
return true;
}
/// Check that the in-memory index is internally consistent with
/// respect to its own parameters (_gh); assert if not
bool inMemoryRepOk() const {
    const bool ok = repOk(_gh);
    return ok;
}
/// Check that the index is internally consistent with respect to the
/// given GFMParams; assert if not.  The in-memory checks only run when
/// the index is currently loaded.
bool repOk(const GFMParams<index_t>& gh) const {
    assert(_gh.repOk());
    if(!isInMemory()) {
        return true;
    }
    return inMemoryRepOk(gh);
}
/// Check that the index is internally consistent with respect to its
/// own parameters (_gh); assert if not
bool repOk() const {
    const bool ok = repOk(_gh);
    return ok;
}
#endif
// ---- Configuration flags and file handles ----
bool _toBigEndian;
int32_t _overrideOffRate;
bool _verbose;
bool _passMemExc;
bool _sanity; // when set, debug builds cross-check LF-mapping results
bool fw_; // true iff this is a forward index
FILE *_in1; // input stream (FILE*) for primary index file
FILE *_in2; // input stream (FILE*) for secondary index file
string _in1Str; // filename for primary index file
string _in2Str; // filename for secondary index file
// Three parallel lists (same length, see inMemoryRepOk).  Rows listed
// in _zOffs are the rows mapLF1 refuses to step from ($ rows);
// the byte / bit-pair lists presumably give their position inside the
// packed index -- TODO confirm against the loader.
EList<index_t> _zOffs;
EList<index_t> _zGbwtByteOffs;
EList<int> _zGbwtBpOffs;
index_t _nPat; /// number of reference texts
index_t _nFrag; /// number of fragments
APtrWrap<index_t> _plen; // per-pattern lengths, _nPat entries (filled by joinToDisk)
APtrWrap<index_t> _rstarts; // starting offset of fragments / text indexes
// _fchr, _ftab and _eftab are expected to be relatively small
// (usually < 1MB, perhaps a few MB if _fchr is particularly large
// - like, say, 11). For this reason, we don't bother with writing
// them to disk through separate output streams.
// NOTE(review): the original sentence was cut off after "we"; the "11"
// above presumably refers to ftabChars rather than _fchr -- confirm.
APtrWrap<index_t> _fchr;
APtrWrap<index_t> _ftab;
APtrWrap<index_t> _eftab; // "extended" entries for _ftab
// _offs may be extremely large. E.g. for DNA w/ offRate=4 (one
// offset every 16 rows), the total size of _offs is the same as
// the total size of the input sequence
APtrWrap<index_t> _offs;
// _gfm holds the packed index itself (allocated with _gbwtTotLen bytes
// in buildToDisk), and thus is at least as large as the input sequence.
APtrWrap<uint8_t> _gfm;
bool _useMm; /// use memory-mapped files to hold the index
bool useShmem_; /// use shared memory to hold large parts of the index
EList<string> _refnames; /// names of the reference sequences
EList<string> _refnames_nospace; // names of the reference sequences (names stop at space)
char *mmFile1_;
char *mmFile2_;
int _nthreads;
GFMParams<index_t> _gh; // index parameters (lengths, side sizes, rates)
bool packed_;
// ---- Default values for index-building options ----
static const uint64_t default_bmax = INDEX_MAX;
static const uint64_t default_bmaxMultSqrt = INDEX_MAX;
static const uint64_t default_bmaxDivN = 4;
static const int default_dcv = 1024;
static const bool default_noDc = false;
static const bool default_useBlockwise = true;
static const uint32_t default_seed = 0;
// Line rates are one larger when building with 64-bit index words
#ifdef BOWTIE_64BIT_INDEX
static const int default_lineRate_gfm = 8;
static const int default_lineRate_fm = 7;
#else
static const int default_lineRate_gfm = 7;
static const int default_lineRate_fm = 6;
#endif
static const int default_offRate = 5;
static const int default_offRatePlus = 0;
static const int default_ftabChars = 10;
static const bool default_bigEndian = false;
// data used to build an index
EList<ALT<index_t> > _alts;
EList<string> _altnames; // parallel to _alts (same length, see inMemoryRepOk)
EList<Haplotype<index_t> > _haplotypes;
RepeatDB<index_t> _repeatdb;
EList<RB_KmerTable> _repeat_kmertables;
bool _repeat;
EList<pair<index_t, index_t> > _repeatLens;
EList<uint8_t> _repeatIncluded;
protected:
/// Stream that diagnostic output is written to (currently always cerr)
ostream& log() const {
    ostream& sink = cerr; // TODO: turn this into a parameter
    return sink;
}
/// Print a verbose message to log() and flush it (flushing is helpful
/// for debugging).  Does nothing unless verbose mode is on.
void verbose(const string& s) const {
    if(!this->verbose()) {
        return;
    }
    this->log() << s.c_str();
    this->log().flush();
}
};
/**
 * Read reference names from an input stream 'in' positioned at the
 * start of a primary index file and append them to 'refnames'.
 *
 * The header is parsed to learn the sizes of the sections that precede
 * the names (plen, rstarts, the packed index, zOffs, fchr, ftab, eftab);
 * those sections are skipped with seekg.  Names are then read one
 * character at a time: '\n' starts a new name, and '\0' or end-of-file
 * ends the list.  A trailing empty name is dropped.
 *
 * On return the stream's error flags are cleared and its get cursor is
 * rewound to the beginning.
 *
 * @param in        stream over the primary index file; must be good()
 *                  with the get cursor at the beginning
 * @param refnames  list that the names are appended to
 */
template <typename index_t>
void readEbwtRefnames(istream& in, EList<string>& refnames) {
    // 'in' must already be open with the get cursor at the
    // beginning and no error flags set.
    assert(in.good());
    assert_eq((streamoff)in.tellg(), ios::beg);
    // Read the endianness hint from the stream
    bool switchEndian = false;
    uint32_t one = readU32(in, switchEndian); // 1st word of primary stream
    if(one != 1) {
        assert_eq((1u<<24), one);
        switchEndian = true;
    }
    readU32(in, switchEndian); // version
    // Reads header entries one by one from primary stream
    index_t len = readIndex<index_t>(in, switchEndian);
    index_t gbwtLen = readIndex<index_t>(in, switchEndian);
    index_t numNodes = readIndex<index_t>(in, switchEndian);
    int32_t lineRate = readI32(in, switchEndian);
    /*int32_t linesPerSide =*/ readI32(in, switchEndian);
    int32_t offRate = readI32(in, switchEndian);
    int32_t ftabChars = readI32(in, switchEndian);
    index_t eftabLen = readIndex<index_t>(in, switchEndian);
    // BTL: chunkRate is now deprecated
    int32_t flags = readI32(in, switchEndian);
    bool entireReverse = false;
    if(flags < 0) {
        entireReverse = (((-flags) & GFM_ENTIRE_REV) != 0);
    }
    // Create a new GFMParams from the entries read from primary stream;
    // it supplies the derived section sizes used for skipping below
    GFMParams<index_t> gh(len, gbwtLen, numNodes, lineRate, offRate, ftabChars, eftabLen, entireReverse);
    index_t nPat = readIndex<index_t>(in, switchEndian); // nPat
    in.seekg(nPat*sizeof(index_t), ios_base::cur); // skip plen
    // Skip rstarts (three index_t values per fragment)
    index_t nFrag = readIndex<index_t>(in, switchEndian);
    in.seekg(nFrag*sizeof(index_t)*3, ios_base::cur);
    // Skip the packed index
    in.seekg(gh._gbwtTotLen, ios_base::cur);
    // Skip zOff from primary stream
    index_t numZOffs = readIndex<index_t>(in, switchEndian);
    in.seekg(numZOffs * sizeof(index_t), ios_base::cur);
    // Skip fchr
    in.seekg(5 * sizeof(index_t), ios_base::cur);
    // Skip ftab
    in.seekg(gh._ftabLen*sizeof(index_t), ios_base::cur);
    // Skip eftab
    in.seekg(gh._eftabLen*sizeof(index_t), ios_base::cur);
    // Read reference sequence names from primary index file
    while(true) {
        char c = '\0';
        in.read(&c, 1);
        if(in.eof()) break;
        if(c == '\0') break;
        else if(c == '\n') {
            refnames.push_back("");
        } else {
            if(refnames.size() == 0) {
                refnames.push_back("");
            }
            refnames.back().push_back(c);
        }
    }
    // Drop a trailing empty name.  The list may legitimately be empty
    // here (no names stored, or immediate EOF), in which case back()
    // must not be touched.
    if(!refnames.empty() && refnames.back().empty()) {
        refnames.pop_back();
    }
    // Be kind
    in.clear(); in.seekg(0, ios::beg);
    assert(in.good());
}
/**
 * Read reference names from the index with basename 'instr' and store
 * them in 'refnames'.  The primary index file is expected at
 * instr + ".1." + gfm_ext; GFMFileOpenException is thrown if it cannot
 * be opened.
 */
template <typename index_t>
void readEbwtRefnames(const string& instr, EList<string>& refnames) {
    // Open the primary index file in binary mode
    ifstream in((instr + ".1." + gfm_ext).c_str(), ios_base::in | ios::binary);
    if(!in.is_open()) {
        throw GFMFileOpenException("Cannot open file " + instr);
    }
    assert(in.is_open());
    assert(in.good());
    assert_eq((streamoff)in.tellg(), ios::beg);
    // Delegate the actual parsing to the stream-based overload
    readEbwtRefnames<index_t>(in, refnames);
}
///////////////////////////////////////////////////////////////////////
//
// Functions for building Ebwts
//
///////////////////////////////////////////////////////////////////////
/**
 * Concatenate the strings in 'l', in order, into a single string and
 * return it.  Each input string is expected to be non-empty.
 *
 * This static overload only produces the joined string; it builds none
 * of the auxiliary arrays.
 */
template <typename index_t>
template <typename TStr>
TStr GFM<index_t>::join(EList<TStr>& l, uint32_t seed) {
    RandomSource rand; // reproducible given same seed
    rand.init(seed);
    // First pass: total length, so the result is sized exactly once
    index_t totalLen = 0;
    for(index_t k = 0; k < l.size(); k++) {
        totalLen += length(l[k]);
    }
    TStr joined;
    joined.resize(totalLen);
    // Second pass: copy characters piece by piece
    index_t dst = 0;
    for(size_t k = 0; k < l.size(); k++) {
        TStr& piece = l[k];
        assert_gt(piece.length(), 0);
        for(size_t j = 0; j < piece.size(); j++) {
            joined.set(piece[j], dst);
            dst++;
        }
    }
    return joined;
}
/**
 * Read every sequence fragment from the file buffers in 'l' and append
 * it to 's' via fastaRefReadAppend.  's' is resized up front to sztot
 * characters, or to twice that when include_rc is set.
 *
 * When CGtoTG is set, every character 1 that is immediately followed by
 * a character 2 (i.e. the C of a CG) within the first sztot characters
 * is replaced by 3 (T).  When include_rc is set, the reverse complement
 * of the first sztot characters is written to the second half of 's'.
 *
 * In debug builds each record read is checked against 'szs'.
 */
template <typename index_t>
template <typename TStr>
void GFM<index_t>::join(EList<FileBuf*>& l,
                        EList<RefRecord>& szs,
                        index_t sztot,
                        const RefReadInParams& refparams,
                        uint32_t seed,
                        TStr& s,
                        bool include_rc,
                        bool CGtoTG)
{
    RandomSource rand; // reproducible given same seed
    rand.init(seed);
    RefReadInParams rpcp = refparams;
    const index_t fwLen = sztot;
    if(!include_rc) {
        s.resize(fwLen);
    } else {
        // Leave room for the reverse complement
        s.resize(fwLen << 1);
    }
    ASSERT_ONLY(index_t szsi = 0);
    TIndexOffU dstoff = 0;
    for(index_t fi = 0; fi < l.size(); fi++) {
        // For each sequence we can pull out of istream l[fi]...
        assert(!l[fi]->eof());
        bool first = true;
        while(!l[fi]->eof()) {
            RefRecord rec = fastaRefReadAppend(*l[fi], first, s, dstoff, rpcp);
            first = false;
            assert_eq(rec.off, szs[szsi].off);
            assert_eq(rec.len, szs[szsi].len);
            assert_eq(rec.first, szs[szsi].first);
            ASSERT_ONLY(szsi++);
        }
    }
    // Change 'C' in CG to 'T' so that CG becomes TG
    if(CGtoTG) {
        for(TIndexOffU pos = 0; pos + 1 < fwLen; pos++) {
            const int here = s[pos];
            const int next = s[pos+1];
            if(here == 1 && next == 2) {
                s[pos] = 3;
            }
        }
    }
    // Append reverse complement
    if(include_rc) {
        for(TIndexOffU k = 0; k < fwLen; k++) {
            const int base = s[fwLen - k - 1];
            assert_range(0, 3, base);
            s[fwLen + k] = dnacomp[base];
        }
    }
}
/**
 * Read every sequence fragment from the file buffers in 'l', appending
 * the sequence to 'ret' (via fastaRefReadAppend), and write the
 * join-related values to 'out1': the number of patterns (_nPat), one
 * plen entry per pattern, and the number of fragments (_nFrag).
 *
 * Also sets _nPat, _nFrag, allocates and fills _plen, resets _rstarts
 * and appends one entry to _refnames for every non-empty "first"
 * fragment (an empty name is replaced by the running sequence number).
 * Every file buffer is reset() once it has been consumed.
 *
 * It is assumed, but not required, that the header values have already
 * been written to 'out1' before this function is called.
 *
 * The static member join just returns a joined version of a list of
 * strings without building any of the auxiliary arrays.
 *
 * @param l          file buffers holding the reference sequences
 * @param szs        one record per fragment, in reading order
 * @param sztot      total sequence length (only checked to be > 0)
 * @param refparams  reference-reading parameters (copied locally)
 * @param ret        receives the joined sequence
 * @param out1       primary output stream
 * @param out2       secondary output stream; not used by this function
 * @throws bad_alloc if the plen array cannot be allocated
 */
template <typename index_t>
template <typename TStr>
void GFM<index_t>::joinToDisk(
    EList<FileBuf*>& l,
    EList<RefRecord>& szs,
    index_t sztot,
    const RefReadInParams& refparams,
    TStr& ret,
    ostream& out1,
    ostream& out2)
{
    RefReadInParams rpcp = refparams;
    assert_gt(szs.size(), 0);
    assert_gt(l.size(), 0);
    assert_gt(sztot, 0);
    // Not every fragment represents a distinct sequence - many
    // fragments may correspond to a single sequence.  Count the
    // number of sequences here by counting the number of "first"
    // fragments.
    this->_nPat = 0;
    this->_nFrag = 0;
    for(index_t i = 0; i < szs.size(); i++) {
        if(szs[i].len > 0) this->_nFrag++;
        if(szs[i].first && szs[i].len > 0) this->_nPat++;
    }
    assert_gt(this->_nPat, 0);
    assert_geq(this->_nFrag, this->_nPat);
    _rstarts.reset();
    writeIndex<index_t>(out1, this->_nPat, this->toBe());
    // Allocate plen[]
    try {
        this->_plen.init(new index_t[this->_nPat], this->_nPat);
    } catch(bad_alloc&) {
        cerr << "Out of memory allocating plen[] in Ebwt::join()"
             << " at " << __FILE__ << ":" << __LINE__ << endl;
        // Rethrow the in-flight exception rather than a copy of it
        throw;
    }
    // For each pattern, set plen.  A finished pattern's length is
    // written out as soon as the next pattern starts.
    int npat = -1;
    for(index_t i = 0; i < szs.size(); i++) {
        if(szs[i].first && szs[i].len > 0) {
            if(npat >= 0) {
                writeIndex<index_t>(out1, this->plen()[npat], this->toBe());
            }
            npat++;
            this->plen()[npat] = (szs[i].len + szs[i].off);
        } else if(npat >= 0) {
            this->plen()[npat] += (szs[i].len + szs[i].off);
        }
        // Otherwise this record precedes the first non-empty pattern
        // (e.g. a leading sequence with no unambiguous bases); there is
        // no plen slot to add it to, and indexing plen()[-1] would
        // write outside the array.
    }
    assert_eq((index_t)npat, this->_nPat-1);
    // Write the last pattern's length (if there was any pattern at all)
    if(npat >= 0) {
        writeIndex<index_t>(out1, this->plen()[npat], this->toBe());
    }
    // Write the number of fragments
    writeIndex<index_t>(out1, this->_nFrag, this->toBe());
    index_t seqsRead = 0;
    ASSERT_ONLY(index_t szsi = 0);
    ASSERT_ONLY(index_t entsWritten = 0);
    index_t dstoff = 0;
    // For each filebuf
    for(unsigned int i = 0; i < l.size(); i++) {
        assert(!l[i]->eof());
        bool first = true;
        index_t patoff = 0;
        // For each *fragment* (not necessary an entire sequence) we
        // can pull out of istream l[i]...
        while(!l[i]->eof()) {
            // Push a new name onto our vector; fastaRefReadAppend fills
            // it in through the pointer
            _refnames.push_back("");
            RefRecord rec = fastaRefReadAppend(
                *l[i], first, ret, dstoff, rpcp, &_refnames.back());
            first = false;
            index_t bases = rec.len;
            if(rec.first && rec.len > 0) {
                if(_refnames.back().length() == 0) {
                    // If name was empty, replace with an index
                    ostringstream stm;
                    stm << seqsRead;
                    _refnames.back() = stm.str();
                }
            } else {
                // This record didn't actually start a new sequence so
                // no need to add a name
                //assert_eq(0, _refnames.back().length());
                _refnames.pop_back();
            }
            // Increment seqsRead if this is the first fragment
            if(rec.first && rec.len > 0) seqsRead++;
            assert_lt(szsi, szs.size());
            assert_eq(rec.off, szs[szsi].off);
            assert_eq(rec.len, szs[szsi].len);
            assert_eq(rec.first, szs[szsi].first);
            assert(rec.first || rec.off > 0);
            ASSERT_ONLY(szsi++);
            // Only meaningful once a pattern has been started;
            // plen()[seqsRead-1] would underflow otherwise
            if(seqsRead > 0) {
                assert_leq(bases, this->plen()[seqsRead-1]);
            }
            // Reset the patoff if this is the first fragment
            if(rec.first) patoff = 0;
            patoff += rec.off; // add fragment's offset from end of last frag.
            // Adjust rpcps
            //index_t seq = seqsRead-1;
#ifndef NDEBUG
            if(bases > 0) {
                ASSERT_ONLY(entsWritten++);
            }
#endif
            // This is where rstarts elements are written to the output stream
            //writeU32(out1, oldRetLen, this->toBe()); // offset from beginning of joined string
            //writeU32(out1, seq, this->toBe()); // sequence id
            //writeU32(out1, patoff, this->toBe()); // offset into sequence
            patoff += (index_t)bases;
        }
        assert_gt(szsi, 0);
        l[i]->reset();
        assert(!l[i]->eof());
#ifndef NDEBUG
        // After a reset the buffer should start with a FASTA header
        int c = l[i]->get();
        assert_eq('>', c);
        assert(!l[i]->eof());
        l[i]->reset();
        assert(!l[i]->eof());
#endif
    }
    assert_eq(entsWritten, this->_nFrag);
}
/**
 * Build a graph FM index from the path graph 'gbwt' and stream it to
 * disk.  Rows are pulled one at a time with gbwt.nextRow(); each row
 * supplies a GBWT character, an F bit, an M bit and a position on the
 * joined string.
 *
 * Each side written to 'out1' is sideSz bytes.  Within the first
 * gh._sideGbwtSz bytes, the packed 2-bit characters occupy the first
 * half, the F bits start at byte _sideGbwtSz/2 and the M bits start
 * _sideGbwtSz/4 bytes after that (see F_sideCur / M_sideCur below).
 * The last six index_t slots of the side hold the tallies that were
 * current when the side began: F location, number of 1s in M, and the
 * occurrence counts of A, C, G and T.
 *
 * Written to 'out1', in order: gbwtLen and numNodes (back-patched into
 * the header), the sides, the number of zOffs followed by each zOff,
 * the five fchr entries, ftab, eftabLen (back-patched into the header)
 * and eftab.  Written to 'out2': one sampled joined-string position for
 * every row with M == 1 whose M rank passes the gh._offMask test.
 *
 * A copy of the sides is kept in _gfm while the function runs because
 * ftab is computed by LF-mapping over it; that copy, _fchr and the zOff
 * members are released again before returning.
 *
 * Only the non-SIXTY4_FORMAT, forward-bucket path is implemented; the
 * other paths print "Not implemented" and exit.
 *
 * @param gbwt       path graph that yields the GBWT rows
 * @param s          the joined reference string (only consulted by
 *                   sanity checks here)
 * @param out1       primary output stream; must be seekable
 * @param out2       secondary output stream (offset samples)
 * @param headerPos  position of the gbwtLen header field in out1, or a
 *                   negative value to use the default header layout
 */
template <typename index_t>
template <typename TStr>
void GFM<index_t>::buildToDisk(
    PathGraph<index_t>& gbwt,
    const TStr& s,
    ostream& out1,
    ostream& out2,
    streampos headerPos)
{
    const GFMParams<index_t>& gh = this->_gh;
    assert(gh.repOk());
    assert_lt(s.length(), gh.gbwtLen());
    assert_eq(s.length(), gh._len);
    assert_gt(gh._lineRate, 3);

    index_t gbwtLen = gh._gbwtLen;
    // Back-patch gbwtLen and numNodes into the header, then return to
    // the current write position
    streampos out1pos = out1.tellp();
    if(headerPos < 0) {
        out1.seekp(8 + sizeof(index_t));
    } else {
        out1.seekp(headerPos);
    }
    writeIndex<index_t>(out1, gbwtLen, this->toBe());
    writeIndex<index_t>(out1, gh._numNodes, this->toBe());
    out1.seekp(out1pos);

    index_t ftabLen = gh._ftabLen;
    index_t sideSz = gh._sideSz;
    index_t gbwtTotSz = gh._gbwtTotSz;
    index_t fchr[] = {0, 0, 0, 0, 0};
    EList<index_t> ftab(EBWT_CAT);
    EList<index_t> zOffs;
    // Save # of occurrences of each character as we walk along the bwt
    index_t occ[4] = {0, 0, 0, 0};
    index_t occSave[4] = {0, 0, 0, 0};
    // # of occurrences of 1 in M arrays
    index_t M_occ = 0, M_occSave = 0;
    // Location in F that corresponds to 1 in M
    index_t F_loc = 0, F_locSave = 0;
    try {
        VMSG_NL("Allocating ftab, absorbFtab");
        ftab.resize(ftabLen);
        ftab.fillZero();
    } catch(bad_alloc &e) {
        cerr << "Out of memory allocating ftab[] "
             << "in GFM::buildToDisk() at " << __FILE__ << ":"
             << __LINE__ << endl;
        throw e;
    }
    // Allocate the side buffer; holds a single side as its being
    // constructed and then written to disk.  Reused across all sides.
#ifdef SIXTY4_FORMAT
    EList<uint64_t> gfmSide(EBWT_CAT);
#else
    EList<uint8_t> gfmSide(EBWT_CAT);
#endif
    try {
        // Used to calculate ftab and eftab, but having gfm costs a lot of memory
        _gfm.init(new uint8_t[gh._gbwtTotLen], gh._gbwtTotLen, true);
#ifdef SIXTY4_FORMAT
        gfmSide.resize(sideSz >> 3);
#else
        gfmSide.resize(sideSz);
#endif
    } catch(bad_alloc &e) {
        cerr << "Out of memory allocating ebwtSide[] in "
             << "GFM::buildToDisk() at " << __FILE__ << ":"
             << __LINE__ << endl;
        throw e;
    }

    // Points to the base offset within ebwt for the side currently
    // being written
    index_t side = 0;
    // Whether we're assembling a forward or a reverse bucket
    bool fw = true;
    int sideCur = 0;
    index_t si = 0; // row offset (chars)
    ASSERT_ONLY(bool inSA = true); // true iff si still points inside the
                                   // GBWT (as opposed to the padding at
                                   // the end)
    // Iterate over packed bwt bytes
    VMSG_NL("Entering GFM loop");
    ASSERT_ONLY(index_t beforeGbwtOff = (index_t)out1.tellp());
    while(side < gbwtTotSz) {
        // Sanity-check our cursor into the side buffer
        assert_geq(sideCur, 0);
        assert_lt(sideCur, (int)gh._sideGbwtSz);
        assert_eq(0, side % sideSz); // 'side' must be on side boundary
        if(sideCur == 0) {
            // Starting a fresh side: clear the character, F and M regions
            memset(gfmSide.ptr(), 0, gh._sideGbwtSz);
            gfmSide[sideCur] = 0; // clear
        }
        assert_lt(side + sideCur, gbwtTotSz);
        // Iterate over bit-pairs in the si'th character of the BWT
#ifdef SIXTY4_FORMAT
        for(int bpi = 0; bpi < 32; bpi++, si++)
#else
        for(int bpi = 0; bpi < 4; bpi++, si++)
#endif
        {
            int gbwtChar = 0; // one of A, C, G, T, and Z
            int F= 0, M = 0;  // either 0 or 1
            index_t pos = 0;  // pos on joined string
            bool count = true;
            if(si < gbwtLen) {
                gbwt.nextRow(gbwtChar, F, M, pos);
                // (that might have triggered sa to calc next suf block)
                if(gbwtChar == 'Z') {
                    // Don't add the 'Z' in the last column to the BWT
                    // transform; we can't encode a $ (only A C T or G)
                    // and counting it as, say, an A, will mess up the
                    // LF mapping
                    gbwtChar = 0; count = false;
#ifndef NDEBUG
                    if(zOffs.size() > 0) {
                        assert_gt(si, zOffs.back());
                    }
#endif
                    zOffs.push_back(si); // remember GBWT row that corresponds to the 0th suffix
                } else {
                    gbwtChar = asc2dna[gbwtChar];
                    assert_lt(gbwtChar, 4);
                    // Update the fchr
                    fchr[gbwtChar]++;
                }
                assert_lt(F, 2);
                assert_lt(M, 2);
                if(M == 1) {
                    assert_neq(F_loc, numeric_limits<index_t>::max());
                    F_loc = gbwt.nextFLocation();
#ifndef NDEBUG
                    if(F_loc > 0) {
                        assert_gt(F_loc, F_locSave);
                    }
#endif
                }
                // Suffix array offset boundary? - update offset array.
                // M_occ has not been bumped for this row yet, so it is
                // the rank of this row among rows with M == 1.
                if(M == 1 && (M_occ & gh._offMask) == M_occ) {
                    assert_lt((M_occ >> gh._offRate), gh._offsLen);
                    // Write offsets directly to the secondary output
                    // stream, thereby avoiding keeping them in memory
                    writeIndex<index_t>(out2, pos, this->toBe());
                }
            } else {
                // Strayed off the end of the SA, now we're just
                // padding out a bucket
#ifndef NDEBUG
                if(inSA) {
                    // Assert that we wrote all the characters in the
                    // string before now
                    assert_eq(si, gbwtLen);
                    inSA = false;
                }
#endif
                // 'A' used for padding; important that padding be
                // counted in the occ[] array
                gbwtChar = 0;
                F = M = 0;
            }
            if(count) occ[gbwtChar]++;
            if(M) M_occ++;
            // Append BWT char to bwt section of current side
            if(fw) {
                // Forward bucket: fill from least to most
#ifdef SIXTY4_FORMAT
                gfmSide[sideCur] |= ((uint64_t)gbwtChar << (bpi << 1));
                if(gbwtChar > 0) assert_gt(gfmSide[sideCur], 0);
                // To be implemented ...
                assert(false);
                cerr << "Not implemented" << endl;
                exit(1);
#else
                pack_2b_in_8b(gbwtChar, gfmSide[sideCur], bpi);
                assert_eq((gfmSide[sideCur] >> (bpi*2)) & 3, gbwtChar);
                // Two character bytes (8 rows) share one F byte and one
                // M byte; odd character bytes use the upper four bits
                int F_sideCur = (gh._sideGbwtSz + sideCur) >> 1;
                int F_bpi = bpi + ((sideCur & 0x1) << 2); // Can be used as M_bpi as well
                pack_1b_in_8b(F, gfmSide[F_sideCur], F_bpi);
                assert_eq((gfmSide[F_sideCur] >> F_bpi) & 1, F);
                int M_sideCur = F_sideCur + (gh._sideGbwtSz >> 2);
                pack_1b_in_8b(M, gfmSide[M_sideCur], F_bpi);
                assert_eq((gfmSide[M_sideCur] >> F_bpi) & 1, M);
#endif
            } else {
                // Backward bucket: fill from most to least
#ifdef SIXTY4_FORMAT
                gfmSide[sideCur] |= ((uint64_t)gbwtChar << ((31 - bpi) << 1));
                if(gbwtChar > 0) assert_gt(gfmSide[sideCur], 0);
                // To be implemented ...
                assert(false);
                cerr << "Not implemented" << endl;
                exit(1);
#else
                pack_2b_in_8b(gbwtChar, gfmSide[sideCur], 3-bpi);
                assert_eq((gfmSide[sideCur] >> ((3-bpi)*2)) & 3, gbwtChar);
                // To be implemented ...
                assert(false);
                cerr << "Not implemented" << endl;
                exit(1);
#endif
            }
        } // end loop over bit-pairs
        assert_eq(0, (occ[0] + occ[1] + occ[2] + occ[3] + zOffs.size()) & 3);
#ifdef SIXTY4_FORMAT
        assert_eq(0, si & 31);
#else
        assert_eq(0, si & 3);
#endif
        sideCur++;
        // Characters fill only the first half of the _sideGbwtSz region
        if((sideCur << 1) == (int)gh._sideGbwtSz) {
            sideCur = 0;
            index_t *uside = reinterpret_cast<index_t*>(gfmSide.ptr());
            // Write 'A', 'C', 'G', 'T', and '1' in M tallies
            side += sideSz;
            assert_leq(side, gh._gbwtTotSz);
            uside[(sideSz / sizeof(index_t))-6] = endianizeIndex(F_locSave, this->toBe());
            uside[(sideSz / sizeof(index_t))-5] = endianizeIndex(M_occSave, this->toBe());
            uside[(sideSz / sizeof(index_t))-4] = endianizeIndex(occSave[0], this->toBe());
            uside[(sideSz / sizeof(index_t))-3] = endianizeIndex(occSave[1], this->toBe());
            uside[(sideSz / sizeof(index_t))-2] = endianizeIndex(occSave[2], this->toBe());
            uside[(sideSz / sizeof(index_t))-1] = endianizeIndex(occSave[3], this->toBe());
            // The tallies stored with the next side are the totals so far
            F_locSave = F_loc;
            M_occSave = M_occ;
            occSave[0] = occ[0];
            occSave[1] = occ[1];
            occSave[2] = occ[2];
            occSave[3] = occ[3];
            // Write the finished side to the primary file
            out1.write((const char *)gfmSide.ptr(), sideSz);
            // Keep an in-memory copy; needed below to compute ftab
            memcpy(((char*)_gfm.get()) + side - sideSz, (const char *)gfmSide.ptr(), sideSz);
        }
    }
    VMSG_NL("Exited GFM loop");
    // Assert that our loop counter got incremented right to the end
    assert_eq(side, gh._gbwtTotSz);
    // Assert that we wrote the expected amount to out1
    assert_eq(((index_t)out1.tellp() - beforeGbwtOff), gh._gbwtTotSz);

    //
    // Write zOffs to primary stream
    //
    assert_gt(zOffs.size(), 0);
    writeIndex<index_t>(out1, (index_t)zOffs.size(), this->toBe());
    for(size_t i = 0; i < zOffs.size(); i++) {
        writeIndex<index_t>(out1, zOffs[i], this->toBe());
    }

    //
    // Finish building fchr
    //
    // Exclusive prefix sum on fchr
    for(int i = 1; i < 4; i++) {
        fchr[i] += fchr[i-1];
    }
    assert_lt(fchr[3], gbwtLen);
    // Shift everybody up by one
    for(int i = 4; i >= 1; i--) {
        fchr[i] = fchr[i-1];
    }
    fchr[0] = 0;
    if(_verbose) {
        for(int i = 0; i < 5; i++)
            cerr << "fchr[" << "ACGT$"[i] << "]: " << fchr[i] << endl;
    }
    // Write fchr to primary file
    for(int i = 0; i < 5; i++) {
        writeIndex<index_t>(out1, fchr[i], this->toBe());
    }
    // Temporarily install fchr and zOffs as members so that the
    // LF-mapping calls below can run against the in-memory sides
    _fchr.init(new index_t[5], 5, true);
    memcpy(_fchr.get(), fchr, sizeof(index_t) * 5);
    // Initialize _zGbwtByteOffs and _zGbwtBpOffs
    _zOffs = zOffs;
    postReadInit(gh);

    // Build ftab and eftab: for every ftab key, LF-map its characters
    // (least significant bit pair first) starting from the full range
    EList<pair<index_t, index_t> > tFtab;
    tFtab.resizeExact(ftabLen - 1);
    for(index_t i = 0; i + 1 < ftabLen; i++) {
        index_t q = i;
        pair<index_t, index_t> range(0, gh._gbwtLen);
        SideLocus<index_t> tloc, bloc;
        SideLocus<index_t>::initFromTopBot(range.first, range.second, gh, gfm(), tloc, bloc);
        index_t j = 0;
        for(; j < (index_t)gh._ftabChars; j++) {
            int nt = q & 0x3; q >>= 2;
            if(bloc.valid()) {
                range = mapGLF(tloc, bloc, nt);
            } else {
                range = mapGLF1(range.first, tloc, nt);
            }
            if(range.first == (index_t)INDEX_MAX || range.first >= range.second) {
                break;
            }
            if(range.first + 1 == range.second) {
                tloc.initFromRow(range.first, gh, gfm());
                bloc.invalidate();
            } else {
                SideLocus<index_t>::initFromTopBot(range.first, range.second, gh, gfm(), tloc, bloc);
            }
        }
        if(range.first >= range.second || j < (index_t)gh._ftabChars) {
            // Key does not occur: record an empty range that starts
            // where the previous key's range ended
            if(i == 0) {
                tFtab[i].first = tFtab[i].second = 0;
            } else {
                tFtab[i].first = tFtab[i].second = tFtab[i-1].second;
            }
        } else {
            tFtab[i].first = range.first;
            tFtab[i].second = range.second;
        }
#ifndef NDEBUG
        if(gbwt.ftab.size() > i) {
            assert_eq(tFtab[i].first, gbwt.ftab[i].first);
            assert_eq(tFtab[i].second, gbwt.ftab[i].second);
        }
#endif
    }
    // Clear memory
    _gfm.reset();
    _fchr.reset();
    _zOffs.clear();
    _zGbwtByteOffs.clear();
    _zGbwtBpOffs.clear();

    //
    // Finish building ftab and build eftab
    //
    // An eftab pair is needed wherever one key's range does not begin
    // exactly where the previous key's range ended
    index_t eftabLen = 0;
    for(index_t i = 1; i + 1 < ftabLen; i++) {
        if(tFtab[i-1].second != tFtab[i].first) {
            eftabLen += 2;
        }
    }
    // Detect wrap-around of the index type
    if(gh._gbwtLen + (eftabLen >> 1) < gh._gbwtLen) {
        cerr << "Too many eftab entries: "
             << gh._gbwtLen << " + " << (eftabLen >> 1)
             << " > " << (index_t)INDEX_MAX << endl;
        throw 1;
    }
    EList<index_t> eftab(EBWT_CAT);
    try {
        eftab.resize(eftabLen);
        eftab.fillZero();
    } catch(bad_alloc &e) {
        cerr << "Out of memory allocating eftab[] "
             << "in GFM::buildToDisk() at " << __FILE__ << ":"
             << __LINE__ << endl;
        throw e;
    }
    index_t eftabCur = 0;
    ftab[0] = tFtab[0].first;
    ftab[1] = tFtab[0].second;
    for(index_t i = 1; i + 1 < ftabLen; i++) {
        if(ftab[i] != tFtab[i].first) {
            index_t lo = ftab[i];
            index_t hi = tFtab[i].first;
            assert_lt(eftabCur*2+1, eftabLen);
            eftab[eftabCur*2] = lo;
            eftab[eftabCur*2+1] = hi;
            // one node can be shared, and one node can have at most four incoming edges
            assert_leq(lo, hi + 4);
            ftab[i] = (eftabCur++) ^ (index_t)INDEX_MAX; // insert pointer into eftab
            assert_eq(lo, GFM<index_t>::ftabLo(ftab.ptr(), eftab.ptr(), gbwtLen, ftabLen, eftabLen, i));
            assert_eq(hi, GFM<index_t>::ftabHi(ftab.ptr(), eftab.ptr(), gbwtLen, ftabLen, eftabLen, i));
        }
        ftab[i+1] = tFtab[i].second;
    }
#ifndef NDEBUG
    for(index_t i = 0; i + 1 < ftabLen; i++ ){
        assert_eq(tFtab[i].first, GFM<index_t>::ftabHi(ftab.ptr(), eftab.ptr(), gbwtLen, ftabLen, eftabLen, i));
        assert_eq(tFtab[i].second, GFM<index_t>::ftabLo(ftab.ptr(), eftab.ptr(), gbwtLen, ftabLen, eftabLen, i+1));
    }
#endif
    // Write ftab to primary file
    for(index_t i = 0; i < ftabLen; i++) {
        writeIndex<index_t>(out1, ftab[i], this->toBe());
    }
    // Back-patch eftabLen into the header, then write eftab itself
    out1pos = out1.tellp();
    if(headerPos < 0) {
        out1.seekp(24 + sizeof(index_t) * 3);
    } else {
        // Do the arithmetic on the stream position itself: narrowing
        // headerPos to int would truncate offsets of 2 GiB and beyond
        out1.seekp(headerPos + (streamoff)(16 + sizeof(index_t) * 2));
    }
    writeIndex<index_t>(out1, eftabLen, this->toBe());
    out1.seekp(out1pos);
    for(index_t i = 0; i < eftabLen; i++) {
        writeIndex<index_t>(out1, eftab[i], this->toBe());
    }
    // Note: if you'd like to sanity-check the Ebwt, you'll have to
    // read it back into memory first!
    assert(!isInMemory());
    VMSG_NL("Exiting GFM::buildToDisk()");
}
/**
 * Build a linear FM index from the string 's' and its suffix array
 * 'sa' (which might actually be a suffix array *builder* that builds
 * blocks of the array on demand).  The bulk of the index, i.e. the BWT
 * sides and the offs array, is written directly to disk.  This is by
 * design: keeping those arrays in memory needlessly increases the
 * footprint of the building process.
 *
 * Each side written to 'out1' is sideSz bytes: gh._sideGbwtSz bytes of
 * packed 2-bit BWT characters, with the last four index_t slots
 * holding the occurrence counts of A, C, G and T that were current
 * when the side began.
 *
 * Written to 'out1', in order: gbwtLen and numNodes (back-patched into
 * the header), the sides, the number of zOffs (always 1 here) and the
 * zOff, the five fchr entries, ftab, eftabLen (back-patched into the
 * header) and eftab.  Written to 'out2': the suffix array element of
 * every row whose index passes the gh._offMask test.
 *
 * Assume DNA/RNA/any alphabet with 4 or fewer elements.
 *
 * @param sa         the suffix array (builder) to convert
 * @param s          the original string
 * @param out1       primary output stream; must be seekable
 * @param out2       secondary output stream (offset samples)
 * @param headerPos  position of the gbwtLen header field in out1, or a
 *                   negative value to use the default header layout
 */
template <typename index_t>
template <typename TStr>
void GFM<index_t>::buildToDisk(
    InorderBlockwiseSA<TStr>& sa,
    const TStr& s,
    ostream& out1,
    ostream& out2,
    streampos headerPos)
{
    const GFMParams<index_t>& gh = this->_gh;
    assert(gh.repOk());
    assert(gh.linearFM());
    assert_lt(s.length(), gh.gbwtLen());
    assert_eq(s.length(), gh._len);
    assert_gt(gh._lineRate, 3);

    index_t len = gh._len;
    index_t gbwtLen = gh._gbwtLen;
    assert_eq(len + 1, gbwtLen);
    // Back-patch gbwtLen and numNodes into the header, then return to
    // the current write position
    streampos out1pos = out1.tellp();
    if(headerPos < 0) {
        out1.seekp(8 + sizeof(index_t));
    } else {
        out1.seekp(headerPos);
    }
    writeIndex<index_t>(out1, gbwtLen, this->toBe());
    writeIndex<index_t>(out1, gh._numNodes, this->toBe());
    out1.seekp(out1pos);

    index_t ftabLen = gh._ftabLen;
    index_t sideSz = gh._sideSz;
    index_t gbwtTotSz = gh._gbwtTotSz;
    index_t fchr[] = {0, 0, 0, 0, 0};
    EList<index_t> ftab(EBWT_CAT);
    EList<index_t> zOffs;
    // Save # of occurrences of each character as we walk along the bwt
    index_t occ[4] = {0, 0, 0, 0};
    index_t occSave[4] = {0, 0, 0, 0};
    // Record rows that should "absorb" adjacent rows in the ftab.
    // The absorbed rows represent suffixes shorter than the ftabChars
    // cutoff.
    uint8_t absorbCnt = 0;
    EList<uint8_t> absorbFtab(EBWT_CAT);
    try {
        VMSG_NL("Allocating ftab, absorbFtab");
        ftab.resize(ftabLen);
        ftab.fillZero();
        absorbFtab.resize(ftabLen);
        absorbFtab.fillZero();
    } catch(bad_alloc &e) {
        cerr << "Out of memory allocating ftab[] or absorbFtab[] "
             << "in GFM::buildToDisk() at " << __FILE__ << ":"
             << __LINE__ << endl;
        throw e;
    }
    // Allocate the side buffer; holds a single side as its being
    // constructed and then written to disk.  Reused across all sides.
#ifdef SIXTY4_FORMAT
    EList<uint64_t> gfmSide(EBWT_CAT);
#else
    EList<uint8_t> gfmSide(EBWT_CAT);
#endif
    try {
#ifdef SIXTY4_FORMAT
        gfmSide.resize(sideSz >> 3);
#else
        gfmSide.resize(sideSz);
#endif
    } catch(bad_alloc &e) {
        cerr << "Out of memory allocating gfmSide[] in "
             << "GFM::buildToDisk() at " << __FILE__ << ":"
             << __LINE__ << endl;
        throw e;
    }

    // Points to the base offset within ebwt for the side currently
    // being written
    index_t side = 0;
    // Whether we're assembling a forward or a reverse bucket
    bool fw = true;
    int sideCur = 0;
    // Have we skipped the '$' in the last column yet?
    ASSERT_ONLY(bool dollarSkipped = false);
    index_t si = 0; // string offset (chars)
    ASSERT_ONLY(index_t lastSufInt = 0);
    ASSERT_ONLY(bool inSA = true); // true iff saI still points inside suffix
                                   // array (as opposed to the padding at the
                                   // end)
    // Iterate over packed bwt bytes
    VMSG_NL("Entering GFM loop");
    ASSERT_ONLY(index_t beforeGbwtOff = (index_t)out1.tellp());
    while(side < gbwtTotSz) {
        // Sanity-check our cursor into the side buffer
        assert_geq(sideCur, 0);
        assert_lt(sideCur, (int)gh._sideGbwtSz);
        assert_eq(0, side % sideSz); // 'side' must be on side boundary
        gfmSide[sideCur] = 0; // clear
        assert_lt(side + sideCur, gbwtTotSz);
        // Iterate over bit-pairs in the si'th character of the BWT
#ifdef SIXTY4_FORMAT
        for(int bpi = 0; bpi < 32; bpi++, si++)
#else
        for(int bpi = 0; bpi < 4; bpi++, si++)
#endif
        {
            int bwtChar;
            bool count = true;
            if(si <= len) {
                // Still in the SA; extract the bwtChar
                index_t saElt = sa.nextSuffix();
                // (that might have triggered sa to calc next suf block)
                if(saElt == 0) {
                    // Don't add the '$' in the last column to the BWT
                    // transform; we can't encode a $ (only A C T or G)
                    // and counting it as, say, an A, will mess up the
                    // LR mapping
                    bwtChar = 0; count = false;
                    ASSERT_ONLY(dollarSkipped = true);
                    zOffs.push_back(si); // remember the SA row that
                                         // corresponds to the 0th suffix
                } else {
                    bwtChar = (int)(s[saElt-1]);
                    assert_lt(bwtChar, 4);
                    // Update the fchr
                    fchr[bwtChar]++;
                }
                // Update ftab
                if((len-saElt) >= (index_t)gh._ftabChars) {
                    // Turn the first ftabChars characters of the
                    // suffix into an integer index into ftab.  The
                    // leftmost (lowest index) character of the suffix
                    // goes in the most significant bit pair if the
                    // integer.
                    index_t sufInt = 0;
                    for(int i = 0; i < gh._ftabChars; i++) {
                        sufInt <<= 2;
                        assert_lt((index_t)i, len-saElt);
                        sufInt |= (unsigned char)(s[saElt+i]);
                    }
                    // Assert that this prefix-of-suffix is greater
                    // than or equal to the last one (true b/c the
                    // suffix array is sorted)
#ifndef NDEBUG
                    if(lastSufInt > 0) assert_geq(sufInt, lastSufInt);
                    lastSufInt = sufInt;
#endif
                    // Update ftab
                    assert_lt(sufInt+1, ftabLen);
                    ftab[sufInt+1]++;
                    if(absorbCnt > 0) {
                        // Absorb all short suffixes since the last
                        // transition into this transition
                        absorbFtab[sufInt] = absorbCnt;
                        absorbCnt = 0;
                    }
                } else {
                    // Otherwise if suffix is fewer than ftabChars
                    // characters long, then add it to the 'absorbCnt';
                    // it will be absorbed into the next transition
                    assert_lt(absorbCnt, 255);
                    absorbCnt++;
                }
                // Suffix array offset boundary? - update offset array
                if((si & gh._offMask) == si) {
                    assert_lt((si >> gh._offRate), gh._offsLen);
                    // Write offsets directly to the secondary output
                    // stream, thereby avoiding keeping them in memory
                    writeIndex<index_t>(out2, saElt, this->toBe());
                }
            } else {
                // Strayed off the end of the SA, now we're just
                // padding out a bucket
#ifndef NDEBUG
                if(inSA) {
                    // Assert that we wrote all the characters in the
                    // string before now
                    assert_eq(si, len+1);
                    inSA = false;
                }
#endif
                // 'A' used for padding; important that padding be
                // counted in the occ[] array
                bwtChar = 0;
            }
            if(count) occ[bwtChar]++;
            // Append BWT char to bwt section of current side
            if(fw) {
                // Forward bucket: fill from least to most
#ifdef SIXTY4_FORMAT
                gfmSide[sideCur] |= ((uint64_t)bwtChar << (bpi << 1));
                if(bwtChar > 0) assert_gt(gfmSide[sideCur], 0);
#else
                pack_2b_in_8b(bwtChar, gfmSide[sideCur], bpi);
                assert_eq((gfmSide[sideCur] >> (bpi*2)) & 3, bwtChar);
#endif
            } else {
                // Backward bucket: fill from most to least
#ifdef SIXTY4_FORMAT
                gfmSide[sideCur] |= ((uint64_t)bwtChar << ((31 - bpi) << 1));
                if(bwtChar > 0) assert_gt(gfmSide[sideCur], 0);
#else
                pack_2b_in_8b(bwtChar, gfmSide[sideCur], 3-bpi);
                assert_eq((gfmSide[sideCur] >> ((3-bpi)*2)) & 3, bwtChar);
#endif
            }
        } // end loop over bit-pairs
        assert_eq(dollarSkipped ? 3 : 0, (occ[0] + occ[1] + occ[2] + occ[3]) & 3);
#ifdef SIXTY4_FORMAT
        assert_eq(0, si & 31);
#else
        assert_eq(0, si & 3);
#endif
        sideCur++;
        if(sideCur == (int)gh._sideGbwtSz) {
            sideCur = 0;
            index_t *uside = reinterpret_cast<index_t*>(gfmSide.ptr());
            // Write 'A', 'C', 'G' and 'T' tallies
            side += sideSz;
            assert_leq(side, gh._gbwtTotSz);
            uside[(sideSz / sizeof(index_t))-4] = endianizeIndex(occSave[0], this->toBe());
            uside[(sideSz / sizeof(index_t))-3] = endianizeIndex(occSave[1], this->toBe());
            uside[(sideSz / sizeof(index_t))-2] = endianizeIndex(occSave[2], this->toBe());
            uside[(sideSz / sizeof(index_t))-1] = endianizeIndex(occSave[3], this->toBe());
            // The tallies stored with the next side are the totals so far
            occSave[0] = occ[0];
            occSave[1] = occ[1];
            occSave[2] = occ[2];
            occSave[3] = occ[3];
            // Write the finished side to the primary file
            out1.write((const char *)gfmSide.ptr(), sideSz);
        }
    }
    VMSG_NL("Exited GFM loop");
    if(absorbCnt > 0) {
        // Absorb any trailing, as-yet-unabsorbed short suffixes into
        // the last element of ftab
        absorbFtab[ftabLen-1] = absorbCnt;
    }
    // Assert that our loop counter got incremented right to the end
    assert_eq(side, gh._gbwtTotSz);
    // Assert that we wrote the expected amount to out1
    assert_eq(((index_t)out1.tellp() - beforeGbwtOff), gh._gbwtTotSz);

    //
    // Write zOffs to primary stream
    //
    assert_eq(zOffs.size(), 1);
    writeIndex<index_t>(out1, (index_t)zOffs.size(), this->toBe());
    for(size_t i = 0; i < zOffs.size(); i++) {
        assert_neq(zOffs[i], (index_t)OFF_MASK);
        writeIndex<index_t>(out1, zOffs[i], this->toBe());
    }

    //
    // Finish building fchr
    //
    // Exclusive prefix sum on fchr
    for(int i = 1; i < 4; i++) {
        fchr[i] += fchr[i-1];
    }
    assert_lt(fchr[3], gbwtLen);
    // Shift everybody up by one
    for(int i = 4; i >= 1; i--) {
        fchr[i] = fchr[i-1];
    }
    fchr[0] = 0;
    if(_verbose) {
        for(int i = 0; i < 5; i++)
            cerr << "fchr[" << "ACGT$"[i] << "]: " << fchr[i] << endl;
    }
    // Write fchr to primary file
    for(int i = 0; i < 5; i++) {
        writeIndex<index_t>(out1, fchr[i], this->toBe());
    }

    //
    // Finish building ftab and build eftab
    //
    // Count the eftab entries actually needed (checked below), then
    // size eftab to its fixed length of 2 * ftabChars
    index_t eftabLen = 0;
    assert_eq(0, absorbFtab[0]);
    for(index_t i = 1; i < ftabLen; i++) {
        if(absorbFtab[i] > 0) eftabLen += 2;
    }
    assert_leq(eftabLen, (index_t)gh._ftabChars*2);
    eftabLen = gh._ftabChars*2;
    EList<index_t> eftab(EBWT_CAT);
    try {
        eftab.resize(eftabLen);
        eftab.fillZero();
    } catch(bad_alloc &e) {
        cerr << "Out of memory allocating eftab[] "
             << "in GFM::buildToDisk() at " << __FILE__ << ":"
             << __LINE__ << endl;
        throw e;
    }
    // Prefix sum on ftab; entries that absorb short suffixes are
    // replaced by a pointer into eftab, which holds the (lo, hi) pair
    index_t eftabCur = 0;
    for(index_t i = 1; i < ftabLen; i++) {
        index_t lo = ftab[i] + GFM<index_t>::ftabHi(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, i-1);
        if(absorbFtab[i] > 0) {
            // Skip a number of short pattern indicated by absorbFtab[i]
            index_t hi = lo + absorbFtab[i];
            assert_lt(eftabCur*2+1, eftabLen);
            eftab[eftabCur*2] = lo;
            eftab[eftabCur*2+1] = hi;
            ftab[i] = (eftabCur++) ^ (index_t)OFF_MASK; // insert pointer into eftab
            assert_eq(lo, GFM<index_t>::ftabLo(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, i));
            assert_eq(hi, GFM<index_t>::ftabHi(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, i));
        } else {
            ftab[i] = lo;
        }
    }
    assert_eq(GFM<index_t>::ftabHi(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, ftabLen-1), len+1);
    // Write ftab to primary file
    for(index_t i = 0; i < ftabLen; i++) {
        writeIndex<index_t>(out1, ftab[i], this->toBe());
    }
    // Back-patch eftabLen into the header, then write eftab itself
    out1pos = out1.tellp();
    if(headerPos < 0) {
        out1.seekp(24 + sizeof(index_t) * 3);
    } else {
        // Do the arithmetic on the stream position itself: narrowing
        // headerPos to int would truncate offsets of 2 GiB and beyond
        out1.seekp(headerPos + (streamoff)(16 + sizeof(index_t) * 2));
    }
    writeIndex<index_t>(out1, eftabLen, this->toBe());
    out1.seekp(out1pos);
    for(index_t i = 0; i < eftabLen; i++) {
        writeIndex<index_t>(out1, eftab[i], this->toBe());
    }
    // Note: if you'd like to sanity-check the Ebwt, you'll have to
    // read it back into memory first!
    assert(!isInMemory());
    VMSG_NL("Exiting GFM::buildToDisk()");
}
extern string gLastIOErrMsg;
/**
 * Checks whether a call to read() failed or not.
 *
 * A negative 'ret' is treated as failure; in that case the errno value
 * and its strerror() text are recorded in gLastIOErrMsg.  A short read
 * (0 <= ret < count) is not reported as an error.
 *
 * @param fdesc  file descriptor that was read from (currently unused)
 * @param ret    value returned by read()
 * @param count  number of bytes that were requested (currently unused)
 * @return       true iff the read failed
 */
inline bool is_read_err(int fdesc, ssize_t ret, size_t count) {
    if (ret >= 0) {
        return false;
    }
    // Snapshot errno before anything else runs: later library calls
    // (stream construction, formatting) are allowed to overwrite it,
    // which could make the reported number and message disagree
    const int err = errno;
    std::stringstream sstm;
    sstm << "ERRNO: " << err << " ERR Msg:" << strerror(err) << std::endl;
    gLastIOErrMsg = sstm.str();
    return true;
}
/**
 * Checks whether a call to fread() failed or not.
 *
 * Failure is detected through the stream's error indicator; when it is
 * set, a fixed message is stored in gLastIOErrMsg.
 *
 * @param file_hd  stream that was read from
 * @param ret      value returned by fread() (currently unused)
 * @param count    number of items that were requested (currently unused)
 * @return         true iff the stream's error indicator is set
 */
inline bool is_fread_err(FILE* file_hd, size_t ret, size_t count) {
    if (!ferror(file_hd)) {
        return false;
    }
    gLastIOErrMsg = "Error Reading File!";
    return true;
}
///////////////////////////////////////////////////////////////////////
//
// Functions for searching Ebwts
// (But most of them are defined in the header)
//
///////////////////////////////////////////////////////////////////////
/**
 * Take an offset into the joined text and translate it into the
 * reference of the index it falls on, the offset into the reference,
 * and the length of the reference.  Uses a binary search through the
 * sorted list of reference fragment ranges in rstarts(), which is read
 * as triples: [joined-text start, text index, offset within the text].
 *
 * @param qlen            length of the aligned query
 * @param off             offset into the joined text
 * @param tidx            out: index of the reference, or INDEX_MAX on
 *                        failure
 * @param textoff         out: offset into that reference
 * @param tlen            out: length of that reference
 * @param rejectStraddle  fail if the alignment runs past the end of
 *                        the fragment containing 'off'
 * @param straddled       set to true when the alignment runs past the
 *                        fragment end; never reset to false here
 * @return true iff a fragment containing 'off' was found and the
 *         alignment was not rejected as straddling
 */
template <typename index_t>
bool GFM<index_t>::joinedToTextOff(
    index_t qlen,
    index_t off,
    index_t& tidx,
    index_t& textoff,
    index_t& tlen,
    bool rejectStraddle,
    bool& straddled) const
{
    assert(rstarts() != NULL); // must have loaded rstarts
    index_t lo = 0;
    index_t hi = _nFrag; // one past the largest addressable fragment
    index_t mid = (index_t)INDEX_MAX;
    for(;;) {
        const index_t prevMid = mid;
        mid = lo + ((hi - lo) >> 1);
        if(prevMid == mid) {
            // The search stalled: no fragment contains 'off'
            tidx = (index_t)INDEX_MAX;
            return false;
        }
        // Joined-text extent [fragBegin, fragEnd) of the probed fragment
        const index_t fragBegin = rstarts()[mid*3];
        index_t fragEnd;
        if(mid == _nFrag-1) {
            fragEnd = _gh._len;
        } else {
            fragEnd = rstarts()[((mid+1)*3)];
        }
        assert_gt(fragEnd, fragBegin);
        if(fragBegin > off) {
            // 'off' lies to the left of this fragment
            hi = mid;
            continue;
        }
        if(fragEnd <= off) {
            // 'off' lies to the right of this fragment
            lo = mid;
            continue;
        }
        // 'off' is inside this fragment; does the alignment fit too?
        const index_t fraglen = fragEnd - fragBegin;
        if(off + qlen > fragEnd) {
            straddled = true;
            if(rejectStraddle) {
                // it falls off; signal no-go and return
                tidx = (index_t)INDEX_MAX;
                return false;
            }
        }
        // This is the correct text idx whether the index is forward
        // or reverse
        tidx = rstarts()[(mid*3)+1];
        assert_lt(tidx, this->_nPat);
        assert_leq(fraglen, this->plen()[tidx]);
        // Number of characters preceding the alignment in the fragment
        index_t fragoff = off - rstarts()[(mid*3)];
        if(!this->fw_) {
            // Mirror the coordinate for a reverse index
            fragoff = fraglen - fragoff - 1;
            fragoff -= (qlen-1);
        }
        // Add the fragment's own offset within the text
        textoff = fragoff + rstarts()[(mid*3)+2];
        assert_lt(textoff, this->plen()[tidx]);
        tlen = this->plen()[tidx];
        return true;
    }
}
/**
 * Translate an offset within reference 'tid' into an offset into the
 * joined text.  A binary search over rstarts() (triples of
 * [joined-text start, text index, offset within the text]) locates
 * some fragment of 'tid'; neighbouring fragments are then scanned to
 * find the one that covers 'textoff'.
 *
 * @param tid      index of the reference
 * @param textoff  offset within that reference
 * @param off      out: offset into the joined text; left untouched
 *                 when false is returned
 * @return true iff 'textoff' maps into a fragment of 'tid'; false if
 *         'tid' has no fragment in the table or 'textoff' falls
 *         outside its fragments
 */
template <typename index_t>
bool GFM<index_t>::textOffToJoined(
    index_t tid,
    index_t textoff,
    index_t& off) const
{
    assert(rstarts() != NULL); // must have loaded rstarts
    if(_nFrag == 0) {
        // Nothing to search
        return false;
    }
    index_t top = 0;
    index_t bot = _nFrag; // 1 greater than largest addressable element
    index_t elt = (index_t)INDEX_MAX;
    // Begin binary search
    while(true) {
        index_t oldelt = elt;
        elt = top + ((bot - top) >> 1);
        if(oldelt == elt) {
            // No progress: 'tid' does not appear in the fragment
            // table.  Without this check the loop would never
            // terminate when assertions are compiled out.
            return false;
        }
        index_t elt_tid = rstarts()[elt*3 + 1];
        if(elt_tid == tid) {
            // Walk left to a fragment of 'tid' that starts at or
            // before 'textoff'
            while(true) {
                if(tid != rstarts()[elt*3+1]) {
                    return false;
                }
                if(rstarts()[elt*3 + 2] <= textoff) break;
                if(elt == 0) return false;
                elt--;
            }
            // Walk right to the last such fragment
            while(true) {
                assert_leq(rstarts()[elt*3+2], textoff);
                if(elt + 1 == _nFrag ||
                   tid + 1 == rstarts()[(elt+1)*3 + 1] ||
                   textoff < rstarts()[(elt+1)*3 + 2]) {
                    off = rstarts()[elt*3] + (textoff - rstarts()[elt*3 + 2]);
                    // Reject offsets that land past this fragment's end,
                    // i.e. in the gap before the next fragment of 'tid'
                    if(elt + 1 < _nFrag &&
                       tid == rstarts()[(elt+1)*3 + 1] &&
                       off >= rstarts()[(elt+1)*3]) {
                        return false;
                    }
                    break;
                }
                elt++;
            }
            break; // done with binary search
        } else if(elt_tid < tid) {
            top = elt;
        } else {
            bot = elt;
        }
        // continue with binary search
    }
    return true;
}
/**
 * Walk 'steps' steps to the left, starting from 'row', and return the
 * row arrived at.  If a row listed in _zOffs is reached while steps
 * remain, return INDEX_MAX instead.
 */
template <typename index_t>
index_t GFM<index_t>::walkLeft(index_t row, index_t steps) const {
    assert(offs() != NULL);
    assert_neq((index_t)INDEX_MAX, row);
    if(steps == 0) {
        return row;
    }
    SideLocus<index_t> loc;
    loc.initFromRow(row, _gh, gfm());
    for(;;) {
        // Cannot step left from a row that corresponds to a zOff
        for(index_t z = 0; z < _zOffs.size(); z++) {
            if(_zOffs[z] == row) return (index_t)INDEX_MAX;
        }
        pair<index_t, index_t> range = this->mapGLF1(row, loc, (pair<index_t, index_t> *)NULL ASSERT_ONLY(, false));
        const index_t nextRow = range.first;
        assert_neq((index_t)INDEX_MAX, nextRow);
        assert_neq(nextRow, row);
        row = nextRow;
        if(--steps == 0) {
            break;
        }
        // Re-anchor the locus only if another step follows
        loc.initFromRow(row, _gh, gfm());
    }
    return row;
}
/**
 * Resolve the reference offset of the BW row 'row', whose node id is
 * 'node'.
 *
 * A row listed in _zOffs resolves to 0.  Otherwise, if 'node' passes
 * the _offMask sampling test and its offs() entry is not INDEX_MAX,
 * that entry is the answer.  Failing that, LF-map leftwards one row at
 * a time until a _zOffs row or a sampled node is reached, and return
 * the sample (0 for a _zOffs row) plus the number of steps taken.
 */
template <typename index_t>
index_t GFM<index_t>::getOffset(index_t row, index_t node) const {
    assert(offs() != NULL);
    assert_neq((index_t)INDEX_MAX, row);
    for(index_t z = 0; z < _zOffs.size(); z++) {
        if(_zOffs[z] == row) return 0;
    }
    if((node & _gh._offMask) == node) {
        const index_t sampled = this->offs()[node >> _gh._offRate];
        if(sampled != (index_t)INDEX_MAX) {
            return sampled;
        }
    }
    SideLocus<index_t> loc;
    loc.initFromRow(row, _gh, gfm());
    // 'jumps' is the number of LF steps taken so far, including the
    // one performed at the top of the current iteration
    for(index_t jumps = 1; ; jumps++) {
        pair<index_t, index_t> node_range(0, 0);
        pair<index_t, index_t> range = this->mapGLF1(row, loc, &node_range ASSERT_ONLY(, false));
        const index_t nextRow = range.first;
        assert_neq((index_t)INDEX_MAX, nextRow);
        assert_neq(nextRow, row);
        row = nextRow;
        for(index_t z = 0; z < _zOffs.size(); z++) {
            if(_zOffs[z] == row) return jumps;
        }
        const index_t curNode = node_range.first;
        if((curNode & _gh._offMask) == curNode) {
            const index_t sampled = this->offs()[curNode >> _gh._offRate];
            if(sampled != (index_t)INDEX_MAX) {
                return jumps + sampled;
            }
        }
        loc.initFromRow(row, _gh, gfm());
    }
}
/**
 * Resolve the reference offset of the BW element 'elt' such that
 * the offset returned is at the right-hand side of the forward
 * reference substring involved in the hit.
 *
 * For a forward hit the offset from getOffset(elt) is returned as is.
 * For a reverse hit it is mirrored within the text of length _gh._len
 * and then moved back by hitlen-1.
 */
template <typename index_t>
index_t GFM<index_t>::getOffset(
    index_t elt,
    bool fw,
    index_t hitlen) const
{
    index_t off = getOffset(elt);
    assert_neq((index_t)INDEX_MAX, off);
    if(fw) {
        return off;
    }
    assert_lt(off, _gh._len);
    // Mirror the coordinate ...
    off = _gh._len - off - 1;
    assert_geq(off, hitlen-1);
    // ... and shift from the hit's last character to its first
    off -= (hitlen-1);
    assert_lt(off, _gh._len);
    return off;
}
/**
 * Returns true iff the index contains the given string (exactly).
 * Characters 0..3 are matched as nucleotides.  A character value of 4
 * is only followed when there is a single way to resolve it; otherwise
 * the function returns false.  TODO: support ambiguous characters in
 * 'str'.
 *
 * @param str   the query; matched from its last character backwards
 * @param otop  if both otop and obot are non-NULL, receives the top of
 *              the final range
 * @param obot  if both otop and obot are non-NULL, receives the bottom
 *              of the final range
 * @return true iff the final range is non-empty (always true for an
 *         empty query)
 */
template <typename index_t>
bool GFM<index_t>::contains(
    const BTDnaString& str,
    index_t *otop,
    index_t *obot) const
{
    assert(isInMemory());
    SideLocus<index_t> tloc, bloc;
    if(str.empty()) {
        if(otop != NULL && obot != NULL) *otop = *obot = 0;
        return true;
    }
    int c = str[str.length()-1];
    assert_range(0, 4, c);
    index_t top = 0, bot = 0;
    if(c < 4) {
        top = fchr()[c];
        bot = fchr()[c+1];
    } else {
        // Ambiguous last character: usable only if exactly one
        // nucleotide has a non-empty range in fchr.  The ranges must
        // be indexed by the loop variable; indexing by 'c' (== 4)
        // would read fchr()[5], one past its five entries.
        bool set = false;
        for(int i = 0; i < 4; i++) {
            if(fchr()[i] < fchr()[i+1]) {
                if(set) {
                    return false;
                } else {
                    set = true;
                    top = fchr()[i];
                    bot = fchr()[i+1];
                }
            }
        }
    }
    assert_geq(bot, top);
    tloc.initFromRow(top, gh(), gfm());
    bloc.initFromRow(bot, gh(), gfm());
    ASSERT_ONLY(index_t lastDiff = bot - top);
    for(int64_t i = (int64_t)str.length()-2; i >= 0; i--) {
        c = str[i];
        assert_range(0, 4, c);
        if(c <= 3) {
            top = mapLF(tloc, c);
            bot = mapLF(bloc, c);
        } else {
            index_t sz = bot - top;
            int c1 = mapLF1(top, tloc ASSERT_ONLY(, false));
            bot = mapLF(bloc, c1);
            assert_leq(bot - top, sz);
            if(bot - top < sz) {
                // Encountered an N and could not proceed through it because
                // there was more than one possible nucleotide we could replace
                // it with
                return false;
            }
        }
        assert_geq(bot, top);
        assert_leq(bot-top, lastDiff);
        ASSERT_ONLY(lastDiff = bot-top);
        if(i > 0) {
            // Re-anchor both loci for the next character
            tloc.initFromRow(top, gh(), gfm());
            bloc.initFromRow(bot, gh(), gfm());
        }
    }
    if(otop != NULL && obot != NULL) {
        *otop = top; *obot = bot;
    }
    return bot > top;
}
///////////////////////////////////////////////////////////////////////
//
// Functions for reading and writing Ebwts
//
///////////////////////////////////////////////////////////////////////
/**
* Read an Ebwt from file with given filename.
*/
template <typename index_t>
void GFM<index_t>::readIntoMemory(
int needEntireRev,
bool loadSASamp,
bool loadFtab,
bool loadRstarts,
bool justHeader,
GFMParams<index_t> *params,
bool mmSweep,
bool loadNames,
bool startVerbose,
bool subIndex)
{
bool switchEndian; // dummy; caller doesn't care
#ifdef BOWTIE_MM
char *mmFile[] = { NULL, NULL };
#endif
if(_in1Str.length() > 0 && !subIndex) {
if(_verbose || startVerbose) {
cerr << " About to open input files: ";
logTime(cerr);
}
// Initialize our primary and secondary input-stream fields
if(_in1 != NULL) fclose(_in1);
if(_verbose || startVerbose) cerr << "Opening \"" << _in1Str.c_str() << "\"" << endl;
if((_in1 = fopen(_in1Str.c_str(), "rb")) == NULL) {
cerr << "Could not open index file " << _in1Str.c_str() << endl;
}
if(loadSASamp) {
if(_in2 != NULL) fclose(_in2);
if(_verbose || startVerbose) cerr << "Opening \"" << _in2Str.c_str() << "\"" << endl;
if((_in2 = fopen(_in2Str.c_str(), "rb")) == NULL) {
cerr << "Could not open index file " << _in2Str.c_str() << endl;
}
}
if(_verbose || startVerbose) {
cerr << " Finished opening input files: ";
logTime(cerr);
}
#ifdef BOWTIE_MM
if(_useMm /*&& !justHeader*/) {
const char *names[] = {_in1Str.c_str(), _in2Str.c_str()};
int fds[] = { fileno(_in1), fileno(_in2) };
for(int i = 0; i < (loadSASamp ? 2 : 1); i++) {
if(_verbose || startVerbose) {
cerr << " Memory-mapping input file " << (i+1) << ": ";
logTime(cerr);
}
struct stat sbuf;
if (stat(names[i], &sbuf) == -1) {
perror("stat");
cerr << "Error: Could not stat index file " << names[i] << " prior to memory-mapping" << endl;
throw 1;
}
mmFile[i] = (char*)mmap((void *)0, (size_t)sbuf.st_size,
PROT_READ, MAP_SHARED, fds[(size_t)i], 0);
if(mmFile[i] == (void *)(-1)) {
perror("mmap");
cerr << "Error: Could not memory-map the index file " << names[i] << endl;
throw 1;
}
if(mmSweep) {
int sum = 0;
for(off_t j = 0; j < sbuf.st_size; j += 1024) {
sum += (int) mmFile[i][j];
}
if(startVerbose) {
cerr << " Swept the memory-mapped ebwt index file 1; checksum: " << sum << ": ";
logTime(cerr);
}
}
}
mmFile1_ = mmFile[0];
mmFile2_ = loadSASamp ? mmFile[1] : NULL;
}
#endif
}
#ifdef BOWTIE_MM
else if(_useMm && !justHeader) {
mmFile[0] = mmFile1_;
mmFile[1] = mmFile2_;
}
if(_useMm && !justHeader) {
assert(mmFile[0] == mmFile1_);
assert(mmFile[1] == mmFile2_);
}
#endif
if(_verbose || startVerbose) {
cerr << " Reading header: ";
logTime(cerr);
}
// Read endianness hints from both streams
size_t bytesRead = 0;
if(!subIndex) {
switchEndian = false;
uint32_t one = readU32(_in1, switchEndian); // 1st word of primary stream
bytesRead += 4;
if(loadSASamp) {
#ifndef NDEBUG
assert_eq(one, readU32(_in2, switchEndian)); // should match!
#else
readU32(_in2, switchEndian);
#endif
}
if(one != 1) {
assert_eq((1u<<24), one);
assert_eq(1, endianSwapU32(one));
switchEndian = true;
}
_toBigEndian = switchEndian;
// Can't switch endianness and use memory-mapped files; in order to
// support this, someone has to modify the file to switch
// endiannesses appropriately, and we can't do this inside Bowtie
// or we might be setting up a race condition with other processes.
if(switchEndian && _useMm) {
cerr << "Error: Can't use memory-mapped files when the index is the opposite endianness" << endl;
throw 1;
}
// Reads header entries one by one from primary stream
int index_version = (int)readU32(_in1, switchEndian); bytesRead += 4;
int major_index_version, minor_index_version;
string index_version_extra;
readIndexVersion(index_version, major_index_version, minor_index_version, index_version_extra);
int major_program_version, minor_program_version;
string program_version_extra;
readProgramVersion(major_program_version, minor_program_version, program_version_extra);
if(major_program_version < major_index_version ||
(major_program_version == major_index_version && minor_program_version < minor_index_version)) {
cerr << "Warning: the current version of HISAT2 (" << HISAT2_VERSION << ") is older than the version (2."
<< major_index_version << "." << minor_index_version;
if(index_version_extra.length() > 0) {
cerr << "-" << index_version_extra;
}
cerr << ") used to build the index." << endl;
cerr << " Users are strongly recommended to update HISAT2 to the latest version." << endl;
}
} else {
switchEndian = _toBigEndian;
}
index_t len = readIndex<index_t>(_in1, switchEndian);
bytesRead += sizeof(index_t);
index_t gbwtLen = readIndex<index_t>(_in1, switchEndian);
bytesRead += sizeof(index_t);
assert_lt(len, gbwtLen);
index_t numNodes = readIndex<index_t>(_in1, switchEndian);
bytesRead += sizeof(index_t);
int32_t lineRate = readI32(_in1, switchEndian);
bytesRead += 4;
/*int32_t linesPerSide =*/ readI32(_in1, switchEndian);
bytesRead += 4;
int32_t offRate = readI32(_in1, switchEndian);
bytesRead += 4;
// TODO: add isaRate to the actual file format (right now, the
// user has to tell us whether there's an ISA sample and what the
// sampling rate is.
int32_t ftabChars = readI32(_in1, switchEndian);
bytesRead += 4;
index_t eftabLen = readIndex<index_t>(_in1, switchEndian);
bytesRead += sizeof(index_t);
// chunkRate was deprecated in an earlier version of Bowtie; now
// we use it to hold flags.
int32_t flags = readI32(_in1, switchEndian);
bool entireRev = false;
if(flags < 0 && (((-flags) & GFM_ENTIRE_REV) == 0)) {
if(needEntireRev != -1 && needEntireRev != 0) {
cerr << "Error: This index is compatible with 0.* versions of Bowtie, but not with 2.*" << endl
<< "versions. Please build or download a version of the index that is compitble" << endl
<< "with Bowtie 2.* (i.e. built with bowtie-build 2.* or later)" << endl;
throw 1;
}
} else entireRev = true;
bytesRead += 4;
// Create a new EbwtParams from the entries read from primary stream
GFMParams<index_t> *gh;
bool deleteGh = false;
if(params != NULL) {
params->init(len, gbwtLen, numNodes, lineRate, offRate, ftabChars, eftabLen, entireRev);
if(_verbose || startVerbose) params->print(cerr);
gh = params;
} else {
gh = new GFMParams<index_t>(len, gbwtLen, numNodes, lineRate, offRate, ftabChars, eftabLen, entireRev);
deleteGh = true;
}
// Set up overridden suffix-array-sample parameters
index_t offsLen = gh->_offsLen;
index_t offRateDiff = 0;
index_t offsLenSampled = offsLen;
if(_overrideOffRate > offRate) {
offRateDiff = _overrideOffRate - offRate;
}
if(offRateDiff > 0) {
offsLenSampled >>= offRateDiff;
if((offsLen & ~(((index_t)INDEX_MAX) << offRateDiff)) != 0) {
offsLenSampled++;
}
}
// Can't override the offrate or isarate and use memory-mapped
// files; ultimately, all processes need to copy the sparser sample
// into their own memory spaces.
#if 0
if(_useMm && (offRateDiff)) {
cerr << "Error: Can't use memory-mapped files when the offrate is overridden" << endl;
throw 1;
}
#endif
// Read nPat from primary stream
this->_nPat = readIndex<index_t>(_in1, switchEndian);
bytesRead += sizeof(index_t);
_plen.reset();
// Read plen from primary stream
if(_useMm) {
#ifdef BOWTIE_MM
_plen.init((index_t*)(mmFile[0] + bytesRead), _nPat, false);
bytesRead += _nPat*sizeof(index_t);
fseek(_in1, _nPat*sizeof(index_t), SEEK_CUR);
#endif
} else {
try {
if(_verbose || startVerbose) {
cerr << "Reading plen (" << this->_nPat << "): ";
logTime(cerr);
}
_plen.init(new index_t[_nPat], _nPat, true);
if(switchEndian) {
for(index_t i = 0; i < this->_nPat; i++) {
plen()[i] = readIndex<index_t>(_in1, switchEndian);
}
} else {
size_t r = MM_READ(_in1, (void*)(plen()), _nPat*sizeof(index_t));
if(r != (size_t)(_nPat*sizeof(index_t))) {
cerr << "Error reading _plen[] array: " << r << ", " << _nPat*sizeof(index_t) << endl;
throw 1;
}
}
} catch(bad_alloc& e) {
cerr << "Out of memory allocating plen[] in Ebwt::read()"
<< " at " << __FILE__ << ":" << __LINE__ << endl;
throw e;
}
}
// TODO: I'm not consistent on what "header" means. Here I'm using
// "header" to mean everything that would exist in memory if we
// started to build the Ebwt but stopped short of the build*() step
// (i.e. everything up to and including join()).
if(justHeader) {
// Be kind
if(deleteGh) delete gh;
#ifdef BOWTIE_MM
fseek(_in1, 0, SEEK_SET);
if(loadSASamp) fseek(_in2, 0, SEEK_SET);
#else
rewind(_in1);
if(loadSASamp) rewind(_in2);
#endif
return;
}
bool shmemLeader;
this->_nFrag = readIndex<index_t>(_in1, switchEndian);
bytesRead += sizeof(index_t);
if(_verbose || startVerbose) {
cerr << "Reading rstarts (" << this->_nFrag*3 << "): ";
logTime(cerr);
}
assert_geq(this->_nFrag, this->_nPat);
_rstarts.reset();
if(loadRstarts) {
if(_useMm) {
#ifdef BOWTIE_MM
_rstarts.init((index_t*)(mmFile[0] + bytesRead), _nFrag*3, false);
bytesRead += this->_nFrag*sizeof(index_t)*3;
fseek(_in1, this->_nFrag*sizeof(index_t)*3, SEEK_CUR);
#endif
} else {
_rstarts.init(new index_t[_nFrag*3], _nFrag*3, true);
if(switchEndian) {
for(size_t i = 0; i < (size_t)(this->_nFrag*3); i += 3) {
// fragment starting position in joined reference
// string, text id, and fragment offset within text
this->rstarts()[i] = readIndex<index_t>(_in1, switchEndian);
this->rstarts()[i+1] = readIndex<index_t>(_in1, switchEndian);
this->rstarts()[i+2] = readIndex<index_t>(_in1, switchEndian);
}
} else {
size_t r = MM_READ(_in1, (void *)rstarts(), this->_nFrag*sizeof(index_t)*3);
if(r != (size_t)(this->_nFrag*sizeof(index_t)*3)) {
cerr << "Error reading _rstarts[] array: " << r << ", " << (this->_nFrag*sizeof(index_t)*3) << endl;
throw 1;
}
}
}
} else {
// Skip em
assert(rstarts() == NULL);
bytesRead += this->_nFrag*sizeof(index_t)*3;
fseek(_in1, this->_nFrag*sizeof(index_t)*3, SEEK_CUR);
}
_gfm.reset();
if(_useMm) {
#ifdef BOWTIE_MM
_gfm.init((uint8_t*)(mmFile[0] + bytesRead), gh->_gbwtTotLen, false);
bytesRead += gh->_gbwtTotLen;
fseek(_in1, gh->_gbwtTotLen, SEEK_CUR);
#endif
} else {
// Allocate ebwt (big allocation)
if(_verbose || startVerbose) {
cerr << "Reading ebwt (" << gh->_gbwtTotLen << "): ";
logTime(cerr);
}
bool shmemLeader = true;
if(useShmem_) {
uint8_t *tmp = NULL;
shmemLeader = ALLOC_SHARED_U8(
(_in1Str + "[ebwt]"), gh->_gbwtTotLen, &tmp,
"gfm[]", (_verbose || startVerbose));
assert(tmp != NULL);
_gfm.init(tmp, gh->_gbwtTotLen, false);
if(_verbose || startVerbose) {
cerr << " shared-mem " << (shmemLeader ? "leader" : "follower") << endl;
}
} else {
try {
_gfm.init(new uint8_t[gh->_gbwtTotLen], gh->_gbwtTotLen, true);
} catch(bad_alloc& e) {
cerr << "Out of memory allocating the gfm[] array for the Bowtie index. Please try" << endl
<< "again on a computer with more memory." << endl;
throw 1;
}
}
if(shmemLeader) {
// Read ebwt from primary stream
uint64_t bytesLeft = gh->_gbwtTotLen;
char *pgbwt = (char*)this->gfm();
while (bytesLeft>0){
size_t r = MM_READ(this->_in1, (void *)pgbwt, bytesLeft);
if(MM_IS_IO_ERR(this->_in1, r, bytesLeft)) {
cerr << "Error reading _ebwt[] array: " << r << ", "
<< bytesLeft << endl;
throw 1;
}
pgbwt += r;
bytesLeft -= r;
}
if(switchEndian) {
uint8_t *side = this->gfm();
for(size_t i = 0; i < gh->_numSides; i++) {
index_t *cums = reinterpret_cast<index_t*>(side + gh->_sideSz - sizeof(index_t)*2);
cums[0] = endianSwapIndex(cums[0]);
cums[1] = endianSwapIndex(cums[1]);
side += this->_gh._sideSz;
}
}
#ifdef BOWTIE_SHARED_MEM
if(useShmem_) NOTIFY_SHARED(gfm(), gh->_gbwtTotLen);
#endif
} else {
// Seek past the data and wait until master is finished
fseek(_in1, gh->_gbwtTotLen, SEEK_CUR);
#ifdef BOWTIE_SHARED_MEM
if(useShmem_) WAIT_SHARED(gfm(), gh->_gbwtTotLen);
#endif
}
}
// Read zOff from primary stream
_zOffs.clear();
index_t num_zOffs = readIndex<index_t>(_in1, switchEndian);
bytesRead += sizeof(index_t);
for(index_t i = 0; i < num_zOffs; i++) {
index_t zOff = readIndex<index_t>(_in1, switchEndian);
bytesRead += sizeof(index_t);
assert_lt(zOff, gbwtLen);
_zOffs.push_back(zOff);
}
try {
// Read fchr from primary stream
if(_verbose || startVerbose) cerr << "Reading fchr (5)" << endl;
_fchr.reset();
if(_useMm) {
#ifdef BOWTIE_MM
_fchr.init((index_t*)(mmFile[0] + bytesRead), 5, false);
bytesRead += 5*sizeof(index_t);
fseek(_in1, 5*sizeof(index_t), SEEK_CUR);
#endif
} else {
_fchr.init(new index_t[5], 5, true);
for(int i = 0; i < 5; i++) {
this->fchr()[i] = readIndex<index_t>(_in1, switchEndian);
assert_leq(this->fchr()[i], gbwtLen);
assert(i <= 0 || this->fchr()[i] >= this->fchr()[i-1]);
}
}
assert_gt(this->fchr()[4], this->fchr()[0]);
// Read ftab from primary stream
if(_verbose || startVerbose) {
if(loadFtab) {
cerr << "Reading ftab (" << gh->_ftabLen << "): ";
logTime(cerr);
} else {
cerr << "Skipping ftab (" << gh->_ftabLen << "): ";
}
}
_ftab.reset();
if(loadFtab) {
if(_useMm) {
#ifdef BOWTIE_MM
_ftab.init((index_t*)(mmFile[0] + bytesRead), gh->_ftabLen, false);
bytesRead += gh->_ftabLen*sizeof(index_t);
fseek(_in1, gh->_ftabLen*sizeof(index_t), SEEK_CUR);
#endif
} else {
_ftab.init(new index_t[gh->_ftabLen], gh->_ftabLen, true);
if(switchEndian) {
for(size_t i = 0; i < gh->_ftabLen; i++)
this->ftab()[i] = readIndex<index_t>(_in1, switchEndian);
} else {
size_t r = MM_READ(_in1, (void *)ftab(), gh->_ftabLen*sizeof(index_t));
if(r != (size_t)(gh->_ftabLen*sizeof(index_t))) {
cerr << "Error reading _ftab[] array: " << r << ", " << (gh->_ftabLen*sizeof(index_t)) << endl;
throw 1;
}
}
}
// Read etab from primary stream
if(_verbose || startVerbose) {
if(loadFtab) {
cerr << "Reading eftab (" << gh->_eftabLen << "): ";
logTime(cerr);
} else {
cerr << "Skipping eftab (" << gh->_eftabLen << "): ";
}
}
_eftab.reset();
if(_useMm) {
#ifdef BOWTIE_MM
_eftab.init((index_t*)(mmFile[0] + bytesRead), gh->_eftabLen, false);
bytesRead += gh->_eftabLen*sizeof(index_t);
fseek(_in1, gh->_eftabLen*sizeof(index_t), SEEK_CUR);
#endif
} else {
_eftab.init(new index_t[gh->_eftabLen], gh->_eftabLen, true);
if(switchEndian) {
for(size_t i = 0; i < gh->_eftabLen; i++)
this->eftab()[i] = readIndex<index_t>(_in1, switchEndian);
} else {
size_t r = MM_READ(_in1, (void *)this->eftab(), gh->_eftabLen*sizeof(index_t));
if(r != (size_t)(gh->_eftabLen*sizeof(index_t))) {
cerr << "Error reading _eftab[] array: " << r << ", " << (gh->_eftabLen*sizeof(index_t)) << endl;
throw 1;
}
}
}
for(index_t i = 0; i < gh->_eftabLen; i++) {
if(i > 0 && this->eftab()[i] > 0) {
assert_geq(this->eftab()[i] + 4, this->eftab()[i-1]);
} else if(i > 0 && this->eftab()[i-1] == 0) {
assert_eq(0, this->eftab()[i]);
}
}
} else {
assert(ftab() == NULL);
assert(eftab() == NULL);
// Skip ftab
bytesRead += gh->_ftabLen*sizeof(index_t);
fseek(_in1, gh->_ftabLen*sizeof(index_t), SEEK_CUR);
// Skip eftab
bytesRead += sizeof(index_t);
bytesRead += gh->_eftabLen*sizeof(index_t);
fseek(_in1, gh->_eftabLen*sizeof(index_t), SEEK_CUR);
}
} catch(bad_alloc& e) {
cerr << "Out of memory allocating fchr[], ftab[] or eftab[] arrays for the Bowtie index." << endl
<< "Please try again on a computer with more memory." << endl;
throw 1;
}
// Read reference sequence names from primary index file (or not,
// if --refidx is specified)
if(loadNames) {
while(true) {
char c = '\0';
if(MM_READ(_in1, (void *)(&c), (size_t)1) != (size_t)1) break;
bytesRead++;
if(c == '\0') break;
else if(c == '\n') {
this->_refnames.push_back("");
} else {
if(this->_refnames.size() == 0) {
this->_refnames.push_back("");
}
this->_refnames.back().push_back(c);
}
}
}
_offs.reset();
if(loadSASamp) {
bytesRead = 4; // reset for secondary index file (already read 1-sentinel)
shmemLeader = true;
if(_verbose || startVerbose) {
cerr << "Reading offs (" << offsLenSampled << " " << std::setw(2) << sizeof(index_t)*8 << "-bit words): ";
logTime(cerr);
}
if(!_useMm) {
if(!useShmem_) {
// Allocate offs_
try {
_offs.init(new index_t[offsLenSampled], offsLenSampled, true);
} catch(bad_alloc& e) {
cerr << "Out of memory allocating the offs[] array for the Bowtie index." << endl
<< "Please try again on a computer with more memory." << endl;
throw 1;
}
} else {
index_t *tmp = NULL;
shmemLeader = ALLOC_SHARED_U32(
(_in2Str + "[offs]"), offsLenSampled*sizeof(index_t), &tmp,
"offs", (_verbose || startVerbose));
_offs.init((index_t*)tmp, offsLenSampled, false);
}
}
if(_overrideOffRate < 32) {
if(shmemLeader) {
// Allocate offs (big allocation)
if(switchEndian || offRateDiff > 0) {
assert(!_useMm);
const index_t blockMaxSz = (index_t)(2 * 1024 * 1024); // 2 MB block size
const index_t blockMaxSzU = (blockMaxSz / sizeof(index_t)); // # U32s per block
char *buf;
try {
buf = new char[blockMaxSz];
} catch(std::bad_alloc& e) {
cerr << "Error: Out of memory allocating part of _offs array: '" << e.what() << "'" << endl;
throw e;
}
for(index_t i = 0; i < offsLen; i += blockMaxSzU) {
index_t block = min<index_t>((index_t)blockMaxSzU, (index_t)(offsLen - i));
size_t r = MM_READ(_in2, (void *)buf, block * sizeof(index_t));
if(r != (size_t)(block * sizeof(index_t))) {
cerr << "Error reading block of _offs[] array: " << r << ", " << (block * sizeof(index_t)) << endl;
throw 1;
}
index_t idx = i >> offRateDiff;
for(index_t j = 0; j < block; j += (1 << offRateDiff)) {
assert_lt(idx, offsLenSampled);
this->offs()[idx] = ((index_t*)buf)[j];
if(switchEndian) {
this->offs()[idx] = endianSwapIndex(this->offs()[idx]);
}
idx++;
}
}
delete[] buf;
} else {
if(_useMm) {
#ifdef BOWTIE_MM
_offs.init((index_t*)(mmFile[1] + bytesRead), offsLen, false);
bytesRead += (offsLen * sizeof(index_t));
fseek(_in2, (offsLen * sizeof(index_t)), SEEK_CUR);
#endif
} else {
// Workaround for small-index mode where MM_READ may
// not be able to handle read amounts greater than 2^32
// bytes.
uint64_t bytesLeft = (offsLen * sizeof(index_t));
char *offs = (char *)this->offs();
while(bytesLeft > 0) {
size_t r = MM_READ(_in2, (void*)offs, bytesLeft);
if(MM_IS_IO_ERR(_in2,r,bytesLeft)) {
cerr << "Error reading block of _offs[] array: "
<< r << ", " << bytesLeft << gLastIOErrMsg << endl;
throw 1;
}
offs += r;
bytesLeft -= r;
}
}
}
#ifdef BOWTIE_SHARED_MEM
if(useShmem_) NOTIFY_SHARED(offs(), offsLenSampled*sizeof(index_t));
#endif
} else {
// Not the shmem leader
fseek(_in2, offsLenSampled*sizeof(index_t), SEEK_CUR);
#ifdef BOWTIE_SHARED_MEM
if(useShmem_) WAIT_SHARED(offs(), offsLenSampled*sizeof(index_t));
#endif
}
}
}
this->postReadInit(*gh); // Initialize fields of Ebwt not read from file
if(_verbose || startVerbose) print(cerr, *gh);
// The fact that _ebwt and friends actually point to something
// (other than NULL) now signals to other member functions that the
// Ebwt is loaded into memory.
// Be kind
if(deleteGh) delete gh;
if(!subIndex) {
#ifdef BOWTIE_MM
fseek(_in1, 0, SEEK_SET);
if(loadSASamp) fseek(_in2, 0, SEEK_SET);
#else
rewind(_in1);
if(loadSASamp) rewind(_in2);
#endif
}
}
/**
 * Read the reference sequence names stored at the end of a primary
 * index file and append them to 'refnames'.
 *
 * 'in' must already be open, positioned at the very beginning of the
 * primary file, with no error flags set.  The header is parsed only far
 * enough to compute how many bytes to skip before the name section.
 * Names are separated by '\n'; reading stops at a '\0' byte or at end
 * of file.  Before returning, the stream's error flags are cleared and
 * it is rewound to the beginning.
 *
 * @param in        stream over the primary index file
 * @param refnames  list that receives the names
 */
template <typename index_t>
void readGFMRefnames(istream& in, EList<string>& refnames) {
	assert(in.good());
	assert_eq((streamoff)in.tellg(), ios::beg);
	// The first word is an endianness hint: 1 if the file matches the
	// host byte order, 1<<24 if it needs to be swapped.
	bool switchEndian = false;
	uint32_t one = readU32(in, switchEndian);
	if(one != 1) {
		assert_eq((1u<<24), one);
		switchEndian = true;
	}
	// Header entries, in file order
	readU32(in, switchEndian); // version
	index_t len       = readIndex<index_t>(in, switchEndian);
	index_t gbwtLen   = readIndex<index_t>(in, switchEndian);
	index_t numNodes  = readIndex<index_t>(in, switchEndian);
	int32_t lineRate  = readI32(in, switchEndian);
	/*int32_t linesPerSide =*/ readI32(in, switchEndian);
	int32_t offRate   = readI32(in, switchEndian);
	int32_t ftabChars = readI32(in, switchEndian);
	index_t eftabLen  = readIndex<index_t>(in, switchEndian);
	// BTL: chunkRate is now deprecated; the slot holds (negated) flags
	int32_t flags = readI32(in, switchEndian);
	bool entireReverse = false;
	if(flags < 0) {
		entireReverse = (((-flags) & GFM_ENTIRE_REV) != 0);
	}
	// Derive the section sizes from the header.  This must be a
	// GFMParams (not a GFM): it is the parameter object that carries
	// _gbwtTotLen, _ftabLen and _eftabLen.
	GFMParams<index_t> gh(len, gbwtLen, numNodes, lineRate, offRate, ftabChars, eftabLen, entireReverse);
	// Skip plen
	index_t nPat = readIndex<index_t>(in, switchEndian);
	in.seekg(nPat*sizeof(index_t), ios_base::cur);
	// Skip rstarts (three words per fragment)
	index_t nFrag = readIndex<index_t>(in, switchEndian);
	in.seekg(nFrag*sizeof(index_t)*3, ios_base::cur);
	// Skip the gbwt itself
	in.seekg(gh._gbwtTotLen, ios_base::cur);
	// Skip zOffs
	index_t numZOffs = readIndex<index_t>(in, switchEndian);
	in.seekg(numZOffs * sizeof(index_t), ios_base::cur);
	// Skip fchr
	in.seekg(5 * sizeof(index_t), ios_base::cur);
	// Skip ftab
	in.seekg(gh._ftabLen*sizeof(index_t), ios_base::cur);
	// Skip eftab
	in.seekg(gh._eftabLen*sizeof(index_t), ios_base::cur);
	// Read reference sequence names, one character at a time
	while(true) {
		char c = '\0';
		if(!in.read(&c, 1)) break; // end of file (or read failure)
		if(c == '\0') break;
		else if(c == '\n') {
			refnames.push_back("");
		} else {
			if(refnames.size() == 0) {
				refnames.push_back("");
			}
			refnames.back().push_back(c);
		}
	}
	// A trailing '\n' leaves an empty final entry; drop it.  Guard
	// against the case where no name was read at all, in which case
	// there is no back() element to inspect.
	if(refnames.size() > 0 && refnames.back().empty()) {
		refnames.pop_back();
	}
	// Be kind: leave the stream usable and rewound
	in.clear(); in.seekg(0, ios::beg);
	assert(in.good());
}
/**
* Read reference names from the index with basename 'in' and store
* them in 'refnames'.
*/
template <typename index_t>
void readGFMRefnames(const string& instr, EList<string>& refnames) {
ifstream in;
// Initialize our primary and secondary input-stream fields
in.open((instr + ".1." + gfm_ext).c_str(), ios_base::in | ios::binary);
if(!in.is_open()) {
throw GFMFileOpenException("Cannot open file " + instr);
}
assert(in.is_open());
assert(in.good());
assert_eq((streamoff)in.tellg(), ios::beg);
readGFMRefnames<index_t>(in, refnames);
}
/**
 * Read just enough of the primary index file's header to recover the
 * index version and the flags word.
 *
 * @param instr          index basename; "<instr>.1.<gfm_ext>" is opened
 * @param major          receives the major index version
 * @param minor          receives the minor index version
 * @param extra_version  receives any extra version string
 * @return the raw flags word as stored in the header
 *
 * Throws GFMFileOpenException if the file cannot be opened.
 */
template <typename index_t>
int32_t GFM<index_t>::readVersionFlags(const string& instr, int& major, int& minor, string& extra_version) {
	const string primaryPath = instr + ".1." + gfm_ext;
	ifstream primary;
	primary.open(primaryPath.c_str(), ios_base::in | ios::binary);
	if(!primary.is_open()) {
		throw GFMFileOpenException("Cannot open file " + instr);
	}
	assert(primary.is_open());
	assert(primary.good());
	// Endianness hint: first word of the primary stream
	bool switchEndian = false;
	uint32_t hint = readU32(primary, switchEndian);
	if(hint != 1) {
		assert_eq((1u<<24), hint);
		assert_eq(1, endianSwapU32(hint));
		switchEndian = true;
	}
	// Version word, decoded into its components
	index_t version = readU32(primary, switchEndian);
	readIndexVersion(version, major, minor, extra_version);
	// Skip the header fields that precede the flags word:
	// three index-sized words (len, gbwtLen, numNodes) ...
	for(int k = 0; k < 3; k++) {
		readIndex<index_t>(primary, switchEndian);
	}
	// ... four 32-bit words (lineRate, linesPerSide, offRate, ftabChars) ...
	for(int k = 0; k < 4; k++) {
		readI32(primary, switchEndian);
	}
	// ... and one more index-sized word (eftabLen)
	readIndex<index_t>(primary, switchEndian);
	// The next 32-bit word holds the flags
	return readI32(primary, switchEndian);
}
/**
 * Write this in-memory index to a pair of output streams.
 *
 * The primary stream receives the fixed header and, unless
 * 'justHeader' is set, plen[], rstarts[], the gfm bytes, the zOffs,
 * fchr[], ftab[] and eftab[].  The secondary stream receives an
 * endianness hint and, unless 'justHeader' is set, offs[].  The value
 * of this->toBe() is passed as the endianness argument of every
 * writeI32()/writeIndex() call.
 *
 * @param justHeader if true, stop after the fixed header fields
 * @param out1 output stream to primary file
 * @param out2 output stream to secondary file
 */
template <typename index_t>
void GFM<index_t>::writeFromMemory(bool justHeader,
ostream& out1,
ostream& out2) const
{
const GFMParams<index_t>& gh = this->_gh;
assert(gh.repOk());
uint32_t be = this->toBe();
assert(out1.good());
assert(out2.good());
// When building an Ebwt, these header parameters are known
// "up-front", i.e., they can be written to disk immediately,
// before we join() or buildToDisk()
writeI32(out1, 1, be); // endian hint for primary stream
writeI32(out2, 1, be); // endian hint for secondary stream
int version = getIndexVersion();
writeI32(out1, version, be); // version
writeIndex<index_t>(out1, gh._len, be); // length of string (and bwt and suffix array)
// NOTE(review): gbwt length, number of nodes and eftab length are
// written as 0 placeholders below rather than the values held in
// 'gh' -- confirm whether a later step is expected to patch them.
writeIndex<index_t>(out1, 0, be); // dummy for gbwt len
writeIndex<index_t>(out1, 0, be); // dummy for number of nodes
writeI32(out1, gh._lineRate, be); // 2^lineRate = size in bytes of 1 line
writeI32(out1, 2, be); // not used
writeI32(out1, gh._offRate, be); // every 2^offRate chars is "marked"
writeI32(out1, gh._ftabChars, be); // number of 2-bit chars used to address ftab
writeIndex<index_t>(out1, 0, be); // eftab length
// The flags word is stored negated; bit 0 is always set so the
// stored value is always negative.
int32_t flags = 1;
if(gh._entireReverse) flags |= GFM_ENTIRE_REV;
writeI32(out1, -flags, be); // BTL: chunkRate is now deprecated
if(!justHeader) {
assert(rstarts() != NULL);
assert(offs() != NULL);
assert(ftab() != NULL);
assert(eftab() != NULL);
assert(isInMemory());
// These Ebwt parameters are known after the inputs strings have
// been joined() but before they have been built(). These can be
// written to the disk next and then discarded from memory.
writeIndex<index_t>(out1, this->_nPat, be);
for(index_t i = 0; i < this->_nPat; i++)
writeIndex<index_t>(out1, this->plen()[i], be);
assert_geq(this->_nFrag, this->_nPat);
writeIndex<index_t>(out1, this->_nFrag, be);
// rstarts holds three words per fragment
for(size_t i = 0; i < this->_nFrag*3; i++)
writeIndex<index_t>(out1, this->rstarts()[i], be);
// These Ebwt parameters are discovered only as the Ebwt is being
// built (in buildToDisk()). Of these, only 'offs' and 'ebwt' are
// terribly large. 'ebwt' is written to the primary file and then
// discarded from memory as it is built; 'offs' is similarly
// written to the secondary file and discarded.
// NOTE(review): a gbwtTotLen word is written immediately before the
// gfm bytes; the loader visible earlier in this file goes straight
// from rstarts to the gfm bytes without reading such a word --
// confirm that this layout matches the reader.
writeIndex<index_t>(out1, gh._gbwtTotLen, be);
out1.write((const char *)this->gfm(), gh._gbwtTotLen);
// Count of zOffs followed by the zOffs themselves
writeIndex<index_t>(out1, (index_t)_zOffs.size(), be);
for(index_t i = 0; i < _zOffs.size(); i++)
writeIndex<index_t>(out1, _zOffs[i], be);
// offs[] is the only payload that goes to the secondary stream
index_t offsLen = gh._offsLen;
for(index_t i = 0; i < offsLen; i++)
writeIndex<index_t>(out2, this->offs()[i], be);
// 'fchr', 'ftab' and 'eftab' are not fully determined until the
// loop is finished, so they are written to the primary file after
// all of 'ebwt' has already been written and only then discarded
// from memory.
for(int i = 0; i < 5; i++)
writeIndex<index_t>(out1, this->fchr()[i], be);
for(index_t i = 0; i < gh._ftabLen; i++)
writeIndex<index_t>(out1, this->ftab()[i], be);
for(index_t i = 0; i < gh._eftabLen; i++)
writeIndex<index_t>(out1, this->eftab()[i], be);
}
}
/**
 * Given a pair of strings representing output filenames, and assuming
 * this index is currently in memory, write it out to the specified
 * files by delegating to the stream-based writeFromMemory() overload.
 *
 * @param justHeader forwarded to the stream-based overload
 * @param out1 path of the primary output file
 * @param out2 path of the secondary output file
 *
 * The read-back sanity check that used to follow the write (reopen the
 * files, load them into a copy and compare every field) is currently
 * compiled out with '#if 0', so '_sanity' has no effect here.
 *
 * NOTE(review): the output streams are not checked after opening; the
 * stream overload only asserts out1.good()/out2.good(), which is a
 * debug-only check -- confirm whether an open failure should be
 * reported in release builds.
 */
template <typename index_t>
void GFM<index_t>::writeFromMemory(bool justHeader,
const string& out1,
const string& out2) const
{
// 'gh' is only needed by the assertions below
ASSERT_ONLY(const GFMParams<index_t>& gh = this->_gh);
assert(isInMemory());
assert(gh.repOk());
// Open both files in binary mode, write, and close them explicitly
ofstream fout1(out1.c_str(), ios::binary);
ofstream fout2(out2.c_str(), ios::binary);
writeFromMemory(justHeader, fout1, fout2);
fout1.close();
fout2.close();
// Read the file back in and assert that all components match
// (disabled: the block below still refers to the older Ebwt API)
if(_sanity) {
#if 0
if(_verbose)
cout << "Re-reading \"" << out1 << "\"/\"" << out2 << "\" for sanity check" << endl;
Ebwt copy(out1, out2, _verbose, _sanity);
assert(!isInMemory());
copy.loadIntoMemory(eh._color ? 1 : 0, true, false, false);
assert(isInMemory());
assert_eq(eh._lineRate, copy.eh()._lineRate);
assert_eq(eh._offRate, copy.eh()._offRate);
assert_eq(eh._ftabChars, copy.eh()._ftabChars);
assert_eq(eh._len, copy.eh()._len);
assert_eq(_zOff, copy.zOff());
assert_eq(_zEbwtBpOff, copy.zEbwtBpOff());
assert_eq(_zEbwtByteOff, copy.zEbwtByteOff());
assert_eq(_nPat, copy.nPat());
for(index_t i = 0; i < _nPat; i++)
assert_eq(this->_plen[i], copy.plen()[i]);
assert_eq(this->_nFrag, copy.nFrag());
for(size_t i = 0; i < this->nFrag*3; i++) {
assert_eq(this->_rstarts[i], copy.rstarts()[i]);
}
for(index_t i = 0; i < 5; i++)
assert_eq(this->_fchr[i], copy.fchr()[i]);
for(size_t i = 0; i < eh._ftabLen; i++)
assert_eq(this->ftab()[i], copy.ftab()[i]);
for(size_t i = 0; i < eh._eftabLen; i++)
assert_eq(this->eftab()[i], copy.eftab()[i]);
for(index_t i = 0; i < eh._offsLen; i++)
assert_eq(this->_offs[i], copy.offs()[i]);
for(index_t i = 0; i < eh._ebwtTotLen; i++)
assert_eq(this->ebwt()[i], copy.ebwt()[i]);
copy.sanityCheckAll();
if(_verbose)
cout << "Read-in check passed for \"" << out1 << "\"/\"" << out2 << "\"" << endl;
#endif
}
}
/**
 * Emit the rstarts table for the reference described by 'szs'.
 *
 * For every record with a non-zero length, three index-sized words are
 * written to 'os': the record's offset in the joined string, the id of
 * the sequence it belongs to, and its offset within that sequence.
 * When 'reverse' equals REF_READ_REVERSE, the sequence id and the
 * within-sequence offset are mirrored using _nPat and plen().
 *
 * @param szs      reference records, in order
 * @param os       destination stream
 * @param reverse  REF_READ_REVERSE to mirror ids/offsets
 */
template <typename index_t>
void GFM<index_t>::szsToDisk(const EList<RefRecord>& szs, ostream& os, int reverse) {
	size_t  seqsStarted = 0; // number of sequences begun so far
	index_t seqOff      = 0; // running offset inside the current sequence
	index_t joinedOff   = 0; // running offset inside the joined string
	const size_t nRecs = szs.size();
	for(size_t r = 0; r < nRecs; r++) {
		const RefRecord& rec = szs[r];
		// Empty records contribute nothing
		if(rec.len == 0) {
			continue;
		}
		// A record flagged 'first' opens a new sequence
		if(rec.first) {
			seqOff = 0;
			if(rec.len > 0) {
				seqsStarted++;
			}
		}
		// Account for the gap that precedes this fragment
		seqOff += rec.off;
		index_t seqIdx = (index_t)(seqsStarted-1);
		assert_lt(seqIdx, _nPat);
		index_t fragOff = seqOff;
		if(reverse == REF_READ_REVERSE) {
			// Mirror the sequence index ...
			seqIdx = _nPat - seqIdx - 1;
			// ... and the fragment's offset within that sequence
			assert_leq(seqOff + rec.len, plen()[seqIdx]);
			fragOff = plen()[seqIdx] - (seqOff + rec.len);
		}
		writeIndex<index_t>(os, joinedOff, this->toBe());        // offset from beginning of joined string
		writeIndex<index_t>(os, (index_t)seqIdx, this->toBe());  // sequence id
		writeIndex<index_t>(os, (index_t)fragOff, this->toBe()); // offset into sequence
		joinedOff += rec.len;
		seqOff    += rec.len;
	}
}
///////////////////////////////////////////////////////////////////////
//
// Functions for printing and sanity-checking Ebwts
//
///////////////////////////////////////////////////////////////////////
/**
 * Walk the gfm array side by side, up to (and not including) side
 * 'upToSide', tallying every 2-bit character, and compare the running
 * tallies against the four cumulative counts stored after each side's
 * packed characters.  All comparisons are assertions, so this is a
 * no-op check in builds where assertions are compiled out.
 */
template <typename index_t>
void GFM<index_t>::sanityCheckUpToSide(int upToSide) const {
	assert(isInMemory());
	const GFMParams<index_t>& gh = this->_gh;
	index_t tally[] = {0, 0, 0, 0};                 // running counts per 2-bit char
	ASSERT_ONLY(index_t tallyPrev[] = {0, 0, 0, 0}); // counts as of the start of the side
	bool fw = false;
	// 'cur' is a byte offset into the gfm array, advanced one side at a time
	for(index_t cur = 0; cur < (upToSide * gh._sideSz); cur += gh._sideSz) {
		assert_leq(cur + gh._sideSz, gh._gbwtTotLen);
		for(index_t i = 0; i < gh._sideGbwtSz; i++) {
			uint8_t by = this->gfm()[cur + (fw ? i : gh._sideGbwtSz-i-1)];
			// Four 2-bit characters per byte
			for(int j = 0; j < 4; j++) {
				int twoBit = unpack_2b_from_8b(by, fw ? j : 3-j);
				tally[twoBit]++;
			}
			assert_eq(0, (tally[0] + tally[1] + tally[2] + tally[3]) % 4);
		}
		assert_eq(0, (tally[0] + tally[1] + tally[2] + tally[3]) % gh._sideGbwtLen);
		// The four index_t words after the packed characters hold the
		// counts recorded for this side; compare them with the tallies
		// saved before this side was scanned.
		ASSERT_ONLY(const index_t *stored = reinterpret_cast<const index_t*>(&gfm()[cur + gh._sideGbwtSz]));
		ASSERT_ONLY(index_t storedA = stored[0]);
		ASSERT_ONLY(index_t storedC = stored[1]);
		ASSERT_ONLY(index_t storedG = stored[2]);
		ASSERT_ONLY(index_t storedT = stored[3]);
		assert(storedA == tallyPrev[0] || storedA == tallyPrev[0]-1);
		assert_eq(storedC, tallyPrev[1]);
		assert_eq(storedG, tallyPrev[2]);
		assert_eq(storedT, tallyPrev[3]);
#ifndef NDEBUG
		for(int c = 0; c < 4; c++) {
			tallyPrev[c] = tally[c];
		}
#endif
	}
}
/**
 * Sanity-check various pieces of the in-memory index: ftab ordering,
 * uniqueness and range of the sampled offsets, nPat/plen, rstarts
 * ordering, and finally the gfm array itself via
 * sanityCheckUpToSide().
 *
 * Most checks are assertions and therefore vanish when assertions are
 * compiled out; the bit-vector bookkeeping for offs[] still runs.
 *
 * @param reverse  REF_READ_REVERSE if sequence ids in rstarts are
 *                 expected to be non-increasing rather than
 *                 non-decreasing
 */
template <typename index_t>
void GFM<index_t>::sanityCheckAll(int reverse) const {
	const GFMParams<index_t>& gh = this->_gh;
	assert(isInMemory());
	// Check ftab
	for(index_t i = 1; i < gh._ftabLen; i++) {
		assert_geq(this->ftabHi(i), this->ftabLo(i-1));
		assert_geq(this->ftabLo(i), this->ftabHi(i-1));
		assert_leq(this->ftabHi(i), gh._gbwtLen);
	}
	assert_eq(this->ftabHi(gh._ftabLen-1), gh._gbwtLen);
	// Check offs: every sampled offset must be in range and unique.
	// One bit per gbwt row, packed into 32-bit words.  The word count
	// and word index are kept in size_t/index_t so that they are not
	// truncated to 'int' for large indexes.
	const size_t seenLen =
		(size_t)(gh._gbwtLen >> 5) + ((gh._gbwtLen & 31) != 0 ? 1 : 0);
	uint32_t *seen;
	try {
		seen = new uint32_t[seenLen]; // bitvector marking seen offsets
	} catch(bad_alloc&) {
		cerr << "Out of memory allocating seen[] at " << __FILE__ << ":" << __LINE__ << endl;
		throw; // rethrow the original exception object
	}
	memset(seen, 0, seenLen * sizeof(uint32_t));
	index_t offsLen = gh._offsLen;
	for(index_t i = 0; i < offsLen; i++) {
		const index_t off = this->offs()[i];
		assert_lt(off, gh._gbwtLen);
		const size_t   w = (size_t)(off >> 5);
		const unsigned r = (unsigned)(off & 31);
		assert_eq(0, (seen[w] >> r) & 1); // shouldn't have been seen before
		// Unsigned literal: shifting a signed 1 by 31 would hit the sign bit
		seen[w] |= ((uint32_t)1 << r);
	}
	delete[] seen;
	// Check nPat
	assert_gt(this->_nPat, 0);
	// Check plen, flen
	for(index_t i = 0; i < this->_nPat; i++) {
		assert_geq(this->plen()[i], 0);
	}
	// Check rstarts: joined-string offsets strictly increase and
	// sequence ids are monotone.  'i + 1 < _nFrag' avoids the unsigned
	// wrap-around that '_nFrag - 1' would produce when _nFrag is 0.
	if(this->rstarts() != NULL) {
		for(index_t i = 0; i + 1 < this->_nFrag; i++) {
			assert_gt(this->rstarts()[(i+1)*3], this->rstarts()[i*3]);
			if(reverse == REF_READ_REVERSE) {
				assert(this->rstarts()[(i*3)+1] >= this->rstarts()[((i+1)*3)+1]);
			} else {
				assert(this->rstarts()[(i*3)+1] <= this->rstarts()[((i+1)*3)+1]);
			}
		}
	}
	// Check ebwt
	sanityCheckUpToSide(gh._numSides);
	VMSG_NL("Ebwt::sanityCheck passed");
}
/**
 * Transform this index back into the original string in linear time by
 * using the LF mapping to walk backwards, starting at the row
 * corresponding to the end of the string.  The result is written to
 * 's', which is resized to _gh._len.  The index must be in memory.
 *
 * The walk stops when the current row is one of the rows recorded in
 * _zOffs.
 */
template <typename index_t>
void GFM<index_t>::restore(SString<char>& s) const {
	assert(isInMemory());
	s.resize(this->_gh._len);
	index_t jumps = 0;
	index_t i = this->_gh._len; // should point to final SA elt (starting with '$')
	SideLocus<index_t> l(i, this->_gh, this->gfm());
	while(true) {
		// Have we arrived at one of the zOff rows?  The flag is needed
		// because a 'break' inside the scan below only leaves the scan,
		// not the enclosing walk.
		bool atZOff = false;
		for(index_t j = 0; j < _zOffs.size(); j++) {
			if(i == _zOffs[j]) {
				atZOff = true;
				break;
			}
		}
		if(atZOff) break;
		assert_lt(jumps, this->_gh._len);
		// With assertions compiled out, still refuse to write before
		// s[0] if no zOff row is reached within _len steps.
		if(jumps >= this->_gh._len) break;
		//if(_verbose) cout << "restore: i: " << i << endl;
		// Not a marked row; go back a char in the original string
		index_t newi = mapLF(l ASSERT_ONLY(, false));
		assert_neq(newi, i);
		s[this->_gh._len - jumps - 1] = rowL(l);
		i = newi;
		l.initFromRow(i, this->_gh, this->gfm());
		jumps++;
	}
	assert_eq(jumps, this->_gh._len);
}
/**
 * Restore the text from this index via restore() and assert that it
 * matches the given reference sequences 'os', ignoring positions whose
 * value is 4 (Ns), which are not present in the restored text.  For
 * sanity checking only.
 *
 * The mirrored comparison is not implemented: when 'mirror' is set the
 * function returns right after restoring, without comparing anything.
 */
template <typename index_t>
void GFM<index_t>::checkOrigs(
	const EList<SString<char> >& os,
	bool mirror) const
{
	SString<char> restored;
	restore(restored);
	if(mirror) {
		// TODO: FIXME
		return;
	}
	// Position of the next unchecked character in the restored text
	index_t restoredPos = 0;
	const size_t nSeqs = os.size();
	for(size_t seq = 0; seq < nSeqs; seq++) {
		const size_t seqLen = os[seq].length();
		for(size_t pos = 0; pos < seqLen; pos++) {
			// Skip over Ns
			if((int)os[seq][pos] == 4) {
				continue;
			}
			assert_eq(os[seq][pos], restored[restoredPos]);
			restoredPos++;
		}
	}
}
/**
 * Try to find the Bowtie index specified by the user. First try the
 * exact path given by the user. Then try the user-provided string
 * appended onto the path of the "indexes" subdirectory below this
 * executable, then try the provided string appended onto
 * "$HISAT2_INDEXES/".
 *
 * Declaration only; the definition lives outside this header.
 *
 * @param cmdline       presumably the command used to invoke the
 *                      executable, from which its directory is derived
 *                      -- confirm against the definition
 * @param ebwtFileBase  the index basename as given by the user
 * @param verbose       defaults to false; presumably enables progress
 *                      output while searching -- confirm
 * @return presumably the adjusted index basename -- confirm against
 *         the definition
 */
string adjustEbwtBase(const string& cmdline,
const string& ebwtFileBase,
bool verbose = false);
#endif /*GFM_H_*/