/*
 * Copyright 2015, Daehwan Kim <infphilo@gmail.com>
 *
 * This file is part of HISAT 2.
 *
 * HISAT 2 is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HISAT 2 is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HISAT 2. If not, see <http://www.gnu.org/licenses/>.
 */

#ifndef GFM_H_
#define GFM_H_

#include <stdint.h>
#include <string.h>
#include <iostream>
#include <fstream>
#include <sstream>
#include <memory>
#include <fcntl.h>
#include <math.h>
#include <errno.h>
#include <set>
#include <stdexcept>
#include <sys/stat.h>
#ifdef BOWTIE_MM
#include <sys/mman.h>
#include <sys/shm.h>
#endif
#include "shmem.h"
#include "alphabet.h"
#include "assert_helpers.h"
#include "bitpack.h"
#include "blockwise_sa.h"
#include "endian_swap.h"
#include "word_io.h"
#include "random_source.h"
#include "ref_read.h"
#include "threading.h"
#include "str_util.h"
#include "mm.h"
#include "timer.h"
#include "reference.h"
#include "search_globals.h"
#include "ds.h"
#include "random_source.h"
#include "mem_ids.h"
#include "btypes.h"
#include "tokenize.h"
#include "repeat.h"
#include "repeat_kmer.h"

#ifdef POPCNT_CAPABILITY
#include "processor_support.h"
#endif

#include "gbwt_graph.h"

using namespace std;

// From ccnt_lut.cpp, automatically generated by gen_lookup_tables.pl
extern uint8_t cCntLUT_4[4][4][256];
extern uint8_t cCntLUT_4_rev[4][4][256];
extern uint8_t cCntBIT[8][256];

extern bool threeN;

static const uint64_t c_table[4] = {
    0xffffffffffffffff,
    0xaaaaaaaaaaaaaaaa,
    0x5555555555555555,
    0x0000000000000000
};

#ifndef VMSG_NL
#define VMSG_NL(...) \
    if(this->verbose()) { \
        stringstream tmp; \
        tmp << __VA_ARGS__ << endl; \
        this->verbose(tmp.str()); \
    }
#endif

#ifndef VMSG
#define VMSG(...) \
    if(this->verbose()) { \
        stringstream tmp; \
        tmp << __VA_ARGS__; \
        this->verbose(tmp.str()); \
    }
#endif
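
// These macros assume they are expanded inside a member function of a class that
// exposes both a bool verbose() query and a verbose(const string&) sink, as the
// GFM class below is expected to. Typical use (variable names illustrative):
//
//   VMSG_NL("Wrote " << bytes << " bytes to primary GFM file: " << fname);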

/**
 * Flags describing type of Ebwt.
 */
enum GFM_FLAGS {
    GFM_ENTIRE_REV = 4 // true -> reverse Ebwt is the whole
                       // concatenated string reversed, rather than
                       // each stretch reversed
};

/**
 * Extended Burrows-Wheeler transform header. This together with the
 * actual data arrays and other text-specific parameters defined in
 * class Ebwt constitute the entire Ebwt.
 */
template <typename index_t = uint32_t>
class GFMParams {

public:
    GFMParams() { }

    GFMParams(
        index_t len,
        index_t gbwtLen,
        index_t numNodes,
        int32_t lineRate,
        int32_t offRate,
        int32_t ftabChars,
        index_t eftabLen,
        bool entireReverse)
    {
        init(len, gbwtLen, numNodes, lineRate, offRate, ftabChars, eftabLen, entireReverse);
    }

    GFMParams(const GFMParams& gh) {
        init(gh._len, gh._gbwtLen, gh._numNodes, gh._lineRate, gh._offRate,
             gh._ftabChars, gh._eftabLen, gh._entireReverse);
    }

    void init(
        index_t len,
        index_t gbwtLen,
        index_t numNodes,
        int32_t lineRate,
        int32_t offRate,
        int32_t ftabChars,
        index_t eftabLen,
        bool entireReverse)
    {
        _entireReverse = entireReverse;
        _linearFM = (len + 1 == gbwtLen || gbwtLen == 0);
        _len = len;
        _gbwtLen = (gbwtLen == 0 ? len + 1 : gbwtLen);
        _numNodes = (numNodes == 0 ? len + 1 : numNodes);
        if(_linearFM) {
            _sz = (len+3)/4;
            _gbwtSz = _gbwtLen/4 + 1;
        } else {
            _sz = (len+1)/2;
            _gbwtSz = _gbwtLen/2 + 1;
        }
        _lineRate = lineRate;
        _origOffRate = offRate;
        _offRate = offRate;
        _offMask = std::numeric_limits<index_t>::max() << _offRate;
        _ftabChars = ftabChars;
        _eftabLen = eftabLen;
        _eftabSz = _eftabLen*sizeof(index_t);
        _ftabLen = (1 << (_ftabChars*2))+1;
        _ftabSz = _ftabLen*sizeof(index_t);
        _offsLen = (_numNodes + (1 << _offRate) - 1) >> _offRate;
        _offsSz = _offsLen*sizeof(index_t);
        _lineSz = 1 << _lineRate;
        _sideSz = _lineSz * 1 /* lines per side */;
        if(_linearFM) {
            _sideGbwtSz = _sideSz - (sizeof(index_t) * 4);
            _sideGbwtLen = _sideGbwtSz << 2;
        } else {
            _sideGbwtSz = _sideSz - (sizeof(index_t) * 6);
            _sideGbwtLen = _sideGbwtSz << 1;
        }
        _numSides = (_gbwtSz+(_sideGbwtSz)-1)/(_sideGbwtSz);
        _numLines = _numSides * 1 /* lines per side */;
        _gbwtTotLen = _numSides * _sideSz;
        _gbwtTotSz = _gbwtTotLen;
        assert(repOk());
    }

    index_t len() const { return _len; }
    index_t lenNucs() const { return _len; }
    index_t gbwtLen() const { return _gbwtLen; }
    index_t sz() const { return _sz; }
    index_t gbwtSz() const { return _gbwtSz; }
    int32_t lineRate() const { return _lineRate; }
    int32_t origOffRate() const { return _origOffRate; }
    int32_t offRate() const { return _offRate; }
    index_t offMask() const { return _offMask; }
    int32_t ftabChars() const { return _ftabChars; }
    index_t eftabLen() const { return _eftabLen; }
    index_t eftabSz() const { return _eftabSz; }
    index_t ftabLen() const { return _ftabLen; }
    index_t ftabSz() const { return _ftabSz; }
    index_t offsLen() const { return _offsLen; }
    index_t offsSz() const { return _offsSz; }
    index_t lineSz() const { return _lineSz; }
    index_t sideSz() const { return _sideSz; }
    index_t sideGbtSz() const { return _sideGbwtSz; }
    index_t sideGbwtLen() const { return _sideGbwtLen; }
    index_t numSides() const { return _numSides; }
    index_t numLines() const { return _numLines; }
    index_t gbwtTotLen() const { return _gbwtTotLen; }
    index_t gbwtTotSz() const { return _gbwtTotSz; }
    bool entireReverse() const { return _entireReverse; }
    bool linearFM() const { return _linearFM; }
    index_t numNodes() const { return _numNodes; }

    /**
     * Set a new suffix-array sampling rate, which involves updating
     * rate, mask, sample length, and sample size.
     */
    void setOffRate(int __offRate) {
        _offRate = __offRate;
        _offMask = std::numeric_limits<index_t>::max() << _offRate;
        _offsLen = (_gbwtLen + (1 << _offRate) - 1) >> _offRate;
        _offsSz = _offsLen * sizeof(index_t);
    }

#ifndef NDEBUG
    /// Check that this EbwtParams is internally consistent
    bool repOk() const {
        // assert_gt(_len, 0);
        assert_gt(_lineRate, 3);
        assert_geq(_offRate, 0);
        assert_leq(_ftabChars, 16);
        assert_geq(_ftabChars, 1);
        assert_lt(_lineRate, 32);
        assert_lt(_ftabChars, 32);
        assert_eq(0, _gbwtTotSz % _lineSz);
        return true;
    }
#endif

    /**
     * Pretty-print the header contents to the given output stream.
     */
    void print(ostream& out) const {
        out << "Headers:" << endl
            << " len: " << _len << endl
            << " gbwtLen: " << _gbwtLen << endl
            << " nodes: " << _numNodes << endl
            << " sz: " << _sz << endl
            << " gbwtSz: " << _gbwtSz << endl
            << " lineRate: " << _lineRate << endl
            << " offRate: " << _offRate << endl
            << " offMask: 0x" << hex << _offMask << dec << endl
            << " ftabChars: " << _ftabChars << endl
            << " eftabLen: " << _eftabLen << endl
            << " eftabSz: " << _eftabSz << endl
            << " ftabLen: " << _ftabLen << endl
            << " ftabSz: " << _ftabSz << endl
            << " offsLen: " << _offsLen << endl
            << " offsSz: " << _offsSz << endl
            << " lineSz: " << _lineSz << endl
            << " sideSz: " << _sideSz << endl
            << " sideGbwtSz: " << _sideGbwtSz << endl
            << " sideGbwtLen: " << _sideGbwtLen << endl
            << " numSides: " << _numSides << endl
            << " numLines: " << _numLines << endl
            << " gbwtTotLen: " << _gbwtTotLen << endl
            << " gbwtTotSz: " << _gbwtTotSz << endl
            << " reverse: " << _entireReverse << endl
            << " linearFM: " << (_linearFM ? "Yes" : "No") << endl;
    }

    index_t _len;
    index_t _gbwtLen;
    index_t _sz;
    index_t _gbwtSz;
    int32_t _lineRate;
    int32_t _origOffRate;
    int32_t _offRate;
    index_t _offMask;
    int32_t _ftabChars;
    index_t _eftabLen;
    index_t _eftabSz;
    index_t _ftabLen;
    index_t _ftabSz;
    index_t _offsLen;
    index_t _offsSz;
    index_t _lineSz;
    index_t _sideSz;
    index_t _sideGbwtSz;
    index_t _sideGbwtLen;
    index_t _numSides;
    index_t _numLines;
    index_t _gbwtTotLen;
    index_t _gbwtTotSz;
    bool _entireReverse;
    bool _linearFM;
    index_t _numNodes;
};
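
// A sketch of how the derived layout fields follow from the basic parameters.
// The helper and all numbers are purely illustrative (a hypothetical 1 Mbp
// linear-FM index with index_t = uint32_t; the lineRate/offRate/ftabChars
// values here are examples, not asserted defaults), and the block is kept out
// of the build with #if 0.
#if 0
static void gfmParamsExample() {
    GFMParams<uint32_t> gp(
        1000000, // len: joined reference length
        0,       // gbwtLen == 0 -> linear FM; _gbwtLen becomes len + 1
        0,       // numNodes == 0 -> _numNodes becomes len + 1
        7,       // lineRate: 2^7 = 128-byte sides
        4,       // offRate: suffix-array rows sampled every 2^4 = 16
        10,      // ftabChars: ftab holds 4^10 + 1 = 1,048,577 entries
        0,       // eftabLen
        false);  // entireReverse
    // Per init() above:
    //   sideSz      = 128 bytes, sideGbwtSz = 128 - 4*sizeof(uint32_t) = 112
    //   sideGbwtLen = 112 * 4 = 448 packed characters per side
    //   gbwtSz      = 1000001/4 + 1 = 250001 bytes
    //   numSides    = ceil(250001/112) = 2233, gbwtTotSz = 2233 * 128 = 285824
    //   offsLen     = ceil(1000001/16) = 62501 sampled offsets
    gp.print(std::cout);
}
#endif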
|
||
|
|
||
|
/**
|
||
|
* Exception to throw when a file-related error occurs.
|
||
|
*/
|
||
|
class GFMFileOpenException : public std::runtime_error {
|
||
|
public:
|
||
|
GFMFileOpenException(const std::string& msg = "") :
|
||
|
std::runtime_error(msg) { }
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* Calculate size of file with given name.
|
||
|
*/
|
||
|
static inline int64_t fileSize(const char* name) {
|
||
|
std::ifstream f;
|
||
|
f.open(name, std::ios_base::binary | std::ios_base::in);
|
||
|
if (!f.good() || f.eof() || !f.is_open()) { return 0; }
|
||
|
f.seekg(0, std::ios_base::beg);
|
||
|
std::ifstream::pos_type begin_pos = f.tellg();
|
||
|
f.seekg(0, std::ios_base::end);
|
||
|
return static_cast<int64_t>(f.tellg() - begin_pos);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Encapsulates a location in the gbwt text in terms of the side it
|
||
|
* occurs in and its offset within the side.
|
||
|
*/
|
||
|
template <typename index_t = uint32_t>
|
||
|
struct SideLocus {
|
||
|
SideLocus() :
|
||
|
_sideByteOff(0),
|
||
|
_sideNum(0),
|
||
|
_charOff(0),
|
||
|
_by(-1),
|
||
|
_bp(-1) { }
|
||
|
|
||
|
/**
|
||
|
* Construct from row and other relevant information about the Ebwt.
|
||
|
*/
|
||
|
SideLocus(index_t row, const GFMParams<index_t>& ep, const uint8_t* ebwt) {
|
||
|
initFromRow(row, ep, ebwt);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Init two SideLocus objects from a top/bot pair, using the result
|
||
|
* from one call to initFromRow to possibly avoid a second call.
|
||
|
*/
|
||
|
static void initFromTopBot(
|
||
|
index_t top,
|
||
|
index_t bot,
|
||
|
const GFMParams<index_t>& gp,
|
||
|
const uint8_t* gfm,
|
||
|
SideLocus& ltop,
|
||
|
SideLocus& lbot)
|
||
|
{
|
||
|
const index_t sideGbwtLen = gp._sideGbwtLen;
|
||
|
assert_gt(bot, top);
|
||
|
ltop.initFromRow(top, gp, gfm);
|
||
|
index_t spread = bot - top;
|
||
|
// Many cache misses on the following lines
|
||
|
if(ltop._charOff + spread < sideGbwtLen) {
|
||
|
lbot._charOff = ltop._charOff + spread;
|
||
|
lbot._sideNum = ltop._sideNum;
|
||
|
lbot._sideByteOff = ltop._sideByteOff;
|
||
|
lbot._by = lbot._charOff >> 2;
|
||
|
assert_lt(lbot._by, (int)gp._sideGbwtSz);
|
||
|
lbot._bp = lbot._charOff & 0x3;
|
||
|
} else {
|
||
|
lbot.initFromRow(bot, gp, gfm);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Calculate SideLocus based on a row and other relevant
|
||
|
* information about the shape of the Ebwt.
|
||
|
*/
|
||
|
void initFromRow(
|
||
|
index_t row,
|
||
|
const GFMParams<index_t>& gp,
|
||
|
const uint8_t* gfm) {
|
||
|
const index_t sideSz = gp._sideSz;
|
||
|
// Side length is hard-coded for now; this allows the compiler
|
||
|
// to do clever things to accelerate / and %.
|
||
|
_sideNum = row / gp._sideGbwtLen;
|
||
|
assert_lt(_sideNum, gp._numSides);
|
||
|
_charOff = row % gp._sideGbwtLen;
|
||
|
_sideByteOff = _sideNum * sideSz;
|
||
|
assert_leq(row, gp._gbwtLen);
|
||
|
assert_leq(_sideByteOff + sideSz, gp._gbwtTotSz);
|
||
|
// Tons of cache misses on the next line
|
||
|
_by = _charOff >> 2; // byte within side
|
||
|
assert_lt(_by, (int)gp._sideGbwtSz);
|
||
|
_bp = _charOff & 0x3; // bit-pair within byte
|
||
|
}
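// Worked example: with 128-byte sides and a linear FM over uint32_t
// (sideSz = 128, sideGbwtSz = 112, sideGbwtLen = 448; an illustrative
// configuration, not a required default), row 1000 yields
//   _sideNum     = 1000 / 448 = 2
//   _charOff     = 1000 % 448 = 104
//   _sideByteOff = 2 * 128   = 256
//   _by          = 104 >> 2  = 26,  _bp = 104 & 0x3 = 0
// and toBWRow() below recovers the row: 2 * 448 + 104 = 1000. The same-side
// shortcut in initFromTopBot() applies whenever _charOff + (bot - top) stays
// below sideGbwtLen.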
|
||
|
|
||
|
/**
|
||
|
* Init two SideLocus objects from a top/bot pair, using the result
|
||
|
* from one call to initFromRow to possibly avoid a second call.
|
||
|
*/
|
||
|
static void initFromTopBot_bit(
|
||
|
index_t top,
|
||
|
index_t bot,
|
||
|
const GFMParams<index_t>& gp,
|
||
|
const uint8_t* gfm,
|
||
|
SideLocus& ltop,
|
||
|
SideLocus& lbot)
|
||
|
{
|
||
|
const index_t sideGbwtLen = gp._sideGbwtLen;
|
||
|
// assert_gt(bot, top);
|
||
|
ltop.initFromRow_bit(top, gp, gfm);
|
||
|
index_t spread = bot - top;
|
||
|
// Many cache misses on the following lines
|
||
|
if(ltop._charOff + spread < sideGbwtLen) {
|
||
|
lbot._charOff = ltop._charOff + spread;
|
||
|
lbot._sideNum = ltop._sideNum;
|
||
|
lbot._sideByteOff = ltop._sideByteOff;
|
||
|
lbot._by = lbot._charOff >> 3;
|
||
|
assert_lt(lbot._by, (int)gp._sideGbwtSz);
|
||
|
lbot._bp = lbot._charOff & 0x7;
|
||
|
} else {
|
||
|
lbot.initFromRow_bit(bot, gp, gfm);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Calculate SideLocus based on a row and other relevant
|
||
|
* information about the shape of the Ebwt.
|
||
|
*/
|
||
|
void initFromRow_bit(
|
||
|
index_t row,
|
||
|
const GFMParams<index_t>& gp,
|
||
|
const uint8_t* gfm) {
|
||
|
const index_t sideSz = gp._sideSz;
|
||
|
// Side length is hard-coded for now; this allows the compiler
|
||
|
// to do clever things to accelerate / and %.
|
||
|
_sideNum = row / gp._sideGbwtLen;
|
||
|
assert_lt(_sideNum, gp._numSides);
|
||
|
_charOff = row % gp._sideGbwtLen;
|
||
|
_sideByteOff = _sideNum * sideSz;
|
||
|
assert_lt(row, gp._gbwtLen);
|
||
|
assert_leq(_sideByteOff + sideSz, gp._gbwtTotSz);
|
||
|
// Tons of cache misses on the next line
|
||
|
_by = _charOff >> 3; // byte within side
|
||
|
assert_lt(_by, (int)gp._sideGbwtSz);
|
||
|
_bp = _charOff & 0x7; // bit within byte
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Transform this SideLocus to refer to the next side (i.e. the one
|
||
|
* corresponding to the next side downstream). Set all cursors to
|
||
|
* point to the beginning of the side.
|
||
|
*/
|
||
|
void nextSide(const GFMParams<index_t>& gp) {
|
||
|
assert(valid());
|
||
|
_sideByteOff += gp.sideSz();
|
||
|
_sideNum++;
|
||
|
_by = _bp = _charOff = 0;
|
||
|
assert(valid());
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Return true iff this is an initialized SideLocus
|
||
|
*/
|
||
|
bool valid() const {
|
||
|
if(_bp != -1) {
|
||
|
return true;
|
||
|
}
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Convert locus to BW row it corresponds to.
|
||
|
*/
|
||
|
index_t toBWRow(const GFMParams<index_t>& gp) const;
|
||
|
|
||
|
#ifndef NDEBUG
|
||
|
/**
|
||
|
* Check that SideLocus is internally consistent and consistent
|
||
|
* with the (provided) EbwtParams.
|
||
|
*/
|
||
|
bool repOk(const GFMParams<index_t>& gp) const {
|
||
|
ASSERT_ONLY(index_t row = toBWRow(gp));
|
||
|
assert_leq(row, gp._gbwtLen);
|
||
|
assert_range(-1, 3, _bp);
|
||
|
assert_range(0, (int)gp._sideGbwtSz, _by);
|
||
|
return true;
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
/// Make this look like an invalid SideLocus
|
||
|
void invalidate() {
|
||
|
_bp = -1;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Return a read-only pointer to the beginning of the top side.
|
||
|
*/
|
||
|
const uint8_t *side(const uint8_t* gbwt) const {
|
||
|
return gbwt + _sideByteOff;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Return a read-only pointer to the beginning of the top side.
|
||
|
*/
|
||
|
const uint8_t *next_side(const GFMParams<index_t>& gp, const uint8_t* gbwt) const {
|
||
|
if(_sideByteOff + gp._sideSz < gp._gbwtTotSz) { // GFMParams defines _gbwtTotSz, not _ebwtTotSz
|
||
|
return gbwt + _sideByteOff + gp._sideSz;
|
||
|
} else {
|
||
|
return NULL;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
index_t _sideByteOff; // offset of top side within ebwt[]
|
||
|
index_t _sideNum; // index of side
|
||
|
index_t _charOff; // character offset within side
|
||
|
int32_t _by; // byte within side (not adjusted for bw sides)
|
||
|
int32_t _bp; // bitpair within byte (not adjusted for bw sides)
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* Convert locus to BW row it corresponds to.
|
||
|
*/
|
||
|
template <typename index_t>
|
||
|
inline index_t SideLocus<index_t>::toBWRow(const GFMParams<index_t>& gp) const {
|
||
|
return _sideNum * (gp._sideGbwtSz << (gp.linearFM() ? 2 : 1)) + _charOff;
|
||
|
}
|
||
|
|
||
|
#ifdef POPCNT_CAPABILITY // wrapping of "struct"
|
||
|
struct USE_POPCNT_GENERIC {
|
||
|
#endif
|
||
|
// Use this standard bit-bashing population count
|
||
|
inline static int pop64(uint64_t x) {
|
||
|
// Lots of cache misses on following lines (>10K)
|
||
|
x = x - ((x >> 1) & 0x5555555555555555llu);
|
||
|
x = (x & 0x3333333333333333llu) + ((x >> 2) & 0x3333333333333333llu);
|
||
|
x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0Fllu;
|
||
|
x = x + (x >> 8);
|
||
|
x = x + (x >> 16);
|
||
|
x = x + (x >> 32);
|
||
|
return (int)(x & 0x3Fllu);
|
||
|
}
|
||
|
#ifdef POPCNT_CAPABILITY // wrapping a "struct"
|
||
|
};
|
||
|
#endif
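
// Note on the generic pop64 above: the first three lines are the classic SWAR
// reduction (pairwise 1-bit sums, then 2-bit sums, then per-byte sums) and the
// remaining shifts fold the eight byte counts into the low 6 bits, so for
// example pop64(0xF0F0F0F0F0F0F0F0llu) == 32.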
|
||
|
|
||
|
#ifdef POPCNT_CAPABILITY
|
||
|
struct USE_POPCNT_INSTRUCTION {
|
||
|
inline static int pop64(uint64_t x) {
|
||
|
int64_t count;
|
||
|
#ifdef USING_MSC_COMPILER
|
||
|
count = __popcnt64(x);
|
||
|
#else
|
||
|
asm ("popcntq %[x],%[count]\n": [count] "=&r" (count): [x] "r" (x));
|
||
|
#endif
|
||
|
return (int)count;
|
||
|
}
|
||
|
};
|
||
|
#endif
|
||
|
|
||
|
/**
|
||
|
* Tricky-bit-bashing bitpair counting for given two-bit value (0-3)
|
||
|
* within a 64-bit argument.
|
||
|
*/
|
||
|
#ifdef POPCNT_CAPABILITY
|
||
|
template<typename Operation>
|
||
|
#endif
|
||
|
inline static int countInU64(int c, uint64_t dw) {
|
||
|
uint64_t c0 = c_table[c];
|
||
|
uint64_t x0 = dw ^ c0;
|
||
|
uint64_t x1 = (x0 >> 1);
|
||
|
uint64_t x2 = x1 & (0x5555555555555555);
|
||
|
uint64_t x3 = x0 & x2;
|
||
|
#ifdef POPCNT_CAPABILITY
|
||
|
uint64_t tmp = Operation().pop64(x3);
|
||
|
#else
|
||
|
uint64_t tmp = pop64(x3);
|
||
|
#endif
|
||
|
return (int) tmp;
|
||
|
}
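
// Why the XOR against c_table works: c_table[c] is the bitwise complement of the
// 2-bit code c repeated across all 32 bit-pairs, so dw ^ c_table[c] turns exactly
// the bit-pairs equal to c into 11; ANDing each pair's high bit into its low bit
// leaves one set bit per match, and the population count of that word is the
// number of occurrences. The sketch below is a self-contained illustration of
// the same idea (kept out of the build with #if 0; the helper name and sample
// value are illustrative only, assuming the usual A=0, C=1, G=2, T=3 encoding).
#if 0
static int countInU64Sketch(int c, uint64_t dw) {
    uint64_t x0   = dw ^ c_table[c];                          // pairs equal to c -> 11
    uint64_t hits = x0 & ((x0 >> 1) & 0x5555555555555555llu); // one bit per matching pair
    // generic population count of 'hits', as in USE_POPCNT_GENERIC above
    hits = hits - ((hits >> 1) & 0x5555555555555555llu);
    hits = (hits & 0x3333333333333333llu) + ((hits >> 2) & 0x3333333333333333llu);
    hits = (hits + (hits >> 4)) & 0x0F0F0F0F0F0F0F0Fllu;
    return (int)((hits * 0x0101010101010101llu) >> 56);
}
// e.g. countInU64Sketch(2, 0xA6) == 3: 0xA6 = 0b10100110 packs (from the low
// pair) G, C, G, G, and the 28 remaining zero pairs decode as A, so code 2 (G)
// occurs three times.
#endif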
|
||
|
|
||
|
#ifdef POPCNT_CAPABILITY // wrapping of "struct"
|
||
|
struct USE_POPCNT_GENERIC_BITS {
|
||
|
// Use this standard bit-bashing population count
|
||
|
inline static uint64_t pop64(uint64_t x) {
|
||
|
#else
|
||
|
// Use this standard bit-bashing population count
|
||
|
inline static uint64_t pop6464(uint64_t x) {
|
||
|
#endif
|
||
|
x -= (x >> 1) & 0x5555555555555555ULL;
|
||
|
x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
|
||
|
x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL;
|
||
|
return int((x * 0x0101010101010101ULL) >> 56);
|
||
|
}
|
||
|
#ifdef POPCNT_CAPABILITY // wrapping a "struct"
|
||
|
};
|
||
|
#endif
|
||
|
|
||
|
/**
|
||
|
* Bit-bashing population count of the set bits within a 64-bit
* argument (unlike countInU64, no particular two-bit value is singled out).
|
||
|
*/
|
||
|
#ifdef POPCNT_CAPABILITY
|
||
|
template<typename Operation>
|
||
|
#endif
|
||
|
inline static int countInU64_bits(uint64_t dw) {
|
||
|
#ifdef POPCNT_CAPABILITY
|
||
|
uint64_t tmp = Operation().pop64(dw);
|
||
|
#else
|
||
|
uint64_t tmp = pop6464(dw);
|
||
|
#endif
|
||
|
return (int) tmp;
|
||
|
}
|
||
|
|
||
|
// Forward declarations for Ebwt class
|
||
|
class GFMSearchParams;
|
||
|
|
||
|
/**
|
||
|
* Extended Burrows-Wheeler transform data.
|
||
|
*
|
||
|
* An Ebwt may be transferred to and from RAM with calls to
|
||
|
* evictFromMemory() and loadIntoMemory(). By default, a newly-created
|
||
|
* Ebwt is not loaded into memory; if the user would like to use a
|
||
|
* newly-created Ebwt to answer queries, they must first call
|
||
|
* loadIntoMemory().
|
||
|
*/
|
||
|
template <class index_t = uint32_t>
|
||
|
class GFM {
|
||
|
public:
|
||
|
#define GFM_INITS \
|
||
|
_toBigEndian(currentlyBigEndian()), \
|
||
|
_overrideOffRate(overrideOffRate), \
|
||
|
_verbose(verbose), \
|
||
|
_passMemExc(passMemExc), \
|
||
|
_sanity(sanityCheck), \
|
||
|
fw_(fw), \
|
||
|
_in1(NULL), \
|
||
|
_in2(NULL), \
|
||
|
_nPat(0), \
|
||
|
_nFrag(0), \
|
||
|
_plen(EBWT_CAT), \
|
||
|
_rstarts(EBWT_CAT), \
|
||
|
_fchr(EBWT_CAT), \
|
||
|
_ftab(EBWT_CAT), \
|
||
|
_eftab(EBWT_CAT), \
|
||
|
_offs(EBWT_CAT), \
|
||
|
_gfm(EBWT_CAT), \
|
||
|
_useMm(false), \
|
||
|
useShmem_(false), \
|
||
|
_refnames(EBWT_CAT), \
|
||
|
mmFile1_(NULL), \
|
||
|
mmFile2_(NULL), \
|
||
|
_nthreads(1)
|
||
|
|
||
|
GFM() {}
|
||
|
/// Construct a GFM from the given input file
|
||
|
GFM(const string& in,
|
||
|
ALTDB<index_t>* altdb,
|
||
|
RepeatDB<index_t>* repeatdb,
|
||
|
EList<size_t>* readLens,
|
||
|
int needEntireReverse,
|
||
|
bool fw,
|
||
|
int32_t overrideOffRate, // = -1,
|
||
|
int32_t offRatePlus, // = -1,
|
||
|
bool useMm, // = false,
|
||
|
bool useShmem, // = false,
|
||
|
bool mmSweep, // = false,
|
||
|
bool loadNames, // = false,
|
||
|
bool loadSASamp, // = true,
|
||
|
bool loadFtab, // = true,
|
||
|
bool loadRstarts, // = true,
|
||
|
bool loadSpliceSites, // = true,
|
||
|
bool verbose, // = false,
|
||
|
bool startVerbose, // = false,
|
||
|
bool passMemExc, // = false,
|
||
|
bool sanityCheck, // = false)
|
||
|
bool useHaplotype, // = false
|
||
|
bool skipLoading = false) :
|
||
|
GFM_INITS
|
||
|
{
|
||
|
assert(!useMm || !useShmem);
|
||
|
|
||
|
#ifdef POPCNT_CAPABILITY
|
||
|
ProcessorSupport ps;
|
||
|
_usePOPCNTinstruction = ps.POPCNTenabled();
|
||
|
#endif
|
||
|
|
||
|
packed_ = false;
|
||
|
_useMm = useMm;
|
||
|
useShmem_ = useShmem;
|
||
|
_in1Str = in + ".1." + gfm_ext;
|
||
|
_in2Str = in + ".2." + gfm_ext;
|
||
|
|
||
|
if(skipLoading) return;
|
||
|
|
||
|
if(repeatdb == NULL) {
|
||
|
readIntoMemory(
|
||
|
fw ? -1 : needEntireReverse, // need REF_READ_REVERSE
|
||
|
loadSASamp, // load the SA sample portion?
|
||
|
loadFtab, // load the ftab & eftab?
|
||
|
loadRstarts, // load the rstarts array?
|
||
|
true, // stop after loading the header portion?
|
||
|
&_gh, // params
|
||
|
mmSweep, // mmSweep
|
||
|
loadNames, // loadNames
|
||
|
startVerbose); // startVerbose
|
||
|
// If the offRate has been overridden, reflect that in the
|
||
|
// _eh._offRate field
|
||
|
if(offRatePlus > 0 && _overrideOffRate == -1) {
|
||
|
_overrideOffRate = _gh._offRate + offRatePlus;
|
||
|
}
|
||
|
if(_overrideOffRate > _gh._offRate) {
|
||
|
_gh.setOffRate(_overrideOffRate);
|
||
|
assert_eq(_overrideOffRate, _gh._offRate);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Read ALTs
|
||
|
EList<ALT<index_t> >& alts = altdb->alts();
|
||
|
EList<Haplotype<index_t> >& haplotypes = altdb->haplotypes();
|
||
|
EList<string>& altnames = altdb->altnames();
|
||
|
alts.clear(); altnames.clear();
|
||
|
string in7Str = in + ".7." + gfm_ext;
|
||
|
string in8Str = in + ".8." + gfm_ext;
|
||
|
|
||
|
// open alts
|
||
|
if(verbose || startVerbose) cerr << "Opening \"" << in7Str.c_str() << "\"" << endl;
|
||
|
ifstream in7(in7Str.c_str(), ios::binary);
|
||
|
if(!in7.good()) {
|
||
|
cerr << "Could not open index file " << in7Str.c_str() << endl;
|
||
|
}
|
||
|
|
||
|
EList<index_t> to_alti;
|
||
|
index_t to_alti_far = 0;
|
||
|
readI32(in7, this->toBe());
|
||
|
index_t numAlts = readIndex<index_t>(in7, this->toBe());
|
||
|
|
||
|
|
||
|
// open altnames
|
||
|
if(verbose || startVerbose) cerr << "Opening \"" << in8Str.c_str() << "\"" << endl;
|
||
|
ifstream in8(in8Str.c_str(), ios::binary);
|
||
|
if(!in8.good()) {
|
||
|
cerr << "Could not open index file " << in8Str.c_str() << endl;
|
||
|
}
|
||
|
|
||
|
readI32(in8, this->toBe());
|
||
|
index_t numAltnames = readIndex<index_t>(in8, this->toBe());
|
||
|
|
||
|
assert_eq(numAlts, numAltnames);
|
||
|
|
||
|
if(numAlts > 0) {
|
||
|
alts.resizeExact(numAlts); alts.clear();
|
||
|
to_alti.resizeExact(numAlts); to_alti.clear();
|
||
|
while(!in7.eof() && !in8.eof()) {
|
||
|
alts.expand();
|
||
|
alts.back().read(in7, this->toBe());
|
||
|
to_alti.push_back(to_alti_far);
|
||
|
to_alti_far++;
|
||
|
|
||
|
altnames.expand();
|
||
|
in8 >> altnames.back();
|
||
|
|
||
|
if(!loadSpliceSites) {
|
||
|
if(alts.back().splicesite()) {
|
||
|
alts.pop_back();
|
||
|
assert_gt(numAlts, 0);
|
||
|
altnames.pop_back();
|
||
|
assert_gt(numAltnames, 0);
|
||
|
numAlts--;
|
||
|
numAltnames--;
|
||
|
to_alti.back() = std::numeric_limits<index_t>::max();
|
||
|
to_alti_far--;
|
||
|
}
|
||
|
}
|
||
|
if(alts.size() == numAlts) break;
|
||
|
}
|
||
|
}
|
||
|
assert_eq(alts.size(), numAlts);
|
||
|
assert_eq(to_alti_far, numAlts);
|
||
|
assert_eq(alts.size(), altnames.size());
|
||
|
// Check whether we have hit the end of the file; this check is needed for backward compatibility
|
||
|
if(in7.peek() != std::ifstream::traits_type::eof()) {
|
||
|
index_t numHaplotypes = readIndex<index_t>(in7, this->toBe());
|
||
|
if(numHaplotypes > 0) {
|
||
|
haplotypes.resizeExact(numHaplotypes);
|
||
|
haplotypes.clear();
|
||
|
while(!in7.eof()) {
|
||
|
haplotypes.expand();
|
||
|
haplotypes.back().read(in7, this->toBe());
|
||
|
Haplotype<index_t>& ht = haplotypes.back();
|
||
|
for(index_t h = 0; h < ht.alts.size(); h++) {
|
||
|
ht.alts[h] = to_alti[ht.alts[h]];
|
||
|
}
|
||
|
if(haplotypes.size() == numHaplotypes) break;
|
||
|
}
|
||
|
}
|
||
|
if(!useHaplotype) {
|
||
|
haplotypes.nullify();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Read repeats
|
||
|
_repeat = false;
|
||
|
if(repeatdb != NULL) {
|
||
|
_repeat = true;
|
||
|
|
||
|
// Number of repeat groups in the index
|
||
|
index_t numRepeatIndex = readIndex<index_t>(in7, this->toBe());
|
||
|
assert_gt(numRepeatIndex, 0);
|
||
|
EList<pair<index_t, index_t> > repeatLens; repeatLens.resizeExact(numRepeatIndex);
|
||
|
|
||
|
for(size_t k = 0; k < numRepeatIndex; k++) {
|
||
|
repeatLens[k].first = readIndex<index_t>(in7, this->toBe());
|
||
|
repeatLens[k].second = readIndex<index_t>(in7, this->toBe());
|
||
|
}
|
||
|
|
||
|
if (readLens != NULL && !readLens->empty()) {
|
||
|
// Load subset of repeat groups.
|
||
|
size_t k = 0;
|
||
|
size_t k2 = 0;
|
||
|
|
||
|
_repeatIncluded.resizeExact(numRepeatIndex);
|
||
|
_repeatIncluded.fillZero();
|
||
|
|
||
|
while(k < numRepeatIndex && k2 < readLens->size()) {
|
||
|
if (repeatLens[k].first >= (*readLens)[k2]) {
|
||
|
_repeatIncluded[k] = true;
|
||
|
k2++;
|
||
|
} else {
|
||
|
k++;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// at least last repeat group is included
|
||
|
_repeatIncluded[numRepeatIndex - 1] = true;
|
||
|
|
||
|
_repeatLens.clear();
|
||
|
for(size_t i = 0; i < numRepeatIndex; i++) {
|
||
|
if (_repeatIncluded[i]) {
|
||
|
_repeatLens.push_back(repeatLens[i]);
|
||
|
}
|
||
|
}
|
||
|
} else {
|
||
|
// Load all repeat groups
|
||
|
_repeatLens = repeatLens;
|
||
|
_repeatIncluded.resizeExact(numRepeatIndex);
|
||
|
_repeatIncluded.fill(true);
|
||
|
}
|
||
|
|
||
|
repeatdb->read(in7, this->toBe(), _repeatIncluded);
|
||
|
index_t numKmertables = readIndex<index_t>(in7, this->toBe());
|
||
|
EList<streampos> filePos; filePos.resizeExact(numKmertables);
|
||
|
for(size_t k = 0; k < numKmertables; k++) {
|
||
|
filePos[k] = readIndex<uint64_t>(in7, this->toBe());
|
||
|
}
|
||
|
for(size_t k = 0; k < numKmertables; k++) {
|
||
|
if(!_repeatIncluded[k])
|
||
|
continue;
|
||
|
if(k > 0) {
|
||
|
in7.seekg(filePos[k-1]);
|
||
|
}
|
||
|
_repeat_kmertables.expand();
|
||
|
_repeat_kmertables.back().read(in7, this->toBe());
|
||
|
}
|
||
|
in7.seekg(filePos.back());
|
||
|
}
|
||
|
|
||
|
in7.close();
|
||
|
in8.close();
|
||
|
|
||
|
// Sort SNPs and Splice Sites based on positions
|
||
|
index_t nalts = (index_t)alts.size();
|
||
|
for(index_t s = 0; s < nalts; s++) {
|
||
|
ALT<index_t> alt = alts[s];
|
||
|
if(alt.snp()) altdb->setSNPs(true);
|
||
|
if(alt.exon()) altdb->setExons(true);
|
||
|
if(alt.splicesite()) {
|
||
|
altdb->setSpliceSites(true);
|
||
|
alts.push_back(alt);
|
||
|
alts.back().left = alt.right;
|
||
|
alts.back().right = alt.left;
|
||
|
altnames.push_back("ssr");
|
||
|
} else if(alt.deletion()) {
|
||
|
alts.push_back(alt);
|
||
|
alts.back().pos = alt.pos + alt.len - 1;
|
||
|
alts.back().reversed = true;
|
||
|
string altname = altnames[s];
|
||
|
altnames.push_back(altname);
|
||
|
}
|
||
|
}
|
||
|
if(alts.size() > 1 && alts.size() > nalts) {
|
||
|
assert_eq(alts.size(), altnames.size());
|
||
|
EList<pair<ALT<index_t>, index_t> > buf; buf.resize(alts.size());
|
||
|
EList<string> buf2; buf2.resize(alts.size());
|
||
|
for(size_t i = 0; i < alts.size(); i++) {
|
||
|
buf[i].first = alts[i];
|
||
|
buf[i].second = (index_t)i;
|
||
|
buf2[i] = altnames[i];
|
||
|
}
|
||
|
buf.sort();
|
||
|
for(size_t i = 0; i < alts.size(); i++) {
|
||
|
alts[i] = buf[i].first;
|
||
|
altnames[i] = buf2[buf[i].second];
|
||
|
if(buf[i].second < numAlts) {
|
||
|
to_alti[buf[i].second] = i;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if(useHaplotype) {
|
||
|
EList<index_t>& haplotype_maxrights = altdb->haplotype_maxrights();
|
||
|
haplotype_maxrights.resizeExact(haplotypes.size());
|
||
|
for(index_t h = 0; h < haplotypes.size(); h++) {
|
||
|
Haplotype<index_t>& ht = haplotypes[h];
|
||
|
for(index_t h2 = 0; h2 < ht.alts.size(); h2++) {
|
||
|
ht.alts[h2] = to_alti[ht.alts[h2]];
|
||
|
}
|
||
|
if(h == 0) {
|
||
|
haplotype_maxrights[h] = ht.right;
|
||
|
} else {
|
||
|
haplotype_maxrights[h] = std::max<index_t>(haplotype_maxrights[h - 1], ht.right);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
assert(repeatdb != NULL || repOk());
|
||
|
}
|
||
|
|
||
|
/// Construct an Ebwt from the given header parameters and string
|
||
|
/// vector, optionally using a blockwise suffix sorter with the
|
||
|
/// given 'bmax' and 'dcv' parameters. The string vector is
|
||
|
/// ultimately joined and the joined string is passed to buildToDisk().
|
||
|
GFM(
|
||
|
bool packed,
|
||
|
int needEntireReverse,
|
||
|
int32_t lineRate,
|
||
|
int32_t offRate,
|
||
|
int32_t ftabChars,
|
||
|
const string& file, // base filename for GFM files
|
||
|
bool fw,
|
||
|
int dcv,
|
||
|
EList<RefRecord>& szs,
|
||
|
index_t sztot,
|
||
|
const RefReadInParams& refparams,
|
||
|
uint32_t seed,
|
||
|
int32_t overrideOffRate = -1,
|
||
|
bool verbose = false,
|
||
|
bool passMemExc = false,
|
||
|
bool sanityCheck = false) :
|
||
|
GFM_INITS,
|
||
|
_gh(
|
||
|
joinedLen(szs),
|
||
|
0,
|
||
|
0,
|
||
|
lineRate,
|
||
|
offRate,
|
||
|
ftabChars,
|
||
|
0,
|
||
|
refparams.reverse == REF_READ_REVERSE)
|
||
|
{
|
||
|
#ifdef POPCNT_CAPABILITY
|
||
|
ProcessorSupport ps;
|
||
|
_usePOPCNTinstruction = ps.POPCNTenabled();
|
||
|
#endif
|
||
|
packed_ = packed;
|
||
|
}
|
||
|
|
||
|
/// Construct an Ebwt from the given header parameters and string
|
||
|
/// vector, optionally using a blockwise suffix sorter with the
|
||
|
/// given 'bmax' and 'dcv' parameters. The string vector is
|
||
|
/// ultimately joined and the joined string is passed to buildToDisk().
|
||
|
template<typename TStr>
|
||
|
GFM(
|
||
|
TStr& s,
|
||
|
bool packed,
|
||
|
int needEntireReverse,
|
||
|
int32_t lineRate,
|
||
|
int32_t offRate,
|
||
|
int32_t ftabChars,
|
||
|
int nthreads,
|
||
|
const string& snpfile,
|
||
|
const string& htfile,
|
||
|
const string& ssfile,
|
||
|
const string& exonfile,
|
||
|
const string& svfile,
|
||
|
const string& repeatfile,
|
||
|
const string& outfile, // base filename for GFM files
|
||
|
bool fw,
|
||
|
bool useBlockwise,
|
||
|
index_t bmax,
|
||
|
index_t bmaxSqrtMult,
|
||
|
index_t bmaxDivN,
|
||
|
int dcv,
|
||
|
EList<FileBuf*>& is,
|
||
|
EList<RefRecord>& szs,
|
||
|
index_t sztot,
|
||
|
const RefReadInParams& refparams,
|
||
|
EList<RefRecord>* parent_szs,
|
||
|
EList<string>* parent_refnames,
|
||
|
uint32_t seed,
|
||
|
int32_t overrideOffRate = -1,
|
||
|
bool verbose = false,
|
||
|
bool passMemExc = false,
|
||
|
bool sanityCheck = false) :
|
||
|
GFM_INITS,
|
||
|
_gh(
|
||
|
joinedLen(szs),
|
||
|
0,
|
||
|
0,
|
||
|
lineRate,
|
||
|
offRate,
|
||
|
ftabChars,
|
||
|
0,
|
||
|
refparams.reverse == REF_READ_REVERSE)
|
||
|
{
|
||
|
assert_gt(nthreads, 0);
|
||
|
_nthreads = nthreads;
|
||
|
#ifdef POPCNT_CAPABILITY
|
||
|
ProcessorSupport ps;
|
||
|
_usePOPCNTinstruction = ps.POPCNTenabled();
|
||
|
#endif
|
||
|
_in1Str = outfile + ".1." + gfm_ext;
|
||
|
_in2Str = outfile + ".2." + gfm_ext;
|
||
|
packed_ = packed;
|
||
|
// Open output files
|
||
|
ofstream fout1(_in1Str.c_str(), ios::binary);
|
||
|
if(!fout1.good()) {
|
||
|
cerr << "Could not open index file for writing: \"" << _in1Str.c_str() << "\"" << endl
|
||
|
<< "Please make sure the directory exists and that permissions allow writing by" << endl
|
||
|
<< "HISAT2." << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
ofstream fout2(_in2Str.c_str(), ios::binary);
|
||
|
if(!fout2.good()) {
|
||
|
cerr << "Could not open index file for writing: \"" << _in2Str.c_str() << "\"" << endl
|
||
|
<< "Please make sure the directory exists and that permissions allow writing by" << endl
|
||
|
<< "HISAT2." << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
|
||
|
// Build
|
||
|
initFromVector<TStr>(
|
||
|
s,
|
||
|
snpfile,
|
||
|
htfile,
|
||
|
ssfile,
|
||
|
exonfile,
|
||
|
svfile,
|
||
|
repeatfile,
|
||
|
is,
|
||
|
szs,
|
||
|
sztot,
|
||
|
refparams,
|
||
|
fout1,
|
||
|
fout2,
|
||
|
outfile,
|
||
|
useBlockwise,
|
||
|
bmax,
|
||
|
bmaxSqrtMult,
|
||
|
bmaxDivN,
|
||
|
dcv,
|
||
|
parent_szs,
|
||
|
parent_refnames,
|
||
|
seed,
|
||
|
verbose);
|
||
|
// Close output files
|
||
|
fout1.flush();
|
||
|
int64_t tellpSz1 = (int64_t)fout1.tellp();
|
||
|
VMSG_NL("Wrote " << fout1.tellp() << " bytes to primary GFM file: " << _in1Str.c_str());
|
||
|
fout1.close();
|
||
|
bool err = false;
|
||
|
if(tellpSz1 > fileSize(_in1Str.c_str())) {
|
||
|
err = true;
|
||
|
cerr << "Index is corrupt: File size for " << _in1Str.c_str() << " should have been " << tellpSz1
|
||
|
<< " but is actually " << fileSize(_in1Str.c_str()) << "." << endl;
|
||
|
}
|
||
|
fout2.flush();
|
||
|
int64_t tellpSz2 = (int64_t)fout2.tellp();
|
||
|
VMSG_NL("Wrote " << fout2.tellp() << " bytes to secondary GFM file: " << _in2Str.c_str());
|
||
|
fout2.close();
|
||
|
if(tellpSz2 > fileSize(_in2Str.c_str())) {
|
||
|
err = true;
|
||
|
cerr << "Index is corrupt: File size for " << _in2Str.c_str() << " should have been " << tellpSz2
|
||
|
<< " but is actually " << fileSize(_in2Str.c_str()) << "." << endl;
|
||
|
}
|
||
|
if(err) {
|
||
|
cerr << "Please check if there is a problem with the disk or if disk is full." << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
// Reopen as input streams
|
||
|
VMSG_NL("Re-opening _in1 and _in2 as input streams");
|
||
|
if(_sanity) {
|
||
|
VMSG_NL("Sanity-checking Bt2");
|
||
|
assert(!isInMemory());
|
||
|
readIntoMemory(
|
||
|
fw ? -1 : needEntireReverse, // 1 -> need the reverse to be reverse-of-concat
|
||
|
true, // load SA sample (_offs[])?
|
||
|
true, // load ftab (_ftab[] & _eftab[])?
|
||
|
true, // load r-starts (_rstarts[])?
|
||
|
false, // just load header?
|
||
|
NULL, // Params object to fill
|
||
|
false, // mm sweep?
|
||
|
true, // load names?
|
||
|
false); // verbose startup?
|
||
|
// sanityCheckAll(refparams.reverse);
|
||
|
evictFromMemory();
|
||
|
assert(!isInMemory());
|
||
|
}
|
||
|
VMSG_NL("Returning from GFM constructor");
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Static constructor for a pair of forward/reverse indexes for the
|
||
|
* given reference string.
|
||
|
*/
|
||
|
template<typename TStr>
|
||
|
static pair<GFM*, GFM*>
|
||
|
fromString(
|
||
|
const char* str,
|
||
|
bool packed,
|
||
|
int reverse,
|
||
|
bool bigEndian,
|
||
|
int32_t lineRate,
|
||
|
int32_t offRate,
|
||
|
int32_t ftabChars,
|
||
|
const string& file,
|
||
|
bool useBlockwise,
|
||
|
index_t bmax,
|
||
|
index_t bmaxSqrtMult,
|
||
|
index_t bmaxDivN,
|
||
|
int dcv,
|
||
|
uint32_t seed,
|
||
|
bool verbose,
|
||
|
bool autoMem,
|
||
|
bool sanity)
|
||
|
{
|
||
|
EList<std::string> strs(EBWT_CAT);
|
||
|
strs.push_back(std::string(str));
|
||
|
return fromStrings<TStr>(
|
||
|
strs,
|
||
|
packed,
|
||
|
reverse,
|
||
|
bigEndian,
|
||
|
lineRate,
|
||
|
offRate,
|
||
|
ftabChars,
|
||
|
file,
|
||
|
useBlockwise,
|
||
|
bmax,
|
||
|
bmaxSqrtMult,
|
||
|
bmaxDivN,
|
||
|
dcv,
|
||
|
seed,
|
||
|
verbose,
|
||
|
autoMem,
|
||
|
sanity);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Static constructor for a pair of forward/reverse indexes for the
|
||
|
* given list of reference strings.
|
||
|
*/
|
||
|
template<typename TStr>
|
||
|
static pair<GFM*, GFM*>
|
||
|
fromStrings(
|
||
|
const EList<std::string>& strs,
|
||
|
bool packed,
|
||
|
int reverse,
|
||
|
bool bigEndian,
|
||
|
int32_t lineRate,
|
||
|
int32_t offRate,
|
||
|
int32_t ftabChars,
|
||
|
const string& file,
|
||
|
bool useBlockwise,
|
||
|
index_t bmax,
|
||
|
index_t bmaxSqrtMult,
|
||
|
index_t bmaxDivN,
|
||
|
int dcv,
|
||
|
uint32_t seed,
|
||
|
bool verbose,
|
||
|
bool autoMem,
|
||
|
bool sanity)
|
||
|
{
|
||
|
assert(!strs.empty());
|
||
|
EList<FileBuf*> is(EBWT_CAT);
|
||
|
RefReadInParams refparams(false /* color */, REF_READ_FORWARD, false, false);
|
||
|
// Adapt sequence strings to stringstreams open for input
|
||
|
auto_ptr<stringstream> ss(new stringstream());
|
||
|
for(index_t i = 0; i < strs.size(); i++) {
|
||
|
(*ss) << ">" << i << endl << strs[i] << endl;
|
||
|
}
|
||
|
auto_ptr<FileBuf> fb(new FileBuf(ss.get()));
|
||
|
assert(!fb->eof());
|
||
|
assert(fb->get() == '>');
|
||
|
ASSERT_ONLY(fb->reset());
|
||
|
assert(!fb->eof());
|
||
|
is.push_back(fb.get());
|
||
|
// Vector for the ordered list of "records" comprising the input
|
||
|
// sequences. A record represents a stretch of unambiguous
|
||
|
// characters in one of the input sequences.
|
||
|
EList<RefRecord> szs(EBWT_CAT);
|
||
|
std::pair<index_t, index_t> sztot;
|
||
|
sztot = BitPairReference::szsFromFasta(is,
|
||
|
file,
|
||
|
bigEndian,
|
||
|
refparams,
|
||
|
szs,
|
||
|
sanity);
|
||
|
// Construct Ebwt from input strings and parameters
|
||
|
GFM<index_t> *gfmFw = new GFM<index_t>(
|
||
|
TStr(),
|
||
|
packed,
|
||
|
-1, // fw
|
||
|
lineRate,
|
||
|
offRate, // suffix-array sampling rate
|
||
|
ftabChars, // number of chars in initial arrow-pair calc
|
||
|
file, // basename for .?.ebwt files
|
||
|
true, // fw?
|
||
|
useBlockwise, // useBlockwise
|
||
|
bmax, // block size for blockwise SA builder
|
||
|
bmaxSqrtMult, // block size as multiplier of sqrt(len)
|
||
|
bmaxDivN, // block size as divisor of len
|
||
|
dcv, // difference-cover period
|
||
|
is, // list of input streams
|
||
|
szs, // list of reference sizes
|
||
|
sztot.first, // total size of all unambiguous ref chars
|
||
|
refparams, // reference read-in parameters
|
||
|
seed, // pseudo-random number generator seed
|
||
|
-1, // override offRate
|
||
|
verbose, // be talkative
|
||
|
autoMem, // pass exceptions up to the toplevel so that we can adjust memory settings automatically
|
||
|
sanity); // verify results and internal consistency
|
||
|
refparams.reverse = reverse;
|
||
|
szs.clear();
|
||
|
sztot = BitPairReference::szsFromFasta(is,
|
||
|
file,
|
||
|
bigEndian,
|
||
|
refparams,
|
||
|
szs,
|
||
|
sanity);
|
||
|
// Construct Ebwt from input strings and parameters
|
||
|
GFM<index_t> *gfmBw = new GFM<index_t>(
|
||
|
TStr(),
|
||
|
packed,
|
||
|
reverse == REF_READ_REVERSE,
|
||
|
lineRate,
|
||
|
offRate, // suffix-array sampling rate
|
||
|
ftabChars, // number of chars in initial arrow-pair calc
|
||
|
file + ".rev",// basename for .?.ebwt files
|
||
|
false, // fw?
|
||
|
useBlockwise, // useBlockwise
|
||
|
bmax, // block size for blockwise SA builder
|
||
|
bmaxSqrtMult, // block size as multiplier of sqrt(len)
|
||
|
bmaxDivN, // block size as divisor of len
|
||
|
dcv, // difference-cover period
|
||
|
is, // list of input streams
|
||
|
szs, // list of reference sizes
|
||
|
sztot.first, // total size of all unambiguous ref chars
|
||
|
refparams, // reference read-in parameters
|
||
|
seed, // pseudo-random number generator seed
|
||
|
-1, // override offRate
|
||
|
verbose, // be talkative
|
||
|
autoMem, // pass exceptions up to the toplevel so that we can adjust memory settings automatically
|
||
|
sanity); // verify results and internal consistency
|
||
|
return make_pair(gfmFw, gfmBw);
|
||
|
}
|
||
|
|
||
|
/// Return true iff the Ebwt is packed
|
||
|
bool isPacked() { return packed_; }
|
||
|
|
||
|
/**
|
||
|
* Write the rstarts array given the szs array for the reference.
|
||
|
*/
|
||
|
void szsToDisk(const EList<RefRecord>& szs, ostream& os, int reverse);
|
||
|
|
||
|
bool checkPosToSzs(const EList<RefRecord>& szs, index_t start_idx, index_t pos)
|
||
|
{
|
||
|
assert(szs[start_idx].first);
|
||
|
for(index_t i = start_idx; i < szs.size(); i++) {
|
||
|
if((i != start_idx) && (szs[i].first)) {
|
||
|
// reached the next chromosome; pos is past the end of this one
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
if(pos < szs[i].off) {
|
||
|
return false;
|
||
|
} else {
|
||
|
pos -= szs[i].off;
|
||
|
if(pos < szs[i].len) {
|
||
|
return true;
|
||
|
}
|
||
|
pos -= szs[i].len;
|
||
|
}
|
||
|
}
|
||
|
assert(false);
|
||
|
return false;
|
||
|
}
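
// Worked example (record values illustrative only): if the records for a
// chromosome are {first=true, off=5, len=100} then {first=false, off=10, len=50}
// (5 leading Ns, 100 bases, 10 Ns, then 50 bases), pos=3 falls in the leading Ns
// (returns false), pos=104 falls in the first unambiguous stretch (true), and
// pos=108 falls in the internal N gap (false).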
|
||
|
|
||
|
/**
|
||
|
* Helper for the constructors above. Takes a vector of text
|
||
|
* strings and joins them into a single string with a call to
|
||
|
* joinToDisk, which does a join (with padding) and writes some of
|
||
|
* the resulting data directly to disk rather than keep it in
|
||
|
* memory. It then constructs a suffix-array producer (what kind
|
||
|
* depends on 'useBlockwise') for the resulting sequence. The
|
||
|
* suffix-array producer can then be used to obtain chunks of the
|
||
|
* joined string's suffix array.
|
||
|
*/
|
||
|
template <typename TStr>
|
||
|
void initFromVector(TStr& s,
|
||
|
const string& snpfile,
|
||
|
const string& htfile,
|
||
|
const string& ssfile,
|
||
|
const string& exonfile,
|
||
|
const string& svfile,
|
||
|
const string& repeatfile,
|
||
|
EList<FileBuf*>& is,
|
||
|
EList<RefRecord>& szs,
|
||
|
index_t sztot,
|
||
|
const RefReadInParams& refparams,
|
||
|
ofstream& out1,
|
||
|
ofstream& out2,
|
||
|
const string& outfile,
|
||
|
bool useBlockwise,
|
||
|
index_t bmax,
|
||
|
index_t bmaxSqrtMult,
|
||
|
index_t bmaxDivN,
|
||
|
int dcv,
|
||
|
EList<RefRecord>* parent_szs,
|
||
|
EList<string>* parent_refnames,
|
||
|
uint32_t seed,
|
||
|
bool verbose)
|
||
|
{
|
||
|
// Compose text strings into single string
|
||
|
VMSG_NL("Calculating joined length");
|
||
|
index_t jlen;
|
||
|
jlen = joinedLen(szs);
|
||
|
_repeat = (parent_szs != NULL);
|
||
|
assert_geq(jlen, sztot);
|
||
|
VMSG_NL("Writing header");
|
||
|
writeFromMemory(true, out1, out2);
|
||
|
try {
|
||
|
VMSG_NL("Reserving space for joined string");
|
||
|
s.resize(jlen);
|
||
|
VMSG_NL("Joining reference sequences");
|
||
|
if(refparams.reverse == REF_READ_REVERSE) {
|
||
|
{
|
||
|
Timer timer(cerr, " Time to join reference sequences: ", _verbose);
|
||
|
joinToDisk(is, szs, sztot, refparams, s, out1, out2);
|
||
|
} {
|
||
|
Timer timer(cerr, " Time to reverse reference sequence: ", _verbose);
|
||
|
EList<RefRecord> tmp(EBWT_CAT);
|
||
|
s.reverse();
|
||
|
reverseRefRecords(szs, tmp, false, verbose);
|
||
|
szsToDisk(tmp, out1, refparams.reverse);
|
||
|
}
|
||
|
} else {
|
||
|
Timer timer(cerr, " Time to join reference sequences: ", _verbose);
|
||
|
joinToDisk(is, szs, sztot, refparams, s, out1, out2);
|
||
|
szsToDisk(szs, out1, refparams.reverse);
|
||
|
}
|
||
|
|
||
|
{
|
||
|
Timer timer(cerr, " Time to read SNPs and splice sites: ", _verbose);
|
||
|
_alts.clear();
|
||
|
_altnames.clear();
|
||
|
EList<pair<index_t, index_t> > chr_szs;
|
||
|
index_t tmp_len = 0;
|
||
|
for(index_t i = 0; i < szs.size(); i++) {
|
||
|
if(szs[i].first) {
|
||
|
chr_szs.expand();
|
||
|
chr_szs.back().first = tmp_len;
|
||
|
chr_szs.back().second = i;
|
||
|
}
|
||
|
tmp_len += (index_t)szs[i].len;
|
||
|
}
|
||
|
|
||
|
// Write SNPs into 7.ht2 and 8.ht2
|
||
|
string file7 = outfile + ".7." + gfm_ext;
|
||
|
string file8 = outfile + ".8." + gfm_ext;
|
||
|
|
||
|
// Open output stream for the '.7.gfm_ext' file which will
|
||
|
// hold SNPs (except IDs).
|
||
|
ofstream fout7(file7.c_str(), ios::binary);
|
||
|
if(!fout7.good()) {
|
||
|
cerr << "Could not open index file for writing: \"" << file7.c_str() << "\"" << endl
|
||
|
<< "Please make sure the directory exists and that permissions allow writing by" << endl
|
||
|
<< "HISAT2." << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
// Open output stream for the '.8.gfm_ext' file which will
|
||
|
// hold SNP IDs.
|
||
|
ofstream fout8(file8.c_str(), ios::binary);
|
||
|
if(!fout8.good()) {
|
||
|
cerr << "Could not open index file for writing: \"" << file8.c_str() << "\"" << endl
|
||
|
<< "Please make sure the directory exists and that permissions allow writing by" << endl
|
||
|
<< "HISAT2." << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
writeIndex<int32_t>(fout7, 1, this->toBe()); // endianness sentinel
|
||
|
writeIndex<int32_t>(fout8, 1, this->toBe()); // endianness sentinel
|
||
|
|
||
|
for(index_t i = 0; i < _refnames.size(); i++) {
|
||
|
_refnames_nospace.push_back("");
|
||
|
for(index_t j = 0; j < _refnames[i].size(); j++) {
|
||
|
char c = _refnames[i][j];
|
||
|
if(c == ' ') break;
|
||
|
_refnames_nospace.back().push_back(c);
|
||
|
}
|
||
|
}
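// For example, a FASTA name line "22 dna:chromosome chr22" (illustrative) is
// stored here as just "22": only the text before the first space is kept.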
|
||
|
|
||
|
map<string, index_t> snpID2num;
|
||
|
if(snpfile != "") {
|
||
|
ifstream snp_file(snpfile.c_str(), ios::in);
|
||
|
if(!snp_file.is_open()) {
|
||
|
cerr << "Error: could not open " << snpfile.c_str() << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
|
||
|
while(!snp_file.eof()) {
|
||
|
// rs73387790 single 22:20000001-21000000 145 A
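// The three record types share the layout <snpID> <type> <chr> <pos> <detail>,
// where <detail> is the alternative base for "single", the deleted length for
// "deletion", and the inserted sequence for "insertion". Hypothetical examples:
//   del1 deletion  22 18000 3
//   ins1 insertion 22 18000 ACG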
|
||
|
string snp_id;
|
||
|
snp_file >> snp_id;
|
||
|
if(snp_id.empty() || snp_id[0] == '#') {
|
||
|
string line;
|
||
|
getline(snp_file, line);
|
||
|
continue;
|
||
|
}
|
||
|
string type, chr;
|
||
|
index_t genome_pos;
|
||
|
char snp_ch = '\0';
|
||
|
string ins_seq;
|
||
|
index_t del_len = 0;
|
||
|
snp_file >> type >> chr >> genome_pos;
|
||
|
if(type == "single") {
|
||
|
snp_file >> snp_ch;
|
||
|
} else if(type == "deletion") {
|
||
|
snp_file >> del_len;
|
||
|
} else if(type == "insertion") {
|
||
|
snp_file >> ins_seq;
|
||
|
}
|
||
|
index_t chr_idx = 0;
|
||
|
for(; chr_idx < _refnames_nospace.size(); chr_idx++) {
|
||
|
if(chr == _refnames_nospace[chr_idx])
|
||
|
break;
|
||
|
}
|
||
|
if(chr_idx >= _refnames_nospace.size()) {
|
||
|
continue;
|
||
|
}
|
||
|
assert_eq(chr_szs.size(), _refnames_nospace.size());
|
||
|
assert_lt(chr_idx, chr_szs.size());
|
||
|
pair<index_t, index_t> tmp_pair = chr_szs[chr_idx];
|
||
|
const index_t sofar_len = tmp_pair.first;
|
||
|
const index_t szs_idx = tmp_pair.second;
|
||
|
bool involve_Ns = false;
|
||
|
index_t pos = genome_pos;
|
||
|
index_t add_pos = 0;
|
||
|
assert(szs[szs_idx].first);
|
||
|
for(index_t i = szs_idx; i < szs.size(); i++) {
|
||
|
if(i != szs_idx && szs[i].first) {
|
||
|
break;
|
||
|
}
|
||
|
if(pos < szs[i].off) {
|
||
|
involve_Ns = true;
|
||
|
break;
|
||
|
} else {
|
||
|
pos -= szs[i].off;
|
||
|
if(pos == 0) {
|
||
|
if(type == "deletion" || type == "insertion") {
|
||
|
involve_Ns = true;
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
if(pos < szs[i].len) {
|
||
|
break;
|
||
|
} else {
|
||
|
pos -= szs[i].len;
|
||
|
add_pos += szs[i].len;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if(involve_Ns) {
|
||
|
continue;
|
||
|
}
|
||
|
pos = sofar_len + add_pos + pos;
|
||
|
if(chr_idx + 1 < chr_szs.size()) {
|
||
|
if(pos >= chr_szs[chr_idx + 1].first) {
|
||
|
continue;
|
||
|
}
|
||
|
} else {
|
||
|
if(pos >= jlen){
|
||
|
continue;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
_alts.expand();
|
||
|
ALT<index_t>& snp = _alts.back();
|
||
|
snp.pos = pos;
|
||
|
if(type == "single") {
|
||
|
snp.type = ALT_SNP_SGL;
|
||
|
snp_ch = toupper(snp_ch);
|
||
|
if(snp_ch != 'A' && snp_ch != 'C' && snp_ch != 'G' && snp_ch != 'T') {
|
||
|
_alts.pop_back();
|
||
|
continue;
|
||
|
}
|
||
|
uint64_t bp = asc2dna[(int)snp_ch];
|
||
|
assert_lt(bp, 4);
|
||
|
if((int)bp == s[pos]) {
|
||
|
if (!threeN) {
|
||
|
cerr << "Warning: single type should have a different base than " << "ACGTN"[(int)s[pos]]
|
||
|
<< " (" << snp_id << ") at " << genome_pos << " on " << chr << endl;
|
||
|
}
|
||
|
_alts.pop_back();
|
||
|
continue;
|
||
|
// throw 1;
|
||
|
}
|
||
|
snp.len = 1;
|
||
|
snp.seq = bp;
|
||
|
} else if(type == "deletion") {
|
||
|
snp.type = ALT_SNP_DEL;
|
||
|
snp.len = del_len;
|
||
|
snp.seq = 0;
|
||
|
snp.reversed = false;
|
||
|
} else if(type == "insertion") {
|
||
|
snp.type = ALT_SNP_INS;
|
||
|
snp.len = (index_t)ins_seq.size();
|
||
|
if(snp.len > sizeof(snp.seq) * 4) {
|
||
|
_alts.pop_back();
|
||
|
continue;
|
||
|
}
|
||
|
snp.seq = 0;
|
||
|
bool failed = false;
|
||
|
for(size_t i = 0; i < ins_seq.size(); i++) {
|
||
|
char ch = toupper(ins_seq[i]);
|
||
|
if(ch != 'A' && ch != 'C' && ch != 'G' && ch != 'T') {
|
||
|
failed = true;
|
||
|
break;
|
||
|
}
|
||
|
uint64_t bp = asc2dna[(int)ch];
|
||
|
assert_lt(bp, 4);
|
||
|
snp.seq = (snp.seq << 2) | bp;
|
||
|
}
|
||
|
if(failed) {
|
||
|
_alts.pop_back();
|
||
|
continue;
|
||
|
}
|
||
|
} else {
|
||
|
cerr << "Error: unknown snp type " << type << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
_altnames.push_back(snp_id);
|
||
|
assert_eq(_alts.size(), _altnames.size());
|
||
|
snpID2num[snp_id] = (index_t)_alts.size() - 1;
|
||
|
}
|
||
|
snp_file.close();
|
||
|
assert_eq(_alts.size(), _altnames.size());
|
||
|
}
|
||
|
|
||
|
_haplotypes.clear();
|
||
|
if(_alts.size() > 0 && htfile != "") {
|
||
|
ifstream ht_file(htfile.c_str(), ios::in);
|
||
|
if(!ht_file.is_open()) {
|
||
|
cerr << "Error: could not open "<< htfile.c_str() << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
|
||
|
while(!ht_file.eof()) {
|
||
|
// ht66 A*01:01:01:01 371 533 66,69,72,75,76,77,84,88,90,92,95
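// Field layout, as parsed below: <haplotypeID> <chr> <left> <right> <snpID,snpID,...>,
// where [left, right] is inclusive and each listed ID must match an entry from
// the SNP file above; unknown IDs are simply dropped from the haplotype.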
|
||
|
string ht_id;
|
||
|
ht_file >> ht_id;
|
||
|
if(ht_id.empty() || ht_id[0] == '#') {
|
||
|
string line;
|
||
|
getline(ht_file, line);
|
||
|
continue;
|
||
|
}
|
||
|
string chr, alt_list;
|
||
|
index_t left, right; // inclusive [left, right]
|
||
|
ht_file >> chr >> left >> right >> alt_list;
|
||
|
assert_leq(left, right);
|
||
|
index_t chr_idx = 0;
|
||
|
for(; chr_idx < _refnames_nospace.size(); chr_idx++) {
|
||
|
if(chr == _refnames_nospace[chr_idx])
|
||
|
break;
|
||
|
}
|
||
|
if(chr_idx >= _refnames_nospace.size()) {
|
||
|
continue;
|
||
|
}
|
||
|
assert_eq(chr_szs.size(), _refnames_nospace.size());
|
||
|
assert_lt(chr_idx, chr_szs.size());
|
||
|
pair<index_t, index_t> tmp_pair = chr_szs[chr_idx];
|
||
|
const index_t sofar_len = tmp_pair.first;
|
||
|
const index_t szs_idx = tmp_pair.second;
|
||
|
bool inside_Ns = false;
|
||
|
index_t add_pos = 0;
|
||
|
assert(szs[szs_idx].first);
|
||
|
for(index_t i = szs_idx; i < szs.size(); i++) {
|
||
|
if(i != szs_idx && szs[i].first) break;
|
||
|
if(left < szs[i].off) {
|
||
|
inside_Ns = true;
|
||
|
break;
|
||
|
} else {
|
||
|
left -= szs[i].off;
|
||
|
right -= szs[i].off;
|
||
|
if(left < szs[i].len) {
|
||
|
if(right >= szs[i].len) {
|
||
|
inside_Ns = true;
|
||
|
}
|
||
|
break;
|
||
|
} else {
|
||
|
left -= szs[i].len;
|
||
|
right -= szs[i].len;
|
||
|
add_pos += szs[i].len;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
if(inside_Ns) {
|
||
|
continue;
|
||
|
}
|
||
|
left = sofar_len + add_pos + left;
|
||
|
right = sofar_len + add_pos + right;
|
||
|
if(chr_idx + 1 < chr_szs.size()) {
|
||
|
if(right >= chr_szs[chr_idx + 1].first) {
|
||
|
continue;
|
||
|
}
|
||
|
} else {
|
||
|
if(right >= jlen) {
|
||
|
continue;
|
||
|
}
|
||
|
}
|
||
|
_haplotypes.expand();
|
||
|
_haplotypes.back().left = left;
|
||
|
_haplotypes.back().right = right;
|
||
|
EList<string> alts;
|
||
|
tokenize(alt_list, ",", alts);
|
||
|
assert_gt(alts.size(), 0);
|
||
|
_haplotypes.back().alts.clear();
|
||
|
for(size_t i = 0; i < alts.size(); i++) {
|
||
|
const string& alt = alts[i];
|
||
|
if(snpID2num.find(alt) != snpID2num.end()) {
|
||
|
_haplotypes.back().alts.push_back(snpID2num[alt]);
|
||
|
}
|
||
|
}
|
||
|
if(_haplotypes.back().alts.size() <= 0) {
|
||
|
_haplotypes.pop_back();
|
||
|
}
|
||
|
}
|
||
|
_haplotypes.sort();
|
||
|
ht_file.close();
|
||
|
} else {
|
||
|
for(index_t a = 0; a < _alts.size(); a++) {
|
||
|
const ALT<index_t>& alt = _alts[a];
|
||
|
if(!alt.snp()) continue;
|
||
|
_haplotypes.expand();
|
||
|
_haplotypes.back().left = alt.pos;
|
||
|
if(alt.deletion()) {
|
||
|
_haplotypes.back().right = alt.pos + alt.len - 1;
|
||
|
} else {
|
||
|
_haplotypes.back().right = alt.pos;
|
||
|
}
|
||
|
_haplotypes.back().alts.clear();
|
||
|
_haplotypes.back().alts.push_back(a);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if(ssfile != "") {
|
||
|
ifstream ss_file(ssfile.c_str(), ios::in);
|
||
|
if(!ss_file.is_open()) {
|
||
|
cerr << "Error: could not open " << ssfile.c_str() << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
map<uint64_t, uint64_t> ss_seq;
|
||
|
while(!ss_file.eof()) {
|
||
|
// 22 16062315 16062810 +
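// Field layout, as parsed below: <chr> <left> <right> <strand>, where left and
// right are the exonic positions flanking the intron; the "left += 1; right -= 1"
// below shifts them onto the first and last intronic bases before the site is
// recorded as an ALT_SPLICESITE.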
|
||
|
string chr;
|
||
|
ss_file >> chr;
|
||
|
if(chr.empty() || chr[0] == '#') {
|
||
|
string line;
|
||
|
getline(ss_file, line);
|
||
|
continue;
|
||
|
}
|
||
|
index_t left, right;
|
||
|
char strand;
|
||
|
ss_file >> left >> right >> strand;
|
||
|
// Convert exonic position to intronic position
|
||
|
left += 1; right -= 1;
|
||
|
if(left >= right) continue;
|
||
|
index_t chr_idx = 0;
|
||
|
for(; chr_idx < _refnames_nospace.size(); chr_idx++) {
|
||
|
if(chr == _refnames_nospace[chr_idx])
|
||
|
break;
|
||
|
}
|
||
|
if(chr_idx >= _refnames_nospace.size()) continue;
|
||
|
assert_eq(chr_szs.size(), _refnames_nospace.size());
|
||
|
assert_lt(chr_idx, chr_szs.size());
|
||
|
pair<index_t, index_t> tmp_pair = chr_szs[chr_idx];
|
||
|
const index_t sofar_len = tmp_pair.first;
|
||
|
const index_t szs_idx = tmp_pair.second;
|
||
|
|
||
|
// skip the site if either exon base flanking the intron (last base of the upstream exon, first base of the downstream exon) falls in an ambiguous (N) region
|
||
|
if(!checkPosToSzs(szs, szs_idx, left - 1)
|
||
|
|| !checkPosToSzs(szs, szs_idx, right + 1)) {
|
||
|
//cerr << "Skip ss. " << chr << ", " << left - 1 << ", " << right + 1 << endl;
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
bool inside_Ns = false;
|
||
|
index_t add_pos = 0;
|
||
|
assert(szs[szs_idx].first);
|
||
|
for(index_t i = szs_idx; i < szs.size(); i++) {
|
||
|
if(i != szs_idx && szs[i].first) break;
|
||
|
if(left < szs[i].off) {
|
||
|
inside_Ns = true;
|
||
|
break;
|
||
|
} else {
|
||
|
left -= szs[i].off;
|
||
|
right -= szs[i].off;
|
||
|
if(left < szs[i].len) {
|
||
|
if(right >= szs[i].len) {
|
||
|
inside_Ns = true;
|
||
|
}
|
||
|
break;
|
||
|
} else {
|
||
|
left -= szs[i].len;
|
||
|
right -= szs[i].len;
|
||
|
add_pos += szs[i].len;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
if(inside_Ns) continue;
|
||
|
left = sofar_len + add_pos + left;
|
||
|
right = sofar_len + add_pos + right;
|
||
|
if(chr_idx + 1 < chr_szs.size()) {
|
||
|
if(right >= chr_szs[chr_idx + 1].first) continue;
|
||
|
} else {
|
||
|
if(right >= jlen) continue;
|
||
|
}
|
||
|
|
||
|
// Avoid splice sites in repetitive sequences
|
||
|
// Otherwise, it will likely explode due to an exponential number of combinations
|
||
|
index_t seqlen = 16; assert_leq(seqlen, 16);
|
||
|
if(left >= seqlen && right + 1 + seqlen <= s.length()) {
|
||
|
uint64_t seq = 0;
|
||
|
for(index_t si = left - seqlen; si < left; si++) {
|
||
|
seq = seq << 2 | s[si];
|
||
|
}
|
||
|
for(index_t si = right + 1; si < right + 1 + seqlen; si++) {
|
||
|
seq = seq << 2 | s[si];
|
||
|
}
|
||
|
if(_alts.size() > 0) {
|
||
|
if(_alts.back().left == left &&
|
||
|
_alts.back().right == right) continue;
|
||
|
}
|
||
|
if(ss_seq.find(seq) == ss_seq.end()) ss_seq[seq] = 1;
|
||
|
else ss_seq[seq]++;
|
||
|
}
|
||
|
|
||
|
_alts.expand();
|
||
|
ALT<index_t>& alt = _alts.back();
|
||
|
alt.type = ALT_SPLICESITE;
|
||
|
alt.left = left;
|
||
|
alt.right = right;
|
||
|
alt.fw = (strand == '+' ? true : false);
|
||
|
alt.excluded = false;
|
||
|
_altnames.push_back("ss");
|
||
|
}
|
||
|
ss_file.close();
|
||
|
assert_eq(_alts.size(), _altnames.size());
|
||
|
|
||
|
for(size_t i = 0; i < _alts.size(); i++) {
|
||
|
ALT<index_t>& alt = _alts[i];
|
||
|
if(!alt.splicesite()) continue;
|
||
|
index_t seqlen = 16; assert_leq(seqlen, 16);
|
||
|
if(alt.left >= seqlen && alt.right + 1 + seqlen <= s.length()) {
|
||
|
uint64_t seq = 0;
|
||
|
for(index_t si = alt.left - seqlen; si < alt.left; si++) {
|
||
|
seq = seq << 2 | s[si];
|
||
|
}
|
||
|
for(index_t si = alt.right + 1; si < alt.right + 1 + seqlen; si++) {
|
||
|
seq = seq << 2 | s[si];
|
||
|
}
|
||
|
assert(ss_seq.find(seq) != ss_seq.end());
|
||
|
alt.excluded = ss_seq[seq] > 1;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if(exonfile != "") {
|
||
|
ifstream exon_file(exonfile.c_str(), ios::in);
|
||
|
if(!exon_file.is_open()) {
|
||
|
cerr << "Error: could not open " << ssfile.c_str() << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
while(!exon_file.eof()) {
|
||
|
// 22 16062156 16062315 +
|
||
|
string chr;
|
||
|
exon_file >> chr;
|
||
|
if(chr.empty() || chr[0] == '#') {
|
||
|
string line;
|
||
|
getline(exon_file, line);
|
||
|
continue;
|
||
|
}
|
||
|
index_t left, right;
|
||
|
char strand;
|
||
|
exon_file >> left >> right >> strand;
|
||
|
// Convert exonic position to intronic position
|
||
|
left += 1; right -= 1;
|
||
|
if(left >= right) continue;
|
||
|
index_t chr_idx = 0;
|
||
|
for(; chr_idx < _refnames_nospace.size(); chr_idx++) {
|
||
|
if(chr == _refnames_nospace[chr_idx])
|
||
|
break;
|
||
|
}
|
||
|
if(chr_idx >= _refnames_nospace.size()) continue;
|
||
|
assert_eq(chr_szs.size(), _refnames_nospace.size());
|
||
|
assert_lt(chr_idx, chr_szs.size());
|
||
|
pair<index_t, index_t> tmp_pair = chr_szs[chr_idx];
|
||
|
const index_t sofar_len = tmp_pair.first;
|
||
|
const index_t szs_idx = tmp_pair.second;
|
||
|
bool inside_Ns = false;
|
||
|
index_t add_pos = 0;
|
||
|
assert(szs[szs_idx].first);
|
||
|
for(index_t i = szs_idx; i < szs.size(); i++) {
|
||
|
if(i != szs_idx && szs[i].first) break;
|
||
|
if(left < szs[i].off) {
|
||
|
inside_Ns = true;
|
||
|
break;
|
||
|
} else {
|
||
|
left -= szs[i].off;
|
||
|
right -= szs[i].off;
|
||
|
if(left < szs[i].len) {
|
||
|
if(right >= szs[i].len) {
|
||
|
inside_Ns = true;
|
||
|
}
|
||
|
break;
|
||
|
} else {
|
||
|
left -= szs[i].len;
|
||
|
right -= szs[i].len;
|
||
|
add_pos += szs[i].len;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
if(inside_Ns) continue;
|
||
|
left = sofar_len + add_pos + left;
|
||
|
right = sofar_len + add_pos + right;
|
||
|
if(chr_idx + 1 < chr_szs.size()) {
|
||
|
if(right >= chr_szs[chr_idx + 1].first) continue;
|
||
|
} else {
|
||
|
if(right >= jlen) continue;
|
||
|
}
|
||
|
|
||
|
_alts.expand();
|
||
|
ALT<index_t>& alt = _alts.back();
|
||
|
alt.type = ALT_EXON;
|
||
|
alt.left = left;
|
||
|
alt.right = right;
|
||
|
alt.fw = (strand == '+' ? true : false);
|
||
|
_altnames.push_back("exon");
|
||
|
}
|
||
|
exon_file.close();
|
||
|
}
|
||
|
|
||
|
// Todo - implement structural variations
|
||
|
if(svfile != "") {
|
||
|
cerr << "Warning: SV option is not implemented " << svfile.c_str() << endl;
|
||
|
}
|
||
|
|
||
|
// Sort SNPs and Splice Sites based on positions
|
||
|
if(_alts.size() > 1) {
|
||
|
assert_eq(_alts.size(), _altnames.size());
|
||
|
EList<pair<ALT<index_t>, index_t> > buf; buf.resize(_alts.size());
|
||
|
EList<string> buf2; buf2.resize(_alts.size());
|
||
|
for(size_t i = 0; i < _alts.size(); i++) {
|
||
|
buf[i].first = _alts[i];
|
||
|
buf[i].second = (index_t)i;
|
||
|
buf2[i] = _altnames[i];
|
||
|
}
|
||
|
buf.sort();
|
||
|
for(size_t i = 0; i < _alts.size(); i++) {
|
||
|
_alts[i] = buf[i].first;
|
||
|
_altnames[i] = buf2[buf[i].second];
|
||
|
}
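// buf[i].second still records each ALT's pre-sort index, so build the
// inverse mapping (old index -> new index) and use it to remap the ALT
// indices stored in each haplotype.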
|
||
|
|
||
|
EList<index_t> buf3; buf3.resize(_alts.size());
|
||
|
for(size_t i = 0; i < buf3.size(); i++) {
|
||
|
index_t before = buf[i].second;
|
||
|
assert_lt(before, buf3.size());
|
||
|
buf3[before] = (index_t)i;
|
||
|
}
|
||
|
for(size_t h = 0; h < _haplotypes.size(); h++) {
|
||
|
EList<index_t, 1>& alts = _haplotypes[h].alts;
|
||
|
for(size_t a = 0; a < alts.size(); a++) {
|
||
|
index_t before = alts[a];
|
||
|
assert_lt(before, buf3.size());
|
||
|
alts[a] = buf3[before];
|
||
|
}
|
||
|
}
|
||
|
#ifndef NDEBUG
|
||
|
for(size_t i = 0; i < _alts.size(); i++) {
|
||
|
if(i + 1 < _alts.size()) {
|
||
|
assert(_alts[i] < _alts[i+1]);
|
||
|
}
|
||
|
const ALT<index_t>& alt = _alts[i];
|
||
|
if(alt.snp()) {
|
||
|
assert(_altnames[i] != "");
|
||
|
} else if(alt.splicesite()) {
|
||
|
assert(_altnames[i] == "ss");
|
||
|
} else if(alt.exon()) {
|
||
|
assert(_altnames[i] == "exon");
|
||
|
} else {
|
||
|
assert(false);
|
||
|
}
|
||
|
}
|
||
|
#endif
|
||
|
}
|
||
|
|
||
|
writeIndex<index_t>(fout7, (index_t)_alts.size(), this->toBe());
|
||
|
writeIndex<index_t>(fout8, (index_t)_alts.size(), this->toBe());
|
||
|
for(index_t i = 0; i < _alts.size(); i++) {
|
||
|
_alts[i].write(fout7, this->toBe());
|
||
|
fout8 << _altnames[i] << endl;
|
||
|
}
|
||
|
|
||
|
writeIndex<index_t>(fout7, (index_t)_haplotypes.size(), this->toBe());
|
||
|
for(index_t i = 0; i < _haplotypes.size(); i++) {
|
||
|
_haplotypes[i].write(fout7, this->toBe());
|
||
|
}
|
||
|
|
||
|
EList<Repeat<index_t> >& repeats = _repeatdb.repeats();
|
||
|
if(_repeat) {
|
||
|
ifstream repeat_file(repeatfile.c_str(), ios::in);
|
||
|
if(!repeat_file.is_open()) {
|
||
|
cerr << "Error: could not open " << ssfile.c_str() << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
if(parent_szs == NULL) {
|
||
|
throw 1;
|
||
|
}
|
||
|
if(parent_refnames == NULL) {
|
||
|
throw 1;
|
||
|
}
|
||
|
|
||
|
EList<pair<index_t, index_t> > parent_chr_szs;
|
||
|
index_t tmp_len = 0;
|
||
|
for(index_t i = 0; i < parent_szs->size(); i++) {
|
||
|
if((*parent_szs)[i].first) {
|
||
|
parent_chr_szs.expand();
|
||
|
parent_chr_szs.back().first = tmp_len;
|
||
|
parent_chr_szs.back().second = i;
|
||
|
}
|
||
|
tmp_len += (index_t)(*parent_szs)[i].len;
|
||
|
}
|
||
|
index_t parent_jlen = joinedLen(*parent_szs);
|
||
|
|
||
|
string prev_repName = "";
|
||
|
while(!repeat_file.eof()) {
|
||
|
// >rep1*0 rep 0 100 470 0
|
||
|
// 20_rep:26622650:+ 20_rep:26628088:+ 20_rep:26632508:+ 20_rep:26635636:+
|
||
|
// 20_rep:26669936:+ 20_rep:26672654:+ 20_rep:26675373:+ 20_rep:26678095:+
|
||
|
string repName, repAlleleName;
|
||
|
repeat_file >> repAlleleName;
|
||
|
if(repAlleleName.empty()) // Reached the end of file
|
||
|
break;
|
||
|
if(repAlleleName[0] != '>') {
|
||
|
cerr << "Error: the file format is not correct" << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
repAlleleName = repAlleleName.substr(1); // Remove '>'
|
||
|
index_t alleleID = 0;
|
||
|
size_t star_pos = repAlleleName.find('*');
|
||
|
if(star_pos >= repAlleleName.length()) {
|
||
|
repName = repAlleleName;
|
||
|
} else {
|
||
|
repName = repAlleleName.substr(0, star_pos);
|
||
|
string strID = repAlleleName.substr(star_pos + 1);
|
||
|
istringstream(strID) >> alleleID;
|
||
|
}
|
||
|
|
||
|
string refRepName;
|
||
|
index_t repPos, repLen;
|
||
|
repeat_file >> refRepName >> repPos >> repLen;
|
||
|
index_t rep_idx = 0;
|
||
|
for(; rep_idx < _refnames_nospace.size(); rep_idx++) {
|
||
|
if(refRepName == _refnames_nospace[rep_idx])
|
||
|
break;
|
||
|
}
|
||
|
if(rep_idx >= _refnames_nospace.size()) {
|
||
|
cerr << "Error: " << refRepName << " is not found in " << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
|
||
|
if(repeats.size() == 0 ||
|
||
|
repeats.back().repID != rep_idx ||
|
||
|
repeats.back().repName != repName) {
|
||
|
if(repeats.size() > 0) {
|
||
|
repeats.back().positions.sort();
|
||
|
}
|
||
|
repeats.expand();
|
||
|
repeats.back().init(repName, rep_idx, repPos, repLen);
|
||
|
}
|
||
|
|
||
|
// update repPos and repLen
|
||
|
if(repPos < repeats.back().repPos) {
|
||
|
repeats.back().repLen += (repeats.back().repPos - repPos);
|
||
|
repeats.back().repPos = repPos;
|
||
|
}
|
||
|
if(repPos + repLen > repeats.back().repPos + repeats.back().repLen) {
|
||
|
repeats.back().repLen = repPos + repLen - repeats.back().repPos;
|
||
|
}
|
||
|
|
||
|
size_t baseOff = 0;
|
||
|
if(repeats.size() > 1 && repeats[repeats.size() - 2].repID == rep_idx) {
|
||
|
baseOff = repeats[repeats.size() - 2].repPos + repeats[repeats.size() - 2].repLen;
|
||
|
}
|
||
|
|
||
|
index_t numCoords, numAlts;
|
||
|
repeat_file >> numCoords >> numAlts;
|
||
|
EList<index_t> snpIDs;
|
||
|
EList<string> snpStrIDs;
|
||
|
if(numAlts > 0) {
|
||
|
string snpStrID;
|
||
|
repeat_file >> snpStrID;
|
||
|
tokenize(snpStrID, ",", snpStrIDs);
|
||
|
if(snpStrIDs.size() != numAlts) {
|
||
|
assert(false);
|
||
|
cerr << "Error: the number of SNPs (" << snpIDs.size() << ", " << snpStrID << ") does not equal to " << numAlts << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
for(index_t i = 0; i < snpStrIDs.size(); i++) {
|
||
|
if(snpID2num.find(snpStrIDs[i]) == snpID2num.end()) {
|
||
|
cerr << "Error: " << snpStrIDs[i] << " is not found" << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
index_t numID = snpID2num[snpStrIDs[i]];
|
||
|
snpIDs.push_back(numID);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
EList<RepeatCoord<index_t> >& positions = repeats.back().positions;
|
||
|
size_t sofar_numCoords = positions.size();
|
||
|
while(positions.size() - sofar_numCoords < numCoords) {
|
||
|
string chr_pos;
|
||
|
repeat_file >> chr_pos;
|
||
|
size_t colon_pos = chr_pos.find(':');
|
||
|
if(colon_pos == string::npos || colon_pos + 1 >= chr_pos.length()) {
|
||
|
cerr << "Error: : is not found in " << chr_pos << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
string chr = chr_pos.substr(0, colon_pos);
|
||
|
string strPos = chr_pos.substr(colon_pos + 1, chr_pos.length() - colon_pos - 3);
|
||
|
bool repfw = (chr_pos[chr_pos.length() - 1] == '+');
|
||
|
index_t pos = 0;
|
||
|
istringstream(strPos) >> pos;
|
||
|
index_t chr_idx = 0;
|
||
|
for(; chr_idx < parent_refnames->size(); chr_idx++) {
|
||
|
if(chr == (*parent_refnames)[chr_idx])
|
||
|
break;
|
||
|
}
|
||
|
if(chr_idx >= parent_refnames->size()) {
|
||
|
cerr << "Error: " << chr << " is not found in " << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
assert_eq(parent_chr_szs.size(), parent_refnames->size());
|
||
|
assert_lt(chr_idx, parent_chr_szs.size());
|
||
|
|
||
|
positions.expand();
|
||
|
positions.back().tid = chr_idx;
|
||
|
positions.back().toff = pos;
|
||
|
positions.back().fw = repfw;
|
||
|
positions.back().alleleID = alleleID;
|
||
|
|
||
|
pair<index_t, index_t> tmp_pair = parent_chr_szs[chr_idx];
|
||
|
const index_t sofar_len = tmp_pair.first;
|
||
|
const index_t szs_idx = tmp_pair.second;
|
||
|
bool involve_Ns = false;
|
||
|
index_t add_pos = 0;
|
||
|
assert((*parent_szs)[szs_idx].first);
|
||
|
for(index_t i = szs_idx; i < parent_szs->size(); i++) {
|
||
|
if(i != szs_idx && (*parent_szs)[i].first) {
|
||
|
break;
|
||
|
}
|
||
|
if(pos < (*parent_szs)[i].off) {
|
||
|
involve_Ns = true;
|
||
|
break;
|
||
|
} else {
|
||
|
pos -= (*parent_szs)[i].off;
|
||
|
if(pos < (*parent_szs)[i].len) {
|
||
|
break;
|
||
|
} else {
|
||
|
pos -= (*parent_szs)[i].len;
|
||
|
add_pos += (*parent_szs)[i].len;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
if(involve_Ns) {
|
||
|
assert(false);
|
||
|
throw 1;
|
||
|
}
|
||
|
pos = sofar_len + add_pos + pos;
|
||
|
if(chr_idx + 1 < parent_chr_szs.size()) {
|
||
|
if(pos >= parent_chr_szs[chr_idx + 1].first) {
|
||
|
assert(false);
|
||
|
throw 1;
|
||
|
}
|
||
|
} else {
|
||
|
if(pos >= parent_jlen){
|
||
|
assert(false);
|
||
|
throw 1;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
positions.back().joinedOff = pos;
|
||
|
}
|
||
|
repeats.back().alleles.expand();
|
||
|
assert_geq(repPos, baseOff);
|
||
|
repeats.back().alleles.back().init(repPos - baseOff, repLen);
|
||
|
|
||
|
}
|
||
|
if(repeats.size() > 0) {
|
||
|
repeats.back().positions.sort();
|
||
|
}
|
||
|
repeat_file.close();
|
||
|
|
||
|
index_t total_repeat_len = 0;
|
||
|
for(size_t r = 0; r + 1 < repeats.size(); r++) {
|
||
|
if(repeats[r].repID != repeats[r+1].repID) {
|
||
|
index_t repeat_len = repeats[r].repPos + repeats[r].repLen;
|
||
|
total_repeat_len += repeat_len;
|
||
|
}
|
||
|
}
|
||
|
index_t repeat_len = repeats.back().repPos + repeats.back().repLen;
|
||
|
total_repeat_len += repeat_len;
|
||
|
if(total_repeat_len != s.length()) {
|
||
|
cerr << "Error: repeat length (" << repeats.back().repPos + repeats.back().repLen;
|
||
|
cerr << ") does not match sequence length (" << s.length() << ")" << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
|
||
|
_repeatLens.resizeExact(szs.size());
|
||
|
for(size_t i = 0; i < _repeatLens.size(); i++) {
|
||
|
_repeatLens[i].first = numeric_limits<index_t>::max();
|
||
|
_repeatLens[i].second = 0;
|
||
|
}
|
||
|
for(size_t i = 0; i < repeats.size(); i++) {
|
||
|
index_t id = repeats[i].repID;
|
||
|
index_t len = repeats[i].repLen;
|
||
|
assert_lt(id, _repeatLens.size());
|
||
|
if(_repeatLens[id].first > len) {
|
||
|
_repeatLens[id].first = len;
|
||
|
}
|
||
|
if(_repeatLens[id].second < len) {
|
||
|
_repeatLens[id].second = len;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
writeIndex<index_t>(fout7, _repeatLens.size(), this->toBe());
|
||
|
for(size_t i = 0; i < _repeatLens.size(); i++) {
|
||
|
writeIndex<index_t>(fout7, _repeatLens[i].first, this->toBe());
|
||
|
writeIndex<index_t>(fout7, _repeatLens[i].second, this->toBe());
|
||
|
}
|
||
|
_repeatdb.write(fout7, this->toBe());
|
||
|
writeIndex<index_t>(fout7, chr_szs.size(), this->toBe()); // number of repeat indexes
|
||
|
EList<string> seqs;
|
||
|
EList<streampos> tableFilePos;
|
||
|
streampos filepos = fout7.tellp();
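// Reserve one 64-bit placeholder per repeat index and remember where the
// placeholders start; after each per-repID k-mer table is written below,
// its end position is recorded, and once all tables are out we seek back
// and patch the placeholders with those positions.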
|
||
|
for(size_t i = 0; i < chr_szs.size(); i++) {
|
||
|
writeIndex<uint64_t>(fout7, 0, this->toBe());
|
||
|
}
|
||
|
for(size_t i = 0; i < repeats.size(); i++) {
|
||
|
const Repeat<index_t>& repeat = repeats[i];
|
||
|
assert_lt(repeat.repID, chr_szs.size());
|
||
|
index_t template_len = 0;
|
||
|
if(repeat.repID + 1 < chr_szs.size()) {
|
||
|
template_len = chr_szs[repeat.repID + 1].first - chr_szs[repeat.repID].first;
|
||
|
} else {
|
||
|
template_len = s.length() - chr_szs[repeat.repID].first;
|
||
|
}
|
||
|
assert_leq(repeat.repPos + repeat.repLen, template_len);
|
||
|
index_t pos = chr_szs[repeat.repID].first + repeat.repPos;
|
||
|
assert_leq(pos + repeat.repLen, s.length());
|
||
|
seqs.expand();
|
||
|
seqs.back().clear();
|
||
|
for(index_t j = 0; j < repeat.repLen; j++) {
|
||
|
int c = s[pos + j];
|
||
|
assert_range(0, 3, c);
|
||
|
seqs.back().push_back("ACGT"[c]);
|
||
|
}
|
||
|
|
||
|
if(i + 1 == repeats.size() || repeats[i].repID != repeats[i+1].repID) {
|
||
|
const size_t w = RB_Minimizer<string>::default_w, k = RB_Minimizer<string>::default_k;
|
||
|
RB_KmerTable kmer_table;
|
||
|
kmer_table.build(seqs, w, k);
|
||
|
kmer_table.write(fout7, this->toBe());
|
||
|
seqs.clear();
|
||
|
tableFilePos.push_back(fout7.tellp());
|
||
|
}
|
||
|
}
|
||
|
assert_eq(tableFilePos.size(), chr_szs.size());
|
||
|
streampos origpos = fout7.tellp();
|
||
|
fout7.seekp(filepos);
|
||
|
for(size_t i = 0; i < tableFilePos.size(); i++) {
|
||
|
writeIndex<uint64_t>(fout7, tableFilePos[i], this->toBe());
|
||
|
}
|
||
|
fout7.seekp(origpos);
|
||
|
}
|
||
|
|
||
|
fout7.close();
|
||
|
fout8.close();
|
||
|
}
|
||
|
// Joined reference sequence now in 's'
|
||
|
} catch(bad_alloc& e) {
|
||
|
// If we throw an allocation exception in the try block,
|
||
|
// that means that the joined version of the reference
|
||
|
// string itself is too large to fit in memory. The only
|
||
|
// alternatives are to tell the user to give us more memory
|
||
|
// or to try again with a packed representation of the
|
||
|
// reference (if we haven't tried that already).
|
||
|
cerr << "Could not allocate space for a joined string of " << jlen << " elements." << endl;
|
||
|
if(!isPacked() && _passMemExc) {
|
||
|
// Pass the exception up so that we can retry using a
|
||
|
// packed string representation
|
||
|
throw e;
|
||
|
}
|
||
|
// There's no point passing this exception on. The fact
|
||
|
// that we couldn't allocate the joined string means that
|
||
|
// --bmax is irrelevant - the user should re-run with
|
||
|
// ebwt-build-packed
|
||
|
if(isPacked()) {
|
||
|
cerr << "Please try running bowtie-build on a computer with more memory." << endl;
|
||
|
} else {
|
||
|
cerr << "Please try running bowtie-build in packed mode (-p/--packed) or in automatic" << endl
|
||
|
<< "mode (-a/--auto), or try again on a computer with more memory." << endl;
|
||
|
}
|
||
|
if(sizeof(void*) == 4) {
|
||
|
cerr << "If this computer has more than 4 GB of memory, try using a 64-bit executable;" << endl
|
||
|
<< "this executable is 32-bit." << endl;
|
||
|
}
|
||
|
throw 1;
|
||
|
}
|
||
|
|
||
|
// Successfully obtained joined reference string
|
||
|
assert_geq(s.length(), jlen);
|
||
|
if(bmax != (index_t)OFF_MASK) {
|
||
|
// VMSG_NL("bmax according to bmax setting: " << bmax);
|
||
|
}
|
||
|
else if(bmaxSqrtMult != (index_t)OFF_MASK) {
|
||
|
bmax *= bmaxSqrtMult;
|
||
|
// VMSG_NL("bmax according to bmaxSqrtMult setting: " << bmax);
|
||
|
}
|
||
|
else if(bmaxDivN != (index_t)OFF_MASK) {
|
||
|
bmax = max<uint32_t>(jlen / (bmaxDivN * _nthreads), 1);
|
||
|
// VMSG_NL("bmax according to bmaxDivN setting: " << bmax);
|
||
|
}
|
||
|
else {
|
||
|
bmax = (uint32_t)sqrt(s.length());
|
||
|
// VMSG_NL("bmax defaulted to: " << bmax);
|
||
|
}
|
||
|
|
||
|
int iter = 0;
|
||
|
bool first = true;
|
||
|
streampos out1pos = out1.tellp();
|
||
|
streampos out2pos = out2.tellp();
|
||
|
|
||
|
if(!_repeat) {
|
||
|
// Look for bmax/dcv parameters that work.
|
||
|
while(true) {
|
||
|
if(!first && bmax < 40 && _passMemExc) {
|
||
|
cerr << "Could not find approrpiate bmax/dcv settings for building this index." << endl;
|
||
|
if(!isPacked()) {
|
||
|
// Throw an exception so that we can
|
||
|
// retry using a packed string representation
|
||
|
throw bad_alloc();
|
||
|
} else {
|
||
|
cerr << "Already tried a packed string representation." << endl;
|
||
|
}
|
||
|
cerr << "Please try indexing this reference on a computer with more memory." << endl;
|
||
|
if(sizeof(void*) == 4) {
|
||
|
cerr << "If this computer has more than 4 GB of memory, try using a 64-bit executable;" << endl
|
||
|
<< "this executable is 32-bit." << endl;
|
||
|
}
|
||
|
throw 1;
|
||
|
}
|
||
|
if(!first) {
|
||
|
out1.seekp(out1pos);
|
||
|
out2.seekp(out2pos);
|
||
|
}
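// Retry policy: every sixth failed attempt doubles the difference-cover
// period (capped at 4096); otherwise bmax is reduced by 25%. Both changes
// trade construction speed for a smaller memory footprint.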
|
||
|
if(dcv > 4096) dcv = 4096;
|
||
|
if((iter % 6) == 5 && dcv < 4096 && dcv != 0) {
|
||
|
dcv <<= 1; // double difference-cover period
|
||
|
} else {
|
||
|
bmax -= (bmax >> 2); // reduce by 25%
|
||
|
}
|
||
|
iter++;
|
||
|
try {
|
||
|
if(_alts.empty()) {
|
||
|
VMSG("Using parameters --bmax " << bmax);
|
||
|
if(dcv == 0) {
|
||
|
VMSG_NL(" and *no difference cover*");
|
||
|
} else {
|
||
|
VMSG_NL(" --dcv " << dcv);
|
||
|
}
|
||
|
{
|
||
|
VMSG_NL(" Doing ahead-of-time memory usage test");
|
||
|
// Make a quick-and-dirty attempt to force a bad_alloc iff
|
||
|
// we would have thrown one eventually as part of
|
||
|
// constructing the DifferenceCoverSample
|
||
|
dcv <<= 1;
|
||
|
index_t sz = (index_t)DifferenceCoverSample<TStr>::simulateAllocs(s, dcv >> 1);
|
||
|
if(_nthreads > 1) sz *= (_nthreads + 1);
|
||
|
AutoArray<uint8_t> tmp(sz, EBWT_CAT);
|
||
|
dcv >>= 1;
|
||
|
// Likewise with the KarkkainenBlockwiseSA
|
||
|
sz = (index_t)KarkkainenBlockwiseSA<TStr>::simulateAllocs(s, bmax);
|
||
|
AutoArray<uint8_t> tmp2(sz, EBWT_CAT);
|
||
|
// Now throw in the 'ftab' and 'isaSample' structures
|
||
|
// that we'll eventually allocate in buildToDisk
|
||
|
AutoArray<index_t> ftab(_gh._ftabLen * 2, EBWT_CAT);
|
||
|
AutoArray<uint8_t> side(_gh._sideSz, EBWT_CAT);
|
||
|
// Grab another 20 MB out of caution
|
||
|
AutoArray<uint32_t> extra(20*1024*1024, EBWT_CAT);
|
||
|
// If we made it here without throwing bad_alloc, then we
|
||
|
// passed the memory-usage stress test
|
||
|
VMSG(" Passed! Constructing with these parameters: --bmax " << bmax << " --dcv " << dcv);
|
||
|
if(isPacked()) {
|
||
|
VMSG(" --packed");
|
||
|
}
|
||
|
VMSG_NL("");
|
||
|
}
|
||
|
VMSG_NL("Constructing suffix-array element generator");
|
||
|
KarkkainenBlockwiseSA<TStr> bsa(s, bmax, _nthreads, dcv, seed, _sanity, _passMemExc, _verbose, outfile);
|
||
|
assert(bsa.suffixItrIsReset());
|
||
|
assert_eq(bsa.size(), s.length()+1);
|
||
|
VMSG_NL("Converting suffix-array elements to index image");
|
||
|
buildToDisk(bsa, s, out1, out2);
|
||
|
} else {
|
||
|
RefGraph<index_t>* graph = new RefGraph<index_t>(
|
||
|
s,
|
||
|
szs,
|
||
|
_alts,
|
||
|
_haplotypes,
|
||
|
outfile,
|
||
|
_nthreads,
|
||
|
verbose);
|
||
|
PathGraph<index_t>* pg = new PathGraph<index_t>(
|
||
|
*graph,
|
||
|
outfile,
|
||
|
std::numeric_limits<index_t>::max(),
|
||
|
_nthreads,
|
||
|
verbose);
|
||
|
|
||
|
if(verbose) { cerr << "Generating edges... " << endl; }
|
||
|
if(!pg->generateEdges(*graph)) { return; }
|
||
|
// Re-initialize GFM parameters to reflect real number of edges (gbwt string)
|
||
|
_gh.init(
|
||
|
_gh.len(),
|
||
|
pg->getNumEdges(),
|
||
|
pg->getNumNodes(),
|
||
|
_gh.lineRate(),
|
||
|
_gh.offRate(),
|
||
|
_gh.ftabChars(),
|
||
|
0,
|
||
|
_gh.entireReverse());
|
||
|
buildToDisk(*pg, s, out1, out2);
|
||
|
delete pg; pg = NULL;
|
||
|
delete graph; graph = NULL;
|
||
|
}
|
||
|
out1.flush(); out2.flush();
|
||
|
if(out1.fail() || out2.fail()) {
|
||
|
cerr << "An error occurred writing the index to disk. Please check if the disk is full." << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
break;
|
||
|
} catch(bad_alloc& e) {
|
||
|
if(_passMemExc) {
|
||
|
VMSG_NL(" Ran out of memory; automatically trying more memory-economical parameters.");
|
||
|
} else {
|
||
|
cerr << "Out of memory while constructing suffix array. Please try using a smaller" << endl
|
||
|
<< "number of blocks by specifying a smaller --bmax or a larger --bmaxdivn" << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
}
|
||
|
first = false;
|
||
|
}
|
||
|
assert(repOk());
|
||
|
|
||
|
// Now write reference sequence names on the end
|
||
|
assert_eq(this->_refnames.size(), this->_nPat);
|
||
|
for(index_t i = 0; i < this->_refnames.size(); i++) {
|
||
|
out1 << this->_refnames[i].c_str() << endl;
|
||
|
}
|
||
|
out1 << '\0';
|
||
|
out1.flush(); out2.flush();
|
||
|
if(out1.fail() || out2.fail()) {
|
||
|
cerr << "An error occurred writing the index to disk. Please check if the disk is full." << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
}
|
||
|
VMSG_NL("Returning from initFromVector");
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Return the length that the joined string of the given string
|
||
|
* list will have. Note that this is indifferent to how the text
|
||
|
* fragments correspond to input sequences - it just cares about
|
||
|
* the lengths of the fragments.
|
||
|
*/
|
||
|
index_t joinedLen(EList<RefRecord>& szs) {
|
||
|
index_t ret = 0;
|
||
|
for(unsigned int i = 0; i < szs.size(); i++) {
|
||
|
ret += (index_t)szs[i].len;
|
||
|
}
|
||
|
return ret;
|
||
|
}
|
||
|
|
||
|
/// Destruct an Ebwt
|
||
|
~GFM() {
|
||
|
_fchr.reset();
|
||
|
_ftab.reset();
|
||
|
_eftab.reset();
|
||
|
_plen.reset();
|
||
|
_rstarts.reset();
|
||
|
_offs.reset();
|
||
|
_gfm.reset();
|
||
|
if(offs() != NULL && useShmem_) {
|
||
|
FREE_SHARED(offs());
|
||
|
}
|
||
|
if(gfm() != NULL && useShmem_) {
|
||
|
FREE_SHARED(gfm());
|
||
|
}
|
||
|
if (_in1 != NULL) fclose(_in1);
|
||
|
if (_in2 != NULL) fclose(_in2);
|
||
|
}
|
||
|
|
||
|
/// Accessors
|
||
|
inline const GFMParams<index_t>& gh() const { return _gh; }
|
||
|
index_t numZOffs() const { return _zOffs.size(); }
|
||
|
index_t zOff(index_t i) const { assert_lt(i, _zOffs.size()); return _zOffs[i]; }
|
||
|
index_t zGbwtByteOff(index_t i) const { assert_lt(i, _zGbwtByteOffs.size()); return _zGbwtByteOffs[i]; }
|
||
|
int zGbwtBpOff(index_t i) const { assert_lt(i, _zGbwtBpOffs.size()); return _zGbwtBpOffs[i]; }
|
||
|
index_t nPat() const { return _nPat; }
|
||
|
index_t nFrag() const { return _nFrag; }
|
||
|
inline index_t* fchr() { return _fchr.get(); }
|
||
|
inline index_t* ftab() { return _ftab.get(); }
|
||
|
inline index_t* eftab() { return _eftab.get(); }
|
||
|
inline index_t* offs() { return _offs.get(); }
|
||
|
inline index_t* plen() { return _plen.get(); }
|
||
|
inline index_t* rstarts() { return _rstarts.get(); }
|
||
|
inline uint8_t* gfm() { return _gfm.get(); }
|
||
|
inline const index_t* fchr() const { return _fchr.get(); }
|
||
|
inline const index_t* ftab() const { return _ftab.get(); }
|
||
|
inline const index_t* eftab() const { return _eftab.get(); }
|
||
|
inline const index_t* offs() const { return _offs.get(); }
|
||
|
inline const index_t* plen() const { return _plen.get(); }
|
||
|
inline const index_t* rstarts() const { return _rstarts.get(); }
|
||
|
inline const uint8_t* gfm() const { return _gfm.get(); }
|
||
|
inline const EList<ALT<index_t> >& alts() const { return _alts; }
|
||
|
inline const EList<string>& altnames() const { return _altnames; }
|
||
|
bool toBe() const { return _toBigEndian; }
|
||
|
bool verbose() const { return _verbose; }
|
||
|
bool sanityCheck() const { return _sanity; }
|
||
|
EList<string>& refnames() { return _refnames; }
|
||
|
bool fw() const { return fw_; }
|
||
|
bool repeat() const { return _repeat; }
|
||
|
const EList<uint8_t>& getRepeatIncluded() const { return _repeatIncluded; }
|
||
|
|
||
|
#ifdef POPCNT_CAPABILITY
|
||
|
bool _usePOPCNTinstruction;
|
||
|
#endif
|
||
|
|
||
|
/**
|
||
|
* Returns true iff the index contains the given string (exactly). The
|
||
|
* given string must contain only unambiguous characters. TODO:
|
||
|
* support skipping of ambiguous characters.
|
||
|
*/
|
||
|
bool contains(
|
||
|
const BTDnaString& str,
|
||
|
index_t *top = NULL,
|
||
|
index_t *bot = NULL) const;
|
||
|
|
||
|
/**
|
||
|
* Returns true iff the index contains the given string (exactly). The
|
||
|
* given string must contain only unambiguous characters. TODO:
|
||
|
* support skipping of ambiguous characters.
|
||
|
*/
|
||
|
bool contains(
|
||
|
const char *str,
|
||
|
index_t *top = NULL,
|
||
|
index_t *bot = NULL) const
|
||
|
{
|
||
|
return contains(BTDnaString(str, true), top, bot);
|
||
|
}
|
||
|
|
||
|
/// Return true iff the Ebwt is currently in memory
|
||
|
bool isInMemory() const {
|
||
|
if(gfm() != NULL) {
|
||
|
// Note: We might have skipped loading _offs, _ftab,
|
||
|
// _eftab, and _rstarts depending on whether this is the
|
||
|
// reverse index and what algorithm is being used.
|
||
|
assert(_gh.repOk());
|
||
|
//assert(_ftab != NULL);
|
||
|
//assert(_eftab != NULL);
|
||
|
assert(fchr() != NULL);
|
||
|
//assert(_offs != NULL);
|
||
|
//assert(_rstarts != NULL);
|
||
|
// assert_neq(_zGbwtByteOff, INDEX_MAX);
|
||
|
// assert_neq(_zGbwtBpOff, -1);
|
||
|
return true;
|
||
|
} else {
|
||
|
assert(ftab() == NULL);
|
||
|
assert(eftab() == NULL);
|
||
|
assert(fchr() == NULL);
|
||
|
assert(offs() == NULL);
|
||
|
assert(rstarts() == NULL);
|
||
|
assert_eq(_zOffs.size(), 0);
|
||
|
assert_eq(_zGbwtByteOffs.size(), 0);
|
||
|
assert_eq(_zGbwtBpOffs.size(), 0);
|
||
|
return false;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/// Return true iff the Ebwt is currently stored on disk
|
||
|
bool isEvicted() const {
|
||
|
return !isInMemory();
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Load this Ebwt into memory by reading it in from the _in1 and
|
||
|
* _in2 streams.
|
||
|
*/
|
||
|
void loadIntoMemory(
|
||
|
int needEntireReverse,
|
||
|
bool loadSASamp,
|
||
|
bool loadFtab,
|
||
|
bool loadRstarts,
|
||
|
bool loadNames,
|
||
|
bool verbose)
|
||
|
{
|
||
|
readIntoMemory(
|
||
|
needEntireReverse, // require reverse index to be concatenated reference reversed
|
||
|
loadSASamp, // load the SA sample portion?
|
||
|
loadFtab, // load the ftab (_ftab[] and _eftab[])?
|
||
|
loadRstarts, // load the r-starts (_rstarts[])?
|
||
|
false, // stop after loading the header portion?
|
||
|
NULL, // params
|
||
|
false, // mmSweep
|
||
|
loadNames, // loadNames
|
||
|
verbose); // startVerbose
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Frees memory associated with the Ebwt.
|
||
|
*/
|
||
|
void evictFromMemory() {
|
||
|
assert(isInMemory());
|
||
|
_fchr.free();
|
||
|
_ftab.free();
|
||
|
_eftab.free();
|
||
|
_rstarts.free();
|
||
|
_offs.free(); // might not be under control of APtrWrap
|
||
|
_gfm.free(); // might not be under control of APtrWrap
|
||
|
// Keep plen; it's small and the client may want to seq it
|
||
|
// even when the others are evicted.
|
||
|
//_plen = NULL;
|
||
|
_zOffs.clear();
|
||
|
_zGbwtByteOffs.clear();
|
||
|
_zGbwtBpOffs.clear();
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Turn a substring of 'seq' starting at offset 'off' and having
|
||
|
* length equal to the index's 'ftabChars' into an int that can be
|
||
|
* used to index into the ftab array.
|
||
|
*/
|
||
|
index_t ftabSeqToInt(
|
||
|
const BTDnaString& seq,
|
||
|
index_t off,
|
||
|
bool rev) const
|
||
|
{
|
||
|
int fc = _gh._ftabChars;
|
||
|
index_t lo = off, hi = lo + fc;
|
||
|
assert_leq(hi, seq.length());
|
||
|
index_t ftabOff = 0;
|
||
|
for(int i = 0; i < fc; i++) {
|
||
|
bool fwex = fw();
|
||
|
if(rev) fwex = !fwex;
|
||
|
// We add characters to the ftabOff in the order they would
|
||
|
// have been consumed in a normal search. For BWT, this
|
||
|
// means right-to-left order; for BWT' it's left-to-right.
|
||
|
int c = (fwex ? seq[lo + i] : seq[hi - i - 1]);
|
||
|
if(c > 3) {
|
||
|
return std::numeric_limits<index_t>::max();
|
||
|
}
|
||
|
assert_range(0, 3, c);
|
||
|
ftabOff <<= 2;
|
||
|
ftabOff |= c;
|
||
|
}
|
||
|
return ftabOff;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Non-static facade for static function ftabHi.
|
||
|
*/
|
||
|
index_t ftabHi(index_t i) const {
|
||
|
return GFM<index_t>::ftabHi(
|
||
|
ftab(),
|
||
|
eftab(),
|
||
|
_gh.linearFM() ? _gh._len : _gh._gbwtLen,
|
||
|
_gh._ftabLen,
|
||
|
_gh._eftabLen,
|
||
|
i);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Get "high interpretation" of ftab entry at index i. The high
|
||
|
* interpretation of a regular ftab entry is just the entry
|
||
|
* itself. The high interpretation of an extended entry is the
|
||
|
* second corresponding ui32 in the eftab.
|
||
|
*
|
||
|
* It's a static member because it's convenient to ask this
|
||
|
* question before the Ebwt is fully initialized.
|
||
|
*/
|
||
|
static index_t ftabHi(
|
||
|
const index_t *ftab,
|
||
|
const index_t *eftab,
|
||
|
index_t gbwtLen,
|
||
|
index_t ftabLen,
|
||
|
index_t eftabLen,
|
||
|
index_t i)
|
||
|
{
|
||
|
assert_lt(i, ftabLen);
|
||
|
if(ftab[i] <= gbwtLen) {
|
||
|
return ftab[i];
|
||
|
} else {
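// Extended entry: the stored value is bit-flipped (XOR with INDEX_MAX) so
// that it compares greater than gbwtLen; flipping it back yields the slot
// in eftab, where each extended entry occupies two words (lo, hi).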
|
||
|
index_t efIdx = ftab[i] ^ (index_t)INDEX_MAX;
|
||
|
assert_lt(efIdx*2+1, eftabLen);
|
||
|
return eftab[efIdx*2+1];
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Non-static facade for static function ftabLo.
|
||
|
*/
|
||
|
index_t ftabLo(index_t i) const {
|
||
|
return GFM<index_t>::ftabLo(
|
||
|
ftab(),
|
||
|
eftab(),
|
||
|
_gh.linearFM() ? _gh._len : _gh._gbwtLen,
|
||
|
_gh._ftabLen,
|
||
|
_gh._eftabLen,
|
||
|
i);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Get low bound of ftab range.
|
||
|
*/
|
||
|
index_t ftabLo(const BTDnaString& seq, index_t off) const {
|
||
|
return ftabLo(ftabSeqToInt(seq, off, false));
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Get high bound of ftab range.
|
||
|
*/
|
||
|
index_t ftabHi(const BTDnaString& seq, index_t off) const {
|
||
|
return ftabHi(ftabSeqToInt(seq, off, false));
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Extract characters from seq starting at offset 'off' and going either
|
||
|
* forward or backward, depending on 'rev'. Order matters when compiling
|
||
|
* the integer that gets looked up in the ftab. Each successive character
|
||
|
* is ORed into the least significant bit-pair, and characters are
|
||
|
* integrated in the direction of the search.
|
||
|
*/
|
||
|
bool
|
||
|
ftabLoHi(
|
||
|
const BTDnaString& seq, // sequence to extract from
|
||
|
index_t off, // offset into seq to begin extracting
|
||
|
bool rev, // reverse while extracting
|
||
|
index_t& top,
|
||
|
index_t& bot) const
|
||
|
{
|
||
|
index_t fi = ftabSeqToInt(seq, off, rev);
|
||
|
if(fi == std::numeric_limits<index_t>::max()) {
|
||
|
return false;
|
||
|
}
|
||
|
top = ftabHi(fi);
|
||
|
bot = ftabLo(fi+1);
|
||
|
assert_geq(bot, top);
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Get "low interpretation" of ftab entry at index i. The low
|
||
|
* interpretation of a regular ftab entry is just the entry
|
||
|
* itself. The low interpretation of an extended entry is the
|
||
|
* first corresponding ui32 in the eftab.
|
||
|
*
|
||
|
* It's a static member because it's convenient to ask this
|
||
|
* question before the Ebwt is fully initialized.
|
||
|
*/
|
||
|
static index_t ftabLo(
|
||
|
const index_t *ftab,
|
||
|
const index_t *eftab,
|
||
|
index_t gbwtLen,
|
||
|
index_t ftabLen,
|
||
|
index_t eftabLen,
|
||
|
index_t i)
|
||
|
{
|
||
|
assert_lt(i, ftabLen);
|
||
|
if(ftab[i] <= gbwtLen) {
|
||
|
return ftab[i];
|
||
|
} else {
|
||
|
index_t efIdx = ftab[i] ^ (index_t)INDEX_MAX;
|
||
|
assert_lt(efIdx*2+1, eftabLen);
|
||
|
return eftab[efIdx*2];
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Try to resolve the reference offset of the BW element 'elt'. If
|
||
|
* it can be resolved immediately, return the reference offset. If
|
||
|
* it cannot be resolved immediately, return (index_t)INDEX_MAX.
|
||
|
*/
|
||
|
index_t tryOffset(index_t elt, index_t node) const {
|
||
|
assert(offs() != NULL);
|
||
|
for(index_t i = 0; i < _zOffs.size(); i++) {
|
||
|
if(elt == _zOffs[i]) return 0;
|
||
|
}
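// Only rows whose node index survives masking with _offMask (i.e. is a
// multiple of 2^offRate) are sampled: for those the offset is read directly
// from offs()[node >> _offRate]; any other row is left to the caller
// (getOffset/walkLeft) to resolve by walking left.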
|
||
|
if((node & _gh._offMask) == node) {
|
||
|
index_t nodeOff = node >> _gh._offRate;
|
||
|
assert_lt(nodeOff, _gh._offsLen);
|
||
|
index_t off = offs()[nodeOff];
|
||
|
return off;
|
||
|
} else {
|
||
|
// Try looking at zoff
|
||
|
return (index_t)INDEX_MAX;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Try to resolve the reference offset of the BW element 'elt' such
|
||
|
* that the offset returned is at the right-hand side of the
|
||
|
* forward reference substring involved in the hit.
|
||
|
*/
|
||
|
index_t tryOffset(
|
||
|
index_t elt,
|
||
|
bool fw,
|
||
|
index_t hitlen) const
|
||
|
{
|
||
|
index_t off = tryOffset(elt);
|
||
|
if(off != (index_t)INDEX_MAX && !fw) {
|
||
|
assert_lt(off, _gh._len);
|
||
|
off = _gh._len - off - 1;
|
||
|
assert_geq(off, hitlen-1);
|
||
|
off -= (hitlen-1);
|
||
|
assert_lt(off, _gh._len);
|
||
|
}
|
||
|
return off;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Walk 'steps' steps to the left and return the row arrived at.
|
||
|
*/
|
||
|
index_t walkLeft(index_t row, index_t steps) const;
|
||
|
|
||
|
/**
|
||
|
* Resolve the reference offset of the BW element 'elt'.
|
||
|
*/
|
||
|
index_t getOffset(index_t row, index_t node = 0) const;
|
||
|
|
||
|
/**
|
||
|
* Resolve the reference offset of the BW element 'elt' such that
|
||
|
* the offset returned is at the right-hand side of the forward
|
||
|
* reference substring involved in the hit.
|
||
|
*/
|
||
|
index_t getOffset(
|
||
|
index_t elt,
|
||
|
bool fw,
|
||
|
index_t hitlen) const;
|
||
|
|
||
|
/**
|
||
|
* When using read() to create an Ebwt, we have to set a couple of
|
||
|
* additional fields in the Ebwt object that aren't part of the
|
||
|
* parameter list and are not stored explicitly in the file. Right
|
||
|
* now, this just involves initializing _zEbwtByteOff and
|
||
|
* _zEbwtBpOff from _zOff.
|
||
|
*/
|
||
|
void postReadInit(const GFMParams<index_t>& gh) {
|
||
|
_zGbwtByteOffs.resizeExact(_zOffs.size());
|
||
|
_zGbwtBpOffs.resizeExact(_zOffs.size());
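// Decompose each '$' row offset into the side it falls in, the byte within
// that side's packed-character area (4 bitpairs per byte), and the bitpair
// within that byte; the byte offset is then made absolute by adding the
// side's starting byte offset.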
|
||
|
for(index_t i = 0; i < _zOffs.size(); i++) {
|
||
|
index_t sideNum = _zOffs[i] / gh._sideGbwtLen;
|
||
|
index_t sideCharOff = _zOffs[i] % gh._sideGbwtLen;
|
||
|
index_t sideByteOff = sideNum * gh._sideSz;
|
||
|
_zGbwtByteOffs[i] = sideCharOff >> 2;
|
||
|
assert_lt(_zGbwtByteOffs[i], gh._sideGbwtSz);
|
||
|
_zGbwtBpOffs[i] = sideCharOff & 3;
|
||
|
assert_lt(_zGbwtBpOffs[i], 4);
|
||
|
_zGbwtByteOffs[i] += sideByteOff;
|
||
|
}
|
||
|
assert(repOk(gh)); // Ebwt should be fully initialized now
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Given basename of an Ebwt index, read and return its flag.
|
||
|
*/
|
||
|
static int32_t readVersionFlags(const string& instr, int& major, int& minor, string& extra_version);
|
||
|
|
||
|
static void readProgramVersion(int& major_version, int& minor_version, string& extra_version) {
|
||
|
char extra[256] = {0,};
|
||
|
int second_version;
|
||
|
sscanf(HISAT2_VERSION, "%d.%d.%d-%s",
|
||
|
&second_version,
|
||
|
&major_version,
|
||
|
&minor_version,
|
||
|
extra);
|
||
|
extra_version = extra;
|
||
|
}
|
||
|
|
||
|
static void readIndexVersion(int index_version, int& major_version, int& minor_version, string& extra_version) {
|
||
|
major_version = (index_version >> 16) & 0xff;
|
||
|
minor_version = (index_version >> 8) & 0xff;
|
||
|
if((index_version & 0xff) == 1) {
|
||
|
extra_version = "alpha";
|
||
|
} else if((index_version & 0xff) == 2) {
|
||
|
extra_version = "beta";
|
||
|
} else {
|
||
|
extra_version = "";
|
||
|
}
|
||
|
}
|
||
|
|
||
|
static int getIndexVersion() {
|
||
|
int major_version = 0, minor_version = 0;
|
||
|
string extra_version;
|
||
|
readProgramVersion(major_version, minor_version, extra_version);
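// Pack the version as 0x02MMmmEE: tool id 2 (HISAT2) in the top byte, then
// major, minor, and an extra byte (1 = alpha, 2 = beta, 0 otherwise), the
// same layout decoded by readIndexVersion() above.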
|
||
|
int version = 2; // HISAT2
|
||
|
version = (version << 8) | (major_version & 0xff);
|
||
|
version = (version << 8) | (minor_version & 0xff);
|
||
|
version = version << 8;
|
||
|
if(extra_version == "alpha") {
|
||
|
version |= 0x1;
|
||
|
} else if(extra_version == "beta") {
|
||
|
version |= 0x2;
|
||
|
}
|
||
|
return version;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Pretty-print the Ebwt to the given output stream.
|
||
|
*/
|
||
|
void print(ostream& out) const {
|
||
|
print(out, _gh);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Pretty-print the Ebwt and given EbwtParams to the given output
|
||
|
* stream.
|
||
|
*/
|
||
|
void print(ostream& out, const GFMParams<index_t>& gh) const {
|
||
|
gh.print(out); // print params
|
||
|
return;
|
||
|
out << "Ebwt (" << (isInMemory()? "memory" : "disk") << "):" << endl;
|
||
|
for(index_t i = 0; i < _zOffs.size(); i++) {
|
||
|
out << " " << (i+1) << " zOffs: " << _zOffs[i] << endl
|
||
|
<< " " << (i+1) << " zGbwtByteOff: " << _zGbwtByteOffs[i] << endl
|
||
|
<< " " << (i+1) << " zGbwtBpOff: " << _zGbwtBpOffs[i] << endl;
|
||
|
}
|
||
|
out << " nPat: " << _nPat << endl
|
||
|
<< " plen: ";
|
||
|
if(plen() == NULL) {
|
||
|
out << "NULL" << endl;
|
||
|
} else {
|
||
|
out << "non-NULL, [0] = " << plen()[0] << endl;
|
||
|
}
|
||
|
out << " rstarts: ";
|
||
|
if(rstarts() == NULL) {
|
||
|
out << "NULL" << endl;
|
||
|
} else {
|
||
|
out << "non-NULL, [0] = " << rstarts()[0] << endl;
|
||
|
}
|
||
|
out << " ebwt: ";
|
||
|
if(gfm() == NULL) {
|
||
|
out << "NULL" << endl;
|
||
|
} else {
|
||
|
out << "non-NULL, [0] = " << gfm()[0] << endl;
|
||
|
}
|
||
|
out << " fchr: ";
|
||
|
if(fchr() == NULL) {
|
||
|
out << "NULL" << endl;
|
||
|
} else {
|
||
|
out << "non-NULL, [0] = " << fchr()[0] << endl;
|
||
|
}
|
||
|
out << " ftab: ";
|
||
|
if(ftab() == NULL) {
|
||
|
out << "NULL" << endl;
|
||
|
} else {
|
||
|
out << "non-NULL, [0] = " << ftab()[0] << endl;
|
||
|
}
|
||
|
out << " eftab: ";
|
||
|
if(eftab() == NULL) {
|
||
|
out << "NULL" << endl;
|
||
|
} else {
|
||
|
out << "non-NULL, [0] = " << eftab()[0] << endl;
|
||
|
}
|
||
|
out << " offs: ";
|
||
|
if(offs() == NULL) {
|
||
|
out << "NULL" << endl;
|
||
|
} else {
|
||
|
out << "non-NULL, [0] = " << offs()[0] << endl;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Building
|
||
|
template <typename TStr> static TStr join(EList<TStr>& l, uint32_t seed);
|
||
|
template <typename TStr> static void join(EList<FileBuf*>& l, EList<RefRecord>& szs, index_t sztot, const RefReadInParams& refparams, uint32_t seed, TStr& s, bool include_rc = false, bool CGtoTG = false);
|
||
|
template <typename TStr> void joinToDisk(EList<FileBuf*>& l, EList<RefRecord>& szs, index_t sztot, const RefReadInParams& refparams, TStr& ret, ostream& out1, ostream& out2);
|
||
|
template <typename TStr> void buildToDisk(PathGraph<index_t>& gbwt, const TStr& s, ostream& out1, ostream& out2, streampos headerPos = -1);
|
||
|
template <typename TStr> void buildToDisk(InorderBlockwiseSA<TStr>& sa, const TStr& s, ostream& out1, ostream& out2, streampos headerPos = -1);
|
||
|
|
||
|
// I/O
|
||
|
void readIntoMemory(int needEntireRev, bool loadSASamp, bool loadFtab, bool loadRstarts, bool justHeader, GFMParams<index_t> *params, bool mmSweep, bool loadNames, bool startVerbose, bool subIndex = false);
|
||
|
void writeFromMemory(bool justHeader, ostream& out1, ostream& out2) const;
|
||
|
void writeFromMemory(bool justHeader, const string& out1, const string& out2) const;
|
||
|
|
||
|
// Sanity checking
|
||
|
void sanityCheckUpToSide(int upToSide) const;
|
||
|
void sanityCheckAll(int reverse) const;
|
||
|
void restore(SString<char>& s) const;
|
||
|
void checkOrigs(const EList<SString<char> >& os, bool mirror) const;
|
||
|
|
||
|
// Searching and reporting
|
||
|
bool joinedToTextOff(index_t qlen, index_t off, index_t& tidx, index_t& textoff, index_t& tlen, bool rejectStraddle, bool& straddled) const;
|
||
|
bool textOffToJoined(index_t tid, index_t tlen, index_t& off) const;
|
||
|
|
||
|
#define WITHIN_BWT_LEN(x) \
|
||
|
assert_leq(x[0], this->_gh._sideGbwtLen); \
|
||
|
assert_leq(x[1], this->_gh._sideGbwtLen); \
|
||
|
assert_leq(x[2], this->_gh._sideGbwtLen); \
|
||
|
assert_leq(x[3], this->_gh._sideGbwtLen)
|
||
|
|
||
|
#define WITHIN_FCHR(x) \
|
||
|
assert_leq(x[0], this->fchr()[1]); \
|
||
|
assert_leq(x[1], this->fchr()[2]); \
|
||
|
assert_leq(x[2], this->fchr()[3]); \
|
||
|
assert_leq(x[3], this->fchr()[4])
|
||
|
|
||
|
#define WITHIN_FCHR_DOLLARA(x) \
|
||
|
assert_leq(x[0], this->fchr()[1]+1); \
|
||
|
assert_leq(x[1], this->fchr()[2]); \
|
||
|
assert_leq(x[2], this->fchr()[3]); \
|
||
|
assert_leq(x[3], this->fchr()[4])
|
||
|
|
||
|
/**
|
||
|
* Count all occurrences of character c from the beginning of the
|
||
|
* forward side to <by,bp> and add in the occ[] count up to the side
|
||
|
* break just prior to the side.
|
||
|
*
|
||
|
* A Bowtie 2 side is shaped like:
|
||
|
*
|
||
|
* XXXXXXXXXXXXXXXX [A] [C] [G] [T]
|
||
|
* --------48------ -4- -4- -4- -4- (numbers in bytes)
|
||
|
*/
|
||
|
inline index_t countBt2Side(const SideLocus<index_t>& l, int c) const {
|
||
|
assert_range(0, 3, c);
|
||
|
assert_range(0, (int)this->_gh._sideGbwtSz-1, (int)l._by);
|
||
|
assert_range(0, 3, (int)l._bp);
|
||
|
const uint8_t *side = l.side(this->gfm());
|
||
|
index_t cCnt = countUpTo(l, c);
|
||
|
assert_leq(cCnt, l.toBWRow(_gh));
|
||
|
assert_leq(cCnt, this->_gh._sideGbwtLen);
|
||
|
assert_eq(_zGbwtByteOffs.size(), _zGbwtBpOffs.size());
|
||
|
for(index_t i = 0; i < _zGbwtByteOffs.size(); i++) {
|
||
|
index_t zGbwtByteOff = _zGbwtByteOffs[i];
|
||
|
if(c == 0 && l._sideByteOff <= zGbwtByteOff && l._sideByteOff + l._by >= zGbwtByteOff) {
|
||
|
// Adjust for the fact that we represented $ with an 'A', but
|
||
|
// shouldn't count it as an 'A' here
|
||
|
int zGbwtBpOff = _zGbwtBpOffs[i];
|
||
|
if((l._sideByteOff + l._by > zGbwtByteOff) ||
|
||
|
(l._sideByteOff + l._by == zGbwtByteOff && l._bp > zGbwtBpOff))
|
||
|
{
|
||
|
cCnt--; // Adjust for '$' looking like an 'A'
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
index_t ret;
|
||
|
// Now factor in the occ[] count at the side break
|
||
|
const uint8_t *acgt8 = side + _gh._sideGbwtSz;
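// The per-side A/C/G/T occ counts sit right after the packed characters.
// Graph-based (non-linear) sides carry two extra index_t counters ahead of
// them (countMSide below reads the second of these), so skip past those.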
|
||
|
if(!_gh._linearFM) {
|
||
|
acgt8 += (sizeof(index_t) << 1);
|
||
|
}
|
||
|
const index_t *acgt = reinterpret_cast<const index_t*>(acgt8);
|
||
|
assert_leq(acgt[0], this->_gh._numSides * this->_gh._sideGbwtLen); // b/c it's used as padding
|
||
|
assert_lt(acgt[1], this->_gh._gbwtLen);
|
||
|
assert_lt(acgt[2], this->_gh._gbwtLen);
|
||
|
assert_lt(acgt[3], this->_gh._gbwtLen);
|
||
|
ret = acgt[c] + cCnt + this->fchr()[c];
|
||
|
#ifndef NDEBUG
|
||
|
assert_leq(ret, this->fchr()[c+1]); // can't have jumped into next char's section
|
||
|
if(c == 0) {
|
||
|
assert_leq(cCnt, this->_gh._sideGbwtLen);
|
||
|
} else {
|
||
|
assert_leq(ret, this->_gh._gbwtLen);
|
||
|
}
|
||
|
#endif
|
||
|
return ret;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Count all occurrences of all four nucleotides up to the starting
|
||
|
* point (which must be in a forward side) given by 'l' storing the
|
||
|
* result in 'cntsUpto', then count nucleotide occurrences within the
|
||
|
* range of length 'num' storing the result in 'cntsIn'. Also, keep
|
||
|
* track of the characters occurring within the range by setting
|
||
|
* 'masks' accordingly (masks[1][10] == true -> 11th character is a
|
||
|
* 'C', and masks[0][10] == masks[2][10] == masks[3][10] == false).
|
||
|
*/
|
||
|
inline void countBt2SideRange(
|
||
|
const SideLocus<index_t>& l, // top locus
|
||
|
index_t num, // number of elts in range to tally
|
||
|
index_t* cntsUpto, // A/C/G/T counts up to top
|
||
|
index_t* cntsIn, // A/C/G/T counts within range
|
||
|
EList<bool> *masks) const // masks indicating which range elts = A/C/G/T
|
||
|
{
|
||
|
assert_gt(num, 0);
|
||
|
assert_range(0, (int)this->_gh._sideGbwtSz-1, (int)l._by);
|
||
|
assert_range(0, 3, (int)l._bp);
|
||
|
countUpToEx(l, cntsUpto);
|
||
|
WITHIN_FCHR_DOLLARA(cntsUpto);
|
||
|
WITHIN_BWT_LEN(cntsUpto);
|
||
|
const uint8_t *side = l.side(this->gfm());
|
||
|
assert_eq(_zGbwtByteOffs.size(), _zGbwtBpOffs.size());
|
||
|
for(index_t i = 0; i < _zGbwtByteOffs.size(); i++) {
|
||
|
index_t zGbwtByteOff = _zGbwtByteOffs[i];
|
||
|
if(l._sideByteOff <= zGbwtByteOff && l._sideByteOff + l._by >= zGbwtByteOff) {
|
||
|
// Adjust for the fact that we represented $ with an 'A', but
|
||
|
// shouldn't count it as an 'A' here
|
||
|
int zGbwtBpOff = _zGbwtBpOffs[i];
|
||
|
if((l._sideByteOff + l._by > zGbwtByteOff) ||
|
||
|
(l._sideByteOff + l._by == zGbwtByteOff && l._bp > zGbwtBpOff))
|
||
|
{
|
||
|
cntsUpto[0]--; // Adjust for '$' looking like an 'A'
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
// Now factor in the occ[] count at the side break
|
||
|
const index_t *acgt = reinterpret_cast<const index_t*>(side + _gh._sideGbwtSz);
|
||
|
if(!this->_gh.linearFM()) acgt += 2;
|
||
|
assert_leq(acgt[0], this->fchr()[1] + this->_gh.sideGbwtLen());
|
||
|
assert_leq(acgt[1], this->fchr()[2]-this->fchr()[1]);
|
||
|
assert_leq(acgt[2], this->fchr()[3]-this->fchr()[2]);
|
||
|
assert_leq(acgt[3], this->fchr()[4]-this->fchr()[3]);
|
||
|
assert_leq(acgt[0], this->_gh._gbwtLen + this->_gh.sideGbwtLen());
|
||
|
assert_leq(acgt[1], this->_gh._gbwtLen);
|
||
|
assert_leq(acgt[2], this->_gh._gbwtLen);
|
||
|
assert_leq(acgt[3], this->_gh._gbwtLen);
|
||
|
cntsUpto[0] += (acgt[0] + this->fchr()[0]);
|
||
|
cntsUpto[1] += (acgt[1] + this->fchr()[1]);
|
||
|
cntsUpto[2] += (acgt[2] + this->fchr()[2]);
|
||
|
cntsUpto[3] += (acgt[3] + this->fchr()[3]);
|
||
|
masks[0].resize(num);
|
||
|
masks[1].resize(num);
|
||
|
masks[2].resize(num);
|
||
|
masks[3].resize(num);
|
||
|
WITHIN_FCHR_DOLLARA(cntsUpto);
|
||
|
WITHIN_FCHR_DOLLARA(cntsIn);
|
||
|
// 'cntsUpto' is complete now.
|
||
|
// Walk forward until we've tallied the entire 'In' range
|
||
|
index_t nm = 0;
|
||
|
// Rest of this side
|
||
|
nm += countBt2SideRange2(l, true, num - nm, cntsIn, masks, nm);
|
||
|
assert_eq(nm, cntsIn[0] + cntsIn[1] + cntsIn[2] + cntsIn[3]);
|
||
|
assert_leq(nm, num);
|
||
|
SideLocus<index_t> lcopy = l;
|
||
|
while(nm < num) {
|
||
|
// Subsequent sides, if necessary
|
||
|
lcopy.nextSide(this->_gh);
|
||
|
nm += countBt2SideRange2(lcopy, false, num - nm, cntsIn, masks, nm);
|
||
|
WITHIN_FCHR_DOLLARA(cntsIn);
|
||
|
assert_leq(nm, num);
|
||
|
assert_eq(nm, cntsIn[0] + cntsIn[1] + cntsIn[2] + cntsIn[3]);
|
||
|
}
|
||
|
assert_eq(num, cntsIn[0] + cntsIn[1] + cntsIn[2] + cntsIn[3]);
|
||
|
WITHIN_FCHR_DOLLARA(cntsIn);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Count all occurrences of character c from the beginning of the
|
||
|
* forward side to <by,bp> and add in the occ[] count up to the side
|
||
|
* break just prior to the side.
|
||
|
*
|
||
|
* A forward side is shaped like:
|
||
|
*
|
||
|
* [A] [C] XXXXXXXXXXXXXXXX
|
||
|
* -4- -4- --------56------ (numbers in bytes)
|
||
|
* ^
|
||
|
* Side ptr (result from SideLocus.side())
|
||
|
*
|
||
|
* And following it is a reverse side shaped like:
|
||
|
*
|
||
|
* [G] [T] XXXXXXXXXXXXXXXX
|
||
|
* -4- -4- --------56------ (numbers in bytes)
|
||
|
* ^
|
||
|
* Side ptr (result from SideLocus.side())
|
||
|
*
|
||
|
*/
|
||
|
inline void countBt2SideEx(const SideLocus<index_t>& l, index_t* arrs) const {
|
||
|
assert_range(0, (int)this->_gh._sideGbwtSz-1, (int)l._by);
|
||
|
assert_range(0, 3, (int)l._bp);
|
||
|
countUpToEx(l, arrs);
|
||
|
assert_eq(_zGbwtByteOffs.size(), _zGbwtBpOffs.size());
|
||
|
for(index_t i = 0; i < _zGbwtByteOffs.size(); i++) {
|
||
|
index_t zGbwtByteOff = _zGbwtByteOffs[i];
|
||
|
if(l._sideByteOff <= zGbwtByteOff && l._sideByteOff + l._by >= zGbwtByteOff) {
|
||
|
// Adjust for the fact that we represented $ with an 'A', but
|
||
|
// shouldn't count it as an 'A' here
|
||
|
int zGbwtBpOff = _zGbwtBpOffs[i];
|
||
|
if((l._sideByteOff + l._by > zGbwtByteOff) ||
|
||
|
(l._sideByteOff + l._by == zGbwtByteOff && l._bp > zGbwtBpOff))
|
||
|
{
|
||
|
arrs[0]--; // Adjust for '$' looking like an 'A'
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
WITHIN_FCHR(arrs);
|
||
|
WITHIN_BWT_LEN(arrs);
|
||
|
// Now factor in the occ[] count at the side break
|
||
|
const uint8_t *side = l.side(this->gfm());
|
||
|
const uint8_t *acgt16 = side + this->_gh._sideSz - sizeof(index_t) * 4;
|
||
|
const index_t *acgt = reinterpret_cast<const index_t*>(acgt16);
|
||
|
assert_leq(acgt[0], this->fchr()[1] + this->_gh.sideGbwtLen());
|
||
|
assert_leq(acgt[1], this->fchr()[2]-this->fchr()[1]);
|
||
|
assert_leq(acgt[2], this->fchr()[3]-this->fchr()[2]);
|
||
|
assert_leq(acgt[3], this->fchr()[4]-this->fchr()[3]);
|
||
|
assert_leq(acgt[0], this->_gh._len + this->_gh.sideGbwtLen());
|
||
|
assert_leq(acgt[1], this->_gh._len);
|
||
|
assert_leq(acgt[2], this->_gh._len);
|
||
|
assert_leq(acgt[3], this->_gh._len);
|
||
|
arrs[0] += (acgt[0] + this->fchr()[0]);
|
||
|
arrs[1] += (acgt[1] + this->fchr()[1]);
|
||
|
arrs[2] += (acgt[2] + this->fchr()[2]);
|
||
|
arrs[3] += (acgt[3] + this->fchr()[3]);
|
||
|
WITHIN_FCHR(arrs);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Count all occurrences of character 1 from the beginning of the
|
||
|
* forward side to <by,bp> and add in the occ[] count up to the side
|
||
|
* break just prior to the side.
|
||
|
*
|
||
|
*/
|
||
|
inline index_t countMSide(const SideLocus<index_t>& l) const {
|
||
|
assert_range(0, (int)this->_gh._sideGbwtSz-1, (int)l._by);
|
||
|
assert_range(0, 7, (int)l._bp);
|
||
|
index_t cCnt = countUpTo_bits(l, false /* F? */);
|
||
|
const uint8_t *side = l.side(this->gfm());
|
||
|
cCnt += *(index_t*)(side + _gh._sideGbwtSz + sizeof(index_t));
|
||
|
assert_leq(cCnt, l.toBWRow(_gh));
|
||
|
assert_leq(cCnt, this->_gh._numNodes);
|
||
|
return cCnt;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Counts the number of occurrences of character 'c' in the given Ebwt
|
||
|
* side up to (but not including) the given byte/bitpair (by/bp).
|
||
|
*
|
||
|
* This is a performance-critical function. This is the top search-
|
||
|
* related hit in the time profile.
|
||
|
*
|
||
|
* Function gets 11.09% in profile
|
||
|
*/
|
||
|
inline index_t countUpTo(const SideLocus<index_t>& l, int c) const {
|
||
|
// Count occurrences of c in each 64-bit (using bit trickery);
|
||
|
// Someday countInU64() and pop() functions should be
|
||
|
// vectorized/SSE-ized in case that helps.
|
||
|
bool usePOPCNT = false;
|
||
|
index_t cCnt = 0;
|
||
|
const uint8_t *side = l.side(this->gfm());
|
||
|
int i = 0;
|
||
|
#ifdef POPCNT_CAPABILITY
|
||
|
if(_usePOPCNTinstruction) {
|
||
|
usePOPCNT = true;
|
||
|
int by = l._by + (l._bp > 0 ? 1 : 0);
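// Count whole 64-bit words with POPCNT. For the final, partial word the
// bitpairs past the target <byte,bitpair> are shifted out before counting;
// the zero bitpairs shifted in look like 'A's, so (shift >> 1) of them are
// subtracted from the count when c == 0.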
|
||
|
for(; i < by; i += 8) {
|
||
|
if(i + 8 < by) {
|
||
|
cCnt += countInU64<USE_POPCNT_INSTRUCTION>(c, *(uint64_t*)&side[i]);
|
||
|
} else {
|
||
|
index_t by_shift = 8 - (by - i);
|
||
|
index_t bp_shift = (l._bp > 0 ? 4 - l._bp : 0);
|
||
|
index_t shift = (by_shift << 3) + (bp_shift << 1);
|
||
|
uint64_t side_i = *(uint64_t*)&side[i];
|
||
|
side_i = (_toBigEndian ? side_i >> shift : side_i << shift);
|
||
|
index_t cCnt_add = countInU64<USE_POPCNT_INSTRUCTION>(c, side_i);
|
||
|
if(c == 0) cCnt_add -= (shift >> 1);
|
||
|
#ifndef NDEBUG
|
||
|
index_t cCnt_temp = 0;
|
||
|
for(int j = i; j < l._by; j++) {
|
||
|
cCnt_temp += cCntLUT_4[0][c][side[j]];
|
||
|
}
|
||
|
if(l._bp > 0) {
|
||
|
cCnt_temp += cCntLUT_4[(int)l._bp][c][side[l._by]];
|
||
|
}
|
||
|
assert_eq(cCnt_add, cCnt_temp);
|
||
|
#endif
|
||
|
cCnt += cCnt_add;
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
} else {
|
||
|
for(; i + 7 < l._by; i += 8) {
|
||
|
cCnt += countInU64<USE_POPCNT_GENERIC>(c, *(uint64_t*)&side[i]);
|
||
|
}
|
||
|
}
|
||
|
#else
|
||
|
for(; i + 7 < l._by; i += 8) {
|
||
|
cCnt += countInU64(c, *(uint64_t*)&side[i]);
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
if(!usePOPCNT) {
|
||
|
// Count occurrences of c in the rest of the side (using LUT)
|
||
|
for(; i < l._by; i++) {
|
||
|
cCnt += cCntLUT_4[0][c][side[i]];
|
||
|
}
|
||
|
|
||
|
// Count occurrences of c in the rest of the byte
|
||
|
if(l._bp > 0) {
|
||
|
cCnt += cCntLUT_4[(int)l._bp][c][side[i]];
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return cCnt;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Counts the number of occurrences of character 'c' in the given Ebwt
|
||
|
* side down to the given byte/bitpair (by/bp).
|
||
|
*
|
||
|
*/
|
||
|
inline index_t countDownTo(const SideLocus<index_t>& l, int c) const {
|
||
|
// Count occurrences of c in each 64-bit (using bit trickery);
|
||
|
// Someday countInU64() and pop() functions should be
|
||
|
// vectorized/SSE-ized in case that helps.
|
||
|
index_t cCnt = 0;
|
||
|
const uint8_t *side = l.side(this->gfm());
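// The packed characters occupy the start of the 64-byte side and the four
// index_t occ counts occupy the end, so counting "down to" <by,bp> starts
// at the last packed-character byte: 64 - 4*sizeof(index_t) - 1.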
|
||
|
int i = 64 - 4 * sizeof(index_t) - 1;
|
||
|
#ifdef POPCNT_CAPABILITY
|
||
|
if ( _usePOPCNTinstruction) {
|
||
|
for(; i - 7 > l._by; i -= 8) {
|
||
|
cCnt += countInU64<USE_POPCNT_INSTRUCTION>(c, *(uint64_t*)&side[i-7]);
|
||
|
}
|
||
|
}
|
||
|
else {
|
||
|
for(; i + 7 > l._by; i -= 8) {
|
||
|
cCnt += countInU64<USE_POPCNT_GENERIC>(c, *(uint64_t*)&side[i-7]);
|
||
|
}
|
||
|
}
|
||
|
#else
|
||
|
for(; i + 7 > l._by; i -= 8) {
|
||
|
cCnt += countInU64(c, *(uint64_t*)&side[i-7]);
|
||
|
}
|
||
|
#endif
|
||
|
// Count occurrences of c in the rest of the side (using LUT)
|
||
|
for(; i > l._by; i--) {
|
||
|
cCnt += cCntLUT_4_rev[0][c][side[i]];
|
||
|
}
|
||
|
// Count occurrences of c in the rest of the byte
|
||
|
if(l._bp > 0) {
|
||
|
cCnt += cCntLUT_4_rev[4-(int)l._bp][c][side[i]];
|
||
|
} else {
|
||
|
cCnt += cCntLUT_4_rev[0][c][side[i]];
|
||
|
}
|
||
|
return cCnt;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Tricky-bit-bashing bitpair counting for given two-bit value (0-3)
|
||
|
* within a 64-bit argument.
|
||
|
*
|
||
|
* Function gets 2.32% in profile
|
||
|
*/
|
||
|
#ifdef POPCNT_CAPABILITY
|
||
|
template<typename Operation>
|
||
|
#endif
|
||
|
inline static void countInU64Ex(uint64_t dw, index_t* arrs) {
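// For each character c: XOR the word with c_table[c] so that bitpairs equal
// to c become binary 11, AND each pair's high bit (shifted down) with its
// low bit to leave one set bit per match, then pop64() counts the matches
// among the 32 bitpairs.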
|
||
|
uint64_t c0 = c_table[0];
|
||
|
uint64_t x0 = dw ^ c0;
|
||
|
uint64_t x1 = (x0 >> 1);
|
||
|
uint64_t x2 = x1 & (0x5555555555555555llu);
|
||
|
uint64_t x3 = x0 & x2;
|
||
|
#ifdef POPCNT_CAPABILITY
|
||
|
uint64_t tmp = Operation().pop64(x3);
|
||
|
#else
|
||
|
uint64_t tmp = pop64(x3);
|
||
|
#endif
|
||
|
arrs[0] += (uint32_t) tmp;
|
||
|
|
||
|
c0 = c_table[1];
|
||
|
x0 = dw ^ c0;
|
||
|
x1 = (x0 >> 1);
|
||
|
x2 = x1 & (0x5555555555555555llu);
|
||
|
x3 = x0 & x2;
|
||
|
#ifdef POPCNT_CAPABILITY
|
||
|
tmp = Operation().pop64(x3);
|
||
|
#else
|
||
|
tmp = pop64(x3);
|
||
|
#endif
|
||
|
arrs[1] += (uint32_t) tmp;
|
||
|
|
||
|
c0 = c_table[2];
|
||
|
x0 = dw ^ c0;
|
||
|
x1 = (x0 >> 1);
|
||
|
x2 = x1 & (0x5555555555555555llu);
|
||
|
x3 = x0 & x2;
|
||
|
#ifdef POPCNT_CAPABILITY
|
||
|
tmp = Operation().pop64(x3);
|
||
|
#else
|
||
|
tmp = pop64(x3);
|
||
|
#endif
|
||
|
arrs[2] += (uint32_t) tmp;
|
||
|
|
||
|
c0 = c_table[3];
|
||
|
x0 = dw ^ c0;
|
||
|
x1 = (x0 >> 1);
|
||
|
x2 = x1 & (0x5555555555555555llu);
|
||
|
x3 = x0 & x2;
|
||
|
#ifdef POPCNT_CAPABILITY
|
||
|
tmp = Operation().pop64(x3);
|
||
|
#else
|
||
|
tmp = pop64(x3);
|
||
|
#endif
|
||
|
arrs[3] += (uint32_t) tmp;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Counts the number of occurrences of all four nucleotides in the
|
||
|
* given side up to (but not including) the given byte/bitpair (by/bp).
|
||
|
* Count for 'a' goes in arrs[0], 'c' in arrs[1], etc.
|
||
|
*/
|
||
|
inline void countUpToEx(const SideLocus<index_t>& l, index_t* arrs) const {
|
||
|
int i = 0;
|
||
|
// Count occurrences of each nucleotide in each 64-bit word using
|
||
|
// bit trickery; note: this does not seem to lend a
|
||
|
// significant boost to performance in practice. If you comment
|
||
|
// out this whole loop (which won't affect correctness - it will
|
||
|
// just cause the following loop to take up the slack) then runtime
|
||
|
// does not change noticeably. Someday the countInU64() and pop()
|
||
|
// functions should be vectorized/SSE-ized in case that helps.
|
||
|
const uint8_t *side = l.side(this->gfm());
|
||
|
#ifdef POPCNT_CAPABILITY
|
||
|
if (_usePOPCNTinstruction) {
|
||
|
for(; i+7 < l._by; i += 8) {
|
||
|
countInU64Ex<USE_POPCNT_INSTRUCTION>(*(uint64_t*)&side[i], arrs);
|
||
|
}
|
||
|
}
|
||
|
else {
|
||
|
for(; i+7 < l._by; i += 8) {
|
||
|
countInU64Ex<USE_POPCNT_GENERIC>(*(uint64_t*)&side[i], arrs);
|
||
|
}
|
||
|
}
|
||
|
#else
|
||
|
for(; i+7 < l._by; i += 8) {
|
||
|
countInU64Ex(*(uint64_t*)&side[i], arrs);
|
||
|
}
|
||
|
#endif
|
||
|
// Count occurences of nucleotides in the rest of the side (using LUT)
|
||
|
// Many cache misses on following lines (~20K)
|
||
|
for(; i < l._by; i++) {
|
||
|
arrs[0] += cCntLUT_4[0][0][side[i]];
|
||
|
arrs[1] += cCntLUT_4[0][1][side[i]];
|
||
|
arrs[2] += cCntLUT_4[0][2][side[i]];
|
||
|
arrs[3] += cCntLUT_4[0][3][side[i]];
|
||
|
}
|
||
|
// Count occurences of c in the rest of the byte
|
||
|
if(l._bp > 0) {
|
||
|
arrs[0] += cCntLUT_4[(int)l._bp][0][side[i]];
|
||
|
arrs[1] += cCntLUT_4[(int)l._bp][1][side[i]];
|
||
|
arrs[2] += cCntLUT_4[(int)l._bp][2][side[i]];
|
||
|
arrs[3] += cCntLUT_4[(int)l._bp][3][side[i]];
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Counts the number of occurrences of character 'c' in the given Ebwt
|
||
|
* side up to (but not including) the given byte/bitpair (by/bp).
|
||
|
*
|
||
|
* This is a performance-critical function. This is the top search-
|
||
|
* related hit in the time profile.
|
||
|
*/
|
||
|
inline index_t countUpTo_bits(const SideLocus<index_t>& l, bool F) const {
|
||
|
// Count occurrences of c in each 64-bit (using bit trickery);
|
||
|
// Someday countInU64() and pop() functions should be
|
||
|
// vectorized/SSE-ized in case that helps.
|
||
|
bool usePOPCNT = false;
|
||
|
index_t cCnt = 0;
|
||
|
const uint8_t *side = l.side(this->gfm());
|
||
|
if(F) {
|
||
|
side += (_gh._sideGbwtSz >> 1);
|
||
|
} else {
|
||
|
side += (_gh._sideGbwtSz - (_gh._sideGbwtSz >> 2));
|
||
|
}
|
||
|
int i = 0;
|
||
|
#ifdef POPCNT_CAPABILITY
|
||
|
if(_usePOPCNTinstruction) {
|
||
|
usePOPCNT = true;
|
||
|
int by = l._by + (l._bp > 0 ? 1 : 0);
|
||
|
for(; i < by; i += 8) {
|
||
|
if(i + 8 < by) {
|
||
|
cCnt += countInU64_bits<USE_POPCNT_INSTRUCTION>(*(uint64_t*)&side[i]);
|
||
|
} else {
|
||
|
index_t by_shift = 8 - (by - i);
|
||
|
index_t bp_shift = (l._bp > 0 ? 8 - l._bp : 0);
|
||
|
index_t shift = (by_shift << 3) + bp_shift;
|
||
|
uint64_t side_i = *(uint64_t*)&side[i];
|
||
|
side_i = (_toBigEndian ? side_i >> shift : side_i << shift);
|
||
|
index_t cCnt_add = countInU64_bits<USE_POPCNT_INSTRUCTION>(side_i);
|
||
|
#ifndef NDEBUG
|
||
|
index_t cCnt_temp = 0;
|
||
|
for(int j = i; j < l._by; j++) {
|
||
|
cCnt_temp += cCntBIT[0][side[j]];
|
||
|
}
|
||
|
if(l._bp > 0) {
|
||
|
cCnt_temp += cCntBIT[(int)l._bp][side[l._by]];
|
||
|
}
|
||
|
assert_eq(cCnt_add, cCnt_temp);
|
||
|
#endif
|
||
|
cCnt += cCnt_add;
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
} else {
|
||
|
for(; i + 7 < l._by; i += 8) {
|
||
|
cCnt += countInU64_bits<USE_POPCNT_GENERIC_BITS>(*(uint64_t*)&side[i]);
|
||
|
}
|
||
|
}
|
||
|
#else
|
||
|
for(; i + 7 < l._by; i += 8) {
|
||
|
cCnt += countInU64_bits(*(uint64_t*)&side[i]);
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
if(!usePOPCNT) {
|
||
|
// Count occurences of c in the rest of the side (using LUT)
|
||
|
for(; i < l._by; i++) {
|
||
|
cCnt += cCntBIT[0][side[i]];
|
||
|
}
|
||
|
|
||
|
// Count occurences of c in the rest of the byte
|
||
|
if(l._bp > 0) {
|
||
|
cCnt += cCntBIT[(int)l._bp][side[i]];
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return cCnt;
|
||
|
}
|
||
|
|
||
|
|
||
|
#ifndef NDEBUG
|
||
|
/**
|
||
|
* Given top and bot loci, calculate counts of all four DNA chars up to
|
||
|
* those loci. Used for more advanced backtracking-search.
|
||
|
*/
|
||
|
inline void mapLFEx(
|
||
|
const SideLocus<index_t>& l,
|
||
|
index_t *arrs
|
||
|
ASSERT_ONLY(, bool overrideSanity = false)
|
||
|
) const
|
||
|
{
|
||
|
assert_eq(0, arrs[0]);
|
||
|
assert_eq(0, arrs[1]);
|
||
|
assert_eq(0, arrs[2]);
|
||
|
assert_eq(0, arrs[3]);
|
||
|
countBt2SideEx(l, arrs);
|
||
|
if(_sanity && !overrideSanity) {
|
||
|
// Make sure results match up with individual calls to mapLF;
|
||
|
// be sure to override sanity-checking in the callee, or we'll
|
||
|
// have infinite recursion
|
||
|
assert_eq(mapLF(l, 0, true), arrs[0]);
|
||
|
assert_eq(mapLF(l, 1, true), arrs[1]);
|
||
|
assert_eq(mapLF(l, 2, true), arrs[2]);
|
||
|
assert_eq(mapLF(l, 3, true), arrs[3]);
|
||
|
}
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
/**
|
||
|
* Given top and bot rows, calculate counts of all four DNA chars up to
|
||
|
* those loci.
|
||
|
*/
|
||
|
inline void mapLFEx(
|
||
|
index_t top,
|
||
|
index_t bot,
|
||
|
index_t *tops,
|
||
|
index_t *bots
|
||
|
ASSERT_ONLY(, bool overrideSanity = false)
|
||
|
) const
|
||
|
{
|
||
|
SideLocus<index_t> ltop, lbot;
|
||
|
SideLocus<index_t>::initFromTopBot(top, bot, _gh, gfm(), ltop, lbot);
|
||
|
mapLFEx(ltop, lbot, tops, bots ASSERT_ONLY(, overrideSanity));
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Given top and bot loci, calculate counts of all four DNA chars up to
|
||
|
* those loci. Used for more advanced backtracking-search.
|
||
|
*/
|
||
|
inline void mapLFEx(
|
||
|
const SideLocus<index_t>& ltop,
|
||
|
const SideLocus<index_t>& lbot,
|
||
|
index_t *tops,
|
||
|
index_t *bots
|
||
|
ASSERT_ONLY(, bool overrideSanity = false)
|
||
|
) const
|
||
|
{
|
||
|
assert(ltop.repOk(this->gh()));
|
||
|
assert(lbot.repOk(this->gh()));
|
||
|
assert_eq(0, tops[0]); assert_eq(0, bots[0]);
|
||
|
assert_eq(0, tops[1]); assert_eq(0, bots[1]);
|
||
|
assert_eq(0, tops[2]); assert_eq(0, bots[2]);
|
||
|
assert_eq(0, tops[3]); assert_eq(0, bots[3]);
|
||
|
countBt2SideEx(ltop, tops);
|
||
|
countBt2SideEx(lbot, bots);
|
||
|
#ifndef NDEBUG
|
||
|
if(_sanity && !overrideSanity) {
|
||
|
// Make sure results match up with individual calls to mapLF;
|
||
|
// be sure to override sanity-checking in the callee, or we'll
|
||
|
// have infinite recursion
|
||
|
assert_eq(mapLF(ltop, 0, true), tops[0]);
|
||
|
assert_eq(mapLF(ltop, 1, true), tops[1]);
|
||
|
assert_eq(mapLF(ltop, 2, true), tops[2]);
|
||
|
assert_eq(mapLF(ltop, 3, true), tops[3]);
|
||
|
assert_eq(mapLF(lbot, 0, true), bots[0]);
|
||
|
assert_eq(mapLF(lbot, 1, true), bots[1]);
|
||
|
assert_eq(mapLF(lbot, 2, true), bots[2]);
|
||
|
assert_eq(mapLF(lbot, 3, true), bots[3]);
|
||
|
}
|
||
|
#endif
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Counts the number of occurrences of all four nucleotides in the
|
||
|
* given side from the given byte/bitpair (l->_by/l->_bp) (or the
|
||
|
* beginning of the side if l == 0). Count for 'a' goes in arrs[0],
|
||
|
* 'c' in arrs[1], etc.
|
||
|
*
|
||
|
* Note: must account for $.
|
||
|
*
|
||
|
* Must fill in masks
|
||
|
*/
|
||
|
inline index_t countBt2SideRange2(
|
||
|
const SideLocus<index_t>& l,
|
||
|
bool startAtLocus,
|
||
|
index_t num,
|
||
|
index_t* arrs,
|
||
|
EList<bool> *masks,
|
||
|
index_t maskOff) const
|
||
|
{
|
||
|
assert(!masks[0].empty());
|
||
|
assert_eq(masks[0].size(), masks[1].size());
|
||
|
assert_eq(masks[0].size(), masks[2].size());
|
||
|
assert_eq(masks[0].size(), masks[3].size());
|
||
|
ASSERT_ONLY(index_t myarrs[4] = {0, 0, 0, 0});
|
||
|
index_t nm = 0; // number of nucleotides tallied so far
|
||
|
int iby = 0; // initial byte offset
|
||
|
int ibp = 0; // initial base-pair offset
|
||
|
if(startAtLocus) {
|
||
|
iby = l._by;
|
||
|
ibp = l._bp;
|
||
|
} else {
|
||
|
// Start at beginning
|
||
|
}
|
||
|
int by = iby, bp = ibp;
|
||
|
assert_lt(bp, 4);
|
||
|
index_t sideGbwtSz = this->_gh._sideGbwtSz >> (this->_gh.linearFM() ? 0 : 1);
|
||
|
assert_lt(by, (int)sideGbwtSz);
|
||
|
const uint8_t *side = l.side(this->gfm());
|
||
|
while(nm < num) {
|
||
|
int c = (side[by] >> (bp * 2)) & 3;
|
||
|
assert_lt(maskOff + nm, masks[c].size());
|
||
|
masks[0][maskOff + nm] = masks[1][maskOff + nm] =
|
||
|
masks[2][maskOff + nm] = masks[3][maskOff + nm] = false;
|
||
|
assert_range(0, 3, c);
|
||
|
// Note: we tally $ just like an A
|
||
|
arrs[c]++; // tally it
|
||
|
ASSERT_ONLY(myarrs[c]++);
|
||
|
masks[c][maskOff + nm] = true; // not dead
|
||
|
nm++;
|
||
|
if(++bp == 4) {
|
||
|
bp = 0;
|
||
|
by++;
|
||
|
assert_leq(by, (int)sideGbwtSz);
|
||
|
if(by == (int)sideGbwtSz) {
|
||
|
// Fell off the end of the side
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
WITHIN_FCHR_DOLLARA(arrs);
|
||
|
#ifndef NDEBUG
|
||
|
if(_sanity) {
|
||
|
// Make sure results match up with a call to mapLFEx.
|
||
|
index_t tops[4] = {0, 0, 0, 0};
|
||
|
index_t bots[4] = {0, 0, 0, 0};
|
||
|
index_t top = l.toBWRow(gh());
|
||
|
index_t bot = top + nm;
|
||
|
mapLFEx(top, bot, tops, bots, false);
|
||
|
assert(myarrs[0] == (bots[0] - tops[0]) || myarrs[0] == (bots[0] - tops[0])+1);
|
||
|
assert_eq(myarrs[1], bots[1] - tops[1]);
|
||
|
assert_eq(myarrs[2], bots[2] - tops[2]);
|
||
|
assert_eq(myarrs[3], bots[3] - tops[3]);
|
||
|
}
|
||
|
#endif
|
||
|
return nm;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Return the final character in row i (i.e. the i'th character in the
|
||
|
* BWT transform). Note that the 'L' in the name of the function
|
||
|
* stands for 'last', as in the literature.
|
||
|
*/
|
||
|
inline int rowL(const SideLocus<index_t>& l) const {
|
||
|
// Extract and return appropriate bit-pair
|
||
|
return unpack_2b_from_8b(l.side(this->gfm())[l._by], l._bp);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Return the final character in row i (i.e. the i'th character in the
|
||
|
* BWT transform). Note that the 'L' in the name of the function
|
||
|
* stands for 'last', as in the literature.
|
||
|
*/
|
||
|
inline int rowL(index_t i) const {
|
||
|
// Extract and return appropriate bit-pair
|
||
|
SideLocus<index_t> l;
|
||
|
l.initFromRow(i, _gh, gfm());
|
||
|
return rowL(l);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Given top and bot loci, calculate counts of all four DNA chars up to
|
||
|
* those loci. Used for more advanced backtracking-search.
|
||
|
*/
|
||
|
inline void mapLFRange(
|
||
|
const SideLocus<index_t>& ltop,
|
||
|
const SideLocus<index_t>& lbot,
|
||
|
index_t num, // Number of elts
|
||
|
index_t* cntsUpto, // A/C/G/T counts up to top
|
||
|
index_t* cntsIn, // A/C/G/T counts within range
|
||
|
EList<bool> *masks
|
||
|
ASSERT_ONLY(, bool overrideSanity = false)
|
||
|
) const
|
||
|
{
|
||
|
assert(ltop.repOk(this->gh()));
|
||
|
assert(lbot.repOk(this->gh()));
|
||
|
assert_eq(num, lbot.toBWRow(this->gh()) - ltop.toBWRow(this->gh()));
|
||
|
assert_eq(0, cntsUpto[0]); assert_eq(0, cntsIn[0]);
|
||
|
assert_eq(0, cntsUpto[1]); assert_eq(0, cntsIn[1]);
|
||
|
assert_eq(0, cntsUpto[2]); assert_eq(0, cntsIn[2]);
|
||
|
assert_eq(0, cntsUpto[3]); assert_eq(0, cntsIn[3]);
|
||
|
countBt2SideRange(ltop, num, cntsUpto, cntsIn, masks);
|
||
|
assert_eq(num, cntsIn[0] + cntsIn[1] + cntsIn[2] + cntsIn[3]);
|
||
|
#ifndef NDEBUG
|
||
|
if(_sanity && !overrideSanity) {
|
||
|
// Make sure results match up with individual calls to mapLF;
|
||
|
// be sure to override sanity-checking in the callee, or we'll
|
||
|
// have infinite recursion
|
||
|
index_t tops[4] = {0, 0, 0, 0};
|
||
|
index_t bots[4] = {0, 0, 0, 0};
|
||
|
assert(ltop.repOk(this->gh()));
|
||
|
assert(lbot.repOk(this->gh()));
|
||
|
mapLFEx(ltop, lbot, tops, bots, false);
|
||
|
for(int i = 0; i < 4; i++) {
|
||
|
assert(cntsUpto[i] == tops[i] || tops[i] == bots[i]);
|
||
|
if(i == 0) {
|
||
|
assert(cntsIn[i] == bots[i]-tops[i] ||
|
||
|
cntsIn[i] == bots[i]-tops[i]+1);
|
||
|
} else {
|
||
|
assert_eq(cntsIn[i], bots[i]-tops[i]);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
#endif
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Given row i, return the row that the LF mapping maps i to.
|
||
|
*/
|
||
|
inline index_t mapLF(
|
||
|
const SideLocus<index_t>& l
|
||
|
ASSERT_ONLY(, bool overrideSanity = false)
|
||
|
) const
|
||
|
{
|
||
|
ASSERT_ONLY(index_t srcrow = l.toBWRow(_gh));
|
||
|
index_t ret;
|
||
|
assert(l.side(this->gfm()) != NULL);
|
||
|
int c = rowL(l);
|
||
|
assert_lt(c, 4);
|
||
|
assert_geq(c, 0);
|
||
|
ret = countBt2Side(l, c);
|
||
|
assert_lt(ret, this->_gh._gbwtLen);
|
||
|
assert_neq(srcrow, ret);
|
||
|
#ifndef NDEBUG
|
||
|
if(_sanity && !overrideSanity) {
|
||
|
// Make sure results match up with results from mapLFEx;
|
||
|
// be sure to override sanity-checking in the callee, or we'll
|
||
|
// have infinite recursion
|
||
|
index_t arrs[] = { 0, 0, 0, 0 };
|
||
|
mapLFEx(l, arrs, true);
|
||
|
assert_eq(arrs[c], ret);
|
||
|
}
|
||
|
#endif
|
||
|
return ret;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Given row i and character c, return the row that the LF mapping maps
|
||
|
* i to on character c.
|
||
|
*/
|
||
|
inline index_t mapLF(
|
||
|
const SideLocus<index_t>& l, int c
|
||
|
ASSERT_ONLY(, bool overrideSanity = false)
|
||
|
) const
|
||
|
{
|
||
|
index_t ret;
|
||
|
assert_lt(c, 4);
|
||
|
assert_geq(c, 0);
|
||
|
ret = countBt2Side(l, c);
|
||
|
assert_lt(ret, this->_gh._gbwtLen);
|
||
|
#ifndef NDEBUG
|
||
|
if(_sanity && !overrideSanity) {
|
||
|
// Make sure results match up with results from mapLFEx;
|
||
|
// be sure to override sanity-checking in the callee, or we'll
|
||
|
// have infinite recursion
|
||
|
index_t arrs[] = { 0, 0, 0, 0 };
|
||
|
mapLFEx(l, arrs, true);
|
||
|
assert_eq(arrs[c], ret);
|
||
|
}
|
||
|
#endif
|
||
|
return ret;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Given row i and character c, return the row that the GLF mapping maps
|
||
|
* i to on character c.
|
||
|
*/
|
||
|
inline pair<index_t, index_t> mapLF(
|
||
|
SideLocus<index_t>& tloc, SideLocus<index_t>& bloc, int c,
|
||
|
pair<index_t, index_t>* node_range = NULL
|
||
|
ASSERT_ONLY(, bool overrideSanity = false)
|
||
|
) const
|
||
|
{
|
||
|
assert_lt(c, 4);
|
||
|
assert_geq(c, 0);
|
||
|
index_t top = mapLF(tloc, c);
|
||
|
index_t bot = mapLF(bloc, c);
|
||
|
if(node_range != NULL) {
|
||
|
node_range->first = top; node_range->second = bot;
|
||
|
}
|
||
|
return pair<index_t, index_t>(top, bot);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Given row i and character c, return the row that the GLF mapping maps
|
||
|
* i to on character c.
|
||
|
*/
|
||
|
inline pair<index_t, index_t> mapGLF(
|
||
|
SideLocus<index_t>& tloc, SideLocus<index_t>& bloc, int c,
|
||
|
pair<index_t, index_t>* node_range = NULL,
|
||
|
EList<pair<index_t, index_t> >* node_iedges = NULL,
|
||
|
index_t k = 5
|
||
|
ASSERT_ONLY(, bool overrideSanity = false)
|
||
|
) const
|
||
|
{
|
||
|
assert_lt(c, 4);
|
||
|
assert_geq(c, 0);
|
||
|
index_t top = mapLF(tloc, c);
|
||
|
index_t bot = mapLF(bloc, c);
|
||
|
if(gh().linearFM()) {
|
||
|
if(node_range != NULL) {
|
||
|
node_range->first = top; node_range->second = bot;
|
||
|
}
|
||
|
if(node_iedges != NULL) {
|
||
|
node_iedges->clear();
|
||
|
}
|
||
|
return pair<index_t, index_t>(top, bot);
|
||
|
}
|
||
|
if(top + 1 >= gh()._gbwtLen || top >= bot) {
|
||
|
assert_eq(top, bot);
|
||
|
return pair<index_t, index_t>(0, 0);
|
||
|
}
|
||
|
|
||
|
tloc.initFromRow_bit(top + 1, gh(), gfm());
|
||
|
index_t node_top = rank_M(tloc) - 1;
|
||
|
|
||
|
index_t top_F_loc = 0, top_M_occ = 0;
|
||
|
size_t iter = 0;
|
||
|
while(true) {
|
||
|
const uint8_t *side = tloc.side(gfm()) + gh()._sideGbwtSz - gh()._sideSz * iter;
|
||
|
top_F_loc = *((index_t*)side);
|
||
|
side += sizeof(index_t);
|
||
|
top_M_occ = *((index_t*)side);
|
||
|
assert_leq(top_M_occ, node_top + 1);
|
||
|
if(top_M_occ <= node_top) break;
|
||
|
iter++;
|
||
|
}
|
||
|
if(top_M_occ > 0) top_F_loc++;
|
||
|
|
||
|
tloc.initFromRow_bit(top_F_loc, gh(), gfm());
|
||
|
|
||
|
if(node_top + 1 > top_M_occ) {
|
||
|
top = select_F(tloc, node_top + 1 - top_M_occ);
|
||
|
} else {
|
||
|
top = top_F_loc;
|
||
|
}
|
||
|
|
||
|
bloc.initFromRow_bit(bot, gh(), gfm());
|
||
|
index_t node_bot = rank_M(bloc);
|
||
|
|
||
|
const uint8_t *side = bloc.side(gfm()) + gh()._sideGbwtSz;
|
||
|
index_t bot_F_loc = *((index_t*)side);
|
||
|
side += sizeof(index_t);
|
||
|
index_t bot_M_occ = *((index_t*)side);
|
||
|
assert_leq(bot_M_occ, node_bot + 1);
|
||
|
if(bot_M_occ > 0) bot_F_loc++;
|
||
|
|
||
|
bloc.initFromRow_bit(bot_F_loc, gh(), gfm());
|
||
|
|
||
|
if(node_bot + 1 > bot_M_occ) {
|
||
|
bot = select_F(bloc, node_bot + 1 - bot_M_occ);
|
||
|
} else {
|
||
|
bot = bot_F_loc;
|
||
|
}
|
||
|
|
||
|
if(node_range != NULL) {
|
||
|
(*node_range).first = node_top;
|
||
|
(*node_range).second = node_bot;
|
||
|
}
|
||
|
assert_leq(node_bot - node_top, bot - top);
|
||
|
if(node_iedges != NULL && node_bot - node_top <= k && node_bot - node_top < bot - top) {
|
||
|
getInEdgeCount(top, bot, *node_iedges);
|
||
|
}
|
||
|
|
||
|
return pair<index_t, index_t>(top, bot);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Given top and bot loci, calculate counts of all four DNA chars up to
|
||
|
* those loci. Also, update a set of tops and bots for the reverse
|
||
|
* index/direction using the idea from the bi-directional BWT paper.
|
||
|
*/
|
||
|
inline void mapBiLFEx(
|
||
|
const SideLocus<index_t>& ltop,
|
||
|
const SideLocus<index_t>& lbot,
|
||
|
index_t *tops,
|
||
|
index_t *bots,
|
||
|
index_t *topsP, // topsP[0] = top
|
||
|
index_t *botsP
|
||
|
ASSERT_ONLY(, bool overrideSanity = false)
|
||
|
) const
|
||
|
{
|
||
|
#ifndef NDEBUG
|
||
|
for(int i = 0; i < 4; i++) {
|
||
|
assert_eq(0, tops[0]); assert_eq(0, bots[0]);
|
||
|
}
|
||
|
#endif
|
||
|
countBt2SideEx(ltop, tops);
|
||
|
countBt2SideEx(lbot, bots);
|
||
|
#ifndef NDEBUG
|
||
|
if(_sanity && !overrideSanity) {
|
||
|
// Make sure results match up with individual calls to mapLF;
|
||
|
// be sure to override sanity-checking in the callee, or we'll
|
||
|
// have infinite recursion
|
||
|
assert_eq(mapLF(ltop, 0, true), tops[0]);
|
||
|
assert_eq(mapLF(ltop, 1, true), tops[1]);
|
||
|
assert_eq(mapLF(ltop, 2, true), tops[2]);
|
||
|
assert_eq(mapLF(ltop, 3, true), tops[3]);
|
||
|
assert_eq(mapLF(lbot, 0, true), bots[0]);
|
||
|
assert_eq(mapLF(lbot, 1, true), bots[1]);
|
||
|
assert_eq(mapLF(lbot, 2, true), bots[2]);
|
||
|
assert_eq(mapLF(lbot, 3, true), bots[3]);
|
||
|
}
|
||
|
#endif
|
||
|
// bots[0..3] - tops[0..3] = # of ways to extend the suffix with an
|
||
|
// A, C, G, T
|
||
|
botsP[0] = topsP[0] + (bots[0] - tops[0]);
|
||
|
topsP[1] = botsP[0];
|
||
|
botsP[1] = topsP[1] + (bots[1] - tops[1]);
|
||
|
topsP[2] = botsP[1];
|
||
|
botsP[2] = topsP[2] + (bots[2] - tops[2]);
|
||
|
topsP[3] = botsP[2];
|
||
|
botsP[3] = topsP[3] + (bots[3] - tops[3]);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Given row and its locus information, proceed on the given character
|
||
|
* and return the next row, or all-fs if we can't proceed on that
|
||
|
* character. Returns 0xffffffff if this row ends in $.
|
||
|
*/
|
||
|
inline index_t mapLF1(
|
||
|
index_t row, // starting row
|
||
|
const SideLocus<index_t>& l, // locus for starting row
|
||
|
int c // character to proceed on
|
||
|
ASSERT_ONLY(, bool overrideSanity = false)
|
||
|
) const
|
||
|
{
|
||
|
if(rowL(l) != c) return (index_t)INDEX_MAX;
|
||
|
for(index_t i = 0; i < _zOffs.size(); i++) {
|
||
|
if(row == _zOffs[i]) return (index_t)INDEX_MAX;
|
||
|
}
|
||
|
index_t ret;
|
||
|
assert_lt(c, 4);
|
||
|
assert_geq(c, 0);
|
||
|
ret = countBt2Side(l, c);
|
||
|
assert_lt(ret, this->_gh._gbwtLen);
|
||
|
#ifndef NDEBUG
|
||
|
if(_sanity && !overrideSanity) {
|
||
|
// Make sure results match up with results from mapLFEx;
|
||
|
// be sure to override sanity-checking in the callee, or we'll
|
||
|
// have infinite recursion
|
||
|
index_t arrs[] = { 0, 0, 0, 0 };
|
||
|
mapLFEx(l, arrs, true);
|
||
|
assert_eq(arrs[c], ret);
|
||
|
}
|
||
|
#endif
|
||
|
return ret;
|
||
|
}
|
||
|
|
||
|
|
||
|
/**
|
||
|
* Given row and its locus information, set the row to LF(row) and
|
||
|
* return the character that was in the final column.
|
||
|
*/
|
||
|
inline int mapLF1(
|
||
|
index_t& row, // starting row
|
||
|
const SideLocus<index_t>& l // locus for starting row
|
||
|
ASSERT_ONLY(, bool overrideSanity = false)
|
||
|
) const
|
||
|
{
|
||
|
for(index_t i = 0; i < _zOffs.size(); i++) {
|
||
|
if(row == _zOffs[i]) return -1;
|
||
|
}
|
||
|
int c = rowL(l);
|
||
|
assert_range(0, 3, c);
|
||
|
row = countBt2Side(l, c);
|
||
|
assert_lt(row, this->_gh._gbwtLen);
|
||
|
#ifndef NDEBUG
|
||
|
if(_sanity && !overrideSanity) {
|
||
|
// Make sure results match up with results from mapLFEx;
|
||
|
// be sure to override sanity-checking in the callee, or we'll
|
||
|
// have infinite recursion
|
||
|
index_t arrs[] = { 0, 0, 0, 0 };
|
||
|
mapLFEx(l, arrs, true);
|
||
|
assert_eq(arrs[c], row);
|
||
|
}
|
||
|
#endif
|
||
|
return c;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Given row and its locus information, proceed on the given character
|
||
|
* and return the next row, or all-fs if we can't proceed on that
|
||
|
* character. Returns 0xffffffff if this row ends in $.
|
||
|
*/
|
||
|
inline pair<index_t, index_t> mapGLF1(
|
||
|
index_t row, // starting row
|
||
|
SideLocus<index_t>& l, // locus for starting row
|
||
|
int c, // character to proceed
|
||
|
pair<index_t, index_t>* node_range = NULL
|
||
|
ASSERT_ONLY(, bool overrideSanity = false)
|
||
|
) const
|
||
|
{
|
||
|
assert_lt(c, 4);
|
||
|
assert_geq(c, 0);
|
||
|
index_t top = mapLF1(row, l, c);
|
||
|
if(top == (index_t)INDEX_MAX) return pair<index_t, index_t>(0, 0);
|
||
|
if(gh().linearFM()) {
|
||
|
if(node_range != NULL) {
|
||
|
node_range->first = top; node_range->second = top + 1;
|
||
|
}
|
||
|
return pair<index_t, index_t>(top, top + 1);
|
||
|
}
|
||
|
index_t bot = top;
|
||
|
|
||
|
l.initFromRow_bit(top + 1, gh(), gfm());
|
||
|
index_t node_top = rank_M(l) - 1;
|
||
|
|
||
|
index_t F_loc = 0, M_occ = 0;
|
||
|
size_t iter = 0;
|
||
|
while(true) {
|
||
|
const uint8_t *side = l.side(gfm()) + gh()._sideGbwtSz - gh()._sideSz * iter;
|
||
|
F_loc = *((index_t*)side);
|
||
|
side += sizeof(index_t);
|
||
|
M_occ = *((index_t*)side);
|
||
|
assert_leq(M_occ, node_top + 1);
|
||
|
if(M_occ <= node_top) break;
|
||
|
iter++;
|
||
|
}
|
||
|
if(M_occ > 0) F_loc++;
|
||
|
|
||
|
l.initFromRow_bit(F_loc, gh(), gfm());
|
||
|
|
||
|
if(node_top + 1 > M_occ) {
|
||
|
top = select_F(l, node_top + 1 - M_occ);
|
||
|
} else {
|
||
|
top = F_loc;
|
||
|
}
|
||
|
|
||
|
index_t node_bot = node_top + 1;
|
||
|
if(node_bot + 1 > M_occ) {
|
||
|
SideLocus<index_t> l2;
|
||
|
#if 0
|
||
|
l2.initFromRow_bit(top + 1, gh(), gfm());
|
||
|
bot = select_F(l2, 1);
|
||
|
ASSERT_ONLY(index_t bot2 = select_F(l, node_bot + 1 - M_occ));
|
||
|
assert_eq(bot, bot2);
|
||
|
#else
|
||
|
bot = select_F(l, node_bot + 1 - M_occ);
|
||
|
#endif
|
||
|
} else {
|
||
|
bot = F_loc;
|
||
|
}
|
||
|
|
||
|
if(node_range != NULL) {
|
||
|
(*node_range).first = node_top;
|
||
|
(*node_range).second = node_bot;
|
||
|
}
|
||
|
|
||
|
return pair<index_t, index_t>(top, bot);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Given row and its locus information, proceed on the given character
|
||
|
* and return the next row, or all-fs if we can't proceed on that
|
||
|
* character. Returns 0xffffffff if this row ends in $.
|
||
|
*/
|
||
|
inline pair<index_t, index_t> mapGLF1(
|
||
|
index_t row, // starting row
|
||
|
SideLocus<index_t>& l, // locus for starting row
|
||
|
pair<index_t, index_t>* node_range = NULL
|
||
|
ASSERT_ONLY(, bool overrideSanity = false)
|
||
|
) const
|
||
|
{
|
||
|
for(index_t i = 0; i < _zOffs.size(); i++) {
|
||
|
if(row == _zOffs[i]) return pair<index_t, index_t>((index_t)INDEX_MAX, (index_t)INDEX_MAX);
|
||
|
}
|
||
|
|
||
|
mapLF1(row, l);
|
||
|
index_t top = row;
|
||
|
if(top == (index_t)INDEX_MAX) return pair<index_t, index_t>(0, 0);
|
||
|
if(gh().linearFM()) {
|
||
|
if(node_range != NULL) {
|
||
|
node_range->first = top; node_range->second = top + 1;
|
||
|
}
|
||
|
return pair<index_t, index_t>(top, top + 1);
|
||
|
}
|
||
|
index_t bot = top;
|
||
|
|
||
|
l.initFromRow_bit(top + 1, gh(), gfm());
|
||
|
index_t node_top = rank_M(l) - 1;
|
||
|
|
||
|
index_t F_loc = 0, M_occ = 0;
|
||
|
size_t iter = 0;
|
||
|
while(true) {
|
||
|
const uint8_t *side = l.side(gfm()) + gh()._sideGbwtSz - gh()._sideSz * iter;
|
||
|
F_loc = *((index_t*)side);
|
||
|
side += sizeof(index_t);
|
||
|
M_occ = *((index_t*)side);
|
||
|
assert_leq(M_occ, node_top + 1);
|
||
|
if(M_occ <= node_top) break;
|
||
|
iter++;
|
||
|
}
|
||
|
if(M_occ > 0) F_loc++;
|
||
|
|
||
|
l.initFromRow_bit(F_loc, gh(), gfm());
|
||
|
|
||
|
if(node_top + 1 > M_occ) {
|
||
|
top = select_F(l, node_top + 1 - M_occ);
|
||
|
} else {
|
||
|
top = F_loc;
|
||
|
}
|
||
|
|
||
|
index_t node_bot = node_top + 1;
|
||
|
if(node_bot + 1 > M_occ) {
|
||
|
#if 0
|
||
|
l2.initFromRow_bit(top + 1, gh(), gfm());
|
||
|
bot = select_F(l2, 1);
|
||
|
ASSERT_ONLY(index_t bot2 = select_F(l, node_bot + 1 - M_occ));
|
||
|
assert_eq(bot, bot2);
|
||
|
#else
|
||
|
bot = select_F(l, node_bot + 1 - M_occ);
|
||
|
#endif
|
||
|
} else {
|
||
|
bot = F_loc;
|
||
|
}
|
||
|
|
||
|
if(node_range != NULL) {
|
||
|
(*node_range).first = node_top;
|
||
|
(*node_range).second = node_bot;
|
||
|
}
|
||
|
|
||
|
return pair<index_t, index_t>(top, bot);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Given row i, return rank
|
||
|
*/
|
||
|
inline index_t rank_M(
|
||
|
const SideLocus<index_t>& l
|
||
|
ASSERT_ONLY(, bool overrideSanity = false)
|
||
|
) const
|
||
|
{
|
||
|
index_t ret = countMSide(l);
|
||
|
assert_leq(ret, this->_gh._numNodes);
|
||
|
return ret;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Given row i, return select
|
||
|
*/
|
||
|
inline index_t select_F(
|
||
|
SideLocus<index_t> l,
|
||
|
index_t count
|
||
|
ASSERT_ONLY(, bool overrideSanity = false)
|
||
|
) const
|
||
|
{
|
||
|
assert_gt(count, 0);
|
||
|
const uint8_t *side = l.side(this->gfm()) + (_gh._sideGbwtSz >> 1);
|
||
|
while(true) {
|
||
|
index_t remainingBitsSide = (_gh._sideGbwtSz << 1) - l._charOff;
|
||
|
assert_gt(remainingBitsSide, 0);
|
||
|
index_t minSide = (count < remainingBitsSide ? count : remainingBitsSide);
|
||
|
uint64_t bits = *(uint64_t*)&side[l._by];
|
||
|
uint8_t advance = 64;
|
||
|
if(l._bp > 0) {
|
||
|
bits >>= l._bp;
|
||
|
advance -= l._bp;
|
||
|
}
|
||
|
if(minSide < advance) {
|
||
|
advance = minSide;
|
||
|
bits <<= (64 - minSide);
|
||
|
}
|
||
|
uint8_t tmp_count = 0;
|
||
|
#ifdef POPCNT_CAPABILITY
|
||
|
if(_usePOPCNTinstruction) {
|
||
|
tmp_count = countInU64_bits<USE_POPCNT_INSTRUCTION>(bits);
|
||
|
} else {
|
||
|
tmp_count = countInU64_bits<USE_POPCNT_GENERIC_BITS>(bits);
|
||
|
}
|
||
|
#else
|
||
|
tmp_count = countInU64_bits(bits);
|
||
|
#endif
|
||
|
assert_leq(tmp_count, count);
|
||
|
count -= tmp_count;
|
||
|
if(count == 0) {
|
||
|
assert_gt(advance, 0);
|
||
|
l._charOff += (advance - 1);
|
||
|
assert_lt(l._charOff, _gh._sideGbwtSz << 1);
|
||
|
l._by = l._charOff >> 3;
|
||
|
l._bp = l._charOff & 0x7;
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
assert_leq(l._charOff + advance, (_gh._sideGbwtSz << 1));
|
||
|
if(l._charOff + advance == (_gh._sideGbwtSz << 1)) {
|
||
|
l.nextSide(_gh);
|
||
|
side = l.side(this->gfm()) + (_gh._sideGbwtSz >> 1);
|
||
|
} else {
|
||
|
l._charOff += advance;
|
||
|
l._by = l._charOff >> 3;
|
||
|
l._bp = l._charOff & 0x7;
|
||
|
}
|
||
|
}
|
||
|
return l.toBWRow(_gh);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
*
|
||
|
*/
|
||
|
inline void getInEdgeCount(
|
||
|
index_t top,
|
||
|
index_t bot,
|
||
|
EList<pair<index_t, index_t> >& node_iedges) const
|
||
|
{
|
||
|
assert_lt(top, bot);
|
||
|
node_iedges.clear();
|
||
|
SideLocus<index_t> l; l.initFromRow_bit(top, _gh, gfm());
|
||
|
const uint8_t *side = l.side(this->gfm()) + (_gh._sideGbwtSz >> 1);
|
||
|
assert_lt(l._by, (_gh._sideGbwtSz >> 2));
|
||
|
assert_eq((side[l._by] >> l._bp) & 0x1, 0x1);
|
||
|
bool first = true;
|
||
|
index_t curr_node = 0;
|
||
|
index_t num0s = 0;
|
||
|
while(top < bot) {
|
||
|
if(first) {
|
||
|
first = false;
|
||
|
} else {
|
||
|
int bit = (side[l._by] >> l._bp) & 0x1;
|
||
|
if(bit == 0x1) {
|
||
|
curr_node++;
|
||
|
num0s = 0;
|
||
|
} else {
|
||
|
num0s++;
|
||
|
if(num0s == 1) {
|
||
|
node_iedges.expand();
|
||
|
node_iedges.back().first = curr_node;
|
||
|
}
|
||
|
node_iedges.back().second = num0s;
|
||
|
}
|
||
|
}
|
||
|
if(l._charOff + 1 == (_gh._sideGbwtSz << 1)) {
|
||
|
l.nextSide(_gh);
|
||
|
side = l.side(this->gfm()) + (_gh._sideGbwtSz >> 1);
|
||
|
} else {
|
||
|
l._charOff++;
|
||
|
l._by = l._charOff >> 3;
|
||
|
l._bp = l._charOff & 0x7;
|
||
|
}
|
||
|
top++;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
#ifndef NDEBUG
|
||
|
/// Check that in-memory Ebwt is internally consistent with respect
|
||
|
/// to given EbwtParams; assert if not
|
||
|
bool inMemoryRepOk(const GFMParams<index_t>& gh) const {
|
||
|
assert_eq(_zOffs.size(), _zGbwtByteOffs.size());
|
||
|
assert_eq(_zOffs.size(), _zGbwtBpOffs.size());
|
||
|
for(index_t i = 0; i < _zOffs.size(); i++) {
|
||
|
assert_geq(_zGbwtBpOffs[i], 0);
|
||
|
assert_lt(_zGbwtBpOffs[i], 4);
|
||
|
assert_lt(_zGbwtByteOffs[i], gh._gbwtTotSz);
|
||
|
assert_lt(_zOffs[i], gh._gbwtLen);
|
||
|
}
|
||
|
assert_geq(_nFrag, _nPat);
|
||
|
assert_eq(_alts.size(), _altnames.size());
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
/// Check that in-memory Ebwt is internally consistent; assert if
|
||
|
/// not
|
||
|
bool inMemoryRepOk() const {
|
||
|
return repOk(_gh);
|
||
|
}
|
||
|
|
||
|
/// Check that Ebwt is internally consistent with respect to given
|
||
|
/// EbwtParams; assert if not
|
||
|
bool repOk(const GFMParams<index_t>& gh) const {
|
||
|
assert(_gh.repOk());
|
||
|
if(isInMemory()) {
|
||
|
return inMemoryRepOk(gh);
|
||
|
}
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
/// Check that Ebwt is internally consistent; assert if not
|
||
|
bool repOk() const {
|
||
|
return repOk(_gh);
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
bool _toBigEndian;
|
||
|
int32_t _overrideOffRate;
|
||
|
bool _verbose;
|
||
|
bool _passMemExc;
|
||
|
bool _sanity;
|
||
|
bool fw_; // true iff this is a forward index
|
||
|
FILE *_in1; // input fd for primary index file
|
||
|
FILE *_in2; // input fd for secondary index file
|
||
|
string _in1Str; // filename for primary index file
|
||
|
string _in2Str; // filename for secondary index file
|
||
|
EList<index_t> _zOffs;
|
||
|
EList<index_t> _zGbwtByteOffs;
|
||
|
EList<int> _zGbwtBpOffs;
|
||
|
index_t _nPat; /// number of reference texts
|
||
|
index_t _nFrag; /// number of fragments
|
||
|
APtrWrap<index_t> _plen;
|
||
|
APtrWrap<index_t> _rstarts; // starting offset of fragments / text indexes
|
||
|
// _fchr, _ftab and _eftab are expected to be relatively small
|
||
|
// (usually < 1MB, perhaps a few MB if _fchr is particularly large
|
||
|
// - like, say, 11). For this reason, we don't bother with writing
|
||
|
// them to disk through separate output streams; we
|
||
|
APtrWrap<index_t> _fchr;
|
||
|
APtrWrap<index_t> _ftab;
|
||
|
APtrWrap<index_t> _eftab; // "extended" entries for _ftab
|
||
|
// _offs may be extremely large. E.g. for DNA w/ offRate=4 (one
|
||
|
// offset every 16 rows), the total size of _offs is the same as
|
||
|
// the total size of the input sequence
|
||
|
APtrWrap<index_t> _offs;
|
||
|
|
||
|
// _ebwt is the Extended Burrows-Wheeler Transform itself, and thus
|
||
|
// is at least as large as the input sequence.
|
||
|
APtrWrap<uint8_t> _gfm;
|
||
|
bool _useMm; /// use memory-mapped files to hold the index
|
||
|
bool useShmem_; /// use shared memory to hold large parts of the index
|
||
|
EList<string> _refnames; /// names of the reference sequences
|
||
|
EList<string> _refnames_nospace; // names of the reference sequences (names stop at space)
|
||
|
char *mmFile1_;
|
||
|
char *mmFile2_;
|
||
|
int _nthreads;
|
||
|
GFMParams<index_t> _gh;
|
||
|
bool packed_;
|
||
|
|
||
|
static const uint64_t default_bmax = INDEX_MAX;
|
||
|
static const uint64_t default_bmaxMultSqrt = INDEX_MAX;
|
||
|
static const uint64_t default_bmaxDivN = 4;
|
||
|
static const int default_dcv = 1024;
|
||
|
static const bool default_noDc = false;
|
||
|
static const bool default_useBlockwise = true;
|
||
|
static const uint32_t default_seed = 0;
|
||
|
#ifdef BOWTIE_64BIT_INDEX
|
||
|
static const int default_lineRate_gfm = 8;
|
||
|
static const int default_lineRate_fm = 7;
|
||
|
#else
|
||
|
static const int default_lineRate_gfm = 7;
|
||
|
static const int default_lineRate_fm = 6;
|
||
|
#endif
|
||
|
static const int default_offRate = 5;
|
||
|
static const int default_offRatePlus = 0;
|
||
|
static const int default_ftabChars = 10;
|
||
|
static const bool default_bigEndian = false;
|
||
|
|
||
|
// data used to build an index
|
||
|
EList<ALT<index_t> > _alts;
|
||
|
EList<string> _altnames;
|
||
|
EList<Haplotype<index_t> > _haplotypes;
|
||
|
RepeatDB<index_t> _repeatdb;
|
||
|
EList<RB_KmerTable> _repeat_kmertables;
|
||
|
|
||
|
bool _repeat;
|
||
|
EList<pair<index_t, index_t> > _repeatLens;
|
||
|
EList<uint8_t> _repeatIncluded;
|
||
|
|
||
|
protected:
|
||
|
|
||
|
ostream& log() const {
|
||
|
return cerr; // TODO: turn this into a parameter
|
||
|
}
|
||
|
|
||
|
/// Print a verbose message and flush (flushing is helpful for
|
||
|
/// debugging)
|
||
|
void verbose(const string& s) const {
|
||
|
if(this->verbose()) {
|
||
|
this->log() << s.c_str();
|
||
|
this->log().flush();
|
||
|
}
|
||
|
}
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* Read reference names from an input stream 'in' for an Ebwt primary
|
||
|
* file and store them in 'refnames'.
|
||
|
*/
|
||
|
template <typename index_t>
|
||
|
void readEbwtRefnames(istream& in, EList<string>& refnames) {
|
||
|
// _in1 must already be open with the get cursor at the
|
||
|
// beginning and no error flags set.
|
||
|
assert(in.good());
|
||
|
assert_eq((streamoff)in.tellg(), ios::beg);
|
||
|
|
||
|
// Read endianness hints from both streams
|
||
|
bool switchEndian = false;
|
||
|
uint32_t one = readU32(in, switchEndian); // 1st word of primary stream
|
||
|
if(one != 1) {
|
||
|
assert_eq((1u<<24), one);
|
||
|
switchEndian = true;
|
||
|
}
|
||
|
readU32(in, switchEndian); // version
|
||
|
|
||
|
// Reads header entries one by one from primary stream
|
||
|
index_t len = readIndex<index_t>(in, switchEndian);
|
||
|
index_t gbwtLen = readIndex<index_t>(in, switchEndian);
|
||
|
index_t numNodes = readIndex<index_t>(in, switchEndian);
|
||
|
int32_t lineRate = readI32(in, switchEndian);
|
||
|
/*int32_t linesPerSide =*/ readI32(in, switchEndian);
|
||
|
int32_t offRate = readI32(in, switchEndian);
|
||
|
int32_t ftabChars = readI32(in, switchEndian);
|
||
|
index_t eftabLen = readIndex<index_t>(in, switchEndian);
|
||
|
// BTL: chunkRate is now deprecated
|
||
|
int32_t flags = readI32(in, switchEndian);
|
||
|
bool entireReverse = false;
|
||
|
if(flags < 0) {
|
||
|
entireReverse = (((-flags) & GFM_ENTIRE_REV) != 0);
|
||
|
}
|
||
|
|
||
|
// Create a new EbwtParams from the entries read from primary stream
|
||
|
GFMParams<index_t> gh(len, gbwtLen, numNodes, lineRate, offRate, ftabChars, eftabLen, entireReverse);
|
||
|
|
||
|
index_t nPat = readIndex<index_t>(in, switchEndian); // nPat
|
||
|
in.seekg(nPat*sizeof(index_t), ios_base::cur); // skip plen
|
||
|
|
||
|
// Skip rstarts
|
||
|
index_t nFrag = readIndex<index_t>(in, switchEndian);
|
||
|
in.seekg(nFrag*sizeof(index_t)*3, ios_base::cur);
|
||
|
|
||
|
// Skip ebwt
|
||
|
in.seekg(gh._gbwtTotLen, ios_base::cur);
|
||
|
|
||
|
// Skip zOff from primary stream
|
||
|
index_t numZOffs = readIndex<index_t>(in, switchEndian);
|
||
|
in.seekg(numZOffs * sizeof(index_t), ios_base::cur);
|
||
|
|
||
|
// Skip fchr
|
||
|
in.seekg(5 * sizeof(index_t), ios_base::cur);
|
||
|
|
||
|
// Skip ftab
|
||
|
in.seekg(gh._ftabLen*sizeof(index_t), ios_base::cur);
|
||
|
|
||
|
// Skip eftab
|
||
|
in.seekg(gh._eftabLen*sizeof(index_t), ios_base::cur);
|
||
|
|
||
|
// Read reference sequence names from primary index file
|
||
|
while(true) {
|
||
|
char c = '\0';
|
||
|
in.read(&c, 1);
|
||
|
if(in.eof()) break;
|
||
|
if(c == '\0') break;
|
||
|
else if(c == '\n') {
|
||
|
refnames.push_back("");
|
||
|
} else {
|
||
|
if(refnames.size() == 0) {
|
||
|
refnames.push_back("");
|
||
|
}
|
||
|
refnames.back().push_back(c);
|
||
|
}
|
||
|
}
|
||
|
if(refnames.back().empty()) {
|
||
|
refnames.pop_back();
|
||
|
}
|
||
|
|
||
|
// Be kind
|
||
|
in.clear(); in.seekg(0, ios::beg);
|
||
|
assert(in.good());
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Read reference names from the index with basename 'in' and store
|
||
|
* them in 'refnames'.
|
||
|
*/
|
||
|
template <typename index_t>
|
||
|
void readEbwtRefnames(const string& instr, EList<string>& refnames) {
|
||
|
ifstream in;
|
||
|
// Initialize our primary and secondary input-stream fields
|
||
|
in.open((instr + ".1." + gfm_ext).c_str(), ios_base::in | ios::binary);
|
||
|
if(!in.is_open()) {
|
||
|
throw GFMFileOpenException("Cannot open file " + instr);
|
||
|
}
|
||
|
assert(in.is_open());
|
||
|
assert(in.good());
|
||
|
assert_eq((streamoff)in.tellg(), ios::beg);
|
||
|
readEbwtRefnames<index_t>(in, refnames);
|
||
|
}
|
||
|
|
||
|
///////////////////////////////////////////////////////////////////////
|
||
|
//
|
||
|
// Functions for building Ebwts
|
||
|
//
|
||
|
///////////////////////////////////////////////////////////////////////
|
||
|
|
||
|
/**
|
||
|
* Join several text strings together in a way that's compatible with
|
||
|
* the text-chunking scheme dictated by chunkRate parameter.
|
||
|
*
|
||
|
* The non-static member Ebwt::join additionally builds auxilliary
|
||
|
* arrays that maintain a mapping between chunks in the joined string
|
||
|
* and the original text strings.
|
||
|
*/
|
||
|
template <typename index_t>
|
||
|
template <typename TStr>
|
||
|
TStr GFM<index_t>::join(EList<TStr>& l, uint32_t seed) {
|
||
|
RandomSource rand; // reproducible given same seed
|
||
|
rand.init(seed);
|
||
|
TStr ret;
|
||
|
index_t guessLen = 0;
|
||
|
for(index_t i = 0; i < l.size(); i++) {
|
||
|
guessLen += length(l[i]);
|
||
|
}
|
||
|
ret.resize(guessLen);
|
||
|
index_t off = 0;
|
||
|
for(size_t i = 0; i < l.size(); i++) {
|
||
|
TStr& s = l[i];
|
||
|
assert_gt(s.length(), 0);
|
||
|
for(size_t j = 0; j < s.size(); j++) {
|
||
|
ret.set(s[j], off++);
|
||
|
}
|
||
|
}
|
||
|
return ret;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Join several text strings together in a way that's compatible with
|
||
|
* the text-chunking scheme dictated by chunkRate parameter.
|
||
|
*
|
||
|
* The non-static member Ebwt::join additionally builds auxilliary
|
||
|
* arrays that maintain a mapping between chunks in the joined string
|
||
|
* and the original text strings.
|
||
|
*/
|
||
|
template <typename index_t>
|
||
|
template <typename TStr>
|
||
|
void GFM<index_t>::join(EList<FileBuf*>& l,
|
||
|
EList<RefRecord>& szs,
|
||
|
index_t sztot,
|
||
|
const RefReadInParams& refparams,
|
||
|
uint32_t seed,
|
||
|
TStr& s,
|
||
|
bool include_rc,
|
||
|
bool CGtoTG)
|
||
|
{
|
||
|
RandomSource rand; // reproducible given same seed
|
||
|
rand.init(seed);
|
||
|
RefReadInParams rpcp = refparams;
|
||
|
index_t guessLen = sztot;
|
||
|
if(include_rc) {
|
||
|
s.resize(guessLen << 1);
|
||
|
} else {
|
||
|
s.resize(guessLen);
|
||
|
}
|
||
|
ASSERT_ONLY(index_t szsi = 0);
|
||
|
TIndexOffU dstoff = 0;
|
||
|
for(index_t i = 0; i < l.size(); i++) {
|
||
|
// For each sequence we can pull out of istream l[i]...
|
||
|
assert(!l[i]->eof());
|
||
|
bool first = true;
|
||
|
while(!l[i]->eof()) {
|
||
|
RefRecord rec = fastaRefReadAppend(*l[i], first, s, dstoff, rpcp);
|
||
|
first = false;
|
||
|
index_t bases = (index_t)rec.len;
|
||
|
assert_eq(rec.off, szs[szsi].off);
|
||
|
assert_eq(rec.len, szs[szsi].len);
|
||
|
assert_eq(rec.first, szs[szsi].first);
|
||
|
ASSERT_ONLY(szsi++);
|
||
|
if(bases == 0) continue;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Change 'C' in CG to 'T' so that CG becomes TG
|
||
|
if(CGtoTG) {
|
||
|
for(TIndexOffU i = 0; i + 1 < guessLen; i++) {
|
||
|
int nt1 = s[i], nt2 = s[i+1];
|
||
|
if(nt1 == 1 && nt2 == 2) {
|
||
|
s[i] = 3;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Append reverse complement
|
||
|
if(include_rc) {
|
||
|
for (TIndexOffU i = 0; i < guessLen; i++) {
|
||
|
int nt = s[guessLen - i - 1];
|
||
|
assert_range(0, 3, nt);
|
||
|
s[guessLen + i] = dnacomp[nt];
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Join several text strings together according to the text-chunking
|
||
|
* scheme specified in the EbwtParams. Ebwt fields calculated in this
|
||
|
* function are written directly to disk.
|
||
|
*
|
||
|
* It is assumed, but not required, that the header values have already
|
||
|
* been written to 'out1' before this function is called.
|
||
|
*
|
||
|
* The static member Ebwt::join just returns a joined version of a
|
||
|
* list of strings without building any of the auxilliary arrays.
|
||
|
*/
|
||
|
template <typename index_t>
|
||
|
template <typename TStr>
|
||
|
void GFM<index_t>::joinToDisk(
|
||
|
EList<FileBuf*>& l,
|
||
|
EList<RefRecord>& szs,
|
||
|
index_t sztot,
|
||
|
const RefReadInParams& refparams,
|
||
|
TStr& ret,
|
||
|
ostream& out1,
|
||
|
ostream& out2)
|
||
|
{
|
||
|
RefReadInParams rpcp = refparams;
|
||
|
assert_gt(szs.size(), 0);
|
||
|
assert_gt(l.size(), 0);
|
||
|
assert_gt(sztot, 0);
|
||
|
// Not every fragment represents a distinct sequence - many
|
||
|
// fragments may correspond to a single sequence. Count the
|
||
|
// number of sequences here by counting the number of "first"
|
||
|
// fragments.
|
||
|
this->_nPat = 0;
|
||
|
this->_nFrag = 0;
|
||
|
for(index_t i = 0; i < szs.size(); i++) {
|
||
|
if(szs[i].len > 0) this->_nFrag++;
|
||
|
if(szs[i].first && szs[i].len > 0) this->_nPat++;
|
||
|
}
|
||
|
assert_gt(this->_nPat, 0);
|
||
|
assert_geq(this->_nFrag, this->_nPat);
|
||
|
_rstarts.reset();
|
||
|
writeIndex<index_t>(out1, this->_nPat, this->toBe());
|
||
|
// Allocate plen[]
|
||
|
try {
|
||
|
this->_plen.init(new index_t[this->_nPat], this->_nPat);
|
||
|
} catch(bad_alloc& e) {
|
||
|
cerr << "Out of memory allocating plen[] in Ebwt::join()"
|
||
|
<< " at " << __FILE__ << ":" << __LINE__ << endl;
|
||
|
throw e;
|
||
|
}
|
||
|
// For each pattern, set plen
|
||
|
int npat = -1;
|
||
|
for(index_t i = 0; i < szs.size(); i++) {
|
||
|
if(szs[i].first && szs[i].len > 0) {
|
||
|
if(npat >= 0) {
|
||
|
writeIndex<index_t>(out1, this->plen()[npat], this->toBe());
|
||
|
}
|
||
|
npat++;
|
||
|
this->plen()[npat] = (szs[i].len + szs[i].off);
|
||
|
} else {
|
||
|
this->plen()[npat] += (szs[i].len + szs[i].off);
|
||
|
}
|
||
|
}
|
||
|
assert_eq((index_t)npat, this->_nPat-1);
|
||
|
writeIndex<index_t>(out1, this->plen()[npat], this->toBe());
|
||
|
// Write the number of fragments
|
||
|
writeIndex<index_t>(out1, this->_nFrag, this->toBe());
|
||
|
index_t seqsRead = 0;
|
||
|
ASSERT_ONLY(index_t szsi = 0);
|
||
|
ASSERT_ONLY(index_t entsWritten = 0);
|
||
|
index_t dstoff = 0;
|
||
|
// For each filebuf
|
||
|
for(unsigned int i = 0; i < l.size(); i++) {
|
||
|
assert(!l[i]->eof());
|
||
|
bool first = true;
|
||
|
index_t patoff = 0;
|
||
|
// For each *fragment* (not necessary an entire sequence) we
|
||
|
// can pull out of istream l[i]...
|
||
|
while(!l[i]->eof()) {
|
||
|
string name;
|
||
|
// Push a new name onto our vector
|
||
|
_refnames.push_back("");
|
||
|
RefRecord rec = fastaRefReadAppend(
|
||
|
*l[i], first, ret, dstoff, rpcp, &_refnames.back());
|
||
|
first = false;
|
||
|
index_t bases = rec.len;
|
||
|
if(rec.first && rec.len > 0) {
|
||
|
if(_refnames.back().length() == 0) {
|
||
|
// If name was empty, replace with an index
|
||
|
ostringstream stm;
|
||
|
stm << seqsRead;
|
||
|
_refnames.back() = stm.str();
|
||
|
}
|
||
|
} else {
|
||
|
// This record didn't actually start a new sequence so
|
||
|
// no need to add a name
|
||
|
//assert_eq(0, _refnames.back().length());
|
||
|
_refnames.pop_back();
|
||
|
}
|
||
|
// Increment seqsRead if this is the first fragment
|
||
|
if(rec.first && rec.len > 0) seqsRead++;
|
||
|
assert_lt(szsi, szs.size());
|
||
|
assert_eq(rec.off, szs[szsi].off);
|
||
|
assert_eq(rec.len, szs[szsi].len);
|
||
|
assert_eq(rec.first, szs[szsi].first);
|
||
|
assert(rec.first || rec.off > 0);
|
||
|
ASSERT_ONLY(szsi++);
|
||
|
assert_leq(bases, this->plen()[seqsRead-1]);
|
||
|
// Reset the patoff if this is the first fragment
|
||
|
if(rec.first) patoff = 0;
|
||
|
patoff += rec.off; // add fragment's offset from end of last frag.
|
||
|
// Adjust rpcps
|
||
|
//index_t seq = seqsRead-1;
|
||
|
#ifndef NDEBUG
|
||
|
if(bases > 0) {
|
||
|
ASSERT_ONLY(entsWritten++);
|
||
|
}
|
||
|
#endif
|
||
|
// This is where rstarts elements are written to the output stream
|
||
|
//writeU32(out1, oldRetLen, this->toBe()); // offset from beginning of joined string
|
||
|
//writeU32(out1, seq, this->toBe()); // sequence id
|
||
|
//writeU32(out1, patoff, this->toBe()); // offset into sequence
|
||
|
patoff += (index_t)bases;
|
||
|
}
|
||
|
assert_gt(szsi, 0);
|
||
|
l[i]->reset();
|
||
|
assert(!l[i]->eof());
|
||
|
#ifndef NDEBUG
|
||
|
int c = l[i]->get();
|
||
|
assert_eq('>', c);
|
||
|
assert(!l[i]->eof());
|
||
|
l[i]->reset();
|
||
|
assert(!l[i]->eof());
|
||
|
#endif
|
||
|
}
|
||
|
assert_eq(entsWritten, this->_nFrag);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Build an Ebwt from a string 's' and its suffix array 'sa' (which
|
||
|
* might actually be a suffix array *builder* that builds blocks of the
|
||
|
* array on demand). The bulk of the Ebwt, i.e. the ebwt and offs
|
||
|
* arrays, is written directly to disk. This is by design: keeping
|
||
|
* those arrays in memory needlessly increases the footprint of the
|
||
|
* building process. Instead, we prefer to build the Ebwt directly
|
||
|
* "to disk" and then read it back into memory later as necessary.
|
||
|
*
|
||
|
* It is assumed that the header values and join-related values (nPat,
|
||
|
* plen) have already been written to 'out1' before this function
|
||
|
* is called. When this function is finished, it will have
|
||
|
* additionally written ebwt, zOff, fchr, ftab and eftab to the primary
|
||
|
* file and offs to the secondary file.
|
||
|
*
|
||
|
* Assume DNA/RNA/any alphabet with 4 or fewer elements.
|
||
|
* Assume occ array entries are 32 bits each.
|
||
|
*
|
||
|
* @param sa the suffix array to convert to a Ebwt
|
||
|
* @param s the original string
|
||
|
* @param out
|
||
|
*/
|
||
|
template <typename index_t>
|
||
|
template <typename TStr>
|
||
|
void GFM<index_t>::buildToDisk(
|
||
|
PathGraph<index_t>& gbwt,
|
||
|
const TStr& s,
|
||
|
ostream& out1,
|
||
|
ostream& out2,
|
||
|
streampos headerPos)
|
||
|
{
|
||
|
const GFMParams<index_t>& gh = this->_gh;
|
||
|
assert(gh.repOk());
|
||
|
assert_lt(s.length(), gh.gbwtLen());
|
||
|
assert_eq(s.length(), gh._len);
|
||
|
assert_gt(gh._lineRate, 3);
|
||
|
|
||
|
index_t gbwtLen = gh._gbwtLen;
|
||
|
streampos out1pos = out1.tellp();
|
||
|
if(headerPos < 0) {
|
||
|
out1.seekp(8 + sizeof(index_t));
|
||
|
} else {
|
||
|
out1.seekp(headerPos);
|
||
|
}
|
||
|
writeIndex<index_t>(out1, gbwtLen, this->toBe());
|
||
|
writeIndex<index_t>(out1, gh._numNodes, this->toBe());
|
||
|
out1.seekp(out1pos);
|
||
|
|
||
|
index_t ftabLen = gh._ftabLen;
|
||
|
index_t sideSz = gh._sideSz;
|
||
|
index_t gbwtTotSz = gh._gbwtTotSz;
|
||
|
index_t fchr[] = {0, 0, 0, 0, 0};
|
||
|
EList<index_t> ftab(EBWT_CAT);
|
||
|
EList<index_t> zOffs;
|
||
|
|
||
|
// Save # of occurrences of each character as we walk along the bwt
|
||
|
index_t occ[4] = {0, 0, 0, 0};
|
||
|
index_t occSave[4] = {0, 0, 0, 0};
|
||
|
// # of occurrences of 1 in M arrays
|
||
|
index_t M_occ = 0, M_occSave = 0;
|
||
|
// Location in F that corresponds to 1 in M
|
||
|
index_t F_loc = 0, F_locSave = 0;
|
||
|
|
||
|
// Record rows that should "absorb" adjacent rows in the ftab.
|
||
|
try {
|
||
|
VMSG_NL("Allocating ftab, absorbFtab");
|
||
|
ftab.resize(ftabLen);
|
||
|
ftab.fillZero();
|
||
|
} catch(bad_alloc &e) {
|
||
|
cerr << "Out of memory allocating ftab[] "
|
||
|
<< "in GFM::buildToDisk() at " << __FILE__ << ":"
|
||
|
<< __LINE__ << endl;
|
||
|
throw e;
|
||
|
}
|
||
|
|
||
|
// Allocate the side buffer; holds a single side as its being
|
||
|
// constructed and then written to disk. Reused across all sides.
|
||
|
#ifdef SIXTY4_FORMAT
|
||
|
EList<uint64_t> gfmSide(EBWT_CAT);
|
||
|
#else
|
||
|
EList<uint8_t> gfmSide(EBWT_CAT);
|
||
|
#endif
|
||
|
try {
|
||
|
// Used to calculate ftab and eftab, but having gfm costs a lot of memory
|
||
|
_gfm.init(new uint8_t[gh._gbwtTotLen], gh._gbwtTotLen, true);
|
||
|
#ifdef SIXTY4_FORMAT
|
||
|
gfmSide.resize(sideSz >> 3);
|
||
|
#else
|
||
|
gfmSide.resize(sideSz);
|
||
|
#endif
|
||
|
} catch(bad_alloc &e) {
|
||
|
cerr << "Out of memory allocating ebwtSide[] in "
|
||
|
<< "GFM::buildToDisk() at " << __FILE__ << ":"
|
||
|
<< __LINE__ << endl;
|
||
|
throw e;
|
||
|
}
|
||
|
|
||
|
// Points to the base offset within ebwt for the side currently
|
||
|
// being written
|
||
|
index_t side = 0;
|
||
|
|
||
|
// Whether we're assembling a forward or a reverse bucket
|
||
|
bool fw = true;
|
||
|
int sideCur = 0;
|
||
|
|
||
|
index_t si = 0; // string offset (chars)
|
||
|
ASSERT_ONLY(bool inSA = true); // true iff saI still points inside suffix
|
||
|
// array (as opposed to the padding at the
|
||
|
// end)
|
||
|
// Iterate over packed bwt bytes
|
||
|
VMSG_NL("Entering GFM loop");
|
||
|
ASSERT_ONLY(index_t beforeGbwtOff = (index_t)out1.tellp());
|
||
|
while(side < gbwtTotSz) {
|
||
|
// Sanity-check our cursor into the side buffer
|
||
|
assert_geq(sideCur, 0);
|
||
|
assert_lt(sideCur, (int)gh._sideGbwtSz);
|
||
|
assert_eq(0, side % sideSz); // 'side' must be on side boundary
|
||
|
if(sideCur == 0) {
|
||
|
memset(gfmSide.ptr(), 0, gh._sideGbwtSz);
|
||
|
gfmSide[sideCur] = 0; // clear
|
||
|
}
|
||
|
assert_lt(side + sideCur, gbwtTotSz);
|
||
|
// Iterate over bit-pairs in the si'th character of the BWT
|
||
|
#ifdef SIXTY4_FORMAT
|
||
|
for(int bpi = 0; bpi < 32; bpi++, si++)
|
||
|
#else
|
||
|
for(int bpi = 0; bpi < 4; bpi++, si++)
|
||
|
#endif
|
||
|
{
|
||
|
int gbwtChar = 0; // one of A, C, G, T, and Z
|
||
|
int F= 0, M = 0; // either 0 or 1
|
||
|
index_t pos = 0; // pos on joined string
|
||
|
bool count = true;
|
||
|
if(si < gbwtLen) {
|
||
|
gbwt.nextRow(gbwtChar, F, M, pos);
|
||
|
|
||
|
// (that might have triggered sa to calc next suf block)
|
||
|
if(gbwtChar == 'Z') {
|
||
|
// Don't add the 'Z' in the last column to the BWT
|
||
|
// transform; we can't encode a $ (only A C T or G)
|
||
|
// and counting it as, say, an A, will mess up the
|
||
|
// LF mapping
|
||
|
gbwtChar = 0; count = false;
|
||
|
#ifndef NDEBUG
|
||
|
if(zOffs.size() > 0) {
|
||
|
assert_gt(si, zOffs.back());
|
||
|
}
|
||
|
#endif
|
||
|
zOffs.push_back(si); // remember GBWT row that corresponds to the 0th suffix
|
||
|
} else {
|
||
|
gbwtChar = asc2dna[gbwtChar];
|
||
|
assert_lt(gbwtChar, 4);
|
||
|
// Update the fchr
|
||
|
fchr[gbwtChar]++;
|
||
|
}
|
||
|
|
||
|
assert_lt(F, 2);
|
||
|
assert_lt(M, 2);
|
||
|
if(M == 1) {
|
||
|
assert_neq(F_loc, numeric_limits<index_t>::max());
|
||
|
F_loc = gbwt.nextFLocation();
|
||
|
#ifndef NDEBUG
|
||
|
if(F_loc > 0) {
|
||
|
assert_gt(F_loc, F_locSave);
|
||
|
}
|
||
|
#endif
|
||
|
}
|
||
|
// Suffix array offset boundary? - update offset array
|
||
|
if(M == 1 && (M_occ & gh._offMask) == M_occ) {
|
||
|
assert_lt((M_occ >> gh._offRate), gh._offsLen);
|
||
|
// Write offsets directly to the secondary output
|
||
|
// stream, thereby avoiding keeping them in memory
|
||
|
writeIndex<index_t>(out2, pos, this->toBe());
|
||
|
}
|
||
|
} else {
|
||
|
// Strayed off the end of the SA, now we're just
|
||
|
// padding out a bucket
|
||
|
#ifndef NDEBUG
|
||
|
if(inSA) {
|
||
|
// Assert that we wrote all the characters in the
|
||
|
// string before now
|
||
|
assert_eq(si, gbwtLen);
|
||
|
inSA = false;
|
||
|
}
|
||
|
#endif
|
||
|
// 'A' used for padding; important that padding be
|
||
|
// counted in the occ[] array
|
||
|
gbwtChar = 0;
|
||
|
F = M = 0;
|
||
|
}
|
||
|
if(count) occ[gbwtChar]++;
|
||
|
if(M) M_occ++;
|
||
|
// Append BWT char to bwt section of current side
|
||
|
if(fw) {
|
||
|
// Forward bucket: fill from least to most
|
||
|
#ifdef SIXTY4_FORMAT
|
||
|
gfmSide[sideCur] |= ((uint64_t)gbwtChar << (bpi << 1));
|
||
|
if(gbwtChar > 0) assert_gt(gfmSide[sideCur], 0);
|
||
|
// To be implemented ...
|
||
|
assert(false);
|
||
|
cerr << "Not implemented" << endl;
|
||
|
exit(1);
|
||
|
#else
|
||
|
pack_2b_in_8b(gbwtChar, gfmSide[sideCur], bpi);
|
||
|
assert_eq((gfmSide[sideCur] >> (bpi*2)) & 3, gbwtChar);
|
||
|
|
||
|
int F_sideCur = (gh._sideGbwtSz + sideCur) >> 1;
|
||
|
int F_bpi = bpi + ((sideCur & 0x1) << 2); // Can be used as M_bpi as well
|
||
|
pack_1b_in_8b(F, gfmSide[F_sideCur], F_bpi);
|
||
|
assert_eq((gfmSide[F_sideCur] >> F_bpi) & 1, F);
|
||
|
|
||
|
int M_sideCur = F_sideCur + (gh._sideGbwtSz >> 2);
|
||
|
pack_1b_in_8b(M, gfmSide[M_sideCur], F_bpi);
|
||
|
assert_eq((gfmSide[M_sideCur] >> F_bpi) & 1, M);
|
||
|
#endif
|
||
|
} else {
|
||
|
// Backward bucket: fill from most to least
|
||
|
#ifdef SIXTY4_FORMAT
|
||
|
gfmSide[sideCur] |= ((uint64_t)gbwtChar << ((31 - bpi) << 1));
|
||
|
if(gbwtChar > 0) assert_gt(gfmSide[sideCur], 0);
|
||
|
// To be implemented ...
|
||
|
assert(false);
|
||
|
cerr << "Not implemented" << endl;
|
||
|
exit(1);
|
||
|
#else
|
||
|
pack_2b_in_8b(gbwtChar, gfmSide[sideCur], 3-bpi);
|
||
|
assert_eq((gfmSide[sideCur] >> ((3-bpi)*2)) & 3, gbwtChar);
|
||
|
// To be implemented ...
|
||
|
assert(false);
|
||
|
cerr << "Not implemented" << endl;
|
||
|
exit(1);
|
||
|
#endif
|
||
|
}
|
||
|
} // end loop over bit-pairs
|
||
|
assert_eq(0, (occ[0] + occ[1] + occ[2] + occ[3] + zOffs.size()) & 3);
|
||
|
#ifdef SIXTY4_FORMAT
|
||
|
assert_eq(0, si & 31);
|
||
|
#else
|
||
|
assert_eq(0, si & 3);
|
||
|
#endif
|
||
|
|
||
|
sideCur++;
|
||
|
if((sideCur << 1) == (int)gh._sideGbwtSz) {
|
||
|
sideCur = 0;
|
||
|
index_t *uside = reinterpret_cast<index_t*>(gfmSide.ptr());
|
||
|
// Write 'A', 'C', 'G', 'T', and '1' in M tallies
|
||
|
side += sideSz;
|
||
|
assert_leq(side, gh._gbwtTotSz);
|
||
|
uside[(sideSz / sizeof(index_t))-6] = endianizeIndex(F_locSave, this->toBe());
|
||
|
uside[(sideSz / sizeof(index_t))-5] = endianizeIndex(M_occSave, this->toBe());
|
||
|
uside[(sideSz / sizeof(index_t))-4] = endianizeIndex(occSave[0], this->toBe());
|
||
|
uside[(sideSz / sizeof(index_t))-3] = endianizeIndex(occSave[1], this->toBe());
|
||
|
uside[(sideSz / sizeof(index_t))-2] = endianizeIndex(occSave[2], this->toBe());
|
||
|
uside[(sideSz / sizeof(index_t))-1] = endianizeIndex(occSave[3], this->toBe());
|
||
|
F_locSave = F_loc;
|
||
|
M_occSave = M_occ;
|
||
|
occSave[0] = occ[0];
|
||
|
occSave[1] = occ[1];
|
||
|
occSave[2] = occ[2];
|
||
|
occSave[3] = occ[3];
|
||
|
// Write backward side to primary file
|
||
|
out1.write((const char *)gfmSide.ptr(), sideSz);
|
||
|
|
||
|
//
|
||
|
memcpy(((char*)_gfm.get()) + side - sideSz, (const char *)gfmSide.ptr(), sideSz);
|
||
|
}
|
||
|
}
|
||
|
VMSG_NL("Exited GFM loop");
|
||
|
// Assert that our loop counter got incremented right to the end
|
||
|
assert_eq(side, gh._gbwtTotSz);
|
||
|
// Assert that we wrote the expected amount to out1
|
||
|
assert_eq(((index_t)out1.tellp() - beforeGbwtOff), gh._gbwtTotSz);
|
||
|
// assert that the last thing we did was write a forward bucket
|
||
|
|
||
|
//
|
||
|
// Write zOffs to primary stream
|
||
|
//
|
||
|
assert_gt(zOffs.size(), 0);
|
||
|
writeIndex<index_t>(out1, (index_t)zOffs.size(), this->toBe());
|
||
|
for(size_t i = 0; i < zOffs.size(); i++) {
|
||
|
writeIndex<index_t>(out1, zOffs[i], this->toBe());
|
||
|
}
|
||
|
|
||
|
//
|
||
|
// Finish building fchr
|
||
|
//
|
||
|
// Exclusive prefix sum on fchr
|
||
|
for(int i = 1; i < 4; i++) {
|
||
|
fchr[i] += fchr[i-1];
|
||
|
}
|
||
|
assert_lt(fchr[3], gbwtLen);
|
||
|
// Shift everybody up by one
|
||
|
for(int i = 4; i >= 1; i--) {
|
||
|
fchr[i] = fchr[i-1];
|
||
|
}
|
||
|
fchr[0] = 0;
|
||
|
if(_verbose) {
|
||
|
for(int i = 0; i < 5; i++)
|
||
|
cerr << "fchr[" << "ACGT$"[i] << "]: " << fchr[i] << endl;
|
||
|
}
|
||
|
// Write fchr to primary file
|
||
|
for(int i = 0; i < 5; i++) {
|
||
|
writeIndex<index_t>(out1, fchr[i], this->toBe());
|
||
|
}
|
||
|
_fchr.init(new index_t[5], 5, true);
|
||
|
memcpy(_fchr.get(), fchr, sizeof(index_t) * 5);
|
||
|
|
||
|
|
||
|
// Initialize _zGbwtByteOffs and _zGbwtBpOffs
|
||
|
_zOffs = zOffs;
|
||
|
postReadInit(gh);
|
||
|
|
||
|
// Build ftab and eftab
|
||
|
EList<pair<index_t, index_t> > tFtab;
|
||
|
tFtab.resizeExact(ftabLen - 1);
|
||
|
for(index_t i = 0; i + 1 < ftabLen; i++) {
|
||
|
index_t q = i;
|
||
|
pair<index_t, index_t> range(0, gh._gbwtLen);
|
||
|
SideLocus<index_t> tloc, bloc;
|
||
|
SideLocus<index_t>::initFromTopBot(range.first, range.second, gh, gfm(), tloc, bloc);
|
||
|
index_t j = 0;
|
||
|
for(; j < (index_t)gh._ftabChars; j++) {
|
||
|
int nt = q & 0x3; q >>= 2;
|
||
|
if(bloc.valid()) {
|
||
|
range = mapGLF(tloc, bloc, nt);
|
||
|
} else {
|
||
|
range = mapGLF1(range.first, tloc, nt);
|
||
|
}
|
||
|
if(range.first == (index_t)INDEX_MAX || range.first >= range.second) {
|
||
|
break;
|
||
|
}
|
||
|
if(range.first + 1 == range.second) {
|
||
|
tloc.initFromRow(range.first, gh, gfm());
|
||
|
bloc.invalidate();
|
||
|
} else {
|
||
|
SideLocus<index_t>::initFromTopBot(range.first, range.second, gh, gfm(), tloc, bloc);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if(range.first >= range.second || j < (index_t)gh._ftabChars) {
|
||
|
if(i == 0) {
|
||
|
tFtab[i].first = tFtab[i].second = 0;
|
||
|
} else {
|
||
|
tFtab[i].first = tFtab[i].second = tFtab[i-1].second;
|
||
|
}
|
||
|
} else {
|
||
|
tFtab[i].first = range.first;
|
||
|
tFtab[i].second = range.second;
|
||
|
}
|
||
|
|
||
|
#ifndef NDEBUG
|
||
|
if(gbwt.ftab.size() > i) {
|
||
|
assert_eq(tFtab[i].first, gbwt.ftab[i].first);
|
||
|
assert_eq(tFtab[i].second, gbwt.ftab[i].second);
|
||
|
}
|
||
|
#endif
|
||
|
}
|
||
|
|
||
|
// Clear memory
|
||
|
_gfm.reset();
|
||
|
_fchr.reset();
|
||
|
_zOffs.clear();
|
||
|
_zGbwtByteOffs.clear();
|
||
|
_zGbwtBpOffs.clear();
|
||
|
|
||
|
//
|
||
|
// Finish building ftab and build eftab
|
||
|
//
|
||
|
// Prefix sum on ftable
|
||
|
index_t eftabLen = 0;
|
||
|
for(index_t i = 1; i + 1 < ftabLen; i++) {
|
||
|
if(tFtab[i-1].second != tFtab[i].first) {
|
||
|
eftabLen += 2;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if(gh._gbwtLen + (eftabLen >> 1) < gh._gbwtLen) {
|
||
|
cerr << "Too many eftab entries: "
|
||
|
<< gh._gbwtLen << " + " << (eftabLen >> 1)
|
||
|
<< " > " << (index_t)INDEX_MAX << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
|
||
|
EList<index_t> eftab(EBWT_CAT);
|
||
|
try {
|
||
|
eftab.resize(eftabLen);
|
||
|
eftab.fillZero();
|
||
|
} catch(bad_alloc &e) {
|
||
|
cerr << "Out of memory allocating eftab[] "
|
||
|
<< "in GFM::buildToDisk() at " << __FILE__ << ":"
|
||
|
<< __LINE__ << endl;
|
||
|
throw e;
|
||
|
}
|
||
|
index_t eftabCur = 0;
|
||
|
ftab[0] = tFtab[0].first;
|
||
|
ftab[1] = tFtab[0].second;
|
||
|
for(index_t i = 1; i + 1 < ftabLen; i++) {
|
||
|
if(ftab[i] != tFtab[i].first) {
|
||
|
index_t lo = ftab[i];
|
||
|
index_t hi = tFtab[i].first;
|
||
|
assert_lt(eftabCur*2+1, eftabLen);
|
||
|
eftab[eftabCur*2] = lo;
|
||
|
eftab[eftabCur*2+1] = hi;
|
||
|
// one node can be shared, and one node can have at most four incoming edges
|
||
|
assert_leq(lo, hi + 4);
|
||
|
ftab[i] = (eftabCur++) ^ (index_t)INDEX_MAX; // insert pointer into eftab
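// The XOR with INDEX_MAX flips every bit of the small eftab slot number,
// giving a value far larger than any valid GBWT row; ftabLo()/ftabHi()
// appear to treat such out-of-range entries as pointers into eftab rather
// than as ordinary row boundaries.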
|
||
|
assert_eq(lo, GFM<index_t>::ftabLo(ftab.ptr(), eftab.ptr(), gbwtLen, ftabLen, eftabLen, i));
|
||
|
assert_eq(hi, GFM<index_t>::ftabHi(ftab.ptr(), eftab.ptr(), gbwtLen, ftabLen, eftabLen, i));
|
||
|
}
|
||
|
ftab[i+1] = tFtab[i].second;
|
||
|
}
|
||
|
#ifndef NDEBUG
|
||
|
for(index_t i = 0; i + 1 < ftabLen; i++ ){
|
||
|
assert_eq(tFtab[i].first, GFM<index_t>::ftabHi(ftab.ptr(), eftab.ptr(), gbwtLen, ftabLen, eftabLen, i));
|
||
|
assert_eq(tFtab[i].second, GFM<index_t>::ftabLo(ftab.ptr(), eftab.ptr(), gbwtLen, ftabLen, eftabLen, i+1));
|
||
|
}
|
||
|
#endif
|
||
|
// Write ftab to primary file
|
||
|
for(index_t i = 0; i < ftabLen; i++) {
|
||
|
writeIndex<index_t>(out1, ftab[i], this->toBe());
|
||
|
}
|
||
|
// Write eftab to primary file
|
||
|
out1pos = out1.tellp();
|
||
|
if(headerPos < 0) {
|
||
|
out1.seekp(24 + sizeof(index_t) * 3);
|
||
|
} else {
|
||
|
out1.seekp((int)headerPos + 16 + sizeof(index_t) * 2);
|
||
|
}
|
||
|
writeIndex<index_t>(out1, eftabLen, this->toBe());
|
||
|
out1.seekp(out1pos);
|
||
|
for(index_t i = 0; i < eftabLen; i++) {
|
||
|
writeIndex<index_t>(out1, eftab[i], this->toBe());
|
||
|
}
|
||
|
// Note: if you'd like to sanity-check the Ebwt, you'll have to
|
||
|
// read it back into memory first!
|
||
|
assert(!isInMemory());
|
||
|
VMSG_NL("Exiting GFM::buildToDisk()");
|
||
|
}
|
||
|
|
||
|
/**
* Build an Ebwt from a string 's' and its suffix array 'sa' (which
* might actually be a suffix array *builder* that builds blocks of the
* array on demand). The bulk of the Ebwt, i.e. the ebwt and offs
* arrays, is written directly to disk. This is by design: keeping
* those arrays in memory needlessly increases the footprint of the
* building process. Instead, we prefer to build the Ebwt directly
* "to disk" and then read it back into memory later as necessary.
*
* It is assumed that the header values and join-related values (nPat,
* plen) have already been written to 'out1' before this function
* is called. When this function is finished, it will have
* additionally written ebwt, zOff, fchr, ftab and eftab to the primary
* file and offs to the secondary file.
*
* Assume DNA/RNA/any alphabet with 4 or fewer elements.
* Assume occ array entries are 32 bits each.
*
* @param sa the suffix array to convert to an Ebwt
* @param s the original string
* @param out1 output stream to primary file
* @param out2 output stream to secondary file
*/
|
||
|
template <typename index_t>
|
||
|
template <typename TStr>
|
||
|
void GFM<index_t>::buildToDisk(
|
||
|
InorderBlockwiseSA<TStr>& sa,
|
||
|
const TStr& s,
|
||
|
ostream& out1,
|
||
|
ostream& out2,
|
||
|
streampos headerPos)
|
||
|
{
|
||
|
const GFMParams<index_t>& gh = this->_gh;
|
||
|
assert(gh.repOk());
|
||
|
assert(gh.linearFM());
|
||
|
assert_lt(s.length(), gh.gbwtLen());
|
||
|
assert_eq(s.length(), gh._len);
|
||
|
assert_gt(gh._lineRate, 3);
|
||
|
|
||
|
index_t len = gh._len;
|
||
|
index_t gbwtLen = gh._gbwtLen;
|
||
|
assert_eq(len + 1, gbwtLen);
|
||
|
streampos out1pos = out1.tellp();
|
||
|
if(headerPos < 0) {
|
||
|
out1.seekp(8 + sizeof(index_t));
|
||
|
} else {
|
||
|
out1.seekp(headerPos);
|
||
|
}
|
||
|
writeIndex<index_t>(out1, gbwtLen, this->toBe());
|
||
|
writeIndex<index_t>(out1, gh._numNodes, this->toBe());
|
||
|
out1.seekp(out1pos);
|
||
|
|
||
|
index_t ftabLen = gh._ftabLen;
|
||
|
index_t sideSz = gh._sideSz;
|
||
|
index_t gbwtTotSz = gh._gbwtTotSz;
|
||
|
index_t fchr[] = {0, 0, 0, 0, 0};
|
||
|
EList<index_t> ftab(EBWT_CAT);
|
||
|
EList<index_t> zOffs;
|
||
|
|
||
|
// Save # of occurrences of each character as we walk along the bwt
|
||
|
index_t occ[4] = {0, 0, 0, 0};
|
||
|
index_t occSave[4] = {0, 0, 0, 0};
|
||
|
|
||
|
// Record rows that should "absorb" adjacent rows in the ftab.
|
||
|
// The absorbed rows represent suffixes shorter than the ftabChars
|
||
|
// cutoff.
|
||
|
uint8_t absorbCnt = 0;
|
||
|
EList<uint8_t> absorbFtab(EBWT_CAT);
|
||
|
try {
|
||
|
VMSG_NL("Allocating ftab, absorbFtab");
|
||
|
ftab.resize(ftabLen);
|
||
|
ftab.fillZero();
|
||
|
absorbFtab.resize(ftabLen);
|
||
|
absorbFtab.fillZero();
|
||
|
} catch(bad_alloc &e) {
|
||
|
cerr << "Out of memory allocating ftab[] or absorbFtab[] "
|
||
|
<< "in GFM::buildToDisk() at " << __FILE__ << ":"
|
||
|
<< __LINE__ << endl;
|
||
|
throw e;
|
||
|
}
|
||
|
|
||
|
// Allocate the side buffer; holds a single side as it's being
|
||
|
// constructed and then written to disk. Reused across all sides.
|
||
|
#ifdef SIXTY4_FORMAT
|
||
|
EList<uint64_t> gfmSide(EBWT_CAT);
|
||
|
#else
|
||
|
EList<uint8_t> gfmSide(EBWT_CAT);
|
||
|
#endif
|
||
|
try {
|
||
|
#ifdef SIXTY4_FORMAT
|
||
|
gfmSide.resize(sideSz >> 3);
|
||
|
#else
|
||
|
gfmSide.resize(sideSz);
|
||
|
#endif
|
||
|
} catch(bad_alloc &e) {
|
||
|
cerr << "Out of memory allocating gfmSide[] in "
|
||
|
<< "GFM::buildToDisk() at " << __FILE__ << ":"
|
||
|
<< __LINE__ << endl;
|
||
|
throw e;
|
||
|
}
|
||
|
|
||
|
// Points to the base offset within ebwt for the side currently
|
||
|
// being written
|
||
|
index_t side = 0;
|
||
|
|
||
|
// Whether we're assembling a forward or a reverse bucket
|
||
|
bool fw = true;
|
||
|
int sideCur = 0;
|
||
|
|
||
|
// Have we skipped the '$' in the last column yet?
|
||
|
ASSERT_ONLY(bool dollarSkipped = false);
|
||
|
|
||
|
index_t si = 0; // string offset (chars)
|
||
|
ASSERT_ONLY(index_t lastSufInt = 0);
|
||
|
ASSERT_ONLY(bool inSA = true); // true iff saI still points inside suffix
|
||
|
// array (as opposed to the padding at the
|
||
|
// end)
|
||
|
// Iterate over packed bwt bytes
|
||
|
VMSG_NL("Entering GFM loop");
|
||
|
ASSERT_ONLY(index_t beforeGbwtOff = (index_t)out1.tellp());
|
||
|
while(side < gbwtTotSz) {
|
||
|
// Sanity-check our cursor into the side buffer
|
||
|
assert_geq(sideCur, 0);
|
||
|
assert_lt(sideCur, (int)gh._sideGbwtSz);
|
||
|
assert_eq(0, side % sideSz); // 'side' must be on side boundary
|
||
|
gfmSide[sideCur] = 0; // clear
|
||
|
assert_lt(side + sideCur, gbwtTotSz);
|
||
|
// Iterate over bit-pairs in the si'th character of the BWT
|
||
|
#ifdef SIXTY4_FORMAT
|
||
|
for(int bpi = 0; bpi < 32; bpi++, si++)
|
||
|
#else
|
||
|
for(int bpi = 0; bpi < 4; bpi++, si++)
|
||
|
#endif
|
||
|
{
|
||
|
int bwtChar;
|
||
|
bool count = true;
|
||
|
if(si <= len) {
|
||
|
// Still in the SA; extract the bwtChar
|
||
|
index_t saElt = sa.nextSuffix();
|
||
|
// (that might have triggered sa to calc next suf block)
|
||
|
if(saElt == 0) {
|
||
|
// Don't add the '$' in the last column to the BWT
|
||
|
// transform; we can't encode a $ (only A C T or G)
|
||
|
// and counting it as, say, an A, will mess up the
|
||
|
// LR mapping
|
||
|
bwtChar = 0; count = false;
|
||
|
ASSERT_ONLY(dollarSkipped = true);
|
||
|
zOffs.push_back(si); // remember the SA row that
|
||
|
// corresponds to the 0th suffix
|
||
|
} else {
|
||
|
bwtChar = (int)(s[saElt-1]);
|
||
|
assert_lt(bwtChar, 4);
|
||
|
// Update the fchr
|
||
|
fchr[bwtChar]++;
|
||
|
}
|
||
|
// Update ftab
|
||
|
if((len-saElt) >= (index_t)gh._ftabChars) {
|
||
|
// Turn the first ftabChars characters of the
|
||
|
// suffix into an integer index into ftab. The
|
||
|
// leftmost (lowest index) character of the suffix
|
||
|
// goes in the most significant bit pair of the
|
||
|
// integer.
|
||
|
index_t sufInt = 0;
|
||
|
for(int i = 0; i < gh._ftabChars; i++) {
|
||
|
sufInt <<= 2;
|
||
|
assert_lt((index_t)i, len-saElt);
|
||
|
sufInt |= (unsigned char)(s[saElt+i]);
|
||
|
}
|
||
|
// Assert that this prefix-of-suffix is greater
|
||
|
// than or equal to the last one (true b/c the
|
||
|
// suffix array is sorted)
|
||
|
#ifndef NDEBUG
|
||
|
if(lastSufInt > 0) assert_geq(sufInt, lastSufInt);
|
||
|
lastSufInt = sufInt;
|
||
|
#endif
|
||
|
// Update ftab
|
||
|
assert_lt(sufInt+1, ftabLen);
|
||
|
ftab[sufInt+1]++;
|
||
|
if(absorbCnt > 0) {
|
||
|
// Absorb all short suffixes since the last
|
||
|
// transition into this transition
|
||
|
absorbFtab[sufInt] = absorbCnt;
|
||
|
absorbCnt = 0;
|
||
|
}
|
||
|
} else {
|
||
|
// Otherwise if suffix is fewer than ftabChars
|
||
|
// characters long, then add it to the 'absorbCnt';
|
||
|
// it will be absorbed into the next transition
|
||
|
assert_lt(absorbCnt, 255);
|
||
|
absorbCnt++;
|
||
|
}
|
||
|
// Suffix array offset boundary? - update offset array
|
||
|
if((si & gh._offMask) == si) {
|
||
|
assert_lt((si >> gh._offRate), gh._offsLen);
|
||
|
// Write offsets directly to the secondary output
|
||
|
// stream, thereby avoiding keeping them in memory
|
||
|
writeIndex<index_t>(out2, saElt, this->toBe());
|
||
|
}
|
||
|
} else {
|
||
|
// Strayed off the end of the SA, now we're just
|
||
|
// padding out a bucket
|
||
|
#ifndef NDEBUG
|
||
|
if(inSA) {
|
||
|
// Assert that we wrote all the characters in the
|
||
|
// string before now
|
||
|
assert_eq(si, len+1);
|
||
|
inSA = false;
|
||
|
}
|
||
|
#endif
|
||
|
// 'A' used for padding; important that padding be
|
||
|
// counted in the occ[] array
|
||
|
bwtChar = 0;
|
||
|
}
|
||
|
if(count) occ[bwtChar]++;
|
||
|
// Append BWT char to bwt section of current side
|
||
|
if(fw) {
|
||
|
// Forward bucket: fill from least to most
|
||
|
#ifdef SIXTY4_FORMAT
|
||
|
gfmSide[sideCur] |= ((uint64_t)bwtChar << (bpi << 1));
if(bwtChar > 0) assert_gt(gfmSide[sideCur], 0);
|
||
|
#else
|
||
|
pack_2b_in_8b(bwtChar, gfmSide[sideCur], bpi);
|
||
|
assert_eq((gfmSide[sideCur] >> (bpi*2)) & 3, bwtChar);
|
||
|
#endif
|
||
|
} else {
|
||
|
// Backward bucket: fill from most to least
|
||
|
#ifdef SIXTY4_FORMAT
|
||
|
gfmSide[sideCur] |= ((uint64_t)bwtChar << ((31 - bpi) << 1));
if(bwtChar > 0) assert_gt(gfmSide[sideCur], 0);
|
||
|
#else
|
||
|
pack_2b_in_8b(bwtChar, gfmSide[sideCur], 3-bpi);
|
||
|
assert_eq((gfmSide[sideCur] >> ((3-bpi)*2)) & 3, bwtChar);
|
||
|
#endif
|
||
|
}
|
||
|
} // end loop over bit-pairs
|
||
|
assert_eq(dollarSkipped ? 3 : 0, (occ[0] + occ[1] + occ[2] + occ[3]) & 3);
|
||
|
#ifdef SIXTY4_FORMAT
|
||
|
assert_eq(0, si & 31);
|
||
|
#else
|
||
|
assert_eq(0, si & 3);
|
||
|
#endif
|
||
|
|
||
|
sideCur++;
|
||
|
if(sideCur == (int)gh._sideGbwtSz) {
|
||
|
sideCur = 0;
|
||
|
index_t *uside = reinterpret_cast<index_t*>(gfmSide.ptr());
|
||
|
// Write the running 'A', 'C', 'G', 'T' tallies at the end of this side
|
||
|
side += sideSz;
|
||
|
assert_leq(side, gh._gbwtTotSz);
|
||
|
uside[(sideSz / sizeof(index_t))-4] = endianizeIndex(occSave[0], this->toBe());
|
||
|
uside[(sideSz / sizeof(index_t))-3] = endianizeIndex(occSave[1], this->toBe());
|
||
|
uside[(sideSz / sizeof(index_t))-2] = endianizeIndex(occSave[2], this->toBe());
|
||
|
uside[(sideSz / sizeof(index_t))-1] = endianizeIndex(occSave[3], this->toBe());
|
||
|
occSave[0] = occ[0];
|
||
|
occSave[1] = occ[1];
|
||
|
occSave[2] = occ[2];
|
||
|
occSave[3] = occ[3];
|
||
|
// Write backward side to primary file
|
||
|
out1.write((const char *)gfmSide.ptr(), sideSz);
|
||
|
}
|
||
|
}
|
||
|
VMSG_NL("Exited GFM loop");
|
||
|
if(absorbCnt > 0) {
|
||
|
// Absorb any trailing, as-yet-unabsorbed short suffixes into
|
||
|
// the last element of ftab
|
||
|
absorbFtab[ftabLen-1] = absorbCnt;
|
||
|
}
|
||
|
|
||
|
// Assert that our loop counter got incremented right to the end
|
||
|
assert_eq(side, gh._gbwtTotSz);
|
||
|
// Assert that we wrote the expected amount to out1
|
||
|
assert_eq(((index_t)out1.tellp() - beforeGbwtOff), gh._gbwtTotSz);
|
||
|
// assert that the last thing we did was write a forward bucket
|
||
|
|
||
|
//
|
||
|
// Write zOffs to primary stream
|
||
|
//
|
||
|
assert_eq(zOffs.size(), 1);
|
||
|
writeIndex<index_t>(out1, (index_t)zOffs.size(), this->toBe());
|
||
|
for(size_t i = 0; i < zOffs.size(); i++) {
|
||
|
assert_neq(zOffs[i], (index_t)OFF_MASK);
|
||
|
writeIndex<index_t>(out1, zOffs[i], this->toBe());
|
||
|
}
|
||
|
|
||
|
//
|
||
|
// Finish building fchr
|
||
|
//
|
||
|
// Exclusive prefix sum on fchr
|
||
|
for(int i = 1; i < 4; i++) {
|
||
|
fchr[i] += fchr[i-1];
|
||
|
}
|
||
|
assert_lt(fchr[3], gbwtLen);
|
||
|
// Shift everybody up by one
|
||
|
for(int i = 4; i >= 1; i--) {
|
||
|
fchr[i] = fchr[i-1];
|
||
|
}
|
||
|
fchr[0] = 0;
|
||
|
if(_verbose) {
|
||
|
for(int i = 0; i < 5; i++)
|
||
|
cerr << "fchr[" << "ACGT$"[i] << "]: " << fchr[i] << endl;
|
||
|
}
|
||
|
// Write fchr to primary file
|
||
|
for(int i = 0; i < 5; i++) {
|
||
|
writeIndex<index_t>(out1, fchr[i], this->toBe());
|
||
|
}
|
||
|
|
||
|
//
|
||
|
// Finish building ftab and build eftab
|
||
|
//
|
||
|
// Prefix sum on ftable
|
||
|
index_t eftabLen = 0;
|
||
|
assert_eq(0, absorbFtab[0]);
|
||
|
for(index_t i = 1; i < ftabLen; i++) {
|
||
|
if(absorbFtab[i] > 0) eftabLen += 2;
|
||
|
}
|
||
|
assert_leq(eftabLen, (index_t)gh._ftabChars*2);
|
||
|
eftabLen = gh._ftabChars*2;
|
||
|
EList<index_t> eftab(EBWT_CAT);
|
||
|
try {
|
||
|
eftab.resize(eftabLen);
|
||
|
eftab.fillZero();
|
||
|
} catch(bad_alloc &e) {
|
||
|
cerr << "Out of memory allocating eftab[] "
|
||
|
<< "in GFM::buildToDisk() at " << __FILE__ << ":"
|
||
|
<< __LINE__ << endl;
|
||
|
throw e;
|
||
|
}
|
||
|
index_t eftabCur = 0;
|
||
|
for(index_t i = 1; i < ftabLen; i++) {
|
||
|
index_t lo = ftab[i] + GFM<index_t>::ftabHi(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, i-1);
|
||
|
if(absorbFtab[i] > 0) {
|
||
|
// Skip the number of short patterns indicated by absorbFtab[i]
|
||
|
index_t hi = lo + absorbFtab[i];
|
||
|
assert_lt(eftabCur*2+1, eftabLen);
|
||
|
eftab[eftabCur*2] = lo;
|
||
|
eftab[eftabCur*2+1] = hi;
|
||
|
ftab[i] = (eftabCur++) ^ (index_t)OFF_MASK; // insert pointer into eftab
|
||
|
assert_eq(lo, GFM<index_t>::ftabLo(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, i));
|
||
|
assert_eq(hi, GFM<index_t>::ftabHi(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, i));
|
||
|
} else {
|
||
|
ftab[i] = lo;
|
||
|
}
|
||
|
}
|
||
|
assert_eq(GFM<index_t>::ftabHi(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, ftabLen-1), len+1);
|
||
|
// Write ftab to primary file
|
||
|
for(index_t i = 0; i < ftabLen; i++) {
|
||
|
writeIndex<index_t>(out1, ftab[i], this->toBe());
|
||
|
}
|
||
|
// Write eftab to primary file
|
||
|
out1pos = out1.tellp();
|
||
|
if(headerPos < 0) {
|
||
|
out1.seekp(24 + sizeof(index_t) * 3);
|
||
|
} else {
|
||
|
out1.seekp((int)headerPos + 16 + sizeof(index_t) * 2);
|
||
|
}
|
||
|
writeIndex<index_t>(out1, eftabLen, this->toBe());
|
||
|
out1.seekp(out1pos);
|
||
|
for(index_t i = 0; i < eftabLen; i++) {
|
||
|
writeIndex<index_t>(out1, eftab[i], this->toBe());
|
||
|
}
|
||
|
|
||
|
// Note: if you'd like to sanity-check the Ebwt, you'll have to
|
||
|
// read it back into memory first!
|
||
|
assert(!isInMemory());
|
||
|
VMSG_NL("Exiting GFM::buildToDisk()");
|
||
|
}
|
||
|
|
||
|
extern string gLastIOErrMsg;
|
||
|
|
||
|
/* Checks whether a call to read() failed or not. */
|
||
|
inline bool is_read_err(int fdesc, ssize_t ret, size_t count) {
|
||
|
if (ret < 0) {
|
||
|
std::stringstream sstm;
|
||
|
sstm << "ERRNO: " << errno << " ERR Msg:" << strerror(errno) << std::endl;
|
||
|
gLastIOErrMsg = sstm.str();
|
||
|
return true;
|
||
|
}
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
/* Checks whether a call to fread() failed or not. */
|
||
|
inline bool is_fread_err(FILE* file_hd, size_t ret, size_t count) {
|
||
|
if (ferror(file_hd)) {
|
||
|
gLastIOErrMsg = "Error Reading File!";
|
||
|
return true;
|
||
|
}
|
||
|
return false;
|
||
|
}
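/*
 * Usage sketch for the helpers above (illustrative only; 'fd', 'buf' and
 * 'count' are hypothetical):
 *
 *   ssize_t r = read(fd, buf, count);
 *   if(is_read_err(fd, r, count)) {
 *       cerr << gLastIOErrMsg << endl;
 *       throw 1;
 *   }
 */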
|
||
|
|
||
|
|
||
|
///////////////////////////////////////////////////////////////////////
//
// Functions for searching Ebwts
// (But most of them are defined in the header)
//
///////////////////////////////////////////////////////////////////////

/**
* Take an offset into the joined text and translate it into the
* reference of the index it falls on, the offset into the reference,
* and the length of the reference. Use a binary search through the
* sorted list of reference fragment ranges to find the fragment that
* contains the offset.
*/
|
||
|
template <typename index_t>
|
||
|
bool GFM<index_t>::joinedToTextOff(
|
||
|
index_t qlen,
|
||
|
index_t off,
|
||
|
index_t& tidx,
|
||
|
index_t& textoff,
|
||
|
index_t& tlen,
|
||
|
bool rejectStraddle,
|
||
|
bool& straddled) const
|
||
|
{
|
||
|
assert(rstarts() != NULL); // must have loaded rstarts
|
||
|
index_t top = 0;
|
||
|
index_t bot = _nFrag; // 1 greater than largest addressable element
|
||
|
index_t elt = (index_t)INDEX_MAX;
|
||
|
// Begin binary search
|
||
|
while(true) {
|
||
|
index_t oldelt = elt;
|
||
|
elt = top + ((bot - top) >> 1);
|
||
|
if(oldelt == elt) {
|
||
|
tidx = (index_t)INDEX_MAX;
|
||
|
return false;
|
||
|
}
|
||
|
index_t lower = rstarts()[elt*3];
|
||
|
index_t upper;
|
||
|
if(elt == _nFrag-1) {
|
||
|
upper = _gh._len;
|
||
|
} else {
|
||
|
upper = rstarts()[((elt+1)*3)];
|
||
|
}
|
||
|
assert_gt(upper, lower);
|
||
|
index_t fraglen = upper - lower;
|
||
|
if(lower <= off) {
|
||
|
if(upper > off) { // not last element, but it's within
|
||
|
// off is in this range; check if it falls off
|
||
|
if(off + qlen > upper) {
|
||
|
straddled = true;
|
||
|
if(rejectStraddle) {
|
||
|
// it falls off; signal no-go and return
|
||
|
tidx = (index_t)INDEX_MAX;
|
||
|
return false;
|
||
|
}
|
||
|
}
|
||
|
// This is the correct text idx whether the index is
|
||
|
// forward or reverse
|
||
|
tidx = rstarts()[(elt*3)+1];
|
||
|
assert_lt(tidx, this->_nPat);
|
||
|
assert_leq(fraglen, this->plen()[tidx]);
|
||
|
// it doesn't fall off; now calculate textoff.
|
||
|
// Initially it's the number of characters that precede
|
||
|
// the alignment in the fragment
|
||
|
index_t fragoff = off - rstarts()[(elt*3)];
|
||
|
if(!this->fw_) {
|
||
|
fragoff = fraglen - fragoff - 1;
|
||
|
fragoff -= (qlen-1);
|
||
|
}
|
||
|
// Add the alignment's offset into the fragment
|
||
|
// ('fragoff') to the fragment's offset within the text
|
||
|
textoff = fragoff + rstarts()[(elt*3)+2];
|
||
|
assert_lt(textoff, this->plen()[tidx]);
|
||
|
break; // done with binary search
|
||
|
} else {
|
||
|
// 'off' belongs somewhere in the region between elt
|
||
|
// and bot
|
||
|
top = elt;
|
||
|
}
|
||
|
} else {
|
||
|
// 'off' belongs somewhere in the region between top and
|
||
|
// elt
|
||
|
bot = elt;
|
||
|
}
|
||
|
// continue with binary search
|
||
|
}
|
||
|
tlen = this->plen()[tidx];
|
||
|
return true;
|
||
|
}
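// Worked example for joinedToTextOff() (illustrative; the fragment layout is
// made up): with rstarts() holding the triples {0, 0, 0} and {100, 1, 0} and
// _gh._len == 250, the joined string is a 100-bp fragment of reference 0
// followed by a 150-bp fragment of reference 1. For a forward index,
// off == 130 then resolves to tidx == 1, textoff == 30 and tlen == plen()[1];
// if off + qlen exceeded 250, the hit would be flagged as straddled.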
|
||
|
|
||
|
template <typename index_t>
|
||
|
bool GFM<index_t>::textOffToJoined(
|
||
|
index_t tid,
|
||
|
index_t textoff,
|
||
|
index_t& off) const
|
||
|
{
|
||
|
assert(rstarts() != NULL); // must have loaded rstarts
|
||
|
index_t top = 0;
|
||
|
index_t bot = _nFrag; // 1 greater than largest addressable element
|
||
|
index_t elt = (index_t)INDEX_MAX;
|
||
|
// Begin binary search
|
||
|
while(true) {
|
||
|
ASSERT_ONLY(index_t oldelt = elt);
|
||
|
elt = top + ((bot - top) >> 1);
|
||
|
assert_neq(oldelt, elt); // must have made progress
|
||
|
index_t elt_tid = rstarts()[elt*3 + 1];
|
||
|
if(elt_tid == tid) {
|
||
|
while(true) {
|
||
|
if(tid != rstarts()[elt*3+1]) {
|
||
|
return false;
|
||
|
}
|
||
|
if(rstarts()[elt*3 + 2] <= textoff) break;
|
||
|
if(elt == 0) return false;
|
||
|
elt--;
|
||
|
}
|
||
|
while(true) {
|
||
|
assert_leq(rstarts()[elt*3+2], textoff);
|
||
|
if(elt + 1 == _nFrag ||
|
||
|
tid + 1 == rstarts()[(elt+1)*3 + 1] ||
|
||
|
textoff < rstarts()[(elt+1)*3 + 2]) {
|
||
|
off = rstarts()[elt*3] + (textoff - rstarts()[elt*3 + 2]);
|
||
|
if(elt + 1 < _nFrag &&
|
||
|
tid == rstarts()[(elt+1)*3 + 1] &&
|
||
|
off >= rstarts()[(elt+1)*3]) {
|
||
|
return false;
|
||
|
}
|
||
|
break;
|
||
|
}
|
||
|
elt++;
|
||
|
}
|
||
|
break; // done with binary search
|
||
|
} else if(elt_tid < tid) {
|
||
|
top = elt;
|
||
|
} else {
|
||
|
bot = elt;
|
||
|
}
|
||
|
// continue with binary search
|
||
|
}
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
/**
* Walk 'steps' steps to the left and return the row arrived at. If we
* walk through the dollar sign, return INDEX_MAX.
*/
|
||
|
template <typename index_t>
|
||
|
index_t GFM<index_t>::walkLeft(index_t row, index_t steps) const {
|
||
|
assert(offs() != NULL);
|
||
|
assert_neq((index_t)INDEX_MAX, row);
|
||
|
SideLocus<index_t> l;
|
||
|
if(steps > 0) l.initFromRow(row, _gh, gfm());
|
||
|
while(steps > 0) {
|
||
|
for(index_t i = 0; i < _zOffs.size(); i++) {
|
||
|
if(row == _zOffs[i]) return (index_t)INDEX_MAX;
|
||
|
}
|
||
|
pair<index_t, index_t> range = this->mapGLF1(row, l, (pair<index_t, index_t> *)NULL ASSERT_ONLY(, false));
|
||
|
index_t newrow = range.first;
|
||
|
assert_neq((index_t)INDEX_MAX, newrow);
|
||
|
assert_neq(newrow, row);
|
||
|
row = newrow;
|
||
|
steps--;
|
||
|
if(steps > 0) l.initFromRow(row, _gh, gfm());
|
||
|
}
|
||
|
return row;
|
||
|
}
|
||
|
|
||
|
/**
* Resolve the reference offset of the BW element 'elt'.
*/
|
||
|
template <typename index_t>
|
||
|
index_t GFM<index_t>::getOffset(index_t row, index_t node) const {
|
||
|
assert(offs() != NULL);
|
||
|
assert_neq((index_t)INDEX_MAX, row);
|
||
|
for(index_t i = 0; i < _zOffs.size(); i++) {
|
||
|
if(row == _zOffs[i]) return 0;
|
||
|
}
|
||
|
if((node & _gh._offMask) == node) {
|
||
|
index_t off = this->offs()[node >> _gh._offRate];
|
||
|
if(off != (index_t)INDEX_MAX)
|
||
|
return off;
|
||
|
}
|
||
|
index_t jumps = 0;
|
||
|
SideLocus<index_t> l;
|
||
|
l.initFromRow(row, _gh, gfm());
|
||
|
while(true) {
|
||
|
pair<index_t, index_t> node_range(0, 0);
|
||
|
pair<index_t, index_t> range = this->mapGLF1(row, l, &node_range ASSERT_ONLY(, false));
|
||
|
index_t newrow = range.first;
|
||
|
jumps++;
|
||
|
assert_neq((index_t)INDEX_MAX, newrow);
|
||
|
assert_neq(newrow, row);
|
||
|
row = newrow;
|
||
|
for(index_t i = 0; i < _zOffs.size(); i++) {
|
||
|
if(row == _zOffs[i]) return jumps;
|
||
|
}
|
||
|
|
||
|
if((node_range.first & _gh._offMask) == node_range.first) {
|
||
|
index_t off = this->offs()[node_range.first >> _gh._offRate];
|
||
|
if(off != (index_t)INDEX_MAX)
|
||
|
return jumps + off;
|
||
|
}
|
||
|
l.initFromRow(row, _gh, gfm());
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
* Resolve the reference offset of the BW element 'elt' such that
* the offset returned is at the right-hand side of the forward
* reference substring involved in the hit.
*/
|
||
|
template <typename index_t>
|
||
|
index_t GFM<index_t>::getOffset(
|
||
|
index_t elt,
|
||
|
bool fw,
|
||
|
index_t hitlen) const
|
||
|
{
|
||
|
index_t off = getOffset(elt);
|
||
|
assert_neq((index_t)INDEX_MAX, off);
|
||
|
if(!fw) {
|
||
|
assert_lt(off, _gh._len);
|
||
|
off = _gh._len - off - 1;
|
||
|
assert_geq(off, hitlen-1);
|
||
|
off -= (hitlen-1);
|
||
|
assert_lt(off, _gh._len);
|
||
|
}
|
||
|
return off;
|
||
|
}
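// Numeric sketch of the reverse-strand arithmetic above (made-up values):
// with _gh._len == 100 and hitlen == 10, an offset of 20 in the reverse
// index becomes 100 - 20 - 1 == 79, the mirrored coordinate, and then
// 79 - (10 - 1) == 70 after shifting to the other end of the hit.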
|
||
|
|
||
|
/**
* Returns true iff the index contains the given string (exactly). The given
* string must contain only unambiguous characters. TODO: support ambiguous
* characters in 'str'.
*/
|
||
|
template <typename index_t>
|
||
|
bool GFM<index_t>::contains(
|
||
|
const BTDnaString& str,
|
||
|
index_t *otop,
|
||
|
index_t *obot) const
|
||
|
{
|
||
|
assert(isInMemory());
|
||
|
SideLocus<index_t> tloc, bloc;
|
||
|
if(str.empty()) {
|
||
|
if(otop != NULL && obot != NULL) *otop = *obot = 0;
|
||
|
return true;
|
||
|
}
|
||
|
int c = str[str.length()-1];
|
||
|
assert_range(0, 4, c);
|
||
|
index_t top = 0, bot = 0;
|
||
|
if(c < 4) {
|
||
|
top = fchr()[c];
|
||
|
bot = fchr()[c+1];
|
||
|
} else {
|
||
|
bool set = false;
|
||
|
for(int i = 0; i < 4; i++) {
|
||
|
if(fchr()[i] < fchr()[i+1]) {
if(set) {
return false;
} else {
set = true;
top = fchr()[i];
bot = fchr()[i+1];
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
assert_geq(bot, top);
|
||
|
tloc.initFromRow(top, gh(), gfm());
|
||
|
bloc.initFromRow(bot, gh(), gfm());
|
||
|
ASSERT_ONLY(index_t lastDiff = bot - top);
|
||
|
for(int64_t i = (int64_t)str.length()-2; i >= 0; i--) {
|
||
|
c = str[i];
|
||
|
assert_range(0, 4, c);
|
||
|
if(c <= 3) {
|
||
|
top = mapLF(tloc, c);
|
||
|
bot = mapLF(bloc, c);
|
||
|
} else {
|
||
|
index_t sz = bot - top;
|
||
|
int c1 = mapLF1(top, tloc ASSERT_ONLY(, false));
|
||
|
bot = mapLF(bloc, c1);
|
||
|
assert_leq(bot - top, sz);
|
||
|
if(bot - top < sz) {
|
||
|
// Encountered an N and could not proceed through it because
|
||
|
// there was more than one possible nucleotide we could replace
|
||
|
// it with
|
||
|
return false;
|
||
|
}
|
||
|
}
|
||
|
assert_geq(bot, top);
|
||
|
assert_leq(bot-top, lastDiff);
|
||
|
ASSERT_ONLY(lastDiff = bot-top);
|
||
|
if(i > 0) {
|
||
|
tloc.initFromRow(top, gh(), gfm());
|
||
|
bloc.initFromRow(bot, gh(), gfm());
|
||
|
}
|
||
|
}
|
||
|
if(otop != NULL && obot != NULL) {
|
||
|
*otop = top; *obot = bot;
|
||
|
}
|
||
|
return bot > top;
|
||
|
}
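/*
 * Usage sketch for contains() (illustrative; 'gfm' stands for a loaded
 * GFM<uint32_t> and 'query' for a BTDnaString already holding the pattern):
 *
 *   uint32_t top = 0, bot = 0;
 *   if(gfm.contains(query, &top, &bot)) {
 *       // rows [top, bot) hold the exact occurrences of 'query'
 *   }
 */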
|
||
|
|
||
|
///////////////////////////////////////////////////////////////////////
//
// Functions for reading and writing Ebwts
//
///////////////////////////////////////////////////////////////////////

/**
* Read an Ebwt from file with given filename.
*/
|
||
|
template <typename index_t>
|
||
|
void GFM<index_t>::readIntoMemory(
|
||
|
int needEntireRev,
|
||
|
bool loadSASamp,
|
||
|
bool loadFtab,
|
||
|
bool loadRstarts,
|
||
|
bool justHeader,
|
||
|
GFMParams<index_t> *params,
|
||
|
bool mmSweep,
|
||
|
bool loadNames,
|
||
|
bool startVerbose,
|
||
|
bool subIndex)
|
||
|
{
|
||
|
bool switchEndian; // dummy; caller doesn't care
|
||
|
#ifdef BOWTIE_MM
|
||
|
char *mmFile[] = { NULL, NULL };
|
||
|
#endif
|
||
|
if(_in1Str.length() > 0 && !subIndex) {
|
||
|
if(_verbose || startVerbose) {
|
||
|
cerr << " About to open input files: ";
|
||
|
logTime(cerr);
|
||
|
}
|
||
|
// Initialize our primary and secondary input-stream fields
|
||
|
if(_in1 != NULL) fclose(_in1);
|
||
|
if(_verbose || startVerbose) cerr << "Opening \"" << _in1Str.c_str() << "\"" << endl;
|
||
|
if((_in1 = fopen(_in1Str.c_str(), "rb")) == NULL) {
|
||
|
cerr << "Could not open index file " << _in1Str.c_str() << endl;
|
||
|
}
|
||
|
if(loadSASamp) {
|
||
|
if(_in2 != NULL) fclose(_in2);
|
||
|
if(_verbose || startVerbose) cerr << "Opening \"" << _in2Str.c_str() << "\"" << endl;
|
||
|
if((_in2 = fopen(_in2Str.c_str(), "rb")) == NULL) {
|
||
|
cerr << "Could not open index file " << _in2Str.c_str() << endl;
|
||
|
}
|
||
|
}
|
||
|
if(_verbose || startVerbose) {
|
||
|
cerr << " Finished opening input files: ";
|
||
|
logTime(cerr);
|
||
|
}
|
||
|
|
||
|
#ifdef BOWTIE_MM
|
||
|
if(_useMm /*&& !justHeader*/) {
|
||
|
const char *names[] = {_in1Str.c_str(), _in2Str.c_str()};
|
||
|
int fds[] = { fileno(_in1), fileno(_in2) };
|
||
|
for(int i = 0; i < (loadSASamp ? 2 : 1); i++) {
|
||
|
if(_verbose || startVerbose) {
|
||
|
cerr << " Memory-mapping input file " << (i+1) << ": ";
|
||
|
logTime(cerr);
|
||
|
}
|
||
|
struct stat sbuf;
|
||
|
if (stat(names[i], &sbuf) == -1) {
|
||
|
perror("stat");
|
||
|
cerr << "Error: Could not stat index file " << names[i] << " prior to memory-mapping" << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
mmFile[i] = (char*)mmap((void *)0, (size_t)sbuf.st_size,
|
||
|
PROT_READ, MAP_SHARED, fds[(size_t)i], 0);
|
||
|
if(mmFile[i] == (void *)(-1)) {
|
||
|
perror("mmap");
|
||
|
cerr << "Error: Could not memory-map the index file " << names[i] << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
if(mmSweep) {
|
||
|
int sum = 0;
|
||
|
for(off_t j = 0; j < sbuf.st_size; j += 1024) {
|
||
|
sum += (int) mmFile[i][j];
|
||
|
}
|
||
|
if(startVerbose) {
|
||
|
cerr << " Swept the memory-mapped ebwt index file 1; checksum: " << sum << ": ";
|
||
|
logTime(cerr);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
mmFile1_ = mmFile[0];
|
||
|
mmFile2_ = loadSASamp ? mmFile[1] : NULL;
|
||
|
}
|
||
|
#endif
|
||
|
}
|
||
|
#ifdef BOWTIE_MM
|
||
|
else if(_useMm && !justHeader) {
|
||
|
mmFile[0] = mmFile1_;
|
||
|
mmFile[1] = mmFile2_;
|
||
|
}
|
||
|
if(_useMm && !justHeader) {
|
||
|
assert(mmFile[0] == mmFile1_);
|
||
|
assert(mmFile[1] == mmFile2_);
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
if(_verbose || startVerbose) {
|
||
|
cerr << " Reading header: ";
|
||
|
logTime(cerr);
|
||
|
}
|
||
|
|
||
|
// Read endianness hints from both streams
|
||
|
size_t bytesRead = 0;
|
||
|
if(!subIndex) {
|
||
|
switchEndian = false;
|
||
|
uint32_t one = readU32(_in1, switchEndian); // 1st word of primary stream
|
||
|
bytesRead += 4;
|
||
|
if(loadSASamp) {
|
||
|
#ifndef NDEBUG
|
||
|
assert_eq(one, readU32(_in2, switchEndian)); // should match!
|
||
|
#else
|
||
|
readU32(_in2, switchEndian);
|
||
|
#endif
|
||
|
}
|
||
|
if(one != 1) {
|
||
|
assert_eq((1u<<24), one);
|
||
|
assert_eq(1, endianSwapU32(one));
|
||
|
switchEndian = true;
|
||
|
}
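// The builder writes the literal 1 as the first word of each stream, so
// reading 0x01000000 here means the index was written with the opposite
// byte order; record that so subsequent fields can be byte-swapped.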
|
||
|
|
||
|
_toBigEndian = switchEndian;
|
||
|
|
||
|
// Can't switch endianness and use memory-mapped files; in order to
|
||
|
// support this, someone has to modify the file to switch
|
||
|
// endiannesses appropriately, and we can't do this inside Bowtie
|
||
|
// or we might be setting up a race condition with other processes.
|
||
|
if(switchEndian && _useMm) {
|
||
|
cerr << "Error: Can't use memory-mapped files when the index is the opposite endianness" << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
|
||
|
// Reads header entries one by one from primary stream
|
||
|
int index_version = (int)readU32(_in1, switchEndian); bytesRead += 4;
|
||
|
int major_index_version, minor_index_version;
|
||
|
string index_version_extra;
|
||
|
readIndexVersion(index_version, major_index_version, minor_index_version, index_version_extra);
|
||
|
int major_program_version, minor_program_version;
|
||
|
string program_version_extra;
|
||
|
readProgramVersion(major_program_version, minor_program_version, program_version_extra);
|
||
|
if(major_program_version < major_index_version ||
|
||
|
(major_program_version == major_index_version && minor_program_version < minor_index_version)) {
|
||
|
cerr << "Warning: the current version of HISAT2 (" << HISAT2_VERSION << ") is older than the version (2."
|
||
|
<< major_index_version << "." << minor_index_version;
|
||
|
if(index_version_extra.length() > 0) {
|
||
|
cerr << "-" << index_version_extra;
|
||
|
}
|
||
|
cerr << ") used to build the index." << endl;
|
||
|
cerr << " Users are strongly recommended to update HISAT2 to the latest version." << endl;
|
||
|
}
|
||
|
} else {
|
||
|
switchEndian = _toBigEndian;
|
||
|
}
|
||
|
|
||
|
index_t len = readIndex<index_t>(_in1, switchEndian);
|
||
|
bytesRead += sizeof(index_t);
|
||
|
index_t gbwtLen = readIndex<index_t>(_in1, switchEndian);
|
||
|
bytesRead += sizeof(index_t);
|
||
|
assert_lt(len, gbwtLen);
|
||
|
index_t numNodes = readIndex<index_t>(_in1, switchEndian);
|
||
|
bytesRead += sizeof(index_t);
|
||
|
int32_t lineRate = readI32(_in1, switchEndian);
|
||
|
bytesRead += 4;
|
||
|
/*int32_t linesPerSide =*/ readI32(_in1, switchEndian);
|
||
|
bytesRead += 4;
|
||
|
int32_t offRate = readI32(_in1, switchEndian);
|
||
|
bytesRead += 4;
|
||
|
// TODO: add isaRate to the actual file format (right now, the
|
||
|
// user has to tell us whether there's an ISA sample and what the
|
||
|
// sampling rate is.
|
||
|
int32_t ftabChars = readI32(_in1, switchEndian);
|
||
|
bytesRead += 4;
|
||
|
index_t eftabLen = readIndex<index_t>(_in1, switchEndian);
|
||
|
bytesRead += sizeof(index_t);
|
||
|
// chunkRate was deprecated in an earlier version of Bowtie; now
|
||
|
// we use it to hold flags.
|
||
|
int32_t flags = readI32(_in1, switchEndian);
|
||
|
bool entireRev = false;
|
||
|
if(flags < 0 && (((-flags) & GFM_ENTIRE_REV) == 0)) {
|
||
|
if(needEntireRev != -1 && needEntireRev != 0) {
|
||
|
cerr << "Error: This index is compatible with 0.* versions of Bowtie, but not with 2.*" << endl
|
||
|
<< "versions. Please build or download a version of the index that is compitble" << endl
|
||
|
<< "with Bowtie 2.* (i.e. built with bowtie-build 2.* or later)" << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
} else entireRev = true;
|
||
|
bytesRead += 4;
|
||
|
|
||
|
// Create a new EbwtParams from the entries read from primary stream
|
||
|
GFMParams<index_t> *gh;
|
||
|
bool deleteGh = false;
|
||
|
if(params != NULL) {
|
||
|
params->init(len, gbwtLen, numNodes, lineRate, offRate, ftabChars, eftabLen, entireRev);
|
||
|
if(_verbose || startVerbose) params->print(cerr);
|
||
|
gh = params;
|
||
|
} else {
|
||
|
gh = new GFMParams<index_t>(len, gbwtLen, numNodes, lineRate, offRate, ftabChars, eftabLen, entireRev);
|
||
|
deleteGh = true;
|
||
|
}
|
||
|
|
||
|
// Set up overridden suffix-array-sample parameters
|
||
|
index_t offsLen = gh->_offsLen;
|
||
|
index_t offRateDiff = 0;
|
||
|
index_t offsLenSampled = offsLen;
|
||
|
if(_overrideOffRate > offRate) {
|
||
|
offRateDiff = _overrideOffRate - offRate;
|
||
|
}
|
||
|
if(offRateDiff > 0) {
|
||
|
offsLenSampled >>= offRateDiff;
|
||
|
if((offsLen & ~(((index_t)INDEX_MAX) << offRateDiff)) != 0) {
|
||
|
offsLenSampled++;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Can't override the offrate or isarate and use memory-mapped
|
||
|
// files; ultimately, all processes need to copy the sparser sample
|
||
|
// into their own memory spaces.
|
||
|
#if 0
|
||
|
if(_useMm && (offRateDiff)) {
|
||
|
cerr << "Error: Can't use memory-mapped files when the offrate is overridden" << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
// Read nPat from primary stream
|
||
|
this->_nPat = readIndex<index_t>(_in1, switchEndian);
|
||
|
bytesRead += sizeof(index_t);
|
||
|
_plen.reset();
|
||
|
// Read plen from primary stream
|
||
|
if(_useMm) {
|
||
|
#ifdef BOWTIE_MM
|
||
|
_plen.init((index_t*)(mmFile[0] + bytesRead), _nPat, false);
|
||
|
bytesRead += _nPat*sizeof(index_t);
|
||
|
fseek(_in1, _nPat*sizeof(index_t), SEEK_CUR);
|
||
|
#endif
|
||
|
} else {
|
||
|
try {
|
||
|
if(_verbose || startVerbose) {
|
||
|
cerr << "Reading plen (" << this->_nPat << "): ";
|
||
|
logTime(cerr);
|
||
|
}
|
||
|
_plen.init(new index_t[_nPat], _nPat, true);
|
||
|
if(switchEndian) {
|
||
|
for(index_t i = 0; i < this->_nPat; i++) {
|
||
|
plen()[i] = readIndex<index_t>(_in1, switchEndian);
|
||
|
}
|
||
|
} else {
|
||
|
size_t r = MM_READ(_in1, (void*)(plen()), _nPat*sizeof(index_t));
|
||
|
if(r != (size_t)(_nPat*sizeof(index_t))) {
|
||
|
cerr << "Error reading _plen[] array: " << r << ", " << _nPat*sizeof(index_t) << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
}
|
||
|
} catch(bad_alloc& e) {
|
||
|
cerr << "Out of memory allocating plen[] in Ebwt::read()"
|
||
|
<< " at " << __FILE__ << ":" << __LINE__ << endl;
|
||
|
throw e;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// TODO: I'm not consistent on what "header" means. Here I'm using
|
||
|
// "header" to mean everything that would exist in memory if we
|
||
|
// started to build the Ebwt but stopped short of the build*() step
|
||
|
// (i.e. everything up to and including join()).
|
||
|
if(justHeader) {
|
||
|
// Be kind
|
||
|
if(deleteGh) delete gh;
|
||
|
#ifdef BOWTIE_MM
|
||
|
fseek(_in1, 0, SEEK_SET);
|
||
|
if(loadSASamp) fseek(_in2, 0, SEEK_SET);
|
||
|
#else
|
||
|
rewind(_in1);
|
||
|
if(loadSASamp) rewind(_in2);
|
||
|
#endif
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
bool shmemLeader;
|
||
|
|
||
|
this->_nFrag = readIndex<index_t>(_in1, switchEndian);
|
||
|
bytesRead += sizeof(index_t);
|
||
|
if(_verbose || startVerbose) {
|
||
|
cerr << "Reading rstarts (" << this->_nFrag*3 << "): ";
|
||
|
logTime(cerr);
|
||
|
}
|
||
|
assert_geq(this->_nFrag, this->_nPat);
|
||
|
_rstarts.reset();
|
||
|
if(loadRstarts) {
|
||
|
if(_useMm) {
|
||
|
#ifdef BOWTIE_MM
|
||
|
_rstarts.init((index_t*)(mmFile[0] + bytesRead), _nFrag*3, false);
|
||
|
bytesRead += this->_nFrag*sizeof(index_t)*3;
|
||
|
fseek(_in1, this->_nFrag*sizeof(index_t)*3, SEEK_CUR);
|
||
|
#endif
|
||
|
} else {
|
||
|
_rstarts.init(new index_t[_nFrag*3], _nFrag*3, true);
|
||
|
if(switchEndian) {
|
||
|
for(size_t i = 0; i < (size_t)(this->_nFrag*3); i += 3) {
|
||
|
// fragment starting position in joined reference
|
||
|
// string, text id, and fragment offset within text
|
||
|
this->rstarts()[i] = readIndex<index_t>(_in1, switchEndian);
|
||
|
this->rstarts()[i+1] = readIndex<index_t>(_in1, switchEndian);
|
||
|
this->rstarts()[i+2] = readIndex<index_t>(_in1, switchEndian);
|
||
|
}
|
||
|
} else {
|
||
|
size_t r = MM_READ(_in1, (void *)rstarts(), this->_nFrag*sizeof(index_t)*3);
|
||
|
if(r != (size_t)(this->_nFrag*sizeof(index_t)*3)) {
|
||
|
cerr << "Error reading _rstarts[] array: " << r << ", " << (this->_nFrag*sizeof(index_t)*3) << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
} else {
|
||
|
// Skip em
|
||
|
assert(rstarts() == NULL);
|
||
|
bytesRead += this->_nFrag*sizeof(index_t)*3;
|
||
|
fseek(_in1, this->_nFrag*sizeof(index_t)*3, SEEK_CUR);
|
||
|
}
|
||
|
|
||
|
_gfm.reset();
|
||
|
if(_useMm) {
|
||
|
#ifdef BOWTIE_MM
|
||
|
_gfm.init((uint8_t*)(mmFile[0] + bytesRead), gh->_gbwtTotLen, false);
|
||
|
bytesRead += gh->_gbwtTotLen;
|
||
|
fseek(_in1, gh->_gbwtTotLen, SEEK_CUR);
|
||
|
#endif
|
||
|
} else {
|
||
|
// Allocate ebwt (big allocation)
|
||
|
if(_verbose || startVerbose) {
|
||
|
cerr << "Reading ebwt (" << gh->_gbwtTotLen << "): ";
|
||
|
logTime(cerr);
|
||
|
}
|
||
|
bool shmemLeader = true;
|
||
|
if(useShmem_) {
|
||
|
uint8_t *tmp = NULL;
|
||
|
shmemLeader = ALLOC_SHARED_U8(
|
||
|
(_in1Str + "[ebwt]"), gh->_gbwtTotLen, &tmp,
|
||
|
"gfm[]", (_verbose || startVerbose));
|
||
|
assert(tmp != NULL);
|
||
|
_gfm.init(tmp, gh->_gbwtTotLen, false);
|
||
|
if(_verbose || startVerbose) {
|
||
|
cerr << " shared-mem " << (shmemLeader ? "leader" : "follower") << endl;
|
||
|
}
|
||
|
} else {
|
||
|
try {
|
||
|
_gfm.init(new uint8_t[gh->_gbwtTotLen], gh->_gbwtTotLen, true);
|
||
|
} catch(bad_alloc& e) {
|
||
|
cerr << "Out of memory allocating the gfm[] array for the Bowtie index. Please try" << endl
|
||
|
<< "again on a computer with more memory." << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
}
|
||
|
if(shmemLeader) {
|
||
|
// Read ebwt from primary stream
|
||
|
uint64_t bytesLeft = gh->_gbwtTotLen;
|
||
|
char *pgbwt = (char*)this->gfm();
|
||
|
while (bytesLeft>0){
|
||
|
size_t r = MM_READ(this->_in1, (void *)pgbwt, bytesLeft);
|
||
|
if(MM_IS_IO_ERR(this->_in1, r, bytesLeft)) {
|
||
|
cerr << "Error reading _ebwt[] array: " << r << ", "
|
||
|
<< bytesLeft << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
pgbwt += r;
|
||
|
bytesLeft -= r;
|
||
|
}
|
||
|
if(switchEndian) {
|
||
|
uint8_t *side = this->gfm();
|
||
|
for(size_t i = 0; i < gh->_numSides; i++) {
|
||
|
index_t *cums = reinterpret_cast<index_t*>(side + gh->_sideSz - sizeof(index_t)*2);
|
||
|
cums[0] = endianSwapIndex(cums[0]);
|
||
|
cums[1] = endianSwapIndex(cums[1]);
|
||
|
side += this->_gh._sideSz;
|
||
|
}
|
||
|
}
|
||
|
#ifdef BOWTIE_SHARED_MEM
|
||
|
if(useShmem_) NOTIFY_SHARED(gfm(), gh->_gbwtTotLen);
|
||
|
#endif
|
||
|
} else {
|
||
|
// Seek past the data and wait until master is finished
|
||
|
fseek(_in1, gh->_gbwtTotLen, SEEK_CUR);
|
||
|
#ifdef BOWTIE_SHARED_MEM
|
||
|
if(useShmem_) WAIT_SHARED(gfm(), gh->_gbwtTotLen);
|
||
|
#endif
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Read zOff from primary stream
|
||
|
_zOffs.clear();
|
||
|
index_t num_zOffs = readIndex<index_t>(_in1, switchEndian);
|
||
|
bytesRead += sizeof(index_t);
|
||
|
for(index_t i = 0; i < num_zOffs; i++) {
|
||
|
index_t zOff = readIndex<index_t>(_in1, switchEndian);
|
||
|
bytesRead += sizeof(index_t);
|
||
|
assert_lt(zOff, gbwtLen);
|
||
|
_zOffs.push_back(zOff);
|
||
|
}
|
||
|
|
||
|
try {
|
||
|
// Read fchr from primary stream
|
||
|
if(_verbose || startVerbose) cerr << "Reading fchr (5)" << endl;
|
||
|
_fchr.reset();
|
||
|
if(_useMm) {
|
||
|
#ifdef BOWTIE_MM
|
||
|
_fchr.init((index_t*)(mmFile[0] + bytesRead), 5, false);
|
||
|
bytesRead += 5*sizeof(index_t);
|
||
|
fseek(_in1, 5*sizeof(index_t), SEEK_CUR);
|
||
|
#endif
|
||
|
} else {
|
||
|
_fchr.init(new index_t[5], 5, true);
|
||
|
for(int i = 0; i < 5; i++) {
|
||
|
this->fchr()[i] = readIndex<index_t>(_in1, switchEndian);
|
||
|
assert_leq(this->fchr()[i], gbwtLen);
|
||
|
assert(i <= 0 || this->fchr()[i] >= this->fchr()[i-1]);
|
||
|
}
|
||
|
}
|
||
|
assert_gt(this->fchr()[4], this->fchr()[0]);
|
||
|
// Read ftab from primary stream
|
||
|
if(_verbose || startVerbose) {
|
||
|
if(loadFtab) {
|
||
|
cerr << "Reading ftab (" << gh->_ftabLen << "): ";
|
||
|
logTime(cerr);
|
||
|
} else {
|
||
|
cerr << "Skipping ftab (" << gh->_ftabLen << "): ";
|
||
|
}
|
||
|
}
|
||
|
_ftab.reset();
|
||
|
if(loadFtab) {
|
||
|
if(_useMm) {
|
||
|
#ifdef BOWTIE_MM
|
||
|
_ftab.init((index_t*)(mmFile[0] + bytesRead), gh->_ftabLen, false);
|
||
|
bytesRead += gh->_ftabLen*sizeof(index_t);
|
||
|
fseek(_in1, gh->_ftabLen*sizeof(index_t), SEEK_CUR);
|
||
|
#endif
|
||
|
} else {
|
||
|
_ftab.init(new index_t[gh->_ftabLen], gh->_ftabLen, true);
|
||
|
if(switchEndian) {
|
||
|
for(size_t i = 0; i < gh->_ftabLen; i++)
|
||
|
this->ftab()[i] = readIndex<index_t>(_in1, switchEndian);
|
||
|
} else {
|
||
|
size_t r = MM_READ(_in1, (void *)ftab(), gh->_ftabLen*sizeof(index_t));
|
||
|
if(r != (size_t)(gh->_ftabLen*sizeof(index_t))) {
|
||
|
cerr << "Error reading _ftab[] array: " << r << ", " << (gh->_ftabLen*sizeof(index_t)) << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
// Read eftab from primary stream
|
||
|
if(_verbose || startVerbose) {
|
||
|
if(loadFtab) {
|
||
|
cerr << "Reading eftab (" << gh->_eftabLen << "): ";
|
||
|
logTime(cerr);
|
||
|
} else {
|
||
|
cerr << "Skipping eftab (" << gh->_eftabLen << "): ";
|
||
|
}
|
||
|
|
||
|
}
|
||
|
_eftab.reset();
|
||
|
if(_useMm) {
|
||
|
#ifdef BOWTIE_MM
|
||
|
_eftab.init((index_t*)(mmFile[0] + bytesRead), gh->_eftabLen, false);
|
||
|
bytesRead += gh->_eftabLen*sizeof(index_t);
|
||
|
fseek(_in1, gh->_eftabLen*sizeof(index_t), SEEK_CUR);
|
||
|
#endif
|
||
|
} else {
|
||
|
_eftab.init(new index_t[gh->_eftabLen], gh->_eftabLen, true);
|
||
|
if(switchEndian) {
|
||
|
for(size_t i = 0; i < gh->_eftabLen; i++)
|
||
|
this->eftab()[i] = readIndex<index_t>(_in1, switchEndian);
|
||
|
} else {
|
||
|
size_t r = MM_READ(_in1, (void *)this->eftab(), gh->_eftabLen*sizeof(index_t));
|
||
|
if(r != (size_t)(gh->_eftabLen*sizeof(index_t))) {
|
||
|
cerr << "Error reading _eftab[] array: " << r << ", " << (gh->_eftabLen*sizeof(index_t)) << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
for(index_t i = 0; i < gh->_eftabLen; i++) {
|
||
|
if(i > 0 && this->eftab()[i] > 0) {
|
||
|
assert_geq(this->eftab()[i] + 4, this->eftab()[i-1]);
|
||
|
} else if(i > 0 && this->eftab()[i-1] == 0) {
|
||
|
assert_eq(0, this->eftab()[i]);
|
||
|
}
|
||
|
}
|
||
|
} else {
|
||
|
assert(ftab() == NULL);
|
||
|
assert(eftab() == NULL);
|
||
|
// Skip ftab
|
||
|
bytesRead += gh->_ftabLen*sizeof(index_t);
|
||
|
fseek(_in1, gh->_ftabLen*sizeof(index_t), SEEK_CUR);
|
||
|
// Skip eftab
|
||
|
bytesRead += sizeof(index_t);
|
||
|
bytesRead += gh->_eftabLen*sizeof(index_t);
|
||
|
fseek(_in1, gh->_eftabLen*sizeof(index_t), SEEK_CUR);
|
||
|
}
|
||
|
} catch(bad_alloc& e) {
|
||
|
cerr << "Out of memory allocating fchr[], ftab[] or eftab[] arrays for the Bowtie index." << endl
|
||
|
<< "Please try again on a computer with more memory." << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
|
||
|
// Read reference sequence names from primary index file (or not,
|
||
|
// if --refidx is specified)
|
||
|
if(loadNames) {
|
||
|
while(true) {
|
||
|
char c = '\0';
|
||
|
if(MM_READ(_in1, (void *)(&c), (size_t)1) != (size_t)1) break;
|
||
|
bytesRead++;
|
||
|
if(c == '\0') break;
|
||
|
else if(c == '\n') {
|
||
|
this->_refnames.push_back("");
|
||
|
} else {
|
||
|
if(this->_refnames.size() == 0) {
|
||
|
this->_refnames.push_back("");
|
||
|
}
|
||
|
this->_refnames.back().push_back(c);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
_offs.reset();
|
||
|
if(loadSASamp) {
|
||
|
bytesRead = 4; // reset for secondary index file (already read 1-sentinel)
|
||
|
|
||
|
shmemLeader = true;
|
||
|
if(_verbose || startVerbose) {
|
||
|
cerr << "Reading offs (" << offsLenSampled << " " << std::setw(2) << sizeof(index_t)*8 << "-bit words): ";
|
||
|
logTime(cerr);
|
||
|
}
|
||
|
|
||
|
if(!_useMm) {
|
||
|
if(!useShmem_) {
|
||
|
// Allocate offs_
|
||
|
try {
|
||
|
_offs.init(new index_t[offsLenSampled], offsLenSampled, true);
|
||
|
} catch(bad_alloc& e) {
|
||
|
cerr << "Out of memory allocating the offs[] array for the Bowtie index." << endl
|
||
|
<< "Please try again on a computer with more memory." << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
} else {
|
||
|
index_t *tmp = NULL;
|
||
|
shmemLeader = ALLOC_SHARED_U32(
|
||
|
(_in2Str + "[offs]"), offsLenSampled*sizeof(index_t), &tmp,
|
||
|
"offs", (_verbose || startVerbose));
|
||
|
_offs.init((index_t*)tmp, offsLenSampled, false);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if(_overrideOffRate < 32) {
|
||
|
if(shmemLeader) {
|
||
|
// Allocate offs (big allocation)
|
||
|
if(switchEndian || offRateDiff > 0) {
|
||
|
assert(!_useMm);
|
||
|
const index_t blockMaxSz = (index_t)(2 * 1024 * 1024); // 2 MB block size
|
||
|
const index_t blockMaxSzU = (blockMaxSz / sizeof(index_t)); // # U32s per block
|
||
|
char *buf;
|
||
|
try {
|
||
|
buf = new char[blockMaxSz];
|
||
|
} catch(std::bad_alloc& e) {
|
||
|
cerr << "Error: Out of memory allocating part of _offs array: '" << e.what() << "'" << endl;
|
||
|
throw e;
|
||
|
}
|
||
|
for(index_t i = 0; i < offsLen; i += blockMaxSzU) {
|
||
|
index_t block = min<index_t>((index_t)blockMaxSzU, (index_t)(offsLen - i));
|
||
|
size_t r = MM_READ(_in2, (void *)buf, block * sizeof(index_t));
|
||
|
if(r != (size_t)(block * sizeof(index_t))) {
|
||
|
cerr << "Error reading block of _offs[] array: " << r << ", " << (block * sizeof(index_t)) << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
index_t idx = i >> offRateDiff;
|
||
|
for(index_t j = 0; j < block; j += (1 << offRateDiff)) {
|
||
|
assert_lt(idx, offsLenSampled);
|
||
|
this->offs()[idx] = ((index_t*)buf)[j];
|
||
|
if(switchEndian) {
|
||
|
this->offs()[idx] = endianSwapIndex(this->offs()[idx]);
|
||
|
}
|
||
|
idx++;
|
||
|
}
|
||
|
}
|
||
|
delete[] buf;
|
||
|
} else {
|
||
|
if(_useMm) {
|
||
|
#ifdef BOWTIE_MM
|
||
|
_offs.init((index_t*)(mmFile[1] + bytesRead), offsLen, false);
|
||
|
bytesRead += (offsLen * sizeof(index_t));
|
||
|
fseek(_in2, (offsLen * sizeof(index_t)), SEEK_CUR);
|
||
|
#endif
|
||
|
} else {
|
||
|
// Workaround for small-index mode where MM_READ may
|
||
|
// not be able to handle read amounts greater than 2^32
|
||
|
// bytes.
|
||
|
uint64_t bytesLeft = (offsLen * sizeof(index_t));
|
||
|
char *offs = (char *)this->offs();
|
||
|
|
||
|
while(bytesLeft > 0) {
|
||
|
size_t r = MM_READ(_in2, (void*)offs, bytesLeft);
|
||
|
if(MM_IS_IO_ERR(_in2,r,bytesLeft)) {
|
||
|
cerr << "Error reading block of _offs[] array: "
|
||
|
<< r << ", " << bytesLeft << gLastIOErrMsg << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
offs += r;
|
||
|
bytesLeft -= r;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
#ifdef BOWTIE_SHARED_MEM
|
||
|
if(useShmem_) NOTIFY_SHARED(offs(), offsLenSampled*sizeof(index_t));
|
||
|
#endif
|
||
|
} else {
|
||
|
// Not the shmem leader
|
||
|
fseek(_in2, offsLenSampled*sizeof(index_t), SEEK_CUR);
|
||
|
#ifdef BOWTIE_SHARED_MEM
|
||
|
if(useShmem_) WAIT_SHARED(offs(), offsLenSampled*sizeof(index_t));
|
||
|
#endif
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
this->postReadInit(*gh); // Initialize fields of Ebwt not read from file
|
||
|
if(_verbose || startVerbose) print(cerr, *gh);
|
||
|
|
||
|
// The fact that _ebwt and friends actually point to something
|
||
|
// (other than NULL) now signals to other member functions that the
|
||
|
// Ebwt is loaded into memory.
|
||
|
|
||
|
// Be kind
|
||
|
if(deleteGh) delete gh;
|
||
|
|
||
|
if(!subIndex) {
|
||
|
#ifdef BOWTIE_MM
|
||
|
fseek(_in1, 0, SEEK_SET);
|
||
|
if(loadSASamp) fseek(_in2, 0, SEEK_SET);
|
||
|
#else
|
||
|
rewind(_in1);
|
||
|
if(loadSASamp) rewind(_in2);
|
||
|
#endif
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
* Read reference names from an input stream 'in' for an Ebwt primary
* file and store them in 'refnames'.
*/
|
||
|
template <typename index_t>
|
||
|
void readGFMRefnames(istream& in, EList<string>& refnames) {
|
||
|
// _in1 must already be open with the get cursor at the
|
||
|
// beginning and no error flags set.
|
||
|
assert(in.good());
|
||
|
assert_eq((streamoff)in.tellg(), ios::beg);
|
||
|
|
||
|
// Read endianness hints from both streams
|
||
|
bool switchEndian = false;
|
||
|
uint32_t one = readU32(in, switchEndian); // 1st word of primary stream
|
||
|
if(one != 1) {
|
||
|
assert_eq((1u<<24), one);
|
||
|
switchEndian = true;
|
||
|
}
|
||
|
|
||
|
// Reads header entries one by one from primary stream
|
||
|
readU32(in, switchEndian); // version
|
||
|
index_t len = readIndex<index_t>(in, switchEndian);
|
||
|
index_t gbwtLen = readIndex<index_t>(in, switchEndian);
|
||
|
index_t numNodes = readIndex<index_t>(in, switchEndian);
|
||
|
int32_t lineRate = readI32(in, switchEndian);
|
||
|
/*int32_t linesPerSide =*/ readI32(in, switchEndian);
|
||
|
int32_t offRate = readI32(in, switchEndian);
|
||
|
int32_t ftabChars = readI32(in, switchEndian);
|
||
|
index_t eftabLen = readIndex<index_t>(in, switchEndian);
|
||
|
// BTL: chunkRate is now deprecated
|
||
|
int32_t flags = readI32(in, switchEndian);
|
||
|
bool entireReverse = false;
|
||
|
if(flags < 0) {
|
||
|
entireReverse = (((-flags) & GFM_ENTIRE_REV) != 0);
|
||
|
}
|
||
|
|
||
|
// Create a new EbwtParams from the entries read from primary stream
|
||
|
GFMParams<index_t> gh(len, gbwtLen, numNodes, lineRate, offRate, ftabChars, eftabLen, entireReverse);
|
||
|
|
||
|
index_t nPat = readIndex<index_t>(in, switchEndian); // nPat
|
||
|
in.seekg(nPat*sizeof(index_t), ios_base::cur); // skip plen
|
||
|
|
||
|
// Skip rstarts
|
||
|
index_t nFrag = readIndex<index_t>(in, switchEndian);
|
||
|
in.seekg(nFrag*sizeof(index_t)*3, ios_base::cur);
|
||
|
|
||
|
// Skip ebwt
|
||
|
in.seekg(gh._gbwtTotLen, ios_base::cur);
|
||
|
|
||
|
// Skip zOff from primary stream
|
||
|
index_t numZOffs = readIndex<index_t>(in, switchEndian);
|
||
|
in.seekg(numZOffs * sizeof(index_t), ios_base::cur);
|
||
|
|
||
|
// Skip fchr
|
||
|
in.seekg(5 * sizeof(index_t), ios_base::cur);
|
||
|
|
||
|
// Skip ftab
|
||
|
in.seekg(gh._ftabLen*sizeof(index_t), ios_base::cur);
|
||
|
|
||
|
// Skip eftab
|
||
|
in.seekg(gh._eftabLen*sizeof(index_t), ios_base::cur);
|
||
|
|
||
|
// Read reference sequence names from primary index file
|
||
|
while(true) {
|
||
|
char c = '\0';
|
||
|
in.read(&c, 1);
|
||
|
if(in.eof()) break;
|
||
|
if(c == '\0') break;
|
||
|
else if(c == '\n') {
|
||
|
refnames.push_back("");
|
||
|
} else {
|
||
|
if(refnames.size() == 0) {
|
||
|
refnames.push_back("");
|
||
|
}
|
||
|
refnames.back().push_back(c);
|
||
|
}
|
||
|
}
|
||
|
if(refnames.back().empty()) {
|
||
|
refnames.pop_back();
|
||
|
}
|
||
|
|
||
|
// Be kind
|
||
|
in.clear(); in.seekg(0, ios::beg);
|
||
|
assert(in.good());
|
||
|
}
|
||
|
|
||
|
/**
* Read reference names from the index with basename 'instr' and store
* them in 'refnames'.
*/
|
||
|
template <typename index_t>
|
||
|
void readGFMRefnames(const string& instr, EList<string>& refnames) {
|
||
|
ifstream in;
|
||
|
// Initialize our primary and secondary input-stream fields
|
||
|
in.open((instr + ".1." + gfm_ext).c_str(), ios_base::in | ios::binary);
|
||
|
if(!in.is_open()) {
|
||
|
throw GFMFileOpenException("Cannot open file " + instr);
|
||
|
}
|
||
|
assert(in.is_open());
|
||
|
assert(in.good());
|
||
|
assert_eq((streamoff)in.tellg(), ios::beg);
|
||
|
readGFMRefnames<index_t>(in, refnames);
|
||
|
}
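/*
 * Usage sketch (illustrative; "genome" stands for any index basename, so the
 * file actually opened is "genome.1." + gfm_ext):
 *
 *   EList<string> names;
 *   readGFMRefnames<uint32_t>("genome", names);
 *   for(size_t i = 0; i < names.size(); i++) {
 *       cerr << names[i] << endl;
 *   }
 */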
|
||
|
|
||
|
/**
* Read just enough of the Ebwt's header to get its flags.
*/
|
||
|
template <typename index_t>
|
||
|
int32_t GFM<index_t>::readVersionFlags(const string& instr, int& major, int& minor, string& extra_version) {
|
||
|
ifstream in;
|
||
|
// Initialize our primary and secondary input-stream fields
|
||
|
in.open((instr + ".1." + gfm_ext).c_str(), ios_base::in | ios::binary);
|
||
|
if(!in.is_open()) {
|
||
|
throw GFMFileOpenException("Cannot open file " + instr);
|
||
|
}
|
||
|
assert(in.is_open());
|
||
|
assert(in.good());
|
||
|
bool switchEndian = false;
|
||
|
uint32_t one = readU32(in, switchEndian); // 1st word of primary stream
|
||
|
if(one != 1) {
|
||
|
assert_eq((1u<<24), one);
|
||
|
assert_eq(1, endianSwapU32(one));
|
||
|
switchEndian = true;
|
||
|
}
|
||
|
index_t version = readU32(in, switchEndian);
|
||
|
readIndexVersion(version, major, minor, extra_version);
|
||
|
|
||
|
readIndex<index_t>(in, switchEndian);
|
||
|
readIndex<index_t>(in, switchEndian);
|
||
|
readIndex<index_t>(in, switchEndian);
|
||
|
readI32(in, switchEndian);
|
||
|
readI32(in, switchEndian);
|
||
|
readI32(in, switchEndian);
|
||
|
readI32(in, switchEndian);
|
||
|
readIndex<index_t>(in, switchEndian);
|
||
|
int32_t flags = readI32(in, switchEndian);
|
||
|
return flags;
|
||
|
}
|
||
|
|
||
|
|
||
|
/**
* Write an extended Burrows-Wheeler transform to a pair of output
* streams.
*
* @param justHeader if true, write only the up-front header fields
* @param out1 output stream to primary file
* @param out2 output stream to secondary file
*/
|
||
|
template <typename index_t>
|
||
|
void GFM<index_t>::writeFromMemory(bool justHeader,
|
||
|
ostream& out1,
|
||
|
ostream& out2) const
|
||
|
{
|
||
|
const GFMParams<index_t>& gh = this->_gh;
|
||
|
assert(gh.repOk());
|
||
|
uint32_t be = this->toBe();
|
||
|
assert(out1.good());
|
||
|
assert(out2.good());
|
||
|
|
||
|
// When building an Ebwt, these header parameters are known
|
||
|
// "up-front", i.e., they can be written to disk immediately,
|
||
|
// before we join() or buildToDisk()
|
||
|
writeI32(out1, 1, be); // endian hint for primary stream
|
||
|
writeI32(out2, 1, be); // endian hint for secondary stream
|
||
|
int version = getIndexVersion();
|
||
|
writeI32(out1, version, be); // version
|
||
|
writeIndex<index_t>(out1, gh._len, be); // length of string (and bwt and suffix array)
|
||
|
writeIndex<index_t>(out1, 0, be); // dummy for gbwt len
|
||
|
writeIndex<index_t>(out1, 0, be); // dummy for number of nodes
|
||
|
writeI32(out1, gh._lineRate, be); // 2^lineRate = size in bytes of 1 line
|
||
|
writeI32(out1, 2, be); // not used
|
||
|
writeI32(out1, gh._offRate, be); // every 2^offRate chars is "marked"
|
||
|
writeI32(out1, gh._ftabChars, be); // number of 2-bit chars used to address ftab
|
||
|
writeIndex<index_t>(out1, 0, be); // eftab length
|
||
|
int32_t flags = 1;
|
||
|
if(gh._entireReverse) flags |= GFM_ENTIRE_REV;
|
||
|
writeI32(out1, -flags, be); // BTL: chunkRate is now deprecated
|
||
|
|
||
|
if(!justHeader) {
|
||
|
assert(rstarts() != NULL);
|
||
|
assert(offs() != NULL);
|
||
|
assert(ftab() != NULL);
|
||
|
assert(eftab() != NULL);
|
||
|
assert(isInMemory());
|
||
|
// These Ebwt parameters are known after the inputs strings have
|
||
|
// been joined() but before they have been built(). These can
|
||
|
// written to the disk next and then discarded from memory.
|
||
|
writeIndex<index_t>(out1, this->_nPat, be);
|
||
|
for(index_t i = 0; i < this->_nPat; i++)
|
||
|
writeIndex<index_t>(out1, this->plen()[i], be);
|
||
|
assert_geq(this->_nFrag, this->_nPat);
|
||
|
writeIndex<index_t>(out1, this->_nFrag, be);
|
||
|
for(size_t i = 0; i < this->_nFrag*3; i++)
|
||
|
writeIndex<index_t>(out1, this->rstarts()[i], be);
|
||
|
|
||
|
// These Ebwt parameters are discovered only as the Ebwt is being
|
||
|
// built (in buildToDisk()). Of these, only 'offs' and 'ebwt' are
|
||
|
// terribly large. 'ebwt' is written to the primary file and then
|
||
|
// discarded from memory as it is built; 'offs' is similarly
|
||
|
// written to the secondary file and discarded.
|
||
|
writeIndex<index_t>(out1, gh._gbwtTotLen, be);
|
||
|
out1.write((const char *)this->gfm(), gh._gbwtTotLen);
|
||
|
writeIndex<index_t>(out1, (index_t)_zOffs.size(), be);
|
||
|
for(index_t i = 0; i < _zOffs.size(); i++)
|
||
|
writeIndex<index_t>(out1, _zOffs[i], be);
|
||
|
index_t offsLen = gh._offsLen;
|
||
|
for(index_t i = 0; i < offsLen; i++)
|
||
|
writeIndex<index_t>(out2, this->offs()[i], be);
|
||
|
|
||
|
// 'fchr', 'ftab' and 'eftab' are not fully determined until the
|
||
|
// loop is finished, so they are written to the primary file after
|
||
|
// all of 'ebwt' has already been written and only then discarded
|
||
|
// from memory.
|
||
|
for(int i = 0; i < 5; i++)
|
||
|
writeIndex<index_t>(out1, this->fchr()[i], be);
|
||
|
for(index_t i = 0; i < gh._ftabLen; i++)
|
||
|
writeIndex<index_t>(out1, this->ftab()[i], be);
|
||
|
for(index_t i = 0; i < gh._eftabLen; i++)
|
||
|
writeIndex<index_t>(out1, this->eftab()[i], be);
|
||
|
}
|
||
|
}

/**
 * Given a pair of strings representing output filenames, and assuming
 * this Ebwt object is currently in memory, write out this Ebwt to the
 * specified files.
 *
 * If sanity-checking is enabled, then once the streams have been
 * fully written and closed, we reopen them and read them into a
 * (hopefully) exact copy of this Ebwt. We then assert that the
 * current Ebwt and the copy match in all of their fields.
 */
template <typename index_t>
void GFM<index_t>::writeFromMemory(bool justHeader,
                                   const string& out1,
                                   const string& out2) const
{
    ASSERT_ONLY(const GFMParams<index_t>& gh = this->_gh);
    assert(isInMemory());
    assert(gh.repOk());

    ofstream fout1(out1.c_str(), ios::binary);
    ofstream fout2(out2.c_str(), ios::binary);
    writeFromMemory(justHeader, fout1, fout2);
    fout1.close();
    fout2.close();

    // Read the file back in and assert that all components match
    if(_sanity) {
#if 0
        if(_verbose)
            cout << "Re-reading \"" << out1 << "\"/\"" << out2 << "\" for sanity check" << endl;
        Ebwt copy(out1, out2, _verbose, _sanity);
        assert(!isInMemory());
        copy.loadIntoMemory(eh._color ? 1 : 0, true, false, false);
        assert(isInMemory());
        assert_eq(eh._lineRate, copy.eh()._lineRate);
        assert_eq(eh._offRate, copy.eh()._offRate);
        assert_eq(eh._ftabChars, copy.eh()._ftabChars);
        assert_eq(eh._len, copy.eh()._len);
        assert_eq(_zOff, copy.zOff());
        assert_eq(_zEbwtBpOff, copy.zEbwtBpOff());
        assert_eq(_zEbwtByteOff, copy.zEbwtByteOff());
        assert_eq(_nPat, copy.nPat());
        for(index_t i = 0; i < _nPat; i++)
            assert_eq(this->_plen[i], copy.plen()[i]);
        assert_eq(this->_nFrag, copy.nFrag());
        for(size_t i = 0; i < this->_nFrag*3; i++) {
            assert_eq(this->_rstarts[i], copy.rstarts()[i]);
        }
        for(index_t i = 0; i < 5; i++)
            assert_eq(this->_fchr[i], copy.fchr()[i]);
        for(size_t i = 0; i < eh._ftabLen; i++)
            assert_eq(this->ftab()[i], copy.ftab()[i]);
        for(size_t i = 0; i < eh._eftabLen; i++)
            assert_eq(this->eftab()[i], copy.eftab()[i]);
        for(index_t i = 0; i < eh._offsLen; i++)
            assert_eq(this->_offs[i], copy.offs()[i]);
        for(index_t i = 0; i < eh._ebwtTotLen; i++)
            assert_eq(this->ebwt()[i], copy.ebwt()[i]);
        copy.sanityCheckAll();
        if(_verbose)
            cout << "Read-in check passed for \"" << out1 << "\"/\"" << out2 << "\"" << endl;
#endif
    }
}
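
/*
 * Usage sketch for writing an index out (illustrative only; "genome" is a
 * hypothetical basename, 'gfm' a hypothetical fully built, in-memory
 * GFM<uint32_t>, and the ".1."/".2." suffixes assume the usual
 * primary/secondary file naming):
 *
 *     gfm.writeFromMemory(false,
 *                         string("genome.1.") + gfm_ext,
 *                         string("genome.2.") + gfm_ext);
 */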

/**
 * Write the rstarts array given the szs array for the reference.
 */
template <typename index_t>
void GFM<index_t>::szsToDisk(const EList<RefRecord>& szs, ostream& os, int reverse) {
    size_t seq = 0;
    index_t off = 0;
    index_t totlen = 0;
    for(size_t i = 0; i < szs.size(); i++) {
        if(szs[i].len == 0) continue;
        if(szs[i].first) off = 0;
        off += szs[i].off;
        if(szs[i].first && szs[i].len > 0) seq++;
        index_t seqm1 = (index_t)(seq-1);
        assert_lt(seqm1, _nPat);
        index_t fwoff = off;
        if(reverse == REF_READ_REVERSE) {
            // Invert pattern idxs
            seqm1 = _nPat - seqm1 - 1;
            // Invert offset with respect to the forward strand
            assert_leq(off + szs[i].len, plen()[seqm1]);
            fwoff = plen()[seqm1] - (off + szs[i].len);
        }
        writeIndex<index_t>(os, totlen, this->toBe()); // offset from beginning of joined string
        writeIndex<index_t>(os, (index_t)seqm1, this->toBe()); // sequence id
        writeIndex<index_t>(os, (index_t)fwoff, this->toBe()); // offset into sequence
        totlen += szs[i].len;
        off += szs[i].len;
    }
}
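
/*
 * For reference, each fragment record written by szsToDisk() above is a
 * triple of index_t values; conceptually (the struct and field names below
 * are illustrative, not part of the on-disk format):
 *
 *     struct RstartRec {
 *         index_t joinedOff; // offset of the fragment in the joined string
 *         index_t seqIdx;    // 0-based reference sequence id
 *         index_t seqOff;    // forward-strand offset within that sequence
 *     };
 */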


///////////////////////////////////////////////////////////////////////
//
// Functions for printing and sanity-checking Ebwts
//
///////////////////////////////////////////////////////////////////////

/**
 * Check that the ebwt array is internally consistent up to (and not
 * including) the given side index by re-counting the chars and
 * comparing against the embedded occ[] arrays.
 */
template <typename index_t>
void GFM<index_t>::sanityCheckUpToSide(int upToSide) const {
    assert(isInMemory());
    index_t occ[] = {0, 0, 0, 0};
    ASSERT_ONLY(index_t occ_save[] = {0, 0, 0, 0});
    index_t cur = 0; // byte pointer
    const GFMParams<index_t>& gh = this->_gh;
    bool fw = false;
    while(cur < (upToSide * gh._sideSz)) {
        assert_leq(cur + gh._sideSz, gh._gbwtTotLen);
        for(index_t i = 0; i < gh._sideGbwtSz; i++) {
            uint8_t by = this->gfm()[cur + (fw ? i : gh._sideGbwtSz-i-1)];
            for(int j = 0; j < 4; j++) {
                // Unpack from lowest to highest bit pair
                int twoBit = unpack_2b_from_8b(by, fw ? j : 3-j);
                occ[twoBit]++;
            }
            assert_eq(0, (occ[0] + occ[1] + occ[2] + occ[3]) % 4);
        }
        assert_eq(0, (occ[0] + occ[1] + occ[2] + occ[3]) % gh._sideGbwtLen);
        // Finished forward bucket; check saved [A], [C], [G] and [T]
        // against the index_ts encoded here
        ASSERT_ONLY(const index_t *ugbwt = reinterpret_cast<const index_t*>(&gfm()[cur + gh._sideGbwtSz]));
        ASSERT_ONLY(index_t as = ugbwt[0]);
        ASSERT_ONLY(index_t cs = ugbwt[1]);
        ASSERT_ONLY(index_t gs = ugbwt[2]);
        ASSERT_ONLY(index_t ts = ugbwt[3]);
        assert(as == occ_save[0] || as == occ_save[0]-1);
        assert_eq(cs, occ_save[1]);
        assert_eq(gs, occ_save[2]);
        assert_eq(ts, occ_save[3]);
#ifndef NDEBUG
        occ_save[0] = occ[0];
        occ_save[1] = occ[1];
        occ_save[2] = occ[2];
        occ_save[3] = occ[3];
#endif
        cur += gh._sideSz;
    }
}
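
/*
 * A note on the 2-bit unpacking used above: each GBWT byte packs four bases,
 * two bits apiece, lowest pair first. The helper in bitpack.h behaves
 * roughly like this sketch (illustrative only; see bitpack.h for the actual
 * definition):
 *
 *     int unpack_2b_from_8b(uint8_t byte, int off) { // off in [0, 3]
 *         return (byte >> (off * 2)) & 0x3;
 *     }
 */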

/**
 * Sanity-check various pieces of the Ebwt
 */
template <typename index_t>
void GFM<index_t>::sanityCheckAll(int reverse) const {
    const GFMParams<index_t>& gh = this->_gh;
    assert(isInMemory());
    // Check ftab
    for(index_t i = 1; i < gh._ftabLen; i++) {
        assert_geq(this->ftabHi(i), this->ftabLo(i-1));
        assert_geq(this->ftabLo(i), this->ftabHi(i-1));
        assert_leq(this->ftabHi(i), gh._gbwtLen);
    }
    assert_eq(this->ftabHi(gh._ftabLen-1), gh._gbwtLen);

    // Check offs
    int seenLen = (gh._gbwtLen + 31) >> ((index_t)5);
    uint32_t *seen;
    try {
        seen = new uint32_t[seenLen]; // bitvector marking seen offsets
    } catch(bad_alloc& e) {
        cerr << "Out of memory allocating seen[] at " << __FILE__ << ":" << __LINE__ << endl;
        throw e;
    }
    memset(seen, 0, 4 * seenLen);
    index_t offsLen = gh._offsLen;
    for(index_t i = 0; i < offsLen; i++) {
        assert_lt(this->offs()[i], gh._gbwtLen);
        int w = this->offs()[i] >> 5;
        int r = this->offs()[i] & 31;
        assert_eq(0, (seen[w] >> r) & 1); // shouldn't have been seen before
        seen[w] |= (1 << r);
    }
    delete[] seen;

    // Check nPat
    assert_gt(this->_nPat, 0);

    // Check plen, flen
    for(index_t i = 0; i < this->_nPat; i++) {
        assert_geq(this->plen()[i], 0);
    }

    // Check rstarts
    if(this->rstarts() != NULL) {
        for(index_t i = 0; i < this->_nFrag-1; i++) {
            assert_gt(this->rstarts()[(i+1)*3], this->rstarts()[i*3]);
            if(reverse == REF_READ_REVERSE) {
                assert(this->rstarts()[(i*3)+1] >= this->rstarts()[((i+1)*3)+1]);
            } else {
                assert(this->rstarts()[(i*3)+1] <= this->rstarts()[((i+1)*3)+1]);
            }
        }
    }

    // Check ebwt
    sanityCheckUpToSide(gh._numSides);
    VMSG_NL("Ebwt::sanityCheck passed");
}

/**
 * Transform this Ebwt into the original string in linear time by using
 * the LF mapping to walk backwards starting at the row corresponding
 * to the end of the string. The result is written to s. The Ebwt
 * must be in memory.
 */
template <typename index_t>
void GFM<index_t>::restore(SString<char>& s) const {
    assert(isInMemory());
    s.resize(this->_gh._len);
    index_t jumps = 0;
    index_t i = this->_gh._len; // should point to final SA elt (starting with '$')
    SideLocus<index_t> l(i, this->_gh, this->gfm());
    while(true) {
        // Stop once we have walked back to one of the '$' rows
        bool atZOff = false;
        for(index_t j = 0; j < _zOffs.size(); j++) {
            if(i == _zOffs[j]) {
                atZOff = true;
                break;
            }
        }
        if(atZOff) break;
        assert_lt(jumps, this->_gh._len);
        //if(_verbose) cout << "restore: i: " << i << endl;
        // Not a marked row; go back a char in the original string
        index_t newi = mapLF(l ASSERT_ONLY(, false));
        assert_neq(newi, i);
        s[this->_gh._len - jumps - 1] = rowL(l);
        i = newi;
        l.initFromRow(i, this->_gh, this->gfm());
        jumps++;
    }
    assert_eq(jumps, this->_gh._len);
}
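
/*
 * Usage sketch for restore() (illustrative only; 'gfm' is a hypothetical
 * GFM<uint32_t> that has already been loaded into memory):
 *
 *     SString<char> text;
 *     gfm.restore(text);
 *     // 'text' now holds the joined, 2-bit-encoded original string,
 *     // with text.length() equal to the total reference length.
 */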

/**
 * Check that this Ebwt, when restored via restore(), matches up with
 * the given array of reference sequences. For sanity checking.
 */
template <typename index_t>
void GFM<index_t>::checkOrigs(
    const EList<SString<char> >& os,
    bool mirror) const
{
    SString<char> rest;
    restore(rest);
    index_t restOff = 0;
    size_t i = 0, j = 0;
    if(mirror) {
        // TODO: FIXME
        return;
    }
    while(i < os.size()) {
        size_t olen = os[i].length();
        int lastorig = -1;
        for(; j < olen; j++) {
            size_t joff = j;
            if(mirror) joff = olen - j - 1;
            if((int)os[i][joff] == 4) {
                // Skip over Ns
                lastorig = -1;
                if(!mirror) {
                    while(j < olen && (int)os[i][j] == 4) j++;
                } else {
                    while(j < olen && (int)os[i][olen-j-1] == 4) j++;
                }
                j--;
                continue;
            }
            assert_eq(os[i][joff], rest[restOff]);
            lastorig = (int)os[i][joff];
            restOff++;
        }
        if(j == os[i].length()) {
            // Moved to next sequence
            i++;
            j = 0;
        } else {
            // Just jumped over a gap
        }
    }
}

/**
 * Try to find the HISAT2 index specified by the user. First try the
 * exact path given by the user. Then try the user-provided string
 * appended onto the path of the "indexes" subdirectory below this
 * executable, then try the provided string appended onto
 * "$HISAT2_INDEXES/".
 */
string adjustEbwtBase(const string& cmdline,
                      const string& ebwtFileBase,
                      bool verbose = false);
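
/*
 * Illustrative call (paths are hypothetical): with the executable path in
 * 'argv0' and a user-supplied basename of "grch38",
 *
 *     string base = adjustEbwtBase(argv0, "grch38", true);
 *
 * returns the first candidate basename for which the index appears to exist,
 * trying "grch38", then "<directory of argv0>/indexes/grch38", then
 * "$HISAT2_INDEXES/grch38", per the search order described above.
 */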

#endif /*GFM_H_*/