/* * Copyright 2015, Daehwan Kim * * This file is part of HISAT 2. * * HISAT 2 is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * HISAT 2 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with HISAT 2. If not, see . */ #ifndef HGFM_H_ #define HGFM_H_ #include "hier_idx_common.h" #include "gfm.h" /** * Extended Burrows-Wheeler transform data. * LocalEbwt is a specialized Ebwt index that represents ~64K bps * and therefore uses two bytes as offsets within 64K bps. * This class has only two additional member variables to denote the genomic sequenuce it represents: * (1) the contig index and (2) the offset within the contig. * */ template class LocalGFM : public GFM { typedef GFM PARENT_CLASS; public: /// Construct an Ebwt from the given input file LocalGFM(const string& in, ALTDB* altdb, FILE *in5, FILE *in6, char *mmFile5, char *mmFile6, full_index_t& tidx, full_index_t& localOffset, full_index_t& joinedOffset, bool switchEndian, size_t& bytesRead, size_t& bytesRead2, int needEntireReverse, bool fw, int32_t overrideOffRate, // = -1, int32_t offRatePlus, // = -1, uint32_t lineRate, uint32_t offRate, uint32_t ftabChars, bool useMm, // = false, bool useShmem, // = false, bool mmSweep, // = false, bool loadNames, // = false, bool loadSASamp, // = true, bool loadFtab, // = true, bool loadRstarts, // = true, bool verbose, // = false, bool startVerbose, // = false, bool passMemExc, // = false, bool sanityCheck, // = false) bool useHaplotype) : // = false GFM(in, altdb, NULL, NULL, needEntireReverse, fw, overrideOffRate, offRatePlus, useMm, useShmem, mmSweep, loadNames, loadSASamp, loadFtab, loadRstarts, true, // load Splice Sites verbose, startVerbose, passMemExc, sanityCheck, useHaplotype, true) { this->_in1Str = in + ".5." + gfm_ext; this->_in2Str = in + ".5." + gfm_ext; readIntoMemory( in5, in6, mmFile5, mmFile6, tidx, localOffset, joinedOffset, switchEndian, bytesRead, bytesRead2, needEntireReverse, loadSASamp, loadFtab, loadRstarts, false, //justHeader lineRate, offRate, ftabChars, mmSweep, loadNames, startVerbose); _tidx = tidx; _localOffset = localOffset; _joinedOffset = joinedOffset; // If the offRate has been overridden, reflect that in the // _eh._offRate field if(offRatePlus > 0 && this->_overrideOffRate == -1) { this->_overrideOffRate = this->_gh._offRate + offRatePlus; } if(this->_overrideOffRate > this->_gh._offRate) { this->_gh.setOffRate(this->_overrideOffRate); assert_eq(this->_overrideOffRate, this->_gh._offRate); } assert(this->repOk()); } /// Construct an Ebwt from the given header parameters and string /// vector, optionally using a blockwise suffix sorter with the /// given 'bmax' and 'dcv' parameters. The string vector is /// ultimately joined and the joined string is passed to buildToDisk(). template LocalGFM( TStr& s, const EList& sa, PathGraph* pg, full_index_t tidx, full_index_t localOffset, full_index_t joinedOffset, EList >& alts, index_t local_size, bool packed, int needEntireReverse, int32_t lineRate, int32_t offRate, int32_t ftabChars, const string& file, // base filename for EBWT files bool fw, int dcv, EList& szs, index_t sztot, const RefReadInParams& refparams, uint32_t seed, ostream& out5, ostream& out6, int32_t overrideOffRate = -1, bool verbose = false, bool passMemExc = false, bool sanityCheck = false) : GFM(packed, needEntireReverse, lineRate, offRate, ftabChars, file, fw, dcv, szs, sztot, refparams, seed, overrideOffRate, verbose, passMemExc, sanityCheck) { const GFMParams& gh = this->_gh; assert(gh.repOk()); uint32_t be = this->toBe(); assert(out5.good()); assert(out6.good()); _tidx = tidx; _localOffset = localOffset; _joinedOffset = joinedOffset; writeIndex(out5, tidx, be); writeIndex(out5, localOffset, be); writeIndex(out5, joinedOffset, be); writeIndex(out5, gh._len, be); // length of string (and bwt and suffix array) streampos headerPos = out5.tellp(); writeIndex(out5, 0, be); // gbwtLen writeIndex(out5, 0, be); // num of nodes writeIndex(out5, 0, be); // eftabLen if(gh._len > 0) { assert_gt(szs.size(), 0); assert_gt(sztot, 0); // Not every fragment represents a distinct sequence - many // fragments may correspond to a single sequence. Count the // number of sequences here by counting the number of "first" // fragments. this->_nPat = 0; this->_nFrag = 0; for(size_t i = 0; i < szs.size(); i++) { if(szs[i].len > 0) this->_nFrag++; if(szs[i].first && szs[i].len > 0) this->_nPat++; } assert_eq(this->_nPat, 1); assert_geq(this->_nFrag, this->_nPat); this->_rstarts.reset(); writeIndex(out5, this->_nPat, be); assert_eq(this->_nPat, 1); this->_plen.init(new index_t[this->_nPat], this->_nPat); // For each pattern, set plen int npat = -1; for(size_t i = 0; i < szs.size(); i++) { if(szs[i].first && szs[i].len > 0) { if(npat >= 0) { writeIndex(out5, this->plen()[npat], be); } npat++; this->plen()[npat] = (szs[i].len + szs[i].off); } else { this->plen()[npat] += (szs[i].len + szs[i].off); } } assert_eq((index_t)npat, this->_nPat-1); writeIndex(out5, this->plen()[npat], be); // Write the number of fragments writeIndex(out5, this->_nFrag, be); if(refparams.reverse == REF_READ_REVERSE) { EList tmp(EBWT_CAT); reverseRefRecords(szs, tmp, false, verbose); this->szsToDisk(tmp, out5, refparams.reverse); } else { this->szsToDisk(szs, out5, refparams.reverse); } if(alts.empty()) { assert(pg == NULL); buildToDisk(sa, s, out5, out6, headerPos); } else { assert(pg != NULL); // Re-initialize GFM parameters to reflect real number of edges (gbwt string) this->_gh.init( this->_gh.len(), pg->getNumEdges(), pg->getNumNodes(), this->_gh.lineRate(), this->_gh.offRate(), this->_gh.ftabChars(), 0, this->_gh.entireReverse()); buildToDisk(*pg, s, out5, out6, headerPos); } } out5.flush(); out6.flush(); if(out5.fail() || out6.fail()) { cerr << "An error occurred writing the index to disk. Please check if the disk is full." << endl; throw 1; } } template void buildToDisk( PathGraph& gbwt, const TStr& s, ostream& out1, ostream& out2, streampos headerPos); template void buildToDisk( const EList& sa, const TStr& s, ostream& out1, ostream& out2, streampos headerPos); // I/O void readIntoMemory( FILE *in5, FILE *in6, char *mmFile5, char *mmFile6, full_index_t& tidx, full_index_t& localOffset, full_index_t& joinedOffset, bool switchEndian, size_t& bytesRead, size_t& bytesRead2, int needEntireRev, bool loadSASamp, bool loadFtab, bool loadRstarts, bool justHeader, int32_t lineRate, int32_t offRate, int32_t ftabChars, bool mmSweep, bool loadNames, bool startVerbose); /** * Sanity-check various pieces of the Ebwt */ void sanityCheckAll(int reverse) const { if(this->_gh._len > 0) { PARENT_CLASS::sanityCheckAll(reverse); } } bool empty() const { return this->_gh._len == 0; } public: full_index_t _tidx; full_index_t _localOffset; full_index_t _joinedOffset; }; /** * Build an Ebwt from a string 's' and its suffix array 'sa' (which * might actually be a suffix array *builder* that builds blocks of the * array on demand). The bulk of the Ebwt, i.e. the ebwt and offs * arrays, is written directly to disk. This is by design: keeping * those arrays in memory needlessly increases the footprint of the * building process. Instead, we prefer to build the Ebwt directly * "to disk" and then read it back into memory later as necessary. * * It is assumed that the header values and join-related values (nPat, * plen) have already been written to 'out1' before this function * is called. When this function is finished, it will have * additionally written ebwt, zOff, fchr, ftab and eftab to the primary * file and offs to the secondary file. * * Assume DNA/RNA/any alphabet with 4 or fewer elements. * Assume occ array entries are 32 bits each. * * @param sa the suffix array to convert to a Ebwt * @param s the original string * @param out */ template template void LocalGFM::buildToDisk( PathGraph& gbwt, const TStr& s, ostream& out5, ostream& out6, streampos headerPos) { assert_leq(s.length(), std::numeric_limits::max()); const GFMParams& gh = this->_gh; assert(gh.repOk()); assert_lt(s.length(), gh.gbwtLen()); assert_eq(s.length(), gh._len); assert_gt(gh._lineRate, 3); index_t gbwtLen = gh._gbwtLen; streampos out5pos = out5.tellp(); out5.seekp(headerPos); writeIndex(out5, gbwtLen, this->toBe()); writeIndex(out5, gh._numNodes, this->toBe()); headerPos = out5.tellp(); out5.seekp(out5pos); index_t ftabLen = gh._ftabLen; index_t sideSz = gh._sideSz; index_t gbwtTotSz = gh._gbwtTotSz; index_t fchr[] = {0, 0, 0, 0, 0}; EList ftab(EBWT_CAT); EList zOffs; // Save # of occurrences of each character as we walk along the bwt index_t occ[4] = {0, 0, 0, 0}; index_t occSave[4] = {0, 0, 0, 0}; // # of occurrences of 1 in M arrays index_t M_occ = 0, M_occSave = 0; // Location in F that corresponds to 1 in M index_t F_loc = 0, F_locSave = 0; try { VMSG_NL("Allocating ftab, absorbFtab"); ftab.resize(ftabLen); ftab.fillZero(); } catch(bad_alloc &e) { cerr << "Out of memory allocating ftab[] or absorbFtab[] " << "in LocalGFM::buildToDisk() at " << __FILE__ << ":" << __LINE__ << endl; throw e; } // Allocate the side buffer; holds a single side as its being // constructed and then written to disk. Reused across all sides. #ifdef SIXTY4_FORMAT EList gfmSide(EBWT_CAT); #else EList gfmSide(EBWT_CAT); #endif try { // Used to calculate ftab and eftab, but having gfm costs a lot of memory this->_gfm.init(new uint8_t[gh._gbwtTotLen], gh._gbwtTotLen, true); #ifdef SIXTY4_FORMAT gfmSide.resize(sideSz >> 3); #else gfmSide.resize(sideSz); #endif } catch(bad_alloc &e) { cerr << "Out of memory allocating ebwtSide[] in " << "LocalGFM::buildToDisk() at " << __FILE__ << ":" << __LINE__ << endl; throw e; } // Points to the base offset within ebwt for the side currently // being written index_t side = 0; // Whether we're assembling a forward or a reverse bucket bool fw = true; int sideCur = 0; index_t si = 0; // string offset (chars) ASSERT_ONLY(bool inSA = true); // true iff saI still points inside suffix // array (as opposed to the padding at the // end) // Iterate over packed bwt bytes VMSG_NL("Entering LocalGFM loop"); ASSERT_ONLY(uint32_t beforeGbwtOff = (uint32_t)out5.tellp()); while(side < gbwtTotSz) { // Sanity-check our cursor into the side buffer assert_geq(sideCur, 0); assert_lt(sideCur, (int)gh._sideGbwtSz); assert_eq(0, side % sideSz); // 'side' must be on side boundary gfmSide[sideCur] = 0; // clear if(sideCur == 0) { memset(gfmSide.ptr(), 0, gh._sideGbwtSz); gfmSide[sideCur] = 0; // clear } assert_lt(side + sideCur, gbwtTotSz); // Iterate over bit-pairs in the si'th character of the BWT #ifdef SIXTY4_FORMAT for(int bpi = 0; bpi < 32; bpi++, si++) { #else for(int bpi = 0; bpi < 4; bpi++, si++) { #endif int gbwtChar = 0; int F = 0, M = 0; full_index_t pos = 0; bool count = true; if(si < gbwtLen) { gbwt.nextRow(gbwtChar, F, M, pos); // (that might have triggered sa to calc next suf block) if(gbwtChar == 'Z') { // Don't add the '$' in the last column to the BWT // transform; we can't encode a $ (only A C T or G) // and counting it as, say, an A, will mess up the // LR mapping gbwtChar = 0; count = false; #ifndef NDEBUG if(zOffs.size() > 0) { assert_gt(si, zOffs.back()); } #endif zOffs.push_back(si); // remember GBWT row that corresponds to the 0th suffix } else { gbwtChar = asc2dna[gbwtChar]; assert_lt(gbwtChar, 4); // Update the fchr fchr[gbwtChar]++; } assert_lt(F, 2); assert_lt(M, 2); if(M == 1) { assert_neq(F_loc, numeric_limits::max()); F_loc = gbwt.nextFLocation(); #ifndef NDEBUG if(F_loc > 0) { assert_gt(F_loc, F_locSave); } #endif } // Suffix array offset boundary? - update offset array if(M == 1 && (M_occ & gh._offMask) == M_occ) { assert_lt((M_occ >> gh._offRate), gh._offsLen); // Write offsets directly to the secondary output // stream, thereby avoiding keeping them in memory writeIndex(out6, pos, this->toBe()); } } else { // Strayed off the end of the SA, now we're just // padding out a bucket #ifndef NDEBUG if(inSA) { // Assert that we wrote all the characters in the // string before now assert_eq(si, gbwtLen); inSA = false; } #endif // 'A' used for padding; important that padding be // counted in the occ[] array gbwtChar = 0; F = M = 0; } if(count) occ[gbwtChar]++; if(M) M_occ++; // Append BWT char to bwt section of current side if(fw) { // Forward bucket: fill from least to most #ifdef SIXTY4_FORMAT gfmSide[sideCur] |= ((uint64_t)gbwtChar << (bpi << 1)); if(gbwtChar > 0) assert_gt(gfmSide[sideCur], 0); assert(false); cerr << "Not implemented" << endl; exit(1); #else pack_2b_in_8b(gbwtChar, gfmSide[sideCur], bpi); assert_eq((gfmSide[sideCur] >> (bpi*2)) & 3, gbwtChar); int F_sideCur = (gh._sideGbwtSz + sideCur) >> 1; int F_bpi = bpi + ((sideCur & 0x1) << 2); // Can be used as M_bpi as well pack_1b_in_8b(F, gfmSide[F_sideCur], F_bpi); assert_eq((gfmSide[F_sideCur] >> F_bpi) & 1, F); int M_sideCur = F_sideCur + (gh._sideGbwtSz >> 2); pack_1b_in_8b(M, gfmSide[M_sideCur], F_bpi); assert_eq((gfmSide[M_sideCur] >> F_bpi) & 1, M); #endif } else { // Backward bucket: fill from most to least #ifdef SIXTY4_FORMAT gfmSide[sideCur] |= ((uint64_t)gbwtChar << ((31 - bpi) << 1)); if(gbwtChar > 0) assert_gt(gfmSide[sideCur], 0); // To be implemented ... assert(false); cerr << "Not implemented" << endl; exit(1); #else pack_2b_in_8b(gbwtChar, gfmSide[sideCur], 3-bpi); assert_eq((gfmSide[sideCur] >> ((3-bpi)*2)) & 3, gbwtChar); // To be implemented ... assert(false); cerr << "Not implemented" << endl; exit(1); #endif } } // end loop over bit-pairs assert_eq(0, (occ[0] + occ[1] + occ[2] + occ[3] + zOffs.size()) & 3); #ifdef SIXTY4_FORMAT assert_eq(0, si & 31); #else assert_eq(0, si & 3); #endif sideCur++; if((sideCur << 1) == (int)gh._sideGbwtSz) { sideCur = 0; index_t *uside = reinterpret_cast(gfmSide.ptr()); // Write 'A', 'C', 'G' and 'T' tallies side += sideSz; assert_leq(side, gh._gbwtTotSz); uside[(sideSz / sizeof(index_t))-6] = endianizeIndex(F_locSave, this->toBe()); uside[(sideSz / sizeof(index_t))-5] = endianizeIndex(M_occSave, this->toBe()); uside[(sideSz / sizeof(index_t))-4] = endianizeIndex(occSave[0], this->toBe()); uside[(sideSz / sizeof(index_t))-3] = endianizeIndex(occSave[1], this->toBe()); uside[(sideSz / sizeof(index_t))-2] = endianizeIndex(occSave[2], this->toBe()); uside[(sideSz / sizeof(index_t))-1] = endianizeIndex(occSave[3], this->toBe()); F_locSave = F_loc; M_occSave = M_occ; occSave[0] = occ[0]; occSave[1] = occ[1]; occSave[2] = occ[2]; occSave[3] = occ[3]; // Write backward side to primary file out5.write((const char *)gfmSide.ptr(), sideSz); // memcpy(((char*)this->_gfm.get()) + side - sideSz, (const char *)gfmSide.ptr(), sideSz); } } VMSG_NL("Exited LocalGFM loop"); // Assert that our loop counter got incremented right to the end assert_eq(side, gh._gbwtTotSz); // Assert that we wrote the expected amount to out5 assert_eq(((uint32_t)out5.tellp() - beforeGbwtOff), gh._gbwtTotSz); // assert that the last thing we did was write a forward bucket // // Write zOffs to primary stream // assert_gt(zOffs.size(), 0); writeIndex(out5, zOffs.size(), this->toBe()); for(size_t i = 0; i < zOffs.size(); i++) { writeIndex(out5, zOffs[i], this->toBe()); } // // Finish building fchr // // Exclusive prefix sum on fchr for(int i = 1; i < 4; i++) { fchr[i] += fchr[i-1]; } assert_lt(fchr[3], gbwtLen); // Shift everybody up by one for(int i = 4; i >= 1; i--) { fchr[i] = fchr[i-1]; } fchr[0] = 0; // Write fchr to primary file for(int i = 0; i < 5; i++) { writeIndex(out5, fchr[i], this->toBe()); } this->_fchr.init(new index_t[5], 5, true); memcpy(this->_fchr.get(), fchr, sizeof(index_t) * 5); // Initialize _zGbwtByteOffs and _zGbwtBpOffs this->_zOffs = zOffs; this->postReadInit(gh); // Build ftab and eftab EList > tFtab; tFtab.resizeExact(ftabLen - 1); for(index_t i = 0; i + 1 < ftabLen; i++) { index_t q = i; pair range(0, gh._gbwtLen); SideLocus tloc, bloc; SideLocus::initFromTopBot(range.first, range.second, gh, this->gfm(), tloc, bloc); index_t j = 0; for(; j < gh._ftabChars; j++) { int nt = q & 0x3; q >>= 2; if(bloc.valid()) { range = this->mapGLF(tloc, bloc, nt); } else { range = this->mapGLF1(range.first, tloc, nt); } if(range.first == (index_t)INDEX_MAX || range.first >= range.second) { break; } if(range.first + 1 == range.second) { tloc.initFromRow(range.first, gh, this->gfm()); bloc.invalidate(); } else { SideLocus::initFromTopBot(range.first, range.second, gh, this->gfm(), tloc, bloc); } } if(range.first >= range.second || j < gh._ftabChars) { if(i == 0) { tFtab[i].first = tFtab[i].second = 0; } else { tFtab[i].first = tFtab[i].second = tFtab[i-1].second; } } else { tFtab[i].first = range.first; tFtab[i].second = range.second; } #ifndef NDEBUG if(gbwt.ftab.size() > i) { assert_eq(tFtab[i].first, gbwt.ftab[i].first); assert_eq(tFtab[i].second, gbwt.ftab[i].second); } #endif } // Clear memory this->_gfm.reset(); this->_fchr.reset(); this->_zOffs.clear(); this->_zGbwtByteOffs.clear(); this->_zGbwtBpOffs.clear(); // // Finish building ftab and build eftab // // Prefix sum on ftable index_t eftabLen = 0; for(index_t i = 1; i + 1 < ftabLen; i++) { if(tFtab[i-1].second != tFtab[i].first) { eftabLen += 2; } } if(gh._gbwtLen + (eftabLen >> 1) < gh._gbwtLen) { cerr << "Too many eftab entries: " << gh._gbwtLen << " + " << (eftabLen >> 1) << " > " << (index_t)INDEX_MAX << endl; throw 1; } EList eftab(EBWT_CAT); try { eftab.resize(eftabLen); eftab.fillZero(); } catch(bad_alloc &e) { cerr << "Out of memory allocating eftab[] " << "in LocalGFM::buildToDisk() at " << __FILE__ << ":" << __LINE__ << endl; throw e; } index_t eftabCur = 0; ftab[0] = tFtab[0].first; ftab[1] = tFtab[0].second; for(index_t i = 1; i + 1 < ftabLen; i++) { if(ftab[i] != tFtab[i].first) { index_t lo = ftab[i]; index_t hi = tFtab[i].first; assert_lt(eftabCur*2+1, eftabLen); eftab[eftabCur*2] = lo; eftab[eftabCur*2+1] = hi; assert_leq(lo, hi + 4); ftab[i] = (eftabCur++) ^ (index_t)INDEX_MAX; // insert pointer into eftab assert_eq(lo, GFM::ftabLo(ftab.ptr(), eftab.ptr(), gbwtLen, ftabLen, eftabLen, i)); assert_eq(hi, GFM::ftabHi(ftab.ptr(), eftab.ptr(), gbwtLen, ftabLen, eftabLen, i)); } ftab[i+1] = tFtab[i].second; } #ifndef NDEBUG for(index_t i = 0; i + 1 < ftabLen; i++ ){ assert_eq(tFtab[i].first, GFM::ftabHi(ftab.ptr(), eftab.ptr(), gbwtLen, ftabLen, eftabLen, i)); assert_eq(tFtab[i].second, GFM::ftabLo(ftab.ptr(), eftab.ptr(), gbwtLen, ftabLen, eftabLen, i+1)); } #endif // Write ftab to primary file for(index_t i = 0; i < ftabLen; i++) { writeIndex(out5, ftab[i], this->toBe()); } // Write eftab to primary file out5pos = out5.tellp(); out5.seekp(headerPos); writeIndex(out5, eftabLen, this->toBe()); out5.seekp(out5pos); for(index_t i = 0; i < eftabLen; i++) { writeIndex(out5, eftab[i], this->toBe()); } // Note: if you'd like to sanity-check the Ebwt, you'll have to // read it back into memory first! assert(!this->isInMemory()); VMSG_NL("Exiting LocalGFM::buildToDisk()"); } /** * Build an Ebwt from a string 's' and its suffix array 'sa' (which * might actually be a suffix array *builder* that builds blocks of the * array on demand). The bulk of the Ebwt, i.e. the ebwt and offs * arrays, is written directly to disk. This is by design: keeping * those arrays in memory needlessly increases the footprint of the * building process. Instead, we prefer to build the Ebwt directly * "to disk" and then read it back into memory later as necessary. * * It is assumed that the header values and join-related values (nPat, * plen) have already been written to 'out1' before this function * is called. When this function is finished, it will have * additionally written ebwt, zOff, fchr, ftab and eftab to the primary * file and offs to the secondary file. * * Assume DNA/RNA/any alphabet with 4 or fewer elements. * Assume occ array entries are 32 bits each. * * @param sa the suffix array to convert to a Ebwt * @param s the original string * @param out */ template template void LocalGFM::buildToDisk( const EList& sa, const TStr& s, ostream& out5, ostream& out6, streampos headerPos) { assert_leq(s.length(), std::numeric_limits::max()); const GFMParams& gh = this->_gh; assert(gh.repOk()); assert(gh.linearFM()); assert_lt(s.length(), gh.gbwtLen()); assert_eq(s.length(), gh._len); assert_gt(gh._lineRate, 3); index_t len = gh._len; index_t gbwtLen = gh._gbwtLen; assert_eq(len + 1, gbwtLen); streampos out5pos = out5.tellp(); out5.seekp(headerPos); writeIndex(out5, gbwtLen, this->toBe()); writeIndex(out5, gh._numNodes, this->toBe()); headerPos = out5.tellp(); out5.seekp(out5pos); index_t ftabLen = gh._ftabLen; index_t sideSz = gh._sideSz; index_t gbwtTotSz = gh._gbwtTotSz; index_t fchr[] = {0, 0, 0, 0, 0}; EList ftab(EBWT_CAT); EList zOffs; // Save # of occurrences of each character as we walk along the bwt index_t occ[4] = {0, 0, 0, 0}; index_t occSave[4] = {0, 0, 0, 0}; // Record rows that should "absorb" adjacent rows in the ftab. // The absorbed rows represent suffixes shorter than the ftabChars // cutoff. uint8_t absorbCnt = 0; EList absorbFtab(EBWT_CAT); try { VMSG_NL("Allocating ftab, absorbFtab"); ftab.resize(ftabLen); ftab.fillZero(); absorbFtab.resize(ftabLen); absorbFtab.fillZero(); } catch(bad_alloc &e) { cerr << "Out of memory allocating ftab[] or absorbFtab[] " << "in LocalGFM::buildToDisk() at " << __FILE__ << ":" << __LINE__ << endl; throw e; } // Allocate the side buffer; holds a single side as its being // constructed and then written to disk. Reused across all sides. #ifdef SIXTY4_FORMAT EList gfmSide(EBWT_CAT); #else EList gfmSide(EBWT_CAT); #endif try { #ifdef SIXTY4_FORMAT gfmSide.resize(sideSz >> 3); #else gfmSide.resize(sideSz); #endif } catch(bad_alloc &e) { cerr << "Out of memory allocating gfmSide[] in " << "LocalGFM::buildToDisk() at " << __FILE__ << ":" << __LINE__ << endl; throw e; } // Points to the base offset within ebwt for the side currently // being written index_t side = 0; // Whether we're assembling a forward or a reverse bucket bool fw = true; int sideCur = 0; // Have we skipped the '$' in the last column yet? ASSERT_ONLY(bool dollarSkipped = false); index_t si = 0; // string offset (chars) ASSERT_ONLY(uint32_t lastSufInt = 0); ASSERT_ONLY(bool inSA = true); // true iff saI still points inside suffix // array (as opposed to the padding at the // end) // Iterate over packed bwt bytes VMSG_NL("Entering LocalGFM loop"); ASSERT_ONLY(uint32_t beforeGbwtOff = (uint32_t)out5.tellp()); while(side < gbwtTotSz) { // Sanity-check our cursor into the side buffer assert_geq(sideCur, 0); assert_lt(sideCur, (int)gh._sideGbwtSz); assert_eq(0, side % sideSz); // 'side' must be on side boundary gfmSide[sideCur] = 0; // clear assert_lt(side + sideCur, gbwtTotSz); // Iterate over bit-pairs in the si'th character of the BWT #ifdef SIXTY4_FORMAT for(int bpi = 0; bpi < 32; bpi++, si++) { #else for(int bpi = 0; bpi < 4; bpi++, si++) { #endif int bwtChar; bool count = true; if(si <= len) { // Still in the SA; extract the bwtChar index_t saElt = (index_t)sa[si]; // (that might have triggered sa to calc next suf block) if(saElt == 0) { // Don't add the '$' in the last column to the BWT // transform; we can't encode a $ (only A C T or G) // and counting it as, say, an A, will mess up the // LR mapping bwtChar = 0; count = false; ASSERT_ONLY(dollarSkipped = true); zOffs.push_back(si); // remember the SA row that // corresponds to the 0th suffix } else { bwtChar = (int)(s[saElt-1]); assert_lt(bwtChar, 4); // Update the fchr fchr[bwtChar]++; } // Update ftab if((len-saElt) >= (index_t)gh._ftabChars) { // Turn the first ftabChars characters of the // suffix into an integer index into ftab. The // leftmost (lowest index) character of the suffix // goes in the most significant bit pair if the // integer. uint32_t sufInt = 0; for(int i = 0; i < gh._ftabChars; i++) { sufInt <<= 2; assert_lt((index_t)i, len-saElt); sufInt |= (unsigned char)(s[saElt+i]); } // Assert that this prefix-of-suffix is greater // than or equal to the last one (true b/c the // suffix array is sorted) #ifndef NDEBUG if(lastSufInt > 0) assert_geq(sufInt, lastSufInt); lastSufInt = sufInt; #endif // Update ftab assert_lt(sufInt+1, ftabLen); ftab[sufInt+1]++; if(absorbCnt > 0) { // Absorb all short suffixes since the last // transition into this transition absorbFtab[sufInt] = absorbCnt; absorbCnt = 0; } } else { // Otherwise if suffix is fewer than ftabChars // characters long, then add it to the 'absorbCnt'; // it will be absorbed into the next transition assert_lt(absorbCnt, 255); absorbCnt++; } // Suffix array offset boundary? - update offset array if((si & gh._offMask) == si) { assert_lt((si >> gh._offRate), gh._offsLen); // Write offsets directly to the secondary output // stream, thereby avoiding keeping them in memory writeIndex(out6, saElt, this->toBe()); } } else { // Strayed off the end of the SA, now we're just // padding out a bucket #ifndef NDEBUG if(inSA) { // Assert that we wrote all the characters in the // string before now assert_eq(si, len+1); inSA = false; } #endif // 'A' used for padding; important that padding be // counted in the occ[] array bwtChar = 0; } if(count) occ[bwtChar]++; // Append BWT char to bwt section of current side if(fw) { // Forward bucket: fill from least to most #ifdef SIXTY4_FORMAT gfmSide[sideCur] |= ((uint64_t)bwtChar << (bpi << 1)); if(bwtChar > 0) assert_gt(gfmSide[sideCur], 0); #else pack_2b_in_8b(bwtChar, gfmSide[sideCur], bpi); assert_eq((gfmSide[sideCur] >> (bpi*2)) & 3, bwtChar); #endif } else { // Backward bucket: fill from most to least #ifdef SIXTY4_FORMAT gfmSide[sideCur] |= ((uint64_t)bwtChar << ((31 - bpi) << 1)); if(bwtChar > 0) assert_gt(gfmSide[sideCur], 0); #else pack_2b_in_8b(bwtChar, gfmSide[sideCur], 3-bpi); assert_eq((gfmSide[sideCur] >> ((3-bpi)*2)) & 3, bwtChar); #endif } } // end loop over bit-pairs assert_eq(dollarSkipped ? 3 : 0, (occ[0] + occ[1] + occ[2] + occ[3]) & 3); #ifdef SIXTY4_FORMAT assert_eq(0, si & 31); #else assert_eq(0, si & 3); #endif sideCur++; if(sideCur == (int)gh._sideGbwtSz) { sideCur = 0; index_t *uside = reinterpret_cast(gfmSide.ptr()); // Write 'A', 'C', 'G' and 'T' tallies side += sideSz; assert_leq(side, gh._gbwtTotSz); uside[(sideSz / sizeof(index_t))-4] = endianizeIndex(occSave[0], this->toBe()); uside[(sideSz / sizeof(index_t))-3] = endianizeIndex(occSave[1], this->toBe()); uside[(sideSz / sizeof(index_t))-2] = endianizeIndex(occSave[2], this->toBe()); uside[(sideSz / sizeof(index_t))-1] = endianizeIndex(occSave[3], this->toBe()); occSave[0] = occ[0]; occSave[1] = occ[1]; occSave[2] = occ[2]; occSave[3] = occ[3]; // Write backward side to primary file out5.write((const char *)gfmSide.ptr(), sideSz); } } VMSG_NL("Exited LocalGFM loop"); if(absorbCnt > 0) { // Absorb any trailing, as-yet-unabsorbed short suffixes into // the last element of ftab absorbFtab[ftabLen-1] = absorbCnt; } // Assert that our loop counter got incremented right to the end assert_eq(side, gh._gbwtTotSz); // Assert that we wrote the expected amount to out5 assert_eq(((uint32_t)out5.tellp() - beforeGbwtOff), gh._gbwtTotSz); // assert that the last thing we did was write a forward bucket // // Write zOffs to primary stream // assert_eq(zOffs.size(), 1); writeIndex(out5, zOffs.size(), this->toBe()); for(size_t i = 0; i < zOffs.size(); i++) { assert_neq(zOffs[i], (index_t)OFF_MASK); writeIndex(out5, zOffs[i], this->toBe()); } // // Finish building fchr // // Exclusive prefix sum on fchr for(int i = 1; i < 4; i++) { fchr[i] += fchr[i-1]; } assert_lt(fchr[3], gbwtLen); // Shift everybody up by one for(int i = 4; i >= 1; i--) { fchr[i] = fchr[i-1]; } fchr[0] = 0; // Write fchr to primary file for(int i = 0; i < 5; i++) { writeIndex(out5, fchr[i], this->toBe()); } // // Finish building ftab and build eftab // // Prefix sum on ftable index_t eftabLen = 0; assert_eq(0, absorbFtab[0]); for(index_t i = 1; i < ftabLen; i++) { if(absorbFtab[i] > 0) eftabLen += 2; } assert_leq(eftabLen, (index_t)gh._ftabChars*2); eftabLen = gh._ftabChars*2; EList eftab(EBWT_CAT); try { eftab.resize(eftabLen); eftab.fillZero(); } catch(bad_alloc &e) { cerr << "Out of memory allocating eftab[] " << "in LocalGFM::buildToDisk() at " << __FILE__ << ":" << __LINE__ << endl; throw e; } index_t eftabCur = 0; for(index_t i = 1; i < ftabLen; i++) { index_t lo = ftab[i] + GFM::ftabHi(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, i-1); if(absorbFtab[i] > 0) { // Skip a number of short pattern indicated by absorbFtab[i] index_t hi = lo + absorbFtab[i]; assert_lt(eftabCur*2+1, eftabLen); eftab[eftabCur*2] = lo; eftab[eftabCur*2+1] = hi; ftab[i] = (eftabCur++) ^ (index_t)OFF_MASK; // insert pointer into eftab assert_eq(lo, GFM::ftabLo(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, i)); assert_eq(hi, GFM::ftabHi(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, i)); } else { ftab[i] = lo; } } assert_eq(GFM::ftabHi(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, ftabLen-1), len+1); // Write ftab to primary file for(index_t i = 0; i < ftabLen; i++) { writeIndex(out5, ftab[i], this->toBe()); } // Write eftab to primary file out5pos = out5.tellp(); out5.seekp(headerPos); writeIndex(out5, eftabLen, this->toBe()); out5.seekp(out5pos); for(index_t i = 0; i < eftabLen; i++) { writeIndex(out5, eftab[i], this->toBe()); } // Note: if you'd like to sanity-check the Ebwt, you'll have to // read it back into memory first! assert(!this->isInMemory()); VMSG_NL("Exiting LocalGFM::buildToDisk()"); } /** * Read an Ebwt from file with given filename. */ template void LocalGFM::readIntoMemory( FILE *in5, FILE *in6, char *mmFile5, char *mmFile6, full_index_t& tidx, full_index_t& localOffset, full_index_t& joinedOffset, bool switchEndian, size_t& bytesRead, size_t& bytesRead2, int entireRev, bool loadSASamp, bool loadFtab, bool loadRstarts, bool justHeader, int32_t lineRate, int32_t offRate, int32_t ftabChars, bool mmSweep, bool loadNames, bool startVerbose) { #ifdef BOWTIE_MM char *mmFile[] = { mmFile5, mmFile6 }; #endif // Reads header entries one by one from primary stream tidx = readIndex(in5, switchEndian); bytesRead += sizeof(full_index_t); localOffset = readIndex(in5, switchEndian); bytesRead += sizeof(full_index_t); joinedOffset = readIndex(in5, switchEndian); bytesRead += sizeof(full_index_t); index_t len = readIndex(in5, switchEndian); bytesRead += sizeof(index_t); index_t gbwtLen = readIndex(in5, switchEndian); bytesRead += sizeof(index_t); index_t numNodes = readIndex(in5, switchEndian); bytesRead += sizeof(index_t); index_t eftabLen = readIndex(in5, switchEndian); bytesRead += sizeof(index_t); // Create a new EbwtParams from the entries read from primary stream this->_gh.init(len, gbwtLen, numNodes, lineRate, offRate, ftabChars, eftabLen, entireRev); if(len <= 0) { return; } // Set up overridden suffix-array-sample parameters uint32_t offsLen = this->_gh._offsLen; uint32_t offRateDiff = 0; uint32_t offsLenSampled = offsLen; if(this->_overrideOffRate > offRate) { offRateDiff = this->_overrideOffRate - offRate; } if(offRateDiff > 0) { offsLenSampled >>= offRateDiff; if((offsLen & ~((index_t)OFF_MASK << offRateDiff)) != 0) { offsLenSampled++; } } // Can't override the offrate or isarate and use memory-mapped // files; ultimately, all processes need to copy the sparser sample // into their own memory spaces. if(this->_useMm && (offRateDiff)) { cerr << "Error: Can't use memory-mapped files when the offrate is overridden" << endl; throw 1; } // Read nPat from primary stream this->_nPat = readIndex(in5, switchEndian); assert_eq(this->_nPat, 1); bytesRead += sizeof(index_t); this->_plen.reset(); // Read plen from primary stream if(this->_useMm) { #ifdef BOWTIE_MM this->_plen.init((index_t*)(mmFile[0] + bytesRead), this->_nPat, false); bytesRead += this->_nPat*sizeof(index_t); fseek(in5, this->_nPat*sizeof(index_t), SEEK_CUR); #endif } else { try { if(this->_verbose || startVerbose) { cerr << "Reading plen (" << this->_nPat << "): "; logTime(cerr); } this->_plen.init(new index_t[this->_nPat], this->_nPat, true); if(switchEndian) { for(index_t i = 0; i < this->_nPat; i++) { this->plen()[i] = readIndex(in5, switchEndian); } } else { size_t r = MM_READ(in5, (void*)(this->plen()), this->_nPat*sizeof(index_t)); if(r != (size_t)(this->_nPat*sizeof(index_t))) { cerr << "Error reading _plen[] array: " << r << ", " << this->_nPat*sizeof(index_t) << endl; throw 1; } } } catch(bad_alloc& e) { cerr << "Out of memory allocating plen[] in Ebwt::read()" << " at " << __FILE__ << ":" << __LINE__ << endl; throw e; } } bool shmemLeader; // TODO: I'm not consistent on what "header" means. Here I'm using // "header" to mean everything that would exist in memory if we // started to build the Ebwt but stopped short of the build*() step // (i.e. everything up to and including join()). if(justHeader) return; this->_nFrag = readIndex(in5, switchEndian); bytesRead += sizeof(index_t); if(this->_verbose || startVerbose) { cerr << "Reading rstarts (" << this->_nFrag*3 << "): "; logTime(cerr); } assert_geq(this->_nFrag, this->_nPat); this->_rstarts.reset(); if(loadRstarts) { if(this->_useMm) { #ifdef BOWTIE_MM this->_rstarts.init((index_t*)(mmFile[0] + bytesRead), this->_nFrag*3, false); bytesRead += this->_nFrag*sizeof(index_t)*3; fseek(in5, this->_nFrag*sizeof(index_t)*3, SEEK_CUR); #endif } else { this->_rstarts.init(new index_t[this->_nFrag*3], this->_nFrag*3, true); if(switchEndian) { for(index_t i = 0; i < this->_nFrag*3; i += 3) { // fragment starting position in joined reference // string, text id, and fragment offset within text this->rstarts()[i] = readIndex(in5, switchEndian); this->rstarts()[i+1] = readIndex(in5, switchEndian); this->rstarts()[i+2] = readIndex(in5, switchEndian); } } else { size_t r = MM_READ(in5, (void *)this->rstarts(), this->_nFrag*sizeof(index_t)*3); if(r != (size_t)(this->_nFrag*sizeof(index_t)*3)) { cerr << "Error reading _rstarts[] array: " << r << ", " << (this->_nFrag*sizeof(index_t)*3) << endl; throw 1; } } } } else { // Skip em assert(this->rstarts() == NULL); bytesRead += this->_nFrag*sizeof(index_t)*3; fseek(in5, this->_nFrag*sizeof(index_t)*3, SEEK_CUR); } this->_gfm.reset(); if(this->_useMm) { #ifdef BOWTIE_MM this->_gfm.init((uint8_t*)(mmFile[0] + bytesRead), this->_gh._gbwtTotLen, false); bytesRead += this->_gh._gbwtTotLen; fseek(in5, this->_gh._gbwtTotLen, SEEK_CUR); #endif } else { // Allocate ebwt (big allocation) if(this->_verbose || startVerbose) { cerr << "Reading ebwt (" << this->_gh._gbwtTotLen << "): "; logTime(cerr); } bool shmemLeader = true; if(this->useShmem_) { uint8_t *tmp = NULL; shmemLeader = ALLOC_SHARED_U8( (this->_in1Str + "[gfm]"), this->_gh._gbwtTotLen, &tmp, "gfm[]", (this->_verbose || startVerbose)); assert(tmp != NULL); this->_gfm.init(tmp, this->_gh._gbwtTotLen, false); if(this->_verbose || startVerbose) { cerr << " shared-mem " << (shmemLeader ? "leader" : "follower") << endl; } } else { try { this->_gfm.init(new uint8_t[this->_gh._gbwtTotLen], this->_gh._gbwtTotLen, true); } catch(bad_alloc& e) { cerr << "Out of memory allocating the ebwt[] array for the Bowtie index. Please try" << endl << "again on a computer with more memory." << endl; throw 1; } } if(shmemLeader) { // Read ebwt from primary stream uint64_t bytesLeft = this->_gh._gbwtTotLen; char *pgbwt = (char*)this->gfm(); while (bytesLeft>0){ size_t r = MM_READ(in5, (void *)pgbwt, bytesLeft); if(MM_IS_IO_ERR(in5, r, bytesLeft)) { cerr << "Error reading _ebwt[] array: " << r << ", " << bytesLeft << endl; throw 1; } pgbwt += r; bytesLeft -= r; } if(switchEndian) { uint8_t *side = this->gfm(); for(size_t i = 0; i < this->_gh._numSides; i++) { index_t *cums = reinterpret_cast(side + this->_gh._sideSz - sizeof(index_t)*2); cums[0] = endianSwapIndex(cums[0]); cums[1] = endianSwapIndex(cums[1]); side += this->_gh._sideSz; } } #ifdef BOWTIE_SHARED_MEM if(useShmem_) NOTIFY_SHARED(this->gfm(), this->_gh._gbwtTotLen); #endif } else { // Seek past the data and wait until master is finished fseek(in5, this->_gh._gbwtTotLen, SEEK_CUR); #ifdef BOWTIE_SHARED_MEM if(useShmem_) WAIT_SHARED(this->gfm(), this->_gh._gbwtTotLen); #endif } } // Read zOff from primary stream this->_zOffs.clear(); index_t num_zOffs = readIndex(in5, switchEndian); bytesRead += sizeof(index_t); for(index_t i = 0; i < num_zOffs; i++) { index_t zOff = readIndex(in5, switchEndian); bytesRead += sizeof(index_t); assert_lt(zOff, gbwtLen); this->_zOffs.push_back(zOff); } try { // Read fchr from primary stream if(this->_verbose || startVerbose) cerr << "Reading fchr (5)" << endl; this->_fchr.reset(); if(this->_useMm) { #ifdef BOWTIE_MM this->_fchr.init((index_t*)(mmFile[0] + bytesRead), 5, false); bytesRead += 5*sizeof(index_t); fseek(in5, 5*sizeof(index_t), SEEK_CUR); #endif } else { this->_fchr.init(new index_t[5], 5, true); for(index_t i = 0; i < 5; i++) { this->fchr()[i] = readIndex(in5, switchEndian); assert_leq(this->fchr()[i], gbwtLen); assert(i <= 0 || this->fchr()[i] >= this->fchr()[i-1]); } } assert_gt(this->fchr()[4], this->fchr()[0]); // Read ftab from primary stream if(this->_verbose || startVerbose) { if(loadFtab) { cerr << "Reading ftab (" << this->_gh._ftabLen << "): "; logTime(cerr); } else { cerr << "Skipping ftab (" << this->_gh._ftabLen << "): "; } } this->_ftab.reset(); if(loadFtab) { if(this->_useMm) { #ifdef BOWTIE_MM this->_ftab.init((index_t*)(mmFile[0] + bytesRead), this->_gh._ftabLen, false); bytesRead += this->_gh._ftabLen*sizeof(index_t); fseek(in5, this->_gh._ftabLen*sizeof(index_t), SEEK_CUR); #endif } else { this->_ftab.init(new index_t[this->_gh._ftabLen], this->_gh._ftabLen, true); if(switchEndian) { for(uint32_t i = 0; i < this->_gh._ftabLen; i++) this->ftab()[i] = readIndex(in5, switchEndian); } else { size_t r = MM_READ(in5, (void *)this->ftab(), this->_gh._ftabLen*sizeof(index_t)); if(r != (size_t)(this->_gh._ftabLen*sizeof(index_t))) { cerr << "Error reading _ftab[] array: " << r << ", " << (this->_gh._ftabLen*sizeof(index_t)) << endl; throw 1; } } } // Read etab from primary stream if(this->_verbose || startVerbose) { if(loadFtab) { cerr << "Reading eftab (" << this->_gh._eftabLen << "): "; logTime(cerr); } else { cerr << "Skipping eftab (" << this->_gh._eftabLen << "): "; } } this->_eftab.reset(); if(this->_useMm) { #ifdef BOWTIE_MM this->_eftab.init((index_t*)(mmFile[0] + bytesRead), this->_gh._eftabLen, false); bytesRead += this->_gh._eftabLen*sizeof(index_t); fseek(in5, this->_gh._eftabLen*sizeof(index_t), SEEK_CUR); #endif } else { this->_eftab.init(new index_t[this->_gh._eftabLen], this->_gh._eftabLen, true); if(switchEndian) { for(uint32_t i = 0; i < this->_gh._eftabLen; i++) this->eftab()[i] = readIndex(in5, switchEndian); } else { size_t r = MM_READ(in5, (void *)this->eftab(), this->_gh._eftabLen*sizeof(index_t)); if(r != (size_t)(this->_gh._eftabLen*sizeof(index_t))) { cerr << "Error reading _eftab[] array: " << r << ", " << (this->_gh._eftabLen*sizeof(index_t)) << endl; throw 1; } } } for(uint32_t i = 0; i < this->_gh._eftabLen; i++) { if(i > 0 && this->eftab()[i] > 0) { assert_geq(this->eftab()[i] + 4, this->eftab()[i-1]); } else if(i > 0 && this->eftab()[i-1] == 0) { assert_eq(0, this->eftab()[i]); } } } else { assert(this->ftab() == NULL); assert(this->eftab() == NULL); // Skip ftab bytesRead += this->_gh._ftabLen*sizeof(index_t); fseek(in5, this->_gh._ftabLen*sizeof(index_t), SEEK_CUR); // Skip eftab bytesRead += this->_gh._eftabLen*sizeof(index_t); fseek(in5, this->_gh._eftabLen*sizeof(index_t), SEEK_CUR); } } catch(bad_alloc& e) { cerr << "Out of memory allocating fchr[], ftab[] or eftab[] arrays for the Bowtie index." << endl << "Please try again on a computer with more memory." << endl; throw 1; } this->_offs.reset(); if(loadSASamp) { shmemLeader = true; if(this->_verbose || startVerbose) { cerr << "Reading offs (" << offsLenSampled << " " << std::setw(2) << sizeof(index_t)*8 << "-bit words): "; logTime(cerr); } if(!this->_useMm) { if(!this->useShmem_) { // Allocate offs_ try { this->_offs.init(new index_t[offsLenSampled], offsLenSampled, true); } catch(bad_alloc& e) { cerr << "Out of memory allocating the offs[] array for the Bowtie index." << endl << "Please try again on a computer with more memory." << endl; throw 1; } } else { index_t *tmp = NULL; shmemLeader = ALLOC_SHARED_U32( (this->_in2Str + "[offs]"), offsLenSampled*2, &tmp, "offs", (this->_verbose || startVerbose)); this->_offs.init((index_t*)tmp, offsLenSampled, false); } } if(this->_overrideOffRate < 32) { if(shmemLeader) { // Allocate offs (big allocation) if(switchEndian || offRateDiff > 0) { assert(!this->_useMm); const uint32_t blockMaxSz = (2 * 1024 * 1024); // 2 MB block size const uint32_t blockMaxSzUIndex = (blockMaxSz / sizeof(index_t)); // # UIndexs per block char *buf; try { buf = new char[blockMaxSz]; } catch(std::bad_alloc& e) { cerr << "Error: Out of memory allocating part of _offs array: '" << e.what() << "'" << endl; throw e; } for(index_t i = 0; i < offsLen; i += blockMaxSzUIndex) { index_t block = min((index_t)blockMaxSzUIndex, (index_t)(offsLen - i)); size_t r = MM_READ(in6, (void *)buf, block * sizeof(index_t)); if(r != (size_t)(block * sizeof(index_t))) { cerr << "Error reading block of _offs[] array: " << r << ", " << (block * sizeof(index_t)) << endl; throw 1; } index_t idx = i >> offRateDiff; for(index_t j = 0; j < block; j += (1 << offRateDiff)) { assert_lt(idx, offsLenSampled); this->offs()[idx] = ((index_t*)buf)[j]; if(switchEndian) { this->offs()[idx] = endianSwapIndex(this->offs()[idx]); } idx++; } } delete[] buf; } else { if(this->_useMm) { #ifdef BOWTIE_MM this->_offs.init((index_t*)(mmFile[1] + bytesRead2), offsLen, false); bytesRead2 += (offsLen * sizeof(index_t)); fseek(in6, (offsLen * sizeof(index_t)), SEEK_CUR); #endif } else { // If any of the high two bits are set if((offsLen & 0xc0000000) != 0) { if(sizeof(char *) <= 4) { cerr << "Sanity error: sizeof(char *) <= 4 but offsLen is " << hex << offsLen << endl; throw 1; } // offsLen << 2 overflows, so do it in four reads char *offs = (char *)this->offs(); for(size_t i = 0; i < sizeof(index_t); i++) { size_t r = MM_READ(in6, (void*)offs, offsLen); if(r != (size_t)(offsLen)) { cerr << "Error reading block of _offs[] array: " << r << ", " << offsLen << endl; throw 1; } offs += offsLen; } } else { // Do it all in one read size_t r = MM_READ(in6, (void*)this->offs(), offsLen * sizeof(index_t)); if(r != (size_t)(offsLen * sizeof(index_t))) { cerr << "Error reading _offs[] array: " << r << ", " << (offsLen * sizeof(index_t)) << endl; throw 1; } } } } #ifdef BOWTIE_SHARED_MEM if(this->useShmem_) NOTIFY_SHARED(this->offs(), offsLenSampled*sizeof(index_t)); #endif } else { // Not the shmem leader fseek(in6, offsLenSampled*sizeof(index_t), SEEK_CUR); #ifdef BOWTIE_SHARED_MEM if(this->useShmem_) WAIT_SHARED(this->offs(), offsLenSampled*sizeof(index_t)); #endif } } } this->postReadInit(this->_gh); // Initialize fields of Ebwt not read from file if(this->_verbose || startVerbose) this->print(cerr, this->_gh); } /** * Extended Burrows-Wheeler transform data. * HierEbwt is a specialized Ebwt index that represents one global index and a large set of local indexes. * */ template class HGFM : public GFM { typedef GFM PARENT_CLASS; public: /// Construct a GFM from the given input file HGFM(const string& in, ALTDB* altdb, RepeatDB* repeatdb, EList* readLens, int needEntireReverse, bool fw, int32_t overrideOffRate, // = -1, int32_t offRatePlus, // = -1, bool useMm, // = false, bool useShmem, // = false, bool mmSweep, // = false, bool loadNames, // = false, bool loadSASamp, // = true, bool loadFtab, // = true, bool loadRstarts, // = true, bool loadSpliceSites, // = true, bool verbose, // = false, bool startVerbose, // = false, bool passMemExc, // = false, bool sanityCheck, // = false bool useHaplotype, // = false bool skipLoading = false) : GFM(in, altdb, repeatdb, readLens, needEntireReverse, fw, overrideOffRate, offRatePlus, useMm, useShmem, mmSweep, loadNames, loadSASamp, loadFtab, loadRstarts, loadSpliceSites, verbose, startVerbose, passMemExc, sanityCheck, useHaplotype, skipLoading), _in5(NULL), _in6(NULL) { _in5Str = in + ".5." + gfm_ext; _in6Str = in + ".6." + gfm_ext; } /// Construct a HGFM from the given header parameters and string /// vector, optionally using a blockwise suffix sorter with the /// given 'bmax' and 'dcv' parameters. The string vector is /// ultimately joined and the joined string is passed to buildToDisk(). template HGFM( TStr& s, bool packed, int needEntireReverse, int32_t lineRate, int32_t offRate, int32_t ftabChars, int32_t localOffRate, int32_t localFtabChars, int nthreads, const string& snpfile, const string& htfile, const string& ssfile, const string& exonfile, const string& svfile, const string& repeatfile, const string& outfile, // base filename for GFM files bool fw, bool useBlockwise, TIndexOffU bmax, TIndexOffU bmaxSqrtMult, TIndexOffU bmaxDivN, int dcv, EList& is, EList& szs, index_t sztot, const RefReadInParams& refparams, bool localIndex, // create local indexes? EList* parent_szs, EList* parent_refnames, uint32_t seed, int32_t overrideOffRate = -1, bool verbose = false, bool passMemExc = false, bool sanityCheck = false); HGFM() {} ~HGFM() { clearLocalGFMs(); } /** * Load this Ebwt into memory by reading it in from the _in1 and * _in2 streams. */ void loadIntoMemory( int needEntireReverse, bool loadSASamp, bool loadFtab, bool loadRstarts, bool loadNames, bool verbose) { readIntoMemory( needEntireReverse, // require reverse index to be concatenated reference reversed loadSASamp, // load the SA sample portion? loadFtab, // load the ftab (_ftab[] and _eftab[])? loadRstarts, // load the r-starts (_rstarts[])? false, // stop after loading the header portion? NULL, // params false, // mmSweep loadNames, // loadNames verbose); // startVerbose } // I/O void readIntoMemory( int needEntireRev, bool loadSASamp, bool loadFtab, bool loadRstarts, bool justHeader, GFMParams *params, bool mmSweep, bool loadNames, bool startVerbose); /** * Frees memory associated with the Ebwt. */ void evictFromMemory() { assert(PARENT_CLASS::isInMemory()); clearLocalGFMs(); PARENT_CLASS::evictFromMemory(); } /** * Sanity-check various pieces of the Ebwt */ void sanityCheckAll(int reverse) const { PARENT_CLASS::sanityCheckAll(reverse); for(size_t tidx = 0; tidx < _localGFMs.size(); tidx++) { for(size_t local_idx = 0; local_idx < _localGFMs[tidx].size(); local_idx++) { assert(_localGFMs[tidx][local_idx] != NULL); _localGFMs[tidx][local_idx]->sanityCheckAll(reverse); } } } const LocalGFM* getLocalGFM(index_t tidx, index_t offset) const { assert_lt(tidx, _localGFMs.size()); const EList*>& localGFMs = _localGFMs[tidx]; index_t offsetidx = offset / local_index_interval; if(offsetidx >= localGFMs.size()) { return NULL; } else { return localGFMs[offsetidx]; } } const LocalGFM* prevLocalGFM(const LocalGFM* currLocalGFM) const { assert(currLocalGFM != NULL); index_t tidx = currLocalGFM->_tidx; index_t offset = currLocalGFM->_localOffset; if(offset < local_index_interval) { return NULL; } else { return getLocalGFM(tidx, offset - local_index_interval); } } const LocalGFM* nextLocalGFM(const LocalGFM* currLocalGFM) const { assert(currLocalGFM != NULL); index_t tidx = currLocalGFM->_tidx; index_t offset = currLocalGFM->_localOffset; return getLocalGFM(tidx, offset + local_index_interval); } void clearLocalGFMs() { for(size_t tidx = 0; tidx < _localGFMs.size(); tidx++) { for(size_t local_idx = 0; local_idx < _localGFMs[tidx].size(); local_idx++) { assert(_localGFMs[tidx][local_idx] != NULL); delete _localGFMs[tidx][local_idx]; } _localGFMs[tidx].clear(); } _localGFMs.clear(); } public: index_t _nrefs; /// the number of reference sequences EList _refLens; /// approx lens of ref seqs (excludes trailing ambig chars) EList*> > _localGFMs; index_t _nlocalGFMs; //index_t _local_index_interval; FILE *_in5; // input fd for primary index file FILE *_in6; // input fd for secondary index file string _in5Str; string _in6Str; char *mmFile5_; char *mmFile6_; private: struct ThreadParam { // input SString s; EList > alts; EList > haplotypes; bool bigEndian; index_t local_offset; index_t curr_sztot; EList conv_local_szs; index_t local_sztot; index_t index_size; string file; EList sa; index_t dcv; index_t seed; // output RefGraph* rg; PathGraph* pg; // communication bool done; bool last; bool mainThread; }; static void gbwt_worker(void* vp); }; template void HGFM::gbwt_worker(void* vp) { ThreadParam& tParam = *(ThreadParam*)vp; while(!tParam.last) { if(tParam.mainThread) { assert(!tParam.done); if(tParam.s.length() <= 0) { tParam.done = true; return; } } else { while(tParam.done) { if(tParam.last) return; #if defined(_TTHREAD_WIN32_) Sleep(1); #elif defined(_TTHREAD_POSIX_) const static timespec ts = {0, 1000000}; // 1 millisecond nanosleep(&ts, NULL); #endif } if(tParam.s.length() <= 0) { tParam.done = true; continue; } } while(true) { if(tParam.alts.empty()) { KarkkainenBlockwiseSA > bsa( tParam.s, (index_t)(tParam.s.length()+1), 1, tParam.dcv, tParam.seed, false, /* this->_sanity */ false, /* this->_passMemExc */ false); /* this->_verbose */ assert(bsa.suffixItrIsReset()); assert_eq(bsa.size(), tParam.s.length()+1); tParam.sa.clear(); for(index_t i = 0; i < bsa.size(); i++) { tParam.sa.push_back(bsa.nextSuffix()); } } else { tParam.rg = NULL, tParam.pg = NULL; bool exploded = false; try { tParam.rg = new RefGraph( tParam.s, tParam.conv_local_szs, tParam.alts, tParam.haplotypes, tParam.file, 1, /* num threads */ false); /* verbose? */ tParam.pg = new PathGraph( *tParam.rg, tParam.file, local_max_gbwt, 1, /* num threads */ false); /* verbose? */ } catch (const NongraphException& err) { cerr << "Warning: no variants or splice sites in this graph (" << tParam.curr_sztot << ")" << endl; delete tParam.rg; delete tParam.pg; tParam.alts.clear(); continue; } catch (const ExplosionException& err) { exploded = true; } if(!exploded) { if(!tParam.pg->generateEdges(*tParam.rg)) { cerr << "An error occurred - generateEdges" << endl; throw 1; } exploded = tParam.pg->getNumEdges() > local_max_gbwt; } if(exploded) { cerr << "Warning: a local graph exploded (offset: " << tParam.curr_sztot << ", length: " << tParam.local_sztot << ")" << endl; delete tParam.pg; tParam.pg = NULL; delete tParam.rg; tParam.rg = NULL; if(tParam.alts.size() <= 1) { tParam.alts.clear(); } else { for(index_t s = 2; s < tParam.alts.size(); s += 2) { tParam.alts[s >> 1] = tParam.alts[s]; } tParam.alts.resize(tParam.alts.size() >> 1); tParam.haplotypes.clear(); for(index_t a = 0; a < tParam.alts.size(); a++) { const ALT& alt = tParam.alts[a]; if(!alt.snp()) continue; tParam.haplotypes.expand(); tParam.haplotypes.back().left = alt.pos; if(alt.deletion()) { tParam.haplotypes.back().right = alt.pos + alt.len - 1; } else { tParam.haplotypes.back().right = alt.pos; } tParam.haplotypes.back().alts.clear(); tParam.haplotypes.back().alts.push_back(a); } } continue; } } break; } tParam.done = true; if(tParam.mainThread) break; } } /// Construct a GFM from the given header parameters and string /// vector, optionally using a blockwise suffix sorter with the /// given 'bmax' and 'dcv' parameters. The string vector is /// ultimately joined and the joined string is passed to buildToDisk(). template template HGFM::HGFM( TStr& s, bool packed, int needEntireReverse, int32_t lineRate, int32_t offRate, int32_t ftabChars, int32_t localOffRate, int32_t localFtabChars, int nthreads, const string& snpfile, const string& htfile, const string& ssfile, const string& exonfile, const string& svfile, const string& repeatfile, const string& outfile, // base filename for EBWT files bool fw, bool useBlockwise, TIndexOffU bmax, TIndexOffU bmaxSqrtMult, TIndexOffU bmaxDivN, int dcv, EList& is, EList& szs, index_t sztot, const RefReadInParams& refparams, bool localIndex, EList* parent_szs, EList* parent_refnames, uint32_t seed, int32_t overrideOffRate, bool verbose, bool passMemExc, bool sanityCheck) : GFM(s, packed, needEntireReverse, lineRate, offRate, ftabChars, nthreads, snpfile, htfile, ssfile, exonfile, svfile, repeatfile, outfile, fw, useBlockwise, bmax, bmaxSqrtMult, bmaxDivN, dcv, is, szs, sztot, refparams, parent_szs, parent_refnames, seed, overrideOffRate, verbose, passMemExc, sanityCheck), _in5(NULL), _in6(NULL) { _in5Str = outfile + ".5." + gfm_ext; _in6Str = outfile + ".6." + gfm_ext; // const bool repeat_index = (parent_szs != NULL); int32_t local_lineRate; if(snpfile == "" && ssfile == "" && exonfile == "") { local_lineRate = local_lineRate_fm; } else { local_lineRate = local_lineRate_gfm; } // Open output files ofstream fout5(_in5Str.c_str(), ios::binary); if(!fout5.good()) { cerr << "Could not open index file for writing: \"" << _in5Str.c_str() << "\"" << endl << "Please make sure the directory exists and that permissions allow writing by" << endl << "HISAT2." << endl; throw 1; } ofstream fout6(_in6Str.c_str(), ios::binary); if(!fout6.good()) { cerr << "Could not open index file for writing: \"" << _in6Str.c_str() << "\"" << endl << "Please make sure the directory exists and that permissions allow writing by" << endl << "HISAT2." << endl; throw 1; } // Split the whole genome into a set of local indexes _nrefs = 0; _nlocalGFMs = 0; index_t cumlen = 0; typedef EList EList_RefRecord; ELList all_local_recs; if(localIndex) { // For each unambiguous stretch... for(index_t i = 0; i < szs.size(); i++) { const RefRecord& rec = szs[i]; if(rec.first) { if(_nrefs > 0) { // refLens_ links each reference sequence with the total number // of ambiguous and unambiguous characters in it. _refLens.push_back(cumlen); } cumlen = 0; _nrefs++; all_local_recs.expand(); assert_eq(_nrefs, all_local_recs.size()); } else if(i == 0) { cerr << "First record in reference index file was not marked as " << "'first'" << endl; throw 1; } assert_gt(_nrefs, 0); assert_eq(_nrefs, all_local_recs.size()); EList& ref_local_recs = all_local_recs[_nrefs-1]; index_t next_cumlen = cumlen + rec.off + rec.len; index_t local_off = (cumlen / local_index_interval) * local_index_interval; if(local_off >= local_index_interval) { local_off -= local_index_interval; } for(;local_off < next_cumlen; local_off += local_index_interval) { if(local_off + local_index_size <= cumlen) { continue; } index_t local_idx = local_off / local_index_interval; if(local_idx >= ref_local_recs.size()) { assert_eq(local_idx, ref_local_recs.size()); ref_local_recs.expand(); _nlocalGFMs++; } assert_lt(local_idx, ref_local_recs.size()); EList_RefRecord& local_recs = ref_local_recs[local_idx]; assert_gt(local_off + local_index_size, cumlen); local_recs.expand(); if(local_off + local_index_size < cumlen + rec.off) { local_recs.back().off = local_off + local_index_size - std::max(local_off, cumlen); local_recs.back().len = 0; } else { if(local_off < cumlen + rec.off) { local_recs.back().off = rec.off - (local_off > cumlen ? local_off - cumlen : 0); } else { local_recs.back().off = 0; } local_recs.back().len = std::min(next_cumlen, local_off + local_index_size) - std::max(local_off, cumlen + rec.off); } local_recs.back().first = (local_recs.size() == 1); } cumlen = next_cumlen; } // Store a cap entry for the end of the last reference seq _refLens.push_back(cumlen); #ifndef NDEBUG EList temp_szs; index_t temp_sztot = 0; index_t temp_nlocalGFMs = 0; for(size_t tidx = 0; tidx < all_local_recs.size(); tidx++) { assert_lt(tidx, _refLens.size()); EList& ref_local_recs = all_local_recs[tidx]; assert_eq((_refLens[tidx] + local_index_interval - 1) / local_index_interval, ref_local_recs.size()); temp_szs.expand(); temp_szs.back().off = 0; temp_szs.back().len = 0; temp_szs.back().first = true; index_t temp_ref_len = 0; index_t temp_ref_sztot = 0; temp_nlocalGFMs += ref_local_recs.size(); for(size_t i = 0; i < ref_local_recs.size(); i++) { EList_RefRecord& local_recs = ref_local_recs[i]; index_t local_len = 0; for(size_t j = 0; j < local_recs.size(); j++) { assert(local_recs[j].off != 0 || local_recs[j].len != 0); assert(j != 0 || local_recs[j].first); RefRecord local_rec = local_recs[j]; if(local_len < local_index_interval && local_recs[j].off > 0){ if(local_len + local_recs[j].off > local_index_interval) { temp_ref_len += (local_index_interval - local_len); local_rec.off = local_index_interval - local_len; } else { temp_ref_len += local_recs[j].off; } } else { local_rec.off = 0; } local_len += local_recs[j].off; if(local_len < local_index_interval && local_recs[j].len > 0) { if(local_len + local_recs[j].len > local_index_interval) { temp_ref_len += (local_index_interval - local_len); temp_ref_sztot += (local_index_interval - local_len); local_rec.len = local_index_interval - local_len; } else { temp_ref_len += local_recs[j].len; temp_ref_sztot += local_recs[j].len; } } else { local_rec.len = 0; } local_len += local_recs[j].len; if(local_rec.off > 0) { if(temp_szs.back().len > 0) { temp_szs.expand(); temp_szs.back().off = local_rec.off; temp_szs.back().len = local_rec.len; temp_szs.back().first = false; } else { temp_szs.back().off += local_rec.off; temp_szs.back().len = local_rec.len; } } else if(local_rec.len > 0) { temp_szs.back().len += local_rec.len; } } if(i + 2 < ref_local_recs.size()) { assert_eq(local_len, local_index_size); assert_eq(temp_ref_len % local_index_interval, 0); } else if (i + 1 < ref_local_recs.size()) { assert_leq(local_len, local_index_size); assert_geq(local_len, local_index_interval); } else { assert_eq(local_len % local_index_interval, _refLens[tidx] % local_index_interval); } } assert_eq(temp_ref_len, _refLens[tidx]); temp_sztot += temp_ref_sztot; } assert_eq(temp_sztot, sztot); for(size_t i = 0; i < temp_szs.size(); i++) { assert_lt(i, szs.size()); assert_eq(temp_szs[i].off, szs[i].off); assert_eq(temp_szs[i].len, szs[i].len); assert_eq(temp_szs[i].first, szs[i].first); } assert_eq(temp_szs.size(), szs.size()); assert_eq(_nlocalGFMs, temp_nlocalGFMs); #endif } uint32_t be = this->toBe(); assert(fout5.good()); assert(fout6.good()); // const local_index_t new_localFtabChars = (repeat_index ? 4 : localFtabChars); const local_index_t new_localFtabChars = localFtabChars; // When building an Ebwt, these header parameters are known // "up-front", i.e., they can be written to disk immediately, // before we join() or buildToDisk() writeI32(fout5, 1, be); // endian hint for priamry stream writeI32(fout6, 1, be); // endian hint for secondary stream writeIndex(fout5, _nlocalGFMs, be); // number of local Ebwts writeI32(fout5, local_lineRate, be); // 2^lineRate = size in bytes of 1 line writeI32(fout5, 2, be); // not used writeI32(fout5, (int32_t)localOffRate, be); // every 2^offRate chars is "marked" writeI32(fout5, (int32_t)new_localFtabChars, be); // number of 2-bit chars used to address ftab int32_t flags = 1; if(this->_gh._entireReverse) flags |= GFM_ENTIRE_REV; writeI32(fout5, -flags, be); // BTL: chunkRate is now deprecated if(localIndex) { assert_gt(this->_nthreads, 0); AutoArray threads(this->_nthreads - 1); EList tParams; tParams.reserveExact((size_t)this->_nthreads); for(index_t t = 0; t < (index_t)this->_nthreads; t++) { tParams.expand(); tParams.back().s.clear(); tParams.back().rg = NULL; tParams.back().pg = NULL; tParams.back().file = outfile; tParams.back().done = true; tParams.back().last = false; tParams.back().dcv = 1024; tParams.back().seed = seed; if(t + 1 < (index_t)this->_nthreads) { tParams.back().mainThread = false; threads[t] = new tthread::thread(gbwt_worker, (void*)&tParams.back()); } else { tParams.back().mainThread = true; } } // build local FM indexes index_t curr_sztot = 0; EList > alts; for(size_t tidx = 0; tidx < _refLens.size(); tidx++) { index_t refLen = _refLens[tidx]; index_t local_offset = 0; _localGFMs.expand(); assert_lt(tidx, _localGFMs.size()); while(local_offset < refLen) { index_t t = 0; while(local_offset < refLen && t < (index_t)this->_nthreads) { assert_lt(t, tParams.size()); ThreadParam& tParam = tParams[t]; tParam.index_size = std::min(refLen - local_offset, local_index_size); assert_lt(tidx, all_local_recs.size()); assert_lt(local_offset / local_index_interval, all_local_recs[tidx].size()); EList_RefRecord& local_szs = all_local_recs[tidx][local_offset / local_index_interval]; tParam.conv_local_szs.clear(); index_t local_len = 0, local_sztot = 0, local_sztot_interval = 0; for(size_t i = 0; i < local_szs.size(); i++) { assert(local_szs[i].off != 0 || local_szs[i].len != 0); assert(i != 0 || local_szs[i].first); tParam.conv_local_szs.push_back(local_szs[i]); local_len += local_szs[i].off; if(local_len < local_index_interval && local_szs[i].len > 0) { if(local_len + local_szs[i].len > local_index_interval) { local_sztot_interval += (local_index_interval - local_len); } else { local_sztot_interval += local_szs[i].len; } } local_sztot += local_szs[i].len; local_len += local_szs[i].len; } // Extract sequence corresponding to this local index tParam.s.resize(local_sztot); if(refparams.reverse == REF_READ_REVERSE) { tParam.s.install(s.buf() + s.length() - curr_sztot - local_sztot, local_sztot); } else { tParam.s.install(s.buf() + curr_sztot, local_sztot); } // Extract ALTs corresponding to this local index map alt_map; tParam.alts.clear(); ALT alt; alt.pos = curr_sztot; index_t alt_i = (index_t)this->_alts.bsearchLoBound(alt); for(; alt_i < this->_alts.size(); alt_i++) { const ALT& alt = this->_alts[alt_i]; if(alt.snp()) { if(alt.mismatch()) { if(curr_sztot + local_sztot <= alt.pos) break; } else if(alt.insertion()) { if(curr_sztot + local_sztot < alt.pos) break; } else { assert(alt.deletion()); if(curr_sztot + local_sztot < alt.pos + alt.len) break; } if(curr_sztot <= alt.pos) { alt_map[alt_i] = (index_t)tParam.alts.size(); tParam.alts.push_back(alt); tParam.alts.back().pos -= curr_sztot; } } else if(alt.splicesite()) { if(alt.excluded) continue; if(curr_sztot + local_sztot <= alt.right + 1) continue; if(curr_sztot <= alt.left) { tParam.alts.push_back(alt); tParam.alts.back().left -= curr_sztot; tParam.alts.back().right -= curr_sztot; } } else { assert(alt.exon()); } } // Extract haplotypes tParam.haplotypes.clear(); Haplotype haplotype; haplotype.left = curr_sztot; index_t haplotpye_i = (index_t)this->_haplotypes.bsearchLoBound(haplotype); for(; haplotpye_i < this->_haplotypes.size(); haplotpye_i++) { const Haplotype& haplotype = this->_haplotypes[haplotpye_i]; if(curr_sztot + local_sztot <= haplotype.right) continue; if(curr_sztot <= haplotype.left) { tParam.haplotypes.push_back(haplotype); tParam.haplotypes.back().left -= curr_sztot; tParam.haplotypes.back().right -= curr_sztot; for(index_t a = 0; a < tParam.haplotypes.back().alts.size(); a++) { index_t alt_i = tParam.haplotypes.back().alts[a]; if(alt_map.find(alt_i) == alt_map.end()) { assert(false); tParam.haplotypes.pop_back(); break; } tParam.haplotypes.back().alts[a] = alt_map[alt_i]; } } } tParam.local_offset = local_offset; tParam.curr_sztot = curr_sztot; tParam.local_sztot = local_sztot; assert(tParam.rg == NULL); assert(tParam.pg == NULL); tParam.done = false; curr_sztot += local_sztot_interval; local_offset += local_index_interval; t++; } if(!tParams.back().done) { gbwt_worker((void*)&tParams.back()); } for(index_t t2 = 0; t2 < t; t2++) { ThreadParam& tParam = tParams[t2]; while(!tParam.done) { #if defined(_TTHREAD_WIN32_) Sleep(1); #elif defined(_TTHREAD_POSIX_) const static timespec ts = {0, 1000000}; // 1 millisecond nanosleep(&ts, NULL); #endif } LocalGFM( tParam.s, tParam.sa, tParam.pg, (index_t)tidx, tParam.local_offset, tParam.curr_sztot, tParam.alts, tParam.index_size, packed, needEntireReverse, local_lineRate, localOffRate, // suffix-array sampling rate new_localFtabChars, // number of chars in initial arrow-pair calc outfile, // basename for .?.ebwt files fw, // fw dcv, // difference-cover period tParam.conv_local_szs, // list of reference sizes tParam.local_sztot, // total size of all unambiguous ref chars refparams, // reference read-in parameters seed, // pseudo-random number generator seed fout5, fout6, -1, // override offRate false, // be silent passMemExc, // pass exceptions up to the toplevel so that we can adjust memory settings automatically sanityCheck); // verify results and internal consistency tParam.s.clear(); if(tParam.rg != NULL) { assert(tParam.pg != NULL); delete tParam.rg; tParam.rg = NULL; delete tParam.pg; tParam.pg = NULL; } } } } assert_eq(curr_sztot, sztot); if(this->_nthreads > 1) { for(index_t i = 0; i + 1 < (index_t)this->_nthreads; i++) { tParams[i].last = true; threads[i]->join(); } } } fout5 << '\0'; fout5.flush(); fout6.flush(); if(fout5.fail() || fout6.fail()) { cerr << "An error occurred writing the index to disk. Please check if the disk is full." << endl; throw 1; } VMSG_NL("Returning from initFromVector"); // Close output files fout5.flush(); int64_t tellpSz5 = (int64_t)fout5.tellp(); VMSG_NL("Wrote " << fout5.tellp() << " bytes to primary GFM file: " << _in5Str.c_str()); fout5.close(); bool err = false; if(tellpSz5 > fileSize(_in5Str.c_str())) { err = true; cerr << "Index is corrupt: File size for " << _in5Str.c_str() << " should have been " << tellpSz5 << " but is actually " << fileSize(_in5Str.c_str()) << "." << endl; } fout6.flush(); int64_t tellpSz6 = (int64_t)fout6.tellp(); VMSG_NL("Wrote " << fout6.tellp() << " bytes to secondary GFM file: " << _in6Str.c_str()); fout6.close(); if(tellpSz6 > fileSize(_in6Str.c_str())) { err = true; cerr << "Index is corrupt: File size for " << _in6Str.c_str() << " should have been " << tellpSz6 << " but is actually " << fileSize(_in6Str.c_str()) << "." << endl; } if(err) { cerr << "Please check if there is a problem with the disk or if disk is full." << endl; throw 1; } // Reopen as input streams VMSG_NL("Re-opening _in5 and _in5 as input streams"); if(this->_sanity) { VMSG_NL("Sanity-checking ht2"); assert(!this->isInMemory()); readIntoMemory( fw ? -1 : needEntireReverse, // 1 -> need the reverse to be reverse-of-concat true, // load SA sample (_offs[])? true, // load ftab (_ftab[] & _eftab[])? true, // load r-starts (_rstarts[])? false, // just load header? NULL, // Params object to fill false, // mm sweep? true, // load names? false); // verbose startup? sanityCheckAll(refparams.reverse); evictFromMemory(); assert(!this->isInMemory()); } VMSG_NL("Returning from HGFM constructor"); } /** * Read an Ebwt from file with given filename. */ template void HGFM::readIntoMemory( int needEntireRev, bool loadSASamp, bool loadFtab, bool loadRstarts, bool justHeader, GFMParams *params, bool mmSweep, bool loadNames, bool startVerbose) { PARENT_CLASS::readIntoMemory(needEntireRev, loadSASamp, loadFtab, loadRstarts, justHeader || needEntireRev == 1, params, mmSweep, loadNames, startVerbose); bool switchEndian; // dummy; caller doesn't care #ifdef BOWTIE_MM char *mmFile[] = { NULL, NULL }; #endif if(_in5Str.length() > 0) { if(this->_verbose || startVerbose) { cerr << " About to open input files: "; logTime(cerr); } // Initialize our primary and secondary input-stream fields if(_in5 != NULL) fclose(_in5); if(this->_verbose || startVerbose) cerr << "Opening \"" << _in5Str.c_str() << "\"" << endl; if((_in5 = fopen(_in5Str.c_str(), "rb")) == NULL) { cerr << "Could not open index file " << _in5Str.c_str() << endl; } if(loadSASamp) { if(_in6 != NULL) fclose(_in6); if(this->_verbose || startVerbose) cerr << "Opening \"" << _in6Str.c_str() << "\"" << endl; if((_in6 = fopen(_in6Str.c_str(), "rb")) == NULL) { cerr << "Could not open index file " << _in6Str.c_str() << endl; } } if(this->_verbose || startVerbose) { cerr << " Finished opening input files: "; logTime(cerr); } #ifdef BOWTIE_MM if(this->_useMm /*&& !justHeader*/) { const char *names[] = {_in5Str.c_str(), _in6Str.c_str()}; int fds[] = { fileno(_in5), fileno(_in6) }; for(int i = 0; i < (loadSASamp ? 2 : 1); i++) { if(this->_verbose || startVerbose) { cerr << " ¯ " << (i+1) << ": "; logTime(cerr); } struct stat sbuf; if (stat(names[i], &sbuf) == -1) { perror("stat"); cerr << "Error: Could not stat index file " << names[i] << " prior to memory-mapping" << endl; throw 1; } mmFile[i] = (char*)mmap((void *)0, (size_t)sbuf.st_size, PROT_READ, MAP_SHARED, fds[(size_t)i], 0); if(mmFile[i] == (void *)(-1)) { perror("mmap"); cerr << "Error: Could not memory-map the index file " << names[i] << endl; throw 1; } if(mmSweep) { int sum = 0; for(off_t j = 0; j < sbuf.st_size; j += 1024) { sum += (int) mmFile[i][j]; } if(startVerbose) { cerr << " Swept the memory-mapped ebwt index file 1; checksum: " << sum << ": "; logTime(cerr); } } } mmFile5_ = mmFile[0]; mmFile6_ = loadSASamp ? mmFile[1] : NULL; } #endif } #ifdef BOWTIE_MM else if(this->_useMm && !justHeader) { mmFile[0] = mmFile5_; mmFile[1] = mmFile6_; } if(this->_useMm && !justHeader) { assert(mmFile[0] == mmFile5_); assert(mmFile[1] == mmFile6_); } #endif if(this->_verbose || startVerbose) { cerr << " Reading header: "; logTime(cerr); } // Read endianness hints from both streams size_t bytesRead = 0, bytesRead2 = 4; switchEndian = false; uint32_t one = readU32(_in5, switchEndian); // 1st word of primary stream bytesRead += 4; if(loadSASamp) { #ifndef NDEBUG assert_eq(one, readU32(_in6, switchEndian)); // should match! #else readU32(_in6, switchEndian); #endif } if(one != 1) { assert_eq((1u<<24), one); assert_eq(1, endianSwapU32(one)); switchEndian = true; } // Can't switch endianness and use memory-mapped files; in order to // support this, someone has to modify the file to switch // endiannesses appropriately, and we can't do this inside Bowtie // or we might be setting up a race condition with other processes. if(switchEndian && this->_useMm) { cerr << "Error: Can't use memory-mapped files when the index is the opposite endianness" << endl; throw 1; } _nlocalGFMs = readIndex(_in5, switchEndian); bytesRead += sizeof(index_t); int32_t lineRate = readI32(_in5, switchEndian); bytesRead += 4; readI32(_in5, switchEndian); bytesRead += 4; int32_t offRate = readI32(_in5, switchEndian); bytesRead += 4; // TODO: add isaRate to the actual file format (right now, the // user has to tell us whether there's an ISA sample and what the // sampling rate is. int32_t ftabChars = readI32(_in5, switchEndian); bytesRead += 4; /*int32_t flag =*/ readI32(_in5, switchEndian); bytesRead += 4; if(this->_verbose || startVerbose) { cerr << " number of local indexes: " << _nlocalGFMs << endl << " local offRate: " << offRate << endl << " local ftabLen: " << (1 << (2 * ftabChars)) << endl << " local ftabSz: " << (2 << (2 * ftabChars)) << endl ; } clearLocalGFMs(); index_t tidx = 0, localOffset = 0, joinedOffset = 0; string base = ""; for(size_t i = 0; i < _nlocalGFMs; i++) { LocalGFM *localGFM = new LocalGFM(base, NULL, _in5, _in6, mmFile5_, mmFile6_, tidx, localOffset, joinedOffset, switchEndian, bytesRead, bytesRead2, needEntireRev, this->fw_, -1, // overrideOffRate -1, // offRatePlus (uint32_t)lineRate, (uint32_t)offRate, (uint32_t)ftabChars, this->_useMm, this->useShmem_, mmSweep, loadNames, loadSASamp, loadFtab, loadRstarts, false, // _verbose false, this->_passMemExc, this->_sanity, false); // use haplotypes? if(tidx >= _localGFMs.size()) { assert_eq(tidx, _localGFMs.size()); _localGFMs.expand(); } assert_eq(tidx + 1, _localGFMs.size()); _localGFMs.back().push_back(localGFM); } #ifdef BOWTIE_MM fseek(_in5, 0, SEEK_SET); fseek(_in6, 0, SEEK_SET); #else rewind(_in5); rewind(_in6); #endif } #endif /*HGFM_H_*/