hisat-3n/rfm.h

/*
 * Copyright 2018, Chanhee Park <parkchanhee@gmail.com> and Daehwan Kim <infphilo@gmail.com>
 *
 * This file is part of HISAT 2.
 *
 * HISAT 2 is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HISAT 2 is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HISAT 2.  If not, see <http://www.gnu.org/licenses/>.
 */

#ifndef RFM_H_
#define RFM_H_

#include "hgfm.h"

/**
 * Extended Burrows-Wheeler transform data.
 * LocalEbwt is a specialized Ebwt index that represents ~64K bps
 * and therefore uses two bytes as offsets within 64K bps.
 * This class has only two additional member variables to denote the genomic sequenuce it represents:
 * (1) the contig index and (2) the offset within the contig.
 *
 */
template <typename index_t>
class LocalRFM : public GFM<index_t> {
	typedef GFM<index_t> PARENT_CLASS;
public:
	/// Construct an Ebwt from the given input file
	LocalRFM(const string& in,
             ALTDB<index_t>* altdb,
             FILE *in1,
             FILE *in2,
             char *mmFile1,
             char *mmFile2,
             bool switchEndian,
             size_t& bytesRead,
             size_t& bytesRead2,
             int needEntireReverse,
             bool fw,
             int32_t overrideOffRate, // = -1,
             int32_t offRatePlus, // = -1,
             bool useMm, // = false,
             bool useShmem, // = false,
             bool mmSweep, // = false,
             bool loadNames, // = false,
             bool loadSASamp, // = true,
             bool loadFtab, // = true,
             bool loadRstarts, // = true,
             bool verbose, // = false,
             bool startVerbose, // = false,
             bool passMemExc, // = false,
             bool sanityCheck, // = false)
             bool useHaplotype) : // = false
	GFM<index_t>(in,
                 altdb,
                 NULL,
                 NULL,
                 needEntireReverse,
                 fw,
                 overrideOffRate,
                 offRatePlus,
                 useMm,
                 useShmem,
                 mmSweep,
                 loadNames,
                 loadSASamp,
                 loadFtab,
                 loadRstarts,
                 true, // load Splice Sites
                 verbose,
                 startVerbose,
                 passMemExc,
                 sanityCheck,
                 useHaplotype,
                 true)
	{
        this->_in1 = in1;
        this->_in2 = in2;

        this->_repeat = true;
        bool subIndex = true;
        PARENT_CLASS::readIntoMemory(
                                     needEntireReverse,
                                     loadSASamp,
                                     loadFtab,
                                     loadRstarts,
                                     false,              //justHeader
                                     &(this->_gh),
                                     mmSweep,
                                     loadNames,
                                     startVerbose,
                                     subIndex);

		// If the offRate has been overridden, reflect that in the
		// _eh._offRate field
		if(offRatePlus > 0 && this->_overrideOffRate == -1) {
			this->_overrideOffRate = this->_gh._offRate + offRatePlus;
		}
		if(this->_overrideOffRate > this->_gh._offRate) {
			this->_gh.setOffRate(this->_overrideOffRate);
			assert_eq(this->_overrideOffRate, this->_gh._offRate);
		}
		assert(this->repOk());
	}


	/// Construct an Ebwt from the given header parameters and string
	/// vector, optionally using a blockwise suffix sorter with the
	/// given 'bmax' and 'dcv' parameters.  The string vector is
	/// ultimately joined and the joined string is passed to buildToDisk().
	template<typename TStr>
	LocalRFM(
             TStr& s,
             InorderBlockwiseSA<TStr>* sa,
             PathGraph<index_t>* pg,
             EList<ALT<index_t> >& alts,
             index_t local_size,
             const EList<string>& refnames,
             bool packed,
             int needEntireReverse,
             int32_t lineRate,
             int32_t offRate,
             int32_t ftabChars,
             const string& file,   // base filename for EBWT files
             bool fw,
             int dcv,
             EList<RefRecord>& szs,
             index_t sztot,
             const RefReadInParams& refparams,
             uint32_t seed,
             ostream& out1,
             ostream& out2,
             int32_t overrideOffRate = -1,
             bool verbose = false,
             bool passMemExc = false,
             bool sanityCheck = false) :
	GFM<index_t>(packed,
                 needEntireReverse,
                 lineRate,
                 offRate,
                 ftabChars,
                 file,
                 fw,
                 dcv,
                 szs,
                 sztot,
                 refparams,
                 seed,
                 overrideOffRate,
                 verbose,
                 passMemExc,
                 sanityCheck)
	{
		const GFMParams<index_t>& gh = this->_gh;
		assert(gh.repOk());
		uint32_t be = this->toBe();
		assert(out1.good());
		assert(out2.good());
        assert_eq(refnames.size(), 1);
        this->_refnames.push_back(refnames[0]);

		writeIndex<index_t>(out1, gh._len, be); // length of string (and bwt and suffix array)
        streampos headerPos = out1.tellp();
        writeIndex<index_t>(out1, 0, be); // gbwtLen
        writeIndex<index_t>(out1, 0, be); // num of nodes
        writeI32(out1, lineRate, be);
        writeI32(out1, 0, be);
        writeI32(out1, offRate, be);
        writeI32(out1, ftabChars, be);
        writeIndex<index_t>(out1, 0, be); // eftabLen
        writeI32(out1, 0, be); // flag
		if(gh._len > 0) {
			assert_gt(szs.size(), 0);
			assert_gt(sztot, 0);
			// Not every fragment represents a distinct sequence - many
			// fragments may correspond to a single sequence.  Count the
			// number of sequences here by counting the number of "first"
			// fragments.
			this->_nPat = 0;
			this->_nFrag = 0;
			for(size_t i = 0; i < szs.size(); i++) {
				if(szs[i].len > 0) this->_nFrag++;
				if(szs[i].first && szs[i].len > 0) this->_nPat++;
			}
			assert_eq(this->_nPat, 1);
			assert_geq(this->_nFrag, this->_nPat);
			this->_rstarts.reset();
			writeIndex(out1, this->_nPat, be);
			assert_eq(this->_nPat, 1);
			this->_plen.init(new index_t[this->_nPat], this->_nPat);
			// For each pattern, set plen
			int npat = -1;
			for(size_t i = 0; i < szs.size(); i++) {
				if(szs[i].first && szs[i].len > 0) {
					if(npat >= 0) {
						writeIndex(out1, this->plen()[npat], be);
					}
					npat++;
					this->plen()[npat] = (szs[i].len + szs[i].off);
				} else {
					this->plen()[npat] += (szs[i].len + szs[i].off);
				}
			}
			assert_eq((index_t)npat, this->_nPat-1);
			writeIndex(out1, this->plen()[npat], be);
			// Write the number of fragments
			writeIndex(out1, this->_nFrag, be);

			if(refparams.reverse == REF_READ_REVERSE) {
				EList<RefRecord> tmp(EBWT_CAT);
                reverseRefRecords(szs, tmp, false, verbose);
				this->szsToDisk(tmp, out1, refparams.reverse);
			} else {
				this->szsToDisk(szs, out1, refparams.reverse);
			}

            if(alts.empty()) {
                assert(pg == NULL);
                PARENT_CLASS::buildToDisk(*sa, s, out1, out2, headerPos);
            } else {
                assert(pg != NULL);
                // Re-initialize GFM parameters to reflect real number of edges (gbwt string)
                this->_gh.init(
                               this->_gh.len(),
                               pg->getNumEdges(),
                               pg->getNumNodes(),
                               this->_gh.lineRate(),
                               this->_gh.offRate(),
                               this->_gh.ftabChars(),
                               0,
                               this->_gh.entireReverse());
                PARENT_CLASS::buildToDisk(*pg, s, out1, out2, headerPos);
            }
		}

        // Now write reference sequence names on the end
        assert_eq(this->_refnames.size(), this->_nPat);
        for(index_t i = 0; i < this->_refnames.size(); i++) {
            out1 << this->_refnames[i].c_str();
            if(i + 1 < this->_refnames.size()) cout << endl;
            else out1 << '\0';
        }
		out1.flush(); out2.flush();
		if(out1.fail() || out2.fail()) {
			cerr << "An error occurred writing the index to disk.  Please check if the disk is full." << endl;
			throw 1;
		}
	}

	/**
	 * Sanity-check various pieces of the Ebwt
	 */
	void sanityCheckAll(int reverse) const {
		if(this->_gh._len > 0) {
			PARENT_CLASS::sanityCheckAll(reverse);
		}
	}

    bool empty() const { return this->_gh._len == 0; }
};

/**
 * Extended Burrows-Wheeler transform data.
 * RFM is a specialized Ebwt index that represents one global index and a large set of local indexes.
 *
 */
template <typename index_t = uint32_t>
class RFM : public GFM<index_t> {
	typedef GFM<index_t> PARENT_CLASS;
public:
	/// Construct a GFM from the given input file
	RFM(const string& in,
        ALTDB<index_t>* altdb,
        RepeatDB<index_t>* repeatdb,
        EList<size_t>* readLens,
        int needEntireReverse,
        bool fw,
        int32_t overrideOffRate, // = -1,
        int32_t offRatePlus, // = -1,
        bool useMm, // = false,
        bool useShmem, // = false,
        bool mmSweep, // = false,
        bool loadNames, // = false,
        bool loadSASamp, // = true,
        bool loadFtab, // = true,
        bool loadRstarts, // = true,
        bool loadSpliceSites, // = true,
        bool verbose, // = false,
        bool startVerbose, // = false,
        bool passMemExc, // = false,
        bool sanityCheck, // = false
        bool useHaplotype, // = false
        bool skipLoading = false) :
    GFM<index_t>(in,
                 altdb,
                 repeatdb,
                 readLens,
                 needEntireReverse,
                 fw,
                 overrideOffRate,
                 offRatePlus,
                 useMm,
                 useShmem,
                 mmSweep,
                 loadNames,
                 loadSASamp,
                 loadFtab,
                 loadRstarts,
                 loadSpliceSites,
                 verbose,
                 startVerbose,
                 passMemExc,
                 sanityCheck,
                 useHaplotype,
                 skipLoading),
    _in1(NULL),
    _in2(NULL)
    {
        _in1Str = in + ".1." + gfm_ext;
        _in2Str = in + ".2." + gfm_ext;
    }

	/// Construct a HGFM from the given header parameters and string
	/// vector, optionally using a blockwise suffix sorter with the
	/// given 'bmax' and 'dcv' parameters.  The string vector is
	/// ultimately joined and the joined string is passed to buildToDisk().
	template<typename TStr>
	RFM(
         TStr& s,
         bool packed,
         int needEntireReverse,
         int32_t lineRate,
         int32_t offRate,
         int32_t ftabChars,
         int32_t localOffRate,
         int32_t localFtabChars,
         int nthreads,
         const string& snpfile,
         const string& htfile,
         const string& ssfile,
         const string& exonfile,
         const string& svfile,
         const string& repeatfile,
         const string& outfile,   // base filename for GFM files
         bool fw,
         bool useBlockwise,
         TIndexOffU bmax,
         TIndexOffU bmaxSqrtMult,
         TIndexOffU bmaxDivN,
         int dcv,
         EList<FileBuf*>& is,
         EList<RefRecord>& szs,
         index_t sztot,
         const RefReadInParams& refparams,
         bool localIndex, // create local indexes?
         EList<RefRecord>* parent_szs,
         EList<string>* parent_refnames,
         uint32_t seed,
         int32_t overrideOffRate = -1,
         bool verbose = false,
         bool passMemExc = false,
         bool sanityCheck = false);

	RFM() {
        clearLocalRFMs();
	}

    /**
	 * Load this Ebwt into memory by reading it in from the _in1 and
	 * _in2 streams.
	 */
	void loadIntoMemory(
                        int needEntireReverse,
                        bool loadSASamp,
                        bool loadFtab,
                        bool loadRstarts,
                        bool loadNames,
                        bool verbose)
	{
		readIntoMemory(
                       needEntireReverse, // require reverse index to be concatenated reference reversed
                       loadSASamp,  // load the SA sample portion?
                       loadFtab,    // load the ftab (_ftab[] and _eftab[])?
                       loadRstarts, // load the r-starts (_rstarts[])?
                       false,       // stop after loading the header portion?
                       NULL,        // params
                       false,       // mmSweep
                       loadNames,   // loadNames
                       verbose);    // startVerbose
	}

	// I/O
	void readIntoMemory(
                        int needEntireRev,
                        bool loadSASamp,
                        bool loadFtab,
                        bool loadRstarts,
                        bool justHeader,
                        GFMParams<index_t> *params,
                        bool mmSweep,
                        bool loadNames,
                        bool startVerbose);

	/**
	 * Frees memory associated with the Ebwt.
	 */
	void evictFromMemory() {
		assert(PARENT_CLASS::isInMemory());
		clearLocalRFMs();
		PARENT_CLASS::evictFromMemory();
	}

	/**
	 * Sanity-check various pieces of the Ebwt
	 */
	void sanityCheckAll(int reverse) const {
		PARENT_CLASS::sanityCheckAll(reverse);
		for(size_t i = 0; i < _localRFMs.size(); i++) {
            assert(_localRFMs[i] != NULL);
            _localRFMs[i]->sanityCheckAll(reverse);
		}
	}

    void getReferenceNames(EList<string>& refnames) {
        for(size_t i = 0; i < _localRFMs.size(); i++) {
            assert_eq(_localRFMs[i]->refnames().size(), 1);
            refnames.push_back(_localRFMs[i]->refnames()[0]);
        }
    }

    void getReferenceLens(EList<size_t>& reflens) {
        for(size_t i = 0; i < _localRFMs.size(); i++) {
            reflens.push_back(_localRFMs[i]->plen()[0]);
        }
    }

    index_t getLocalRFM_idx(index_t readLen) {
        for(size_t i = 0; i < this->_repeatLens.size(); i++) {
            if(this->_repeatLens[i].first >= readLen) {
                return i;
            }
        }
        return this->_repeatLens.size() - 1;
    }

    index_t getLocalRFM_idx(const char *refname) {
        for(size_t i = 0; i < this->_repeatLens.size(); i++) {
            assert_eq(_localRFMs[i]->refnames().size(), 1);

            string& ref = _localRFMs[i]->refnames()[0];
            if(ref.compare(refname) == 0) {
                return i;
            }
        }
        return this->_repeatLens.size() - 1;
    }

    bool empty() const { return _localRFMs.empty(); }

    LocalRFM<index_t>& getLocalRFM(index_t idx) {
        assert_lt(idx, this->_repeatLens.size());
        return *_localRFMs[idx];
    }

    RB_KmerTable& getKmertable(index_t idx) {
        assert_lt(idx, this->_repeat_kmertables.size());
        return this->_repeat_kmertables[idx];
    }

    void clearLocalRFMs() {
		for(size_t i = 0; i < _localRFMs.size(); i++) {
            assert(_localRFMs[i] != NULL);
            delete _localRFMs[i];
        }
		_localRFMs.clear();
	}


public:
	EList<index_t>              _refLens;    /// approx lens of ref seqs (excludes trailing ambig chars)
	EList<LocalRFM<index_t>*>   _localRFMs;

    EList<pair<streampos, streampos> >  _localRFMFilePos;

	FILE                        *_in1;    // input fd for primary index file
	FILE                        *_in2;    // input fd for secondary index file
	string                      _in1Str;
	string                      _in2Str;

	char                        *mmFile1_;
	char                        *mmFile2_;

private:
    struct WorkerParam {
        // input
        SString<char>                s;
        EList<ALT<index_t> >         alts;
        EList<Haplotype<index_t> >   haplotypes;
        bool                         bigEndian;
        index_t                      local_offset;
        index_t                      curr_sztot;
        EList<RefRecord>             conv_local_szs;
        index_t                      local_sztot;
        index_t                      index_size;
        string                       file;
        InorderBlockwiseSA<SString<char> >* sa;
        index_t                      dcv;
        index_t                      seed;
        EList<string>                refnames;

        // output
        RefGraph<index_t>*           rg;
        PathGraph<index_t>*          pg;

        int                          threads;
    };
    static void build_worker(void* vp);
};


template <typename index_t>
void RFM<index_t>::build_worker(void* vp)
{
    WorkerParam& tParam = *(WorkerParam*)vp;
    if(tParam.alts.empty()) {
        tParam.sa = NULL;
        tParam.sa = new KarkkainenBlockwiseSA<SString<char> >(
                                                              tParam.s,
                                                              (index_t)(tParam.s.length()+1),
                                                              tParam.threads,
                                                              tParam.dcv,
                                                              tParam.seed,
                                                              false,  /* this->_sanity */
                                                              false,  /* this->_passMemExc */
                                                              false); /* this->_verbose */
        assert(tParam.sa->suffixItrIsReset());
        assert_eq(tParam.sa->size(), tParam.s.length()+1);
    } else {
        while(true) {
            tParam.rg = NULL, tParam.pg = NULL;
            bool exploded = false;
            try {
                tParam.rg = new RefGraph<index_t>(
                                                  tParam.s,
                                                  tParam.conv_local_szs,
                                                  tParam.alts,
                                                  tParam.haplotypes,
                                                  tParam.file,
                                                  tParam.threads,        /* num threads */
                                                  false);   /* verbose? */
                tParam.pg = new PathGraph<index_t>(
                                                   *tParam.rg,
                                                   tParam.file,
                                                   local_max_gbwt,
                                                   1,         /* num threads */
                                                   false);    /* verbose? */
            } catch (const NongraphException& err) {
                cerr << "Warning: no variants or splice sites in this graph (" << tParam.curr_sztot << ")" << endl;
                delete tParam.rg;
                delete tParam.pg;
                tParam.alts.clear();
                continue;
            } catch (const ExplosionException& err) {
                exploded = true;
            }

            if(!exploded) {
                if(!tParam.pg->generateEdges(*tParam.rg)) {
                    cerr << "An error occurred - generateEdges" << endl;
                    throw 1;
                }
                exploded = tParam.pg->getNumEdges() > local_max_gbwt;
            }
            if(exploded) {
                cerr << "Warning: a local graph exploded (offset: " << tParam.curr_sztot << ", length: " << tParam.local_sztot << ")" << endl;

                delete tParam.pg; tParam.pg = NULL;
                delete tParam.rg; tParam.rg = NULL;
                if(tParam.alts.size() <= 1) {
                    tParam.alts.clear();
                } else {
                    for(index_t s = 2; s < tParam.alts.size(); s += 2) {
                        tParam.alts[s >> 1] = tParam.alts[s];
                    }
                    tParam.alts.resize(tParam.alts.size() >> 1);
                    tParam.haplotypes.clear();
                    for(index_t a = 0; a < tParam.alts.size(); a++) {
                        const ALT<index_t>& alt = tParam.alts[a];
                        if(!alt.snp()) continue;
                        tParam.haplotypes.expand();
                        tParam.haplotypes.back().left = alt.pos;
                        if(alt.deletion()) {
                            tParam.haplotypes.back().right = alt.pos + alt.len - 1;
                        } else {
                            tParam.haplotypes.back().right = alt.pos;
                        }
                        tParam.haplotypes.back().alts.clear();
                        tParam.haplotypes.back().alts.push_back(a);
                    }
                }
                continue;
            }

            break;
        }
    }
}

/// Construct a GFM from the given header parameters and string
/// vector, optionally using a blockwise suffix sorter with the
/// given 'bmax' and 'dcv' parameters.  The string vector is
/// ultimately joined and the joined string is passed to buildToDisk().
template <typename index_t>
template <typename TStr>
RFM<index_t>::RFM(
                  TStr& s,
                  bool packed,
                  int needEntireReverse,
                  int32_t lineRate,
                  int32_t offRate,
                  int32_t ftabChars,
                  int32_t localOffRate,
                  int32_t localFtabChars,
                  int nthreads,
                  const string& snpfile,
                  const string& htfile,
                  const string& ssfile,
                  const string& exonfile,
                  const string& svfile,
                  const string& repeatfile,
                  const string& outfile,   // base filename for EBWT files
                  bool fw,
                  bool useBlockwise,
                  TIndexOffU bmax,
                  TIndexOffU bmaxSqrtMult,
                  TIndexOffU bmaxDivN,
                  int dcv,
                  EList<FileBuf*>& is,
                  EList<RefRecord>& szs,
                  index_t sztot,
                  const RefReadInParams& refparams,
                  bool localIndex,
                  EList<RefRecord>* parent_szs,
                  EList<string>* parent_refnames,
                  uint32_t seed,
                  int32_t overrideOffRate,
                  bool verbose,
                  bool passMemExc,
                  bool sanityCheck) :
    GFM<index_t>(s,
                 packed,
                 needEntireReverse,
                 lineRate,
                 offRate,
                 ftabChars,
                 nthreads,
                 snpfile,
                 htfile,
                 ssfile,
                 exonfile,
                 svfile,
                 repeatfile,
                 outfile,
                 fw,
                 useBlockwise,
                 bmax,
                 bmaxSqrtMult,
                 bmaxDivN,
                 dcv,
                 is,
                 szs,
                 sztot,
                 refparams,
                 parent_szs,
                 parent_refnames,
                 seed,
                 overrideOffRate,
                 verbose,
                 passMemExc,
                 sanityCheck),
    _in1(NULL),
    _in2(NULL)
{
    _in1Str = outfile + ".1." + gfm_ext;
    _in2Str = outfile + ".2." + gfm_ext;

    // Open output files
    ofstream fout1(_in1Str.c_str(), ios::binary);
    if(!fout1.good()) {
        cerr << "Could not open index file for writing: \"" << _in1Str.c_str() << "\"" << endl
        << "Please make sure the directory exists and that permissions allow writing by" << endl
        << "HISAT2." << endl;
        throw 1;
    }
    ofstream fout2(_in2Str.c_str(), ios::binary);
    if(!fout2.good()) {
        cerr << "Could not open index file for writing: \"" << _in2Str.c_str() << "\"" << endl
        << "Please make sure the directory exists and that permissions allow writing by" << endl
        << "HISAT2." << endl;
        throw 1;
    }

    // Split the whole genome into a set of local indexes
    uint32_t be = this->toBe();
    assert(fout1.good());
    assert(fout2.good());

    // When building an Ebwt, these header parameters are known
    // "up-front", i.e., they can be written to disk immediately,
    // before we join() or buildToDisk()
    writeI32(fout1, 1, be); // endian hint for priamry stream
    writeI32(fout2, 1, be); // endian hint for secondary stream
    int version = GFM<index_t>::getIndexVersion();
    writeI32(fout1, version, be); // version
    index_t nLocalRFMs = this->_repeatLens.size();
    writeIndex<index_t>(fout1, nLocalRFMs, be); // number of local Ebwts
    for(size_t i = 0; i < this->_repeatLens.size(); i++) {
        writeIndex<index_t>(fout1, this->_repeatLens[i].first, be);
        writeIndex<index_t>(fout1, this->_repeatLens[i].second, be);
    }
    streampos filepos = fout1.tellp();
    _localRFMFilePos.resizeExact(szs.size());
    for(size_t i = 0; i < _localRFMFilePos.size(); i++) {
        writeIndex<uint64_t>(fout1, 0, be);
        writeIndex<uint64_t>(fout1, 0, be);
    }

    assert_gt(this->_nthreads, 0);
    WorkerParam tParam;
    tParam.rg = NULL;
    tParam.pg = NULL;
    tParam.sa = NULL;
    tParam.file = outfile;
    tParam.dcv = 1024;
    tParam.seed = seed;
    tParam.threads = this->_nthreads;

    // build local FM indexes
    assert_eq(szs.size(), this->_refnames.size());
    index_t curr_sztot = 0;
    EList<ALT<index_t> > alts;
    for(size_t tidx = 0; tidx < szs.size(); tidx++) {
        assert(szs[tidx].first);
        index_t refLen = szs[tidx].len;
        _localRFMs.expand();
        assert_lt(tidx, _localRFMs.size());

        tParam.index_size = refLen;
        tParam.conv_local_szs.clear();
        tParam.conv_local_szs.push_back(szs[tidx]);
        tParam.refnames.clear();
        tParam.refnames.push_back(this->_refnames[tidx]);

        // Extract sequence corresponding to this local index
        tParam.s.resize(refLen);
        if(refparams.reverse == REF_READ_REVERSE) {
            tParam.s.install(s.buf() + s.length() - curr_sztot - refLen, refLen);
        } else {
            tParam.s.install(s.buf() + curr_sztot, refLen);
        }

#if 0
        // Extract ALTs corresponding to this local index
        map<index_t, index_t> alt_map;
        tParam.alts.clear();
        ALT<index_t> alt;
        alt.pos = curr_sztot;
        index_t alt_i = (index_t)this->_alts.bsearchLoBound(alt);
        for(; alt_i < this->_alts.size(); alt_i++) {
            const ALT<index_t>& alt = this->_alts[alt_i];
            if(alt.snp()) {
                if(alt.mismatch()) {
                    if(curr_sztot + local_sztot <= alt.pos) break;
                } else if(alt.insertion()) {
                    if(curr_sztot + local_sztot < alt.pos) break;
                } else {
                    assert(alt.deletion());
                    if(curr_sztot + local_sztot < alt.pos + alt.len) break;
                }
                if(curr_sztot <= alt.pos) {
                    alt_map[alt_i] = (index_t)tParam.alts.size();
                    tParam.alts.push_back(alt);
                    tParam.alts.back().pos -= curr_sztot;
                }
            } else if(alt.splicesite()) {
                if(alt.excluded) continue;
                if(curr_sztot + local_sztot <= alt.right + 1) continue;
                if(curr_sztot <= alt.left) {
                    tParam.alts.push_back(alt);
                    tParam.alts.back().left -= curr_sztot;
                    tParam.alts.back().right -= curr_sztot;
                }
            } else {
                assert(alt.exon());
            }
        }

        // Extract haplotypes
        tParam.haplotypes.clear();
        Haplotype<index_t> haplotype;
        haplotype.left = curr_sztot;
        index_t haplotpye_i = (index_t)this->_haplotypes.bsearchLoBound(haplotype);
        for(; haplotpye_i < this->_haplotypes.size(); haplotpye_i++) {
            const Haplotype<index_t>& haplotype = this->_haplotypes[haplotpye_i];
            if(curr_sztot + local_sztot <= haplotype.right) continue;
            if(curr_sztot <= haplotype.left) {
                tParam.haplotypes.push_back(haplotype);
                tParam.haplotypes.back().left -= curr_sztot;
                tParam.haplotypes.back().right -= curr_sztot;
                for(index_t a = 0; a < tParam.haplotypes.back().alts.size(); a++) {
                    index_t alt_i = tParam.haplotypes.back().alts[a];
                    if(alt_map.find(alt_i) == alt_map.end()) {
                        assert(false);
                        tParam.haplotypes.pop_back();
                        break;
                    }
                    tParam.haplotypes.back().alts[a] = alt_map[alt_i];
                }
            }
        }
#endif

        tParam.local_offset = 0;
        tParam.curr_sztot = curr_sztot;
        tParam.local_sztot = refLen;

        assert(tParam.rg == NULL);
        assert(tParam.pg == NULL);
        assert(tParam.sa == NULL);
        curr_sztot += refLen;

        build_worker(&tParam);

        LocalRFM<index_t>(
                          tParam.s,
                          tParam.sa,
                          tParam.pg,
                          tParam.alts,
                          tParam.index_size,
                          tParam.refnames,
                          packed,
                          needEntireReverse,
                          lineRate,
                          offRate,               // suffix-array sampling rate
                          ftabChars,             // number of chars in initial arrow-pair calc
                          outfile,               // basename for .?.ebwt files
                          fw,                    // fw
                          dcv,                   // difference-cover period
                          tParam.conv_local_szs, // list of reference sizes
                          tParam.local_sztot,    // total size of all unambiguous ref chars
                          refparams,             // reference read-in parameters
                          seed,                  // pseudo-random number generator seed
                          fout1,
                          fout2,
                          -1,                    // override offRate
                          false,                 // be silent
                          passMemExc,            // pass exceptions up to the toplevel so that we can adjust memory settings automatically
                          sanityCheck);          // verify results and internal consistency
        if(tParam.rg != NULL) {
            assert(tParam.pg != NULL);
            delete tParam.rg; tParam.rg = NULL;
            delete tParam.pg; tParam.pg = NULL;
        }
        if(tParam.sa != NULL) {
            delete tParam.sa; tParam.sa = NULL;
        }

        _localRFMFilePos[tidx].first = fout1.tellp();
        _localRFMFilePos[tidx].second = fout2.tellp();
    }
    assert_eq(curr_sztot, sztot);

    streampos origpos = fout1.tellp();
    fout1.seekp(filepos);
    for(size_t i = 0; i < _localRFMFilePos.size(); i++) {
        writeIndex<uint64_t>(fout1, _localRFMFilePos[i].first, be);
        writeIndex<uint64_t>(fout1, _localRFMFilePos[i].second, be);
    }
    fout1.seekp(origpos);

    fout1 << '\0';
    fout1.flush(); fout2.flush();
    if(fout1.fail() || fout2.fail()) {
        cerr << "An error occurred writing the index to disk.  Please check if the disk is full." << endl;
        throw 1;
    }
    VMSG_NL("Returning from initFromVector");

    // Close output files
    fout1.flush();
    int64_t tellpSz1 = (int64_t)fout1.tellp();
    VMSG_NL("Wrote " << fout1.tellp() << " bytes to primary GFM file: " << _in1Str.c_str());
    fout1.close();
    bool err = false;
    if(tellpSz1 > fileSize(_in1Str.c_str())) {
        err = true;
        cerr << "Index is corrupt: File size for " << _in1Str.c_str() << " should have been " << tellpSz1
        << " but is actually " << fileSize(_in1Str.c_str()) << "." << endl;
    }
    fout2.flush();
    int64_t tellpSz2 = (int64_t)fout2.tellp();
    VMSG_NL("Wrote " << fout2.tellp() << " bytes to secondary GFM file: " << _in2Str.c_str());
    fout2.close();
    if(tellpSz2 > fileSize(_in2Str.c_str())) {
        err = true;
        cerr << "Index is corrupt: File size for " << _in2Str.c_str() << " should have been " << tellpSz2
        << " but is actually " << fileSize(_in2Str.c_str()) << "." << endl;
    }
    if(err) {
        cerr << "Please check if there is a problem with the disk or if disk is full." << endl;
        throw 1;
    }
    // Reopen as input streams
    VMSG_NL("Re-opening _in1 and _in2 as input streams");
    if(this->_sanity) {
        VMSG_NL("Sanity-checking ht2");
        assert(!this->isInMemory());
        readIntoMemory(
                       fw ? -1 : needEntireReverse, // 1 -> need the reverse to be reverse-of-concat
                       true,                        // load SA sample (_offs[])?
                       true,                        // load ftab (_ftab[] & _eftab[])?
                       true,                        // load r-starts (_rstarts[])?
                       false,                       // just load header?
                       NULL,                        // Params object to fill
                       false,                       // mm sweep?
                       true,                        // load names?
                       false);                      // verbose startup?
        sanityCheckAll(refparams.reverse);
        evictFromMemory();
        assert(!this->isInMemory());
    }
    VMSG_NL("Returning from HGFM constructor");
}


/**
 * Read an Ebwt from file with given filename.
 */
template <typename index_t>
void RFM<index_t>::readIntoMemory(
                                  int needEntireRev,
                                  bool loadSASamp,
                                  bool loadFtab,
                                  bool loadRstarts,
                                  bool justHeader,
                                  GFMParams<index_t> *params,
                                  bool mmSweep,
                                  bool loadNames,
                                  bool startVerbose)
{
    bool switchEndian; // dummy; caller doesn't care
#ifdef BOWTIE_MM
	char *mmFile[] = { NULL, NULL };
#endif
	if(_in1Str.length() > 0) {
		if(this->_verbose || startVerbose) {
			cerr << "  About to open input files: ";
			logTime(cerr);
		}
        // Initialize our primary and secondary input-stream fields
		if(_in1 != NULL) fclose(_in1);
		if(this->_verbose || startVerbose) cerr << "Opening \"" << _in1Str.c_str() << "\"" << endl;
		if((_in1 = fopen(_in1Str.c_str(), "rb")) == NULL) {
			cerr << "Could not open index file " << _in1Str.c_str() << endl;
		}
		if(loadSASamp) {
			if(_in2 != NULL) fclose(_in2);
			if(this->_verbose || startVerbose) cerr << "Opening \"" << _in2Str.c_str() << "\"" << endl;
			if((_in2 = fopen(_in2Str.c_str(), "rb")) == NULL) {
				cerr << "Could not open index file " << _in2Str.c_str() << endl;
			}
		}
		if(this->_verbose || startVerbose) {
			cerr << "  Finished opening input files: ";
			logTime(cerr);
		}

#ifdef BOWTIE_MM
		if(this->_useMm /*&& !justHeader*/) {
			const char *names[] = {_in1Str.c_str(), _in2Str.c_str()};
            int fds[] = { fileno(_in1), fileno(_in2) };
			for(int i = 0; i < (loadSASamp ? 2 : 1); i++) {
				if(this->_verbose || startVerbose) {
					cerr << "  ¯ " << (i+1) << ": ";
					logTime(cerr);
				}
				struct stat sbuf;
				if (stat(names[i], &sbuf) == -1) {
					perror("stat");
					cerr << "Error: Could not stat index file " << names[i] << " prior to memory-mapping" << endl;
					throw 1;
				}
                mmFile[i] = (char*)mmap((void *)0, (size_t)sbuf.st_size,
										PROT_READ, MAP_SHARED, fds[(size_t)i], 0);
				if(mmFile[i] == (void *)(-1)) {
					perror("mmap");
					cerr << "Error: Could not memory-map the index file " << names[i] << endl;
					throw 1;
				}
				if(mmSweep) {
					int sum = 0;
					for(off_t j = 0; j < sbuf.st_size; j += 1024) {
						sum += (int) mmFile[i][j];
					}
					if(startVerbose) {
						cerr << "  Swept the memory-mapped ebwt index file 1; checksum: " << sum << ": ";
						logTime(cerr);
					}
				}
			}
			mmFile1_ = mmFile[0];
			mmFile2_ = loadSASamp ? mmFile[1] : NULL;
		}
#endif
	}
#ifdef BOWTIE_MM
	else if(this->_useMm && !justHeader) {
		mmFile[0] = mmFile1_;
		mmFile[1] = mmFile2_;
	}
	if(this->_useMm && !justHeader) {
		assert(mmFile[0] == mmFile1_);
		assert(mmFile[1] == mmFile2_);
	}
#endif

	if(this->_verbose || startVerbose) {
		cerr << "  Reading header: ";
		logTime(cerr);
	}

	// Read endianness hints from both streams
    size_t bytesRead = 0, bytesRead2 = 4;
	switchEndian = false;
	uint32_t one = readU32(_in1, switchEndian); // 1st word of primary stream
	bytesRead += 4;
	if(loadSASamp) {
#ifndef NDEBUG
		assert_eq(one, readU32(_in2, switchEndian)); // should match!
#else
		readU32(_in2, switchEndian);
#endif
	}
	if(one != 1) {
		assert_eq((1u<<24), one);
		assert_eq(1, endianSwapU32(one));
		switchEndian = true;
	}

	// Can't switch endianness and use memory-mapped files; in order to
	// support this, someone has to modify the file to switch
	// endiannesses appropriately, and we can't do this inside Bowtie
	// or we might be setting up a race condition with other processes.
	if(switchEndian && this->_useMm) {
		cerr << "Error: Can't use memory-mapped files when the index is the opposite endianness" << endl;
		throw 1;
	}

    readI32(_in1, switchEndian); bytesRead += 4;
	index_t nlocalRFMs = readIndex<index_t>(_in1, switchEndian); bytesRead += sizeof(index_t);

    EList<pair<index_t, index_t> > localRFMLens;
    localRFMLens.resizeExact(nlocalRFMs);
    for(size_t i = 0; i < localRFMLens.size(); i++) {
        localRFMLens[i].first = readIndex<index_t>(_in1, switchEndian);
        localRFMLens[i].second = readIndex<index_t>(_in1, switchEndian);
    }

    _localRFMFilePos.resizeExact(nlocalRFMs);
    for(size_t i = 0; i < _localRFMFilePos.size(); i++) {
        _localRFMFilePos[i].first = readIndex<uint64_t>(_in1, switchEndian);
        _localRFMFilePos[i].second = readIndex<uint64_t>(_in1, switchEndian);
    }

	clearLocalRFMs();

    assert_eq(this->_repeatIncluded.size(), nlocalRFMs);
    string base = "";
	for(size_t i = 0; i < nlocalRFMs; i++) {
        if(!this->_repeatIncluded[i])
            continue;
        if(i > 0) {
            fseek(_in1, _localRFMFilePos[i-1].first, SEEK_SET);
            fseek(_in2, _localRFMFilePos[i-1].second, SEEK_SET);
        }
		LocalRFM<index_t> *localRFM = new LocalRFM<index_t>(base,
                                                            NULL,
                                                            _in1,
                                                            _in2,
                                                            mmFile1_,
                                                            mmFile2_,
                                                            switchEndian,
                                                            bytesRead,
                                                            bytesRead2,
                                                            needEntireRev,
                                                            this->fw_,
                                                            -1, // overrideOffRate
                                                            -1, // offRatePlus
                                                            this->_useMm,
                                                            this->useShmem_,
                                                            mmSweep,
                                                            loadNames,
                                                            loadSASamp,
                                                            loadFtab,
                                                            loadRstarts,
                                                            false,  // _verbose
                                                            false,
                                                            this->_passMemExc,
                                                            this->_sanity,
                                                            false); // use haplotypes?

		_localRFMs.push_back(localRFM);
	}

    fseek(_in1, _localRFMFilePos.back().first, SEEK_SET);
    fseek(_in2, _localRFMFilePos.back().second, SEEK_SET);

#ifdef BOWTIE_MM
    fseek(_in1, 0, SEEK_SET);
	fseek(_in2, 0, SEEK_SET);
#else
	rewind(_in1);
    rewind(_in2);
#endif
}

#endif /*RFM_H_*/