/* * Copyright 2011, Ben Langmead * * This file is part of Bowtie 2. * * Bowtie 2 is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Bowtie 2 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Bowtie 2. If not, see . */ #ifndef REFERENCE_H_ #define REFERENCE_H_ #include #include #include #include #ifdef BOWTIE_MM #include #include #endif #include "endian_swap.h" #include "ref_read.h" #include "sequence_io.h" #include "mm.h" #include "shmem.h" #include "timer.h" #include "sstring.h" #include "btypes.h" /** * Concrete reference representation that bulk-loads the reference from * the bit-pair-compacted binary file and stores it in memory also in * bit-pair-compacted format. The user may request reference * characters either on a per-character bases or by "stretch" using * getBase(...) and getStretch(...) respectively. * * Most of the complexity in this class is due to the fact that we want * to represent references with ambiguous (non-A/C/G/T) characters but * we don't want to use more than two bits per base. This means we * need a way to encode the ambiguous stretches of the reference in a * way that is external to the bitpair sequence. To accomplish this, * we use the RefRecords vector, which is stored in the .3.ebwt index * file. The bitpairs themselves are stored in the .4.ebwt index file. * * Once it has been loaded, a BitPairReference is read-only, and is * safe for many threads to access at once. */ class BitPairReference { public: /** * Load from .3.ebwt/.4.ebwt Bowtie index files. */ BitPairReference( const string& in, const EList* included, bool color, bool sanity = false, EList* infiles = NULL, EList >* origs = NULL, bool infilesSeq = false, bool useMm = false, bool useShmem = false, bool mmSweep = false, bool verbose = false, bool startVerbose = false); ~BitPairReference(); /** * Return a single base of the reference. Calling this repeatedly * is not an efficient way to retrieve bases from the reference; * use loadStretch() instead. * * This implementation scans linearly through the records for the * unambiguous stretches of the target reference sequence. When * there are many records, binary search would be more appropriate. */ int getBase(size_t tidx, size_t toff) const; /** * Load a stretch of the reference string into memory at 'dest'. * * This implementation scans linearly through the records for the * unambiguous stretches of the target reference sequence. When * there are many records, binary search would be more appropriate. */ int getStretchNaive( uint32_t *destU32, size_t tidx, size_t toff, size_t count) const; /** * Load a stretch of the reference string into memory at 'dest'. * * This implementation scans linearly through the records for the * unambiguous stretches of the target reference sequence. When * there are many records, binary search would be more appropriate. */ int getStretch( uint32_t *destU32, size_t tidx, size_t toff, size_t count ASSERT_ONLY(, SStringExpandable& destU32_2)) const; /** * Return the number of reference sequences. */ TIndexOffU numRefs() const { return nrefs_; } /** * Return the approximate length of a reference sequence (it might leave * off some Ns on the end). * * TODO: Is it still true that it might leave off Ns? */ TIndexOffU approxLen(TIndexOffU elt) const { assert_lt(elt, nrefs_); return refLens_[elt]; } /** * Return true iff buf_ and all the vectors are populated. */ bool loaded() const { return loaded_; } /** * Given a reference sequence id, return its offset into the pasted * reference string; i.e., return the number of unambiguous nucleotides * preceding it. */ TIndexOffU pastedOffset(TIndexOffU idx) const { return refOffs_[idx]; } /** * Parse the input fasta files, populating the szs list and writing the * .3.ebwt and .4.ebwt portions of the index as we go. */ static std::pair szsFromFasta( EList& is, const string& outfile, bool bigEndian, const RefReadInParams& refparams, EList& szs, bool sanity, EList *names = NULL); size_t getMinK() const{ return minkRepeat; } protected: uint32_t byteToU32_[256]; EList recs_; /// records describing unambiguous stretches // following two lists are purely for the binary search in getStretch EList cumUnambig_; // # unambig ref chars up to each record EList cumRefOff_; // # ref chars up to each record EList refLens_; /// approx lens of ref seqs (excludes trailing ambig chars) EList refOffs_; /// buf_ begin offsets per ref seq EList refRecOffs_; /// record begin/end offsets per ref seq uint8_t *buf_; /// the whole reference as a big bitpacked byte array uint8_t *sanityBuf_;/// for sanity-checking buf_ TIndexOffU bufSz_; /// size of buf_ TIndexOffU bufAllocSz_; TIndexOffU nrefs_; /// the number of reference sequences bool loaded_; /// whether it's loaded bool sanity_; /// do sanity checking bool useMm_; /// load the reference as a memory-mapped file bool useShmem_; /// load the reference into shared memory bool verbose_; size_t minkRepeat; // log4 of the size of repeat genome ASSERT_ONLY(SStringExpandable tmp_destU32_); }; #endif