/* * Copyright 2011, Ben Langmead * * This file is part of Bowtie 2. * * Bowtie 2 is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Bowtie 2 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Bowtie 2. If not, see . */ #ifndef REF_READ_H_ #define REF_READ_H_ #include #include #include #include #include #include #include "alphabet.h" #include "assert_helpers.h" #include "filebuf.h" #include "word_io.h" #include "ds.h" #include "endian_swap.h" using namespace std; class RefTooLongException : public exception { public: RefTooLongException() { #ifdef BOWTIE_64BIT_INDEX // This should never happen! msg = "Error: Reference sequence has more than 2^64-1 characters! " "Please divide the reference into smaller chunks and index each " "independently."; #else msg = "Error: Reference sequence has more than 2^32-1 characters! " "Please build a large index by passing the --large-index option " "to bowtie2-build"; #endif } ~RefTooLongException() throw() {} const char* what() const throw() { return msg.c_str(); } protected: string msg; }; /** * Encapsulates a stretch of the reference containing only unambiguous * characters. From an ordered list of RefRecords, one can (almost) * deduce the "shape" of the reference sequences (almost because we * lose information about stretches of ambiguous characters at the end * of reference sequences). */ struct RefRecord { RefRecord() : off(), len(), first() { } RefRecord(TIndexOffU _off, TIndexOffU _len, bool _first) : off(_off), len(_len), first(_first) { } RefRecord(FILE *in, bool swap) { assert(in != NULL); if(!fread(&off, OFF_SIZE, 1, in)) { cerr << "Error reading RefRecord offset from FILE" << endl; throw 1; } if(swap) off = endianSwapIndex(off); if(!fread(&len, OFF_SIZE, 1, in)) { cerr << "Error reading RefRecord offset from FILE" << endl; throw 1; } if(swap) len = endianSwapIndex(len); first = fgetc(in) ? true : false; } void write(std::ostream& out, bool be) { writeIndex(out, off, be); writeIndex(out, len, be); out.put(first ? 1 : 0); } TIndexOffU off; /// Offset of the first character in the record TIndexOffU len; /// Length of the record bool first; /// Whether this record is the first for a reference sequence }; enum { REF_READ_FORWARD = 0, // don't reverse reference sequence REF_READ_REVERSE, // reverse entire reference sequence REF_READ_REVERSE_EACH, // reverse each unambiguous stretch of reference REF_READ_REVERSE_REPLACEMENT, // reverse the entire reference sequence and replace specific base by others. }; /** * Parameters governing treatment of references as they're read in. */ struct RefReadInParams { RefReadInParams(bool col, int r, bool nsToA, bool bisulf) : color(col), reverse(r), nsToAs(nsToA), bisulfite(bisulf) { } // extract colors from reference bool color; // reverse each reference sequence before passing it along int reverse; // convert ambiguous characters to As bool nsToAs; // bisulfite-convert the reference bool bisulfite; }; extern RefRecord fastaRefReadSize( FileBuf& in, const RefReadInParams& rparms, bool first, BitpairOutFileBuf* bpout, string *name = NULL); extern std::pair fastaRefReadSizes( EList& in, EList& recs, const RefReadInParams& rparms, BitpairOutFileBuf* bpout, TIndexOff& numSeqs); extern std::pair fastaRefReadFragsNames( EList& in, EList& recs, const RefReadInParams& rparms, BitpairOutFileBuf* bpout, TIndexOff& numSeqs, EList& names); extern void reverseRefRecords( const EList& src, EList& dst, bool recursive = false, bool verbose = false); /** * Reads the next sequence from the given FASTA file and appends it to * the end of dst, optionally reversing it. */ template static RefRecord fastaRefReadAppend( FileBuf& in, // input file bool first, // true iff this is the first record in the file TStr& dst, // destination buf for parsed characters TIndexOffU& dstoff, // index of next character in dst to assign RefReadInParams& rparms, // string* name = NULL) // put parsed FASTA name here { int c; static int lastc = '>'; if(first) { c = in.getPastWhitespace(); if(c != '>') { cerr << "Reference file does not seem to be a FASTA file" << endl; throw 1; } lastc = c; } assert_neq(-1, lastc); // RefRecord params size_t len = 0; size_t off = 0; first = true; size_t ilen = dstoff; // Chew up the id line; if the next line is either // another id line or a comment line, keep chewing int lc = -1; // last-DNA char variable for color conversion c = lastc; if(c == '>' || c == '#') { do { while (c == '#') { if((c = in.getPastNewline()) == -1) { lastc = -1; goto bail; } } assert_eq('>', c); while(true) { c = in.get(); if(c == -1) { lastc = -1; goto bail; } if(c == '\n' || c == '\r') { while(c == '\r' || c == '\n') c = in.get(); if(c == -1) { lastc = -1; goto bail; } break; } if (name) name->push_back(c); } // c holds the first character on the line after the name // line if(c == '>') { // If there's another name line immediately after this one, // discard the previous name and start fresh with the new one if (name) name->clear(); } } while (c == '>' || c == '#'); } else { ASSERT_ONLY(int cc = toupper(c)); assert(cc != 'A' && cc != 'C' && cc != 'G' && cc != 'T'); first = false; } // Skip over an initial stretch of gaps or ambiguous characters. // For colorspace we skip until we see two consecutive unambiguous // characters (i.e. the first unambiguous color). while(true) { int cat = asc2dnacat[c]; if(rparms.nsToAs && cat >= 2) { c = 'A'; } int cc = toupper(c); if(rparms.bisulfite && cc == 'C') c = cc = 'T'; if(cat == 1) { // This is a DNA character if(rparms.color) { if(lc != -1) { // Got two consecutive unambiguous DNAs break; // to read-in loop } // Keep going; we need two consecutive unambiguous DNAs lc = asc2dna[(int)c]; // The 'if(off > 0)' takes care of the case where // the reference is entirely unambiguous and we don't // want to incorrectly increment off. if(off > 0) off++; } else { break; // to read-in loop } } else if(cat >= 2) { if(lc != -1 && off == 0) { off++; } lc = -1; off++; // skip it } else if(c == '>') { lastc = '>'; goto bail; } c = in.get(); if(c == -1) { lastc = -1; goto bail; } } if(first && rparms.color && off > 0) { // Handle the case where the first record has ambiguous // characters but we're in color space; one of those counts is // spurious off--; } assert(!rparms.color || lc != -1); assert_eq(1, asc2dnacat[c]); // in now points just past the first character of a sequence // line, and c holds the first character while(true) { // Note: can't have a comment in the middle of a sequence, // though a comment can end a sequence int cat = asc2dnacat[c]; assert_neq(2, cat); if(cat == 1) { // Consume it if(!rparms.color || lc != -1) len++; // Add it to reference buffer if(rparms.color) { dst.set((char)dinuc2color[asc2dna[(int)c]][lc], dstoff++); } else if(!rparms.color) { dst.set(asc2dna[c], dstoff++); } assert_lt((int)dst[dstoff-1], 4); lc = asc2dna[(int)c]; } c = in.get(); if(rparms.nsToAs && asc2dnacat[c] >= 2) c = 'A'; if (c == -1 || c == '>' || c == '#' || asc2dnacat[c] >= 2) { lastc = c; break; } if(rparms.bisulfite && toupper(c) == 'C') c = 'T'; } bail: // Optionally reverse the portion that we just appended. // ilen = length of buffer before this last sequence was appended. if(rparms.reverse == REF_READ_REVERSE_EACH) { // Find limits of the portion we just appended size_t nlen = dstoff; dst.reverseWindow(ilen, nlen); } return RefRecord((TIndexOffU)off, (TIndexOffU)len, first); } #endif /*ndef REF_READ_H_*/