hisat-3n/ref_read.h
2025-01-18 21:09:52 +08:00

326 lines
8.5 KiB
C++

/*
* Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
*
* This file is part of Bowtie 2.
*
* Bowtie 2 is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Bowtie 2 is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef REF_READ_H_
#define REF_READ_H_
#include <iostream>
#include <cassert>
#include <string>
#include <ctype.h>
#include <fstream>
#include <stdexcept>
#include "alphabet.h"
#include "assert_helpers.h"
#include "filebuf.h"
#include "word_io.h"
#include "ds.h"
#include "endian_swap.h"
using namespace std;
/**
 * Exception describing a reference that is too long for the index
 * offset width chosen at compile time (see BOWTIE_64BIT_INDEX).
 */
class RefTooLongException : public std::exception {
public:
    /// Picks the diagnostic text that matches the compiled-in offset width.
    RefTooLongException() :
        msg(
#ifdef BOWTIE_64BIT_INDEX
            // This should never happen!
            "Error: Reference sequence has more than 2^64-1 characters! "
            "Please divide the reference into smaller chunks and index each "
            "independently."
#else
            "Error: Reference sequence has more than 2^32-1 characters! "
            "Please build a large index by passing the --large-index option "
            "to bowtie2-build"
#endif
        )
    { }

    ~RefTooLongException() throw() { }

    /// Returns the stored message as a C string owned by this object.
    const char* what() const throw() { return msg.c_str(); }

protected:
    std::string msg; ///< text handed back by what()
};
/**
 * Encapsulates a stretch of the reference containing only unambiguous
 * characters.  From an ordered list of RefRecords, one can (almost)
 * deduce the "shape" of the reference sequences (almost because we
 * lose information about stretches of ambiguous characters at the end
 * of reference sequences).
 */
struct RefRecord {
    /// Value-initializes every field (off = 0, len = 0, first = false).
    RefRecord() : off(), len(), first() { }

    /// Builds a record directly from its three field values.
    RefRecord(TIndexOffU _off, TIndexOffU _len, bool _first) :
        off(_off), len(_len), first(_first)
    { }

    /**
     * Reads a record from 'in': OFF_SIZE bytes for the offset, OFF_SIZE
     * bytes for the length, then a single byte holding the 'first' flag
     * (any non-zero byte means true).  When 'swap' is set, the offset
     * and length are passed through endianSwapIndex() after reading.
     *
     * Prints a message to cerr and throws the int 1 if any of the three
     * fields cannot be read.
     */
    RefRecord(FILE *in, bool swap) {
        assert(in != NULL);
        if(!fread(&off, OFF_SIZE, 1, in)) {
            cerr << "Error reading RefRecord offset from FILE" << endl;
            throw 1;
        }
        if(swap) off = endianSwapIndex(off);
        if(!fread(&len, OFF_SIZE, 1, in)) {
            cerr << "Error reading RefRecord length from FILE" << endl;
            throw 1;
        }
        if(swap) len = endianSwapIndex(len);
        // fgetc() signals failure with EOF, which is non-zero; testing the
        // raw return value would silently turn a truncated file into
        // first == true, so the failure case is checked explicitly.
        const int flag = fgetc(in);
        if(flag == EOF) {
            cerr << "Error reading RefRecord 'first' flag from FILE" << endl;
            throw 1;
        }
        first = (flag != 0);
    }

    /**
     * Writes the record to 'out' in the layout the FILE* constructor
     * reads: offset, length, then one byte that is 1 if 'first' is set
     * and 0 otherwise.  'be' is forwarded unchanged to writeIndex()
     * (presumably selecting big-endian output; confirm in word_io.h).
     */
    void write(std::ostream& out, bool be) {
        writeIndex<TIndexOffU>(out, off, be);
        writeIndex<TIndexOffU>(out, len, be);
        out.put(first ? 1 : 0);
    }

    TIndexOffU off; /// Offset of the first character in the record
    TIndexOffU len; /// Length of the record
    bool first;     /// Whether this record is the first for a reference sequence
};
/// Reversal modes for reference sequences; RefReadInParams::reverse
/// holds one of these values.
enum {
    // don't reverse reference sequence
    REF_READ_FORWARD = 0,
    // reverse entire reference sequence
    REF_READ_REVERSE = 1,
    // reverse each unambiguous stretch of reference
    REF_READ_REVERSE_EACH = 2,
    // reverse the entire reference sequence and replace specific base by others.
    REF_READ_REVERSE_REPLACEMENT = 3
};
/**
 * Bundle of options controlling how reference sequences are treated
 * while they are being read in.
 */
struct RefReadInParams {
    bool color;     ///< extract colors from reference
    int  reverse;   ///< reversal mode; compared against the REF_READ_* constants
    bool nsToAs;    ///< convert ambiguous characters to As
    bool bisulfite; ///< bisulfite-convert the reference

    /// Stores the four options verbatim, in declaration order.
    RefReadInParams(bool col, int r, bool nsToA, bool bisulf) :
        color(col),
        reverse(r),
        nsToAs(nsToA),
        bisulfite(bisulf)
    { }
};
// Declaration only; the definition is not part of this header.
// NOTE(review): judging by the name and signature this sizes the next
// FASTA record read from 'in' and returns it as a RefRecord, optionally
// storing the parsed name in '*name' -- confirm against the definition.
RefRecord fastaRefReadSize(
    FileBuf&               in,
    const RefReadInParams& rparms,
    bool                   first,
    BitpairOutFileBuf*     bpout,
    std::string*           name = NULL);
// Declaration only; the definition is not part of this header.
// NOTE(review): presumably scans every input in 'in', appends the
// resulting records to 'recs' and updates 'numSeqs'; the meaning of the
// two size_t values in the returned pair is not visible here -- confirm
// against the definition.
std::pair<size_t, size_t> fastaRefReadSizes(
    EList<FileBuf*>&       in,
    EList<RefRecord>&      recs,
    const RefReadInParams& rparms,
    BitpairOutFileBuf*     bpout,
    TIndexOff&             numSeqs);
// Declaration only; the definition is not part of this header.
// NOTE(review): same parameter list as fastaRefReadSizes plus an output
// list 'names'; presumably it also collects the sequence names -- the
// meaning of the returned pair is not visible here, confirm against the
// definition.
std::pair<size_t, size_t> fastaRefReadFragsNames(
    EList<FileBuf*>&       in,
    EList<RefRecord>&      recs,
    const RefReadInParams& rparms,
    BitpairOutFileBuf*     bpout,
    TIndexOff&             numSeqs,
    EList<std::string>&    names);
// Declaration only; the definition is not part of this header.
// NOTE(review): 'src' is read-only and 'dst' is writable, so this
// presumably fills 'dst' with a reversed form of 'src'; the effect of
// 'recursive' and 'verbose' is not visible here -- confirm against the
// definition.
void reverseRefRecords(
    const EList<RefRecord>& src,
    EList<RefRecord>&       dst,
    bool                    recursive = false,
    bool                    verbose   = false);
/**
 * Reads the next stretch of unambiguous characters from the given FASTA
 * input and appends it to dst starting at index dstoff, advancing
 * dstoff past the appended characters.
 *
 * The last character read is kept in a function-local static between
 * calls, so successive calls continue where the previous one stopped.
 * Because of that shared state, calls must not run concurrently, and a
 * call with first == false must not follow a call that hit end of input
 * (asserted below).
 *
 * If rparms.reverse == REF_READ_REVERSE_EACH, the portion of dst that
 * this call appended is reversed in place before returning.
 *
 * Returns RefRecord(off, len, first) where off is the count of skipped
 * gap/ambiguous characters preceding the stretch, len is the number of
 * characters counted for the stretch, and first is true iff this call
 * started at a name ('>') or comment ('#') line.
 *
 * Prints a message to cerr and throws the int 1 if 'first' is set and
 * the first non-whitespace character of the input is not '>'.
 */
template <typename TStr>
static RefRecord fastaRefReadAppend(
    FileBuf& in,             // input file
    bool first,              // true iff this is the first record in the file
    TStr& dst,               // destination buf for parsed characters
    TIndexOffU& dstoff,      // index of next character in dst to assign
    RefReadInParams& rparms, // options: color, reverse, nsToAs, bisulfite
    string* name = NULL)     // put parsed FASTA name here
{
    int c;
    // Last character consumed by the previous call; -1 once input ended
    static int lastc = '>';
    if(first) {
        c = in.getPastWhitespace();
        if(c != '>') {
            cerr << "Reference file does not seem to be a FASTA file" << endl;
            throw 1;
        }
        lastc = c;
    }
    assert_neq(-1, lastc);

    // RefRecord params
    size_t len = 0;
    size_t off = 0;
    // From here on 'first' means "this record starts a new sequence"
    first = true;

    size_t ilen = dstoff;

    // Chew up the id line; if the next line is either
    // another id line or a comment line, keep chewing
    int lc = -1; // last-DNA char variable for color conversion
    c = lastc;
    if(c == '>' || c == '#') {
        do {
            while (c == '#') {
                if((c = in.getPastNewline()) == -1) {
                    lastc = -1;
                    goto bail;
                }
            }
            assert_eq('>', c);
            while(true) {
                c = in.get();
                if(c == -1) {
                    lastc = -1;
                    goto bail;
                }
                if(c == '\n' || c == '\r') {
                    // Swallow the whole run of line terminators
                    while(c == '\r' || c == '\n') c = in.get();
                    if(c == -1) {
                        lastc = -1;
                        goto bail;
                    }
                    break;
                }
                if (name) name->push_back(c);
            }
            // c holds the first character on the line after the name
            // line
            if(c == '>') {
                // If there's another name line immediately after this one,
                // discard the previous name and start fresh with the new one
                if (name) name->clear();
            }
        } while (c == '>' || c == '#');
    } else {
        // Continuing inside a sequence: the previous call stopped on a
        // character that is not an unambiguous nucleotide
        ASSERT_ONLY(int cc = toupper(c));
        assert(cc != 'A' && cc != 'C' && cc != 'G' && cc != 'T');
        first = false;
    }

    // Skip over an initial stretch of gaps or ambiguous characters.
    // For colorspace we skip until we see two consecutive unambiguous
    // characters (i.e. the first unambiguous color).
    while(true) {
        int cat = asc2dnacat[c];
        if(rparms.nsToAs && cat >= 2) {
            c = 'A';
        }
        int cc = toupper(c);
        if(rparms.bisulfite && cc == 'C') c = cc = 'T';
        if(cat == 1) {
            // This is a DNA character
            if(rparms.color) {
                if(lc != -1) {
                    // Got two consecutive unambiguous DNAs
                    break; // to read-in loop
                }
                // Keep going; we need two consecutive unambiguous DNAs
                lc = asc2dna[(int)c];
                // The 'if(off > 0)' takes care of the case where
                // the reference is entirely unambiguous and we don't
                // want to incorrectly increment off.
                if(off > 0) off++;
            } else {
                break; // to read-in loop
            }
        } else if(cat >= 2) {
            if(lc != -1 && off == 0) {
                off++;
            }
            lc = -1;
            off++; // skip it
        } else if(c == '>') {
            lastc = '>';
            goto bail;
        }
        c = in.get();
        if(c == -1) {
            lastc = -1;
            goto bail;
        }
    }
    if(first && rparms.color && off > 0) {
        // Handle the case where the first record has ambiguous
        // characters but we're in color space; one of those counts is
        // spurious
        off--;
    }
    assert(!rparms.color || lc != -1);
    assert_eq(1, asc2dnacat[c]);

    // in now points just past the first character of a sequence
    // line, and c holds the first character
    while(true) {
        // Note: can't have a comment in the middle of a sequence,
        // though a comment can end a sequence
        int cat = asc2dnacat[c];
        assert_neq(2, cat);
        if(cat == 1) {
            // Consume it
            if(!rparms.color || lc != -1) len++;
            // Add it to reference buffer
            if(rparms.color) {
                dst.set((char)dinuc2color[asc2dna[(int)c]][lc], dstoff++);
            } else {
                dst.set(asc2dna[c], dstoff++);
            }
            assert_lt((int)dst[dstoff-1], 4);
            lc = asc2dna[(int)c];
        }
        c = in.get();
        // Only index the category table with a real character: at end
        // of input c is -1 and must reach the termination test below
        // untouched instead of being looked up (out of bounds) and
        // possibly rewritten to 'A'.
        if(c != -1 && rparms.nsToAs && asc2dnacat[c] >= 2) c = 'A';
        if (c == -1 || c == '>' || c == '#' || asc2dnacat[c] >= 2) {
            // Remember the terminator so the next call can resume here
            lastc = c;
            break;
        }
        if(rparms.bisulfite && toupper(c) == 'C') c = 'T';
    }

  bail:
    // Optionally reverse the portion that we just appended.
    // ilen = length of buffer before this last sequence was appended.
    if(rparms.reverse == REF_READ_REVERSE_EACH) {
        // Find limits of the portion we just appended
        size_t nlen = dstoff;
        dst.reverseWindow(ilen, nlen);
    }
    return RefRecord((TIndexOffU)off, (TIndexOffU)len, first);
}
#endif /*ndef REF_READ_H_*/