hisat-3n/ref_read.cpp
2025-01-18 21:09:52 +08:00

455 lines
12 KiB
C++

/*
* Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
*
* This file is part of Bowtie 2.
*
* Bowtie 2 is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Bowtie 2 is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
*/
#include "ref_read.h"
/**
* Reads past the next ambiguous or unambiguous stretch of sequence
* from the given FASTA file and returns its length. Does not do
* anything with the sequence characters themselves; this is purely for
* measuring lengths.
*/
RefRecord fastaRefReadSize(
FileBuf& in,
const RefReadInParams& rparms,
bool first,
BitpairOutFileBuf* bpout,
string* name // put parsed FASTA name here
)
{
int c;
static int lastc = '>'; // last character seen
// RefRecord params
TIndexOffU len = 0; // 'len' counts toward total length
// 'off' counts number of ambiguous characters before first
// unambiguous character
size_t off = 0;
// Pick off the first carat and any preceding whitespace
if(first) {
assert(!in.eof());
lastc = '>';
c = in.getPastWhitespace();
if(in.eof()) {
// Got eof right away; emit warning
cerr << "Warning: Empty input file" << endl;
lastc = -1;
return RefRecord(0, 0, true);
}
assert(c == '>');
}
first = true;
// Skip to the end of the id line; if the next line is either
// another id line or a comment line, keep skipping
if(lastc == '>') {
#if 0
// Skip to the end of the name line
do {
if((c = in.getPastNewline()) == -1) {
// No more input
cerr << "Warning: Encountered empty reference sequence" << endl;
lastc = -1;
return RefRecord(0, 0, true);
}
if(c == '>') {
cerr << "Warning: Encountered empty reference sequence" << endl;
}
// continue until a non-name, non-comment line
} while (c == '>');
#else
c = lastc;
// Skip to the end of the name line
do {
// get name
while (true) {
c = in.get();
if (c == -1) {
// No more input
cerr << "Warning: Encountered empty reference sequence" << endl;
lastc = -1;
return RefRecord(0, 0, true);
}
if(c == '\n' || c == '\r') {
while(c == '\r' || c == '\n') c = in.get();
if (c == -1) {
// No more input
cerr << "Warning: Encountered empty reference sequence" << endl;
lastc = -1;
return RefRecord(0, 0, true);
}
break;
}
if (name) name->push_back(c);
}
if (c == '>') {
cerr << "Warning: Encountered empty reference sequence" << endl;
// If there's another name line immediately after this one,
// discard the previous name and start fresh with the new one
if (name) name->clear();
}
} while (c == '>');
#endif
} else {
first = false; // not the first in a sequence
off = 1; // The gap has already been consumed, so count it
if((c = in.get()) == -1) {
// Don't emit a warning, since this might legitimately be
// a gap on the end of the final sequence in the file
lastc = -1;
return RefRecord((TIndexOffU)off, (TIndexOffU)len, first);
}
}
// Now skip to the first DNA character, counting gap characters
// as we go
int lc = -1; // last-DNA char variable for color conversion
while(true) {
int cat = asc2dnacat[c];
if(rparms.nsToAs && cat >= 2) c = 'A';
if(cat == 1) {
// This is a DNA character
if(rparms.color) {
if(lc != -1) {
// Got two consecutive unambiguous DNAs
break; // to read-in loop
}
// Keep going; we need two consecutive unambiguous DNAs
lc = asc2dna[(int)c];
// The 'if(off > 0)' takes care of the case where
// the reference is entirely unambiguous and we don't
// want to incorrectly increment off.
if(off > 0) off++;
} else {
break; // to read-in loop
}
} else if(cat >= 2) {
if(lc != -1 && off == 0) off++;
lc = -1;
off++; // skip over gap character and increment
} else if(c == '>') {
if(off > 0 && lastc == '>') {
cerr << "Warning: Encountered reference sequence with only gaps" << endl;
} else if(lastc == '>') {
cerr << "Warning: Encountered empty reference sequence" << endl;
}
lastc = '>';
//return RefRecord(off, 0, false);
return RefRecord((TIndexOffU)off, 0, first);
}
c = in.get();
if(c == -1) {
// End-of-file
if(off > 0 && lastc == '>') {
cerr << "Warning: Encountered reference sequence with only gaps" << endl;
} else if(lastc == '>') {
cerr << "Warning: Encountered empty reference sequence" << endl;
}
lastc = -1;
//return RefRecord(off, 0, false);
return RefRecord((TIndexOffU)off, 0, first);
}
}
assert(!rparms.color || (lc != -1));
assert_eq(1, asc2dnacat[c]); // C must be unambiguous base
if(off > 0 && rparms.color && first) {
// Handle the case where the first record has ambiguous
// characters but we're in color space; one of those counts is
// spurious
off--;
}
// in now points just past the first character of a sequence
// line, and c holds the first character
while(c != -1 && c != '>') {
if(rparms.nsToAs && asc2dnacat[c] >= 2) c = 'A';
uint8_t cat = asc2dnacat[c];
int cc = toupper(c);
if(rparms.bisulfite && cc == 'C') c = cc = 'T';
if(cat == 1) {
// It's a DNA character
assert(cc == 'A' || cc == 'C' || cc == 'G' || cc == 'T');
// Check for overflow
if((TIndexOffU)(len + 1) < len) {
throw RefTooLongException();
}
// Consume it
len++;
// Output it
if(bpout != NULL) {
if(rparms.color) {
// output color
bpout->write(dinuc2color[asc2dna[(int)c]][lc]);
} else if(!rparms.color) {
// output nucleotide
bpout->write(asc2dna[c]);
}
}
lc = asc2dna[(int)c];
} else if(cat >= 2) {
// It's an N or a gap
lastc = c;
assert(cc != 'A' && cc != 'C' && cc != 'G' && cc != 'T');
return RefRecord((TIndexOffU)off, (TIndexOffU)len, first);
} else {
// Not DNA and not a gap, ignore it
#ifndef NDEBUG
if(!isspace(c)) {
cerr << "Unexpected character in sequence: ";
if(isprint(c)) {
cerr << ((char)c) << endl;
} else {
cerr << "(" << c << ")" << endl;
}
}
#endif
}
c = in.get();
}
lastc = c;
return RefRecord((TIndexOffU)off, (TIndexOffU)len, first);
}
#if 0
static void
printRecords(ostream& os, const EList<RefRecord>& l) {
for(size_t i = 0; i < l.size(); i++) {
os << l[i].first << ", " << l[i].off << ", " << l[i].len << endl;
}
}
#endif
/**
* Reverse the 'src' list of RefRecords into the 'dst' list. Don't
* modify 'src'.
*/
void reverseRefRecords(
const EList<RefRecord>& src,
EList<RefRecord>& dst,
bool recursive,
bool verbose)
{
dst.clear();
{
EList<RefRecord> cur;
for(int i = (int)src.size()-1; i >= 0; i--) {
bool first = (i == (int)src.size()-1 || src[i+1].first);
// Clause after the || on next line is to deal with empty FASTA
// records at the end of the 'src' list, which would be wrongly
// omitted otherwise.
if(src[i].len || (first && src[i].off == 0)) {
cur.push_back(RefRecord(0, src[i].len, first));
first = false;
}
if(src[i].off) cur.push_back(RefRecord(src[i].off, 0, first));
}
for(int i = 0; i < (int)cur.size(); i++) {
assert(cur[i].off == 0 || cur[i].len == 0);
if(i < (int)cur.size()-1 && cur[i].off != 0 && !cur[i+1].first) {
dst.push_back(RefRecord(cur[i].off, cur[i+1].len, cur[i].first));
i++;
} else {
dst.push_back(cur[i]);
}
}
}
//if(verbose) {
// cout << "Source: " << endl;
// printRecords(cout, src);
// cout << "Dest: " << endl;
// printRecords(cout, dst);
//}
#ifndef NDEBUG
size_t srcnfirst = 0, dstnfirst = 0;
for(size_t i = 0; i < src.size(); i++) {
if(src[i].first) {
srcnfirst++;
}
}
for(size_t i = 0; i < dst.size(); i++) {
if(dst[i].first) {
dstnfirst++;
}
}
assert_eq(srcnfirst, dstnfirst);
if(!recursive) {
EList<RefRecord> tmp;
reverseRefRecords(dst, tmp, true);
assert_eq(tmp.size(), src.size());
for(size_t i = 0; i < src.size(); i++) {
assert_eq(src[i].len, tmp[i].len);
assert_eq(src[i].off, tmp[i].off);
assert_eq(src[i].first, tmp[i].first);
}
}
#endif
}
/**
* Calculate a vector containing the sizes of all of the patterns in
* all of the given input files, in order. Returns the total size of
* all references combined. Rewinds each istream before returning.
*/
std::pair<size_t, size_t>
fastaRefReadSizes(
EList<FileBuf*>& in,
EList<RefRecord>& recs,
const RefReadInParams& rparms,
BitpairOutFileBuf* bpout,
TIndexOff& numSeqs)
{
TIndexOffU unambigTot = 0;
size_t bothTot = 0;
assert_gt(in.size(), 0);
// For each input istream
for(size_t i = 0; i < in.size(); i++) {
bool first = true;
assert(!in[i]->eof());
// For each pattern in this istream
while(!in[i]->eof()) {
RefRecord rec;
try {
rec = fastaRefReadSize(*in[i], rparms, first, bpout);
if((unambigTot + rec.len) < unambigTot) {
throw RefTooLongException();
}
}
catch(RefTooLongException& e) {
cerr << e.what() << endl;
throw 1;
}
// Add the length of this record.
if(rec.first) numSeqs++;
unambigTot += rec.len;
bothTot += rec.len;
bothTot += rec.off;
first = false;
if(rec.len == 0 && rec.off == 0 && !rec.first) continue;
recs.push_back(rec);
}
// Reset the input stream
in[i]->reset();
assert(!in[i]->eof());
#ifndef NDEBUG
// Check that it's really reset
int c = in[i]->get();
assert_eq('>', c);
in[i]->reset();
assert(!in[i]->eof());
#endif
}
// Remove empty reference sequences
for(int64_t i = 0; (size_t)i < recs.size(); i++) {
const RefRecord& rec = recs[i];
if(rec.first && rec.len == 0) {
if(i + 1 >= recs.size() || recs[i+1].first) {
bothTot -= rec.len;
bothTot -= rec.off;
recs.erase(i);
i -= 1;
}
}
}
assert_geq(bothTot, 0);
assert_geq(unambigTot, 0);
return make_pair(
unambigTot, // total number of unambiguous DNA characters read
bothTot); // total number of DNA characters read, incl. ambiguous ones
}
std::pair<size_t, size_t>
fastaRefReadFragsNames(
EList<FileBuf*>& in,
EList<RefRecord>& recs,
const RefReadInParams& rparms,
BitpairOutFileBuf* bpout,
TIndexOff& numSeqs,
EList<string>& names)
{
TIndexOffU unambigTot = 0;
size_t bothTot = 0;
assert_gt(in.size(), 0);
// For each input istream
for(size_t i = 0; i < in.size(); i++) {
bool first = true;
assert(!in[i]->eof());
// For each pattern in this istream
while(!in[i]->eof()) {
RefRecord rec;
string name;
try {
rec = fastaRefReadSize(*in[i], rparms, first, bpout, &name);
if((unambigTot + rec.len) < unambigTot) {
throw RefTooLongException();
}
}
catch(RefTooLongException& e) {
cerr << e.what() << endl;
throw 1;
}
// Add the length of this record.
if(rec.first) numSeqs++;
if(rec.first) names.push_back(name);
unambigTot += rec.len;
bothTot += rec.len;
bothTot += rec.off;
first = false;
if(rec.len == 0 && rec.off == 0 && !rec.first) continue;
recs.push_back(rec);
}
// Reset the input stream
in[i]->reset();
assert(!in[i]->eof());
#ifndef NDEBUG
// Check that it's really reset
int c = in[i]->get();
assert_eq('>', c);
in[i]->reset();
assert(!in[i]->eof());
#endif
}
// Remove empty reference sequences
for(int64_t i = 0; (size_t)i < recs.size(); i++) {
const RefRecord& rec = recs[i];
if(rec.first && rec.len == 0) {
if(i + 1 >= recs.size() || recs[i+1].first) {
bothTot -= rec.len;
bothTot -= rec.off;
recs.erase(i);
i -= 1;
}
}
}
assert_geq(bothTot, 0);
assert_geq(unambigTot, 0);
return make_pair(
unambigTot, // total number of unambiguous DNA characters read
bothTot); // total number of DNA characters read, incl. ambiguous ones
}