hisat-3n/ref_read.cpp

/*
 * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
 *
 * This file is part of Bowtie 2.
 *
 * Bowtie 2 is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Bowtie 2 is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
 */

#include "ref_read.h"

/**
 * Reads past the next ambiguous or unambiguous stretch of sequence
 * from the given FASTA file and returns its length.  Does not do
 * anything with the sequence characters themselves; this is purely for
 * measuring lengths.
 */
RefRecord fastaRefReadSize(
	FileBuf& in,
	const RefReadInParams& rparms,
	bool first,
	BitpairOutFileBuf* bpout,
	string* name // put parsed FASTA name here
	)
{
	int c;
	static int lastc = '>'; // last character seen

	// RefRecord params
	TIndexOffU len = 0; // 'len' counts toward total length
	// 'off' counts number of ambiguous characters before first
	// unambiguous character
	size_t off = 0;

	// Pick off the first carat and any preceding whitespace
	if(first) {
		assert(!in.eof());
		lastc = '>';
		c = in.getPastWhitespace();
		if(in.eof()) {
			// Got eof right away; emit warning
			cerr << "Warning: Empty input file" << endl;
			lastc = -1;
			return RefRecord(0, 0, true);
		}
		assert(c == '>');
	}

	first = true;
	// Skip to the end of the id line; if the next line is either
	// another id line or a comment line, keep skipping
	if(lastc == '>') {
#if 0
		// Skip to the end of the name line
		do {
			if((c = in.getPastNewline()) == -1) {
				// No more input
				cerr << "Warning: Encountered empty reference sequence" << endl;
				lastc = -1;
				return RefRecord(0, 0, true);
			}
			if(c == '>') {
				cerr << "Warning: Encountered empty reference sequence" << endl;
			}
			// continue until a non-name, non-comment line
		} while (c == '>');
#else
		c = lastc;
		// Skip to the end of the name line
		do {
			// get name

			while (true) {
				c = in.get();
				if (c == -1) {
					// No more input
					cerr << "Warning: Encountered empty reference sequence" << endl;
					lastc = -1;
					return RefRecord(0, 0, true);
				}

				if(c == '\n' || c == '\r') {
					while(c == '\r' || c == '\n') c = in.get();
					if (c == -1) {
						// No more input
						cerr << "Warning: Encountered empty reference sequence" << endl;
						lastc = -1;
						return RefRecord(0, 0, true);
					}
					break;
				}

				if (name) name->push_back(c);
			}

			if (c == '>') {
				cerr << "Warning: Encountered empty reference sequence" << endl;
				// If there's another name line immediately after this one,
				// discard the previous name and start fresh with the new one
				if (name) name->clear();
			}
		} while (c == '>');
#endif
	} else {
		first = false; // not the first in a sequence
		off = 1; // The gap has already been consumed, so count it
		if((c = in.get()) == -1) {
			// Don't emit a warning, since this might legitimately be
			// a gap on the end of the final sequence in the file
			lastc = -1;
			return RefRecord((TIndexOffU)off, (TIndexOffU)len, first);
		}
	}

	// Now skip to the first DNA character, counting gap characters
	// as we go
	int lc = -1; // last-DNA char variable for color conversion
	while(true) {
		int cat = asc2dnacat[c];
		if(rparms.nsToAs && cat >= 2) c = 'A';
		if(cat == 1) {
			// This is a DNA character
			if(rparms.color) {
				if(lc != -1) {
					// Got two consecutive unambiguous DNAs
					break; // to read-in loop
				}
				// Keep going; we need two consecutive unambiguous DNAs
				lc = asc2dna[(int)c];
				// The 'if(off > 0)' takes care of the case where
				// the reference is entirely unambiguous and we don't
				// want to incorrectly increment off.
				if(off > 0) off++;
			} else {
				break; // to read-in loop
			}
		} else if(cat >= 2) {
			if(lc != -1 && off == 0) off++;
			lc = -1;
			off++; // skip over gap character and increment
		} else if(c == '>') {
			if(off > 0 && lastc == '>') {
				cerr << "Warning: Encountered reference sequence with only gaps" << endl;
			} else if(lastc == '>') {
				cerr << "Warning: Encountered empty reference sequence" << endl;
			}
			lastc = '>';
			//return RefRecord(off, 0, false);
			return RefRecord((TIndexOffU)off, 0, first);
		}
		c = in.get();
		if(c == -1) {
			// End-of-file
			if(off > 0 && lastc == '>') {
				cerr << "Warning: Encountered reference sequence with only gaps" << endl;
			} else if(lastc == '>') {
				cerr << "Warning: Encountered empty reference sequence" << endl;
			}
			lastc = -1;
			//return RefRecord(off, 0, false);
			return RefRecord((TIndexOffU)off, 0, first);
		}
	}
	assert(!rparms.color || (lc != -1));
	assert_eq(1, asc2dnacat[c]); // C must be unambiguous base
	if(off > 0 && rparms.color && first) {
		// Handle the case where the first record has ambiguous
		// characters but we're in color space; one of those counts is
		// spurious
		off--;
	}

	// in now points just past the first character of a sequence
	// line, and c holds the first character
	while(c != -1 && c != '>') {
		if(rparms.nsToAs && asc2dnacat[c] >= 2) c = 'A';
		uint8_t cat = asc2dnacat[c];
		int cc = toupper(c);
		if(rparms.bisulfite && cc == 'C') c = cc = 'T';
		if(cat == 1) {
			// It's a DNA character
			assert(cc == 'A' || cc == 'C' || cc == 'G' || cc == 'T');
			// Check for overflow
			if((TIndexOffU)(len + 1) < len) {
				throw RefTooLongException();
			}
			// Consume it
			len++;
			// Output it
			if(bpout != NULL) {
				if(rparms.color) {
					// output color
					bpout->write(dinuc2color[asc2dna[(int)c]][lc]);
				} else if(!rparms.color) {
					// output nucleotide
					bpout->write(asc2dna[c]);
				}
			}
			lc = asc2dna[(int)c];
		} else if(cat >= 2) {
			// It's an N or a gap
			lastc = c;
			assert(cc != 'A' && cc != 'C' && cc != 'G' && cc != 'T');
			return RefRecord((TIndexOffU)off, (TIndexOffU)len, first);
		} else {
			// Not DNA and not a gap, ignore it
#ifndef NDEBUG
			if(!isspace(c)) {
				cerr << "Unexpected character in sequence: ";
				if(isprint(c)) {
					cerr << ((char)c) << endl;
				} else {
					cerr << "(" << c << ")" << endl;
				}
			}
#endif
		}
		c = in.get();
	}
	lastc = c;
	return RefRecord((TIndexOffU)off, (TIndexOffU)len, first);
}

#if 0
static void
printRecords(ostream& os, const EList<RefRecord>& l) {
	for(size_t i = 0; i < l.size(); i++) {
		os << l[i].first << ", " << l[i].off << ", " << l[i].len << endl;
	}
}
#endif

/**
 * Reverse the 'src' list of RefRecords into the 'dst' list.  Don't
 * modify 'src'.
 */
void reverseRefRecords(
	const EList<RefRecord>& src,
	EList<RefRecord>& dst,
	bool recursive,
	bool verbose)
{
	dst.clear();
	{
		EList<RefRecord> cur;
		for(int i = (int)src.size()-1; i >= 0; i--) {
			bool first = (i == (int)src.size()-1 || src[i+1].first);
			// Clause after the || on next line is to deal with empty FASTA
			// records at the end of the 'src' list, which would be wrongly
			// omitted otherwise.
			if(src[i].len || (first && src[i].off == 0)) {
				cur.push_back(RefRecord(0, src[i].len, first));
				first = false;
			}
			if(src[i].off) cur.push_back(RefRecord(src[i].off, 0, first));
		}
		for(int i = 0; i < (int)cur.size(); i++) {
			assert(cur[i].off == 0 || cur[i].len == 0);
			if(i < (int)cur.size()-1 && cur[i].off != 0 && !cur[i+1].first) {
				dst.push_back(RefRecord(cur[i].off, cur[i+1].len, cur[i].first));
				i++;
			} else {
				dst.push_back(cur[i]);
			}
		}
	}
	//if(verbose) {
	//	cout << "Source: " << endl;
	//	printRecords(cout, src);
	//	cout << "Dest: " << endl;
	//	printRecords(cout, dst);
	//}
#ifndef NDEBUG
	size_t srcnfirst = 0, dstnfirst = 0;
	for(size_t i = 0; i < src.size(); i++) {
		if(src[i].first) {
			srcnfirst++;
		}
	}
	for(size_t i = 0; i < dst.size(); i++) {
		if(dst[i].first) {
			dstnfirst++;
		}
	}
	assert_eq(srcnfirst, dstnfirst);
	if(!recursive) {
		EList<RefRecord> tmp;
		reverseRefRecords(dst, tmp, true);
		assert_eq(tmp.size(), src.size());
		for(size_t i = 0; i < src.size(); i++) {
			assert_eq(src[i].len, tmp[i].len);
			assert_eq(src[i].off, tmp[i].off);
			assert_eq(src[i].first, tmp[i].first);
		}
	}
#endif
}

/**
 * Calculate a vector containing the sizes of all of the patterns in
 * all of the given input files, in order.  Returns the total size of
 * all references combined.  Rewinds each istream before returning.
 */
std::pair<size_t, size_t>
fastaRefReadSizes(
	EList<FileBuf*>& in,
	EList<RefRecord>& recs,
	const RefReadInParams& rparms,
	BitpairOutFileBuf* bpout,
	TIndexOff& numSeqs)
{
	TIndexOffU unambigTot = 0;
	size_t bothTot = 0;
	assert_gt(in.size(), 0);
	// For each input istream
	for(size_t i = 0; i < in.size(); i++) {
		bool first = true;
		assert(!in[i]->eof());
		// For each pattern in this istream
		while(!in[i]->eof()) {
			RefRecord rec;
			try {
				rec = fastaRefReadSize(*in[i], rparms, first, bpout);
				if((unambigTot + rec.len) < unambigTot) {
					throw RefTooLongException();
				}
			}
			catch(RefTooLongException& e) {
				cerr << e.what() << endl;
				throw 1;
			}
			// Add the length of this record.
			if(rec.first) numSeqs++;
			unambigTot += rec.len;
			bothTot += rec.len;
			bothTot += rec.off;
			first = false;
			if(rec.len == 0 && rec.off == 0 && !rec.first) continue;
			recs.push_back(rec);
		}
		// Reset the input stream
		in[i]->reset();
		assert(!in[i]->eof());
#ifndef NDEBUG
		// Check that it's really reset
		int c = in[i]->get();
		assert_eq('>', c);
		in[i]->reset();
		assert(!in[i]->eof());
#endif
	}
    
    // Remove empty reference sequences
    for(int64_t i = 0; (size_t)i < recs.size(); i++) {
        const RefRecord& rec = recs[i];
        if(rec.first && rec.len == 0) {
            if(i + 1 >= recs.size() || recs[i+1].first) {
                bothTot -= rec.len;
                bothTot -= rec.off;
                recs.erase(i);
                i -= 1;
            }
        }
    }
    
	assert_geq(bothTot, 0);
	assert_geq(unambigTot, 0);
	return make_pair(
		unambigTot, // total number of unambiguous DNA characters read
		bothTot); // total number of DNA characters read, incl. ambiguous ones
}


std::pair<size_t, size_t>
fastaRefReadFragsNames(
	EList<FileBuf*>& in,
	EList<RefRecord>& recs,
	const RefReadInParams& rparms,
	BitpairOutFileBuf* bpout,
	TIndexOff& numSeqs,
	EList<string>& names)
{
	TIndexOffU unambigTot = 0;
	size_t bothTot = 0;
	assert_gt(in.size(), 0);
	// For each input istream
	for(size_t i = 0; i < in.size(); i++) {
		bool first = true;
		assert(!in[i]->eof());
		// For each pattern in this istream
		while(!in[i]->eof()) {
			RefRecord rec;
			string name;
			try {
				rec = fastaRefReadSize(*in[i], rparms, first, bpout, &name);
				if((unambigTot + rec.len) < unambigTot) {
					throw RefTooLongException();
				}
			}
			catch(RefTooLongException& e) {
				cerr << e.what() << endl;
				throw 1;
			}
			// Add the length of this record.
			if(rec.first) numSeqs++;
			if(rec.first) names.push_back(name); 
			unambigTot += rec.len;
			bothTot += rec.len;
			bothTot += rec.off;
			first = false;
			if(rec.len == 0 && rec.off == 0 && !rec.first) continue;
			recs.push_back(rec);
		}
		// Reset the input stream
		in[i]->reset();
		assert(!in[i]->eof());
#ifndef NDEBUG
		// Check that it's really reset
		int c = in[i]->get();
		assert_eq('>', c);
		in[i]->reset();
		assert(!in[i]->eof());
#endif
	}
    
    // Remove empty reference sequences
    for(int64_t i = 0; (size_t)i < recs.size(); i++) {
        const RefRecord& rec = recs[i];
        if(rec.first && rec.len == 0) {
            if(i + 1 >= recs.size() || recs[i+1].first) {
                bothTot -= rec.len;
                bothTot -= rec.off;
                recs.erase(i);
                i -= 1;
            }
        }
    }
    
	assert_geq(bothTot, 0);
	assert_geq(unambigTot, 0);
	return make_pair(
		unambigTot, // total number of unambiguous DNA characters read
		bothTot); // total number of DNA characters read, incl. ambiguous ones
}
initial commit 2025-01-18 13:09:52 +00:00			`/*`
			`* Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>`
			`*`
			`* This file is part of Bowtie 2.`
			`*`
			`* Bowtie 2 is free software: you can redistribute it and/or modify`
			`* it under the terms of the GNU General Public License as published by`
			`* the Free Software Foundation, either version 3 of the License, or`
			`* (at your option) any later version.`
			`*`
			`* Bowtie 2 is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`* GNU General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU General Public License`
			`* along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.`
			`*/`

			`#include "ref_read.h"`

			`/**`
			`* Reads past the next ambiguous or unambiguous stretch of sequence`
			`* from the given FASTA file and returns its length. Does not do`
			`* anything with the sequence characters themselves; this is purely for`
			`* measuring lengths.`
			`*/`
			`RefRecord fastaRefReadSize(`
			`FileBuf& in,`
			`const RefReadInParams& rparms,`
			`bool first,`
			`BitpairOutFileBuf* bpout,`
			`string* name // put parsed FASTA name here`
			`)`
			`{`
			`int c;`
			`static int lastc = '>'; // last character seen`

			`// RefRecord params`
			`TIndexOffU len = 0; // 'len' counts toward total length`
			`// 'off' counts number of ambiguous characters before first`
			`// unambiguous character`
			`size_t off = 0;`

			`// Pick off the first carat and any preceding whitespace`
			`if(first) {`
			`assert(!in.eof());`
			`lastc = '>';`
			`c = in.getPastWhitespace();`
			`if(in.eof()) {`
			`// Got eof right away; emit warning`
			`cerr << "Warning: Empty input file" << endl;`
			`lastc = -1;`
			`return RefRecord(0, 0, true);`
			`}`
			`assert(c == '>');`
			`}`

			`first = true;`
			`// Skip to the end of the id line; if the next line is either`
			`// another id line or a comment line, keep skipping`
			`if(lastc == '>') {`
			`#if 0`
			`// Skip to the end of the name line`
			`do {`
			`if((c = in.getPastNewline()) == -1) {`
			`// No more input`
			`cerr << "Warning: Encountered empty reference sequence" << endl;`
			`lastc = -1;`
			`return RefRecord(0, 0, true);`
			`}`
			`if(c == '>') {`
			`cerr << "Warning: Encountered empty reference sequence" << endl;`
			`}`
			`// continue until a non-name, non-comment line`
			`} while (c == '>');`
			`#else`
			`c = lastc;`
			`// Skip to the end of the name line`
			`do {`
			`// get name`

			`while (true) {`
			`c = in.get();`
			`if (c == -1) {`
			`// No more input`
			`cerr << "Warning: Encountered empty reference sequence" << endl;`
			`lastc = -1;`
			`return RefRecord(0, 0, true);`
			`}`

			`if(c == '\n' \|\| c == '\r') {`
			`while(c == '\r' \|\| c == '\n') c = in.get();`
			`if (c == -1) {`
			`// No more input`
			`cerr << "Warning: Encountered empty reference sequence" << endl;`
			`lastc = -1;`
			`return RefRecord(0, 0, true);`
			`}`
			`break;`
			`}`

			`if (name) name->push_back(c);`
			`}`

			`if (c == '>') {`
			`cerr << "Warning: Encountered empty reference sequence" << endl;`
			`// If there's another name line immediately after this one,`
			`// discard the previous name and start fresh with the new one`
			`if (name) name->clear();`
			`}`
			`} while (c == '>');`
			`#endif`
			`} else {`
			`first = false; // not the first in a sequence`
			`off = 1; // The gap has already been consumed, so count it`
			`if((c = in.get()) == -1) {`
			`// Don't emit a warning, since this might legitimately be`
			`// a gap on the end of the final sequence in the file`
			`lastc = -1;`
			`return RefRecord((TIndexOffU)off, (TIndexOffU)len, first);`
			`}`
			`}`

			`// Now skip to the first DNA character, counting gap characters`
			`// as we go`
			`int lc = -1; // last-DNA char variable for color conversion`
			`while(true) {`
			`int cat = asc2dnacat[c];`
			`if(rparms.nsToAs && cat >= 2) c = 'A';`
			`if(cat == 1) {`
			`// This is a DNA character`
			`if(rparms.color) {`
			`if(lc != -1) {`
			`// Got two consecutive unambiguous DNAs`
			`break; // to read-in loop`
			`}`
			`// Keep going; we need two consecutive unambiguous DNAs`
			`lc = asc2dna[(int)c];`
			`// The 'if(off > 0)' takes care of the case where`
			`// the reference is entirely unambiguous and we don't`
			`// want to incorrectly increment off.`
			`if(off > 0) off++;`
			`} else {`
			`break; // to read-in loop`
			`}`
			`} else if(cat >= 2) {`
			`if(lc != -1 && off == 0) off++;`
			`lc = -1;`
			`off++; // skip over gap character and increment`
			`} else if(c == '>') {`
			`if(off > 0 && lastc == '>') {`
			`cerr << "Warning: Encountered reference sequence with only gaps" << endl;`
			`} else if(lastc == '>') {`
			`cerr << "Warning: Encountered empty reference sequence" << endl;`
			`}`
			`lastc = '>';`
			`//return RefRecord(off, 0, false);`
			`return RefRecord((TIndexOffU)off, 0, first);`
			`}`
			`c = in.get();`
			`if(c == -1) {`
			`// End-of-file`
			`if(off > 0 && lastc == '>') {`
			`cerr << "Warning: Encountered reference sequence with only gaps" << endl;`
			`} else if(lastc == '>') {`
			`cerr << "Warning: Encountered empty reference sequence" << endl;`
			`}`
			`lastc = -1;`
			`//return RefRecord(off, 0, false);`
			`return RefRecord((TIndexOffU)off, 0, first);`
			`}`
			`}`
			`assert(!rparms.color \|\| (lc != -1));`
			`assert_eq(1, asc2dnacat[c]); // C must be unambiguous base`
			`if(off > 0 && rparms.color && first) {`
			`// Handle the case where the first record has ambiguous`
			`// characters but we're in color space; one of those counts is`
			`// spurious`
			`off--;`
			`}`

			`// in now points just past the first character of a sequence`
			`// line, and c holds the first character`
			`while(c != -1 && c != '>') {`
			`if(rparms.nsToAs && asc2dnacat[c] >= 2) c = 'A';`
			`uint8_t cat = asc2dnacat[c];`
			`int cc = toupper(c);`
			`if(rparms.bisulfite && cc == 'C') c = cc = 'T';`
			`if(cat == 1) {`
			`// It's a DNA character`
			`assert(cc == 'A' \|\| cc == 'C' \|\| cc == 'G' \|\| cc == 'T');`
			`// Check for overflow`
			`if((TIndexOffU)(len + 1) < len) {`
			`throw RefTooLongException();`
			`}`
			`// Consume it`
			`len++;`
			`// Output it`
			`if(bpout != NULL) {`
			`if(rparms.color) {`
			`// output color`
			`bpout->write(dinuc2color[asc2dna[(int)c]][lc]);`
			`} else if(!rparms.color) {`
			`// output nucleotide`
			`bpout->write(asc2dna[c]);`
			`}`
			`}`
			`lc = asc2dna[(int)c];`
			`} else if(cat >= 2) {`
			`// It's an N or a gap`
			`lastc = c;`
			`assert(cc != 'A' && cc != 'C' && cc != 'G' && cc != 'T');`
			`return RefRecord((TIndexOffU)off, (TIndexOffU)len, first);`
			`} else {`
			`// Not DNA and not a gap, ignore it`
			`#ifndef NDEBUG`
			`if(!isspace(c)) {`
			`cerr << "Unexpected character in sequence: ";`
			`if(isprint(c)) {`
			`cerr << ((char)c) << endl;`
			`} else {`
			`cerr << "(" << c << ")" << endl;`
			`}`
			`}`
			`#endif`
			`}`
			`c = in.get();`
			`}`
			`lastc = c;`
			`return RefRecord((TIndexOffU)off, (TIndexOffU)len, first);`
			`}`

			`#if 0`
			`static void`
			`printRecords(ostream& os, const EList<RefRecord>& l) {`
			`for(size_t i = 0; i < l.size(); i++) {`
			`os << l[i].first << ", " << l[i].off << ", " << l[i].len << endl;`
			`}`
			`}`
			`#endif`

			`/**`
			`* Reverse the 'src' list of RefRecords into the 'dst' list. Don't`
			`* modify 'src'.`
			`*/`
			`void reverseRefRecords(`
			`const EList<RefRecord>& src,`
			`EList<RefRecord>& dst,`
			`bool recursive,`
			`bool verbose)`
			`{`
			`dst.clear();`
			`{`
			`EList<RefRecord> cur;`
			`for(int i = (int)src.size()-1; i >= 0; i--) {`
			`bool first = (i == (int)src.size()-1 \|\| src[i+1].first);`
			`// Clause after the \|\| on next line is to deal with empty FASTA`
			`// records at the end of the 'src' list, which would be wrongly`
			`// omitted otherwise.`
			`if(src[i].len \|\| (first && src[i].off == 0)) {`
			`cur.push_back(RefRecord(0, src[i].len, first));`
			`first = false;`
			`}`
			`if(src[i].off) cur.push_back(RefRecord(src[i].off, 0, first));`
			`}`
			`for(int i = 0; i < (int)cur.size(); i++) {`
			`assert(cur[i].off == 0 \|\| cur[i].len == 0);`
			`if(i < (int)cur.size()-1 && cur[i].off != 0 && !cur[i+1].first) {`
			`dst.push_back(RefRecord(cur[i].off, cur[i+1].len, cur[i].first));`
			`i++;`
			`} else {`
			`dst.push_back(cur[i]);`
			`}`
			`}`
			`}`
			`//if(verbose) {`
			`// cout << "Source: " << endl;`
			`// printRecords(cout, src);`
			`// cout << "Dest: " << endl;`
			`// printRecords(cout, dst);`
			`//}`
			`#ifndef NDEBUG`
			`size_t srcnfirst = 0, dstnfirst = 0;`
			`for(size_t i = 0; i < src.size(); i++) {`
			`if(src[i].first) {`
			`srcnfirst++;`
			`}`
			`}`
			`for(size_t i = 0; i < dst.size(); i++) {`
			`if(dst[i].first) {`
			`dstnfirst++;`
			`}`
			`}`
			`assert_eq(srcnfirst, dstnfirst);`
			`if(!recursive) {`
			`EList<RefRecord> tmp;`
			`reverseRefRecords(dst, tmp, true);`
			`assert_eq(tmp.size(), src.size());`
			`for(size_t i = 0; i < src.size(); i++) {`
			`assert_eq(src[i].len, tmp[i].len);`
			`assert_eq(src[i].off, tmp[i].off);`
			`assert_eq(src[i].first, tmp[i].first);`
			`}`
			`}`
			`#endif`
			`}`

			`/**`
			`* Calculate a vector containing the sizes of all of the patterns in`
			`* all of the given input files, in order. Returns the total size of`
			`* all references combined. Rewinds each istream before returning.`
			`*/`
			`std::pair<size_t, size_t>`
			`fastaRefReadSizes(`
			`EList<FileBuf*>& in,`
			`EList<RefRecord>& recs,`
			`const RefReadInParams& rparms,`
			`BitpairOutFileBuf* bpout,`
			`TIndexOff& numSeqs)`
			`{`
			`TIndexOffU unambigTot = 0;`
			`size_t bothTot = 0;`
			`assert_gt(in.size(), 0);`
			`// For each input istream`
			`for(size_t i = 0; i < in.size(); i++) {`
			`bool first = true;`
			`assert(!in[i]->eof());`
			`// For each pattern in this istream`
			`while(!in[i]->eof()) {`
			`RefRecord rec;`
			`try {`
			`rec = fastaRefReadSize(*in[i], rparms, first, bpout);`
			`if((unambigTot + rec.len) < unambigTot) {`
			`throw RefTooLongException();`
			`}`
			`}`
			`catch(RefTooLongException& e) {`
			`cerr << e.what() << endl;`
			`throw 1;`
			`}`
			`// Add the length of this record.`
			`if(rec.first) numSeqs++;`
			`unambigTot += rec.len;`
			`bothTot += rec.len;`
			`bothTot += rec.off;`
			`first = false;`
			`if(rec.len == 0 && rec.off == 0 && !rec.first) continue;`
			`recs.push_back(rec);`
			`}`
			`// Reset the input stream`
			`in[i]->reset();`
			`assert(!in[i]->eof());`
			`#ifndef NDEBUG`
			`// Check that it's really reset`
			`int c = in[i]->get();`
			`assert_eq('>', c);`
			`in[i]->reset();`
			`assert(!in[i]->eof());`
			`#endif`
			`}`

			`// Remove empty reference sequences`
			`for(int64_t i = 0; (size_t)i < recs.size(); i++) {`
			`const RefRecord& rec = recs[i];`
			`if(rec.first && rec.len == 0) {`
			`if(i + 1 >= recs.size() \|\| recs[i+1].first) {`
			`bothTot -= rec.len;`
			`bothTot -= rec.off;`
			`recs.erase(i);`
			`i -= 1;`
			`}`
			`}`
			`}`

			`assert_geq(bothTot, 0);`
			`assert_geq(unambigTot, 0);`
			`return make_pair(`
			`unambigTot, // total number of unambiguous DNA characters read`
			`bothTot); // total number of DNA characters read, incl. ambiguous ones`
			`}`


			`std::pair<size_t, size_t>`
			`fastaRefReadFragsNames(`
			`EList<FileBuf*>& in,`
			`EList<RefRecord>& recs,`
			`const RefReadInParams& rparms,`
			`BitpairOutFileBuf* bpout,`
			`TIndexOff& numSeqs,`
			`EList<string>& names)`
			`{`
			`TIndexOffU unambigTot = 0;`
			`size_t bothTot = 0;`
			`assert_gt(in.size(), 0);`
			`// For each input istream`
			`for(size_t i = 0; i < in.size(); i++) {`
			`bool first = true;`
			`assert(!in[i]->eof());`
			`// For each pattern in this istream`
			`while(!in[i]->eof()) {`
			`RefRecord rec;`
			`string name;`
			`try {`
			`rec = fastaRefReadSize(*in[i], rparms, first, bpout, &name);`
			`if((unambigTot + rec.len) < unambigTot) {`
			`throw RefTooLongException();`
			`}`
			`}`
			`catch(RefTooLongException& e) {`
			`cerr << e.what() << endl;`
			`throw 1;`
			`}`
			`// Add the length of this record.`
			`if(rec.first) numSeqs++;`
			`if(rec.first) names.push_back(name);`
			`unambigTot += rec.len;`
			`bothTot += rec.len;`
			`bothTot += rec.off;`
			`first = false;`
			`if(rec.len == 0 && rec.off == 0 && !rec.first) continue;`
			`recs.push_back(rec);`
			`}`
			`// Reset the input stream`
			`in[i]->reset();`
			`assert(!in[i]->eof());`
			`#ifndef NDEBUG`
			`// Check that it's really reset`
			`int c = in[i]->get();`
			`assert_eq('>', c);`
			`in[i]->reset();`
			`assert(!in[i]->eof());`
			`#endif`
			`}`

			`// Remove empty reference sequences`
			`for(int64_t i = 0; (size_t)i < recs.size(); i++) {`
			`const RefRecord& rec = recs[i];`
			`if(rec.first && rec.len == 0) {`
			`if(i + 1 >= recs.size() \|\| recs[i+1].first) {`
			`bothTot -= rec.len;`
			`bothTot -= rec.off;`
			`recs.erase(i);`
			`i -= 1;`
			`}`
			`}`
			`}`

			`assert_geq(bothTot, 0);`
			`assert_geq(unambigTot, 0);`
			`return make_pair(`
			`unambigTot, // total number of unambiguous DNA characters read`
			`bothTot); // total number of DNA characters read, incl. ambiguous ones`
			`}`