305 lines
8.2 KiB
C++
305 lines
8.2 KiB
C++
|
/*
|
||
|
* Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
|
||
|
*
|
||
|
* This file is part of Bowtie 2.
|
||
|
*
|
||
|
* Bowtie 2 is free software: you can redistribute it and/or modify
|
||
|
* it under the terms of the GNU General Public License as published by
|
||
|
* the Free Software Foundation, either version 3 of the License, or
|
||
|
* (at your option) any later version.
|
||
|
*
|
||
|
* Bowtie 2 is distributed in the hope that it will be useful,
|
||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
|
* GNU General Public License for more details.
|
||
|
*
|
||
|
* You should have received a copy of the GNU General Public License
|
||
|
* along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
|
||
|
*/
|
||
|
|
||
|
#include "pat.h"
|
||
|
|
||
|
/**
|
||
|
* Parse a name from fb_ and store in r. Assume that the next
|
||
|
* character obtained via fb_.get() is the first character of
|
||
|
* the sequence and the string stops at the next char upto (could
|
||
|
* be tab, newline, etc.).
|
||
|
*/
|
||
|
int QseqPatternSource::parseName(
|
||
|
Read& r, // buffer for mate 1
|
||
|
Read* r2, // buffer for mate 2 (NULL if mate2 is read separately)
|
||
|
bool append, // true -> append characters, false -> skip them
|
||
|
bool clearFirst, // clear the name buffer first
|
||
|
bool warnEmpty, // emit a warning if nothing was added to the name
|
||
|
bool useDefault, // if nothing is read, put readCnt_ as a default value
|
||
|
int upto) // stop parsing when we first reach character 'upto'
|
||
|
{
|
||
|
if(clearFirst) {
|
||
|
if(r2 != NULL) r2->name.clear();
|
||
|
r.name.clear();
|
||
|
}
|
||
|
while(true) {
|
||
|
int c;
|
||
|
if((c = fb_.get()) < 0) {
|
||
|
// EOF reached in the middle of the name
|
||
|
return -1;
|
||
|
}
|
||
|
if(c == '\n' || c == '\r') {
|
||
|
// EOL reached in the middle of the name
|
||
|
return -1;
|
||
|
}
|
||
|
if(c == upto) {
|
||
|
// Finished with field
|
||
|
break;
|
||
|
}
|
||
|
if(append) {
|
||
|
if(r2 != NULL) r2->name.append(c);
|
||
|
r.name.append(c);
|
||
|
}
|
||
|
}
|
||
|
// Set up a default name if one hasn't been set
|
||
|
if(r.name.empty() && useDefault && append) {
|
||
|
char cbuf[20];
|
||
|
itoa10(readCnt_, cbuf);
|
||
|
r.name.append(cbuf);
|
||
|
if(r2 != NULL) r2->name.append(cbuf);
|
||
|
}
|
||
|
if(r.name.empty() && warnEmpty) {
|
||
|
cerr << "Warning: read had an empty name field" << endl;
|
||
|
}
|
||
|
return (int)r.name.length();
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Parse a single sequence from fb_ and store in r. Assume
|
||
|
* that the next character obtained via fb_.get() is the first
|
||
|
* character of the sequence and the sequence stops at the next
|
||
|
* char upto (could be tab, newline, etc.).
|
||
|
*/
|
||
|
int QseqPatternSource::parseSeq(
|
||
|
Read& r,
|
||
|
int& charsRead,
|
||
|
int& trim5,
|
||
|
char upto)
|
||
|
{
|
||
|
int begin = 0;
|
||
|
int c = fb_.get();
|
||
|
assert(c != upto);
|
||
|
r.patFw.clear();
|
||
|
r.color = gColor;
|
||
|
if(gColor) {
|
||
|
// NOTE: clearly this is not relevant for Illumina output, but
|
||
|
// I'm keeping it here in case there's some reason to put SOLiD
|
||
|
// data in this format in the future.
|
||
|
|
||
|
// This may be a primer character. If so, keep it in the
|
||
|
// 'primer' field of the read buf and parse the rest of the
|
||
|
// read without it.
|
||
|
c = toupper(c);
|
||
|
if(asc2dnacat[c] > 0) {
|
||
|
// First char is a DNA char
|
||
|
int c2 = toupper(fb_.peek());
|
||
|
// Second char is a color char
|
||
|
if(asc2colcat[c2] > 0) {
|
||
|
r.primer = c;
|
||
|
r.trimc = c2;
|
||
|
trim5 += 2; // trim primer and first color
|
||
|
}
|
||
|
}
|
||
|
if(c < 0) { return -1; }
|
||
|
}
|
||
|
while(c != upto) {
|
||
|
if(c == '.') c = 'N';
|
||
|
if(gColor) {
|
||
|
if(c >= '0' && c <= '4') c = "ACGTN"[(int)c - '0'];
|
||
|
}
|
||
|
if(isalpha(c)) {
|
||
|
assert_in(toupper(c), "ACGTN");
|
||
|
if(begin++ >= trim5) {
|
||
|
assert_neq(0, asc2dnacat[c]);
|
||
|
r.patFw.append(asc2dna[c]);
|
||
|
}
|
||
|
charsRead++;
|
||
|
}
|
||
|
if((c = fb_.get()) < 0) {
|
||
|
return -1;
|
||
|
}
|
||
|
}
|
||
|
r.patFw.trimEnd(gTrim3);
|
||
|
return (int)r.patFw.length();
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Parse a single quality string from fb_ and store in r.
|
||
|
* Assume that the next character obtained via fb_.get() is
|
||
|
* the first character of the quality string and the string stops
|
||
|
* at the next char upto (could be tab, newline, etc.).
|
||
|
*/
|
||
|
int QseqPatternSource::parseQuals(
|
||
|
Read& r,
|
||
|
int charsRead,
|
||
|
int dstLen,
|
||
|
int trim5,
|
||
|
char& c2,
|
||
|
char upto = '\t',
|
||
|
char upto2 = -1)
|
||
|
{
|
||
|
int qualsRead = 0;
|
||
|
int c = 0;
|
||
|
if (intQuals_) {
|
||
|
// Probably not relevant
|
||
|
char buf[4096];
|
||
|
while (qualsRead < charsRead) {
|
||
|
qualToks_.clear();
|
||
|
if(!tokenizeQualLine(fb_, buf, 4096, qualToks_)) break;
|
||
|
for (unsigned int j = 0; j < qualToks_.size(); ++j) {
|
||
|
char c = intToPhred33(atoi(qualToks_[j].c_str()), solQuals_);
|
||
|
assert_geq(c, 33);
|
||
|
if (qualsRead >= trim5) {
|
||
|
r.qual.append(c);
|
||
|
}
|
||
|
++qualsRead;
|
||
|
}
|
||
|
} // done reading integer quality lines
|
||
|
if (charsRead > qualsRead) tooFewQualities(r.name);
|
||
|
} else {
|
||
|
// Non-integer qualities
|
||
|
while((qualsRead < dstLen + trim5) && c >= 0) {
|
||
|
c = fb_.get();
|
||
|
c2 = c;
|
||
|
if (c == ' ') wrongQualityFormat(r.name);
|
||
|
if(c < 0) {
|
||
|
// EOF occurred in the middle of a read - abort
|
||
|
return -1;
|
||
|
}
|
||
|
if(!isspace(c) && c != upto && (upto2 == -1 || c != upto2)) {
|
||
|
if (qualsRead >= trim5) {
|
||
|
c = charToPhred33(c, solQuals_, phred64Quals_);
|
||
|
assert_geq(c, 33);
|
||
|
r.qual.append(c);
|
||
|
}
|
||
|
qualsRead++;
|
||
|
} else {
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
if(r.qual.length() < (size_t)dstLen) {
|
||
|
tooFewQualities(r.name);
|
||
|
}
|
||
|
// TODO: How to detect too many qualities??
|
||
|
r.qual.resize(dstLen);
|
||
|
while(c != -1 && c != upto && (upto2 == -1 || c != upto2)) {
|
||
|
c = fb_.get();
|
||
|
c2 = c;
|
||
|
}
|
||
|
return qualsRead;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Read another pattern from a Qseq input file.
|
||
|
*/
|
||
|
bool QseqPatternSource::read(
|
||
|
Read& r,
|
||
|
TReadId& rdid,
|
||
|
TReadId& endid,
|
||
|
bool& success,
|
||
|
bool& done)
|
||
|
{
|
||
|
r.reset();
|
||
|
r.color = gColor;
|
||
|
success = true;
|
||
|
done = false;
|
||
|
readCnt_++;
|
||
|
rdid = endid = readCnt_-1;
|
||
|
peekOverNewline(fb_);
|
||
|
fb_.resetLastN();
|
||
|
// 1. Machine name
|
||
|
if(parseName(r, NULL, true, true, true, false, '\t') == -1) BAIL_UNPAIRED();
|
||
|
assert_neq('\t', fb_.peek());
|
||
|
r.name.append('_');
|
||
|
// 2. Run number
|
||
|
if(parseName(r, NULL, true, false, true, false, '\t') == -1) BAIL_UNPAIRED();
|
||
|
assert_neq('\t', fb_.peek());
|
||
|
r.name.append('_');
|
||
|
// 3. Lane number
|
||
|
if(parseName(r, NULL, true, false, true, false, '\t') == -1) BAIL_UNPAIRED();
|
||
|
assert_neq('\t', fb_.peek());
|
||
|
r.name.append('_');
|
||
|
// 4. Tile number
|
||
|
if(parseName(r, NULL, true, false, true, false, '\t') == -1) BAIL_UNPAIRED();
|
||
|
assert_neq('\t', fb_.peek());
|
||
|
r.name.append('_');
|
||
|
// 5. X coordinate of spot
|
||
|
if(parseName(r, NULL, true, false, true, false, '\t') == -1) BAIL_UNPAIRED();
|
||
|
assert_neq('\t', fb_.peek());
|
||
|
r.name.append('_');
|
||
|
// 6. Y coordinate of spot
|
||
|
if(parseName(r, NULL, true, false, true, false, '\t') == -1) BAIL_UNPAIRED();
|
||
|
assert_neq('\t', fb_.peek());
|
||
|
r.name.append('_');
|
||
|
// 7. Index
|
||
|
if(parseName(r, NULL, true, false, true, false, '\t') == -1) BAIL_UNPAIRED();
|
||
|
assert_neq('\t', fb_.peek());
|
||
|
r.name.append('/');
|
||
|
// 8. Mate number
|
||
|
if(parseName(r, NULL, true, false, true, false, '\t') == -1) BAIL_UNPAIRED();
|
||
|
// Empty sequence??
|
||
|
if(fb_.peek() == '\t') {
|
||
|
// Get tab that separates seq from qual
|
||
|
ASSERT_ONLY(int c =) fb_.get();
|
||
|
assert_eq('\t', c);
|
||
|
assert_eq('\t', fb_.peek());
|
||
|
// Get tab that separates qual from filter
|
||
|
ASSERT_ONLY(c =) fb_.get();
|
||
|
assert_eq('\t', c);
|
||
|
// Next char is first char of filter flag
|
||
|
assert_neq('\t', fb_.peek());
|
||
|
fb_.resetLastN();
|
||
|
cerr << "Warning: skipping empty QSEQ read with name '" << r.name << "'" << endl;
|
||
|
} else {
|
||
|
assert_neq('\t', fb_.peek());
|
||
|
int charsRead = 0;
|
||
|
int mytrim5 = gTrim5;
|
||
|
// 9. Sequence
|
||
|
int dstLen = parseSeq(r, charsRead, mytrim5, '\t');
|
||
|
assert_neq('\t', fb_.peek());
|
||
|
if(dstLen < 0) BAIL_UNPAIRED();
|
||
|
char ct = 0;
|
||
|
// 10. Qualities
|
||
|
if(parseQuals(r, charsRead, dstLen, mytrim5, ct, '\t', -1) < 0) BAIL_UNPAIRED();
|
||
|
r.trimmed3 = gTrim3;
|
||
|
r.trimmed5 = mytrim5;
|
||
|
if(ct != '\t') {
|
||
|
cerr << "Error: QSEQ with name " << r.name << " did not have tab after qualities" << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
assert_eq(ct, '\t');
|
||
|
}
|
||
|
// 11. Filter flag
|
||
|
int filt = fb_.get();
|
||
|
if(filt == -1) BAIL_UNPAIRED();
|
||
|
r.filter = filt;
|
||
|
if(filt != '0' && filt != '1') {
|
||
|
// Bad value for filt
|
||
|
}
|
||
|
if(fb_.peek() != -1 && fb_.peek() != '\n') {
|
||
|
// Bad value right after the filt field
|
||
|
}
|
||
|
fb_.get();
|
||
|
r.readOrigBuf.install(fb_.lastN(), fb_.lastNLen());
|
||
|
fb_.resetLastN();
|
||
|
if(r.qual.length() < r.patFw.length()) {
|
||
|
tooFewQualities(r.name);
|
||
|
} else if(r.qual.length() > r.patFw.length()) {
|
||
|
tooManyQualities(r.name);
|
||
|
}
|
||
|
#ifndef NDEBUG
|
||
|
assert_eq(r.patFw.length(), r.qual.length());
|
||
|
for(size_t i = 0; i < r.qual.length(); i++) {
|
||
|
assert_geq((int)r.qual[i], 33);
|
||
|
}
|
||
|
#endif
|
||
|
return true;
|
||
|
}
|