hisat-3n/pat.cpp

1825 lines
47 KiB
C++
Raw Permalink Normal View History

2025-01-18 13:09:52 +00:00
/*
* Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
*
* This file is part of Bowtie 2.
*
* Bowtie 2 is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Bowtie 2 is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
*/
#include <cmath>
#include <iostream>
#include <string>
#include <stdexcept>
#include "sstring.h"
#include "pat.h"
#include "filebuf.h"
#include "formats.h"
#ifdef USE_SRA
#include "tinythread.h"
#include <ncbi-vdb/NGS.hpp>
#include <ngs/ErrorMsg.hpp>
#include <ngs/ReadCollection.hpp>
#include <ngs/ReadIterator.hpp>
#include <ngs/Read.hpp>
#endif
using namespace std;
/**
* Return a new dynamically allocated PatternSource for the given
* format, using the given list of strings as the filenames to read
* from or as the sequences themselves (i.e. if -c was used).
*/
PatternSource* PatternSource::patsrcFromStrings(
const PatternParams& p,
const EList<string>& qs,
size_t nthreads)
{
switch(p.format) {
case FASTA: return new FastaPatternSource(qs, p);
case FASTA_CONT: return new FastaContinuousPatternSource(qs, p);
case RAW: return new RawPatternSource(qs, p);
case FASTQ: return new FastqPatternSource(qs, p);
case TAB_MATE5: return new TabbedPatternSource(qs, p, false);
case TAB_MATE6: return new TabbedPatternSource(qs, p, true);
case CMDLINE: return new VectorPatternSource(qs, p);
case QSEQ: return new QseqPatternSource(qs, p);
#ifdef USE_SRA
case SRA_FASTA:
case SRA_FASTQ: return new SRAPatternSource(qs, p, nthreads);
#endif
default: {
cerr << "Internal error; bad patsrc format: " << p.format << endl;
throw 1;
}
}
}
/**
* The main member function for dispensing patterns.
*
* Returns true iff a pair was parsed succesfully.
*/
bool PatternSource::nextReadPair(
Read& ra,
Read& rb,
TReadId& rdid,
TReadId& endid,
bool& success,
bool& done,
bool& paired,
bool fixName)
{
// nextPatternImpl does the reading from the ultimate source;
// it is implemented in concrete subclasses
success = done = paired = false;
nextReadPairImpl(ra, rb, rdid, endid, success, done, paired);
if(success) {
// Construct reversed versions of fw and rc seqs/quals
ra.finalize();
if(!rb.empty()) {
rb.finalize();
}
// Fill in the random-seed field using a combination of
// information from the user-specified seed and the read
// sequence, qualities, and name
ra.seed = genRandSeed(ra.patFw, ra.qual, ra.name, seed_);
if(!rb.empty()) {
rb.seed = genRandSeed(rb.patFw, rb.qual, rb.name, seed_);
}
}
return success;
}
/**
* The main member function for dispensing patterns.
*/
bool PatternSource::nextRead(
Read& r,
TReadId& rdid,
TReadId& endid,
bool& success,
bool& done)
{
// nextPatternImpl does the reading from the ultimate source;
// it is implemented in concrete subclasses
nextReadImpl(r, rdid, endid, success, done);
if(success) {
// Construct the reversed versions of the fw and rc seqs
// and quals
r.finalize();
// Fill in the random-seed field using a combination of
// information from the user-specified seed and the read
// sequence, qualities, and name
r.seed = genRandSeed(r.patFw, r.qual, r.name, seed_);
}
return success;
}
/**
* Get the next paired or unpaired read from the wrapped
* PairedPatternSource.
*/
bool WrappedPatternSourcePerThread::nextReadPair(
bool& success,
bool& done,
bool& paired,
bool fixName)
{
PatternSourcePerThread::nextReadPair(success, done, paired, fixName);
ASSERT_ONLY(TReadId lastRdId = rdid_);
buf1_.reset();
buf2_.reset();
patsrc_.nextReadPair(buf1_, buf2_, rdid_, endid_, success, done, paired, fixName);
assert(!success || rdid_ != lastRdId);
return success;
}
/**
* The main member function for dispensing pairs of reads or
* singleton reads. Returns true iff ra and rb contain a new
* pair; returns false if ra contains a new unpaired read.
*/
bool PairedSoloPatternSource::nextReadPair(
Read& ra,
Read& rb,
TReadId& rdid,
TReadId& endid,
bool& success,
bool& done,
bool& paired,
bool fixName)
{
uint32_t cur = cur_;
success = false;
while(cur < src_->size()) {
// Patterns from srca_[cur_] are unpaired
do {
(*src_)[cur]->nextReadPair(
ra, rb, rdid, endid, success, done, paired, fixName);
} while(!success && !done);
if(!success) {
assert(done);
// If patFw is empty, that's our signal that the
// input dried up
lock();
if(cur + 1 > cur_) cur_++;
cur = cur_;
unlock();
continue; // on to next pair of PatternSources
}
assert(success);
ra.seed = genRandSeed(ra.patFw, ra.qual, ra.name, seed_);
if(!rb.empty()) {
rb.seed = genRandSeed(rb.patFw, rb.qual, rb.name, seed_);
if(fixName) {
ra.fixMateName(1);
rb.fixMateName(2);
}
}
ra.rdid = rdid;
ra.endid = endid;
if(!rb.empty()) {
rb.rdid = rdid;
rb.endid = endid+1;
}
ra.mate = 1;
rb.mate = 2;
return true; // paired
}
assert_leq(cur, src_->size());
done = (cur == src_->size());
return false;
}
/**
* The main member function for dispensing pairs of reads or
* singleton reads. Returns true iff ra and rb contain a new
* pair; returns false if ra contains a new unpaired read.
*/
bool PairedDualPatternSource::nextReadPair(
Read& ra,
Read& rb,
TReadId& rdid,
TReadId& endid,
bool& success,
bool& done,
bool& paired,
bool fixName)
{
// 'cur' indexes the current pair of PatternSources
uint32_t cur;
{
lock();
cur = cur_;
unlock();
}
success = false;
done = true;
while(cur < srca_->size()) {
if((*srcb_)[cur] == NULL) {
paired = false;
// Patterns from srca_ are unpaired
do {
(*srca_)[cur]->nextRead(ra, rdid, endid, success, done);
} while(!success && !done);
if(!success) {
assert(done);
lock();
if(cur + 1 > cur_) cur_++;
cur = cur_; // Move on to next PatternSource
unlock();
continue; // on to next pair of PatternSources
}
ra.rdid = rdid;
ra.endid = endid;
ra.mate = 0;
return success;
} else {
paired = true;
// Patterns from srca_[cur_] and srcb_[cur_] are paired
TReadId rdid_a = 0, endid_a = 0;
TReadId rdid_b = 0, endid_b = 0;
bool success_a = false, done_a = false;
bool success_b = false, done_b = false;
// Lock to ensure that this thread gets parallel reads
// in the two mate files
lock();
do {
(*srca_)[cur]->nextRead(ra, rdid_a, endid_a, success_a, done_a);
} while(!success_a && !done_a);
do {
(*srcb_)[cur]->nextRead(rb, rdid_b, endid_b, success_b, done_b);
} while(!success_b && !done_b);
if(!success_a && success_b) {
cerr << "Error, fewer reads in file specified with -1 than in file specified with -2" << endl;
throw 1;
} else if(!success_a) {
assert(done_a && done_b);
if(cur + 1 > cur_) cur_++;
cur = cur_; // Move on to next PatternSource
unlock();
continue; // on to next pair of PatternSources
} else if(!success_b) {
cerr << "Error, fewer reads in file specified with -2 than in file specified with -1" << endl;
throw 1;
}
assert_eq(rdid_a, rdid_b);
//assert_eq(endid_a+1, endid_b);
assert_eq(success_a, success_b);
unlock();
if(fixName) {
ra.fixMateName(1);
rb.fixMateName(2);
}
rdid = rdid_a;
endid = endid_a;
success = success_a;
done = done_a;
ra.rdid = rdid;
ra.endid = endid;
if(!rb.empty()) {
rb.rdid = rdid;
rb.endid = endid+1;
}
ra.mate = 1;
rb.mate = 2;
return success;
}
}
return success;
}
/**
* Return the number of reads attempted.
*/
pair<TReadId, TReadId> PairedDualPatternSource::readCnt() const {
uint64_t rets = 0llu, retp = 0llu;
for(size_t i = 0; i < srca_->size(); i++) {
if((*srcb_)[i] == NULL) {
rets += (*srca_)[i]->readCnt();
} else {
assert_eq((*srca_)[i]->readCnt(), (*srcb_)[i]->readCnt());
retp += (*srca_)[i]->readCnt();
}
}
return make_pair(rets, retp);
}
/**
* Given the values for all of the various arguments used to specify
* the read and quality input, create a list of pattern sources to
* dispense them.
*/
PairedPatternSource* PairedPatternSource::setupPatternSources(
const EList<string>& si, // singles, from argv
const EList<string>& m1, // mate1's, from -1 arg
const EList<string>& m2, // mate2's, from -2 arg
const EList<string>& m12, // both mates on each line, from --12 arg
#ifdef USE_SRA
const EList<string>& sra_accs,
#endif
const EList<string>& q, // qualities associated with singles
const EList<string>& q1, // qualities associated with m1
const EList<string>& q2, // qualities associated with m2
const PatternParams& p, // read-in parameters
size_t nthreads,
bool verbose) // be talkative?
{
EList<PatternSource*>* a = new EList<PatternSource*>();
EList<PatternSource*>* b = new EList<PatternSource*>();
EList<PatternSource*>* ab = new EList<PatternSource*>();
// Create list of pattern sources for paired reads appearing
// interleaved in a single file
for(size_t i = 0; i < m12.size(); i++) {
const EList<string>* qs = &m12;
EList<string> tmp;
if(p.fileParallel) {
// Feed query files one to each PatternSource
qs = &tmp;
tmp.push_back(m12[i]);
assert_eq(1, tmp.size());
}
ab->push_back(PatternSource::patsrcFromStrings(p, *qs, nthreads));
if(!p.fileParallel) {
break;
}
}
#ifdef USE_SRA
for(size_t i = 0; i < sra_accs.size(); i++) {
const EList<string>* qs = &sra_accs;
EList<string> tmp;
if(p.fileParallel) {
// Feed query files one to each PatternSource
qs = &tmp;
tmp.push_back(sra_accs[i]);
assert_eq(1, tmp.size());
}
ab->push_back(PatternSource::patsrcFromStrings(p, *qs, nthreads));
if(!p.fileParallel) {
break;
}
}
#endif
// Create list of pattern sources for paired reads
for(size_t i = 0; i < m1.size(); i++) {
const EList<string>* qs = &m1;
EList<string> tmpSeq;
EList<string> tmpQual;
if(p.fileParallel) {
// Feed query files one to each PatternSource
qs = &tmpSeq;
tmpSeq.push_back(m1[i]);
assert_eq(1, tmpSeq.size());
}
PatternSource *patsrc = PatternSource::patsrcFromStrings(p, *qs, nthreads);
patsrc->paired_type = 1;
a->push_back(patsrc);
if(!p.fileParallel) {
break;
}
}
// Create list of pattern sources for paired reads
for(size_t i = 0; i < m2.size(); i++) {
const EList<string>* qs = &m2;
EList<string> tmpSeq;
EList<string> tmpQual;
if(p.fileParallel) {
// Feed query files one to each PatternSource
qs = &tmpSeq;
tmpSeq.push_back(m2[i]);
assert_eq(1, tmpSeq.size());
}
PatternSource *patsrc = PatternSource::patsrcFromStrings(p, *qs, nthreads);
patsrc->paired_type = 2;
b->push_back(patsrc);
if(!p.fileParallel) {
break;
}
}
// All mates/mate files must be paired
assert_eq(a->size(), b->size());
// Create list of pattern sources for the unpaired reads
for(size_t i = 0; i < si.size(); i++) {
const EList<string>* qs = &si;
PatternSource* patsrc = NULL;
EList<string> tmpSeq;
EList<string> tmpQual;
if(p.fileParallel) {
// Feed query files one to each PatternSource
qs = &tmpSeq;
tmpSeq.push_back(si[i]);
assert_eq(1, tmpSeq.size());
}
patsrc = PatternSource::patsrcFromStrings(p, *qs, nthreads);
patsrc->paired_type = 1;
assert(patsrc != NULL);
a->push_back(patsrc);
b->push_back(NULL);
if(!p.fileParallel) {
break;
}
}
PairedPatternSource *patsrc = NULL;
#ifdef USE_SRA
if(m12.size() > 0 || sra_accs.size() > 0) {
#else
if(m12.size() > 0) {
#endif
patsrc = new PairedSoloPatternSource(ab, p);
for(size_t i = 0; i < a->size(); i++) delete (*a)[i];
for(size_t i = 0; i < b->size(); i++) delete (*b)[i];
delete a; delete b;
} else {
patsrc = new PairedDualPatternSource(a, b, p);
for(size_t i = 0; i < ab->size(); i++) delete (*ab)[i];
delete ab;
}
return patsrc;
}
VectorPatternSource::VectorPatternSource(
const EList<string>& v,
const PatternParams& p) :
PatternSource(p),
cur_(p.skip),
skip_(p.skip),
paired_(false),
v_(),
quals_()
{
for(size_t i = 0; i < v.size(); i++) {
EList<string> ss;
tokenize(v[i], ":", ss, 2);
assert_gt(ss.size(), 0);
assert_leq(ss.size(), 2);
// Initialize s
string s = ss[0];
int mytrim5 = gTrim5;
if(gColor && s.length() > 1) {
// This may be a primer character. If so, keep it in the
// 'primer' field of the read buf and parse the rest of the
// read without it.
int c = toupper(s[0]);
if(asc2dnacat[c] > 0) {
// First char is a DNA char
int c2 = toupper(s[1]);
// Second char is a color char
if(asc2colcat[c2] > 0) {
mytrim5 += 2; // trim primer and first color
}
}
}
if(gColor) {
// Convert '0'-'3' to 'A'-'T'
for(size_t i = 0; i < s.length(); i++) {
if(s[i] >= '0' && s[i] <= '4') {
s[i] = "ACGTN"[(int)s[i] - '0'];
}
if(s[i] == '.') s[i] = 'N';
}
}
if(s.length() <= (size_t)(gTrim3 + mytrim5)) {
// Entire read is trimmed away
s.clear();
} else {
// Trim on 5' (high-quality) end
if(mytrim5 > 0) {
s.erase(0, mytrim5);
}
// Trim on 3' (low-quality) end
if(gTrim3 > 0) {
s.erase(s.length()-gTrim3);
}
}
// Initialize vq
string vq;
if(ss.size() == 2) {
vq = ss[1];
}
// Trim qualities
if(vq.length() > (size_t)(gTrim3 + mytrim5)) {
// Trim on 5' (high-quality) end
if(mytrim5 > 0) {
vq.erase(0, mytrim5);
}
// Trim on 3' (low-quality) end
if(gTrim3 > 0) {
vq.erase(vq.length()-gTrim3);
}
}
// Pad quals with Is if necessary; this shouldn't happen
while(vq.length() < s.length()) {
vq.push_back('I');
}
// Truncate quals to match length of read if necessary;
// this shouldn't happen
if(vq.length() > s.length()) {
vq.erase(s.length());
}
assert_eq(vq.length(), s.length());
v_.expand();
v_.back().installChars(s);
quals_.push_back(BTString(vq));
trimmed3_.push_back(gTrim3);
trimmed5_.push_back(mytrim5);
ostringstream os;
os << (names_.size());
names_.push_back(BTString(os.str()));
}
assert_eq(v_.size(), quals_.size());
}
bool VectorPatternSource::nextReadImpl(
Read& r,
TReadId& rdid,
TReadId& endid,
bool& success,
bool& done)
{
// Let Strings begin at the beginning of the respective bufs
r.reset();
lock();
if(cur_ >= v_.size()) {
unlock();
// Clear all the Strings, as a signal to the caller that
// we're out of reads
r.reset();
success = false;
done = true;
assert(r.empty());
return false;
}
// Copy v_*, quals_* strings into the respective Strings
r.color = gColor;
r.patFw = v_[cur_];
r.patFw_3N = v_[cur_];
r.qual = quals_[cur_];
r.trimmed3 = trimmed3_[cur_];
r.trimmed5 = trimmed5_[cur_];
ostringstream os;
os << cur_;
r.name = os.str();
cur_++;
done = cur_ == v_.size();
rdid = endid = readCnt_;
readCnt_++;
unlock();
success = true;
return true;
}
/**
* This is unused, but implementation is given for completeness.
*/
bool VectorPatternSource::nextReadPairImpl(
Read& ra,
Read& rb,
TReadId& rdid,
TReadId& endid,
bool& success,
bool& done,
bool& paired)
{
// Let Strings begin at the beginning of the respective bufs
ra.reset();
rb.reset();
paired = true;
if(!paired_) {
paired_ = true;
cur_ <<= 1;
}
lock();
if(cur_ >= v_.size()-1) {
unlock();
// Clear all the Strings, as a signal to the caller that
// we're out of reads
ra.reset();
rb.reset();
assert(ra.empty());
assert(rb.empty());
success = false;
done = true;
return false;
}
// Copy v_*, quals_* strings into the respective Strings
ra.patFw = v_[cur_];
ra.qual = quals_[cur_];
ra.trimmed3 = trimmed3_[cur_];
ra.trimmed5 = trimmed5_[cur_];
cur_++;
rb.patFw = v_[cur_];
rb.qual = quals_[cur_];
rb.trimmed3 = trimmed3_[cur_];
rb.trimmed5 = trimmed5_[cur_];
ostringstream os;
os << readCnt_;
ra.name = os.str();
rb.name = os.str();
ra.color = rb.color = gColor;
cur_++;
done = cur_ >= v_.size()-1;
rdid = endid = readCnt_;
readCnt_++;
unlock();
success = true;
return true;
}
/**
* Parse a single quality string from fb and store qualities in r.
* Assume the next character obtained via fb.get() is the first
* character of the quality string. When returning, the next
* character returned by fb.peek() or fb.get() should be the first
* character of the following line.
*/
int parseQuals(
Read& r,
FileBuf& fb,
int firstc,
int readLen,
int trim3,
int trim5,
bool intQuals,
bool phred64,
bool solexa64)
{
int c = firstc;
assert(c != '\n' && c != '\r');
r.qual.clear();
if (intQuals) {
while (c != '\r' && c != '\n' && c != -1) {
bool neg = false;
int num = 0;
while(!isspace(c) && !fb.eof()) {
if(c == '-') {
neg = true;
assert_eq(num, 0);
} else {
if(!isdigit(c)) {
char buf[2048];
cerr << "Warning: could not parse quality line:" << endl;
fb.getPastNewline();
cerr << fb.copyLastN(buf);
buf[2047] = '\0';
cerr << buf;
throw 1;
}
assert(isdigit(c));
num *= 10;
num += (c - '0');
}
c = fb.get();
}
if(neg) num = 0;
// Phred-33 ASCII encode it and add it to the back of the
// quality string
r.qual.append('!' + num);
// Skip over next stretch of whitespace
while(c != '\r' && c != '\n' && isspace(c) && !fb.eof()) {
c = fb.get();
}
}
} else {
while (c != '\r' && c != '\n' && c != -1) {
r.qual.append(charToPhred33(c, solexa64, phred64));
c = fb.get();
while(c != '\r' && c != '\n' && isspace(c) && !fb.eof()) {
c = fb.get();
}
}
}
if ((int)r.qual.length() < readLen-1 ||
((int)r.qual.length() < readLen && !r.color))
{
tooFewQualities(r.name);
}
r.qual.trimEnd(trim3);
if(r.qual.length()-trim5 < r.patFw.length()) {
assert(gColor && r.primer != -1);
assert_gt(trim5, 0);
trim5--;
}
r.qual.trimBegin(trim5);
if(r.qual.length() <= 0) return 0;
assert_eq(r.qual.length(), r.patFw.length());
while(fb.peek() == '\n' || fb.peek() == '\r') fb.get();
return (int)r.qual.length();
}
/// Read another pattern from a FASTA input file
bool FastaPatternSource::read(
Read& r,
TReadId& rdid,
TReadId& endid,
bool& success,
bool& done)
{
int c, qc = 0;
success = true;
done = false;
assert(fb_.isOpen());
r.reset();
r.color = gColor;
// Pick off the first carat
c = fb_.get();
if(c < 0) {
bail(r); success = false; done = true; return success;
}
while(c == '#' || c == ';' || c == '\r' || c == '\n') {
c = fb_.peekUptoNewline();
fb_.resetLastN();
c = fb_.get();
}
assert_eq(1, fb_.lastNLen());
// Pick off the first carat
if(first_) {
if(c != '>') {
cerr << "Error: reads file does not look like a FASTA file" << endl;
throw 1;
}
first_ = false;
}
assert_eq('>', c);
c = fb_.get(); // get next char after '>'
// Read to the end of the id line, sticking everything after the '>'
// into *name
//bool warning = false;
while(true) {
if(c < 0 || qc < 0) {
bail(r); success = false; done = true; return success;
}
if(c == '\n' || c == '\r') {
// Break at end of line, after consuming all \r's, \n's
while(c == '\n' || c == '\r') {
if(fb_.peek() == '>') {
// Empty sequence
break;
}
c = fb_.get();
if(c < 0 || qc < 0) {
bail(r); success = false; done = true; return success;
}
}
break;
}
r.name.append(c);
if(fb_.peek() == '>') {
// Empty sequence
break;
}
c = fb_.get();
}
if(c == '>') {
// Empty sequences!
cerr << "Warning: skipping empty FASTA read with name '" << r.name << "'" << endl;
fb_.resetLastN();
rdid = endid = readCnt_;
readCnt_++;
success = true; done = false; return success;
}
assert_neq('>', c);
// _in now points just past the first character of a sequence
// line, and c holds the first character
int begin = 0;
int mytrim5 = gTrim5;
if(gColor) {
// This is the primer character, keep it in the
// 'primer' field of the read buf and keep parsing
c = toupper(c);
if(asc2dnacat[c] > 0) {
// First char is a DNA char
int c2 = toupper(fb_.peek());
if(asc2colcat[c2] > 0) {
// Second char is a color char
r.primer = c;
r.trimc = c2;
mytrim5 += 2;
}
}
if(c < 0) {
bail(r); success = false; done = true; return success;
}
}
while(c != '>' && c >= 0) {
if(gColor) {
if(c >= '0' && c <= '4') c = "ACGTN"[(int)c - '0'];
if(c == '.') c = 'N';
}
if(asc2dnacat[c] > 0 && begin++ >= mytrim5) {
r.patFw.append(asc2dna_3N[0][c]);
if (threeN) {
r.patFw_3N.append(asc2dna_3N[1][c]);
r.originalFw.append((asc2dna[c]));
}
r.qual.append('I');
}
if(fb_.peek() == '>') break;
c = fb_.get();
}
r.patFw.trimEnd(gTrim3);
if (threeN) r.patFw_3N.trimEnd(gTrim3);
r.qual.trimEnd(gTrim3);
r.trimmed3 = gTrim3;
r.trimmed5 = mytrim5;
// Set up a default name if one hasn't been set
if(r.name.empty()) {
char cbuf[20];
itoa10<TReadId>(readCnt_, cbuf);
r.name.install(cbuf);
}
assert_gt(r.name.length(), 0);
r.readOrigBuf.install(fb_.lastN(), fb_.lastNLen());
fb_.resetLastN();
rdid = endid = readCnt_;
readCnt_++;
return success;
}
/// Read another pattern from a FASTQ input file
bool FastqPatternSource::read(
Read& r,
TReadId& rdid,
TReadId& endid,
bool& success,
bool& done)
{
int c;
int dstLen = 0;
success = true;
done = false;
r.reset();
r.color = gColor;
r.fuzzy = fuzzy_;
// Pick off the first at
if(first_) {
c = fb_.get();
if(c != '@') {
c = getOverNewline(fb_);
if(c < 0) {
bail(r); success = false; done = true; return success;
}
}
if(c != '@') {
cerr << "Error: reads file does not look like a FASTQ file" << endl;
throw 1;
}
assert_eq('@', c);
first_ = false;
}
// Read to the end of the id line, sticking everything after the '@'
// into *name
while(true) {
c = fb_.get();
if(c < 0) {
bail(r); success = false; done = true; return success;
}
if(c == '\n' || c == '\r') {
// Break at end of line, after consuming all \r's, \n's
while(c == '\n' || c == '\r') {
c = fb_.get();
if(c < 0) {
bail(r); success = false; done = true; return success;
}
}
break;
}
r.name.append(c);
}
// fb_ now points just past the first character of a
// sequence line, and c holds the first character
int charsRead = 0;
BTDnaString *sbuf = &r.patFw;
int dstLens[] = {0, 0, 0, 0};
int *dstLenCur = &dstLens[0];
int mytrim5 = gTrim5;
int altBufIdx = 0;
if(gColor && c != '+') {
// This may be a primer character. If so, keep it in the
// 'primer' field of the read buf and parse the rest of the
// read without it.
c = toupper(c);
if(asc2dnacat[c] > 0) {
// First char is a DNA char
int c2 = toupper(fb_.peek());
// Second char is a color char
if(asc2colcat[c2] > 0) {
r.primer = c;
r.trimc = c2;
mytrim5 += 2; // trim primer and first color
}
}
if(c < 0) {
bail(r); success = false; done = true; return success;
}
}
int trim5 = 0;
if(c != '+') {
trim5 = mytrim5;
while(c != '+') {
// Convert color numbers to letters if necessary
if(c == '.') c = 'N';
if(gColor) {
if(c >= '0' && c <= '4') c = "ACGTN"[(int)c - '0'];
}
if(fuzzy_ && c == '-') c = 'A';
if(isalpha(c)) {
// If it's past the 5'-end trim point
if(charsRead >= trim5) {
r.patFw.append(asc2dna_3N[0][c]);
if (threeN) {
r.patFw_3N.append(asc2dna_3N[1][c]);
r.originalFw.append((asc2dna[c]));
}
(*dstLenCur)++;
}
charsRead++;
} else if(fuzzy_ && c == ' ') {
trim5 = 0; // disable 5' trimming for now
if(charsRead == 0) {
c = fb_.get();
continue;
}
charsRead = 0;
if(altBufIdx >= 3) {
cerr << "At most 3 alternate sequence strings permitted; offending read: " << r.name << endl;
throw 1;
}
// Move on to the next alternate-sequence buffer
sbuf = &r.altPatFw[altBufIdx++];
dstLenCur = &dstLens[altBufIdx];
}
c = fb_.get();
if(c < 0) {
bail(r); success = false; done = true; return success;
}
}
dstLen = dstLens[0];
charsRead = dstLen + mytrim5;
}
// Trim from 3' end
if(gTrim3 > 0) {
if((int)r.patFw.length() > gTrim3) {
r.patFw.resize(r.patFw.length() - gTrim3);
dstLen -= gTrim3;
assert_eq((int)r.patFw.length(), dstLen);
} else {
// Trimmed the whole read; we won't be using this read,
// but we proceed anyway so that fb_ is advanced
// properly
r.patFw.clear();
dstLen = 0;
}
}
assert_eq('+', c);
// Chew up the optional name on the '+' line
ASSERT_ONLY(int pk =) peekToEndOfLine(fb_);
if(charsRead == 0) {
assert_eq('@', pk);
fb_.get();
fb_.resetLastN();
rdid = endid = readCnt_;
readCnt_++;
return success;
}
// Now read the qualities
if (intQuals_) {
assert(!fuzzy_);
int qualsRead = 0;
char buf[4096];
if(gColor && r.primer != -1) {
// In case the original quality string is one shorter
mytrim5--;
}
qualToks_.clear();
tokenizeQualLine(fb_, buf, 4096, qualToks_);
for(unsigned int j = 0; j < qualToks_.size(); ++j) {
char c = intToPhred33(atoi(qualToks_[j].c_str()), solQuals_);
assert_geq(c, 33);
if (qualsRead >= mytrim5) {
r.qual.append(c);
}
++qualsRead;
} // done reading integer quality lines
if(gColor && r.primer != -1) mytrim5++;
r.qual.trimEnd(gTrim3);
if(r.qual.length() < r.patFw.length()) {
tooFewQualities(r.name);
} else if(r.qual.length() > r.patFw.length() + 1) {
tooManyQualities(r.name);
}
if(r.qual.length() == r.patFw.length()+1 && gColor && r.primer != -1) {
r.qual.remove(0);
}
// Trim qualities on 3' end
if(r.qual.length() > r.patFw.length()) {
r.qual.resize(r.patFw.length());
assert_eq((int)r.qual.length(), dstLen);
}
peekOverNewline(fb_);
} else {
// Non-integer qualities
altBufIdx = 0;
trim5 = mytrim5;
int qualsRead[4] = {0, 0, 0, 0};
int *qualsReadCur = &qualsRead[0];
BTString *qbuf = &r.qual;
if(gColor && r.primer != -1) {
// In case the original quality string is one shorter
trim5--;
}
while(true) {
c = fb_.get();
if (!fuzzy_ && c == ' ') {
wrongQualityFormat(r.name);
} else if(c == ' ') {
trim5 = 0; // disable 5' trimming for now
if((*qualsReadCur) == 0) continue;
if(altBufIdx >= 3) {
cerr << "At most 3 alternate quality strings permitted; offending read: " << r.name << endl;
throw 1;
}
qbuf = &r.altQual[altBufIdx++];
qualsReadCur = &qualsRead[altBufIdx];
continue;
}
if(c < 0) {
break; // let the file end just at the end of a quality line
//bail(r); success = false; done = true; return success;
}
if (c != '\r' && c != '\n') {
if (*qualsReadCur >= trim5) {
c = charToPhred33(c, solQuals_, phred64Quals_);
assert_geq(c, 33);
qbuf->append(c);
}
(*qualsReadCur)++;
} else {
break;
}
}
qualsRead[0] -= gTrim3;
r.qual.trimEnd(gTrim3);
if(r.qual.length() < r.patFw.length()) {
tooFewQualities(r.name);
} else if(r.qual.length() > r.patFw.length()+1) {
tooManyQualities(r.name);
}
if(r.qual.length() == r.patFw.length()+1 && gColor && r.primer != -1) {
r.qual.remove(0);
}
if(fuzzy_) {
// Trim from 3' end of alternate basecall and quality strings
if(gTrim3 > 0) {
for(int i = 0; i < 3; i++) {
assert_eq(r.altQual[i].length(), r.altPatFw[i].length());
if((int)r.altQual[i].length() > gTrim3) {
r.altPatFw[i].resize(gTrim3);
r.altQual[i].resize(gTrim3);
} else {
r.altPatFw[i].clear();
r.altQual[i].clear();
}
qualsRead[i+1] = dstLens[i+1] =
max<int>(0, dstLens[i+1] - gTrim3);
}
}
// Shift to RHS, and install in Strings
assert_eq(0, r.alts);
for(int i = 1; i < 4; i++) {
if(qualsRead[i] == 0) continue;
if(qualsRead[i] > dstLen) {
// Shift everybody up
int shiftAmt = qualsRead[i] - dstLen;
for(int j = 0; j < dstLen; j++) {
r.altQual[i-1].set(r.altQual[i-1][j+shiftAmt], j);
r.altPatFw[i-1].set(r.altPatFw[i-1][j+shiftAmt], j);
}
r.altQual[i-1].resize(dstLen);
r.altPatFw[i-1].resize(dstLen);
} else if (qualsRead[i] < dstLen) {
r.altQual[i-1].resize(dstLen);
r.altPatFw[i-1].resize(dstLen);
// Shift everybody down
int shiftAmt = dstLen - qualsRead[i];
for(int j = dstLen-1; j >= shiftAmt; j--) {
r.altQual[i-1].set(r.altQual[i-1][j-shiftAmt], j);
r.altPatFw[i-1].set(r.altPatFw[i-1][j-shiftAmt], j);
}
// Fill in unset positions
for(int j = 0; j < shiftAmt; j++) {
// '!' - indicates no alternate basecall at
// this position
r.altQual[i-1].set(33, j);
}
}
r.alts++;
}
}
if(c == '\r' || c == '\n') {
c = peekOverNewline(fb_);
} else {
c = peekToEndOfLine(fb_);
}
}
r.readOrigBuf.install(fb_.lastN(), fb_.lastNLen());
fb_.resetLastN();
c = fb_.get();
// Should either be at end of file or at beginning of next record
assert(c == -1 || c == '@');
// Set up a default name if one hasn't been set
if(r.name.empty()) {
char cbuf[20];
itoa10<TReadId>(readCnt_, cbuf);
r.name.install(cbuf);
}
r.trimmed3 = gTrim3;
r.trimmed5 = mytrim5;
rdid = endid = readCnt_;
readCnt_++;
return success;
}
/// Read another pattern from a FASTA input file
bool TabbedPatternSource::read(
Read& r,
TReadId& rdid,
TReadId& endid,
bool& success,
bool& done)
{
r.reset();
r.color = gColor;
success = true;
done = false;
// fb_ is about to dish out the first character of the
// name field
if(parseName(r, NULL, '\t') == -1) {
peekOverNewline(fb_); // skip rest of line
r.reset();
success = false;
done = true;
return false;
}
assert_neq('\t', fb_.peek());
// fb_ is about to dish out the first character of the
// sequence field
int charsRead = 0;
int mytrim5 = gTrim5;
int dstLen = parseSeq(r, charsRead, mytrim5, '\t');
assert_neq('\t', fb_.peek());
if(dstLen < 0) {
peekOverNewline(fb_); // skip rest of line
r.reset();
success = false;
done = true;
return false;
}
// fb_ is about to dish out the first character of the
// quality-string field
char ct = 0;
if(parseQuals(r, charsRead, dstLen, mytrim5, ct, '\n') < 0) {
peekOverNewline(fb_); // skip rest of line
r.reset();
success = false;
done = true;
return false;
}
r.trimmed3 = gTrim3;
r.trimmed5 = mytrim5;
assert_eq(ct, '\n');
assert_neq('\n', fb_.peek());
r.readOrigBuf.install(fb_.lastN(), fb_.lastNLen());
fb_.resetLastN();
rdid = endid = readCnt_;
readCnt_++;
return true;
}
/// Read another pair of patterns from a FASTA input file
bool TabbedPatternSource::readPair(
Read& ra,
Read& rb,
TReadId& rdid,
TReadId& endid,
bool& success,
bool& done,
bool& paired)
{
success = true;
done = false;
// Skip over initial vertical whitespace
if(fb_.peek() == '\r' || fb_.peek() == '\n') {
fb_.peekUptoNewline();
fb_.resetLastN();
}
// fb_ is about to dish out the first character of the
// name field
int mytrim5_1 = gTrim5;
if(parseName(ra, &rb, '\t') == -1) {
peekOverNewline(fb_); // skip rest of line
ra.reset();
rb.reset();
fb_.resetLastN();
success = false;
done = true;
return false;
}
assert_neq('\t', fb_.peek());
// fb_ is about to dish out the first character of the
// sequence field for the first mate
int charsRead1 = 0;
int dstLen1 = parseSeq(ra, charsRead1, mytrim5_1, '\t');
if(dstLen1 < 0) {
peekOverNewline(fb_); // skip rest of line
ra.reset();
rb.reset();
fb_.resetLastN();
success = false;
done = true;
return false;
}
assert_neq('\t', fb_.peek());
// fb_ is about to dish out the first character of the
// quality-string field
char ct = 0;
if(parseQuals(ra, charsRead1, dstLen1, mytrim5_1, ct, '\t', '\n') < 0) {
peekOverNewline(fb_); // skip rest of line
ra.reset();
rb.reset();
fb_.resetLastN();
success = false;
done = true;
return false;
}
ra.trimmed3 = gTrim3;
ra.trimmed5 = mytrim5_1;
assert(ct == '\t' || ct == '\n' || ct == '\r' || ct == -1);
if(ct == '\r' || ct == '\n' || ct == -1) {
// Only had 3 fields prior to newline, so this must be an unpaired read
rb.reset();
ra.readOrigBuf.install(fb_.lastN(), fb_.lastNLen());
fb_.resetLastN();
success = true;
done = false;
paired = false;
rdid = endid = readCnt_;
readCnt_++;
return success;
}
paired = true;
assert_neq('\t', fb_.peek());
// Saw another tab after the third field, so this must be a pair
if(secondName_) {
// The second mate has its own name
if(parseName(rb, NULL, '\t') == -1) {
peekOverNewline(fb_); // skip rest of line
ra.reset();
rb.reset();
fb_.resetLastN();
success = false;
done = true;
return false;
}
assert_neq('\t', fb_.peek());
}
// fb_ about to give the first character of the second mate's sequence
int charsRead2 = 0;
int mytrim5_2 = gTrim5;
int dstLen2 = parseSeq(rb, charsRead2, mytrim5_2, '\t');
if(dstLen2 < 0) {
peekOverNewline(fb_); // skip rest of line
ra.reset();
rb.reset();
fb_.resetLastN();
success = false;
done = true;
return false;
}
assert_neq('\t', fb_.peek());
// fb_ is about to dish out the first character of the
// quality-string field
if(parseQuals(rb, charsRead2, dstLen2, mytrim5_2, ct, '\n') < 0) {
peekOverNewline(fb_); // skip rest of line
ra.reset();
rb.reset();
fb_.resetLastN();
success = false;
done = true;
return false;
}
ra.readOrigBuf.install(fb_.lastN(), fb_.lastNLen());
fb_.resetLastN();
rb.trimmed3 = gTrim3;
rb.trimmed5 = mytrim5_2;
rdid = endid = readCnt_;
readCnt_++;
return true;
}
/**
* Parse a name from fb_ and store in r. Assume that the next
* character obtained via fb_.get() is the first character of
* the sequence and the string stops at the next char upto (could
* be tab, newline, etc.).
*/
int TabbedPatternSource::parseName(
Read& r,
Read* r2,
char upto /* = '\t' */)
{
// Read the name out of the first field
int c = 0;
if(r2 != NULL) r2->name.clear();
r.name.clear();
while(true) {
if((c = fb_.get()) < 0) {
return -1;
}
if(c == upto) {
// Finished with first field
break;
}
if(c == '\n' || c == '\r') {
return -1;
}
if(r2 != NULL) r2->name.append(c);
r.name.append(c);
}
// Set up a default name if one hasn't been set
if(r.name.empty()) {
char cbuf[20];
itoa10<TReadId>(readCnt_, cbuf);
r.name.install(cbuf);
if(r2 != NULL) r2->name.install(cbuf);
}
return (int)r.name.length();
}
/**
* Parse a single sequence from fb_ and store in r. Assume
* that the next character obtained via fb_.get() is the first
* character of the sequence and the sequence stops at the next
* char upto (could be tab, newline, etc.).
*/
int TabbedPatternSource::parseSeq(
Read& r,
int& charsRead,
int& trim5,
char upto /*= '\t'*/)
{
int begin = 0;
int c = fb_.get();
assert(c != upto);
r.patFw.clear();
r.color = gColor;
if(gColor) {
// This may be a primer character. If so, keep it in the
// 'primer' field of the read buf and parse the rest of the
// read without it.
c = toupper(c);
if(asc2dnacat[c] > 0) {
// First char is a DNA char
int c2 = toupper(fb_.peek());
// Second char is a color char
if(asc2colcat[c2] > 0) {
r.primer = c;
r.trimc = c2;
trim5 += 2; // trim primer and first color
}
}
if(c < 0) { return -1; }
}
while(c != upto) {
if(gColor) {
if(c >= '0' && c <= '4') c = "ACGTN"[(int)c - '0'];
if(c == '.') c = 'N';
}
if(isalpha(c)) {
assert_in(toupper(c), "ACGTN");
if(begin++ >= trim5) {
assert_neq(0, asc2dnacat[c]);
r.patFw.append(asc2dna_3N[0][c]);
if (threeN) {
r.patFw_3N.append(asc2dna_3N[1][c]);
r.originalFw.append((asc2dna[c]));
}
}
charsRead++;
}
if((c = fb_.get()) < 0) {
return -1;
}
}
r.patFw.trimEnd(gTrim3);
if (threeN) r.patFw_3N.trimEnd(gTrim3);
return (int)r.patFw.length();
}
/**
* Parse a single quality string from fb_ and store in r.
* Assume that the next character obtained via fb_.get() is
* the first character of the quality string and the string stops
* at the next char upto (could be tab, newline, etc.).
*/
int TabbedPatternSource::parseQuals(
Read& r,
int charsRead,
int dstLen,
int trim5,
char& c2,
char upto /*= '\t'*/,
char upto2 /*= -1*/)
{
int qualsRead = 0;
int c = 0;
if (intQuals_) {
char buf[4096];
while (qualsRead < charsRead) {
qualToks_.clear();
if(!tokenizeQualLine(fb_, buf, 4096, qualToks_)) break;
for (unsigned int j = 0; j < qualToks_.size(); ++j) {
char c = intToPhred33(atoi(qualToks_[j].c_str()), solQuals_);
assert_geq(c, 33);
if (qualsRead >= trim5) {
r.qual.append(c);
}
++qualsRead;
}
} // done reading integer quality lines
if (charsRead > qualsRead) tooFewQualities(r.name);
} else {
// Non-integer qualities
while((qualsRead < dstLen + trim5) && c >= 0) {
c = fb_.get();
c2 = c;
if (c == ' ') wrongQualityFormat(r.name);
if(c < 0) {
// EOF occurred in the middle of a read - abort
return -1;
}
if(!isspace(c) && c != upto && (upto2 == -1 || c != upto2)) {
if (qualsRead >= trim5) {
c = charToPhred33(c, solQuals_, phred64Quals_);
assert_geq(c, 33);
r.qual.append(c);
}
qualsRead++;
} else {
break;
}
}
if(qualsRead < dstLen + trim5) {
tooFewQualities(r.name);
} else if(qualsRead > dstLen + trim5) {
tooManyQualities(r.name);
}
}
r.qual.resize(dstLen);
while(c != upto && (upto2 == -1 || c != upto2) && c != -1) {
c = fb_.get();
c2 = c;
}
return qualsRead;
}
void wrongQualityFormat(const BTString& read_name) {
cerr << "Error: Encountered one or more spaces while parsing the quality "
<< "string for read " << read_name << ". If this is a FASTQ file "
<< "with integer (non-ASCII-encoded) qualities, try re-running with "
<< "the --integer-quals option." << endl;
throw 1;
}
void tooFewQualities(const BTString& read_name) {
cerr << "Error: Read " << read_name << " has more read characters than "
<< "quality values." << endl;
throw 1;
}
void tooManyQualities(const BTString& read_name) {
cerr << "Error: Read " << read_name << " has more quality values than read "
<< "characters." << endl;
throw 1;
}
#ifdef USE_SRA
struct SRA_Read {
SStringExpandable<char, 64> name; // read name
SDnaStringExpandable<128, 2> patFw; // forward-strand sequence
SStringExpandable<char, 128, 2> qual; // quality values
void reset() {
name.clear();
patFw.clear();
qual.clear();
}
};
static const uint64_t buffer_size_per_thread = 4096;
struct SRA_Data {
uint64_t read_pos;
uint64_t write_pos;
uint64_t buffer_size;
bool done;
EList<pair<SRA_Read, SRA_Read> > paired_reads;
ngs::ReadIterator* sra_it;
SRA_Data() {
read_pos = 0;
write_pos = 0;
buffer_size = buffer_size_per_thread;
done = false;
sra_it = NULL;
}
bool isFull() {
assert_leq(read_pos, write_pos);
assert_geq(read_pos + buffer_size, write_pos);
return read_pos + buffer_size <= write_pos;
}
bool isEmpty() {
assert_leq(read_pos, write_pos);
assert_geq(read_pos + buffer_size, write_pos);
return read_pos == write_pos;
}
pair<SRA_Read, SRA_Read>& getPairForRead() {
assert(!isEmpty());
return paired_reads[read_pos % buffer_size];
}
pair<SRA_Read, SRA_Read>& getPairForWrite() {
assert(!isFull());
return paired_reads[write_pos % buffer_size];
}
void advanceReadPos() {
assert(!isEmpty());
read_pos++;
}
void advanceWritePos() {
assert(!isFull());
write_pos++;
}
};
static void SRA_IO_Worker(void *vp)
{
SRA_Data* sra_data = (SRA_Data*)vp;
assert(sra_data != NULL);
ngs::ReadIterator* sra_it = sra_data->sra_it;
assert(sra_it != NULL);
while(!sra_data->done) {
while(sra_data->isFull()) {
#if defined(_TTHREAD_WIN32_)
Sleep(1);
#elif defined(_TTHREAD_POSIX_)
const static timespec ts = {0, 1000000}; // 1 millisecond
nanosleep(&ts, NULL);
#endif
}
pair<SRA_Read, SRA_Read>& pair = sra_data->getPairForWrite();
SRA_Read& ra = pair.first;
SRA_Read& rb = pair.second;
bool exception_thrown = false;
try {
if(!sra_it->nextRead() || !sra_it->nextFragment()) {
ra.reset();
rb.reset();
sra_data->done = true;
return;
}
// Read the name out of the first field
ngs::StringRef rname = sra_it->getReadId();
ra.name.install(rname.data(), rname.size());
assert(!ra.name.empty());
ngs::StringRef ra_seq = sra_it->getFragmentBases();
if(gTrim5 + gTrim3 < (int)ra_seq.size()) {
ra.patFw.installChars(ra_seq.data() + gTrim5, ra_seq.size() - gTrim5 - gTrim3);
}
ngs::StringRef ra_qual = sra_it->getFragmentQualities();
if(ra_seq.size() == ra_qual.size() && gTrim5 + gTrim3 < (int)ra_qual.size()) {
ra.qual.install(ra_qual.data() + gTrim5, ra_qual.size() - gTrim5 - gTrim3);
} else {
ra.qual.resize(ra.patFw.length());
ra.qual.fill('I');
}
assert_eq(ra.patFw.length(), ra.qual.length());
if(!sra_it->nextFragment()) {
rb.reset();
} else {
// rb.name = ra.name;
ngs::StringRef rb_seq = sra_it->getFragmentBases();
if(gTrim5 + gTrim3 < (int)rb_seq.size()) {
rb.patFw.installChars(rb_seq.data() + gTrim5, rb_seq.size() - gTrim5 - gTrim3);
}
ngs::StringRef rb_qual = sra_it->getFragmentQualities();
if(rb_seq.size() == rb_qual.size() && gTrim5 + gTrim3 < (int)rb_qual.size()) {
rb.qual.install(rb_qual.data() + gTrim5, rb_qual.size() - gTrim5 - gTrim3);
} else {
rb.qual.resize(rb.patFw.length());
rb.qual.fill('I');
}
assert_eq(rb.patFw.length(), rb.qual.length());
}
sra_data->advanceWritePos();
} catch(ngs::ErrorMsg & x) {
cerr << x.toString () << endl;
exception_thrown = true;
} catch(exception & x) {
cerr << x.what () << endl;
exception_thrown = true;
} catch(...) {
cerr << "unknown exception\n";
exception_thrown = true;
}
if(exception_thrown) {
ra.reset();
rb.reset();
sra_data->done = true;
cerr << "An error happened while fetching SRA reads. Please rerun HISAT2. You may want to disable the SRA cache if you didn't (see the instructions at https://github.com/ncbi/sra-tools/wiki/Toolkit-Configuration).\n";
exit(1);
}
}
}
SRAPatternSource::~SRAPatternSource() {
if(io_thread_) delete io_thread_;
if(sra_data_) delete sra_data_;
if(sra_it_) delete sra_it_;
if(sra_run_) delete sra_run_;
}
/// Read another pair of patterns from a FASTA input file
bool SRAPatternSource::readPair(
Read& ra,
Read& rb,
TReadId& rdid,
TReadId& endid,
bool& success,
bool& done,
bool& paired)
{
assert(sra_run_ != NULL && sra_it_ != NULL);
success = true;
done = false;
while(sra_data_->isEmpty()) {
if(sra_data_->done && sra_data_->isEmpty()) {
ra.reset();
rb.reset();
success = false;
done = true;
return false;
}
#if defined(_TTHREAD_WIN32_)
Sleep(1);
#elif defined(_TTHREAD_POSIX_)
const static timespec ts = {0, 1000000}; // 1 millisecond
nanosleep(&ts, NULL);
#endif
}
pair<SRA_Read, SRA_Read>& pair = sra_data_->getPairForRead();
ra.name.install(pair.first.name.buf(), pair.first.name.length());
ra.patFw.install(pair.first.patFw.buf(), pair.first.patFw.length());
ra.qual.install(pair.first.qual.buf(), pair.first.qual.length());
ra.trimmed3 = gTrim3;
ra.trimmed5 = gTrim5;
if(pair.second.patFw.length() > 0) {
rb.name.install(pair.first.name.buf(), pair.first.name.length());
rb.patFw.install(pair.second.patFw.buf(), pair.second.patFw.length());
rb.qual.install(pair.second.qual.buf(), pair.second.qual.length());
rb.trimmed3 = gTrim3;
rb.trimmed5 = gTrim5;
paired = true;
} else {
rb.reset();
}
sra_data_->advanceReadPos();
rdid = endid = readCnt_;
readCnt_++;
return true;
}
void SRAPatternSource::open() {
string version = "hisat2-";
version += HISAT2_VERSION;
ncbi::NGS::setAppVersionString(version.c_str());
assert(!sra_accs_.empty());
while(sra_acc_cur_ < sra_accs_.size()) {
// Open read
if(sra_it_) {
delete sra_it_;
sra_it_ = NULL;
}
if(sra_run_) {
delete sra_run_;
sra_run_ = NULL;
}
try {
// open requested accession using SRA implementation of the API
sra_run_ = new ngs::ReadCollection(ncbi::NGS::openReadCollection(sra_accs_[sra_acc_cur_]));
#if 0
string run_name = sra_run_->getName();
cerr << " ReadGroups for " << run_name << endl;
ngs::ReadGroupIterator it = sra_run_->getReadGroups();
do {
ngs::Statistics s = it.getStatistics();
cerr << "Statistics for group <" << it.getName() << ">" << endl;
// for(string p = s.nextPath(""); p != ""; p = s.nextPath(p)){
// System.out.println("\t"+p+": "+s.getAsString(p));
} while(it.nextReadGroup());
exit(1);
#endif
// compute window to iterate through
size_t MAX_ROW = sra_run_->getReadCount();
sra_it_ = new ngs::ReadIterator(sra_run_->getReadRange(1, MAX_ROW, ngs::Read::all));
// create a buffer for SRA data
sra_data_ = new SRA_Data;
sra_data_->sra_it = sra_it_;
sra_data_->buffer_size = nthreads_ * buffer_size_per_thread;
sra_data_->paired_reads.resize(sra_data_->buffer_size);
// create a thread for handling SRA data access
io_thread_ = new tthread::thread(SRA_IO_Worker, (void*)sra_data_);
// io_thread_->join();
} catch(...) {
if(!errs_[sra_acc_cur_]) {
cerr << "Warning: Could not access \"" << sra_accs_[sra_acc_cur_].c_str() << "\" for reading; skipping..." << endl;
errs_[sra_acc_cur_] = true;
}
sra_acc_cur_++;
continue;
}
return;
}
cerr << "Error: No input SRA accessions were valid" << endl;
exit(1);
return;
}
#endif