1801 lines
44 KiB
C
1801 lines
44 KiB
C
|
/*
|
||
|
* Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
|
||
|
*
|
||
|
* This file is part of Bowtie 2.
|
||
|
*
|
||
|
* Bowtie 2 is free software: you can redistribute it and/or modify
|
||
|
* it under the terms of the GNU General Public License as published by
|
||
|
* the Free Software Foundation, either version 3 of the License, or
|
||
|
* (at your option) any later version.
|
||
|
*
|
||
|
* Bowtie 2 is distributed in the hope that it will be useful,
|
||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
|
* GNU General Public License for more details.
|
||
|
*
|
||
|
* You should have received a copy of the GNU General Public License
|
||
|
* along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
|
||
|
*/
|
||
|
|
||
|
#ifndef PAT_H_
|
||
|
#define PAT_H_
|
||
|
|
||
|
#include <cassert>
|
||
|
#include <cmath>
|
||
|
#include <stdexcept>
|
||
|
#include <vector>
|
||
|
#include <string>
|
||
|
#include <cstring>
|
||
|
#include <ctype.h>
|
||
|
#include <fstream>
|
||
|
#include "alphabet.h"
|
||
|
#include "assert_helpers.h"
|
||
|
#include "tokenize.h"
|
||
|
#include "random_source.h"
|
||
|
#include "threading.h"
|
||
|
#include "filebuf.h"
|
||
|
#include "qual.h"
|
||
|
#include "search_globals.h"
|
||
|
#include "sstring.h"
|
||
|
#include "ds.h"
|
||
|
#include "read.h"
|
||
|
#include "util.h"
|
||
|
|
||
|
extern bool threeN;
|
||
|
/**
|
||
|
* Classes and routines for reading reads from various input sources.
|
||
|
*/
|
||
|
|
||
|
using namespace std;
|
||
|
|
||
|
/**
|
||
|
* Calculate a per-read random seed based on a combination of
|
||
|
* the read data (incl. sequence, name, quals) and the global
|
||
|
* seed in '_randSeed'.
|
||
|
*/
|
||
|
static inline uint32_t genRandSeed(const BTDnaString& qry,
|
||
|
const BTString& qual,
|
||
|
const BTString& name,
|
||
|
uint32_t seed)
|
||
|
{
|
||
|
// Calculate a per-read random seed based on a combination of
|
||
|
// the read data (incl. sequence, name, quals) and the global
|
||
|
// seed
|
||
|
uint32_t rseed = (seed + 101) * 59 * 61 * 67 * 71 * 73 * 79 * 83;
|
||
|
size_t qlen = qry.length();
|
||
|
// Throw all the characters of the read into the random seed
|
||
|
for(size_t i = 0; i < qlen; i++) {
|
||
|
int p = (int)qry[i];
|
||
|
assert_leq(p, 4);
|
||
|
size_t off = ((i & 15) << 1);
|
||
|
rseed ^= (p << off);
|
||
|
}
|
||
|
// Throw all the quality values for the read into the random
|
||
|
// seed
|
||
|
for(size_t i = 0; i < qlen; i++) {
|
||
|
int p = (int)qual[i];
|
||
|
assert_leq(p, 255);
|
||
|
size_t off = ((i & 3) << 3);
|
||
|
rseed ^= (p << off);
|
||
|
}
|
||
|
// Throw all the characters in the read name into the random
|
||
|
// seed
|
||
|
size_t namelen = name.length();
|
||
|
for(size_t i = 0; i < namelen; i++) {
|
||
|
int p = (int)name[i];
|
||
|
if(p == '/') break;
|
||
|
assert_leq(p, 255);
|
||
|
size_t off = ((i & 3) << 3);
|
||
|
rseed ^= (p << off);
|
||
|
}
|
||
|
return rseed;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Parameters affecting how reads and read in.
|
||
|
*/
|
||
|
struct PatternParams {
|
||
|
PatternParams(
|
||
|
int format_,
|
||
|
bool fileParallel_,
|
||
|
uint32_t seed_,
|
||
|
bool useSpinlock_,
|
||
|
bool solexa64_,
|
||
|
bool phred64_,
|
||
|
bool intQuals_,
|
||
|
bool fuzzy_,
|
||
|
int sampleLen_,
|
||
|
int sampleFreq_,
|
||
|
uint32_t skip_) :
|
||
|
format(format_),
|
||
|
fileParallel(fileParallel_),
|
||
|
seed(seed_),
|
||
|
useSpinlock(useSpinlock_),
|
||
|
solexa64(solexa64_),
|
||
|
phred64(phred64_),
|
||
|
intQuals(intQuals_),
|
||
|
fuzzy(fuzzy_),
|
||
|
sampleLen(sampleLen_),
|
||
|
sampleFreq(sampleFreq_),
|
||
|
skip(skip_) { }
|
||
|
|
||
|
int format; // file format
|
||
|
bool fileParallel; // true -> wrap files with separate PairedPatternSources
|
||
|
uint32_t seed; // pseudo-random seed
|
||
|
bool useSpinlock; // use spin locks instead of pthreads
|
||
|
bool solexa64; // true -> qualities are on solexa64 scale
|
||
|
bool phred64; // true -> qualities are on phred64 scale
|
||
|
bool intQuals; // true -> qualities are space-separated numbers
|
||
|
bool fuzzy; // true -> try to parse fuzzy fastq
|
||
|
int sampleLen; // length of sampled reads for FastaContinuous...
|
||
|
int sampleFreq; // frequency of sampled reads for FastaContinuous...
|
||
|
uint32_t skip; // skip the first 'skip' patterns
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* Encapsulates a synchronized source of patterns; usually a file.
|
||
|
* Optionally reverses reads and quality strings before returning them,
|
||
|
* though that is usually more efficiently done by the concrete
|
||
|
* subclass. Concrete subclasses should delimit critical sections with
|
||
|
* calls to lock() and unlock().
|
||
|
*/
|
||
|
class PatternSource {
|
||
|
|
||
|
public:
|
||
|
|
||
|
PatternSource(const PatternParams& p) :
|
||
|
seed_(p.seed),
|
||
|
readCnt_(0),
|
||
|
numWrappers_(0),
|
||
|
doLocking_(true),
|
||
|
useSpinlock_(p.useSpinlock),
|
||
|
mutex()
|
||
|
{
|
||
|
}
|
||
|
|
||
|
virtual ~PatternSource() { }
|
||
|
|
||
|
/**
|
||
|
* Call this whenever this PatternSource is wrapped by a new
|
||
|
* WrappedPatternSourcePerThread. This helps us keep track of
|
||
|
* whether locks will be contended.
|
||
|
*/
|
||
|
void addWrapper() {
|
||
|
lock();
|
||
|
numWrappers_++;
|
||
|
unlock();
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* The main member function for dispensing patterns.
|
||
|
*
|
||
|
* Returns true iff a pair was parsed succesfully.
|
||
|
*/
|
||
|
virtual bool nextReadPair(
|
||
|
Read& ra,
|
||
|
Read& rb,
|
||
|
TReadId& rdid,
|
||
|
TReadId& endid,
|
||
|
bool& success,
|
||
|
bool& done,
|
||
|
bool& paired,
|
||
|
bool fixName);
|
||
|
|
||
|
/**
|
||
|
* The main member function for dispensing patterns.
|
||
|
*/
|
||
|
virtual bool nextRead(
|
||
|
Read& r,
|
||
|
TReadId& rdid,
|
||
|
TReadId& endid,
|
||
|
bool& success,
|
||
|
bool& done);
|
||
|
|
||
|
/**
|
||
|
* Implementation to be provided by concrete subclasses. An
|
||
|
* implementation for this member is only relevant for formats that
|
||
|
* can read in a pair of reads in a single transaction with a
|
||
|
* single input source. If paired-end input is given as a pair of
|
||
|
* parallel files, this member should throw an error and exit.
|
||
|
*/
|
||
|
virtual bool nextReadPairImpl(
|
||
|
Read& ra,
|
||
|
Read& rb,
|
||
|
TReadId& rdid,
|
||
|
TReadId& endid,
|
||
|
bool& success,
|
||
|
bool& done,
|
||
|
bool& paired) = 0;
|
||
|
|
||
|
/**
|
||
|
* Implementation to be provided by concrete subclasses. An
|
||
|
* implementation for this member is only relevant for formats
|
||
|
* where individual input sources look like single-end-read
|
||
|
* sources, e.g., formats where paired-end reads are specified in
|
||
|
* parallel read files.
|
||
|
*/
|
||
|
virtual bool nextReadImpl(
|
||
|
Read& r,
|
||
|
TReadId& rdid,
|
||
|
TReadId& endid,
|
||
|
bool& success,
|
||
|
bool& done) = 0;
|
||
|
|
||
|
/// Reset state to start over again with the first read
|
||
|
virtual void reset() { readCnt_ = 0; }
|
||
|
|
||
|
/**
|
||
|
* Concrete subclasses call lock() to enter a critical region.
|
||
|
* What constitutes a critical region depends on the subclass.
|
||
|
*/
|
||
|
void lock() {
|
||
|
if(!doLocking_) return; // no contention
|
||
|
mutex.lock();
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Concrete subclasses call unlock() to exit a critical region
|
||
|
* What constitutes a critical region depends on the subclass.
|
||
|
*/
|
||
|
void unlock() {
|
||
|
if(!doLocking_) return; // no contention
|
||
|
mutex.unlock();
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Return a new dynamically allocated PatternSource for the given
|
||
|
* format, using the given list of strings as the filenames to read
|
||
|
* from or as the sequences themselves (i.e. if -c was used).
|
||
|
*/
|
||
|
static PatternSource* patsrcFromStrings(
|
||
|
const PatternParams& p,
|
||
|
const EList<string>& qs,
|
||
|
size_t nthreads = 1);
|
||
|
|
||
|
/**
|
||
|
* Return the number of reads attempted.
|
||
|
*/
|
||
|
TReadId readCnt() const { return readCnt_ - 1; }
|
||
|
|
||
|
int paired_type; // 1 - left or unpaird, 2-right
|
||
|
// int align_times = 0;
|
||
|
|
||
|
protected:
|
||
|
|
||
|
uint32_t seed_;
|
||
|
|
||
|
/// The number of reads read by this PatternSource
|
||
|
TReadId readCnt_;
|
||
|
|
||
|
int numWrappers_; /// # threads that own a wrapper for this PatternSource
|
||
|
bool doLocking_; /// override whether to lock (true = don't override)
|
||
|
/// User can ask to use the normal pthreads-style lock even if
|
||
|
/// spinlocks is enabled and compiled in. This is sometimes better
|
||
|
/// if we expect bad I/O latency on some reads.
|
||
|
bool useSpinlock_;
|
||
|
MUTEX_T mutex;
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* Abstract parent class for synhconized sources of paired-end reads
|
||
|
* (and possibly also single-end reads).
|
||
|
*/
|
||
|
class PairedPatternSource {
|
||
|
public:
|
||
|
PairedPatternSource(const PatternParams& p) : mutex_m(), seed_(p.seed) {}
|
||
|
virtual ~PairedPatternSource() { }
|
||
|
|
||
|
virtual void addWrapper() = 0;
|
||
|
virtual void reset() = 0;
|
||
|
|
||
|
virtual bool nextReadPair(
|
||
|
Read& ra,
|
||
|
Read& rb,
|
||
|
TReadId& rdid,
|
||
|
TReadId& endid,
|
||
|
bool& success,
|
||
|
bool& done,
|
||
|
bool& paired,
|
||
|
bool fixName) = 0;
|
||
|
|
||
|
virtual pair<TReadId, TReadId> readCnt() const = 0;
|
||
|
|
||
|
/**
|
||
|
* Lock this PairedPatternSource, usually because one of its shared
|
||
|
* fields is being updated.
|
||
|
*/
|
||
|
void lock() {
|
||
|
mutex_m.lock();
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Unlock this PairedPatternSource.
|
||
|
*/
|
||
|
void unlock() {
|
||
|
mutex_m.unlock();
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Given the values for all of the various arguments used to specify
|
||
|
* the read and quality input, create a list of pattern sources to
|
||
|
* dispense them.
|
||
|
*/
|
||
|
static PairedPatternSource* setupPatternSources(
|
||
|
const EList<string>& si, // singles, from argv
|
||
|
const EList<string>& m1, // mate1's, from -1 arg
|
||
|
const EList<string>& m2, // mate2's, from -2 arg
|
||
|
const EList<string>& m12, // both mates on each line, from --12 arg
|
||
|
#ifdef USE_SRA
|
||
|
const EList<string>& sra_accs,
|
||
|
#endif
|
||
|
const EList<string>& q, // qualities associated with singles
|
||
|
const EList<string>& q1, // qualities associated with m1
|
||
|
const EList<string>& q2, // qualities associated with m2
|
||
|
const PatternParams& p, // read-in params
|
||
|
size_t nthreads,
|
||
|
bool verbose); // be talkative?
|
||
|
|
||
|
protected:
|
||
|
|
||
|
MUTEX_T mutex_m; /// mutex for syncing over critical regions
|
||
|
uint32_t seed_;
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* Encapsulates a synchronized source of both paired-end reads and
|
||
|
* unpaired reads, where the paired-end must come from parallel files.
|
||
|
*/
|
||
|
class PairedSoloPatternSource : public PairedPatternSource {
|
||
|
|
||
|
public:
|
||
|
|
||
|
PairedSoloPatternSource(
|
||
|
const EList<PatternSource*>* src,
|
||
|
const PatternParams& p) :
|
||
|
PairedPatternSource(p),
|
||
|
cur_(0),
|
||
|
src_(src)
|
||
|
{
|
||
|
assert(src_ != NULL);
|
||
|
for(size_t i = 0; i < src_->size(); i++) {
|
||
|
assert((*src_)[i] != NULL);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
virtual ~PairedSoloPatternSource() { delete src_; }
|
||
|
|
||
|
/**
|
||
|
* Call this whenever this PairedPatternSource is wrapped by a new
|
||
|
* WrappedPatternSourcePerThread. This helps us keep track of
|
||
|
* whether locks within PatternSources will be contended.
|
||
|
*/
|
||
|
virtual void addWrapper() {
|
||
|
for(size_t i = 0; i < src_->size(); i++) {
|
||
|
(*src_)[i]->addWrapper();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Reset this object and all the PatternSources under it so that
|
||
|
* the next call to nextReadPair gets the very first read pair.
|
||
|
*/
|
||
|
virtual void reset() {
|
||
|
for(size_t i = 0; i < src_->size(); i++) {
|
||
|
(*src_)[i]->reset();
|
||
|
}
|
||
|
cur_ = 0;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* The main member function for dispensing pairs of reads or
|
||
|
* singleton reads. Returns true iff ra and rb contain a new
|
||
|
* pair; returns false if ra contains a new unpaired read.
|
||
|
*/
|
||
|
virtual bool nextReadPair(
|
||
|
Read& ra,
|
||
|
Read& rb,
|
||
|
TReadId& rdid,
|
||
|
TReadId& endid,
|
||
|
bool& success,
|
||
|
bool& done,
|
||
|
bool& paired,
|
||
|
bool fixName);
|
||
|
|
||
|
/**
|
||
|
* Return the number of reads attempted.
|
||
|
*/
|
||
|
virtual pair<TReadId, TReadId> readCnt() const {
|
||
|
uint64_t ret = 0llu;
|
||
|
for(size_t i = 0; i < src_->size(); i++) ret += (*src_)[i]->readCnt();
|
||
|
return make_pair(ret, 0llu);
|
||
|
}
|
||
|
|
||
|
protected:
|
||
|
|
||
|
volatile uint32_t cur_; // current element in parallel srca_, srcb_ vectors
|
||
|
const EList<PatternSource*>* src_; /// PatternSources for paired-end reads
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* Encapsulates a synchronized source of both paired-end reads and
|
||
|
* unpaired reads, where the paired-end must come from parallel files.
|
||
|
*/
|
||
|
class PairedDualPatternSource : public PairedPatternSource {
|
||
|
|
||
|
public:
|
||
|
|
||
|
PairedDualPatternSource(
|
||
|
const EList<PatternSource*>* srca,
|
||
|
const EList<PatternSource*>* srcb,
|
||
|
const PatternParams& p) :
|
||
|
PairedPatternSource(p), cur_(0), srca_(srca), srcb_(srcb)
|
||
|
{
|
||
|
assert(srca_ != NULL);
|
||
|
assert(srcb_ != NULL);
|
||
|
// srca_ and srcb_ must be parallel
|
||
|
assert_eq(srca_->size(), srcb_->size());
|
||
|
for(size_t i = 0; i < srca_->size(); i++) {
|
||
|
// Can't have NULL first-mate sources. Second-mate sources
|
||
|
// can be NULL, in the case when the corresponding first-
|
||
|
// mate source is unpaired.
|
||
|
assert((*srca_)[i] != NULL);
|
||
|
for(size_t j = 0; j < srcb_->size(); j++) {
|
||
|
assert_neq((*srca_)[i], (*srcb_)[j]);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
virtual ~PairedDualPatternSource() {
|
||
|
delete srca_;
|
||
|
delete srcb_;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Call this whenever this PairedPatternSource is wrapped by a new
|
||
|
* WrappedPatternSourcePerThread. This helps us keep track of
|
||
|
* whether locks within PatternSources will be contended.
|
||
|
*/
|
||
|
virtual void addWrapper() {
|
||
|
for(size_t i = 0; i < srca_->size(); i++) {
|
||
|
(*srca_)[i]->addWrapper();
|
||
|
if((*srcb_)[i] != NULL) {
|
||
|
(*srcb_)[i]->addWrapper();
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Reset this object and all the PatternSources under it so that
|
||
|
* the next call to nextReadPair gets the very first read pair.
|
||
|
*/
|
||
|
virtual void reset() {
|
||
|
for(size_t i = 0; i < srca_->size(); i++) {
|
||
|
(*srca_)[i]->reset();
|
||
|
if((*srcb_)[i] != NULL) {
|
||
|
(*srcb_)[i]->reset();
|
||
|
}
|
||
|
}
|
||
|
cur_ = 0;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* The main member function for dispensing pairs of reads or
|
||
|
* singleton reads. Returns true iff ra and rb contain a new
|
||
|
* pair; returns false if ra contains a new unpaired read.
|
||
|
*/
|
||
|
virtual bool nextReadPair(
|
||
|
Read& ra,
|
||
|
Read& rb,
|
||
|
TReadId& rdid,
|
||
|
TReadId& endid,
|
||
|
bool& success,
|
||
|
bool& done,
|
||
|
bool& paired,
|
||
|
bool fixName);
|
||
|
|
||
|
/**
|
||
|
* Return the number of reads attempted.
|
||
|
*/
|
||
|
virtual pair<TReadId, TReadId> readCnt() const;
|
||
|
|
||
|
protected:
|
||
|
|
||
|
volatile uint32_t cur_; // current element in parallel srca_, srcb_ vectors
|
||
|
const EList<PatternSource*>* srca_; /// PatternSources for 1st mates and/or unpaired reads
|
||
|
const EList<PatternSource*>* srcb_; /// PatternSources for 2nd mates
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* Encapsulates a single thread's interaction with the PatternSource.
|
||
|
* Most notably, this class holds the buffers into which the
|
||
|
* PatterSource will write sequences. This class is *not* threadsafe
|
||
|
* - it doesn't need to be since there's one per thread. PatternSource
|
||
|
* is thread-safe.
|
||
|
*/
|
||
|
class PatternSourcePerThread {
|
||
|
|
||
|
public:
|
||
|
|
||
|
PatternSourcePerThread() :
|
||
|
buf1_(), buf2_(), rdid_(0xffffffff), endid_(0xffffffff) { }
|
||
|
|
||
|
virtual ~PatternSourcePerThread() { }
|
||
|
|
||
|
/**
|
||
|
* change 3N plan for both mate1 and mate2
|
||
|
*/
|
||
|
void changePlan3N(int mappingCycle) {
|
||
|
buf1_.changePlan3N(mappingCycle);
|
||
|
buf2_.changePlan3N(3-mappingCycle);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Read the next read pair.
|
||
|
*/
|
||
|
virtual bool nextReadPair(
|
||
|
bool& success,
|
||
|
bool& done,
|
||
|
bool& paired,
|
||
|
bool fixName)
|
||
|
{
|
||
|
return success;
|
||
|
}
|
||
|
|
||
|
Read& bufa() { return buf1_; }
|
||
|
Read& bufb() { return buf2_; }
|
||
|
const Read& bufa() const { return buf1_; }
|
||
|
const Read& bufb() const { return buf2_; }
|
||
|
|
||
|
TReadId rdid() const { return rdid_; }
|
||
|
TReadId endid() const { return endid_; }
|
||
|
virtual void reset() { rdid_ = endid_ = 0xffffffff; }
|
||
|
|
||
|
/**
|
||
|
* Return the length of mate 1 or mate 2.
|
||
|
*/
|
||
|
size_t length(int mate) const {
|
||
|
return (mate == 1) ? buf1_.length() : buf2_.length();
|
||
|
}
|
||
|
|
||
|
protected:
|
||
|
|
||
|
Read buf1_; // read buffer for mate a
|
||
|
Read buf2_; // read buffer for mate b
|
||
|
TReadId rdid_; // index of read just read
|
||
|
TReadId endid_; // index of read just read
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* Abstract parent factory for PatternSourcePerThreads.
|
||
|
*/
|
||
|
class PatternSourcePerThreadFactory {
|
||
|
public:
|
||
|
virtual ~PatternSourcePerThreadFactory() { }
|
||
|
virtual PatternSourcePerThread* create() const = 0;
|
||
|
virtual EList<PatternSourcePerThread*>* create(uint32_t n) const = 0;
|
||
|
|
||
|
/// Free memory associated with a pattern source
|
||
|
virtual void destroy(PatternSourcePerThread* patsrc) const {
|
||
|
assert(patsrc != NULL);
|
||
|
// Free the PatternSourcePerThread
|
||
|
delete patsrc;
|
||
|
}
|
||
|
|
||
|
/// Free memory associated with a pattern source list
|
||
|
virtual void destroy(EList<PatternSourcePerThread*>* patsrcs) const {
|
||
|
assert(patsrcs != NULL);
|
||
|
// Free all of the PatternSourcePerThreads
|
||
|
for(size_t i = 0; i < patsrcs->size(); i++) {
|
||
|
if((*patsrcs)[i] != NULL) {
|
||
|
delete (*patsrcs)[i];
|
||
|
(*patsrcs)[i] = NULL;
|
||
|
}
|
||
|
}
|
||
|
// Free the vector
|
||
|
delete patsrcs;
|
||
|
}
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* A per-thread wrapper for a PairedPatternSource.
|
||
|
*/
|
||
|
class WrappedPatternSourcePerThread : public PatternSourcePerThread {
|
||
|
public:
|
||
|
WrappedPatternSourcePerThread(PairedPatternSource& __patsrc) :
|
||
|
patsrc_(__patsrc)
|
||
|
{
|
||
|
patsrc_.addWrapper();
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Get the next paired or unpaired read from the wrapped
|
||
|
* PairedPatternSource.
|
||
|
*/
|
||
|
virtual bool nextReadPair(
|
||
|
bool& success,
|
||
|
bool& done,
|
||
|
bool& paired,
|
||
|
bool fixName);
|
||
|
|
||
|
private:
|
||
|
|
||
|
/// Container for obtaining paired reads from PatternSources
|
||
|
PairedPatternSource& patsrc_;
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* Abstract parent factory for PatternSourcePerThreads.
|
||
|
*/
|
||
|
class WrappedPatternSourcePerThreadFactory : public PatternSourcePerThreadFactory {
|
||
|
public:
|
||
|
WrappedPatternSourcePerThreadFactory(PairedPatternSource& patsrc) :
|
||
|
patsrc_(patsrc) { }
|
||
|
|
||
|
/**
|
||
|
* Create a new heap-allocated WrappedPatternSourcePerThreads.
|
||
|
*/
|
||
|
virtual PatternSourcePerThread* create() const {
|
||
|
return new WrappedPatternSourcePerThread(patsrc_);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Create a new heap-allocated vector of heap-allocated
|
||
|
* WrappedPatternSourcePerThreads.
|
||
|
*/
|
||
|
virtual EList<PatternSourcePerThread*>* create(uint32_t n) const {
|
||
|
EList<PatternSourcePerThread*>* v = new EList<PatternSourcePerThread*>;
|
||
|
for(size_t i = 0; i < n; i++) {
|
||
|
v->push_back(new WrappedPatternSourcePerThread(patsrc_));
|
||
|
assert(v->back() != NULL);
|
||
|
}
|
||
|
return v;
|
||
|
}
|
||
|
|
||
|
private:
|
||
|
/// Container for obtaining paired reads from PatternSources
|
||
|
PairedPatternSource& patsrc_;
|
||
|
};
|
||
|
|
||
|
/// Skip to the end of the current string of newline chars and return
|
||
|
/// the first character after the newline chars, or -1 for EOF
|
||
|
static inline int getOverNewline(FileBuf& in) {
|
||
|
int c;
|
||
|
while(isspace(c = in.get()));
|
||
|
return c;
|
||
|
}
|
||
|
|
||
|
/// Skip to the end of the current string of newline chars such that
|
||
|
/// the next call to get() returns the first character after the
|
||
|
/// whitespace
|
||
|
static inline int peekOverNewline(FileBuf& in) {
|
||
|
while(true) {
|
||
|
int c = in.peek();
|
||
|
if(c != '\r' && c != '\n') {
|
||
|
return c;
|
||
|
}
|
||
|
in.get();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/// Skip to the end of the current line; return the first character
|
||
|
/// of the next line or -1 for EOF
|
||
|
static inline int getToEndOfLine(FileBuf& in) {
|
||
|
while(true) {
|
||
|
int c = in.get(); if(c < 0) return -1;
|
||
|
if(c == '\n' || c == '\r') {
|
||
|
while(c == '\n' || c == '\r') {
|
||
|
c = in.get(); if(c < 0) return -1;
|
||
|
}
|
||
|
// c now holds first character of next line
|
||
|
return c;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/// Skip to the end of the current line such that the next call to
|
||
|
/// get() returns the first character on the next line
|
||
|
static inline int peekToEndOfLine(FileBuf& in) {
|
||
|
while(true) {
|
||
|
int c = in.get(); if(c < 0) return c;
|
||
|
if(c == '\n' || c == '\r') {
|
||
|
c = in.peek();
|
||
|
while(c == '\n' || c == '\r') {
|
||
|
in.get(); if(c < 0) return c; // consume \r or \n
|
||
|
c = in.peek();
|
||
|
}
|
||
|
// next get() gets first character of next line
|
||
|
return c;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
extern void wrongQualityFormat(const BTString& read_name);
|
||
|
extern void tooFewQualities(const BTString& read_name);
|
||
|
extern void tooManyQualities(const BTString& read_name);
|
||
|
|
||
|
/**
|
||
|
* Encapsulates a source of patterns which is an in-memory vector.
|
||
|
*/
|
||
|
class VectorPatternSource : public PatternSource {
|
||
|
|
||
|
public:
|
||
|
|
||
|
VectorPatternSource(
|
||
|
const EList<string>& v,
|
||
|
const PatternParams& p);
|
||
|
|
||
|
virtual ~VectorPatternSource() { }
|
||
|
|
||
|
virtual bool nextReadImpl(
|
||
|
Read& r,
|
||
|
TReadId& rdid,
|
||
|
TReadId& endid,
|
||
|
bool& success,
|
||
|
bool& done);
|
||
|
|
||
|
/**
|
||
|
* This is unused, but implementation is given for completeness.
|
||
|
*/
|
||
|
virtual bool nextReadPairImpl(
|
||
|
Read& ra,
|
||
|
Read& rb,
|
||
|
TReadId& rdid,
|
||
|
TReadId& endid,
|
||
|
bool& success,
|
||
|
bool& done,
|
||
|
bool& paired);
|
||
|
|
||
|
virtual void reset() {
|
||
|
PatternSource::reset();
|
||
|
cur_ = skip_;
|
||
|
paired_ = false;
|
||
|
}
|
||
|
|
||
|
private:
|
||
|
|
||
|
size_t cur_;
|
||
|
uint32_t skip_;
|
||
|
bool paired_;
|
||
|
EList<BTDnaString> v_; // forward sequences
|
||
|
EList<BTString> quals_; // forward qualities
|
||
|
EList<BTString> names_; // names
|
||
|
EList<int> trimmed3_; // names
|
||
|
EList<int> trimmed5_; // names
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
*
|
||
|
*/
|
||
|
class BufferedFilePatternSource : public PatternSource {
|
||
|
public:
|
||
|
BufferedFilePatternSource(
|
||
|
const EList<string>& infiles,
|
||
|
const PatternParams& p) :
|
||
|
PatternSource(p),
|
||
|
infiles_(infiles),
|
||
|
filecur_(0),
|
||
|
fb_(),
|
||
|
skip_(p.skip),
|
||
|
first_(true)
|
||
|
{
|
||
|
assert_gt(infiles.size(), 0);
|
||
|
errs_.resize(infiles_.size());
|
||
|
errs_.fill(0, infiles_.size(), false);
|
||
|
assert(!fb_.isOpen());
|
||
|
open(); // open first file in the list
|
||
|
filecur_++;
|
||
|
}
|
||
|
|
||
|
virtual ~BufferedFilePatternSource() {
|
||
|
if(fb_.isOpen()) fb_.close();
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Fill Read with the sequence, quality and name for the next
|
||
|
* read in the list of read files. This function gets called by
|
||
|
* all the search threads, so we must handle synchronization.
|
||
|
*/
|
||
|
virtual bool nextReadImpl(
|
||
|
Read& r,
|
||
|
TReadId& rdid,
|
||
|
TReadId& endid,
|
||
|
bool& success,
|
||
|
bool& done)
|
||
|
{
|
||
|
// We'll be manipulating our file handle/filecur_ state
|
||
|
lock();
|
||
|
while(true) {
|
||
|
do { read(r, rdid, endid, success, done); }
|
||
|
while(!success && !done);
|
||
|
if(!success && filecur_ < infiles_.size()) {
|
||
|
assert(done);
|
||
|
open();
|
||
|
resetForNextFile(); // reset state to handle a fresh file
|
||
|
filecur_++;
|
||
|
continue;
|
||
|
}
|
||
|
break;
|
||
|
}
|
||
|
assert(r.repOk());
|
||
|
// Leaving critical region
|
||
|
unlock();
|
||
|
return success;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
*
|
||
|
*/
|
||
|
virtual bool nextReadPairImpl(
|
||
|
Read& ra,
|
||
|
Read& rb,
|
||
|
TReadId& rdid,
|
||
|
TReadId& endid,
|
||
|
bool& success,
|
||
|
bool& done,
|
||
|
bool& paired)
|
||
|
{
|
||
|
// We'll be manipulating our file handle/filecur_ state
|
||
|
lock();
|
||
|
while(true) {
|
||
|
do { readPair(ra, rb, rdid, endid, success, done, paired); }
|
||
|
while(!success && !done);
|
||
|
if(!success && filecur_ < infiles_.size()) {
|
||
|
assert(done);
|
||
|
open();
|
||
|
resetForNextFile(); // reset state to handle a fresh file
|
||
|
filecur_++;
|
||
|
continue;
|
||
|
}
|
||
|
break;
|
||
|
}
|
||
|
assert(ra.repOk());
|
||
|
assert(rb.repOk());
|
||
|
// Leaving critical region
|
||
|
unlock();
|
||
|
return success;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Reset state so that we read start reading again from the
|
||
|
* beginning of the first file. Should only be called by the
|
||
|
* master thread.
|
||
|
*/
|
||
|
virtual void reset() {
|
||
|
PatternSource::reset();
|
||
|
filecur_ = 0;
|
||
|
open();
|
||
|
filecur_++;
|
||
|
}
|
||
|
|
||
|
protected:
|
||
|
|
||
|
/// Read another pattern from the input file; this is overridden
|
||
|
/// to deal with specific file formats
|
||
|
virtual bool read(
|
||
|
Read& r,
|
||
|
TReadId& rdid,
|
||
|
TReadId& endid,
|
||
|
bool& success,
|
||
|
bool& done) = 0;
|
||
|
|
||
|
/// Read another pattern pair from the input file; this is
|
||
|
/// overridden to deal with specific file formats
|
||
|
virtual bool readPair(
|
||
|
Read& ra,
|
||
|
Read& rb,
|
||
|
TReadId& rdid,
|
||
|
TReadId& endid,
|
||
|
bool& success,
|
||
|
bool& done,
|
||
|
bool& paired) = 0;
|
||
|
|
||
|
/// Reset state to handle a fresh file
|
||
|
virtual void resetForNextFile() { }
|
||
|
|
||
|
void open() {
|
||
|
if(fb_.isOpen()) fb_.close();
|
||
|
while(filecur_ < infiles_.size()) {
|
||
|
// Open read
|
||
|
FILE *in;
|
||
|
if(infiles_[filecur_] == "-") {
|
||
|
in = stdin;
|
||
|
} else if((in = fopen(infiles_[filecur_].c_str(), "rb")) == NULL) {
|
||
|
if(!errs_[filecur_]) {
|
||
|
cerr << "Warning: Could not open read file \"" << infiles_[filecur_].c_str() << "\" for reading; skipping..." << endl;
|
||
|
errs_[filecur_] = true;
|
||
|
}
|
||
|
filecur_++;
|
||
|
continue;
|
||
|
}
|
||
|
fb_.newFile(in);
|
||
|
return;
|
||
|
}
|
||
|
cerr << "Error: No input read files were valid" << endl;
|
||
|
exit(1);
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
EList<string> infiles_; // filenames for read files
|
||
|
EList<bool> errs_; // whether we've already printed an error for each file
|
||
|
size_t filecur_; // index into infiles_ of next file to read
|
||
|
FileBuf fb_; // read file currently being read from
|
||
|
TReadId skip_; // number of reads to skip
|
||
|
bool first_;
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* Parse a single quality string from fb and store qualities in r.
|
||
|
* Assume the next character obtained via fb.get() is the first
|
||
|
* character of the quality string. When returning, the next
|
||
|
* character returned by fb.peek() or fb.get() should be the first
|
||
|
* character of the following line.
|
||
|
*/
|
||
|
int parseQuals(
|
||
|
Read& r,
|
||
|
FileBuf& fb,
|
||
|
int firstc,
|
||
|
int readLen,
|
||
|
int trim3,
|
||
|
int trim5,
|
||
|
bool intQuals,
|
||
|
bool phred64,
|
||
|
bool solexa64);
|
||
|
|
||
|
/**
|
||
|
* Synchronized concrete pattern source for a list of FASTA or CSFASTA
|
||
|
* (if color = true) files.
|
||
|
*/
|
||
|
class FastaPatternSource : public BufferedFilePatternSource {
|
||
|
public:
|
||
|
FastaPatternSource(const EList<string>& infiles,
|
||
|
const PatternParams& p) :
|
||
|
BufferedFilePatternSource(infiles, p),
|
||
|
first_(true), solexa64_(p.solexa64), phred64_(p.phred64), intQuals_(p.intQuals)
|
||
|
{ }
|
||
|
virtual void reset() {
|
||
|
first_ = true;
|
||
|
BufferedFilePatternSource::reset();
|
||
|
}
|
||
|
protected:
|
||
|
/**
|
||
|
* Scan to the next FASTA record (starting with >) and return the first
|
||
|
* character of the record (which will always be >).
|
||
|
*/
|
||
|
static int skipToNextFastaRecord(FileBuf& in) {
|
||
|
int c;
|
||
|
while((c = in.get()) != '>') {
|
||
|
if(in.eof()) return -1;
|
||
|
}
|
||
|
return c;
|
||
|
}
|
||
|
|
||
|
/// Called when we have to bail without having parsed a read.
|
||
|
void bail(Read& r) {
|
||
|
r.reset();
|
||
|
fb_.resetLastN();
|
||
|
}
|
||
|
|
||
|
/// Read another pattern from a FASTA input file
|
||
|
virtual bool read(
|
||
|
Read& r,
|
||
|
TReadId& rdid,
|
||
|
TReadId& endid,
|
||
|
bool& success,
|
||
|
bool& done);
|
||
|
|
||
|
/// Read another pair of patterns from a FASTA input file
|
||
|
virtual bool readPair(
|
||
|
Read& ra,
|
||
|
Read& rb,
|
||
|
TReadId& rdid,
|
||
|
TReadId& endid,
|
||
|
bool& success,
|
||
|
bool& done,
|
||
|
bool& paired)
|
||
|
{
|
||
|
// (For now, we shouldn't ever be here)
|
||
|
cerr << "In FastaPatternSource.readPair()" << endl;
|
||
|
throw 1;
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
virtual void resetForNextFile() {
|
||
|
first_ = true;
|
||
|
}
|
||
|
|
||
|
private:
|
||
|
bool first_;
|
||
|
|
||
|
public:
|
||
|
bool solexa64_;
|
||
|
bool phred64_;
|
||
|
bool intQuals_;
|
||
|
};
|
||
|
|
||
|
|
||
|
/**
|
||
|
* Tokenize a line of space-separated integer quality values.
|
||
|
*/
|
||
|
static inline bool tokenizeQualLine(
|
||
|
FileBuf& filebuf,
|
||
|
char *buf,
|
||
|
size_t buflen,
|
||
|
EList<string>& toks)
|
||
|
{
|
||
|
size_t rd = filebuf.gets(buf, buflen);
|
||
|
if(rd == 0) return false;
|
||
|
assert(NULL == strrchr(buf, '\n'));
|
||
|
tokenize(string(buf), " ", toks);
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Synchronized concrete pattern source for a list of files with tab-
|
||
|
* delimited name, seq, qual fields (or, for paired-end reads,
|
||
|
* basename, seq1, qual1, seq2, qual2).
|
||
|
*/
|
||
|
class TabbedPatternSource : public BufferedFilePatternSource {
|
||
|
|
||
|
public:
|
||
|
|
||
|
TabbedPatternSource(
|
||
|
const EList<string>& infiles,
|
||
|
const PatternParams& p,
|
||
|
bool secondName) :
|
||
|
BufferedFilePatternSource(infiles, p),
|
||
|
solQuals_(p.solexa64),
|
||
|
phred64Quals_(p.phred64),
|
||
|
intQuals_(p.intQuals),
|
||
|
secondName_(secondName) { }
|
||
|
|
||
|
protected:
|
||
|
|
||
|
/// Read another pattern from a FASTA input file
|
||
|
virtual bool read(
|
||
|
Read& r,
|
||
|
TReadId& rdid,
|
||
|
TReadId& endid,
|
||
|
bool& success,
|
||
|
bool& done);
|
||
|
|
||
|
/// Read another pair of patterns from a FASTA input file
|
||
|
virtual bool readPair(
|
||
|
Read& ra,
|
||
|
Read& rb,
|
||
|
TReadId& rdid,
|
||
|
TReadId& endid,
|
||
|
bool& success,
|
||
|
bool& done,
|
||
|
bool& paired);
|
||
|
|
||
|
private:
|
||
|
|
||
|
/**
|
||
|
* Parse a name from fb_ and store in r. Assume that the next
|
||
|
* character obtained via fb_.get() is the first character of
|
||
|
* the sequence and the string stops at the next char upto (could
|
||
|
* be tab, newline, etc.).
|
||
|
*/
|
||
|
int parseName(Read& r, Read* r2, char upto = '\t');
|
||
|
|
||
|
/**
|
||
|
* Parse a single sequence from fb_ and store in r. Assume
|
||
|
* that the next character obtained via fb_.get() is the first
|
||
|
* character of the sequence and the sequence stops at the next
|
||
|
* char upto (could be tab, newline, etc.).
|
||
|
*/
|
||
|
int parseSeq(Read& r, int& charsRead, int& trim5, char upto = '\t');
|
||
|
|
||
|
/**
|
||
|
* Parse a single quality string from fb_ and store in r.
|
||
|
* Assume that the next character obtained via fb_.get() is
|
||
|
* the first character of the quality string and the string stops
|
||
|
* at the next char upto (could be tab, newline, etc.).
|
||
|
*/
|
||
|
int parseQuals(Read& r, int charsRead, int dstLen, int trim5,
|
||
|
char& c2, char upto = '\t', char upto2 = -1);
|
||
|
|
||
|
bool solQuals_;
|
||
|
bool phred64Quals_;
|
||
|
bool intQuals_;
|
||
|
EList<string> qualToks_;
|
||
|
bool secondName_;
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* Synchronized concrete pattern source for Illumina Qseq files. In
|
||
|
* Qseq files, each read appears on a separate line and the tab-
|
||
|
* delimited fields are:
|
||
|
*
|
||
|
* 1. Machine name
|
||
|
* 2. Run number
|
||
|
* 3. Lane number
|
||
|
* 4. Tile number
|
||
|
* 5. X coordinate of spot
|
||
|
* 6. Y coordinate of spot
|
||
|
* 7. Index: "Index sequence or 0. For no indexing, or for a file that
|
||
|
* has not been demultiplexed yet, this field should have a value of
|
||
|
* 0."
|
||
|
* 8. Read number: 1 for unpaired, 1 or 2 for paired
|
||
|
* 9. Sequence
|
||
|
* 10. Quality
|
||
|
* 11. Filter: 1 = passed, 0 = didn't
|
||
|
*/
|
||
|
class QseqPatternSource : public BufferedFilePatternSource {
|
||
|
|
||
|
public:
|
||
|
|
||
|
QseqPatternSource(
|
||
|
const EList<string>& infiles,
|
||
|
const PatternParams& p) :
|
||
|
BufferedFilePatternSource(infiles, p),
|
||
|
solQuals_(p.solexa64),
|
||
|
phred64Quals_(p.phred64),
|
||
|
intQuals_(p.intQuals) { }
|
||
|
|
||
|
protected:
|
||
|
|
||
|
#define BAIL_UNPAIRED() { \
|
||
|
peekOverNewline(fb_); \
|
||
|
r.reset(); \
|
||
|
success = false; \
|
||
|
done = true; \
|
||
|
return success; \
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Parse a name from fb_ and store in r. Assume that the next
|
||
|
* character obtained via fb_.get() is the first character of
|
||
|
* the sequence and the string stops at the next char upto (could
|
||
|
* be tab, newline, etc.).
|
||
|
*/
|
||
|
int parseName(
|
||
|
Read& r, // buffer for mate 1
|
||
|
Read* r2, // buffer for mate 2 (NULL if mate2 is read separately)
|
||
|
bool append, // true -> append characters, false -> skip them
|
||
|
bool clearFirst, // clear the name buffer first
|
||
|
bool warnEmpty, // emit a warning if nothing was added to the name
|
||
|
bool useDefault, // if nothing is read, put readCnt_ as a default value
|
||
|
int upto); // stop parsing when we first reach character 'upto'
|
||
|
|
||
|
/**
|
||
|
* Parse a single sequence from fb_ and store in r. Assume
|
||
|
* that the next character obtained via fb_.get() is the first
|
||
|
* character of the sequence and the sequence stops at the next
|
||
|
* char upto (could be tab, newline, etc.).
|
||
|
*/
|
||
|
int parseSeq(
|
||
|
Read& r, // buffer for read
|
||
|
int& charsRead,
|
||
|
int& trim5,
|
||
|
char upto);
|
||
|
|
||
|
/**
|
||
|
* Parse a single quality string from fb_ and store in r.
|
||
|
* Assume that the next character obtained via fb_.get() is
|
||
|
* the first character of the quality string and the string stops
|
||
|
* at the next char upto (could be tab, newline, etc.).
|
||
|
*/
|
||
|
int parseQuals(
|
||
|
Read& r, // buffer for read
|
||
|
int charsRead,
|
||
|
int dstLen,
|
||
|
int trim5,
|
||
|
char& c2,
|
||
|
char upto,
|
||
|
char upto2);
|
||
|
|
||
|
/**
|
||
|
* Read another pattern from a Qseq input file.
|
||
|
*/
|
||
|
virtual bool read(
|
||
|
Read& r,
|
||
|
TReadId& rdid,
|
||
|
TReadId& endid,
|
||
|
bool& success,
|
||
|
bool& done);
|
||
|
|
||
|
/**
|
||
|
* Read a pair of patterns from 1 Qseq file. Note: this is never used.
|
||
|
*/
|
||
|
virtual bool readPair(
|
||
|
Read& ra,
|
||
|
Read& rb,
|
||
|
TReadId& rdid,
|
||
|
TReadId& endid,
|
||
|
bool& success,
|
||
|
bool& done,
|
||
|
bool& paired)
|
||
|
{
|
||
|
// (For now, we shouldn't ever be here)
|
||
|
cerr << "In QseqPatternSource.readPair()" << endl;
|
||
|
throw 1;
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
bool solQuals_;
|
||
|
bool phred64Quals_;
|
||
|
bool intQuals_;
|
||
|
EList<string> qualToks_;
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* Synchronized concrete pattern source for a list of FASTA files where
|
||
|
* reads need to be extracted from long continuous sequences.
|
||
|
*/
|
||
|
class FastaContinuousPatternSource : public BufferedFilePatternSource {
|
||
|
public:
|
||
|
FastaContinuousPatternSource(const EList<string>& infiles, const PatternParams& p) :
|
||
|
BufferedFilePatternSource(infiles, p),
|
||
|
length_(p.sampleLen), freq_(p.sampleFreq),
|
||
|
eat_(length_-1), beginning_(true),
|
||
|
bufCur_(0), subReadCnt_(0llu)
|
||
|
{
|
||
|
resetForNextFile();
|
||
|
}
|
||
|
|
||
|
virtual void reset() {
|
||
|
BufferedFilePatternSource::reset();
|
||
|
resetForNextFile();
|
||
|
}
|
||
|
|
||
|
protected:
|
||
|
|
||
|
/// Read another pattern from a FASTA input file
|
||
|
virtual bool read(
|
||
|
Read& r,
|
||
|
TReadId& rdid,
|
||
|
TReadId& endid,
|
||
|
bool& success,
|
||
|
bool& done)
|
||
|
{
|
||
|
success = true;
|
||
|
done = false;
|
||
|
r.reset();
|
||
|
while(true) {
|
||
|
r.color = gColor;
|
||
|
int c = fb_.get();
|
||
|
if(c < 0) { success = false; done = true; return success; }
|
||
|
if(c == '>') {
|
||
|
resetForNextFile();
|
||
|
c = fb_.peek();
|
||
|
bool sawSpace = false;
|
||
|
while(c != '\n' && c != '\r') {
|
||
|
if(!sawSpace) {
|
||
|
sawSpace = isspace(c);
|
||
|
}
|
||
|
if(!sawSpace) {
|
||
|
nameBuf_.append(c);
|
||
|
}
|
||
|
fb_.get();
|
||
|
c = fb_.peek();
|
||
|
}
|
||
|
while(c == '\n' || c == '\r') {
|
||
|
fb_.get();
|
||
|
c = fb_.peek();
|
||
|
}
|
||
|
nameBuf_.append('_');
|
||
|
} else {
|
||
|
int cat = asc2dnacat[c];
|
||
|
if(cat >= 2) c = 'N';
|
||
|
if(cat == 0) {
|
||
|
// Encountered non-DNA, non-IUPAC char; skip it
|
||
|
continue;
|
||
|
} else {
|
||
|
// DNA char
|
||
|
buf_[bufCur_++] = c;
|
||
|
if(bufCur_ == 1024) bufCur_ = 0;
|
||
|
if(eat_ > 0) {
|
||
|
eat_--;
|
||
|
// Try to keep readCnt_ aligned with the offset
|
||
|
// into the reference; that lets us see where
|
||
|
// the sampling gaps are by looking at the read
|
||
|
// name
|
||
|
if(!beginning_) readCnt_++;
|
||
|
continue;
|
||
|
}
|
||
|
for(size_t i = 0; i < length_; i++) {
|
||
|
if(length_ - i <= bufCur_) {
|
||
|
c = buf_[bufCur_ - (length_ - i)];
|
||
|
} else {
|
||
|
// Rotate
|
||
|
c = buf_[bufCur_ - (length_ - i) + 1024];
|
||
|
}
|
||
|
r.patFw.append(asc2dna[c]);
|
||
|
r.qual.append('I');
|
||
|
}
|
||
|
// Set up a default name if one hasn't been set
|
||
|
r.name = nameBuf_;
|
||
|
char cbuf[20];
|
||
|
itoa10<TReadId>(readCnt_ - subReadCnt_, cbuf);
|
||
|
r.name.append(cbuf);
|
||
|
eat_ = freq_-1;
|
||
|
readCnt_++;
|
||
|
beginning_ = false;
|
||
|
rdid = endid = readCnt_-1;
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
/// Shouldn't ever be here; it's not sensible to obtain read pairs
|
||
|
// from a continuous input.
|
||
|
virtual bool readPair(
|
||
|
Read& ra,
|
||
|
Read& rb,
|
||
|
TReadId& rdid,
|
||
|
TReadId& endid,
|
||
|
bool& success,
|
||
|
bool& done,
|
||
|
bool& paired)
|
||
|
{
|
||
|
cerr << "In FastaContinuousPatternSource.readPair()" << endl;
|
||
|
throw 1;
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Reset state to be read for the next file.
|
||
|
*/
|
||
|
virtual void resetForNextFile() {
|
||
|
eat_ = length_-1;
|
||
|
beginning_ = true;
|
||
|
bufCur_ = 0;
|
||
|
nameBuf_.clear();
|
||
|
subReadCnt_ = readCnt_;
|
||
|
}
|
||
|
|
||
|
private:
|
||
|
size_t length_; /// length of reads to generate
|
||
|
size_t freq_; /// frequency to sample reads
|
||
|
size_t eat_; /// number of characters we need to skip before
|
||
|
/// we have flushed all of the ambiguous or
|
||
|
/// non-existent characters out of our read
|
||
|
/// window
|
||
|
bool beginning_; /// skipping over the first read length?
|
||
|
char buf_[1024]; /// read buffer
|
||
|
BTString nameBuf_; /// read buffer for name of fasta record being
|
||
|
/// split into mers
|
||
|
size_t bufCur_; /// buffer cursor; points to where we should
|
||
|
/// insert the next character
|
||
|
uint64_t subReadCnt_;/// number to subtract from readCnt_ to get
|
||
|
/// the pat id to output (so it resets to 0 for
|
||
|
/// each new sequence)
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* Read a FASTQ-format file.
|
||
|
* See: http://maq.sourceforge.net/fastq.shtml
|
||
|
*/
|
||
|
class FastqPatternSource : public BufferedFilePatternSource {
|
||
|
|
||
|
public:
|
||
|
|
||
|
FastqPatternSource(const EList<string>& infiles, const PatternParams& p) :
|
||
|
BufferedFilePatternSource(infiles, p),
|
||
|
first_(true),
|
||
|
solQuals_(p.solexa64),
|
||
|
phred64Quals_(p.phred64),
|
||
|
intQuals_(p.intQuals),
|
||
|
fuzzy_(p.fuzzy)
|
||
|
{ }
|
||
|
|
||
|
virtual void reset() {
|
||
|
first_ = true;
|
||
|
fb_.resetLastN();
|
||
|
BufferedFilePatternSource::reset();
|
||
|
}
|
||
|
|
||
|
protected:
|
||
|
|
||
|
/**
|
||
|
* Scan to the next FASTQ record (starting with @) and return the first
|
||
|
* character of the record (which will always be @). Since the quality
|
||
|
* line may start with @, we keep scanning until we've seen a line
|
||
|
* beginning with @ where the line two lines back began with +.
|
||
|
*/
|
||
|
static int skipToNextFastqRecord(FileBuf& in, bool sawPlus) {
|
||
|
int line = 0;
|
||
|
int plusLine = -1;
|
||
|
int c = in.get();
|
||
|
int firstc = c;
|
||
|
while(true) {
|
||
|
if(line > 20) {
|
||
|
// If we couldn't find our desired '@' in the first 20
|
||
|
// lines, it's time to give up
|
||
|
if(firstc == '>') {
|
||
|
// That firstc is '>' may be a hint that this is
|
||
|
// actually a FASTA file, so return it intact
|
||
|
return '>';
|
||
|
}
|
||
|
// Return an error
|
||
|
return -1;
|
||
|
}
|
||
|
if(c == -1) return -1;
|
||
|
if(c == '\n') {
|
||
|
c = in.get();
|
||
|
if(c == '@' && sawPlus && plusLine == (line-2)) {
|
||
|
return '@';
|
||
|
}
|
||
|
else if(c == '+') {
|
||
|
// Saw a '+' at the beginning of a line; remember where
|
||
|
// we saw it
|
||
|
sawPlus = true;
|
||
|
plusLine = line;
|
||
|
}
|
||
|
else if(c == -1) {
|
||
|
return -1;
|
||
|
}
|
||
|
line++;
|
||
|
}
|
||
|
c = in.get();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/// Read another pattern from a FASTQ input file
|
||
|
virtual bool read(
|
||
|
Read& r,
|
||
|
TReadId& rdid,
|
||
|
TReadId& endid,
|
||
|
bool& success,
|
||
|
bool& done);
|
||
|
|
||
|
/// Read another read pair from a FASTQ input file
|
||
|
virtual bool readPair(
|
||
|
Read& ra,
|
||
|
Read& rb,
|
||
|
TReadId& rdid,
|
||
|
TReadId& endid,
|
||
|
bool& success,
|
||
|
bool& done,
|
||
|
bool& paired)
|
||
|
{
|
||
|
// (For now, we shouldn't ever be here)
|
||
|
cerr << "In FastqPatternSource.readPair()" << endl;
|
||
|
throw 1;
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
virtual void resetForNextFile() {
|
||
|
first_ = true;
|
||
|
}
|
||
|
|
||
|
private:
|
||
|
|
||
|
/**
|
||
|
* Do things we need to do if we have to bail in the middle of a
|
||
|
* read, usually because we reached the end of the input without
|
||
|
* finishing.
|
||
|
*/
|
||
|
void bail(Read& r) {
|
||
|
r.patFw.clear();
|
||
|
fb_.resetLastN();
|
||
|
}
|
||
|
|
||
|
bool first_;
|
||
|
bool solQuals_;
|
||
|
bool phred64Quals_;
|
||
|
bool intQuals_;
|
||
|
bool fuzzy_;
|
||
|
EList<string> qualToks_;
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* Read a Raw-format file (one sequence per line). No quality strings
|
||
|
* allowed. All qualities are assumed to be 'I' (40 on the Phred-33
|
||
|
* scale).
|
||
|
*/
|
||
|
class RawPatternSource : public BufferedFilePatternSource {
|
||
|
|
||
|
public:
|
||
|
|
||
|
RawPatternSource(const EList<string>& infiles, const PatternParams& p) :
|
||
|
BufferedFilePatternSource(infiles, p), first_(true) { }
|
||
|
|
||
|
virtual void reset() {
|
||
|
first_ = true;
|
||
|
BufferedFilePatternSource::reset();
|
||
|
}
|
||
|
|
||
|
protected:
|
||
|
|
||
|
/// Read another pattern from a Raw input file
|
||
|
virtual bool read(
|
||
|
Read& r,
|
||
|
TReadId& rdid,
|
||
|
TReadId& endid,
|
||
|
bool& success,
|
||
|
bool& done)
|
||
|
{
|
||
|
int c;
|
||
|
success = true;
|
||
|
done = false;
|
||
|
r.reset();
|
||
|
c = getOverNewline(this->fb_);
|
||
|
if(c < 0) {
|
||
|
bail(r); success = false; done = true; return success;
|
||
|
}
|
||
|
assert(!isspace(c));
|
||
|
r.color = gColor;
|
||
|
int mytrim5 = gTrim5;
|
||
|
if(first_) {
|
||
|
// Check that the first character is sane for a raw file
|
||
|
int cc = c;
|
||
|
if(gColor) {
|
||
|
if(cc >= '0' && cc <= '4') cc = "ACGTN"[(int)cc - '0'];
|
||
|
if(cc == '.') cc = 'N';
|
||
|
}
|
||
|
if(asc2dnacat[cc] == 0) {
|
||
|
cerr << "Error: reads file does not look like a Raw file" << endl;
|
||
|
if(c == '>') {
|
||
|
cerr << "Reads file looks like a FASTA file; please use -f" << endl;
|
||
|
}
|
||
|
if(c == '@') {
|
||
|
cerr << "Reads file looks like a FASTQ file; please use -q" << endl;
|
||
|
}
|
||
|
throw 1;
|
||
|
}
|
||
|
first_ = false;
|
||
|
}
|
||
|
if(gColor) {
|
||
|
// This may be a primer character. If so, keep it in the
|
||
|
// 'primer' field of the read buf and parse the rest of the
|
||
|
// read without it.
|
||
|
c = toupper(c);
|
||
|
if(asc2dnacat[c] > 0) {
|
||
|
// First char is a DNA char
|
||
|
int c2 = toupper(fb_.peek());
|
||
|
// Second char is a color char
|
||
|
if(asc2colcat[c2] > 0) {
|
||
|
r.primer = c;
|
||
|
r.trimc = c2;
|
||
|
mytrim5 += 2; // trim primer and first color
|
||
|
}
|
||
|
}
|
||
|
if(c < 0) {
|
||
|
bail(r); success = false; done = true; return success;
|
||
|
}
|
||
|
}
|
||
|
// _in now points just past the first character of a sequence
|
||
|
// line, and c holds the first character
|
||
|
int chs = 0;
|
||
|
while(!isspace(c) && c >= 0) {
|
||
|
if(gColor) {
|
||
|
if(c >= '0' && c <= '4') c = "ACGTN"[(int)c - '0'];
|
||
|
if(c == '.') c = 'N';
|
||
|
}
|
||
|
// 5' trimming
|
||
|
if(isalpha(c) && chs >= mytrim5) {
|
||
|
//size_t len = chs - mytrim5;
|
||
|
//if(len >= 1024) tooManyQualities(BTString("(no name)"));
|
||
|
r.patFw.append(asc2dna[c]);
|
||
|
r.qual.append('I');
|
||
|
}
|
||
|
chs++;
|
||
|
if(isspace(fb_.peek())) break;
|
||
|
c = fb_.get();
|
||
|
}
|
||
|
// 3' trimming
|
||
|
r.patFw.trimEnd(gTrim3);
|
||
|
r.qual.trimEnd(gTrim3);
|
||
|
c = peekToEndOfLine(fb_);
|
||
|
r.trimmed3 = gTrim3;
|
||
|
r.trimmed5 = mytrim5;
|
||
|
r.readOrigBuf.install(fb_.lastN(), fb_.lastNLen());
|
||
|
fb_.resetLastN();
|
||
|
|
||
|
// Set up name
|
||
|
char cbuf[20];
|
||
|
itoa10<TReadId>(readCnt_, cbuf);
|
||
|
r.name.install(cbuf);
|
||
|
readCnt_++;
|
||
|
|
||
|
rdid = endid = readCnt_-1;
|
||
|
return success;
|
||
|
}
|
||
|
|
||
|
/// Read another read pair from a FASTQ input file
|
||
|
virtual bool readPair(
|
||
|
Read& ra,
|
||
|
Read& rb,
|
||
|
TReadId& rdid,
|
||
|
TReadId& endid,
|
||
|
bool& success,
|
||
|
bool& done,
|
||
|
bool& paired)
|
||
|
{
|
||
|
// (For now, we shouldn't ever be here)
|
||
|
cerr << "In RawPatternSource.readPair()" << endl;
|
||
|
throw 1;
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
virtual void resetForNextFile() {
|
||
|
first_ = true;
|
||
|
}
|
||
|
|
||
|
private:
|
||
|
|
||
|
/**
|
||
|
* Do things we need to do if we have to bail in the middle of a
|
||
|
* read, usually because we reached the end of the input without
|
||
|
* finishing.
|
||
|
*/
|
||
|
void bail(Read& r) {
|
||
|
r.patFw.clear();
|
||
|
fb_.resetLastN();
|
||
|
}
|
||
|
|
||
|
bool first_;
|
||
|
};
|
||
|
|
||
|
#ifdef USE_SRA
|
||
|
|
||
|
namespace ngs {
|
||
|
class ReadCollection;
|
||
|
class ReadIterator;
|
||
|
}
|
||
|
|
||
|
namespace tthread {
|
||
|
class thread;
|
||
|
};
|
||
|
|
||
|
struct SRA_Data;
|
||
|
|
||
|
/**
|
||
|
*
|
||
|
*/
|
||
|
class SRAPatternSource : public PatternSource {
|
||
|
public:
|
||
|
SRAPatternSource(
|
||
|
const EList<string>& sra_accs,
|
||
|
const PatternParams& p,
|
||
|
const size_t nthreads = 1) :
|
||
|
PatternSource(p),
|
||
|
sra_accs_(sra_accs),
|
||
|
sra_acc_cur_(0),
|
||
|
skip_(p.skip),
|
||
|
first_(true),
|
||
|
nthreads_(nthreads),
|
||
|
sra_run_(NULL),
|
||
|
sra_it_(NULL),
|
||
|
sra_data_(NULL),
|
||
|
io_thread_(NULL)
|
||
|
{
|
||
|
assert_gt(sra_accs_.size(), 0);
|
||
|
errs_.resize(sra_accs_.size());
|
||
|
errs_.fill(0, sra_accs_.size(), false);
|
||
|
open(); // open first file in the list
|
||
|
sra_acc_cur_++;
|
||
|
}
|
||
|
|
||
|
virtual ~SRAPatternSource();
|
||
|
|
||
|
/**
|
||
|
* Fill Read with the sequence, quality and name for the next
|
||
|
* read in the list of read files. This function gets called by
|
||
|
* all the search threads, so we must handle synchronization.
|
||
|
*/
|
||
|
virtual bool nextReadImpl(
|
||
|
Read& r,
|
||
|
TReadId& rdid,
|
||
|
TReadId& endid,
|
||
|
bool& success,
|
||
|
bool& done)
|
||
|
{
|
||
|
// We'll be manipulating our file handle/filecur_ state
|
||
|
lock();
|
||
|
while(true) {
|
||
|
do { read(r, rdid, endid, success, done); }
|
||
|
while(!success && !done);
|
||
|
if(!success && sra_acc_cur_ < sra_accs_.size()) {
|
||
|
assert(done);
|
||
|
open();
|
||
|
resetForNextFile(); // reset state to handle a fresh file
|
||
|
sra_acc_cur_++;
|
||
|
continue;
|
||
|
}
|
||
|
break;
|
||
|
}
|
||
|
assert(r.repOk());
|
||
|
// Leaving critical region
|
||
|
unlock();
|
||
|
return success;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
*
|
||
|
*/
|
||
|
virtual bool nextReadPairImpl(
|
||
|
Read& ra,
|
||
|
Read& rb,
|
||
|
TReadId& rdid,
|
||
|
TReadId& endid,
|
||
|
bool& success,
|
||
|
bool& done,
|
||
|
bool& paired)
|
||
|
{
|
||
|
// We'll be manipulating our file handle/filecur_ state
|
||
|
lock();
|
||
|
while(true) {
|
||
|
do { readPair(ra, rb, rdid, endid, success, done, paired); }
|
||
|
while(!success && !done);
|
||
|
if(!success && sra_acc_cur_ < sra_accs_.size()) {
|
||
|
assert(done);
|
||
|
open();
|
||
|
resetForNextFile(); // reset state to handle a fresh file
|
||
|
sra_acc_cur_++;
|
||
|
continue;
|
||
|
}
|
||
|
break;
|
||
|
}
|
||
|
assert(ra.repOk());
|
||
|
assert(rb.repOk());
|
||
|
// Leaving critical region
|
||
|
unlock();
|
||
|
return success;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Reset state so that we read start reading again from the
|
||
|
* beginning of the first file. Should only be called by the
|
||
|
* master thread.
|
||
|
*/
|
||
|
virtual void reset() {
|
||
|
PatternSource::reset();
|
||
|
sra_acc_cur_ = 0,
|
||
|
open();
|
||
|
sra_acc_cur_++;
|
||
|
}
|
||
|
|
||
|
/// Read another pattern from the input file; this is overridden
|
||
|
/// to deal with specific file formats
|
||
|
virtual bool read(
|
||
|
Read& r,
|
||
|
TReadId& rdid,
|
||
|
TReadId& endid,
|
||
|
bool& success,
|
||
|
bool& done)
|
||
|
{
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
/// Read another pattern pair from the input file; this is
|
||
|
/// overridden to deal with specific file formats
|
||
|
virtual bool readPair(
|
||
|
Read& ra,
|
||
|
Read& rb,
|
||
|
TReadId& rdid,
|
||
|
TReadId& endid,
|
||
|
bool& success,
|
||
|
bool& done,
|
||
|
bool& paired);
|
||
|
|
||
|
protected:
|
||
|
|
||
|
/// Reset state to handle a fresh file
|
||
|
virtual void resetForNextFile() { }
|
||
|
|
||
|
void open();
|
||
|
|
||
|
EList<string> sra_accs_; // filenames for read files
|
||
|
EList<bool> errs_; // whether we've already printed an error for each file
|
||
|
size_t sra_acc_cur_; // index into infiles_ of next file to read
|
||
|
TReadId skip_; // number of reads to skip
|
||
|
bool first_;
|
||
|
|
||
|
size_t nthreads_;
|
||
|
|
||
|
ngs::ReadCollection* sra_run_;
|
||
|
ngs::ReadIterator* sra_it_;
|
||
|
|
||
|
SRA_Data* sra_data_;
|
||
|
tthread::thread* io_thread_;
|
||
|
};
|
||
|
|
||
|
#endif
|
||
|
|
||
|
#endif /*PAT_H_*/
|