hisat-3n/read.h
2025-01-18 21:09:52 +08:00

600 lines
16 KiB
C++

/*
* Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
*
* This file is part of Bowtie 2.
* This file is edited by Yun (Leo) Zhang for HISAT-3N.
*
* Bowtie 2 is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Bowtie 2 is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef READ_H_
#define READ_H_
#include <stdint.h>
#include <sys/time.h>
#include "ds.h"
#include "sstring.h"
#include "filebuf.h"
#include "util.h"
/**
* Mapping cycles used by HISAT-3N; Read::threeN_cycle holds one of these.
*
* The enumerators are numbered 0..3 in declaration order. Each one pairs a
* conversion type (type1 or type2) with a strand suffix (FW or RC --
* presumably forward / reverse-complement; confirm against the aligner).
* The comment on Read::changePlan3N() describes type1 as
* hs3N_convertedFrom -> hs3N_convertedTo and type2 as
* hs3N_convertedFromComplement -> hs3N_convertedToComplement.
*/
/* Commented-out variant of the enum below using CT/GA names; not compiled.
enum {
threeN_CT_FW = 0,
threeN_CT_RC,
threeN_GA_FW,
threeN_GA_RC
};*/
enum {
threeN_type1conversion_FW = 0, // cycle 0: type1 conversion, FW
threeN_type1conversion_RC, // cycle 1: type1 conversion, RC
threeN_type2conversion_FW, // cycle 2: type2 conversion, FW
threeN_type2conversion_RC // cycle 3: type2 conversion, RC
};
/**
* RNA strandness settings. RNA_STRANDNESS_UNKNOWN is 0 and the remaining
* enumerators follow in declaration order (F = 1, R = 2, FR = 3, RF = 4).
*
* NOTE(review): presumably these mirror the aligner's RNA strandness
* command-line option (F/R for unpaired reads, FR/RF for paired reads) --
* confirm against the option parser. Nothing in this header uses them.
*/
enum rna_strandness_format {
RNA_STRANDNESS_UNKNOWN = 0,
RNA_STRANDNESS_F,
RNA_STRANDNESS_R,
RNA_STRANDNESS_FR,
RNA_STRANDNESS_RF
};
// Read identifier type; used for Read::rdid and Read::endid.
typedef uint64_t TReadId;
// Offset into a read; used for the off5p arguments of Read::get/getc/getq.
typedef size_t TReadOff;
// Signed 64-bit alignment score / penalty type.
typedef int64_t TAlScore;
// Global flag defined in another translation unit. When true,
// Read::constructRevComps() also builds originalRc from originalFw.
extern bool threeN;
// Forward declaration only; Read stores a HitSet pointer, never a HitSet.
class HitSet;
/**
* A buffer for keeping all relevant information about a single read.
*/
struct Read {

    /// Construct an empty read; every field handled by reset() is cleared.
    Read() { reset(); }

    /// Construct a read from C strings; see init().
    Read(const char *nm, const char *seq, const char *ql) { init(nm, seq, ql); }

    /**
     * Clear all sequence, quality and name buffers and restore the scalar
     * fields to their defaults.
     *
     * Note that 'mate', 'hitset' and 'patRc1' are not touched here.
     */
    void reset() {
        rdid = 0;
        endid = 0;
        alts = 0;
        trimmed5 = trimmed3 = 0;
        readOrigBuf.clear();
        patFw.clear();
        patFw_3N.clear();
        patRc.clear();
        qual.clear();
        patFwRev.clear();
        patRcRev.clear();
        qualRev.clear();
        name.clear();
        originalFw.clear();
        originalRc.clear();
        for(int j = 0; j < 3; j++) {
            altPatFw[j].clear();
            altPatFwRev[j].clear();
            altPatRc[j].clear();
            altPatRcRev[j].clear();
            altQual[j].clear();
            altQualRev[j].clear();
        }
        color = fuzzy = false;
        primer = '?';
        trimc = '?';
        filter = '?';
        seed = 0;
        ns_ = 0;
        threeN_cycle = 0;
        oppositeConversion_3N = false;
    }

    /**
     * Finish initializing a new read: count the positions of patFw whose
     * code is greater than 3 (i.e. not one of the four bases) into ns_,
     * then build the reverse-complement and reversed buffers.
     *
     * The count is added to the current value of ns_, so callers that
     * re-finalize a read must zero ns_ first (as changePlan3N() does).
     */
    void finalize() {
        for(size_t i = 0; i < patFw.length(); i++) {
            if((int)patFw[i] > 3) {
                ns_++;
            }
        }
        constructRevComps();
        constructReverses();
    }

    /**
     * Change the patFw sequence based on the current threeN_cycle and
     * newMappingCycle.
     *
     * There are two types of conversion:
     *   type1conversion: hs3N_convertedFrom to hs3N_convertedTo
     *   type2conversion: hs3N_convertedFromComplement to hs3N_convertedToComplement
     *
     * The initial threeN_cycle is 0. There are 4 cycles: 0, 1, 2, 3.
     *
     *             mate 1                           mate 2
     * initial:    threeN_type1conversion_FW(0),    threeN_type1conversion_FW(0)
     *                                              type1->type2: change conversion type
     * 1st cycle:  threeN_type1conversion_FW(0),    threeN_type2conversion_RC(3 = 3-0)
     * 2nd cycle:  threeN_type1conversion_RC(1),    threeN_type2conversion_FW(2 = 3-1)
     *             type1->type2                     type2->type1: change conversion type
     * 3rd cycle:  threeN_type2conversion_FW(2),    threeN_type1conversion_RC(1 = 3-2)
     * 4th cycle:  threeN_type2conversion_RC(3),    threeN_type1conversion_FW(0 = 3-3)
     *
     * When the transition requires the other conversion type, patFw and
     * patFw_3N are swapped and all derived buffers are rebuilt. Reads with
     * an empty name are left completely untouched (threeN_cycle included).
     */
    void changePlan3N(int newMappingCycle) {
        if (name.length() == 0) return;
        if ((threeN_cycle == threeN_type1conversion_FW && newMappingCycle == threeN_type2conversion_RC) ||
            (threeN_cycle == threeN_type1conversion_RC && newMappingCycle == threeN_type2conversion_FW) ||
            (threeN_cycle == threeN_type2conversion_FW && newMappingCycle == threeN_type1conversion_RC)) {
            // finalize() accumulates into ns_, so restart the count first.
            ns_ = 0;
            swap(patFw, patFw_3N);
            finalize();
        }
        threeN_cycle = newMappingCycle;
        oppositeConversion_3N = false;
    }

    /**
     * Simple init function, used for testing.
     *
     * Resets the read, installs the sequence and qualities, builds the
     * derived buffers and finally installs the name (skipped when nm is
     * NULL).
     */
    void init(
        const char *nm,
        const char *seq,
        const char *ql)
    {
        reset();
        patFw.installChars(seq);
        qual.install(ql);
        // ns_ was just zeroed by reset(), so finalize() yields the exact
        // N count and builds patRc / *Rev buffers.
        finalize();
        if(nm != NULL) name.install(nm);
    }

    /// Return true iff the read (pair) is empty
    bool empty() const {
        return patFw.empty();
    }

    /// Return length of the read in the buffer
    size_t length() const {
        return patFw.length();
    }

    /**
     * Return the number of Ns in the read.
     */
    size_t ns() const {
        return ns_;
    }

    /**
     * Construct reverse complement of the pattern and the fuzzy
     * alternative patterns. If read is in colorspace, just reverse
     * them. When the global threeN flag is set, originalRc is built
     * from originalFw in the same way.
     */
    void constructRevComps() {
        if(color) {
            patRc.installReverse(patFw);
            for(int j = 0; j < alts; j++) {
                altPatRc[j].installReverse(altPatFw[j]);
            }
            if (threeN) originalRc.installReverse(originalFw);
        } else {
            patRc.installReverseComp(patFw);
            for(int j = 0; j < alts; j++) {
                altPatRc[j].installReverseComp(altPatFw[j]);
            }
            if (threeN) originalRc.installReverseComp(originalFw);
        }
    }

    /**
     * Given patFw, patRc, and qual, construct the *Rev versions in
     * place. Assumes constructRevComps() was called previously.
     */
    void constructReverses() {
        patFwRev.installReverse(patFw);
        patRcRev.installReverse(patRc);
        qualRev.installReverse(qual);
        for(int j = 0; j < alts; j++) {
            altPatFwRev[j].installReverse(altPatFw[j]);
            altPatRcRev[j].installReverse(altPatRc[j]);
            altQualRev[j].installReverse(altQual[j]);
        }
    }

    /**
     * Append a "/1" or "/2" string onto the end of the name buf if
     * it's not already there.
     *
     * @param i mate number; must be 1 or 2
     */
    void fixMateName(int i) {
        assert(i == 1 || i == 2);
        size_t namelen = name.length();
        bool append = false;
        if(namelen < 2) {
            // Name is too short to possibly have /1 or /2 on the end
            append = true;
        } else {
            if(i == 1) {
                // append = true iff mate name does not already end in /1
                append =
                    name[namelen-2] != '/' ||
                    name[namelen-1] != '1';
            } else {
                // append = true iff mate name does not already end in /2
                append =
                    name[namelen-2] != '/' ||
                    name[namelen-1] != '2';
            }
        }
        if(append) {
            name.append('/');
            name.append("012"[i]);
        }
    }

    /**
     * Dump basic information about this read to the given ostream.
     *
     * Output is a single line: name, sequence, the three fuzzy alternative
     * sequences, the qualities and the three fuzzy alternative quality
     * strings, separated by spaces. Every byte goes to 'os'.
     */
    void dump(std::ostream& os) const {
        using namespace std;
        os << name << ' ';
        if(color) {
            os << patFw.toZBufXForm("0123.");
        } else {
            os << patFw;
        }
        os << ' ';
        // Print out the fuzzy alternative sequences
        for(int j = 0; j < 3; j++) {
            bool started = false;
            if(!altQual[j].empty()) {
                for(size_t i = 0; i < length(); i++) {
                    // Leading positions whose quality is '!' are skipped
                    if(altQual[j][i] != '!') {
                        started = true;
                    }
                    if(started) {
                        if(altQual[j][i] == '!') {
                            os << '-';
                        } else {
                            if(color) {
                                os << "0123."[(int)altPatFw[j][i]];
                            } else {
                                os << altPatFw[j][i];
                            }
                        }
                    }
                }
            }
            // Separator belongs to the same stream as everything else
            // (it used to be written to cout by mistake).
            os << " ";
        }
        os << qual.toZBuf() << " ";
        // Print out the fuzzy alternative quality strings
        for(int j = 0; j < 3; j++) {
            bool started = false;
            if(!altQual[j].empty()) {
                for(size_t i = 0; i < length(); i++) {
                    if(altQual[j][i] != '!') {
                        started = true;
                    }
                    if(started) {
                        os << altQual[j][i];
                    }
                }
            }
            if(j == 2) {
                os << endl;
            } else {
                os << " ";
            }
        }
    }

    /**
     * Check whether two reads are the same in the sense that they will
     * lead to us finding the same set of alignments.
     *
     * Sequences must match exactly; qualities are compared only when
     * qualitiesMatter is true.
     */
    static bool same(
        const BTDnaString& seq1,
        const BTString&    qual1,
        const BTDnaString& seq2,
        const BTString&    qual2,
        bool qualitiesMatter)
    {
        if(seq1.length() != seq2.length()) {
            return false;
        }
        for(size_t i = 0; i < seq1.length(); i++) {
            if(seq1[i] != seq2[i]) return false;
        }
        if(qualitiesMatter) {
            if(qual1.length() != qual2.length()) {
                return false;
            }
            for(size_t i = 0; i < qual1.length(); i++) {
                if(qual1[i] != qual2[i]) return false;
            }
        }
        return true;
    }

    /**
     * Get the nucleotide and quality value at the given offset from 5' end.
     * If 'fw' is false, the nucleotide is complemented (codes 0-3 are
     * XOR-ed with 3; codes >= 4 are returned unchanged). The quality is
     * returned with the offset of 33 removed.
     */
    std::pair<int, int> get(TReadOff off5p, bool fw) const {
        assert_lt(off5p, length());
        int c = (int)patFw[off5p];
        int q = qual[off5p];
        assert_geq(q, 33);
        return std::make_pair((!fw && c < 4) ? (c ^ 3) : c, q - 33);
    }

    /**
     * Get the nucleotide at the given offset from 5' end.
     * If 'fw' is false, the nucleotide is complemented (codes 0-3 are
     * XOR-ed with 3; codes >= 4 are returned unchanged).
     */
    int getc(TReadOff off5p, bool fw) const {
        assert_lt(off5p, length());
        int c = (int)patFw[off5p];
        return (!fw && c < 4) ? (c ^ 3) : c;
    }

    /**
     * Get the quality value at the given offset from 5' end, with the
     * offset of 33 removed.
     */
    int getq(TReadOff off5p) const {
        assert_lt(off5p, length());
        int q = qual[off5p];
        assert_geq(q, 33);
        return q-33;
    }

#ifndef NDEBUG
    /**
     * Check that read info is internally consistent: a non-empty read
     * must have exactly one quality value per sequence position.
     */
    bool repOk() const {
        if(patFw.empty()) return true;
        assert_eq(qual.length(), patFw.length());
        return true;
    }
#endif

    BTDnaString patFw;            // forward-strand sequence
    BTDnaString patFw_3N;         // sequence swapped with patFw by changePlan3N()
    BTDnaString patRc;            // reverse-complement sequence
    BTDnaString patRc1;           // not referenced anywhere in this header
    BTString    qual;             // quality values

    BTDnaString originalFw;       // the forward-strand sequence from read (without editing)
    BTDnaString originalRc;       // the reverse-complement sequence from read (without editing)

    BTDnaString altPatFw[3];      // fuzzy alternative sequences
    BTDnaString altPatRc[3];      // reverse complements (or reverses, in colorspace) of altPatFw
    BTString    altQual[3];       // qualities for the fuzzy alternatives

    BTDnaString patFwRev;         // patFw reversed
    BTDnaString patRcRev;         // patRc reversed
    BTString    qualRev;          // qual reversed

    BTDnaString altPatFwRev[3];   // altPatFw reversed
    BTDnaString altPatRcRev[3];   // altPatRc reversed
    BTString    altQualRev[3];    // altQual reversed

    // For remembering the exact input text used to define a read
    SStringExpandable<char> readOrigBuf;

    BTString name;      // read name
    TReadId  rdid;      // 0-based id based on pair's offset in read file(s)
    TReadId  endid;     // 0-based id based on pair's offset in read file(s)
                        // and which mate ("end") this is
    int      mate;      // 0 = single-end, 1 = mate1, 2 = mate2
    uint32_t seed;      // random seed
    size_t   ns_;       // # Ns
    int      alts;      // number of alternatives
    bool     fuzzy;     // whether to employ fuzziness
    bool     color;     // whether read is in color space
    char     primer;    // primer base, for csfasta files
    char     trimc;     // trimmed color, for csfasta files
    char     filter;    // if read format permits filter char, set it here
    int      trimmed5;  // amount actually trimmed off 5' end
    int      trimmed3;  // amount actually trimmed off 3' end
    HitSet  *hitset;    // holds previously-found hits; for chaining

    // for HISAT-3N
    int  threeN_cycle;          // current mapping cycle (one of the threeN_* enumerators)
    bool oppositeConversion_3N; // cleared by reset() and changePlan3N(); set by code outside this header
};
/**
* A string of FmStringOps represent a string of tasks performed by the
* best-first alignment search. We model the search as a series of FM ops
* interspersed with reported alignments.
*/
// One entry of an FmString: either a reported alignment or a run of FM
// index ops that share the same penalty.
struct FmStringOp {
bool alignment; // true -> found an alignment
TAlScore pen; // penalty of the FM op or alignment
size_t n; // number of FM ops (only relevant for non-alignment)
};
/**
* A string that summarizes the progress of an FM-index-assisted best-first
* search. Useful for trying to figure out what the aligner is spending its
* time doing for a given read.
*/
struct FmString {

    /**
     * Add one or more FM index ops to the op string.
     *
     * A new entry is started when the op string is empty or when 'pen'
     * differs from the penalty of the last entry; otherwise the ops are
     * folded into the last entry.
     *
     * @param alignment true if this entry reports an alignment
     *                  (only recorded when a new entry is started)
     * @param pen       penalty associated with the op(s) / alignment
     * @param nops      number of FM index ops to add to the entry's count
     */
    void add(bool alignment, TAlScore pen, size_t nops) {
        if(ops.empty() || ops.back().pen != pen) {
            ops.expand();
            ops.back().alignment = alignment;
            ops.back().pen = pen;
            ops.back().n = 0;
        }
        // Honor the caller-supplied count instead of always adding 1;
        // identical for callers that pass nops == 1.
        ops.back().n += nops;
    }

    /**
     * Reset FmString to uninitialized state: the current penalty becomes
     * the largest representable TAlScore and the op string is emptied.
     */
    void reset() {
        pen = std::numeric_limits<TAlScore>::max();
        ops.clear();
    }

    /**
     * Append a textual form of the op string to 'o'.
     *
     * Entries are separated by ';'. An alignment is written as "A,<pen>"
     * and a run of FM ops as "F,<pen>,<n>". No escaping is performed.
     *
     * @param o   string to append to
     * @param buf scratch buffer handed to itoa10() for number formatting;
     *            NOTE(review): required size is not visible here -- it must
     *            be large enough for any formatted 64-bit value
     */
    void print(BTString& o, char *buf) const {
        for(size_t i = 0; i < ops.size(); i++) {
            if(i > 0) {
                o.append(';');
            }
            if(ops[i].alignment) {
                o.append("A,");
                itoa10(ops[i].pen, buf);
                o.append(buf);
            } else {
                o.append("F,");
                itoa10(ops[i].pen, buf); o.append(buf);
                o.append(',');
                itoa10(ops[i].n, buf); o.append(buf);
            }
        }
    }

    TAlScore pen;          // current penalty
    EList<FmStringOp> ops; // op string
};
/**
* Key per-read metrics. These are used for thresholds, allowing us to bail
* for unproductive reads. They are also the basis of what's printed when the
* user specifies --read-times.
*/
struct PerReadMetrics {
PerReadMetrics() { reset(); }
void reset() {
nExIters =
nExDps = nExDpSuccs = nExDpFails =
nMateDps = nMateDpSuccs = nMateDpFails =
nExUgs = nExUgSuccs = nExUgFails =
nMateUgs = nMateUgSuccs = nMateUgFails =
nExEes = nExEeSuccs = nExEeFails =
nRedundants =
nEeFmops = nSdFmops = nExFmops =
nDpFail = nDpFailStreak = nDpLastSucc =
nUgFail = nUgFailStreak = nUgLastSucc =
nEeFail = nEeFailStreak = nEeLastSucc =
nFilt = 0;
nFtabs = 0;
nRedSkip = 0;
nRedFail = 0;
nRedIns = 0;
doFmString = false;
nSeedRanges = nSeedElts = 0;
nSeedRangesFw = nSeedEltsFw = 0;
nSeedRangesRc = nSeedEltsRc = 0;
seedMedian = seedMean = 0;
bestLtMinscMate1 =
bestLtMinscMate2 = std::numeric_limits<TAlScore>::min();
fmString.reset();
}
struct timeval tv_beg; // timer start to measure how long alignment takes
struct timezone tz_beg; // timer start to measure how long alignment takes
uint64_t nExIters; // iterations of seed hit extend loop
uint64_t nExDps; // # extend DPs run on this read
uint64_t nExDpSuccs; // # extend DPs run on this read
uint64_t nExDpFails; // # extend DPs run on this read
uint64_t nExUgs; // # extend ungapped alignments run on this read
uint64_t nExUgSuccs; // # extend ungapped alignments run on this read
uint64_t nExUgFails; // # extend ungapped alignments run on this read
uint64_t nExEes; // # extend ungapped alignments run on this read
uint64_t nExEeSuccs; // # extend ungapped alignments run on this read
uint64_t nExEeFails; // # extend ungapped alignments run on this read
uint64_t nMateDps; // # mate DPs run on this read
uint64_t nMateDpSuccs; // # mate DPs run on this read
uint64_t nMateDpFails; // # mate DPs run on this read
uint64_t nMateUgs; // # mate ungapped alignments run on this read
uint64_t nMateUgSuccs; // # mate ungapped alignments run on this read
uint64_t nMateUgFails; // # mate ungapped alignments run on this read
uint64_t nRedundants; // # redundant seed hits
uint64_t nSeedRanges; // # BW ranges found for seeds
uint64_t nSeedElts; // # BW elements found for seeds
uint64_t nSeedRangesFw; // # BW ranges found for seeds from fw read
uint64_t nSeedEltsFw; // # BW elements found for seeds from fw read
uint64_t nSeedRangesRc; // # BW ranges found for seeds from fw read
uint64_t nSeedEltsRc; // # BW elements found for seeds from fw read
uint64_t seedMedian; // median seed hit count
uint64_t seedMean; // rounded mean seed hit count
uint64_t nEeFmops; // FM Index ops for end-to-end alignment
uint64_t nSdFmops; // FM Index ops used to align seeds
uint64_t nExFmops; // FM Index ops used to resolve offsets
uint64_t nFtabs; // # ftab lookups
uint64_t nRedSkip; // # times redundant path was detected and aborted
uint64_t nRedFail; // # times a path was deemed non-redundant
uint64_t nRedIns; // # times a path was added to redundancy list
uint64_t nDpFail; // number of dp failures in a row up until now
uint64_t nDpFailStreak; // longest streak of dp failures
uint64_t nDpLastSucc; // index of last dp attempt that succeeded
uint64_t nUgFail; // number of ungap failures in a row up until now
uint64_t nUgFailStreak; // longest streak of ungap failures
uint64_t nUgLastSucc; // index of last ungap attempt that succeeded
uint64_t nEeFail; // number of ungap failures in a row up until now
uint64_t nEeFailStreak; // longest streak of ungap failures
uint64_t nEeLastSucc; // index of last ungap attempt that succeeded
uint64_t nFilt; // # mates filtered
TAlScore bestLtMinscMate1; // best invalid score observed for mate 1
TAlScore bestLtMinscMate2; // best invalid score observed for mate 2
// For collecting information to go into an FM string
bool doFmString;
FmString fmString;
};
#endif /*READ_H_*/