/* * Copyright 2011, Ben Langmead * * This file is part of Bowtie 2. * This file is edited by Yun (Leo) Zhang for HISAT-3N. * * Bowtie 2 is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Bowtie 2 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Bowtie 2. If not, see . */ #ifndef READ_H_ #define READ_H_ #include #include #include "ds.h" #include "sstring.h" #include "filebuf.h" #include "util.h" /** * the threeN_cycle */ /*enum { threeN_CT_FW = 0, threeN_CT_RC, threeN_GA_FW, threeN_GA_RC };*/ enum { threeN_type1conversion_FW = 0, threeN_type1conversion_RC, threeN_type2conversion_FW, threeN_type2conversion_RC }; enum rna_strandness_format { RNA_STRANDNESS_UNKNOWN = 0, RNA_STRANDNESS_F, RNA_STRANDNESS_R, RNA_STRANDNESS_FR, RNA_STRANDNESS_RF }; typedef uint64_t TReadId; typedef size_t TReadOff; typedef int64_t TAlScore; extern bool threeN; class HitSet; /** * A buffer for keeping all relevant information about a single read. */ struct Read { Read() { reset(); } Read(const char *nm, const char *seq, const char *ql) { init(nm, seq, ql); } void reset() { rdid = 0; endid = 0; alts = 0; trimmed5 = trimmed3 = 0; readOrigBuf.clear(); patFw.clear(); patFw_3N.clear(); patRc.clear(); qual.clear(); patFwRev.clear(); patRcRev.clear(); qualRev.clear(); name.clear(); originalFw.clear(); originalRc.clear(); for(int j = 0; j < 3; j++) { altPatFw[j].clear(); altPatFwRev[j].clear(); altPatRc[j].clear(); altPatRcRev[j].clear(); altQual[j].clear(); altQualRev[j].clear(); } color = fuzzy = false; primer = '?'; trimc = '?'; filter = '?'; seed = 0; ns_ = 0; threeN_cycle = 0; oppositeConversion_3N = false; } /** * Finish initializing a new read. */ void finalize() { for(size_t i = 0; i < patFw.length(); i++) { if((int)patFw[i] > 3) { ns_++; } } constructRevComps(); constructReverses(); } /** * change patFw sequence based on current threeN_cycle and newMappingCycle. * * There are two types of changes: * type1conversion: hs3N_convertedFrom to hs3N_convertedTo * type2conversion: hs3N_convertedFromComplement to hs3N_convertedToComplement * * The initial threeN_cycle is 0. There are 4 cycle: 0, 1, 2, 3; * * mate 1, mate2 * initial: threeN_type1conversion_FW(0), threeN_type1conversion_FW(0), * --------------- type1->type2 change conversion type * 1st cycle: threeN_type1conversion_FW(0), threeN_type2conversion_RC(3 = 3-0), * 2nd cycle: threeN_type1conversion_RC(1), threeN_type2conversion_FW(2 = 3-1), * type1c->type2 type2->type1 change conversion type * 3rd cycle: threeN_type2conversion_FW(2), threeN_type1conversion_RC(1 = 3-2), * 4rd cycle: threeN_type2conversion_RC(3), threeN_type1conversion_FW(0 = 3-3), */ void changePlan3N(int newMappingCycle) { if (name.length() == 0) return; if ((threeN_cycle == threeN_type1conversion_FW && newMappingCycle == threeN_type2conversion_RC) || (threeN_cycle == threeN_type1conversion_RC && newMappingCycle == threeN_type2conversion_FW) || (threeN_cycle == threeN_type2conversion_FW && newMappingCycle == threeN_type1conversion_RC)) { ns_ = 0; swap(patFw, patFw_3N); finalize(); } threeN_cycle = newMappingCycle; oppositeConversion_3N = false; } /** * Simple init function, used for testing. */ void init( const char *nm, const char *seq, const char *ql) { reset(); patFw.installChars(seq); qual.install(ql); for(size_t i = 0; i < patFw.length(); i++) { if((int)patFw[i] > 3) { ns_++; } } constructRevComps(); constructReverses(); if(nm != NULL) name.install(nm); } /// Return true iff the read (pair) is empty bool empty() const { return patFw.empty(); } /// Return length of the read in the buffer size_t length() const { return patFw.length(); } /** * Return the number of Ns in the read. */ size_t ns() const { return ns_; } /** * Construct reverse complement of the pattern and the fuzzy * alternative patters. If read is in colorspace, just reverse * them. */ void constructRevComps() { if(color) { patRc.installReverse(patFw); for(int j = 0; j < alts; j++) { altPatRc[j].installReverse(altPatFw[j]); } if (threeN) originalRc.installReverse(originalFw); } else { patRc.installReverseComp(patFw); for(int j = 0; j < alts; j++) { altPatRc[j].installReverseComp(altPatFw[j]); } if (threeN) originalRc.installReverseComp(originalFw); } } /** * Given patFw, patRc, and qual, construct the *Rev versions in * place. Assumes constructRevComps() was called previously. */ void constructReverses() { patFwRev.installReverse(patFw); patRcRev.installReverse(patRc); qualRev.installReverse(qual); for(int j = 0; j < alts; j++) { altPatFwRev[j].installReverse(altPatFw[j]); altPatRcRev[j].installReverse(altPatRc[j]); altQualRev[j].installReverse(altQual[j]); } } /** * Append a "/1" or "/2" string onto the end of the name buf if * it's not already there. */ void fixMateName(int i) { assert(i == 1 || i == 2); size_t namelen = name.length(); bool append = false; if(namelen < 2) { // Name is too short to possibly have /1 or /2 on the end append = true; } else { if(i == 1) { // append = true iff mate name does not already end in /1 append = name[namelen-2] != '/' || name[namelen-1] != '1'; } else { // append = true iff mate name does not already end in /2 append = name[namelen-2] != '/' || name[namelen-1] != '2'; } } if(append) { name.append('/'); name.append("012"[i]); } } /** * Dump basic information about this read to the given ostream. */ void dump(std::ostream& os) const { using namespace std; os << name << ' '; if(color) { os << patFw.toZBufXForm("0123."); } else { os << patFw; } os << ' '; // Print out the fuzzy alternative sequences for(int j = 0; j < 3; j++) { bool started = false; if(!altQual[j].empty()) { for(size_t i = 0; i < length(); i++) { if(altQual[j][i] != '!') { started = true; } if(started) { if(altQual[j][i] == '!') { os << '-'; } else { if(color) { os << "0123."[(int)altPatFw[j][i]]; } else { os << altPatFw[j][i]; } } } } } cout << " "; } os << qual.toZBuf() << " "; // Print out the fuzzy alternative quality strings for(int j = 0; j < 3; j++) { bool started = false; if(!altQual[j].empty()) { for(size_t i = 0; i < length(); i++) { if(altQual[j][i] != '!') { started = true; } if(started) { os << altQual[j][i]; } } } if(j == 2) { os << endl; } else { os << " "; } } } /** * Check whether two reads are the same in the sense that they will * lead to us finding the same set of alignments. */ static bool same( const BTDnaString& seq1, const BTString& qual1, const BTDnaString& seq2, const BTString& qual2, bool qualitiesMatter) { if(seq1.length() != seq2.length()) { return false; } for(size_t i = 0; i < seq1.length(); i++) { if(seq1[i] != seq2[i]) return false; } if(qualitiesMatter) { if(qual1.length() != qual2.length()) { return false; } for(size_t i = 0; i < qual1.length(); i++) { if(qual1[i] != qual2[i]) return false; } } return true; } /** * Get the nucleotide and quality value at the given offset from 5' end. * If 'fw' is false, get the reverse complement. */ std::pair get(TReadOff off5p, bool fw) const { assert_lt(off5p, length()); int c = (int)patFw[off5p]; int q = qual[off5p]; assert_geq(q, 33); return make_pair((!fw && c < 4) ? (c ^ 3) : c, q - 33); } /** * Get the nucleotide at the given offset from 5' end. * If 'fw' is false, get the reverse complement. */ int getc(TReadOff off5p, bool fw) const { assert_lt(off5p, length()); int c = (int)patFw[off5p]; return (!fw && c < 4) ? (c ^ 3) : c; } /** * Get the quality value at the given offset from 5' end. */ int getq(TReadOff off5p) const { assert_lt(off5p, length()); int q = qual[off5p]; assert_geq(q, 33); return q-33; } #ifndef NDEBUG /** * Check that read info is internally consistent. */ bool repOk() const { if(patFw.empty()) return true; assert_eq(qual.length(), patFw.length()); return true; } #endif BTDnaString patFw; // forward-strand sequence BTDnaString patFw_3N; BTDnaString patRc; // reverse-complement sequence BTDnaString patRc1; BTString qual; // quality values BTDnaString originalFw; // the forward-strand sequence from read (without editing) BTDnaString originalRc; // the reverse-complement sequence from read (without editing) BTDnaString altPatFw[3]; BTDnaString altPatRc[3]; BTString altQual[3]; BTDnaString patFwRev; BTDnaString patRcRev; BTString qualRev; BTDnaString altPatFwRev[3]; BTDnaString altPatRcRev[3]; BTString altQualRev[3]; // For remembering the exact input text used to define a read SStringExpandable readOrigBuf; BTString name; // read name TReadId rdid; // 0-based id based on pair's offset in read file(s) TReadId endid; // 0-based id based on pair's offset in read file(s) // and which mate ("end") this is int mate; // 0 = single-end, 1 = mate1, 2 = mate2 uint32_t seed; // random seed size_t ns_; // # Ns int alts; // number of alternatives bool fuzzy; // whether to employ fuzziness bool color; // whether read is in color space char primer; // primer base, for csfasta files char trimc; // trimmed color, for csfasta files char filter; // if read format permits filter char, set it here int trimmed5; // amount actually trimmed off 5' end int trimmed3; // amount actually trimmed off 3' end HitSet *hitset; // holds previously-found hits; for chaining // for HISAT-3N int threeN_cycle; bool oppositeConversion_3N; }; /** * A string of FmStringOps represent a string of tasks performed by the * best-first alignment search. We model the search as a series of FM ops * interspersed with reported alignments. */ struct FmStringOp { bool alignment; // true -> found an alignment TAlScore pen; // penalty of the FM op or alignment size_t n; // number of FM ops (only relevant for non-alignment) }; /** * A string that summarizes the progress of an FM-index-assistet best-first * search. Useful for trying to figure out what the aligner is spending its * time doing for a given read. */ struct FmString { /** * Add one or more FM index ops to the op string */ void add(bool alignment, TAlScore pen, size_t nops) { if(ops.empty() || ops.back().pen != pen) { ops.expand(); ops.back().alignment = alignment; ops.back().pen = pen; ops.back().n = 0; } ops.back().n++; } /** * Reset FmString to uninitialized state. */ void reset() { pen = std::numeric_limits::max(); ops.clear(); } /** * Print a :Z optional field where certain characters (whitespace, colon * and percent) are escaped using % escapes. */ void print(BTString& o, char *buf) const { for(size_t i = 0; i < ops.size(); i++) { if(i > 0) { o.append(';'); } if(ops[i].alignment) { o.append("A,"); itoa10(ops[i].pen, buf); o.append(buf); } else { o.append("F,"); itoa10(ops[i].pen, buf); o.append(buf); o.append(','); itoa10(ops[i].n, buf); o.append(buf); } } } TAlScore pen; // current penalty EList ops; // op string }; /** * Key per-read metrics. These are used for thresholds, allowing us to bail * for unproductive reads. They also the basis of what's printed when the user * specifies --read-times. */ struct PerReadMetrics { PerReadMetrics() { reset(); } void reset() { nExIters = nExDps = nExDpSuccs = nExDpFails = nMateDps = nMateDpSuccs = nMateDpFails = nExUgs = nExUgSuccs = nExUgFails = nMateUgs = nMateUgSuccs = nMateUgFails = nExEes = nExEeSuccs = nExEeFails = nRedundants = nEeFmops = nSdFmops = nExFmops = nDpFail = nDpFailStreak = nDpLastSucc = nUgFail = nUgFailStreak = nUgLastSucc = nEeFail = nEeFailStreak = nEeLastSucc = nFilt = 0; nFtabs = 0; nRedSkip = 0; nRedFail = 0; nRedIns = 0; doFmString = false; nSeedRanges = nSeedElts = 0; nSeedRangesFw = nSeedEltsFw = 0; nSeedRangesRc = nSeedEltsRc = 0; seedMedian = seedMean = 0; bestLtMinscMate1 = bestLtMinscMate2 = std::numeric_limits::min(); fmString.reset(); } struct timeval tv_beg; // timer start to measure how long alignment takes struct timezone tz_beg; // timer start to measure how long alignment takes uint64_t nExIters; // iterations of seed hit extend loop uint64_t nExDps; // # extend DPs run on this read uint64_t nExDpSuccs; // # extend DPs run on this read uint64_t nExDpFails; // # extend DPs run on this read uint64_t nExUgs; // # extend ungapped alignments run on this read uint64_t nExUgSuccs; // # extend ungapped alignments run on this read uint64_t nExUgFails; // # extend ungapped alignments run on this read uint64_t nExEes; // # extend ungapped alignments run on this read uint64_t nExEeSuccs; // # extend ungapped alignments run on this read uint64_t nExEeFails; // # extend ungapped alignments run on this read uint64_t nMateDps; // # mate DPs run on this read uint64_t nMateDpSuccs; // # mate DPs run on this read uint64_t nMateDpFails; // # mate DPs run on this read uint64_t nMateUgs; // # mate ungapped alignments run on this read uint64_t nMateUgSuccs; // # mate ungapped alignments run on this read uint64_t nMateUgFails; // # mate ungapped alignments run on this read uint64_t nRedundants; // # redundant seed hits uint64_t nSeedRanges; // # BW ranges found for seeds uint64_t nSeedElts; // # BW elements found for seeds uint64_t nSeedRangesFw; // # BW ranges found for seeds from fw read uint64_t nSeedEltsFw; // # BW elements found for seeds from fw read uint64_t nSeedRangesRc; // # BW ranges found for seeds from fw read uint64_t nSeedEltsRc; // # BW elements found for seeds from fw read uint64_t seedMedian; // median seed hit count uint64_t seedMean; // rounded mean seed hit count uint64_t nEeFmops; // FM Index ops for end-to-end alignment uint64_t nSdFmops; // FM Index ops used to align seeds uint64_t nExFmops; // FM Index ops used to resolve offsets uint64_t nFtabs; // # ftab lookups uint64_t nRedSkip; // # times redundant path was detected and aborted uint64_t nRedFail; // # times a path was deemed non-redundant uint64_t nRedIns; // # times a path was added to redundancy list uint64_t nDpFail; // number of dp failures in a row up until now uint64_t nDpFailStreak; // longest streak of dp failures uint64_t nDpLastSucc; // index of last dp attempt that succeeded uint64_t nUgFail; // number of ungap failures in a row up until now uint64_t nUgFailStreak; // longest streak of ungap failures uint64_t nUgLastSucc; // index of last ungap attempt that succeeded uint64_t nEeFail; // number of ungap failures in a row up until now uint64_t nEeFailStreak; // longest streak of ungap failures uint64_t nEeLastSucc; // index of last ungap attempt that succeeded uint64_t nFilt; // # mates filtered TAlScore bestLtMinscMate1; // best invalid score observed for mate 1 TAlScore bestLtMinscMate2; // best invalid score observed for mate 2 // For collecting information to go into an FM string bool doFmString; FmString fmString; }; #endif /*READ_H_*/