/* * Copyright 2011, Ben Langmead * * This file is part of Bowtie 2. * * Bowtie 2 is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Bowtie 2 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Bowtie 2. If not, see . */ #ifndef EDIT_H_ #define EDIT_H_ #include #include #include #include "assert_helpers.h" #include "filebuf.h" #include "sstring.h" #include "ds.h" #include "splice_site.h" /** * 3 types of edits; mismatch (substitution), insertion in the * reference, deletion in the reference. */ enum { EDIT_TYPE_READ_GAP = 1, EDIT_TYPE_REF_GAP, EDIT_TYPE_MM, EDIT_TYPE_SNP, EDIT_TYPE_SPL, // splicing of pre-messenger RNAs into messenger RNAs }; /** * Encapsulates an edit between the read sequence and the reference sequence. * We obey a few conventions when populating its fields. The fields are: * * uint8_t chr; // reference character involved (for subst and ins) * uint8_t qchr; // read character involved (for subst and del) * uint8_t type; // 1 -> mm, 2 -> SNP, 3 -> ins, 4 -> del * uint32_t pos; // position w/r/t search root * * One convention is that pos is always an offset w/r/t the 5' end of the read. * * Another is that chr and qchr are expressed in terms of the nucleotides on * the forward version of the read. So if we're aligning the reverse * complement of the read, and an A in the reverse complement mismatches a C in * the reference, chr should be G and qchr should be T. */ struct Edit { Edit() { reset(); } Edit( uint32_t po, int ch, int qc, int ty, bool chrs = true, uint32_t snp = std::numeric_limits::max()) { init(po, ch, qc, ty, chrs, snp); } Edit( uint32_t po, int ch, int qc, int ty, uint32_t sl, uint8_t sdir, bool knowns, bool chrs = true) { init(po, ch, qc, ty, sl, sdir, knowns, chrs); } /** * Reset Edit to uninitialized state. */ void reset() { pos = pos2 = std::numeric_limits::max(); chr = qchr = type = 0; splLen = 0; splDir = SPL_UNKNOWN; knownSpl = false; snpID = std::numeric_limits::max(); } /** * Return true iff the Edit is initialized. */ bool inited() const { return pos != std::numeric_limits::max(); } /** * Initialize a new Edit. */ void init( uint32_t po, int ch, int qc, int ty, bool chrs = true, uint32_t snp = std::numeric_limits::max()) { chr = ch; qchr = qc; type = ty; splLen = 0; splDir = SPL_UNKNOWN; pos = po; if(qc == '-') { // Read gap pos2 = std::numeric_limits::max() >> 1; } else { pos2 = std::numeric_limits::max(); } snpID = snp; if(!chrs) { assert_range(0, 4, (int)chr); assert_range(0, 4, (int)qchr); chr = "ACGTN"[chr]; qchr = "ACGTN"[qchr]; } #ifndef NDEBUG if(type != EDIT_TYPE_SPL) { assert_in(chr, "ACMGRSVTWYHKDBN-"); assert_in(qchr, "ACGTN-"); assert(chr != qchr || chr == 'N'); } #endif assert(inited()); } /** * Initialize a new Edit. */ void init( uint32_t po, int ch, int qc, int ty, uint32_t sl, uint32_t sdir, bool knowns, bool chrs = true) { assert_eq(ty, EDIT_TYPE_SPL); init(po, ch, qc, ty, chrs); splLen = sl; splDir = sdir; knownSpl = knowns; } /** * Return true iff one part of the edit or the other has an 'N'. */ bool hasN() const { assert(inited()); return chr == 'N' || qchr == 'N'; } /** * Edit less-than overload. */ int operator< (const Edit &rhs) const { assert(inited()); if(pos < rhs.pos) return 1; if(pos > rhs.pos) return 0; if(pos2 < rhs.pos2) return 1; if(pos2 > rhs.pos2) return 0; if(type < rhs.type) return 1; if(type > rhs.type) return 0; if(chr < rhs.chr) return 1; if(chr > rhs.chr) return 0; return (qchr < rhs.qchr)? 1 : 0; } /** * Edit equals overload. */ int operator== (const Edit &rhs) const { assert(inited()); if(type != rhs.type) return false; if(pos != rhs.pos) return false; if(type != EDIT_TYPE_SPL) { return chr == rhs.chr && qchr == rhs.qchr; } else { return pos2 == rhs.pos2 && splLen == rhs.splLen && splDir == rhs.splDir /* && knownSpl == rhs.knownSpl */; } } /** * Return true iff this Edit is an initialized insertion. */ bool isReadGap() const { assert(inited()); return type == EDIT_TYPE_READ_GAP; } /** * Return true iff this Edit is an initialized deletion. */ bool isRefGap() const { assert(inited()); return type == EDIT_TYPE_REF_GAP; } /** * Return true if this Edit is either an initialized deletion or an * initialized insertion. */ bool isGap() const { assert(inited()); return (type == EDIT_TYPE_REF_GAP || type == EDIT_TYPE_READ_GAP); } bool isSpliced() const { assert(inited()); return type == EDIT_TYPE_SPL; } /** * Return the number of gaps in the given edit list. */ static size_t numGaps(const EList& es) { size_t gaps = 0; for(size_t i = 0; i < es.size(); i++) { if(es[i].isGap()) gaps++; } return gaps; } /** * Return true iff this Edit is an initialized mismatch. */ bool isMismatch() const { assert(inited()); return type == EDIT_TYPE_MM; } /** * Sort the edits in the provided list. */ static void sort(EList& edits); /** * Flip all the edits.pos fields so that they're with respect to * the other end of the read (of length 'sz'). */ static void invertPoss( EList& edits, size_t sz, size_t ei, size_t en, bool sort = false); /** * Flip all the edits.pos fields so that they're with respect to * the other end of the read (of length 'sz'). */ static void invertPoss(EList& edits, size_t sz, bool sort = false) { invertPoss(edits, sz, 0, edits.size(), sort); } static void complement(EList& edits); /** * Clip off some of the low-numbered positions. */ static void clipLo(EList& edits, size_t len, size_t amt); /** * Clip off some of the high-numbered positions. */ static void clipHi(EList& edits, size_t len, size_t amt); /** * Given a read string and some edits, generate and append the * corresponding reference string to 'ref'. */ static void toRef( const BTDnaString& read, const EList& edits, BTDnaString& ref, bool fw = true, size_t trim5 = 0, size_t trim3 = 0); /** * Given a string and its edits with respect to some other string, * print the alignment between the strings with the strings stacked * vertically, with vertical bars denoting matches. */ static void printQAlign( std::ostream& os, const BTDnaString& read, const EList& edits); /** * Given a string and its edits with respect to some other string, * print the alignment between the strings with the strings stacked * vertically, with vertical bars denoting matches. Add 'prefix' * before each line of output. */ static void printQAlign( std::ostream& os, const char *prefix, const BTDnaString& read, const EList& edits); /** * Given a string and its edits with respect to some other string, * print the alignment between the strings with the strings stacked * vertically, with vertical bars denoting matches. */ static void printQAlignNoCheck( std::ostream& os, const BTDnaString& read, const EList& edits); /** * Given a string and its edits with respect to some other string, * print the alignment between the strings with the strings stacked * vertically, with vertical bars denoting matches. Add 'prefix' * before each line of output. */ static void printQAlignNoCheck( std::ostream& os, const char *prefix, const BTDnaString& read, const EList& edits); #ifndef NDEBUG bool repOk() const; /** * Given a list of edits and a DNA string representing the query * sequence, check that the edits are consistent with respect to the * query. */ static bool repOk( const EList& edits, const BTDnaString& s, bool fw = true, size_t trim5 = 0, size_t trim3 = 0); #endif uint8_t chr; // reference character involved (for subst and ins) uint8_t qchr; // read character involved (for subst and del) uint8_t type; // 1 -> mm, 2 -> SNP, 3 -> ins, 4 -> del uint32_t pos; // position w/r/t search root uint32_t pos2; // Second int to take into account when sorting. Useful for // sorting read gap edits that are all part of the same long // gap. uint32_t splLen; // skip over the genome due to an intron uint8_t splDir; bool knownSpl; int64_t donor_seq; int64_t acceptor_seq; uint32_t snpID; // snp ID friend std::ostream& operator<< (std::ostream& os, const Edit& e); /** * Print a comma-separated list of Edits to given output stream. */ static void print( std::ostream& os, const EList& edits, char delim = '\t'); /** * Merge second argument into the first. Assume both are sorted to * begin with. */ static void merge(EList& dst, const EList& src); }; #endif /* EDIT_H_ */