/* * Copyright 2011, Ben Langmead * * This file is part of Bowtie 2. * * Bowtie 2 is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Bowtie 2 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Bowtie 2. If not, see . */ #include #include "reference.h" #include "aligner_result.h" #include "read.h" #include "edit.h" #include "sstring.h" #include "ds.h" #include "util.h" #include "alphabet.h" using namespace std; /** * Clear all contents. */ void AlnRes::reset() { if(ned_ != NULL) { assert(aed_ != NULL); ned_->clear(); aed_->clear(); } score_.invalidate(); refcoord_.reset(); refival_.reset(); shapeSet_ = false; rdlen_ = 0; rdid_ = 0; reflen_ = 0; rdrows_ = 0; rdextent_ = 0; rdexrows_ = 0; rfextent_ = 0; refns_ = 0; type_ = ALN_RES_TYPE_UNPAIRED; fraglen_ = -1; trimSoft_ = false; trim5p_ = 0; trim3p_ = 0; pretrimSoft_ = true; pretrim5p_ = 0; pretrim3p_ = 0; seedmms_ = 0; // number of mismatches allowed in seed seedlen_ = 0; // length of seed seedival_ = 0; // interval between seeds minsc_ = 0; // minimum score nuc5p_ = 0; nuc3p_ = 0; fraglenSet_ = false; num_spliced_ = 0; assert(!refcoord_.inited()); assert(!refival_.inited()); } /** * Set the upstream-most reference offset involved in the alignment, and * the extent of the alignment (w/r/t the reference) */ void AlnRes::setShape( TRefId id, // id of reference aligned to TRefOff off, // offset of first aligned char into ref seq TRefOff reflen, // length of reference sequence aligned to bool fw, // aligned to Watson strand? 
size_t rdlen, // length of read after hard trimming, before soft TReadId rdid, // read ID bool pretrimSoft, // whether trimming prior to alignment was soft size_t pretrim5p, // # poss trimmed form 5p end before alignment size_t pretrim3p, // # poss trimmed form 3p end before alignment bool trimSoft, // whether local-alignment trimming was soft size_t trim5p, // # poss trimmed form 5p end during alignment size_t trim3p) // # poss trimmed form 3p end during alignment { rdlen_ = rdlen; rdid_ = rdid; rdrows_ = rdlen; refcoord_.init(id, off, fw); pretrimSoft_ = pretrimSoft; pretrim5p_ = pretrim5p; pretrim3p_ = pretrim3p; trimSoft_ = trimSoft; trim5p_ = trim5p; trim3p_ = trim3p; // Propagate trimming to the edits. We assume that the pos fields of the // edits are set w/r/t to the rows of the dynamic programming table, and // haven't taken trimming into account yet. // // TODO: The division of labor between the aligner and the AlnRes is not // clean. Perhaps the trimming and *all* of its side-effects should be // handled by the aligner. // daehwan - check this out - this doesn't seem to work with SAWHI // size_t trimBeg = fw ? trim5p : trim3p; size_t trimBeg = trim5p; if(trimBeg > 0) { for(size_t i = 0; i < ned_->size(); i++) { // Shift by trim5p, since edits are w/r/t 5p end assert_geq((*ned_)[i].pos, trimBeg); (*ned_)[i].pos -= (uint32_t)trimBeg; } } // Length after all soft trimming and any hard trimming that occurred // during alignment rdextent_ = rdlen; if(pretrimSoft_) { rdextent_ -= (pretrim5p + pretrim3p); // soft trim } rdextent_ -= (trim5p + trim3p); // soft or hard trim from alignment assert_gt(rdextent_, 0); rdexrows_ = rdextent_; calcRefExtent(); refival_.init(id, off, fw, rfextent_); reflen_ = reflen; shapeSet_ = true; } /** * Initialize new AlnRes. 
*/ void AlnRes::init( size_t rdlen, // # chars after hard trimming TReadId rdid, // read ID AlnScore score, // alignment score const EList* ned, // nucleotide edits size_t ned_i, // first position to copy size_t ned_n, // # positions to copy const EList* aed, // ambiguous base resolutions size_t aed_i, // first position to copy size_t aed_n, // # positions to copy Coord refcoord, // leftmost ref pos of 1st al char TRefOff reflen, // length of ref aligned to LinkedEList >* raw_edits, int seedmms, // # seed mms allowed int seedlen, // seed length int seedival, // space between seeds int64_t minsc, // minimum score for valid aln int nuc5p, int nuc3p, bool pretrimSoft, size_t pretrim5p, // trimming prior to alignment size_t pretrim3p, // trimming prior to alignment bool trimSoft, size_t trim5p, // trimming from alignment size_t trim3p, // trimming from alignment bool repeat) // repeat { assert(raw_edits != NULL); assert(raw_edits_ == NULL || raw_edits_ == raw_edits); raw_edits_ = raw_edits; if(ned_ != NULL) { assert(aed_ != NULL); ned_->clear(); aed_->clear(); } else if(raw_edits_ != NULL) { assert(aed_ == NULL); assert(ned_node_ == NULL && aed_node_ == NULL); ned_node_ = raw_edits_->new_node(); aed_node_ = raw_edits_->new_node(); assert(ned_node_ != NULL && aed_node_ != NULL); ned_ = &(ned_node_->payload); aed_ = &(aed_node_->payload); } rdlen_ = rdlen; rdid_ = rdid; rdrows_ = rdlen; score_ = score; ned_->clear(); aed_->clear(); if(ned != NULL) { for(size_t i = ned_i; i < ned_i + ned_n; i++) { ned_->push_back((*ned)[i]); } } if(aed != NULL) { for(size_t i = aed_i; i < aed_i + aed_n; i++) { aed_->push_back((*aed)[i]); } } refcoord_ = refcoord; reflen_ = reflen; seedmms_ = seedmms; seedlen_ = seedlen; seedival_ = seedival; minsc_ = minsc; nuc5p_ = nuc5p; nuc3p_ = nuc3p; pretrimSoft_ = pretrimSoft; pretrim5p_ = pretrim5p; pretrim3p_ = pretrim3p; trimSoft_ = trimSoft; trim5p_ = trim5p; trim3p_ = trim3p; repeat_ = repeat; rdextent_ = rdlen; // # read characters after any 
hard trimming if(pretrimSoft) { rdextent_ -= (pretrim5p + pretrim3p); } if(trimSoft) { rdextent_ -= (trim5p + trim3p); } rdexrows_ = rdextent_; calcRefExtent(); setShape( refcoord.ref(), // id of reference aligned to refcoord.off(), // offset of first aligned char into ref seq reflen, // length of reference sequence aligned to refcoord.fw(), // aligned to Watson strand? rdlen, // length of read after hard trimming, before soft rdid, // read ID pretrimSoft, // whether trimming prior to alignment was soft pretrim5p, // # poss trimmed form 5p end before alignment pretrim3p, // # poss trimmed form 3p end before alignment trimSoft, // whether local-alignment trimming was soft trim5p, // # poss trimmed form 5p end during alignment trim3p); // # poss trimmed form 3p end during alignment shapeSet_ = true; num_spliced_ = 0; for(size_t i = 0; i < ned_->size(); i++) { if((*ned_)[i].type == EDIT_TYPE_SPL) { num_spliced_++; } } } /** * Clip given number of characters from the Watson-upstream end of the * alignment. */ void AlnRes::clipLeft(size_t rd_amt, size_t rf_amt) { assert_geq(rd_amt, 0); assert_geq(rf_amt, 0); assert_leq(rd_amt, rdexrows_); assert_leq(rf_amt, rfextent_); assert(trimSoft_); if(fw()) { trim5p_ += rd_amt; Edit::clipLo(*ned_, rdexrows_, rd_amt); Edit::clipLo(*aed_, rdexrows_, rd_amt); } else { trim3p_ += rd_amt; Edit::clipHi(*ned_, rdexrows_, rd_amt); Edit::clipHi(*aed_, rdexrows_, rd_amt); } rdexrows_ -= rd_amt; rdextent_ -= rd_amt; rfextent_ -= rf_amt; refcoord_.adjustOff(rf_amt); refival_.adjustOff(rf_amt); // Adjust refns_? } /** * Clip given number of characters from the Watson-downstream end of the * alignment. 
*/ void AlnRes::clipRight(size_t rd_amt, size_t rf_amt) { assert_geq(rd_amt, 0); assert_geq(rf_amt, 0); assert_leq(rd_amt, rdexrows_); assert_leq(rf_amt, rfextent_); assert(trimSoft_); if(fw()) { trim3p_ += rd_amt; Edit::clipHi(*ned_, rdexrows_, rd_amt); Edit::clipHi(*aed_, rdexrows_, rd_amt); } else { trim5p_ += rd_amt; Edit::clipLo(*ned_, rdexrows_, rd_amt); Edit::clipLo(*aed_, rdexrows_, rd_amt); } rdexrows_ -= rd_amt; rdextent_ -= rd_amt; rfextent_ -= rf_amt; // Adjust refns_? } /** * Clip away portions of the alignment that are outside the given bounds. * Clipping is soft if soft == true, hard otherwise. Assuming for now that * there isn't any other clipping. * * Note that all clipping is expressed in terms of read positions. So if there * are reference gaps in the overhanging portion, we must */ void AlnRes::clipOutside(bool soft, TRefOff refi, TRefOff reff) { // Overhang on LHS TRefOff left = refcoord_.off(); if(left < refi) { size_t rf_amt = (size_t)(refi - left); size_t rf_i = rf_amt; size_t nedsz = ned_->size(); if(!fw()) { Edit::invertPoss(*ned_, rdexrows_, false); } for(size_t i = 0; i < nedsz; i++) { assert_lt((*ned_)[i].pos, rdexrows_); if((*ned_)[i].pos > rf_i) break; if((*ned_)[i].isRefGap()) rf_i++; } if(!fw()) { Edit::invertPoss(*ned_, rdexrows_, false); } clipLeft(rf_i, rf_amt); } // Overhang on RHS TRefOff right = refcoord_.off() + refNucExtent(); if(right > reff) { size_t rf_amt = (size_t)(right - reff); size_t rf_i = rf_amt; size_t nedsz = ned_->size(); if(fw()) { Edit::invertPoss(*ned_, rdexrows_, false); } for(size_t i = 0; i < nedsz; i++) { assert_lt((*ned_)[i].pos, rdexrows_); if((*ned_)[i].pos > rf_i) break; if((*ned_)[i].isRefGap()) rf_i++; } if(fw()) { Edit::invertPoss(*ned_, rdexrows_, false); } clipRight(rf_i, rf_amt); } } /** * Return true iff this AlnRes and the given AlnRes overlap. Two AlnRess * overlap if they share a cell in the overall dynamic programming table: * i.e. if there exists a read position s.t. 
that position in both reads
 * matches up with the same reference character.  E.g., the following
 * alignments (drawn schematically as paths through a dynamic programming
 * table) are redundant:
 *
 *  a  b           a  b
 *   \  \           \  \
 *    \  \           \  \
 *     \  \           \  \
 *      ---\           \  \
 *          \           ---\---
 *       ---\            \  \
 *        \  \            \  \
 *         \  \            \  \
 *          \  \            \  \
 *          a  b            a  b
 *
 * We iterate over each read position that hasn't been hard-trimmed, but
 * only overlaps at positions that have also not been soft-trimmed are
 * considered.
 *
 * Both alignments may have their edits temporarily inverted while the
 * comparison runs (which is why 'res' is taken by non-const reference); the
 * inversion is undone before returning.
 */
bool AlnRes::overlap(AlnRes& res) {
	if(fw() != res.fw() || refid() != res.refid()) {
		// Must be same reference and same strand in order to overlap
		return false;
	}
	TRefOff my_left     = refoff();     // my leftmost aligned char
	TRefOff other_left  = res.refoff(); // other leftmost aligned char
	TRefOff my_right    = my_left    + refExtent();
	TRefOff other_right = other_left + res.refExtent();
	if(my_right < other_left || other_right < my_left) {
		// The rectangular hulls of the two alignments don't overlap, so
		// they can't overlap at any cell
		return false;
	}
	// Reference and strand are the same and hulls overlap.  Now go read
	// position by read position testing if any align identically with the
	// reference.

	// Edits are ordered and indexed from 5' to 3' to start with.  We
	// reorder them to go from left to right along the Watson strand.
	if(!fw()) {
		invertEdits();
	}
	if(!res.fw()) {
		res.invertEdits();
	}
	size_t nedidx = 0, onedidx = 0;
	bool olap = false;
	// For each row, going left to right along Watson reference strand...
	// NOTE(review): the row count comes from this alignment only; presumably
	// both alignments cover the same number of rows - confirm with callers.
	for(size_t i = 0; i < rdexrows_; i++) {
		size_t diff = 1;  // amount to shift to right for next round
		size_t odiff = 1; // amount to shift to right for next round
		// Unless there are insertions before the next position, we say
		// that there is one cell in this row involved in the alignment
		my_right = my_left + 1;
		other_right = other_left + 1;
		// Consume this alignment's edits at row i
		while(nedidx < ned_->size() && (*ned_)[nedidx].pos == i) {
			if((*ned_)[nedidx].isRefGap()) {
				// Next my_left will be in same column as this round
				diff = 0;
			}
			nedidx++;
		}
		// Consume the other alignment's edits at row i
		while(onedidx < res.ned_->size() && (*res.ned_)[onedidx].pos == i) {
			if((*res.ned_)[onedidx].isRefGap()) {
				// Next other_left will be in same column as this round
				odiff = 0;
			}
			onedidx++;
		}
		if(i < rdexrows_ - 1) {
			// See how many inserts there are before the next read
			// character.  This is a look-ahead only: the main edit
			// cursors are not advanced here.
			size_t nedidx_next  = nedidx;
			size_t onedidx_next = onedidx;
			while(nedidx_next < ned_->size() &&
			      (*ned_)[nedidx_next].pos == i+1)
			{
				if((*ned_)[nedidx_next].isReadGap()) {
					my_right++;
				}
				nedidx_next++;
			}
			while(onedidx_next < res.ned_->size() &&
			      (*res.ned_)[onedidx_next].pos == i+1)
			{
				if((*res.ned_)[onedidx_next].isReadGap()) {
					other_right++;
				}
				onedidx_next++;
			}
		}
		// Contained?
		olap =
			(my_left >= other_left && my_right <= other_right) ||
			(other_left >= my_left && other_right <= my_right);
		// Overlapping but not contained?
		if(!olap) {
			olap =
				(my_left <= other_left && my_right > other_left) ||
				(other_left <= my_left && other_right > my_left);
		}
		if(olap) {
			break;
		}
		// Move both left edges on to the next row: normally one column
		// past the cells covered in this row, or the same column when a
		// reference gap was seen in this row
		my_left = my_right + diff - 1;
		other_left = other_right + odiff - 1;
	}
	// Restore the original 5'-to-3' edit order
	if(!fw()) {
		invertEdits();
	}
	if(!res.fw()) {
		res.invertEdits();
	}
	return olap;
}

#ifndef NDEBUG

/**
 * Assuming this AlnRes is an alignment for 'rd', check that the alignment and
 * 'rd' are compatible with the corresponding reference sequence.
*/
// Debug-only check.  The reference stretch(es) covered by the alignment are
// fetched (one stretch per spliced segment), the reference is independently
// reconstructed from the read plus the edits, and the two are compared
// position by position.  On a mismatch a diagnostic is written to cerr.
// Returns true iff every position agrees.
//
// NOTE(review): several template argument lists in this function appear to
// have been lost from the text (`SStringExpandable&`, `EList&`,
// `const_cast&>`, `reinterpret_cast(`, `max(`).  Restore them from version
// control before building; they are left exactly as found here.
bool AlnRes::matchesRef(
	const Read& rd,
	const BitPairReference& ref,
	BTDnaString& rf,
	BTDnaString& rdseq,
	BTString& qseq,
	SStringExpandable& raw_refbuf,
	SStringExpandable& destU32,
	EList& matches,
	SStringExpandable& raw_refbuf2,
	EList& reflens,
	EList& refoffs)
{
	assert(!empty());
	assert(repOk());
	assert(refcoord_.inited());
	size_t rdlen = rd.length();
	bool fw = refcoord_.fw();
	// Flip edit positions for the scan below when aligned to the Crick
	// strand; they are flipped back after the scan
	if(!fw) {
		assert_lt(trim3p_, rdlen);
		Edit::invertPoss(const_cast&>(*ned_), rdlen - trim5p_ - trim3p_, false);
	}
	// Walk the read rows, tracking the length and starting offset of each
	// reference segment; a splice edit closes the current segment and opens
	// the next one 'splLen' positions further along
	size_t refallen = 0; // total reference length over all segments
	reflens.clear();
	refoffs.clear();
	int64_t reflen = 0;
	int64_t refoff = refcoord_.off();
	refoffs.push_back((uint32_t)refoff);
	size_t eidx = 0;
	assert_lt(trim5p_ + trim3p_, rdlen);
	for(size_t i = 0; i < rdlen - trim5p_ - trim3p_; i++, reflen++, refoff++) {
		while(eidx < ned_->size() && (*ned_)[eidx].pos == i) {
			if((*ned_)[eidx].isReadGap()) {
				reflen++; refoff++;
			} else if((*ned_)[eidx].isRefGap()) {
				reflen--; refoff--;
			}
			if((*ned_)[eidx].isSpliced()) {
				assert_gt(reflen, 0);
				refallen += (uint32_t)reflen;
				reflens.push_back((uint32_t)reflen);
				reflen = 0;
				refoff += (*ned_)[eidx].splLen;
				assert_gt(refoff, 0);
				refoffs.push_back((uint32_t)refoff);
			}
			eidx++;
		}
	}
	// Close the final segment
	assert_gt(reflen, 0);
	refallen += (uint32_t)reflen;
	reflens.push_back((uint32_t)reflen);
	assert_gt(reflens.size(), 0);
	assert_gt(refoffs.size(), 0);
	assert_eq(reflens.size(), refoffs.size());
	if(!fw) {
		assert_lt(trim3p_, rdlen);
		Edit::invertPoss(const_cast&>(*ned_), rdlen - trim5p_ - trim3p_, false);
	}
	// Adjust reference string length according to edits
#ifndef NDEBUG
	if(reflens.size() == 1) {
		assert_eq(refallen, refNucExtent());
	}
#endif
	assert_geq(refcoord_.ref(), 0);
	// A negative starting offset means the alignment hangs off the left end
	// of the reference; those positions are expected to be Ns (see the
	// comparison loop below)
	int nsOnLeft = 0;
	if(refcoord_.off() < 0) {
		nsOnLeft = -((int)refcoord_.off());
	}
	raw_refbuf.resize(refallen);
	raw_refbuf.clear();
	raw_refbuf2.clear();
	// Fetch each reference segment and concatenate them into raw_refbuf
	for(size_t i = 0; i < reflens.size(); i++) {
		assert_gt(reflens[i], 0);
#ifndef NDEBUG
		if(i > 0) {
			assert_gt(refoffs[i], refoffs[i-1]);
		}
#endif
		// 16 extra characters of slack; getStretch() reports via 'off'
		// where in the buffer the requested stretch actually begins
		raw_refbuf2.resize(reflens[i] + 16);
		raw_refbuf2.clear();
		int off = ref.getStretch(
			reinterpret_cast(raw_refbuf2.wbuf()),
			(size_t)refcoord_.ref(),
			(size_t)max(refoffs[i], 0),
			reflens[i],
			destU32);
		assert_leq(off, 16);
		raw_refbuf.append(raw_refbuf2.wbuf() + off, reflens[i]);
	}
	char *refbuf = raw_refbuf.wbuf();
	// Total soft trimming on each end (alignment-time plus pre-alignment)
	size_t trim5 = 0, trim3 = 0;
	if(trimSoft_) {
		trim5 += trim5p_;
		trim3 += trim3p_;
	}
	if(pretrimSoft_) {
		trim5 += pretrim5p_;
		trim3 += pretrim3p_;
	}
	rf.clear();
	rdseq.clear();
	rdseq = rd.patFw;
	if(!fw) {
		rdseq.reverseComp(false);
	}
	assert_eq(rdrows_, rdseq.length());
	// rdseq is the nucleotide sequence from upstream to downstream on the
	// Watson strand.  ned_ are the nucleotide edits from upstream to
	// downstream.  rf contains the reference characters.
	assert(Edit::repOk(*ned_, rdseq, fw, trim5, trim3));
	Edit::toRef(rdseq, *ned_, rf, fw, trim5, trim3);
	assert_eq(refallen, rf.length());
	matches.clear();
	bool matchesOverall = true;
	matches.resize(refallen);
	matches.fill(true);
	// Compare the reconstructed reference against the fetched one
	for(size_t i = 0; i < refallen; i++) {
		if((int)i < nsOnLeft) {
			// Off the left end of the reference: must be an N (code 4)
			if((int)rf[i] != 4) {
				matches[i] = false;
				matchesOverall = false;
			}
		} else {
			if((int)rf[i] != (int)refbuf[i-nsOnLeft]) {
				matches[i] = false;
				matchesOverall = false;
			}
		}
	}
	if(!matchesOverall) {
		// Print a friendly message showing the difference between the
		// reference sequence obtained with Edit::toRef and the actual
		// reference sequence
		cerr << endl;
		Edit::printQAlignNoCheck(
			cerr,
			" ",
			rdseq,
			*ned_);
		cerr << " ";
		// One marker per reference position: '*' where the two disagree
		for(size_t i = 0; i < refallen; i++) {
			cerr << (matches[i] ? " " : "*");
		}
		cerr << endl;
		cerr << " ";
		for(size_t i = 0; i < refallen-nsOnLeft; i++) {
			cerr << "ACGTN"[(int)refbuf[i]];
		}
		cerr << endl;
		Edit::printQAlign(
			cerr,
			" ",
			rdseq,
			*ned_);
		cerr << endl;
	}
	return matchesOverall;
}

#endif /*ndef NDEBUG*/

// Copy the NUL-terminated contents of 'buf' to the location 'occ' points at,
// advancing 'occ' past the copied characters.  No terminator is written.
// Both 'buf' and 'occ' must be variables in scope where the macro is used.
#define COPY_BUF() { \
	char *bufc = buf; \
	while(*bufc != '\0') { \
		*occ = *bufc; \
		occ++; \
		bufc++; \
	} \
}

/**
 * Initialize the stacked alignment with respect to a read string, a list of
 * edits (expressed left-to-right), and integers indicating how much hard and
 * soft trimming has occurred on either end of the read.
 *
 * s: read sequence
 * ed: all relevant edits, including ambiguous nucleotides
 * trimLS: # bases soft-trimmed from LHS
 * trimLH: # bases hard-trimmed from LHS
 * trimRS: # bases soft-trimmed from RHS
 * trimRH: # bases hard-trimmed from RHS
 *
 * One column is pushed onto the stacks per aligned position: the reference
 * character, the relation ('=' match, 'X' mismatch, 'I' reference gap,
 * 'D' read gap, 'N' splice), whether the edit carries a SNP id, and the
 * read character.  Each splice also records its length in stackSkip_.
 *
 * NOTE(review): the element type of 'ed' appears to have been lost from the
 * text (`const EList& ed`); it is left exactly as found.
 */
void StackedAln::init(
	const BTDnaString& s,
	const EList& ed,
	size_t trimLS,
	size_t trimLH,
	size_t trimRS,
	size_t trimRH)
{
	trimLS_ = trimLS;
	trimLH_ = trimLH;
	trimRS_ = trimRS;
	trimRH_ = trimRH;
	ASSERT_ONLY(size_t ln_postsoft = s.length() - trimLS - trimRS);
	// NOTE(review): stackSkip_ is appended to below but is not cleared here
	// along with the other stacks; presumably it is cleared elsewhere before
	// this object is reused - confirm.
	stackRef_.clear();
	stackRel_.clear();
	stackSNP_.clear();
	stackRead_.clear();
	size_t rdoff = trimLS; // next read offset to emit; skips the LHS soft trim
	for(size_t i = 0; i < ed.size(); i++) {
		assert_lt(ed[i].pos, ln_postsoft);
		// Edit positions are relative to the soft-trimmed read
		size_t pos = ed[i].pos + trimLS;
		// Everything between the previous edit and this one is a match
		while(rdoff < pos) {
			int c = s[rdoff++];
			assert_range(0, 4, c);
			stackRef_.push_back("ACGTN"[c]);
			stackRel_.push_back('=');
			stackSNP_.push_back(false);
			stackRead_.push_back("ACGTN"[c]);
		}
		if(ed[i].isMismatch()) {
			// Consumes one read character
			int c = s[rdoff++];
			assert_range(0, 4, c);
			assert_eq(c, asc2dna[(int)ed[i].qchr]);
			assert_neq(c, asc2dna[(int)ed[i].chr]);
			stackRef_.push_back(ed[i].chr);
			stackRel_.push_back('X');
			stackSNP_.push_back(ed[i].snpID != (uint32_t)INDEX_MAX);
			stackRead_.push_back("ACGTN"[c]);
		} else if(ed[i].isRefGap()) {
			// Consumes one read character; gap on the reference side
			int c = s[rdoff++];
			assert_range(0, 4, c);
			assert_eq(c, asc2dna[(int)ed[i].qchr]);
			stackRef_.push_back('-');
			stackRel_.push_back('I');
			stackSNP_.push_back(ed[i].snpID != (uint32_t)INDEX_MAX);
			stackRead_.push_back("ACGTN"[c]);
		} else if(ed[i].isReadGap()) {
			// Consumes no read character; gap on the read side
			stackRef_.push_back(ed[i].chr);
			stackRel_.push_back('D');
			stackSNP_.push_back(ed[i].snpID != (uint32_t)INDEX_MAX);
			stackRead_.push_back('-');
		} else if(ed[i].isSpliced()) {
			// Consumes no read character; one placeholder column per splice
			stackRef_.push_back('N');
			stackRel_.push_back('N');
			stackSNP_.push_back(false);
			stackRead_.push_back('N');
			assert_gt(ed[i].splLen, 0);
			stackSkip_.push_back(ed[i].splLen);
		}
	}
	// Remaining read characters up to the RHS soft trim are matches
	while(rdoff < s.length() - trimRS) {
		int c = s[rdoff++];
		assert_range(0, 4, c);
		stackRef_.push_back("ACGTN"[c]);
		stackRel_.push_back('=');
		stackSNP_.push_back(false);
		stackRead_.push_back("ACGTN"[c]);
	}
	inited_ = true;
}

/**
 * Left-align all the gaps.  If this changes the alignment and the CIGAR or
 * MD:Z strings have already been calculated, this renders them invalid.
 *
 * We left-align gaps with in the following way: for each gap, we check
 * whether the character opposite the rightmost gap character is the same
 * as the character opposite the character just to the left of the gap.  If
 * this is the case, we can slide the gap to the left and make the
 * rightmost position previously covered by the gap into a non-gap.
 *
 * This scheme allows us to push the gap past a mismatch.  BWA does seem to
 * allow this.  It's not clear that Bowtie 2 should, since moving the
 * mismatch could cause a mismatch with one base quality to be replaced
 * with a mismatch with a different base quality.
*/ void StackedAln::leftAlign(bool pastMms) { assert(inited_); bool changed = false; size_t ln = stackRef_.size(); // Scan left-to-right for(size_t i = 0; i < ln; i++) { int rel = stackRel_[i]; if(rel != '=' && rel != 'X' && rel != 'N') { // Neither a match nor a mismatch - must be a gap assert(rel == 'I' || rel == 'D'); if(stackSNP_[i]) continue; size_t glen = 1; // Scan further right to measure length of gap for(size_t j = i+1; j < ln; j++) { if(rel != (int)stackRel_[j]) break; glen++; } // We've identified a gap of type 'rel' (D = deletion or read // gap, I = insertion or ref gap) with length 'glen'. Now we // can try to slide it to the left repeatedly. size_t l = i - 1; size_t r = l + glen; EList& gp = ((rel == 'I') ? stackRef_ : stackRead_); const EList& ngp = ((rel == 'I') ? stackRead_ : stackRef_); while(l > 0 && ngp[l] == ngp[r]) { if(stackRel_[l] == 'I' || stackRel_[l] == 'D') break; assert(stackRel_[l] == '=' || stackRel_[l] == 'X' || stackRel_[l] == 'N'); assert(stackRel_[r] == 'D' || stackRel_[r] == 'I'); if(!pastMms && (stackRel_[l] == 'X' || stackRel_[l] == 'N')) { break; } swap(gp[l], gp[r]); swap(stackRel_[l], stackRel_[r]); assert_neq('-', gp[r]); assert_eq('-', gp[l]); l--; r--; changed = true; } i += (glen-1); } } if(changed) { cigCalc_ = mdzCalc_ = false; } } /** * Build the CIGAR list, if it hasn't already built. Returns true iff it * was built for the first time. 
*/ bool StackedAln::buildCigar(bool xeq) { assert(inited_); if(cigCalc_) { return false; // already done } cigOp_.clear(); cigRun_.clear(); if(trimLS_ > 0) { cigOp_.push_back('S'); cigRun_.push_back(trimLS_); } size_t numSkips = 0; size_t ln = stackRef_.size(); for(size_t i = 0; i < ln; i++) { char op = stackRel_[i]; if(!xeq && (op == 'X' || op == '=')) { op = 'M'; } size_t run; if(op != 'N') { run = 1; for(; i + run < ln; run++) { char op2 = stackRel_[i + run]; if(!xeq && (op2 == 'X' || op2 == '=')) { op2 = 'M'; } if(op2 != op) { break; } } i += (run-1); } else { assert_lt(numSkips, stackSkip_.size()); run = stackSkip_[numSkips]; numSkips++; } cigOp_.push_back(op); cigRun_.push_back(run); } if(trimRS_ > 0) { cigOp_.push_back('S'); cigRun_.push_back(trimRS_); } cigCalc_ = true; return true; } /** * Build the CIGAR list, if it hasn't already built. Returns true iff it * was built for the first time. */ bool StackedAln::buildMdz() { assert(inited_); if(mdzCalc_) { return false; // already done } mdzOp_.clear(); mdzChr_.clear(); mdzRun_.clear(); size_t ln = stackRef_.size(); for(size_t i = 0; i < ln; i++) { char op = stackRel_[i]; if(op == '=') { size_t run = 1; size_t ninserts = 0; size_t nskips = 0; // Skip over matches and insertions (ref gaps) for(; i+run < ln; run++) { if(stackRel_[i + run] == '=') { // do nothing } else if(stackRel_[i + run] == 'I') { ninserts++; } else if(stackRel_[i + run] == 'N') { nskips++; } else { break; } } i += (run - 1); mdzOp_.push_back('='); // = X or G mdzChr_.push_back('-'); mdzRun_.push_back(run - ninserts - nskips); } else if(op == 'X') { assert_neq(stackRef_[i], stackRead_[i]); mdzOp_.push_back('X'); // = X or G mdzChr_.push_back(stackRef_[i]); mdzRun_.push_back(1); } else if(op == 'D') { assert_neq('-', stackRef_[i]); mdzOp_.push_back('G'); // = X or G mdzChr_.push_back(stackRef_[i]); mdzRun_.push_back(1); } } mdzCalc_ = true; return true; } /** * Write a CIGAR representation of the alignment to the given string and/or * char 
buffer. */ void StackedAln::writeCigar( BTString* o, // if non-NULL, string to append to char* occ) const // if non-NULL, character string to append to { const EList& op = cigOp_; const EList& run = cigRun_; assert_eq(op.size(), run.size()); if(o != NULL || occ != NULL) { char buf[128]; ASSERT_ONLY(bool printed = false); for(size_t i = 0; i < op.size(); i++) { size_t r = run[i]; if(r > 0) { itoa10(r, buf); ASSERT_ONLY(printed = true); if(o != NULL) { o->append(buf); o->append(op[i]); } if(occ != NULL) { COPY_BUF(); *occ = op[i]; occ++; } } } assert(printed); if(occ != NULL) { *occ = '\0'; } } } void StackedAln::writeCigar(Alignment* o, char* occ) const { const EList& op = cigOp_; const EList& run = cigRun_; assert_eq(op.size(), run.size()); if(o != NULL || occ != NULL) { char buf[128]; ASSERT_ONLY(bool printed = false); o->cigarSegments.reserve(op.size()); for(size_t i = 0; i < op.size(); i++) { size_t r = run[i]; if(r > 0) { itoa10(r, buf); ASSERT_ONLY(printed = true); if(o != NULL) { o->cigarString.append(buf); o->cigarString.append(op[i]); o->cigarSegments.emplace_back(r, op[i]); o->cigarLength += r; } if(occ != NULL) { COPY_BUF(); *occ = op[i]; occ++; } } } assert(printed); if(occ != NULL) { *occ = '\0'; } } } /** * Write an MD:Z representation of the alignment to the given string and/or * char buffer. 
*/ void StackedAln::writeMdz(BTString* o, char* occ) const { char buf[128]; bool mm_last = false; bool rdgap_last = false; bool first_print = true; const EList& op = mdzOp_; const EList& ch = mdzChr_; const EList& run = mdzRun_; for(size_t i = 0; i < op.size(); i++) { size_t r = run[i]; if(r > 0) { if(op[i] == '=') { // Write run length itoa10(r, buf); if(o != NULL) { o->append(buf); } if(occ != NULL) { COPY_BUF(); } first_print = false; mm_last = false; rdgap_last = false; } else if(op[i] == 'X') { if(o != NULL) { if(rdgap_last || mm_last || first_print) { o->append('0'); } o->append(ch[i]); } if(occ != NULL) { if(rdgap_last || mm_last || first_print) { *occ = '0'; occ++; } *occ = ch[i]; occ++; } first_print = false; mm_last = true; rdgap_last = false; } else if(op[i] == 'G') { if(o != NULL) { if(mm_last || first_print) { o->append('0'); } if(!rdgap_last) { o->append('^'); } o->append(ch[i]); } if(occ != NULL) { if(mm_last || first_print) { *occ = '0'; occ++; } if(!rdgap_last) { *occ = '^'; occ++; } *occ = ch[i]; occ++; } first_print = false; mm_last = false; rdgap_last = true; } } // if r > 0 } // for loop over ops if(mm_last || rdgap_last) { if(o != NULL) { o->append('0'); } if(occ != NULL) { *occ = '0'; occ++; } } if(occ != NULL) { *occ = '\0'; } } /** * Print the sequence for the read that aligned using A, C, G and * T. This will simply print the read sequence (or its reverse * complement). 
*/ void AlnRes::printSeq( const Read& rd, // read const BTDnaString* dns, // already-decoded nucleotides BTString& o) const // buffer to write to { assert(!rd.patFw.empty()); ASSERT_ONLY(size_t written = 0); // Print decoded nucleotides assert(dns != NULL); size_t len = dns->length(); size_t st = 0; size_t en = len; for(size_t i = st; i < en; i++) { int c = dns->get(i); assert_range(0, 3, c); o.append("ACGT"[c]); ASSERT_ONLY(written++); } #ifndef NDEBUG for(size_t i = 0; i < ned_->size(); i++) { if((*ned_)[i].isReadGap()) { assert_leq((*ned_)[i].pos, dns->length()); } else { assert_lt((*ned_)[i].pos, dns->length()); } } #endif } /** * Print the quality string for the read that aligned. This will simply print * the read qualities (or their reverse). */ void AlnRes::printQuals( const Read& rd, // read const BTString* dqs, // already-decoded qualities BTString& o) const // output stream to write to { assert(dqs != NULL); size_t len = dqs->length(); // Print decoded qualities from upstream to downstream Watson for(size_t i = 1; i < len-1; i++) { o.append(dqs->get(i)); } } /** * Add all of the cells involved in the given alignment to the database. */ void RedundantAlns::add(const AlnRes& res) { assert(!cells_.empty()); TRefOff left = res.refoff(), right; const size_t len = res.readExtentRows(); if(!res.fw()) { const_cast(res).invertEdits(); } const EList& ned = res.ned(); size_t nedidx = 0; assert_leq(len, cells_.size()); // For each row... 
for(size_t i = 0; i < len; i++) { size_t diff = 1; // amount to shift to right for next round right = left + 1; while(nedidx < ned.size() && ned[nedidx].pos == i) { if(ned[nedidx].isRefGap()) { // Next my_left will be in same column as this round diff = 0; } nedidx++; } if(i < len - 1) { // See how many inserts there are before the next read // character size_t nedidx_next = nedidx; while(nedidx_next < ned.size() && ned[nedidx_next].pos == i+1) { if(ned[nedidx_next].isReadGap()) { right++; } nedidx_next++; } } for(TRefOff j = left; j < right; j++) { // Add to db RedundantCell c(res.refid(), res.fw(), j, i); ASSERT_ONLY(bool ret =) cells_[i].insert(c); assert(ret); } left = right + diff - 1; } if(!res.fw()) { const_cast(res).invertEdits(); } } /** * Return true iff the given alignment has at least one cell that overlaps * one of the cells in the database. */ bool RedundantAlns::overlap(const AlnRes& res) { assert(!cells_.empty()); TRefOff left = res.refoff(), right; const size_t len = res.readExtentRows(); if(!res.fw()) { const_cast(res).invertEdits(); } const EList& ned = res.ned(); size_t nedidx = 0; // For each row... 
bool olap = false; assert_leq(len, cells_.size()); for(size_t i = 0; i < len; i++) { size_t diff = 1; // amount to shift to right for next round right = left + 1; while(nedidx < ned.size() && ned[nedidx].pos == i) { if(ned[nedidx].isRefGap()) { // Next my_left will be in same column as this round diff = 0; } nedidx++; } if(i < len - 1) { // See how many inserts there are before the next read // character size_t nedidx_next = nedidx; while(nedidx_next < ned.size() && ned[nedidx_next].pos == i+1) { if(ned[nedidx_next].isReadGap()) { right++; } nedidx_next++; } } for(TRefOff j = left; j < right; j++) { // Add to db RedundantCell c(res.refid(), res.fw(), j, i); if(cells_[i].contains(c)) { olap = true; break; } } if(olap) { break; } left = right + diff - 1; } if(!res.fw()) { const_cast(res).invertEdits(); } return olap; } /** * Given all the paired and unpaired results involving mates #1 and #2, * calculate best and second-best scores for both mates. These are * used for future MAPQ calculations. 
*/ void AlnSetSumm::init( const Read* rd1, const Read* rd2, const EList* rs1, const EList* rs2, const EList* rs1u, const EList* rs2u, bool exhausted1, bool exhausted2, TRefId orefid, TRefOff orefoff, bool repeat) { assert(rd1 != NULL || rd2 != NULL); assert((rs1 == NULL) == (rs2 == NULL)); AlnScore best[2], secbest[2], bestPaired, secbestPaired; size_t szs[2]; best[0].invalidate(); secbest[0].invalidate(); best[1].invalidate(); secbest[1].invalidate(); bestPaired.invalidate(); secbestPaired.invalidate(); bool paired = (rs1 != NULL && rs2 != NULL); szs[0] = szs[1] = 0; TNumAlns numAlns1 = 0, numAlns2 = 0, numAlnsPaired = 0; if(paired) { // Paired alignments assert_eq(rs1->size(), rs2->size()); szs[0] = szs[1] = rs1->size(); assert_gt(szs[0], 0); numAlnsPaired = szs[0]; for(size_t i = 0; i < rs1->size(); i++) { AlnScore sc = (*rs1)[i].score() + (*rs2)[i].score(); if(sc > bestPaired) { secbestPaired = bestPaired; bestPaired = sc; assert(VALID_AL_SCORE(bestPaired)); } else if(sc > secbestPaired) { secbestPaired = sc; assert(VALID_AL_SCORE(bestPaired)); assert(VALID_AL_SCORE(secbestPaired)); } } } for(int j = 0; j < 2; j++) { const EList* rs = (j == 0 ? rs1u : rs2u); if(rs == NULL) { continue; } assert(rs != NULL); szs[j] = rs->size(); if(j == 0) { numAlns1 = szs[j]; } else { numAlns2 = szs[j]; } //assert_gt(szs[j], 0); for(size_t i = 0; i < rs->size(); i++) { AlnScore sc = (*rs)[i].score(); if(sc > best[j]) { secbest[j] = best[j]; best[j] = sc; assert(VALID_AL_SCORE(best[j])); } else if(sc > secbest[j]) { secbest[j] = sc; assert(VALID_AL_SCORE(best[j])); assert(VALID_AL_SCORE(secbest[j])); } } } if(szs[0] > 0 || szs[1] > 0) { init( best[0], secbest[0], best[1], secbest[1], bestPaired, secbestPaired, (szs[0] == 0) ? 0 : (szs[0] - 1), (szs[1] == 0) ? 
0 : (szs[1] - 1), paired, exhausted1, exhausted2, orefid, orefoff, repeat, numAlns1, numAlns2, numAlnsPaired); } else { reset(); orefid_ = orefid; orefoff_ = orefoff; repeat_ = repeat; } } /** * Print out string representation of YF:i flag for indicating whether and * why the mate was filtered. */ bool AlnFlags::printYF(BTString& o, bool first) const { const char *flag = ""; if (!lenfilt_) flag = "LN"; else if(!nfilt_ ) flag = "NS"; else if(!scfilt_ ) flag = "SC"; else if(!qcfilt_ ) flag = "QC"; if(*flag > 0) { if(!first) o.append('\t'); o.append("YF:Z:"); o.append(flag); return false; } return true; } /** * Print out string representation of YM:i flag for indicating with the * mate per se aligned repetitively. */ void AlnFlags::printYM(BTString& o) const { o.append("YM:i:"); o.append(maxed() ? '1' : '0'); } /** * Print out string representation of YM:i flag for indicating with the * pair containing the mate aligned repetitively. */ void AlnFlags::printYP(BTString& o) const { o.append("YP:i:"); o.append(maxedPair() ? '1' : '0'); } /** * Print out string representation of these flags. */ void AlnFlags::printYT(BTString& o) const { o.append("YT:Z:"); if(alignedConcordant()) { o.append("CP"); } else if(alignedDiscordant()) { o.append("DP"); } else if(alignedUnpairedMate()) { o.append("UP"); } else if(alignedUnpaired()) { o.append("UU"); } else { throw 1; } } #ifdef ALIGNER_RESULT_MAIN #include "mem_ids.h" int main() { EList op; EList ch; EList run; { // On top of each other, same length cerr << "Test case 1, simple overlap 1 ... 
"; AlnRes res1; res1.init( 10, AlnScore(), NULL, NULL, NULL, Coord(0, 0, true), false); AlnRes res2; res2.init( 10, AlnScore(), NULL, NULL, NULL, Coord(0, 0, true), false); assert(res1.overlap(res2)); // Try again, but using the redundant-alignment database RedundantAlns ra; ra.reset(); ra.init(10); ra.add(res1); assert(ra.overlap(res1)); assert(ra.overlap(res2)); char buf1[1024]; res1.printCigar(false, false, false, op, run, NULL, buf1); assert_eq(0, strcmp(buf1, "10M")); res1.printCigar(false, false, true, op, run, NULL, buf1); assert_eq(0, strcmp(buf1, "10=")); char buf2[1024]; res2.printCigar(false, false, false, op, run, NULL, buf2); assert_eq(0, strcmp(buf2, "10M")); res2.printCigar(false, false, true, op, run, NULL, buf2); assert_eq(0, strcmp(buf2, "10=")); char buf3[1024]; res1.printMD(false, false, op, ch, run, NULL, buf3); assert_eq(0, strcmp(buf3, "10")); res1.printMD(false, true, op, ch, run, NULL, buf3); assert_eq(0, strcmp(buf3, "8")); char buf4[1024]; res2.printMD(false, false, op, ch, run, NULL, buf4); assert_eq(0, strcmp(buf4, "10")); res2.printMD(false, true, op, ch, run, NULL, buf4); assert_eq(0, strcmp(buf4, "8")); cerr << "PASSED" << endl; } { // On top of each other, different lengths cerr << "Test case 2, simple overlap 2 ... 
"; AlnRes res1; res1.init( 10, AlnScore(), NULL, NULL, NULL, Coord(0, 0, true), false); AlnRes res2; res2.init( 11, AlnScore(), NULL, NULL, NULL, Coord(0, 0, true), false); assert(res1.overlap(res2)); // Try again, but using the redundant-alignment database RedundantAlns ra; ra.reset(); ra.init(11); ra.add(res1); assert(ra.overlap(res1)); assert(ra.overlap(res2)); char buf1[1024]; res1.printCigar(false, false, false, op, run, NULL, buf1); assert_eq(0, strcmp(buf1, "10M")); res1.printCigar(false, false, true, op, run, NULL, buf1); assert_eq(0, strcmp(buf1, "10=")); char buf2[1024]; res2.printCigar(false, false, false, op, run, NULL, buf2); assert_eq(0, strcmp(buf2, "11M")); res2.printCigar(false, false, true, op, run, NULL, buf2); assert_eq(0, strcmp(buf2, "11=")); char buf3[1024]; res1.printMD(false, false, op, ch, run, NULL, buf3); assert_eq(0, strcmp(buf3, "10")); res1.printMD(false, true, op, ch, run, NULL, buf3); assert_eq(0, strcmp(buf3, "8")); char buf4[1024]; res2.printMD(false, false, op, ch, run, NULL, buf4); assert_eq(0, strcmp(buf4, "11")); res2.printMD(false, true, op, ch, run, NULL, buf4); assert_eq(0, strcmp(buf4, "9")); cerr << "PASSED" << endl; } { // Different references cerr << "Test case 3, simple overlap 3 ... "; AlnRes res1; res1.init( 10, AlnScore(), NULL, NULL, NULL, Coord(0, 1, true), false); AlnRes res2; res2.init( 11, AlnScore(), NULL, NULL, NULL, Coord(0, 0, true), false); assert(!res1.overlap(res2)); // Try again, but using the redundant-alignment database RedundantAlns ra; ra.reset(); ra.init(11); ra.add(res1); assert(ra.overlap(res1)); assert(!ra.overlap(res2)); cerr << "PASSED" << endl; } { // Different references cerr << "Test case 4, simple overlap 4 ... 
"; AlnRes res1; res1.init( 10, AlnScore(), NULL, NULL, NULL, Coord(0, 0, true), false); AlnRes res2; res2.init( 10, AlnScore(), NULL, NULL, NULL, Coord(1, 0, true), false); assert(!res1.overlap(res2)); // Try again, but using the redundant-alignment database RedundantAlns ra; ra.reset(); ra.init(10); ra.add(res1); assert(ra.overlap(res1)); assert(!ra.overlap(res2)); cerr << "PASSED" << endl; } { // Different strands cerr << "Test case 5, simple overlap 5 ... "; AlnRes res1; res1.init( 10, AlnScore(), NULL, NULL, NULL, Coord(0, 0, true), false); AlnRes res2; res2.init( 10, AlnScore(), NULL, NULL, NULL, Coord(0, 0, false), false); assert(!res1.overlap(res2)); // Try again, but using the redundant-alignment database RedundantAlns ra; ra.reset(); ra.init(10); ra.add(res1); assert(ra.overlap(res1)); assert(!ra.overlap(res2)); cerr << "PASSED" << endl; } { // Different strands cerr << "Test case 6, simple overlap 6 ... "; EList ned1(RES_CAT); ned1.expand(); // 1 step to the right in the middle of the alignment ned1.back().init(5, 'A' /*chr*/, '-' /*qchr*/, EDIT_TYPE_READ_GAP); AlnRes res1; res1.init( 10, AlnScore(), &ned1, NULL, NULL, Coord(0, 5, false), false); AlnRes res2; res2.init( 10, AlnScore(), NULL, NULL, NULL, Coord(0, 6, false), false); assert(res1.overlap(res2)); // Try again, but using the redundant-alignment database RedundantAlns ra; ra.reset(); ra.init(10); ra.add(res1); assert(ra.overlap(res1)); assert(ra.overlap(res2)); char buf1[1024]; res1.printCigar(false, false, false, op, run, NULL, buf1); assert_eq(0, strcmp(buf1, "5M1D5M")); res1.printCigar(false, false, true, op, run, NULL, buf1); assert_eq(0, strcmp(buf1, "5=1D5=")); char buf2[1024]; res2.printCigar(false, false, false, op, run, NULL, buf2); assert_eq(0, strcmp(buf2, "10M")); res2.printCigar(false, false, true, op, run, NULL, buf2); assert_eq(0, strcmp(buf2, "10=")); char buf3[1024]; res1.printMD(false, false, op, ch, run, NULL, buf3); assert_eq(0, strcmp(buf3, "5^A5")); res1.printMD(false, 
true, op, ch, run, NULL, buf3); assert_eq(0, strcmp(buf3, "4^A4")); char buf4[1024]; res2.printMD(false, false, op, ch, run, NULL, buf4); assert_eq(0, strcmp(buf4, "10")); res2.printMD(false, true, op, ch, run, NULL, buf4); assert_eq(0, strcmp(buf4, "8")); cerr << "PASSED" << endl; } { // Different strands cerr << "Test case 7, simple overlap 7 ... "; EList ned1(RES_CAT); // 3 steps to the right in the middle of the alignment ned1.push_back(Edit(5, 'A', '-', EDIT_TYPE_READ_GAP)); ned1.push_back(Edit(5, 'C', '-', EDIT_TYPE_READ_GAP)); ned1.push_back(Edit(5, 'G', '-', EDIT_TYPE_READ_GAP)); AlnRes res1; res1.init( 10, AlnScore(), &ned1, NULL, NULL, Coord(0, 5, false), false); AlnRes res2; res2.init( 10, AlnScore(), NULL, NULL, NULL, Coord(0, 6, false), false); assert(res1.overlap(res2)); // Try again, but using the redundant-alignment database RedundantAlns ra; ra.reset(); ra.init(10); ra.add(res1); assert(ra.overlap(res1)); assert(ra.overlap(res2)); char buf1[1024]; res1.printCigar(false, false, false, op, run, NULL, buf1); assert_eq(0, strcmp(buf1, "5M3D5M")); res1.printCigar(false, false, true, op, run, NULL, buf1); assert_eq(0, strcmp(buf1, "5=3D5=")); char buf2[1024]; res2.printCigar(false, false, false, op, run, NULL, buf2); assert_eq(0, strcmp(buf2, "10M")); res2.printCigar(false, false, true, op, run, NULL, buf2); assert_eq(0, strcmp(buf2, "10=")); char buf3[1024]; res1.printMD(false, false, op, ch, run, NULL, buf3); assert_eq(0, strcmp(buf3, "5^GCA5")); res1.printMD(false, true, op, ch, run, NULL, buf3); assert_eq(0, strcmp(buf3, "4^GCA4")); char buf4[1024]; res2.printMD(false, false, op, ch, run, NULL, buf4); assert_eq(0, strcmp(buf4, "10")); res2.printMD(false, true, op, ch, run, NULL, buf4); assert_eq(0, strcmp(buf4, "8")); cerr << "PASSED" << endl; } { // Both with horizontal movements; overlap cerr << "Test case 8, simple overlap 8 ... 
"; EList ned1(RES_CAT); // 2 steps to the right in the middle of the alignment ned1.push_back(Edit(5, 'A', '-', EDIT_TYPE_READ_GAP)); ned1.push_back(Edit(5, 'C', '-', EDIT_TYPE_READ_GAP)); AlnRes res1; res1.init( 10, AlnScore(), &ned1, NULL, NULL, Coord(0, 5, false), false); EList ned2(RES_CAT); // 2 steps to the right in the middle of the alignment ned2.push_back(Edit(5, 'A', '-', EDIT_TYPE_READ_GAP)); ned2.push_back(Edit(5, 'C', '-', EDIT_TYPE_READ_GAP)); AlnRes res2; res2.init( 10, AlnScore(), &ned2, NULL, NULL, Coord(0, 6, false), false); assert(res1.overlap(res2)); // Try again, but using the redundant-alignment database RedundantAlns ra; ra.reset(); ra.init(10); ra.add(res1); assert(ra.overlap(res1)); assert(ra.overlap(res2)); char buf1[1024]; res1.printCigar(false, false, false, op, run, NULL, buf1); assert_eq(0, strcmp(buf1, "5M2D5M")); res1.printCigar(false, false, true, op, run, NULL, buf1); assert_eq(0, strcmp(buf1, "5=2D5=")); char buf2[1024]; res2.printCigar(false, false, false, op, run, NULL, buf2); assert_eq(0, strcmp(buf2, "5M2D5M")); res2.printCigar(false, false, true, op, run, NULL, buf2); assert_eq(0, strcmp(buf2, "5=2D5=")); cerr << "PASSED" << endl; } { // Both with horizontal movements; no overlap cerr << "Test case 9, simple overlap 9 ... 
"; EList ned1(RES_CAT); // 2 steps to the right in the middle of the alignment ned1.push_back(Edit(6, 'A', '-', EDIT_TYPE_READ_GAP)); ned1.push_back(Edit(6, 'C', '-', EDIT_TYPE_READ_GAP)); AlnRes res1; res1.init( 10, AlnScore(), &ned1, NULL, NULL, Coord(0, 5, true), false); EList ned2(RES_CAT); // 2 steps to the right in the middle of the alignment ned2.push_back(Edit(5, 'A', '-', EDIT_TYPE_READ_GAP)); ned2.push_back(Edit(5, 'C', '-', EDIT_TYPE_READ_GAP)); AlnRes res2; res2.init( 10, AlnScore(), &ned2, NULL, NULL, Coord(0, 6, true), false); assert(!res1.overlap(res2)); // Try again, but using the redundant-alignment database RedundantAlns ra; ra.reset(); ra.init(10); ra.add(res1); assert(ra.overlap(res1)); assert(!ra.overlap(res2)); char buf1[1024]; res1.printCigar(false, false, false, op, run, NULL, buf1); assert_eq(0, strcmp(buf1, "6M2D4M")); res1.printCigar(false, false, true, op, run, NULL, buf1); assert_eq(0, strcmp(buf1, "6=2D4=")); char buf2[1024]; res2.printCigar(false, false, false, op, run, NULL, buf2); assert_eq(0, strcmp(buf2, "5M2D5M")); res2.printCigar(false, false, true, op, run, NULL, buf2); assert_eq(0, strcmp(buf2, "5=2D5=")); cerr << "PASSED" << endl; } { // Both with horizontal movements; no overlap. Reverse strand. cerr << "Test case 10, simple overlap 10 ... 
"; EList ned1(RES_CAT); // 2 steps to the right in the middle of the alignment ned1.push_back(Edit(5, 'A', '-', EDIT_TYPE_READ_GAP)); ned1.push_back(Edit(5, 'C', '-', EDIT_TYPE_READ_GAP)); AlnRes res1; res1.init( 10, AlnScore(), &ned1, NULL, NULL, Coord(0, 5, false), false); EList ned2(RES_CAT); // 2 steps to the right in the middle of the alignment ned2.push_back(Edit(6, 'A', '-', EDIT_TYPE_READ_GAP)); ned2.push_back(Edit(6, 'C', '-', EDIT_TYPE_READ_GAP)); AlnRes res2; res2.init( 10, AlnScore(), &ned2, NULL, NULL, Coord(0, 6, false), false); assert(!res1.overlap(res2)); // Try again, but using the redundant-alignment database RedundantAlns ra; ra.reset(); ra.init(10); ra.add(res1); assert(ra.overlap(res1)); assert(!ra.overlap(res2)); char buf1[1024]; res1.printCigar(false, false, false, op, run, NULL, buf1); assert_eq(0, strcmp(buf1, "5M2D5M")); res1.printCigar(false, false, true, op, run, NULL, buf1); assert_eq(0, strcmp(buf1, "5=2D5=")); char buf2[1024]; res2.printCigar(false, false, false, op, run, NULL, buf2); assert_eq(0, strcmp(buf2, "4M2D6M")); res2.printCigar(false, false, true, op, run, NULL, buf2); assert_eq(0, strcmp(buf2, "4=2D6=")); cerr << "PASSED" << endl; } { // Both with vertical movements; no overlap cerr << "Test case 11, simple overlap 11 ... 
"; EList ned1(RES_CAT); // 2 steps to the right in the middle of the alignment ned1.push_back(Edit(5, '-', 'A', EDIT_TYPE_REF_GAP)); ned1.push_back(Edit(6, '-', 'C', EDIT_TYPE_REF_GAP)); AlnRes res1; res1.init( 10, AlnScore(), &ned1, NULL, NULL, Coord(0, 5, true), false); EList ned2(RES_CAT); // 2 steps to the right in the middle of the alignment ned2.push_back(Edit(6, '-', 'A', EDIT_TYPE_REF_GAP)); ned2.push_back(Edit(7, '-', 'C', EDIT_TYPE_REF_GAP)); AlnRes res2; res2.init( 10, AlnScore(), &ned2, NULL, NULL, Coord(0, 6, true), false); assert(!res1.overlap(res2)); // Try again, but using the redundant-alignment database RedundantAlns ra; ra.reset(); ra.init(10); ra.add(res1); assert(ra.overlap(res1)); assert(!ra.overlap(res2)); char buf1[1024]; res1.printCigar(false, false, false, op, run, NULL, buf1); assert_eq(0, strcmp(buf1, "5M2I3M")); res1.printCigar(false, false, true, op, run, NULL, buf1); assert_eq(0, strcmp(buf1, "5=2I3=")); char buf2[1024]; res2.printCigar(false, false, false, op, run, NULL, buf2); assert_eq(0, strcmp(buf2, "6M2I2M")); res2.printCigar(false, false, true, op, run, NULL, buf2); assert_eq(0, strcmp(buf2, "6=2I2=")); cerr << "PASSED" << endl; } { // Both with vertical movements; no overlap cerr << "Test case 12, simple overlap 12 ... 
"; EList ned1(RES_CAT); // 2 steps to the right in the middle of the alignment ned1.push_back(Edit(5, '-', 'A', EDIT_TYPE_REF_GAP)); ned1.push_back(Edit(6, '-', 'C', EDIT_TYPE_REF_GAP)); AlnRes res1; res1.init( 10, AlnScore(), &ned1, NULL, NULL, Coord(0, 5, true), false); EList ned2(RES_CAT); // 2 steps to the right in the middle of the alignment ned2.push_back(Edit(5, '-', 'A', EDIT_TYPE_REF_GAP)); ned2.push_back(Edit(6, '-', 'C', EDIT_TYPE_REF_GAP)); AlnRes res2; res2.init( 10, AlnScore(), &ned2, NULL, NULL, Coord(0, 6, true), false); assert(!res1.overlap(res2)); // Try again, but using the redundant-alignment database RedundantAlns ra; ra.reset(); ra.init(10); ra.add(res1); assert(ra.overlap(res1)); assert(!ra.overlap(res2)); char buf1[1024]; res1.printCigar(false, false, false, op, run, NULL, buf1); assert_eq(0, strcmp(buf1, "5M2I3M")); res1.printCigar(false, false, true, op, run, NULL, buf1); assert_eq(0, strcmp(buf1, "5=2I3=")); char buf2[1024]; res2.printCigar(false, false, false, op, run, NULL, buf2); assert_eq(0, strcmp(buf2, "5M2I3M")); res2.printCigar(false, false, true, op, run, NULL, buf2); assert_eq(0, strcmp(buf2, "5=2I3=")); cerr << "PASSED" << endl; } { // Both with vertical movements; overlap cerr << "Test case 13, simple overlap 13 ... 
"; EList ned1(RES_CAT); // 2 steps to the right in the middle of the alignment ned1.push_back(Edit(5, '-', 'A', EDIT_TYPE_REF_GAP)); ned1.push_back(Edit(6, '-', 'C', EDIT_TYPE_REF_GAP)); AlnRes res1; res1.init( 10, AlnScore(), &ned1, NULL, NULL, Coord(0, 5, true), false); EList ned2(RES_CAT); // 2 steps to the right in the middle of the alignment ned2.push_back(Edit(4, '-', 'A', EDIT_TYPE_REF_GAP)); ned2.push_back(Edit(5, '-', 'C', EDIT_TYPE_REF_GAP)); AlnRes res2; res2.init( 10, AlnScore(), &ned2, NULL, NULL, Coord(0, 6, true), false); assert(res1.overlap(res2)); // Try again, but using the redundant-alignment database RedundantAlns ra; ra.reset(); ra.init(10); ra.add(res1); assert(ra.overlap(res1)); assert(ra.overlap(res2)); char buf1[1024]; res1.printCigar(false, false, false, op, run, NULL, buf1); assert_eq(0, strcmp(buf1, "5M2I3M")); res1.printCigar(false, false, true, op, run, NULL, buf1); assert_eq(0, strcmp(buf1, "5=2I3=")); char buf2[1024]; res2.printCigar(false, false, false, op, run, NULL, buf2); assert_eq(0, strcmp(buf2, "4M2I4M")); res2.printCigar(false, false, true, op, run, NULL, buf2); assert_eq(0, strcmp(buf2, "4=2I4=")); cerr << "PASSED" << endl; } { // Not even close cerr << "Test case 14, simple overlap 14 ... 
"; EList ned1(RES_CAT); // 2 steps to the right in the middle of the alignment ned1.push_back(Edit(5, '-', 'A', EDIT_TYPE_REF_GAP)); ned1.push_back(Edit(6, '-', 'C', EDIT_TYPE_REF_GAP)); AlnRes res1; res1.init( 10, AlnScore(), &ned1, NULL, NULL, Coord(0, 5, true), false); EList ned2(RES_CAT); // 2 steps to the right in the middle of the alignment ned2.push_back(Edit(4, '-', 'A', EDIT_TYPE_REF_GAP)); ned2.push_back(Edit(5, '-', 'C', EDIT_TYPE_REF_GAP)); AlnRes res2; res2.init( 10, AlnScore(), &ned2, NULL, NULL, Coord(0, 400, true), false); assert(!res1.overlap(res2)); // Try again, but using the redundant-alignment database RedundantAlns ra; ra.reset(); ra.init(10); ra.add(res1); assert(ra.overlap(res1)); assert(!ra.overlap(res2)); char buf1[1024]; res1.printCigar(false, false, false, op, run, NULL, buf1); assert_eq(0, strcmp(buf1, "5M2I3M")); res1.printCigar(false, false, true, op, run, NULL, buf1); assert_eq(0, strcmp(buf1, "5=2I3=")); char buf2[1024]; res2.printCigar(false, false, false, op, run, NULL, buf2); assert_eq(0, strcmp(buf2, "4M2I4M")); res2.printCigar(false, false, true, op, run, NULL, buf2); assert_eq(0, strcmp(buf2, "4=2I4=")); cerr << "PASSED" << endl; } { cerr << "Test case 15, CIGAR string with mismatches ... 
"; EList ned(RES_CAT); // 2 steps to the right in the middle of the alignment ned.push_back(Edit(0, 'C', 'A', EDIT_TYPE_MM)); ned.push_back(Edit(4, '-', 'C', EDIT_TYPE_REF_GAP)); ned.push_back(Edit(6, '-', 'C', EDIT_TYPE_REF_GAP)); ned.push_back(Edit(7, '-', 'C', EDIT_TYPE_REF_GAP)); ned.push_back(Edit(9, '-', 'A', EDIT_TYPE_READ_GAP)); ned.push_back(Edit(9, '-', 'A', EDIT_TYPE_READ_GAP)); ned.push_back(Edit(9, '-', 'A', EDIT_TYPE_READ_GAP)); ned.push_back(Edit(9, '-', 'A', EDIT_TYPE_READ_GAP)); ned.push_back(Edit(10, '-', 'A', EDIT_TYPE_MM)); AlnRes res; res.init( 11, AlnScore(), &ned, NULL, NULL, Coord(0, 44, true), false); char buf[1024]; res.printCigar(false, false, false, op, run, NULL, buf); assert_eq(0, strcmp(buf, "4M1I1M2I1M4D2M")); res.printCigar(false, false, true, op, run, NULL, buf); assert_eq(0, strcmp(buf, "1X3=1I1=2I1=4D1=1X")); cerr << "PASSED" << endl; } { cerr << "Test case 17, Overhang ... "; EList ned(RES_CAT); // 2 steps to the right in the middle of the alignment ned.push_back(Edit(0, 'N', 'A', EDIT_TYPE_MM)); ned.push_back(Edit(5, 'C', 'A', EDIT_TYPE_MM)); AlnRes res; res.init( 10, AlnScore(), &ned, NULL, NULL, Coord(0, -1, true), false); char buf[1024]; res.printCigar(false, false, false, op, run, NULL, buf); assert_eq(0, strcmp(buf, "10M")); res.printCigar(false, false, true, op, run, NULL, buf); assert_eq(0, strcmp(buf, "1X4=1X4=")); res.printMD(false, false, op, ch, run, NULL, buf); assert_eq(0, strcmp(buf, "0N4C4")); #if 0 AlnRes res2(res); // Now soft-clip away the overhang res2.clipOutside( true, // soft clip 0, // ref begins 40); // ref ends (excl) res2.printCigar(false, false, false, op, run, NULL, buf); assert_eq(0, strcmp(buf, "1S9M")); res2.printCigar(false, false, true, op, run, NULL, buf); assert_eq(0, strcmp(buf, "4=1X4=")); res2.printMD(false, false, op, ch, run, NULL, buf); assert_eq(0, strcmp(buf, "4C4")); AlnRes res3 = res; // Now hard-clip away the overhang res3.clipOutside( false, // hard clip 0, // ref begins 40); // 
ref ends (excl) res3.printCigar(false, false, false, op, run, NULL, buf); assert_eq(0, strcmp(buf, "9M")); res3.printCigar(false, false, true, op, run, NULL, buf); assert_eq(0, strcmp(buf, "4=1X4=")); res3.printMD(false, false, op, ch, run, NULL, buf); assert_eq(0, strcmp(buf, "4C4")); #endif cerr << "PASSED" << endl; } } #endif /*def ALIGNER_RESULT_MAIN*/