2163 lines
57 KiB
C++
2163 lines
57 KiB
C++
|
/*
|
||
|
* Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
|
||
|
*
|
||
|
* This file is part of Bowtie 2.
|
||
|
*
|
||
|
* Bowtie 2 is free software: you can redistribute it and/or modify
|
||
|
* it under the terms of the GNU General Public License as published by
|
||
|
* the Free Software Foundation, either version 3 of the License, or
|
||
|
* (at your option) any later version.
|
||
|
*
|
||
|
* Bowtie 2 is distributed in the hope that it will be useful,
|
||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
|
* GNU General Public License for more details.
|
||
|
*
|
||
|
* You should have received a copy of the GNU General Public License
|
||
|
* along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
|
||
|
*/
|
||
|
|
||
|
#include <iostream>
|
||
|
#include "reference.h"
|
||
|
#include "aligner_result.h"
|
||
|
#include "read.h"
|
||
|
#include "edit.h"
|
||
|
#include "sstring.h"
|
||
|
#include "ds.h"
|
||
|
#include "util.h"
|
||
|
#include "alphabet.h"
|
||
|
|
||
|
using namespace std;
|
||
|
|
||
|
/**
|
||
|
* Clear all contents.
|
||
|
*/
|
||
|
void AlnRes::reset() {
|
||
|
if(ned_ != NULL) {
|
||
|
assert(aed_ != NULL);
|
||
|
ned_->clear();
|
||
|
aed_->clear();
|
||
|
}
|
||
|
score_.invalidate();
|
||
|
refcoord_.reset();
|
||
|
refival_.reset();
|
||
|
shapeSet_ = false;
|
||
|
rdlen_ = 0;
|
||
|
rdid_ = 0;
|
||
|
reflen_ = 0;
|
||
|
rdrows_ = 0;
|
||
|
rdextent_ = 0;
|
||
|
rdexrows_ = 0;
|
||
|
rfextent_ = 0;
|
||
|
refns_ = 0;
|
||
|
type_ = ALN_RES_TYPE_UNPAIRED;
|
||
|
fraglen_ = -1;
|
||
|
trimSoft_ = false;
|
||
|
trim5p_ = 0;
|
||
|
trim3p_ = 0;
|
||
|
pretrimSoft_ = true;
|
||
|
pretrim5p_ = 0;
|
||
|
pretrim3p_ = 0;
|
||
|
seedmms_ = 0; // number of mismatches allowed in seed
|
||
|
seedlen_ = 0; // length of seed
|
||
|
seedival_ = 0; // interval between seeds
|
||
|
minsc_ = 0; // minimum score
|
||
|
nuc5p_ = 0;
|
||
|
nuc3p_ = 0;
|
||
|
fraglenSet_ = false;
|
||
|
num_spliced_ = 0;
|
||
|
assert(!refcoord_.inited());
|
||
|
assert(!refival_.inited());
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Set the upstream-most reference offset involved in the alignment, and
|
||
|
* the extent of the alignment (w/r/t the reference)
|
||
|
*/
|
||
|
void AlnRes::setShape(
|
||
|
TRefId id, // id of reference aligned to
|
||
|
TRefOff off, // offset of first aligned char into ref seq
|
||
|
TRefOff reflen, // length of reference sequence aligned to
|
||
|
bool fw, // aligned to Watson strand?
|
||
|
size_t rdlen, // length of read after hard trimming, before soft
|
||
|
TReadId rdid, // read ID
|
||
|
bool pretrimSoft, // whether trimming prior to alignment was soft
|
||
|
size_t pretrim5p, // # poss trimmed form 5p end before alignment
|
||
|
size_t pretrim3p, // # poss trimmed form 3p end before alignment
|
||
|
bool trimSoft, // whether local-alignment trimming was soft
|
||
|
size_t trim5p, // # poss trimmed form 5p end during alignment
|
||
|
size_t trim3p) // # poss trimmed form 3p end during alignment
|
||
|
{
|
||
|
rdlen_ = rdlen;
|
||
|
rdid_ = rdid;
|
||
|
rdrows_ = rdlen;
|
||
|
refcoord_.init(id, off, fw);
|
||
|
pretrimSoft_ = pretrimSoft;
|
||
|
pretrim5p_ = pretrim5p;
|
||
|
pretrim3p_ = pretrim3p;
|
||
|
trimSoft_ = trimSoft;
|
||
|
trim5p_ = trim5p;
|
||
|
trim3p_ = trim3p;
|
||
|
// Propagate trimming to the edits. We assume that the pos fields of the
|
||
|
// edits are set w/r/t to the rows of the dynamic programming table, and
|
||
|
// haven't taken trimming into account yet.
|
||
|
//
|
||
|
// TODO: The division of labor between the aligner and the AlnRes is not
|
||
|
// clean. Perhaps the trimming and *all* of its side-effects should be
|
||
|
// handled by the aligner.
|
||
|
|
||
|
// daehwan - check this out - this doesn't seem to work with SAWHI
|
||
|
// size_t trimBeg = fw ? trim5p : trim3p;
|
||
|
size_t trimBeg = trim5p;
|
||
|
if(trimBeg > 0) {
|
||
|
for(size_t i = 0; i < ned_->size(); i++) {
|
||
|
// Shift by trim5p, since edits are w/r/t 5p end
|
||
|
assert_geq((*ned_)[i].pos, trimBeg);
|
||
|
(*ned_)[i].pos -= (uint32_t)trimBeg;
|
||
|
}
|
||
|
}
|
||
|
// Length after all soft trimming and any hard trimming that occurred
|
||
|
// during alignment
|
||
|
rdextent_ = rdlen;
|
||
|
if(pretrimSoft_) {
|
||
|
rdextent_ -= (pretrim5p + pretrim3p); // soft trim
|
||
|
}
|
||
|
rdextent_ -= (trim5p + trim3p); // soft or hard trim from alignment
|
||
|
assert_gt(rdextent_, 0);
|
||
|
rdexrows_ = rdextent_;
|
||
|
calcRefExtent();
|
||
|
refival_.init(id, off, fw, rfextent_);
|
||
|
reflen_ = reflen;
|
||
|
shapeSet_ = true;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Initialize new AlnRes.
|
||
|
*/
|
||
|
void AlnRes::init(
|
||
|
size_t rdlen, // # chars after hard trimming
|
||
|
TReadId rdid, // read ID
|
||
|
AlnScore score, // alignment score
|
||
|
const EList<Edit>* ned, // nucleotide edits
|
||
|
size_t ned_i, // first position to copy
|
||
|
size_t ned_n, // # positions to copy
|
||
|
const EList<Edit>* aed, // ambiguous base resolutions
|
||
|
size_t aed_i, // first position to copy
|
||
|
size_t aed_n, // # positions to copy
|
||
|
Coord refcoord, // leftmost ref pos of 1st al char
|
||
|
TRefOff reflen, // length of ref aligned to
|
||
|
LinkedEList<EList<Edit> >* raw_edits,
|
||
|
int seedmms, // # seed mms allowed
|
||
|
int seedlen, // seed length
|
||
|
int seedival, // space between seeds
|
||
|
int64_t minsc, // minimum score for valid aln
|
||
|
int nuc5p,
|
||
|
int nuc3p,
|
||
|
bool pretrimSoft,
|
||
|
size_t pretrim5p, // trimming prior to alignment
|
||
|
size_t pretrim3p, // trimming prior to alignment
|
||
|
bool trimSoft,
|
||
|
size_t trim5p, // trimming from alignment
|
||
|
size_t trim3p, // trimming from alignment
|
||
|
bool repeat) // repeat
|
||
|
{
|
||
|
assert(raw_edits != NULL);
|
||
|
assert(raw_edits_ == NULL || raw_edits_ == raw_edits);
|
||
|
raw_edits_ = raw_edits;
|
||
|
if(ned_ != NULL) {
|
||
|
assert(aed_ != NULL);
|
||
|
ned_->clear();
|
||
|
aed_->clear();
|
||
|
} else if(raw_edits_ != NULL) {
|
||
|
assert(aed_ == NULL);
|
||
|
assert(ned_node_ == NULL && aed_node_ == NULL);
|
||
|
ned_node_ = raw_edits_->new_node();
|
||
|
aed_node_ = raw_edits_->new_node();
|
||
|
assert(ned_node_ != NULL && aed_node_ != NULL);
|
||
|
ned_ = &(ned_node_->payload);
|
||
|
aed_ = &(aed_node_->payload);
|
||
|
}
|
||
|
|
||
|
rdlen_ = rdlen;
|
||
|
rdid_ = rdid;
|
||
|
rdrows_ = rdlen;
|
||
|
score_ = score;
|
||
|
ned_->clear();
|
||
|
aed_->clear();
|
||
|
if(ned != NULL) {
|
||
|
for(size_t i = ned_i; i < ned_i + ned_n; i++) {
|
||
|
ned_->push_back((*ned)[i]);
|
||
|
}
|
||
|
}
|
||
|
if(aed != NULL) {
|
||
|
for(size_t i = aed_i; i < aed_i + aed_n; i++) {
|
||
|
aed_->push_back((*aed)[i]);
|
||
|
}
|
||
|
}
|
||
|
refcoord_ = refcoord;
|
||
|
reflen_ = reflen;
|
||
|
seedmms_ = seedmms;
|
||
|
seedlen_ = seedlen;
|
||
|
seedival_ = seedival;
|
||
|
minsc_ = minsc;
|
||
|
nuc5p_ = nuc5p;
|
||
|
nuc3p_ = nuc3p;
|
||
|
pretrimSoft_ = pretrimSoft;
|
||
|
pretrim5p_ = pretrim5p;
|
||
|
pretrim3p_ = pretrim3p;
|
||
|
trimSoft_ = trimSoft;
|
||
|
trim5p_ = trim5p;
|
||
|
trim3p_ = trim3p;
|
||
|
repeat_ = repeat;
|
||
|
rdextent_ = rdlen; // # read characters after any hard trimming
|
||
|
if(pretrimSoft) {
|
||
|
rdextent_ -= (pretrim5p + pretrim3p);
|
||
|
}
|
||
|
if(trimSoft) {
|
||
|
rdextent_ -= (trim5p + trim3p);
|
||
|
}
|
||
|
rdexrows_ = rdextent_;
|
||
|
calcRefExtent();
|
||
|
setShape(
|
||
|
refcoord.ref(), // id of reference aligned to
|
||
|
refcoord.off(), // offset of first aligned char into ref seq
|
||
|
reflen, // length of reference sequence aligned to
|
||
|
refcoord.fw(), // aligned to Watson strand?
|
||
|
rdlen, // length of read after hard trimming, before soft
|
||
|
rdid, // read ID
|
||
|
pretrimSoft, // whether trimming prior to alignment was soft
|
||
|
pretrim5p, // # poss trimmed form 5p end before alignment
|
||
|
pretrim3p, // # poss trimmed form 3p end before alignment
|
||
|
trimSoft, // whether local-alignment trimming was soft
|
||
|
trim5p, // # poss trimmed form 5p end during alignment
|
||
|
trim3p); // # poss trimmed form 3p end during alignment
|
||
|
shapeSet_ = true;
|
||
|
|
||
|
num_spliced_ = 0;
|
||
|
for(size_t i = 0; i < ned_->size(); i++) {
|
||
|
if((*ned_)[i].type == EDIT_TYPE_SPL) {
|
||
|
num_spliced_++;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Clip given number of characters from the Watson-upstream end of the
|
||
|
* alignment.
|
||
|
*/
|
||
|
void AlnRes::clipLeft(size_t rd_amt, size_t rf_amt) {
|
||
|
assert_geq(rd_amt, 0);
|
||
|
assert_geq(rf_amt, 0);
|
||
|
assert_leq(rd_amt, rdexrows_);
|
||
|
assert_leq(rf_amt, rfextent_);
|
||
|
assert(trimSoft_);
|
||
|
if(fw()) {
|
||
|
trim5p_ += rd_amt;
|
||
|
Edit::clipLo(*ned_, rdexrows_, rd_amt);
|
||
|
Edit::clipLo(*aed_, rdexrows_, rd_amt);
|
||
|
} else {
|
||
|
trim3p_ += rd_amt;
|
||
|
Edit::clipHi(*ned_, rdexrows_, rd_amt);
|
||
|
Edit::clipHi(*aed_, rdexrows_, rd_amt);
|
||
|
}
|
||
|
rdexrows_ -= rd_amt;
|
||
|
rdextent_ -= rd_amt;
|
||
|
rfextent_ -= rf_amt;
|
||
|
refcoord_.adjustOff(rf_amt);
|
||
|
refival_.adjustOff(rf_amt);
|
||
|
// Adjust refns_?
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Clip given number of characters from the Watson-downstream end of the
|
||
|
* alignment.
|
||
|
*/
|
||
|
void AlnRes::clipRight(size_t rd_amt, size_t rf_amt) {
|
||
|
assert_geq(rd_amt, 0);
|
||
|
assert_geq(rf_amt, 0);
|
||
|
assert_leq(rd_amt, rdexrows_);
|
||
|
assert_leq(rf_amt, rfextent_);
|
||
|
assert(trimSoft_);
|
||
|
if(fw()) {
|
||
|
trim3p_ += rd_amt;
|
||
|
Edit::clipHi(*ned_, rdexrows_, rd_amt);
|
||
|
Edit::clipHi(*aed_, rdexrows_, rd_amt);
|
||
|
} else {
|
||
|
trim5p_ += rd_amt;
|
||
|
Edit::clipLo(*ned_, rdexrows_, rd_amt);
|
||
|
Edit::clipLo(*aed_, rdexrows_, rd_amt);
|
||
|
}
|
||
|
rdexrows_ -= rd_amt;
|
||
|
rdextent_ -= rd_amt;
|
||
|
rfextent_ -= rf_amt;
|
||
|
// Adjust refns_?
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Clip away portions of the alignment that are outside the given bounds.
|
||
|
* Clipping is soft if soft == true, hard otherwise. Assuming for now that
|
||
|
* there isn't any other clipping.
|
||
|
*
|
||
|
* Note that all clipping is expressed in terms of read positions. So if there
|
||
|
* are reference gaps in the overhanging portion, we must
|
||
|
*/
|
||
|
void AlnRes::clipOutside(bool soft, TRefOff refi, TRefOff reff) {
|
||
|
// Overhang on LHS
|
||
|
TRefOff left = refcoord_.off();
|
||
|
if(left < refi) {
|
||
|
size_t rf_amt = (size_t)(refi - left);
|
||
|
size_t rf_i = rf_amt;
|
||
|
size_t nedsz = ned_->size();
|
||
|
if(!fw()) {
|
||
|
Edit::invertPoss(*ned_, rdexrows_, false);
|
||
|
}
|
||
|
for(size_t i = 0; i < nedsz; i++) {
|
||
|
assert_lt((*ned_)[i].pos, rdexrows_);
|
||
|
if((*ned_)[i].pos > rf_i) break;
|
||
|
if((*ned_)[i].isRefGap()) rf_i++;
|
||
|
}
|
||
|
if(!fw()) {
|
||
|
Edit::invertPoss(*ned_, rdexrows_, false);
|
||
|
}
|
||
|
clipLeft(rf_i, rf_amt);
|
||
|
}
|
||
|
// Overhang on RHS
|
||
|
TRefOff right = refcoord_.off() + refNucExtent();
|
||
|
if(right > reff) {
|
||
|
size_t rf_amt = (size_t)(right - reff);
|
||
|
size_t rf_i = rf_amt;
|
||
|
size_t nedsz = ned_->size();
|
||
|
if(fw()) {
|
||
|
Edit::invertPoss(*ned_, rdexrows_, false);
|
||
|
}
|
||
|
for(size_t i = 0; i < nedsz; i++) {
|
||
|
assert_lt((*ned_)[i].pos, rdexrows_);
|
||
|
if((*ned_)[i].pos > rf_i) break;
|
||
|
if((*ned_)[i].isRefGap()) rf_i++;
|
||
|
}
|
||
|
if(fw()) {
|
||
|
Edit::invertPoss(*ned_, rdexrows_, false);
|
||
|
}
|
||
|
clipRight(rf_i, rf_amt);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Return true iff this AlnRes and the given AlnRes overlap. Two AlnRess
|
||
|
* overlap if they share a cell in the overall dynamic programming table:
|
||
|
* i.e. if there exists a read position s.t. that position in both reads
|
||
|
* matches up with the same reference character. E.g., the following
|
||
|
* alignments (drawn schematically as paths through a dynamic programming
|
||
|
* table) are redundant:
|
||
|
*
|
||
|
* a b a b
|
||
|
* \ \ \ \
|
||
|
* \ \ \ \
|
||
|
* \ \ \ \
|
||
|
* ---\ \ \
|
||
|
* \ ---\---
|
||
|
* ---\ \ \
|
||
|
* \ \ \ \
|
||
|
* \ \ \ \
|
||
|
* \ \ \ \
|
||
|
* a b a b
|
||
|
*
|
||
|
* We iterate over each read position that hasn't been hard-trimmed, but
|
||
|
* only overlaps at positions that have also not been soft-trimmed are
|
||
|
* considered.
|
||
|
*/
|
||
|
bool AlnRes::overlap(AlnRes& res) {
|
||
|
if(fw() != res.fw() || refid() != res.refid()) {
|
||
|
// Must be same reference and same strand in order to overlap
|
||
|
return false;
|
||
|
}
|
||
|
TRefOff my_left = refoff(); // my leftmost aligned char
|
||
|
TRefOff other_left = res.refoff(); // other leftmost aligned char
|
||
|
TRefOff my_right = my_left + refExtent();
|
||
|
TRefOff other_right = other_left + res.refExtent();
|
||
|
if(my_right < other_left || other_right < my_left) {
|
||
|
// The rectangular hulls of the two alignments don't overlap, so
|
||
|
// they can't overlap at any cell
|
||
|
return false;
|
||
|
}
|
||
|
// Reference and strand are the same and hulls overlap. Now go read
|
||
|
// position by read position testing if any align identically with the
|
||
|
// reference.
|
||
|
|
||
|
// Edits are ordered and indexed from 5' to 3' to start with. We
|
||
|
// reorder them to go from left to right along the Watson strand.
|
||
|
if(!fw()) {
|
||
|
invertEdits();
|
||
|
}
|
||
|
if(!res.fw()) {
|
||
|
res.invertEdits();
|
||
|
}
|
||
|
size_t nedidx = 0, onedidx = 0;
|
||
|
bool olap = false;
|
||
|
// For each row, going left to right along Watson reference strand...
|
||
|
for(size_t i = 0; i < rdexrows_; i++) {
|
||
|
size_t diff = 1; // amount to shift to right for next round
|
||
|
size_t odiff = 1; // amount to shift to right for next round
|
||
|
// Unless there are insertions before the next position, we say
|
||
|
// that there is one cell in this row involved in the alignment
|
||
|
my_right = my_left + 1;
|
||
|
other_right = other_left + 1;
|
||
|
while(nedidx < ned_->size() && (*ned_)[nedidx].pos == i) {
|
||
|
if((*ned_)[nedidx].isRefGap()) {
|
||
|
// Next my_left will be in same column as this round
|
||
|
diff = 0;
|
||
|
}
|
||
|
nedidx++;
|
||
|
}
|
||
|
while(onedidx < res.ned_->size() && (*res.ned_)[onedidx].pos == i) {
|
||
|
if((*res.ned_)[onedidx].isRefGap()) {
|
||
|
// Next my_left will be in same column as this round
|
||
|
odiff = 0;
|
||
|
}
|
||
|
onedidx++;
|
||
|
}
|
||
|
if(i < rdexrows_ - 1) {
|
||
|
// See how many inserts there are before the next read
|
||
|
// character
|
||
|
size_t nedidx_next = nedidx;
|
||
|
size_t onedidx_next = onedidx;
|
||
|
while(nedidx_next < ned_->size() &&
|
||
|
(*ned_)[nedidx_next].pos == i+1)
|
||
|
{
|
||
|
if((*ned_)[nedidx_next].isReadGap()) {
|
||
|
my_right++;
|
||
|
}
|
||
|
nedidx_next++;
|
||
|
}
|
||
|
while(onedidx_next < res.ned_->size() &&
|
||
|
(*res.ned_)[onedidx_next].pos == i+1)
|
||
|
{
|
||
|
if((*res.ned_)[onedidx_next].isReadGap()) {
|
||
|
other_right++;
|
||
|
}
|
||
|
onedidx_next++;
|
||
|
}
|
||
|
}
|
||
|
// Contained?
|
||
|
olap =
|
||
|
(my_left >= other_left && my_right <= other_right) ||
|
||
|
(other_left >= my_left && other_right <= my_right);
|
||
|
// Overlapping but not contained?
|
||
|
if(!olap) {
|
||
|
olap =
|
||
|
(my_left <= other_left && my_right > other_left) ||
|
||
|
(other_left <= my_left && other_right > my_left);
|
||
|
}
|
||
|
if(olap) {
|
||
|
break;
|
||
|
}
|
||
|
// How to do adjust my_left and my_right
|
||
|
my_left = my_right + diff - 1;
|
||
|
other_left = other_right + odiff - 1;
|
||
|
}
|
||
|
if(!fw()) {
|
||
|
invertEdits();
|
||
|
}
|
||
|
if(!res.fw()) {
|
||
|
res.invertEdits();
|
||
|
}
|
||
|
return olap;
|
||
|
}
|
||
|
|
||
|
#ifndef NDEBUG
|
||
|
|
||
|
/**
|
||
|
* Assuming this AlnRes is an alignment for 'rd', check that the alignment and
|
||
|
* 'rd' are compatible with the corresponding reference sequence.
|
||
|
*/
|
||
|
bool AlnRes::matchesRef(
|
||
|
const Read& rd,
|
||
|
const BitPairReference& ref,
|
||
|
BTDnaString& rf,
|
||
|
BTDnaString& rdseq,
|
||
|
BTString& qseq,
|
||
|
SStringExpandable<char>& raw_refbuf,
|
||
|
SStringExpandable<uint32_t>& destU32,
|
||
|
EList<bool>& matches,
|
||
|
SStringExpandable<char>& raw_refbuf2,
|
||
|
EList<TIndexOffU>& reflens,
|
||
|
EList<TIndexOffU>& refoffs)
|
||
|
{
|
||
|
assert(!empty());
|
||
|
assert(repOk());
|
||
|
assert(refcoord_.inited());
|
||
|
size_t rdlen = rd.length();
|
||
|
bool fw = refcoord_.fw();
|
||
|
if(!fw) {
|
||
|
assert_lt(trim3p_, rdlen);
|
||
|
Edit::invertPoss(const_cast<EList<Edit>&>(*ned_), rdlen - trim5p_ - trim3p_, false);
|
||
|
}
|
||
|
size_t refallen = 0;
|
||
|
reflens.clear(); refoffs.clear();
|
||
|
int64_t reflen = 0;
|
||
|
int64_t refoff = refcoord_.off();
|
||
|
refoffs.push_back((uint32_t)refoff);
|
||
|
size_t eidx = 0;
|
||
|
assert_lt(trim5p_ + trim3p_, rdlen);
|
||
|
for(size_t i = 0; i < rdlen - trim5p_ - trim3p_; i++, reflen++, refoff++) {
|
||
|
while(eidx < ned_->size() && (*ned_)[eidx].pos == i) {
|
||
|
if((*ned_)[eidx].isReadGap()) {
|
||
|
reflen++;
|
||
|
refoff++;
|
||
|
} else if((*ned_)[eidx].isRefGap()) {
|
||
|
reflen--;
|
||
|
refoff--;
|
||
|
}
|
||
|
if((*ned_)[eidx].isSpliced()) {
|
||
|
assert_gt(reflen, 0);
|
||
|
refallen += (uint32_t)reflen;
|
||
|
reflens.push_back((uint32_t)reflen);
|
||
|
reflen = 0;
|
||
|
refoff += (*ned_)[eidx].splLen;
|
||
|
assert_gt(refoff, 0);
|
||
|
refoffs.push_back((uint32_t)refoff);
|
||
|
}
|
||
|
eidx++;
|
||
|
}
|
||
|
}
|
||
|
assert_gt(reflen, 0);
|
||
|
refallen += (uint32_t)reflen;
|
||
|
reflens.push_back((uint32_t)reflen);
|
||
|
assert_gt(reflens.size(), 0);
|
||
|
assert_gt(refoffs.size(), 0);
|
||
|
assert_eq(reflens.size(), refoffs.size());
|
||
|
if(!fw) {
|
||
|
assert_lt(trim3p_, rdlen);
|
||
|
Edit::invertPoss(const_cast<EList<Edit>&>(*ned_), rdlen - trim5p_ - trim3p_, false);
|
||
|
}
|
||
|
|
||
|
// Adjust reference string length according to edits
|
||
|
#ifndef NDEBUG
|
||
|
if(reflens.size() == 1) {
|
||
|
assert_eq(refallen, refNucExtent());
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
assert_geq(refcoord_.ref(), 0);
|
||
|
int nsOnLeft = 0;
|
||
|
if(refcoord_.off() < 0) {
|
||
|
nsOnLeft = -((int)refcoord_.off());
|
||
|
}
|
||
|
raw_refbuf.resize(refallen);
|
||
|
raw_refbuf.clear();
|
||
|
raw_refbuf2.clear();
|
||
|
for(size_t i = 0; i < reflens.size(); i++) {
|
||
|
assert_gt(reflens[i], 0);
|
||
|
#ifndef NDEBUG
|
||
|
if(i > 0) {
|
||
|
assert_gt(refoffs[i], refoffs[i-1]);
|
||
|
}
|
||
|
#endif
|
||
|
raw_refbuf2.resize(reflens[i] + 16);
|
||
|
raw_refbuf2.clear();
|
||
|
int off = ref.getStretch(
|
||
|
reinterpret_cast<uint32_t*>(raw_refbuf2.wbuf()),
|
||
|
(size_t)refcoord_.ref(),
|
||
|
(size_t)max<TRefOff>(refoffs[i], 0),
|
||
|
reflens[i],
|
||
|
destU32);
|
||
|
assert_leq(off, 16);
|
||
|
raw_refbuf.append(raw_refbuf2.wbuf() + off, reflens[i]);
|
||
|
}
|
||
|
char *refbuf = raw_refbuf.wbuf();
|
||
|
size_t trim5 = 0, trim3 = 0;
|
||
|
if(trimSoft_) {
|
||
|
trim5 += trim5p_;
|
||
|
trim3 += trim3p_;
|
||
|
}
|
||
|
if(pretrimSoft_) {
|
||
|
trim5 += pretrim5p_;
|
||
|
trim3 += pretrim3p_;
|
||
|
}
|
||
|
rf.clear();
|
||
|
rdseq.clear();
|
||
|
rdseq = rd.patFw;
|
||
|
if(!fw) {
|
||
|
rdseq.reverseComp(false);
|
||
|
}
|
||
|
assert_eq(rdrows_, rdseq.length());
|
||
|
// rdseq is the nucleotide sequence from upstream to downstream on the
|
||
|
// Watson strand. ned_ are the nucleotide edits from upstream to
|
||
|
// downstream. rf contains the reference characters.
|
||
|
assert(Edit::repOk(*ned_, rdseq, fw, trim5, trim3));
|
||
|
Edit::toRef(rdseq, *ned_, rf, fw, trim5, trim3);
|
||
|
assert_eq(refallen, rf.length());
|
||
|
matches.clear();
|
||
|
bool matchesOverall = true;
|
||
|
matches.resize(refallen);
|
||
|
matches.fill(true);
|
||
|
for(size_t i = 0; i < refallen; i++) {
|
||
|
if((int)i < nsOnLeft) {
|
||
|
if((int)rf[i] != 4) {
|
||
|
matches[i] = false;
|
||
|
matchesOverall = false;
|
||
|
}
|
||
|
} else {
|
||
|
if((int)rf[i] != (int)refbuf[i-nsOnLeft]) {
|
||
|
matches[i] = false;
|
||
|
matchesOverall = false;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
if(!matchesOverall) {
|
||
|
// Print a friendly message showing the difference between the
|
||
|
// reference sequence obtained with Edit::toRef and the actual
|
||
|
// reference sequence
|
||
|
cerr << endl;
|
||
|
Edit::printQAlignNoCheck(
|
||
|
cerr,
|
||
|
" ",
|
||
|
rdseq,
|
||
|
*ned_);
|
||
|
cerr << " ";
|
||
|
for(size_t i = 0; i < refallen; i++) {
|
||
|
cerr << (matches[i] ? " " : "*");
|
||
|
}
|
||
|
cerr << endl;
|
||
|
cerr << " ";
|
||
|
for(size_t i = 0; i < refallen-nsOnLeft; i++) {
|
||
|
cerr << "ACGTN"[(int)refbuf[i]];
|
||
|
}
|
||
|
cerr << endl;
|
||
|
Edit::printQAlign(
|
||
|
cerr,
|
||
|
" ",
|
||
|
rdseq,
|
||
|
*ned_);
|
||
|
cerr << endl;
|
||
|
}
|
||
|
return matchesOverall;
|
||
|
}
|
||
|
|
||
|
#endif /*ndef NDEBUG*/
|
||
|
|
||
|
#define COPY_BUF() { \
|
||
|
char *bufc = buf; \
|
||
|
while(*bufc != '\0') { \
|
||
|
*occ = *bufc; \
|
||
|
occ++; \
|
||
|
bufc++; \
|
||
|
} \
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Initialized the stacked alignment with respect to a read string, a list of
|
||
|
* edits (expressed left-to-right), and integers indicating how much hard and
|
||
|
* soft trimming has occurred on either end of the read.
|
||
|
*
|
||
|
* s: read sequence
|
||
|
* ed: all relevant edits, including ambiguous nucleotides
|
||
|
* trimLS: # bases soft-trimmed from LHS
|
||
|
* trimLH: # bases hard-trimmed from LHS
|
||
|
* trimRS: # bases soft-trimmed from RHS
|
||
|
* trimRH: # bases hard-trimmed from RHS
|
||
|
*/
|
||
|
void StackedAln::init(
|
||
|
const BTDnaString& s,
|
||
|
const EList<Edit>& ed,
|
||
|
size_t trimLS,
|
||
|
size_t trimLH,
|
||
|
size_t trimRS,
|
||
|
size_t trimRH)
|
||
|
{
|
||
|
trimLS_ = trimLS;
|
||
|
trimLH_ = trimLH;
|
||
|
trimRS_ = trimRS;
|
||
|
trimRH_ = trimRH;
|
||
|
ASSERT_ONLY(size_t ln_postsoft = s.length() - trimLS - trimRS);
|
||
|
stackRef_.clear();
|
||
|
stackRel_.clear();
|
||
|
stackSNP_.clear();
|
||
|
stackRead_.clear();
|
||
|
size_t rdoff = trimLS;
|
||
|
for(size_t i = 0; i < ed.size(); i++) {
|
||
|
assert_lt(ed[i].pos, ln_postsoft);
|
||
|
size_t pos = ed[i].pos + trimLS;
|
||
|
while(rdoff < pos) {
|
||
|
int c = s[rdoff++];
|
||
|
assert_range(0, 4, c);
|
||
|
stackRef_.push_back("ACGTN"[c]);
|
||
|
stackRel_.push_back('=');
|
||
|
stackSNP_.push_back(false);
|
||
|
stackRead_.push_back("ACGTN"[c]);
|
||
|
}
|
||
|
if(ed[i].isMismatch()) {
|
||
|
int c = s[rdoff++];
|
||
|
assert_range(0, 4, c);
|
||
|
assert_eq(c, asc2dna[(int)ed[i].qchr]);
|
||
|
assert_neq(c, asc2dna[(int)ed[i].chr]);
|
||
|
stackRef_.push_back(ed[i].chr);
|
||
|
stackRel_.push_back('X');
|
||
|
stackSNP_.push_back(ed[i].snpID != (uint32_t)INDEX_MAX);
|
||
|
stackRead_.push_back("ACGTN"[c]);
|
||
|
} else if(ed[i].isRefGap()) {
|
||
|
int c = s[rdoff++];
|
||
|
assert_range(0, 4, c);
|
||
|
assert_eq(c, asc2dna[(int)ed[i].qchr]);
|
||
|
stackRef_.push_back('-');
|
||
|
stackRel_.push_back('I');
|
||
|
stackSNP_.push_back(ed[i].snpID != (uint32_t)INDEX_MAX);
|
||
|
stackRead_.push_back("ACGTN"[c]);
|
||
|
} else if(ed[i].isReadGap()) {
|
||
|
stackRef_.push_back(ed[i].chr);
|
||
|
stackRel_.push_back('D');
|
||
|
stackSNP_.push_back(ed[i].snpID != (uint32_t)INDEX_MAX);
|
||
|
stackRead_.push_back('-');
|
||
|
} else if(ed[i].isSpliced()) {
|
||
|
stackRef_.push_back('N');
|
||
|
stackRel_.push_back('N');
|
||
|
stackSNP_.push_back(false);
|
||
|
stackRead_.push_back('N');
|
||
|
assert_gt(ed[i].splLen, 0);
|
||
|
stackSkip_.push_back(ed[i].splLen);
|
||
|
}
|
||
|
}
|
||
|
while(rdoff < s.length() - trimRS) {
|
||
|
int c = s[rdoff++];
|
||
|
assert_range(0, 4, c);
|
||
|
stackRef_.push_back("ACGTN"[c]);
|
||
|
stackRel_.push_back('=');
|
||
|
stackSNP_.push_back(false);
|
||
|
stackRead_.push_back("ACGTN"[c]);
|
||
|
}
|
||
|
inited_ = true;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Left-align all the gaps. If this changes the alignment and the CIGAR or
|
||
|
* MD:Z strings have already been calculated, this renders them invalid.
|
||
|
*
|
||
|
* We left-align gaps with in the following way: for each gap, we check
|
||
|
* whether the character opposite the rightmost gap character is the same
|
||
|
* as the character opposite the character just to the left of the gap. If
|
||
|
* this is the case, we can slide the gap to the left and make the
|
||
|
* rightmost position previously covered by the gap into a non-gap.
|
||
|
*
|
||
|
* This scheme allows us to push the gap past a mismatch. BWA does seem to
|
||
|
* allow this. It's not clear that Bowtie 2 should, since moving the
|
||
|
* mismatch could cause a mismatch with one base quality to be replaced
|
||
|
* with a mismatch with a different base quality.
|
||
|
*/
|
||
|
void StackedAln::leftAlign(bool pastMms) {
|
||
|
assert(inited_);
|
||
|
bool changed = false;
|
||
|
size_t ln = stackRef_.size();
|
||
|
// Scan left-to-right
|
||
|
for(size_t i = 0; i < ln; i++) {
|
||
|
int rel = stackRel_[i];
|
||
|
if(rel != '=' && rel != 'X' && rel != 'N') {
|
||
|
// Neither a match nor a mismatch - must be a gap
|
||
|
assert(rel == 'I' || rel == 'D');
|
||
|
if(stackSNP_[i]) continue;
|
||
|
size_t glen = 1;
|
||
|
// Scan further right to measure length of gap
|
||
|
for(size_t j = i+1; j < ln; j++) {
|
||
|
if(rel != (int)stackRel_[j]) break;
|
||
|
glen++;
|
||
|
}
|
||
|
// We've identified a gap of type 'rel' (D = deletion or read
|
||
|
// gap, I = insertion or ref gap) with length 'glen'. Now we
|
||
|
// can try to slide it to the left repeatedly.
|
||
|
size_t l = i - 1;
|
||
|
size_t r = l + glen;
|
||
|
EList<char>& gp = ((rel == 'I') ? stackRef_ : stackRead_);
|
||
|
const EList<char>& ngp = ((rel == 'I') ? stackRead_ : stackRef_);
|
||
|
while(l > 0 && ngp[l] == ngp[r]) {
|
||
|
if(stackRel_[l] == 'I' || stackRel_[l] == 'D') break;
|
||
|
assert(stackRel_[l] == '=' || stackRel_[l] == 'X' || stackRel_[l] == 'N');
|
||
|
assert(stackRel_[r] == 'D' || stackRel_[r] == 'I');
|
||
|
if(!pastMms && (stackRel_[l] == 'X' || stackRel_[l] == 'N')) {
|
||
|
break;
|
||
|
}
|
||
|
swap(gp[l], gp[r]);
|
||
|
swap(stackRel_[l], stackRel_[r]);
|
||
|
assert_neq('-', gp[r]);
|
||
|
assert_eq('-', gp[l]);
|
||
|
l--; r--;
|
||
|
changed = true;
|
||
|
}
|
||
|
i += (glen-1);
|
||
|
}
|
||
|
}
|
||
|
if(changed) {
|
||
|
cigCalc_ = mdzCalc_ = false;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Build the CIGAR list, if it hasn't already built. Returns true iff it
|
||
|
* was built for the first time.
|
||
|
*/
|
||
|
bool StackedAln::buildCigar(bool xeq) {
|
||
|
assert(inited_);
|
||
|
if(cigCalc_) {
|
||
|
return false; // already done
|
||
|
}
|
||
|
cigOp_.clear();
|
||
|
cigRun_.clear();
|
||
|
if(trimLS_ > 0) {
|
||
|
cigOp_.push_back('S');
|
||
|
cigRun_.push_back(trimLS_);
|
||
|
}
|
||
|
size_t numSkips = 0;
|
||
|
size_t ln = stackRef_.size();
|
||
|
for(size_t i = 0; i < ln; i++) {
|
||
|
char op = stackRel_[i];
|
||
|
if(!xeq && (op == 'X' || op == '=')) {
|
||
|
op = 'M';
|
||
|
}
|
||
|
size_t run;
|
||
|
if(op != 'N') {
|
||
|
run = 1;
|
||
|
for(; i + run < ln; run++) {
|
||
|
char op2 = stackRel_[i + run];
|
||
|
if(!xeq && (op2 == 'X' || op2 == '=')) {
|
||
|
op2 = 'M';
|
||
|
}
|
||
|
if(op2 != op) {
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
i += (run-1);
|
||
|
} else {
|
||
|
assert_lt(numSkips, stackSkip_.size());
|
||
|
run = stackSkip_[numSkips];
|
||
|
numSkips++;
|
||
|
}
|
||
|
cigOp_.push_back(op);
|
||
|
cigRun_.push_back(run);
|
||
|
}
|
||
|
if(trimRS_ > 0) {
|
||
|
cigOp_.push_back('S');
|
||
|
cigRun_.push_back(trimRS_);
|
||
|
}
|
||
|
cigCalc_ = true;
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Build the CIGAR list, if it hasn't already built. Returns true iff it
|
||
|
* was built for the first time.
|
||
|
*/
|
||
|
bool StackedAln::buildMdz() {
|
||
|
assert(inited_);
|
||
|
if(mdzCalc_) {
|
||
|
return false; // already done
|
||
|
}
|
||
|
mdzOp_.clear();
|
||
|
mdzChr_.clear();
|
||
|
mdzRun_.clear();
|
||
|
size_t ln = stackRef_.size();
|
||
|
for(size_t i = 0; i < ln; i++) {
|
||
|
char op = stackRel_[i];
|
||
|
if(op == '=') {
|
||
|
size_t run = 1;
|
||
|
size_t ninserts = 0;
|
||
|
size_t nskips = 0;
|
||
|
// Skip over matches and insertions (ref gaps)
|
||
|
for(; i+run < ln; run++) {
|
||
|
if(stackRel_[i + run] == '=') {
|
||
|
// do nothing
|
||
|
} else if(stackRel_[i + run] == 'I') {
|
||
|
ninserts++;
|
||
|
} else if(stackRel_[i + run] == 'N') {
|
||
|
nskips++;
|
||
|
} else {
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
i += (run - 1);
|
||
|
mdzOp_.push_back('='); // = X or G
|
||
|
mdzChr_.push_back('-');
|
||
|
mdzRun_.push_back(run - ninserts - nskips);
|
||
|
} else if(op == 'X') {
|
||
|
assert_neq(stackRef_[i], stackRead_[i]);
|
||
|
mdzOp_.push_back('X'); // = X or G
|
||
|
mdzChr_.push_back(stackRef_[i]);
|
||
|
mdzRun_.push_back(1);
|
||
|
} else if(op == 'D') {
|
||
|
assert_neq('-', stackRef_[i]);
|
||
|
mdzOp_.push_back('G'); // = X or G
|
||
|
mdzChr_.push_back(stackRef_[i]);
|
||
|
mdzRun_.push_back(1);
|
||
|
}
|
||
|
}
|
||
|
mdzCalc_ = true;
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Write a CIGAR representation of the alignment to the given string and/or
|
||
|
* char buffer.
|
||
|
*/
|
||
|
void StackedAln::writeCigar(
|
||
|
BTString* o, // if non-NULL, string to append to
|
||
|
char* occ) const // if non-NULL, character string to append to
|
||
|
{
|
||
|
const EList<char>& op = cigOp_;
|
||
|
const EList<size_t>& run = cigRun_;
|
||
|
assert_eq(op.size(), run.size());
|
||
|
if(o != NULL || occ != NULL) {
|
||
|
char buf[128];
|
||
|
ASSERT_ONLY(bool printed = false);
|
||
|
for(size_t i = 0; i < op.size(); i++) {
|
||
|
size_t r = run[i];
|
||
|
if(r > 0) {
|
||
|
itoa10<size_t>(r, buf);
|
||
|
ASSERT_ONLY(printed = true);
|
||
|
if(o != NULL) {
|
||
|
o->append(buf);
|
||
|
o->append(op[i]);
|
||
|
}
|
||
|
if(occ != NULL) {
|
||
|
COPY_BUF();
|
||
|
*occ = op[i];
|
||
|
occ++;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
assert(printed);
|
||
|
if(occ != NULL) {
|
||
|
*occ = '\0';
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void StackedAln::writeCigar(Alignment* o, char* occ) const {
|
||
|
const EList<char>& op = cigOp_;
|
||
|
const EList<size_t>& run = cigRun_;
|
||
|
assert_eq(op.size(), run.size());
|
||
|
if(o != NULL || occ != NULL) {
|
||
|
char buf[128];
|
||
|
ASSERT_ONLY(bool printed = false);
|
||
|
o->cigarSegments.reserve(op.size());
|
||
|
for(size_t i = 0; i < op.size(); i++) {
|
||
|
size_t r = run[i];
|
||
|
if(r > 0) {
|
||
|
itoa10<size_t>(r, buf);
|
||
|
ASSERT_ONLY(printed = true);
|
||
|
if(o != NULL) {
|
||
|
o->cigarString.append(buf);
|
||
|
o->cigarString.append(op[i]);
|
||
|
o->cigarSegments.emplace_back(r, op[i]);
|
||
|
o->cigarLength += r;
|
||
|
}
|
||
|
if(occ != NULL) {
|
||
|
COPY_BUF();
|
||
|
*occ = op[i];
|
||
|
occ++;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
assert(printed);
|
||
|
if(occ != NULL) {
|
||
|
*occ = '\0';
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Write an MD:Z representation of the alignment to the given string and/or
|
||
|
* char buffer.
|
||
|
*/
|
||
|
void StackedAln::writeMdz(BTString* o, char* occ) const {
|
||
|
char buf[128];
|
||
|
bool mm_last = false;
|
||
|
bool rdgap_last = false;
|
||
|
bool first_print = true;
|
||
|
const EList<char>& op = mdzOp_;
|
||
|
const EList<char>& ch = mdzChr_;
|
||
|
const EList<size_t>& run = mdzRun_;
|
||
|
for(size_t i = 0; i < op.size(); i++) {
|
||
|
size_t r = run[i];
|
||
|
if(r > 0) {
|
||
|
if(op[i] == '=') {
|
||
|
// Write run length
|
||
|
itoa10<size_t>(r, buf);
|
||
|
if(o != NULL) { o->append(buf); }
|
||
|
if(occ != NULL) { COPY_BUF(); }
|
||
|
first_print = false;
|
||
|
mm_last = false;
|
||
|
rdgap_last = false;
|
||
|
} else if(op[i] == 'X') {
|
||
|
if(o != NULL) {
|
||
|
if(rdgap_last || mm_last || first_print) {
|
||
|
o->append('0');
|
||
|
}
|
||
|
o->append(ch[i]);
|
||
|
}
|
||
|
if(occ != NULL) {
|
||
|
if(rdgap_last || mm_last || first_print) {
|
||
|
*occ = '0';
|
||
|
occ++;
|
||
|
}
|
||
|
*occ = ch[i];
|
||
|
occ++;
|
||
|
}
|
||
|
first_print = false;
|
||
|
mm_last = true;
|
||
|
rdgap_last = false;
|
||
|
} else if(op[i] == 'G') {
|
||
|
if(o != NULL) {
|
||
|
if(mm_last || first_print) {
|
||
|
o->append('0');
|
||
|
}
|
||
|
if(!rdgap_last) {
|
||
|
o->append('^');
|
||
|
}
|
||
|
o->append(ch[i]);
|
||
|
}
|
||
|
if(occ != NULL) {
|
||
|
if(mm_last || first_print) {
|
||
|
*occ = '0'; occ++;
|
||
|
}
|
||
|
if(!rdgap_last) {
|
||
|
*occ = '^'; occ++;
|
||
|
}
|
||
|
*occ = ch[i];
|
||
|
occ++;
|
||
|
}
|
||
|
first_print = false;
|
||
|
mm_last = false;
|
||
|
rdgap_last = true;
|
||
|
}
|
||
|
} // if r > 0
|
||
|
} // for loop over ops
|
||
|
if(mm_last || rdgap_last) {
|
||
|
if(o != NULL) { o->append('0'); }
|
||
|
if(occ != NULL) { *occ = '0'; occ++; }
|
||
|
}
|
||
|
if(occ != NULL) { *occ = '\0'; }
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Print the sequence for the read that aligned using A, C, G and
|
||
|
* T. This will simply print the read sequence (or its reverse
|
||
|
* complement).
|
||
|
*/
|
||
|
void AlnRes::printSeq(
|
||
|
const Read& rd, // read
|
||
|
const BTDnaString* dns, // already-decoded nucleotides
|
||
|
BTString& o) const // buffer to write to
|
||
|
{
|
||
|
assert(!rd.patFw.empty());
|
||
|
ASSERT_ONLY(size_t written = 0);
|
||
|
// Print decoded nucleotides
|
||
|
assert(dns != NULL);
|
||
|
size_t len = dns->length();
|
||
|
size_t st = 0;
|
||
|
size_t en = len;
|
||
|
for(size_t i = st; i < en; i++) {
|
||
|
int c = dns->get(i);
|
||
|
assert_range(0, 3, c);
|
||
|
o.append("ACGT"[c]);
|
||
|
ASSERT_ONLY(written++);
|
||
|
}
|
||
|
#ifndef NDEBUG
|
||
|
for(size_t i = 0; i < ned_->size(); i++) {
|
||
|
if((*ned_)[i].isReadGap()) {
|
||
|
assert_leq((*ned_)[i].pos, dns->length());
|
||
|
} else {
|
||
|
assert_lt((*ned_)[i].pos, dns->length());
|
||
|
}
|
||
|
}
|
||
|
#endif
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Print the quality string for the read that aligned. This will simply print
|
||
|
* the read qualities (or their reverse).
|
||
|
*/
|
||
|
void AlnRes::printQuals(
|
||
|
const Read& rd, // read
|
||
|
const BTString* dqs, // already-decoded qualities
|
||
|
BTString& o) const // output stream to write to
|
||
|
{
|
||
|
assert(dqs != NULL);
|
||
|
size_t len = dqs->length();
|
||
|
// Print decoded qualities from upstream to downstream Watson
|
||
|
for(size_t i = 1; i < len-1; i++) {
|
||
|
o.append(dqs->get(i));
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Add all of the cells involved in the given alignment to the database.
|
||
|
*/
|
||
|
void RedundantAlns::add(const AlnRes& res) {
|
||
|
assert(!cells_.empty());
|
||
|
TRefOff left = res.refoff(), right;
|
||
|
const size_t len = res.readExtentRows();
|
||
|
if(!res.fw()) {
|
||
|
const_cast<AlnRes&>(res).invertEdits();
|
||
|
}
|
||
|
const EList<Edit>& ned = res.ned();
|
||
|
size_t nedidx = 0;
|
||
|
assert_leq(len, cells_.size());
|
||
|
// For each row...
|
||
|
for(size_t i = 0; i < len; i++) {
|
||
|
size_t diff = 1; // amount to shift to right for next round
|
||
|
right = left + 1;
|
||
|
while(nedidx < ned.size() && ned[nedidx].pos == i) {
|
||
|
if(ned[nedidx].isRefGap()) {
|
||
|
// Next my_left will be in same column as this round
|
||
|
diff = 0;
|
||
|
}
|
||
|
nedidx++;
|
||
|
}
|
||
|
if(i < len - 1) {
|
||
|
// See how many inserts there are before the next read
|
||
|
// character
|
||
|
size_t nedidx_next = nedidx;
|
||
|
while(nedidx_next < ned.size() && ned[nedidx_next].pos == i+1)
|
||
|
{
|
||
|
if(ned[nedidx_next].isReadGap()) {
|
||
|
right++;
|
||
|
}
|
||
|
nedidx_next++;
|
||
|
}
|
||
|
}
|
||
|
for(TRefOff j = left; j < right; j++) {
|
||
|
// Add to db
|
||
|
RedundantCell c(res.refid(), res.fw(), j, i);
|
||
|
ASSERT_ONLY(bool ret =) cells_[i].insert(c);
|
||
|
assert(ret);
|
||
|
}
|
||
|
left = right + diff - 1;
|
||
|
}
|
||
|
if(!res.fw()) {
|
||
|
const_cast<AlnRes&>(res).invertEdits();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Return true iff the given alignment has at least one cell that overlaps
|
||
|
* one of the cells in the database.
|
||
|
*/
|
||
|
bool RedundantAlns::overlap(const AlnRes& res) {
|
||
|
assert(!cells_.empty());
|
||
|
TRefOff left = res.refoff(), right;
|
||
|
const size_t len = res.readExtentRows();
|
||
|
if(!res.fw()) {
|
||
|
const_cast<AlnRes&>(res).invertEdits();
|
||
|
}
|
||
|
const EList<Edit>& ned = res.ned();
|
||
|
size_t nedidx = 0;
|
||
|
// For each row...
|
||
|
bool olap = false;
|
||
|
assert_leq(len, cells_.size());
|
||
|
for(size_t i = 0; i < len; i++) {
|
||
|
size_t diff = 1; // amount to shift to right for next round
|
||
|
right = left + 1;
|
||
|
while(nedidx < ned.size() && ned[nedidx].pos == i) {
|
||
|
if(ned[nedidx].isRefGap()) {
|
||
|
// Next my_left will be in same column as this round
|
||
|
diff = 0;
|
||
|
}
|
||
|
nedidx++;
|
||
|
}
|
||
|
if(i < len - 1) {
|
||
|
// See how many inserts there are before the next read
|
||
|
// character
|
||
|
size_t nedidx_next = nedidx;
|
||
|
while(nedidx_next < ned.size() && ned[nedidx_next].pos == i+1)
|
||
|
{
|
||
|
if(ned[nedidx_next].isReadGap()) {
|
||
|
right++;
|
||
|
}
|
||
|
nedidx_next++;
|
||
|
}
|
||
|
}
|
||
|
for(TRefOff j = left; j < right; j++) {
|
||
|
// Add to db
|
||
|
RedundantCell c(res.refid(), res.fw(), j, i);
|
||
|
if(cells_[i].contains(c)) {
|
||
|
olap = true;
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
if(olap) {
|
||
|
break;
|
||
|
}
|
||
|
left = right + diff - 1;
|
||
|
}
|
||
|
if(!res.fw()) {
|
||
|
const_cast<AlnRes&>(res).invertEdits();
|
||
|
}
|
||
|
return olap;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Given all the paired and unpaired results involving mates #1 and #2,
|
||
|
* calculate best and second-best scores for both mates. These are
|
||
|
* used for future MAPQ calculations.
|
||
|
*/
|
||
|
void AlnSetSumm::init(
|
||
|
const Read* rd1,
|
||
|
const Read* rd2,
|
||
|
const EList<AlnRes>* rs1,
|
||
|
const EList<AlnRes>* rs2,
|
||
|
const EList<AlnRes>* rs1u,
|
||
|
const EList<AlnRes>* rs2u,
|
||
|
bool exhausted1,
|
||
|
bool exhausted2,
|
||
|
TRefId orefid,
|
||
|
TRefOff orefoff,
|
||
|
bool repeat)
|
||
|
{
|
||
|
assert(rd1 != NULL || rd2 != NULL);
|
||
|
assert((rs1 == NULL) == (rs2 == NULL));
|
||
|
AlnScore best[2], secbest[2], bestPaired, secbestPaired;
|
||
|
size_t szs[2];
|
||
|
best[0].invalidate(); secbest[0].invalidate();
|
||
|
best[1].invalidate(); secbest[1].invalidate();
|
||
|
bestPaired.invalidate(); secbestPaired.invalidate();
|
||
|
bool paired = (rs1 != NULL && rs2 != NULL);
|
||
|
szs[0] = szs[1] = 0;
|
||
|
TNumAlns numAlns1 = 0, numAlns2 = 0, numAlnsPaired = 0;
|
||
|
if(paired) {
|
||
|
// Paired alignments
|
||
|
assert_eq(rs1->size(), rs2->size());
|
||
|
szs[0] = szs[1] = rs1->size();
|
||
|
assert_gt(szs[0], 0);
|
||
|
numAlnsPaired = szs[0];
|
||
|
for(size_t i = 0; i < rs1->size(); i++) {
|
||
|
AlnScore sc = (*rs1)[i].score() + (*rs2)[i].score();
|
||
|
if(sc > bestPaired) {
|
||
|
secbestPaired = bestPaired;
|
||
|
bestPaired = sc;
|
||
|
assert(VALID_AL_SCORE(bestPaired));
|
||
|
} else if(sc > secbestPaired) {
|
||
|
secbestPaired = sc;
|
||
|
assert(VALID_AL_SCORE(bestPaired));
|
||
|
assert(VALID_AL_SCORE(secbestPaired));
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
for(int j = 0; j < 2; j++) {
|
||
|
const EList<AlnRes>* rs = (j == 0 ? rs1u : rs2u);
|
||
|
if(rs == NULL) {
|
||
|
continue;
|
||
|
}
|
||
|
assert(rs != NULL);
|
||
|
szs[j] = rs->size();
|
||
|
if(j == 0) {
|
||
|
numAlns1 = szs[j];
|
||
|
} else {
|
||
|
numAlns2 = szs[j];
|
||
|
}
|
||
|
//assert_gt(szs[j], 0);
|
||
|
for(size_t i = 0; i < rs->size(); i++) {
|
||
|
AlnScore sc = (*rs)[i].score();
|
||
|
if(sc > best[j]) {
|
||
|
secbest[j] = best[j];
|
||
|
best[j] = sc;
|
||
|
assert(VALID_AL_SCORE(best[j]));
|
||
|
} else if(sc > secbest[j]) {
|
||
|
secbest[j] = sc;
|
||
|
assert(VALID_AL_SCORE(best[j]));
|
||
|
assert(VALID_AL_SCORE(secbest[j]));
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
if(szs[0] > 0 || szs[1] > 0) {
|
||
|
init(
|
||
|
best[0],
|
||
|
secbest[0],
|
||
|
best[1],
|
||
|
secbest[1],
|
||
|
bestPaired,
|
||
|
secbestPaired,
|
||
|
(szs[0] == 0) ? 0 : (szs[0] - 1),
|
||
|
(szs[1] == 0) ? 0 : (szs[1] - 1),
|
||
|
paired,
|
||
|
exhausted1,
|
||
|
exhausted2,
|
||
|
orefid,
|
||
|
orefoff,
|
||
|
repeat,
|
||
|
numAlns1,
|
||
|
numAlns2,
|
||
|
numAlnsPaired);
|
||
|
} else {
|
||
|
reset();
|
||
|
orefid_ = orefid;
|
||
|
orefoff_ = orefoff;
|
||
|
repeat_ = repeat;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Print out string representation of YF:i flag for indicating whether and
|
||
|
* why the mate was filtered.
|
||
|
*/
|
||
|
bool AlnFlags::printYF(BTString& o, bool first) const {
|
||
|
const char *flag = "";
|
||
|
if (!lenfilt_) flag = "LN";
|
||
|
else if(!nfilt_ ) flag = "NS";
|
||
|
else if(!scfilt_ ) flag = "SC";
|
||
|
else if(!qcfilt_ ) flag = "QC";
|
||
|
if(*flag > 0) {
|
||
|
if(!first) o.append('\t');
|
||
|
o.append("YF:Z:");
|
||
|
o.append(flag);
|
||
|
return false;
|
||
|
}
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
|
||
|
/**
|
||
|
* Print out string representation of YM:i flag for indicating with the
|
||
|
* mate per se aligned repetitively.
|
||
|
*/
|
||
|
void AlnFlags::printYM(BTString& o) const {
|
||
|
o.append("YM:i:");
|
||
|
o.append(maxed() ? '1' : '0');
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Print out string representation of YM:i flag for indicating with the
|
||
|
* pair containing the mate aligned repetitively.
|
||
|
*/
|
||
|
void AlnFlags::printYP(BTString& o) const {
|
||
|
o.append("YP:i:");
|
||
|
o.append(maxedPair() ? '1' : '0');
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Print out string representation of these flags.
|
||
|
*/
|
||
|
void AlnFlags::printYT(BTString& o) const {
|
||
|
o.append("YT:Z:");
|
||
|
if(alignedConcordant()) {
|
||
|
o.append("CP");
|
||
|
} else if(alignedDiscordant()) {
|
||
|
o.append("DP");
|
||
|
} else if(alignedUnpairedMate()) {
|
||
|
o.append("UP");
|
||
|
} else if(alignedUnpaired()) {
|
||
|
o.append("UU");
|
||
|
} else { throw 1; }
|
||
|
}
|
||
|
|
||
|
#ifdef ALIGNER_RESULT_MAIN
|
||
|
|
||
|
#include "mem_ids.h"
|
||
|
|
||
|
int main() {
|
||
|
EList<char> op;
|
||
|
EList<char> ch;
|
||
|
EList<size_t> run;
|
||
|
{
|
||
|
// On top of each other, same length
|
||
|
cerr << "Test case 1, simple overlap 1 ... ";
|
||
|
AlnRes res1;
|
||
|
res1.init(
|
||
|
10,
|
||
|
AlnScore(),
|
||
|
NULL,
|
||
|
NULL,
|
||
|
NULL,
|
||
|
Coord(0, 0, true),
|
||
|
false);
|
||
|
AlnRes res2;
|
||
|
res2.init(
|
||
|
10,
|
||
|
AlnScore(),
|
||
|
NULL,
|
||
|
NULL,
|
||
|
NULL,
|
||
|
Coord(0, 0, true),
|
||
|
false);
|
||
|
assert(res1.overlap(res2));
|
||
|
|
||
|
// Try again, but using the redundant-alignment database
|
||
|
RedundantAlns ra;
|
||
|
ra.reset();
|
||
|
ra.init(10);
|
||
|
ra.add(res1);
|
||
|
assert(ra.overlap(res1));
|
||
|
assert(ra.overlap(res2));
|
||
|
|
||
|
char buf1[1024];
|
||
|
res1.printCigar(false, false, false, op, run, NULL, buf1);
|
||
|
assert_eq(0, strcmp(buf1, "10M"));
|
||
|
res1.printCigar(false, false, true, op, run, NULL, buf1);
|
||
|
assert_eq(0, strcmp(buf1, "10="));
|
||
|
|
||
|
char buf2[1024];
|
||
|
res2.printCigar(false, false, false, op, run, NULL, buf2);
|
||
|
assert_eq(0, strcmp(buf2, "10M"));
|
||
|
res2.printCigar(false, false, true, op, run, NULL, buf2);
|
||
|
assert_eq(0, strcmp(buf2, "10="));
|
||
|
|
||
|
char buf3[1024];
|
||
|
res1.printMD(false, false, op, ch, run, NULL, buf3);
|
||
|
assert_eq(0, strcmp(buf3, "10"));
|
||
|
res1.printMD(false, true, op, ch, run, NULL, buf3);
|
||
|
assert_eq(0, strcmp(buf3, "8"));
|
||
|
|
||
|
char buf4[1024];
|
||
|
res2.printMD(false, false, op, ch, run, NULL, buf4);
|
||
|
assert_eq(0, strcmp(buf4, "10"));
|
||
|
res2.printMD(false, true, op, ch, run, NULL, buf4);
|
||
|
assert_eq(0, strcmp(buf4, "8"));
|
||
|
|
||
|
cerr << "PASSED" << endl;
|
||
|
}
|
||
|
|
||
|
{
|
||
|
// On top of each other, different lengths
|
||
|
cerr << "Test case 2, simple overlap 2 ... ";
|
||
|
AlnRes res1;
|
||
|
res1.init(
|
||
|
10,
|
||
|
AlnScore(),
|
||
|
NULL,
|
||
|
NULL,
|
||
|
NULL,
|
||
|
Coord(0, 0, true),
|
||
|
false);
|
||
|
AlnRes res2;
|
||
|
res2.init(
|
||
|
11,
|
||
|
AlnScore(),
|
||
|
NULL,
|
||
|
NULL,
|
||
|
NULL,
|
||
|
Coord(0, 0, true),
|
||
|
false);
|
||
|
assert(res1.overlap(res2));
|
||
|
|
||
|
// Try again, but using the redundant-alignment database
|
||
|
RedundantAlns ra;
|
||
|
ra.reset();
|
||
|
ra.init(11);
|
||
|
ra.add(res1);
|
||
|
assert(ra.overlap(res1));
|
||
|
assert(ra.overlap(res2));
|
||
|
|
||
|
char buf1[1024];
|
||
|
res1.printCigar(false, false, false, op, run, NULL, buf1);
|
||
|
assert_eq(0, strcmp(buf1, "10M"));
|
||
|
res1.printCigar(false, false, true, op, run, NULL, buf1);
|
||
|
assert_eq(0, strcmp(buf1, "10="));
|
||
|
|
||
|
char buf2[1024];
|
||
|
res2.printCigar(false, false, false, op, run, NULL, buf2);
|
||
|
assert_eq(0, strcmp(buf2, "11M"));
|
||
|
res2.printCigar(false, false, true, op, run, NULL, buf2);
|
||
|
assert_eq(0, strcmp(buf2, "11="));
|
||
|
|
||
|
char buf3[1024];
|
||
|
res1.printMD(false, false, op, ch, run, NULL, buf3);
|
||
|
assert_eq(0, strcmp(buf3, "10"));
|
||
|
res1.printMD(false, true, op, ch, run, NULL, buf3);
|
||
|
assert_eq(0, strcmp(buf3, "8"));
|
||
|
|
||
|
char buf4[1024];
|
||
|
res2.printMD(false, false, op, ch, run, NULL, buf4);
|
||
|
assert_eq(0, strcmp(buf4, "11"));
|
||
|
res2.printMD(false, true, op, ch, run, NULL, buf4);
|
||
|
assert_eq(0, strcmp(buf4, "9"));
|
||
|
|
||
|
cerr << "PASSED" << endl;
|
||
|
}
|
||
|
|
||
|
{
|
||
|
// Different references
|
||
|
cerr << "Test case 3, simple overlap 3 ... ";
|
||
|
AlnRes res1;
|
||
|
res1.init(
|
||
|
10,
|
||
|
AlnScore(),
|
||
|
NULL,
|
||
|
NULL,
|
||
|
NULL,
|
||
|
Coord(0, 1, true),
|
||
|
false);
|
||
|
AlnRes res2;
|
||
|
res2.init(
|
||
|
11,
|
||
|
AlnScore(),
|
||
|
NULL,
|
||
|
NULL,
|
||
|
NULL,
|
||
|
Coord(0, 0, true),
|
||
|
false);
|
||
|
assert(!res1.overlap(res2));
|
||
|
|
||
|
// Try again, but using the redundant-alignment database
|
||
|
RedundantAlns ra;
|
||
|
ra.reset();
|
||
|
ra.init(11);
|
||
|
ra.add(res1);
|
||
|
assert(ra.overlap(res1));
|
||
|
assert(!ra.overlap(res2));
|
||
|
|
||
|
cerr << "PASSED" << endl;
|
||
|
}
|
||
|
|
||
|
{
|
||
|
// Different references
|
||
|
cerr << "Test case 4, simple overlap 4 ... ";
|
||
|
AlnRes res1;
|
||
|
res1.init(
|
||
|
10,
|
||
|
AlnScore(),
|
||
|
NULL,
|
||
|
NULL,
|
||
|
NULL,
|
||
|
Coord(0, 0, true),
|
||
|
false);
|
||
|
AlnRes res2;
|
||
|
res2.init(
|
||
|
10,
|
||
|
AlnScore(),
|
||
|
NULL,
|
||
|
NULL,
|
||
|
NULL,
|
||
|
Coord(1, 0, true),
|
||
|
false);
|
||
|
assert(!res1.overlap(res2));
|
||
|
|
||
|
// Try again, but using the redundant-alignment database
|
||
|
RedundantAlns ra;
|
||
|
ra.reset();
|
||
|
ra.init(10);
|
||
|
ra.add(res1);
|
||
|
assert(ra.overlap(res1));
|
||
|
assert(!ra.overlap(res2));
|
||
|
|
||
|
cerr << "PASSED" << endl;
|
||
|
}
|
||
|
|
||
|
{
|
||
|
// Different strands
|
||
|
cerr << "Test case 5, simple overlap 5 ... ";
|
||
|
AlnRes res1;
|
||
|
res1.init(
|
||
|
10,
|
||
|
AlnScore(),
|
||
|
NULL,
|
||
|
NULL,
|
||
|
NULL,
|
||
|
Coord(0, 0, true),
|
||
|
false);
|
||
|
AlnRes res2;
|
||
|
res2.init(
|
||
|
10,
|
||
|
AlnScore(),
|
||
|
NULL,
|
||
|
NULL,
|
||
|
NULL,
|
||
|
Coord(0, 0, false),
|
||
|
false);
|
||
|
assert(!res1.overlap(res2));
|
||
|
|
||
|
// Try again, but using the redundant-alignment database
|
||
|
RedundantAlns ra;
|
||
|
ra.reset();
|
||
|
ra.init(10);
|
||
|
ra.add(res1);
|
||
|
assert(ra.overlap(res1));
|
||
|
assert(!ra.overlap(res2));
|
||
|
|
||
|
cerr << "PASSED" << endl;
|
||
|
}
|
||
|
|
||
|
{
|
||
|
// Different strands
|
||
|
cerr << "Test case 6, simple overlap 6 ... ";
|
||
|
EList<Edit> ned1(RES_CAT);
|
||
|
ned1.expand();
|
||
|
// 1 step to the right in the middle of the alignment
|
||
|
ned1.back().init(5, 'A' /*chr*/, '-' /*qchr*/, EDIT_TYPE_READ_GAP);
|
||
|
AlnRes res1;
|
||
|
res1.init(
|
||
|
10,
|
||
|
AlnScore(),
|
||
|
&ned1,
|
||
|
NULL,
|
||
|
NULL,
|
||
|
Coord(0, 5, false),
|
||
|
false);
|
||
|
AlnRes res2;
|
||
|
res2.init(
|
||
|
10,
|
||
|
AlnScore(),
|
||
|
NULL,
|
||
|
NULL,
|
||
|
NULL,
|
||
|
Coord(0, 6, false),
|
||
|
false);
|
||
|
assert(res1.overlap(res2));
|
||
|
|
||
|
// Try again, but using the redundant-alignment database
|
||
|
RedundantAlns ra;
|
||
|
ra.reset();
|
||
|
ra.init(10);
|
||
|
ra.add(res1);
|
||
|
assert(ra.overlap(res1));
|
||
|
assert(ra.overlap(res2));
|
||
|
|
||
|
char buf1[1024];
|
||
|
res1.printCigar(false, false, false, op, run, NULL, buf1);
|
||
|
assert_eq(0, strcmp(buf1, "5M1D5M"));
|
||
|
res1.printCigar(false, false, true, op, run, NULL, buf1);
|
||
|
assert_eq(0, strcmp(buf1, "5=1D5="));
|
||
|
|
||
|
char buf2[1024];
|
||
|
res2.printCigar(false, false, false, op, run, NULL, buf2);
|
||
|
assert_eq(0, strcmp(buf2, "10M"));
|
||
|
res2.printCigar(false, false, true, op, run, NULL, buf2);
|
||
|
assert_eq(0, strcmp(buf2, "10="));
|
||
|
|
||
|
char buf3[1024];
|
||
|
res1.printMD(false, false, op, ch, run, NULL, buf3);
|
||
|
assert_eq(0, strcmp(buf3, "5^A5"));
|
||
|
res1.printMD(false, true, op, ch, run, NULL, buf3);
|
||
|
assert_eq(0, strcmp(buf3, "4^A4"));
|
||
|
|
||
|
char buf4[1024];
|
||
|
res2.printMD(false, false, op, ch, run, NULL, buf4);
|
||
|
assert_eq(0, strcmp(buf4, "10"));
|
||
|
res2.printMD(false, true, op, ch, run, NULL, buf4);
|
||
|
assert_eq(0, strcmp(buf4, "8"));
|
||
|
|
||
|
cerr << "PASSED" << endl;
|
||
|
}
|
||
|
|
||
|
{
|
||
|
// Different strands
|
||
|
cerr << "Test case 7, simple overlap 7 ... ";
|
||
|
EList<Edit> ned1(RES_CAT);
|
||
|
// 3 steps to the right in the middle of the alignment
|
||
|
ned1.push_back(Edit(5, 'A', '-', EDIT_TYPE_READ_GAP));
|
||
|
ned1.push_back(Edit(5, 'C', '-', EDIT_TYPE_READ_GAP));
|
||
|
ned1.push_back(Edit(5, 'G', '-', EDIT_TYPE_READ_GAP));
|
||
|
AlnRes res1;
|
||
|
res1.init(
|
||
|
10,
|
||
|
AlnScore(),
|
||
|
&ned1,
|
||
|
NULL,
|
||
|
NULL,
|
||
|
Coord(0, 5, false),
|
||
|
false);
|
||
|
AlnRes res2;
|
||
|
res2.init(
|
||
|
10,
|
||
|
AlnScore(),
|
||
|
NULL,
|
||
|
NULL,
|
||
|
NULL,
|
||
|
Coord(0, 6, false),
|
||
|
false);
|
||
|
assert(res1.overlap(res2));
|
||
|
|
||
|
// Try again, but using the redundant-alignment database
|
||
|
RedundantAlns ra;
|
||
|
ra.reset();
|
||
|
ra.init(10);
|
||
|
ra.add(res1);
|
||
|
assert(ra.overlap(res1));
|
||
|
assert(ra.overlap(res2));
|
||
|
|
||
|
char buf1[1024];
|
||
|
res1.printCigar(false, false, false, op, run, NULL, buf1);
|
||
|
assert_eq(0, strcmp(buf1, "5M3D5M"));
|
||
|
res1.printCigar(false, false, true, op, run, NULL, buf1);
|
||
|
assert_eq(0, strcmp(buf1, "5=3D5="));
|
||
|
|
||
|
char buf2[1024];
|
||
|
res2.printCigar(false, false, false, op, run, NULL, buf2);
|
||
|
assert_eq(0, strcmp(buf2, "10M"));
|
||
|
res2.printCigar(false, false, true, op, run, NULL, buf2);
|
||
|
assert_eq(0, strcmp(buf2, "10="));
|
||
|
|
||
|
char buf3[1024];
|
||
|
res1.printMD(false, false, op, ch, run, NULL, buf3);
|
||
|
assert_eq(0, strcmp(buf3, "5^GCA5"));
|
||
|
res1.printMD(false, true, op, ch, run, NULL, buf3);
|
||
|
assert_eq(0, strcmp(buf3, "4^GCA4"));
|
||
|
|
||
|
char buf4[1024];
|
||
|
res2.printMD(false, false, op, ch, run, NULL, buf4);
|
||
|
assert_eq(0, strcmp(buf4, "10"));
|
||
|
res2.printMD(false, true, op, ch, run, NULL, buf4);
|
||
|
assert_eq(0, strcmp(buf4, "8"));
|
||
|
|
||
|
cerr << "PASSED" << endl;
|
||
|
}
|
||
|
|
||
|
{
|
||
|
// Both with horizontal movements; overlap
|
||
|
cerr << "Test case 8, simple overlap 8 ... ";
|
||
|
EList<Edit> ned1(RES_CAT);
|
||
|
// 2 steps to the right in the middle of the alignment
|
||
|
ned1.push_back(Edit(5, 'A', '-', EDIT_TYPE_READ_GAP));
|
||
|
ned1.push_back(Edit(5, 'C', '-', EDIT_TYPE_READ_GAP));
|
||
|
AlnRes res1;
|
||
|
res1.init(
|
||
|
10,
|
||
|
AlnScore(),
|
||
|
&ned1,
|
||
|
NULL,
|
||
|
NULL,
|
||
|
Coord(0, 5, false),
|
||
|
false);
|
||
|
EList<Edit> ned2(RES_CAT);
|
||
|
// 2 steps to the right in the middle of the alignment
|
||
|
ned2.push_back(Edit(5, 'A', '-', EDIT_TYPE_READ_GAP));
|
||
|
ned2.push_back(Edit(5, 'C', '-', EDIT_TYPE_READ_GAP));
|
||
|
AlnRes res2;
|
||
|
res2.init(
|
||
|
10,
|
||
|
AlnScore(),
|
||
|
&ned2,
|
||
|
NULL,
|
||
|
NULL,
|
||
|
Coord(0, 6, false),
|
||
|
false);
|
||
|
assert(res1.overlap(res2));
|
||
|
|
||
|
// Try again, but using the redundant-alignment database
|
||
|
RedundantAlns ra;
|
||
|
ra.reset();
|
||
|
ra.init(10);
|
||
|
ra.add(res1);
|
||
|
assert(ra.overlap(res1));
|
||
|
assert(ra.overlap(res2));
|
||
|
|
||
|
char buf1[1024];
|
||
|
res1.printCigar(false, false, false, op, run, NULL, buf1);
|
||
|
assert_eq(0, strcmp(buf1, "5M2D5M"));
|
||
|
res1.printCigar(false, false, true, op, run, NULL, buf1);
|
||
|
assert_eq(0, strcmp(buf1, "5=2D5="));
|
||
|
|
||
|
char buf2[1024];
|
||
|
res2.printCigar(false, false, false, op, run, NULL, buf2);
|
||
|
assert_eq(0, strcmp(buf2, "5M2D5M"));
|
||
|
res2.printCigar(false, false, true, op, run, NULL, buf2);
|
||
|
assert_eq(0, strcmp(buf2, "5=2D5="));
|
||
|
|
||
|
cerr << "PASSED" << endl;
|
||
|
}
|
||
|
|
||
|
{
|
||
|
// Both with horizontal movements; no overlap
|
||
|
cerr << "Test case 9, simple overlap 9 ... ";
|
||
|
EList<Edit> ned1(RES_CAT);
|
||
|
// 2 steps to the right in the middle of the alignment
|
||
|
ned1.push_back(Edit(6, 'A', '-', EDIT_TYPE_READ_GAP));
|
||
|
ned1.push_back(Edit(6, 'C', '-', EDIT_TYPE_READ_GAP));
|
||
|
AlnRes res1;
|
||
|
res1.init(
|
||
|
10,
|
||
|
AlnScore(),
|
||
|
&ned1,
|
||
|
NULL,
|
||
|
NULL,
|
||
|
Coord(0, 5, true),
|
||
|
false);
|
||
|
EList<Edit> ned2(RES_CAT);
|
||
|
// 2 steps to the right in the middle of the alignment
|
||
|
ned2.push_back(Edit(5, 'A', '-', EDIT_TYPE_READ_GAP));
|
||
|
ned2.push_back(Edit(5, 'C', '-', EDIT_TYPE_READ_GAP));
|
||
|
AlnRes res2;
|
||
|
res2.init(
|
||
|
10,
|
||
|
AlnScore(),
|
||
|
&ned2,
|
||
|
NULL,
|
||
|
NULL,
|
||
|
Coord(0, 6, true),
|
||
|
false);
|
||
|
assert(!res1.overlap(res2));
|
||
|
|
||
|
// Try again, but using the redundant-alignment database
|
||
|
RedundantAlns ra;
|
||
|
ra.reset();
|
||
|
ra.init(10);
|
||
|
ra.add(res1);
|
||
|
assert(ra.overlap(res1));
|
||
|
assert(!ra.overlap(res2));
|
||
|
|
||
|
char buf1[1024];
|
||
|
res1.printCigar(false, false, false, op, run, NULL, buf1);
|
||
|
assert_eq(0, strcmp(buf1, "6M2D4M"));
|
||
|
res1.printCigar(false, false, true, op, run, NULL, buf1);
|
||
|
assert_eq(0, strcmp(buf1, "6=2D4="));
|
||
|
|
||
|
char buf2[1024];
|
||
|
res2.printCigar(false, false, false, op, run, NULL, buf2);
|
||
|
assert_eq(0, strcmp(buf2, "5M2D5M"));
|
||
|
res2.printCigar(false, false, true, op, run, NULL, buf2);
|
||
|
assert_eq(0, strcmp(buf2, "5=2D5="));
|
||
|
|
||
|
cerr << "PASSED" << endl;
|
||
|
}
|
||
|
|
||
|
{
|
||
|
// Both with horizontal movements; no overlap. Reverse strand.
|
||
|
cerr << "Test case 10, simple overlap 10 ... ";
|
||
|
EList<Edit> ned1(RES_CAT);
|
||
|
// 2 steps to the right in the middle of the alignment
|
||
|
ned1.push_back(Edit(5, 'A', '-', EDIT_TYPE_READ_GAP));
|
||
|
ned1.push_back(Edit(5, 'C', '-', EDIT_TYPE_READ_GAP));
|
||
|
AlnRes res1;
|
||
|
res1.init(
|
||
|
10,
|
||
|
AlnScore(),
|
||
|
&ned1,
|
||
|
NULL,
|
||
|
NULL,
|
||
|
Coord(0, 5, false),
|
||
|
false);
|
||
|
EList<Edit> ned2(RES_CAT);
|
||
|
// 2 steps to the right in the middle of the alignment
|
||
|
ned2.push_back(Edit(6, 'A', '-', EDIT_TYPE_READ_GAP));
|
||
|
ned2.push_back(Edit(6, 'C', '-', EDIT_TYPE_READ_GAP));
|
||
|
AlnRes res2;
|
||
|
res2.init(
|
||
|
10,
|
||
|
AlnScore(),
|
||
|
&ned2,
|
||
|
NULL,
|
||
|
NULL,
|
||
|
Coord(0, 6, false),
|
||
|
false);
|
||
|
assert(!res1.overlap(res2));
|
||
|
|
||
|
// Try again, but using the redundant-alignment database
|
||
|
RedundantAlns ra;
|
||
|
ra.reset();
|
||
|
ra.init(10);
|
||
|
ra.add(res1);
|
||
|
assert(ra.overlap(res1));
|
||
|
assert(!ra.overlap(res2));
|
||
|
|
||
|
char buf1[1024];
|
||
|
res1.printCigar(false, false, false, op, run, NULL, buf1);
|
||
|
assert_eq(0, strcmp(buf1, "5M2D5M"));
|
||
|
res1.printCigar(false, false, true, op, run, NULL, buf1);
|
||
|
assert_eq(0, strcmp(buf1, "5=2D5="));
|
||
|
|
||
|
char buf2[1024];
|
||
|
res2.printCigar(false, false, false, op, run, NULL, buf2);
|
||
|
assert_eq(0, strcmp(buf2, "4M2D6M"));
|
||
|
res2.printCigar(false, false, true, op, run, NULL, buf2);
|
||
|
assert_eq(0, strcmp(buf2, "4=2D6="));
|
||
|
|
||
|
cerr << "PASSED" << endl;
|
||
|
}
|
||
|
|
||
|
{
|
||
|
// Both with vertical movements; no overlap
|
||
|
cerr << "Test case 11, simple overlap 11 ... ";
|
||
|
EList<Edit> ned1(RES_CAT);
|
||
|
// 2 steps to the right in the middle of the alignment
|
||
|
ned1.push_back(Edit(5, '-', 'A', EDIT_TYPE_REF_GAP));
|
||
|
ned1.push_back(Edit(6, '-', 'C', EDIT_TYPE_REF_GAP));
|
||
|
AlnRes res1;
|
||
|
res1.init(
|
||
|
10,
|
||
|
AlnScore(),
|
||
|
&ned1,
|
||
|
NULL,
|
||
|
NULL,
|
||
|
Coord(0, 5, true),
|
||
|
false);
|
||
|
EList<Edit> ned2(RES_CAT);
|
||
|
// 2 steps to the right in the middle of the alignment
|
||
|
ned2.push_back(Edit(6, '-', 'A', EDIT_TYPE_REF_GAP));
|
||
|
ned2.push_back(Edit(7, '-', 'C', EDIT_TYPE_REF_GAP));
|
||
|
AlnRes res2;
|
||
|
res2.init(
|
||
|
10,
|
||
|
AlnScore(),
|
||
|
&ned2,
|
||
|
NULL,
|
||
|
NULL,
|
||
|
Coord(0, 6, true),
|
||
|
false);
|
||
|
assert(!res1.overlap(res2));
|
||
|
|
||
|
// Try again, but using the redundant-alignment database
|
||
|
RedundantAlns ra;
|
||
|
ra.reset();
|
||
|
ra.init(10);
|
||
|
ra.add(res1);
|
||
|
assert(ra.overlap(res1));
|
||
|
assert(!ra.overlap(res2));
|
||
|
|
||
|
char buf1[1024];
|
||
|
res1.printCigar(false, false, false, op, run, NULL, buf1);
|
||
|
assert_eq(0, strcmp(buf1, "5M2I3M"));
|
||
|
res1.printCigar(false, false, true, op, run, NULL, buf1);
|
||
|
assert_eq(0, strcmp(buf1, "5=2I3="));
|
||
|
|
||
|
char buf2[1024];
|
||
|
res2.printCigar(false, false, false, op, run, NULL, buf2);
|
||
|
assert_eq(0, strcmp(buf2, "6M2I2M"));
|
||
|
res2.printCigar(false, false, true, op, run, NULL, buf2);
|
||
|
assert_eq(0, strcmp(buf2, "6=2I2="));
|
||
|
|
||
|
cerr << "PASSED" << endl;
|
||
|
}
|
||
|
|
||
|
{
|
||
|
// Both with vertical movements; no overlap
|
||
|
cerr << "Test case 12, simple overlap 12 ... ";
|
||
|
EList<Edit> ned1(RES_CAT);
|
||
|
// 2 steps to the right in the middle of the alignment
|
||
|
ned1.push_back(Edit(5, '-', 'A', EDIT_TYPE_REF_GAP));
|
||
|
ned1.push_back(Edit(6, '-', 'C', EDIT_TYPE_REF_GAP));
|
||
|
AlnRes res1;
|
||
|
res1.init(
|
||
|
10,
|
||
|
AlnScore(),
|
||
|
&ned1,
|
||
|
NULL,
|
||
|
NULL,
|
||
|
Coord(0, 5, true),
|
||
|
false);
|
||
|
EList<Edit> ned2(RES_CAT);
|
||
|
// 2 steps to the right in the middle of the alignment
|
||
|
ned2.push_back(Edit(5, '-', 'A', EDIT_TYPE_REF_GAP));
|
||
|
ned2.push_back(Edit(6, '-', 'C', EDIT_TYPE_REF_GAP));
|
||
|
AlnRes res2;
|
||
|
res2.init(
|
||
|
10,
|
||
|
AlnScore(),
|
||
|
&ned2,
|
||
|
NULL,
|
||
|
NULL,
|
||
|
Coord(0, 6, true),
|
||
|
false);
|
||
|
assert(!res1.overlap(res2));
|
||
|
|
||
|
// Try again, but using the redundant-alignment database
|
||
|
RedundantAlns ra;
|
||
|
ra.reset();
|
||
|
ra.init(10);
|
||
|
ra.add(res1);
|
||
|
assert(ra.overlap(res1));
|
||
|
assert(!ra.overlap(res2));
|
||
|
|
||
|
char buf1[1024];
|
||
|
res1.printCigar(false, false, false, op, run, NULL, buf1);
|
||
|
assert_eq(0, strcmp(buf1, "5M2I3M"));
|
||
|
res1.printCigar(false, false, true, op, run, NULL, buf1);
|
||
|
assert_eq(0, strcmp(buf1, "5=2I3="));
|
||
|
|
||
|
char buf2[1024];
|
||
|
res2.printCigar(false, false, false, op, run, NULL, buf2);
|
||
|
assert_eq(0, strcmp(buf2, "5M2I3M"));
|
||
|
res2.printCigar(false, false, true, op, run, NULL, buf2);
|
||
|
assert_eq(0, strcmp(buf2, "5=2I3="));
|
||
|
|
||
|
cerr << "PASSED" << endl;
|
||
|
}
|
||
|
|
||
|
{
|
||
|
// Both with vertical movements; overlap
|
||
|
cerr << "Test case 13, simple overlap 13 ... ";
|
||
|
EList<Edit> ned1(RES_CAT);
|
||
|
// 2 steps to the right in the middle of the alignment
|
||
|
ned1.push_back(Edit(5, '-', 'A', EDIT_TYPE_REF_GAP));
|
||
|
ned1.push_back(Edit(6, '-', 'C', EDIT_TYPE_REF_GAP));
|
||
|
AlnRes res1;
|
||
|
res1.init(
|
||
|
10,
|
||
|
AlnScore(),
|
||
|
&ned1,
|
||
|
NULL,
|
||
|
NULL,
|
||
|
Coord(0, 5, true),
|
||
|
false);
|
||
|
EList<Edit> ned2(RES_CAT);
|
||
|
// 2 steps to the right in the middle of the alignment
|
||
|
ned2.push_back(Edit(4, '-', 'A', EDIT_TYPE_REF_GAP));
|
||
|
ned2.push_back(Edit(5, '-', 'C', EDIT_TYPE_REF_GAP));
|
||
|
AlnRes res2;
|
||
|
res2.init(
|
||
|
10,
|
||
|
AlnScore(),
|
||
|
&ned2,
|
||
|
NULL,
|
||
|
NULL,
|
||
|
Coord(0, 6, true),
|
||
|
false);
|
||
|
assert(res1.overlap(res2));
|
||
|
|
||
|
// Try again, but using the redundant-alignment database
|
||
|
RedundantAlns ra;
|
||
|
ra.reset();
|
||
|
ra.init(10);
|
||
|
ra.add(res1);
|
||
|
assert(ra.overlap(res1));
|
||
|
assert(ra.overlap(res2));
|
||
|
|
||
|
char buf1[1024];
|
||
|
res1.printCigar(false, false, false, op, run, NULL, buf1);
|
||
|
assert_eq(0, strcmp(buf1, "5M2I3M"));
|
||
|
res1.printCigar(false, false, true, op, run, NULL, buf1);
|
||
|
assert_eq(0, strcmp(buf1, "5=2I3="));
|
||
|
|
||
|
char buf2[1024];
|
||
|
res2.printCigar(false, false, false, op, run, NULL, buf2);
|
||
|
assert_eq(0, strcmp(buf2, "4M2I4M"));
|
||
|
res2.printCigar(false, false, true, op, run, NULL, buf2);
|
||
|
assert_eq(0, strcmp(buf2, "4=2I4="));
|
||
|
|
||
|
cerr << "PASSED" << endl;
|
||
|
}
|
||
|
|
||
|
{
|
||
|
// Not even close
|
||
|
cerr << "Test case 14, simple overlap 14 ... ";
|
||
|
EList<Edit> ned1(RES_CAT);
|
||
|
// 2 steps to the right in the middle of the alignment
|
||
|
ned1.push_back(Edit(5, '-', 'A', EDIT_TYPE_REF_GAP));
|
||
|
ned1.push_back(Edit(6, '-', 'C', EDIT_TYPE_REF_GAP));
|
||
|
AlnRes res1;
|
||
|
res1.init(
|
||
|
10,
|
||
|
AlnScore(),
|
||
|
&ned1,
|
||
|
NULL,
|
||
|
NULL,
|
||
|
Coord(0, 5, true),
|
||
|
false);
|
||
|
EList<Edit> ned2(RES_CAT);
|
||
|
// 2 steps to the right in the middle of the alignment
|
||
|
ned2.push_back(Edit(4, '-', 'A', EDIT_TYPE_REF_GAP));
|
||
|
ned2.push_back(Edit(5, '-', 'C', EDIT_TYPE_REF_GAP));
|
||
|
AlnRes res2;
|
||
|
res2.init(
|
||
|
10,
|
||
|
AlnScore(),
|
||
|
&ned2,
|
||
|
NULL,
|
||
|
NULL,
|
||
|
Coord(0, 400, true),
|
||
|
false);
|
||
|
assert(!res1.overlap(res2));
|
||
|
|
||
|
// Try again, but using the redundant-alignment database
|
||
|
RedundantAlns ra;
|
||
|
ra.reset();
|
||
|
ra.init(10);
|
||
|
ra.add(res1);
|
||
|
assert(ra.overlap(res1));
|
||
|
assert(!ra.overlap(res2));
|
||
|
|
||
|
char buf1[1024];
|
||
|
res1.printCigar(false, false, false, op, run, NULL, buf1);
|
||
|
assert_eq(0, strcmp(buf1, "5M2I3M"));
|
||
|
res1.printCigar(false, false, true, op, run, NULL, buf1);
|
||
|
assert_eq(0, strcmp(buf1, "5=2I3="));
|
||
|
|
||
|
char buf2[1024];
|
||
|
res2.printCigar(false, false, false, op, run, NULL, buf2);
|
||
|
assert_eq(0, strcmp(buf2, "4M2I4M"));
|
||
|
res2.printCigar(false, false, true, op, run, NULL, buf2);
|
||
|
assert_eq(0, strcmp(buf2, "4=2I4="));
|
||
|
|
||
|
cerr << "PASSED" << endl;
|
||
|
}
|
||
|
|
||
|
{
|
||
|
cerr << "Test case 15, CIGAR string with mismatches ... ";
|
||
|
EList<Edit> ned(RES_CAT);
|
||
|
// 2 steps to the right in the middle of the alignment
|
||
|
ned.push_back(Edit(0, 'C', 'A', EDIT_TYPE_MM));
|
||
|
ned.push_back(Edit(4, '-', 'C', EDIT_TYPE_REF_GAP));
|
||
|
ned.push_back(Edit(6, '-', 'C', EDIT_TYPE_REF_GAP));
|
||
|
ned.push_back(Edit(7, '-', 'C', EDIT_TYPE_REF_GAP));
|
||
|
ned.push_back(Edit(9, '-', 'A', EDIT_TYPE_READ_GAP));
|
||
|
ned.push_back(Edit(9, '-', 'A', EDIT_TYPE_READ_GAP));
|
||
|
ned.push_back(Edit(9, '-', 'A', EDIT_TYPE_READ_GAP));
|
||
|
ned.push_back(Edit(9, '-', 'A', EDIT_TYPE_READ_GAP));
|
||
|
ned.push_back(Edit(10, '-', 'A', EDIT_TYPE_MM));
|
||
|
AlnRes res; res.init(
|
||
|
11,
|
||
|
AlnScore(),
|
||
|
&ned,
|
||
|
NULL,
|
||
|
NULL,
|
||
|
Coord(0, 44, true),
|
||
|
false);
|
||
|
char buf[1024];
|
||
|
res.printCigar(false, false, false, op, run, NULL, buf);
|
||
|
assert_eq(0, strcmp(buf, "4M1I1M2I1M4D2M"));
|
||
|
res.printCigar(false, false, true, op, run, NULL, buf);
|
||
|
assert_eq(0, strcmp(buf, "1X3=1I1=2I1=4D1=1X"));
|
||
|
cerr << "PASSED" << endl;
|
||
|
}
|
||
|
|
||
|
{
|
||
|
cerr << "Test case 17, Overhang ... ";
|
||
|
EList<Edit> ned(RES_CAT);
|
||
|
// 2 steps to the right in the middle of the alignment
|
||
|
ned.push_back(Edit(0, 'N', 'A', EDIT_TYPE_MM));
|
||
|
ned.push_back(Edit(5, 'C', 'A', EDIT_TYPE_MM));
|
||
|
AlnRes res; res.init(
|
||
|
10,
|
||
|
AlnScore(),
|
||
|
&ned,
|
||
|
NULL,
|
||
|
NULL,
|
||
|
Coord(0, -1, true),
|
||
|
false);
|
||
|
|
||
|
char buf[1024];
|
||
|
res.printCigar(false, false, false, op, run, NULL, buf);
|
||
|
assert_eq(0, strcmp(buf, "10M"));
|
||
|
res.printCigar(false, false, true, op, run, NULL, buf);
|
||
|
assert_eq(0, strcmp(buf, "1X4=1X4="));
|
||
|
res.printMD(false, false, op, ch, run, NULL, buf);
|
||
|
assert_eq(0, strcmp(buf, "0N4C4"));
|
||
|
|
||
|
#if 0
|
||
|
AlnRes res2(res);
|
||
|
// Now soft-clip away the overhang
|
||
|
res2.clipOutside(
|
||
|
true, // soft clip
|
||
|
0, // ref begins
|
||
|
40); // ref ends (excl)
|
||
|
res2.printCigar(false, false, false, op, run, NULL, buf);
|
||
|
assert_eq(0, strcmp(buf, "1S9M"));
|
||
|
res2.printCigar(false, false, true, op, run, NULL, buf);
|
||
|
assert_eq(0, strcmp(buf, "4=1X4="));
|
||
|
res2.printMD(false, false, op, ch, run, NULL, buf);
|
||
|
assert_eq(0, strcmp(buf, "4C4"));
|
||
|
|
||
|
AlnRes res3 = res;
|
||
|
// Now hard-clip away the overhang
|
||
|
res3.clipOutside(
|
||
|
false, // hard clip
|
||
|
0, // ref begins
|
||
|
40); // ref ends (excl)
|
||
|
res3.printCigar(false, false, false, op, run, NULL, buf);
|
||
|
assert_eq(0, strcmp(buf, "9M"));
|
||
|
res3.printCigar(false, false, true, op, run, NULL, buf);
|
||
|
assert_eq(0, strcmp(buf, "4=1X4="));
|
||
|
res3.printMD(false, false, op, ch, run, NULL, buf);
|
||
|
assert_eq(0, strcmp(buf, "4C4"));
|
||
|
#endif
|
||
|
|
||
|
cerr << "PASSED" << endl;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
#endif /*def ALIGNER_RESULT_MAIN*/
|