hisat-3n/hi_aligner.h
2025-01-18 21:09:52 +08:00

7007 lines
292 KiB
C++

/*
* Copyright 2015, Daehwan Kim <infphilo@gmail.com>
*
* This file is part of HISAT 2.
* This file is edited by Yun (Leo) Zhang for HISAT-3N.
*
* HISAT 2 is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* HISAT 2 is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with HISAT 2. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef HI_ALIGNER_H_
#define HI_ALIGNER_H_
#include <iostream>
#include <utility>
#include <limits>
#include "qual.h"
#include "ds.h"
#include "sstring.h"
#include "alphabet.h"
#include "edit.h"
#include "read.h"
// Threading is necessary to synchronize the classes that dump
// intermediate alignment results to files. Otherwise, all data herein
// is constant and shared, or per-thread.
#include "threading.h"
#include "aligner_result.h"
#include "aligner_cache.h"
#include "scoring.h"
#include "mem_ids.h"
#include "simple_func.h"
#include "aligner_driver.h"
#include "aligner_sw_driver.h"
#include "group_walk.h"
#include "tp.h"
#include "gp.h"
/**
 * Allow longer introns for long anchored reads involving canonical splice sites.
 *
 * @param anchor        anchor length
 * @param minAnchorLen  minimum anchor length; shorter anchors get no intron
 * @return 0 when anchor < minAnchorLen; otherwise 2^s, where
 *         s = 2 * max(anchor, 2) - 4 clamped to the range [13, 30].
 */
inline uint32_t MaxIntronLen(uint32_t anchor, uint32_t minAnchorLen) {
    if(anchor < minAnchorLen) {
        return 0;
    }
    // Keep the subtraction below from wrapping around.
    if(anchor < 2) anchor = 2;
    uint32_t shift = (anchor << 1) - 4;
    // Explicitly qualified so this header does not depend on a
    // using-directive supplied by some other include.
    shift = std::min<uint32_t>(std::max<uint32_t>(shift, 13), 30);
    // Shift an unsigned one so the whole computation stays unsigned.
    return (uint32_t)1 << shift;
}
/**
 * Ratio of an intron length to an expected intron length, capped at 1.0.
 *
 * The expected length is maxIntronLen, except that for anchor < 14 it is
 * min(2^(2 * anchor + 4), maxIntronLen).
 */
inline float intronLen_prob(uint32_t anchor, uint32_t intronLen, uint32_t maxIntronLen) {
    uint32_t expected = maxIntronLen;
    if(anchor < 14) {
        expected = 1 << ((anchor << 1) + 4);
        if(expected > maxIntronLen) {
            expected = maxIntronLen;
        }
    }
    assert_gt(expected, 0);
    const float ratio = ((float)intronLen) / ((float)expected);
    return ratio > 1.0f ? 1.0f : ratio;
}
/**
 * Allow longer introns for long anchored reads involving non-canonical splice sites.
 *
 * @param anchor               anchor length
 * @param minAnchorLen_noncan  minimum anchor length; shorter anchors get no intron
 * @return 0 when anchor < minAnchorLen_noncan; otherwise 2^s, where
 *         s = 2 * max(anchor, 5) - 10 capped at 30.
 */
inline uint32_t MaxIntronLen_noncan(uint32_t anchor, uint32_t minAnchorLen_noncan) {
    if(anchor < minAnchorLen_noncan) {
        return 0;
    }
    // Keep the subtraction below from wrapping around.
    if(anchor < 5) anchor = 5;
    uint32_t shift = (anchor << 1) - 10;
    // Explicitly qualified so this header does not depend on a
    // using-directive supplied by some other include.
    shift = std::min<uint32_t>(shift, 30);
    // Shift an unsigned one so the whole computation stays unsigned.
    return (uint32_t)1 << shift;
}
/**
 * Ratio of an intron length to an expected intron length, capped at 1.0
 * (non-canonical variant).
 *
 * The expected length is maxIntronLen, except that for anchor < 16 it is
 * min(2^(2 * anchor), maxIntronLen).
 */
inline float intronLen_prob_noncan(uint32_t anchor, uint32_t intronLen, uint32_t maxIntronLen) {
    uint32_t expected = maxIntronLen;
    if(anchor < 16) {
        expected = 1 << (anchor << 1);
        if(expected > maxIntronLen) {
            expected = maxIntronLen;
        }
    }
    assert_gt(expected, 0);
    const float ratio = ((float)intronLen) / ((float)expected);
    return ratio > 1.0f ? 1.0f : ratio;
}
/**
 * Hit types stored in BWTHit::_hit_type.
 * Three hit types to anchor a read on the genome; CANDIDATE_HIT is the
 * value a BWTHit takes on reset() and the default for BWTHit::init().
 */
enum {
    CANDIDATE_HIT  = 1,
    PSEUDOGENE_HIT = 2,
    ANCHOR_HIT     = 3
};
/**
 * A single partial alignment of a read.
 *
 * The location is kept as a half-open FM-index range [_top, _bot) together
 * with a second range [_node_top, _node_bot); genomic coordinates are filled
 * into _coords later, when needed.
 */
template <typename index_t>
struct BWTHit {

    BWTHit() { reset(); }

    /**
     * Put the hit back into its empty state: empty ranges, no coordinates,
     * forward strand, unset read offset and CANDIDATE_HIT type.
     */
    void reset() {
        _top = 0;
        _bot = 0;
        _node_top = 0;
        _node_bot = 0;
        _node_iedge_count.clear();
        _fw = true;
        _bwoff = (index_t)INDEX_MAX;
        _len = 0;
        _coords.clear();
        _anchor_examined = false;
        _hit_type = CANDIDATE_HIT;
    }

    /**
     * Fill in the hit from a finished index search.
     *
     * The node range must not be wider than the FM range; when it is
     * strictly narrower, node_iedge_count is expected to be non-empty
     * (checked in debug builds only). Previously stored coordinates are
     * discarded and the hit is marked as not yet examined.
     */
    void init(
              index_t top,
              index_t bot,
              index_t node_top,
              index_t node_bot,
              const EList<pair<index_t, index_t> >& node_iedge_count,
              bool fw,
              uint32_t bwoff,
              uint32_t len,
              index_t hit_type = CANDIDATE_HIT)
    {
        assert_leq(node_bot - node_top, bot - top);
#ifndef NDEBUG
        if(node_bot - node_top < bot - top) {
            assert_gt(node_iedge_count.size(), 0);
        }
#endif
        // index ranges
        _top = top;
        _bot = bot;
        _node_top = node_top;
        _node_bot = node_bot;
        _node_iedge_count = node_iedge_count;
        // placement on the read
        _fw = fw;
        _bwoff = bwoff;
        _len = len;
        // bookkeeping starts from scratch
        _coords.clear();
        _anchor_examined = false;
        _hit_type = hit_type;
    }

    /** True once at least one genomic coordinate has been stored. */
    bool hasGenomeCoords() const {
        return !_coords.empty();
    }

    /**
     * Return true iff there is no hit.
     */
    bool empty() const {
        return !(_top < _bot);
    }

    /**
     * Ordering used for prioritising hits: a hit with a larger _len
     * compares as "less", i.e. sorts first.
     */
    bool operator<(const BWTHit& o) const {
        return o._len < _len;
    }

    /**
     * Return the size of the alignment's SA range.
     */
    index_t size() const {
        assert_leq(_top, _bot);
        return _bot - _top;
    }

    /** Return _len; must be positive (checked in debug builds). */
    index_t len() const {
        assert_gt(_len, 0);
        return _len;
    }

#ifndef NDEBUG
    /**
     * Check that hit is sane w/r/t read. The read argument is currently
     * not inspected.
     */
    bool repOk(const Read& rd) const {
        assert_gt(_bot, _top);
        assert_neq(_bwoff, (index_t)INDEX_MAX);
        assert_gt(_len, 0);
        return true;
    }
#endif

    index_t _top;               // start of the range in the FM index
    index_t _bot;               // end of the range in the FM index
    index_t _node_top;          // start of the node range (never wider than [_top, _bot))
    index_t _node_bot;          // end of the node range
    EList<pair<index_t, index_t> > _node_iedge_count;
    bool    _fw;                // whether read is forward or reverse complemented
    index_t _bwoff;             // current base of a read to search from the right end
    index_t _len;               // length of this hit (original note: "read length") - TODO confirm
    EList<Coord> _coords;       // genomic offsets corresponding to [_top, _bot)
    bool    _anchor_examined;   // whether or not this hit is examined
    index_t _hit_type;          // hit type (anchor hit, pseudogene hit, or candidate hit)
};
/**
 * Simple struct for holding alignments for the read.
 * The alignments are represented by chains of BWTHits (_partialHits),
 * and _cur is the read offset reached by the search so far.
 */
template <typename index_t>
struct ReadBWTHit {

    ReadBWTHit() { reset(); }

    /** Return to the empty state (no read attached, no partial hits). */
    void reset() {
        _fw = true;
        _len = 0;
        _cur = 0;
        _done = false;
        _numPartialSearch = 0;
        _numUniqueSearch = 0;
        // Give _cur_local a defined value so that copying a freshly
        // constructed object never reads an indeterminate member.
        _cur_local = 0;
        _repeat = false;
        _partialHits.clear();
    }

    /**
     * Start a new search over a read of length len (must be > 0) on the
     * given strand. All counters and partial hits are cleared.
     */
    void init(
              bool fw,
              index_t len)
    {
        _fw = fw;
        assert_gt(len, 0);
        _len = len;
        _cur = 0;
        _done = false;
        _numPartialSearch = 0;
        _numUniqueSearch = 0;
        _repeat = false;
        _partialHits.clear();
    }

    /** Whether the search has been marked as finished. */
    bool done() const {
#ifndef NDEBUG
        assert_gt(_len, 0);
        if(_cur >= _len) {
            assert(_done);
        }
#endif
        return _done;
    }

    /** Mark the search as finished; may only be called once, with true. */
    void done(bool done) {
        assert(!_done);
        assert(done);
        _done = done;
    }

    index_t len() const { return _len; }
    index_t cur() const { return _cur; }
    bool repeat() const { return _repeat; }

    /** Number of partial hits collected so far. */
    index_t offsetSize() const { return (index_t)_partialHits.size(); }

    size_t numPartialSearch() const { return _numPartialSearch; }

    /** Partial searches performed, not counting the unique ones. */
    index_t numActualPartialSearch() const
    {
        assert_leq(_numUniqueSearch, _numPartialSearch);
        return (index_t)(_numPartialSearch - _numUniqueSearch);
    }

    /**
     * SA-range size of the partial hit at offset_.
     * Returns the real size; an earlier bool return type collapsed every
     * non-zero size to 1. Callers that only test for zero are unaffected.
     */
    index_t width(index_t offset_) const {
        assert_lt(offset_, _partialHits.size());
        return _partialHits[offset_].size();
    }

    /**
     * True if the partial hit at offset_ has genomic coordinates, or if
     * its range is empty (in which case there is nothing to resolve).
     */
    bool hasGenomeCoords(index_t offset_) const {
        assert_lt(offset_, _partialHits.size());
        if(width(offset_) == 0) {
            return true;
        }
        return _partialHits[offset_].hasGenomeCoords();
    }

    /**
     * True only if the whole read has been consumed, at least one partial
     * hit exists, and every partial hit has genomic coordinates.
     */
    bool hasAllGenomeCoords() const {
        if(_cur < _len) return false;
        if(_partialHits.size() <= 0) return false;
        for(size_t oi = 0; oi < _partialHits.size(); oi++) {
            if(!_partialHits[oi].hasGenomeCoords())
                return false;
        }
        return true;
    }

    /**
     * Find the non-empty partial hit with the smallest SA range; ties are
     * broken in favour of the longer hit. Its index is written to offset.
     * Returns that range size, or INDEX_MAX (offset untouched) if every
     * partial hit is empty.
     */
    index_t minWidth(index_t& offset) const {
        index_t minWidth_ = (index_t)INDEX_MAX;
        index_t minWidthLen_ = 0;
        for(size_t oi = 0; oi < _partialHits.size(); oi++) {
            const BWTHit<index_t>& hit = _partialHits[oi];
            if(hit.empty()) continue;
            // if(!hit.hasGenomeCoords()) continue;
            assert_gt(hit.size(), 0);
            if((minWidth_ > hit.size()) ||
               (minWidth_ == hit.size() && minWidthLen_ < hit.len())) {
                minWidth_ = hit.size();
                minWidthLen_ = hit.len();
                offset = (index_t)oi;
            }
        }
        return minWidth_;
    }

    /**
     * Search score: the sum of squared partial-hit lengths, minus
     * minK^2 per actual partial search, minus 4^(actual partial searches).
     *
     * All arithmetic is done in 64 bits. The exponential term used to be
     * computed as an int shift, which is undefined once the shift count
     * reaches the width of int (16 or more actual partial searches); the
     * shift count is now capped at 62.
     */
    int64_t searchScore(index_t minK) const {
        int64_t score = 0;
        const int64_t penaltyPerOffset = (int64_t)minK * (int64_t)minK;
        for(size_t i = 0; i < _partialHits.size(); i++) {
            const int64_t len = (int64_t)_partialHits[i]._len;
            score += len * len;
        }
        assert_geq(_numPartialSearch, _partialHits.size());
        const index_t actualPartialSearch = numActualPartialSearch();
        score -= (int64_t)actualPartialSearch * penaltyPerOffset;
        const index_t maxShift = 62;
        const index_t shift =
            (actualPartialSearch < maxShift / 2) ? (index_t)(actualPartialSearch << 1) : maxShift;
        score -= ((int64_t)1 << shift);
        return score;
    }

    BWTHit<index_t>& getPartialHit(index_t offset_) {
        assert_lt(offset_, _partialHits.size());
        return _partialHits[offset_];
    }

    /**
     * If the last partial hit is shorter than minK + 3, drop it and move
     * _cur back so the search restarts near where that hit began.
     * Returns true if the hit was dropped, false if it was kept.
     */
    bool adjustOffset(index_t minK) {
        assert_gt(_partialHits.size(), 0);
        // Read the length before the hit is popped.
        const index_t hitLen = _partialHits.back().len();
        if(hitLen >= minK + 3) {
            return false;
        }
        assert_geq(_cur, hitLen);
        const index_t origCur = _cur - hitLen;
        _cur = origCur + std::max<index_t>(hitLen, minK + 1) - minK;
        _partialHits.pop_back();
        return true;
    }

    void setOffset(index_t offset) {
        assert_lt(offset, _len);
        _cur = offset;
    }

#ifndef NDEBUG
    /**
     * Check that the partial hits do not overlap on the read and that the
     * last one ends exactly at _cur.
     */
    bool repOk() const {
        for(size_t i = 0; i < _partialHits.size(); i++) {
            if(i == 0) {
                assert_geq(_partialHits[i]._bwoff, 0);
            }
            if(i + 1 < _partialHits.size()) {
                assert_leq(_partialHits[i]._bwoff + _partialHits[i]._len, _partialHits[i+1]._bwoff);
            } else {
                assert_eq(i+1, _partialHits.size());
                assert_eq(_partialHits[i]._bwoff + _partialHits[i]._len, _cur);
            }
        }
        return true;
    }
#endif

    bool     _fw;                // strand this search runs on
    index_t  _len;               // read length
    index_t  _cur;               // read offset reached so far
    bool     _done;              // search finished?
    index_t  _numPartialSearch;  // partial searches performed
    index_t  _numUniqueSearch;   // of those, the ones counted as unique
    index_t  _cur_local;         // not used by any method of this struct; zeroed by reset()
    bool     _repeat;
    EList<BWTHit<index_t> > _partialHits;
};
/**
* this is per-thread data, which are shared by GenomeHit classes
* the main purpose of this struct is to avoid extensive use of memory related functions
* such as new and delete - those are really slow and lock based
*
* Every GenomeHit keeps a pointer to one instance of this struct and draws
* its edit list and haplotype list from raw_edits / raw_ht_lists.
*/
template <typename index_t>
struct SharedTempVars {
// Scratch buffers; their users are outside the part of the file shown here.
SStringExpandable<char> raw_refbuf;
SStringExpandable<char> raw_refbuf2;
EList<int64_t> temp_scores;
EList<int64_t> temp_scores2;
// Align with alternatives
EList<pair<index_t, int> > ssOffs;
EList<pair<index_t, int> > offDiffs;
// Passed to alignWithALTs_recur() by GenomeHit::alignWithALTs()
EList<SStringExpandable<char> > raw_refbufs;
// Working copy of the caller's edits inside GenomeHit::alignWithALTs()
EList<Edit> alt_edits;
ELList<Edit, 128, 4> candidate_edits;
ELList<pair<index_t, index_t> > ht_llist;
Haplotype<index_t> cmp_ht;
// Members wrapped in ASSERT_ONLY exist only in assertion-enabled builds
// (presumably; ASSERT_ONLY is defined elsewhere - confirm).
ASSERT_ONLY(SStringExpandable<uint32_t> destU32);
ASSERT_ONLY(BTDnaString editstr);
ASSERT_ONLY(BTDnaString partialseq);
ASSERT_ONLY(BTDnaString refstr);
ASSERT_ONLY(EList<index_t> reflens);
ASSERT_ONLY(EList<index_t> refoffs);
// Node stores from which GenomeHit::init() takes, and ~GenomeHit() returns,
// the edit list and the haplotype list of each hit.
LinkedEList<EList<Edit> > raw_edits;
LinkedEList<EList<pair<index_t, index_t> > > raw_ht_lists;
};
/**
* GenomeHit represents read alignment or alignment of a part of a read
* Two GenomeHits that represents alignments of different parts of a read
* can be combined together. Also, GenomeHit can be extended in both directions.
*/
template <typename index_t>
struct GenomeHit {
/**
 * Construct an uninitialised hit: offsets and lengths are INDEX_MAX,
 * scores are MIN_I64, and no edit / haplotype lists are attached.
 * inited() is false until init() is called.
 */
GenomeHit() :
    _fw(false),
    _rdoff((index_t)INDEX_MAX),
    _len((index_t)INDEX_MAX),
    _trim5(0),
    _trim3(0),
    _tidx((index_t)INDEX_MAX),
    _toff((index_t)INDEX_MAX),
    _joinedOff((index_t)INDEX_MAX),
    _repeat(false),
    _edits(NULL),
    _ht_list(NULL),
    _score(MIN_I64),
    _localscore(MIN_I64),
    _splicescore(0.0),   // was left uninitialised; 0.0 matches init()'s default
    _hitcount(1),
    _edits_node(NULL),
    _ht_list_node(NULL),
    _sharedVars(NULL)
{
}
GenomeHit(const GenomeHit& otherHit) :
_edits(NULL),
_ht_list(NULL),
_hitcount(1),
_edits_node(NULL),
_ht_list_node(NULL),
_sharedVars(NULL)
{
init(otherHit._fw,
otherHit._rdoff,
otherHit._len,
otherHit._trim5,
otherHit._trim3,
otherHit._tidx,
otherHit._toff,
otherHit._joinedOff,
*(otherHit._sharedVars),
otherHit._repeat,
otherHit._edits,
otherHit._ht_list,
otherHit._score,
otherHit._localscore,
otherHit._splicescore);
}
/**
 * Assign from an initialised hit.
 *
 * Self-assignment is a no-op. Otherwise every field is re-initialised
 * through init(), which copies the source's edit and haplotype lists into
 * this hit's own lists and sets the hit count to 1.
 * otherHit._sharedVars is dereferenced unconditionally, so the source
 * must have been init()-ed.
 */
GenomeHit<index_t>& operator=(const GenomeHit<index_t>& otherHit) {
    if(&otherHit != this) {
        SharedTempVars<index_t>& shared = *(otherHit._sharedVars);
        init(otherHit._fw,
             otherHit._rdoff,
             otherHit._len,
             otherHit._trim5,
             otherHit._trim3,
             otherHit._tidx,
             otherHit._toff,
             otherHit._joinedOff,
             shared,
             otherHit._repeat,
             otherHit._edits,
             otherHit._ht_list,
             otherHit._score,
             otherHit._localscore,
             otherHit._splicescore);
    }
    return *this;
}
/**
 * Give the edit-list node and the haplotype-list node (if any) back to
 * the SharedTempVars they were taken from, then detach from it.
 */
~GenomeHit() {
    const bool ownsEdits = (_edits_node != NULL);
    if(ownsEdits) {
        assert(_edits != NULL);
        assert(_sharedVars != NULL);
        _sharedVars->raw_edits.delete_node(_edits_node);
        _edits_node = NULL;
        _edits = NULL;
    }
    const bool ownsHtList = (_ht_list_node != NULL);
    if(ownsHtList) {
        assert(_ht_list != NULL);
        assert(_sharedVars != NULL);
        _sharedVars->raw_ht_lists.delete_node(_ht_list_node);
        _ht_list_node = NULL;
        _ht_list = NULL;
    }
    _sharedVars = NULL;
}
/**
 * (Re-)initialise the hit.
 *
 * Scalar fields are taken from the arguments. The edit list and the
 * haplotype list are allocated from sharedVars on first use, then cleared
 * and, if the corresponding argument is non-NULL, overwritten with a copy
 * of it. The hit count is reset to 1. A hit may only ever be bound to one
 * SharedTempVars instance (asserted in debug builds).
 */
void init(
          bool fw,
          index_t rdoff,
          index_t len,
          index_t trim5,
          index_t trim3,
          index_t tidx,
          index_t toff,
          index_t joinedOff,
          SharedTempVars<index_t>& sharedVars,
          bool repeat = false,
          EList<Edit>* edits = NULL,
          EList<pair<index_t, index_t> >* ht_list = NULL,
          int64_t score = 0,
          int64_t localscore = 0,
          double splicescore = 0.0)
{
    // placement on the read
    _fw = fw;
    _rdoff = rdoff;
    _len = len;
    _trim5 = trim5;
    _trim3 = trim3;

    // placement on the reference
    _tidx = tidx;
    _toff = toff;
    _joinedOff = joinedOff;
    _repeat = repeat;

    // scores
    _score = score;
    _localscore = localscore;
    _splicescore = splicescore;

    assert(_sharedVars == NULL || _sharedVars == &sharedVars);
    _sharedVars = &sharedVars;

    // edit list: allocate on first use, then replace its contents
    if(_edits == NULL) {
        assert(_edits_node == NULL);
        _edits_node = _sharedVars->raw_edits.new_node();
        assert(_edits_node != NULL);
        _edits = &(_edits_node->payload);
    }
    assert(_edits != NULL);
    _edits->clear();
    if(edits != NULL) {
        *_edits = *edits;
    }

    // haplotype list: same scheme
    if(_ht_list == NULL) {
        assert(_ht_list_node == NULL);
        _ht_list_node = _sharedVars->raw_ht_lists.new_node();
        assert(_ht_list_node != NULL);
        _ht_list = &(_ht_list_node->payload);
    }
    assert(_ht_list != NULL);
    _ht_list->clear();
    if(ht_list != NULL) {
        *_ht_list = *ht_list;
    }

    _hitcount = 1;
}
bool inited() const {
return _len >= 0 && _len < (index_t)INDEX_MAX;
}
/**
* Check if it is compatible with another GenomeHit with respect to indels or introns
*
* Returns false when otherHit is this very object, lies on another strand
* or contig, or does not follow this hit on both the read and the
* reference. Unless no_spliced_alignment is set, the reference gap may
* exceed the read gap by at most maxIntronLen.
*/
bool compatibleWith(
const GenomeHit<index_t>& otherHit,
index_t minIntronLen,
index_t maxIntronLen,
bool no_spliced_alignment = false) const;
/**
* Combine itself with another GenomeHit
* while allowing mismatches, an insertion, a deletion, or an intron.
* Returns false if the two hits cannot be combined.
*/
bool combineWith(
const GenomeHit& otherHit,
const Read& rd,
const GFM<index_t>& gfm,
const BitPairReference& ref,
const ALTDB<index_t>& altdb,
const RepeatDB<index_t>& repeatdb,
SpliceSiteDB& ssdb,
SwAligner& swa,
SwMetrics& swm,
const Scoring& sc,
TAlScore minsc,
RandomSource& rnd, // pseudo-random source
index_t minK_local,
index_t minIntronLen,
index_t maxIntronLen,
index_t minAnchorLen, // minimum anchor length for canonical splice site
index_t minAnchorLen_noncan, // minimum anchor length for non-canonical splice site
const index_t maxAltsTried,
const SpliceSite* spliceSite = NULL, // penalty for splice site
bool no_spliced_alignment = false);
/**
* Extend the partial alignment (GenomeHit) bidirectionally
*
* NOTE(review): leftext and rightext are passed by reference and presumably
* carry the left / right extension lengths - confirm against the definition.
*/
bool extend(
const Read& rd,
const GFM<index_t>& gfm,
const BitPairReference& ref,
const ALTDB<index_t>& altdb,
const RepeatDB<index_t>& repeatdb,
SpliceSiteDB& ssdb,
SwAligner& swa,
SwMetrics& swm,
PerReadMetrics& prm,
const Scoring& sc,
TAlScore minsc,
RandomSource& rnd, // pseudo-random source
index_t minK_local,
const TranscriptomePolicy& tpol,
const GraphPolicy& gpol,
index_t& leftext,
index_t& rightext,
index_t mm = 0);
/**
* Adjust alignment with respect to SNPs, usually updating Edits
*
* Static overload: works from a read interval (rdoff, len) and a coordinate,
* and takes the list genomeHits by non-const reference (presumably the
* resulting hits are appended there - confirm against the definition).
*/
static bool adjustWithALT(
index_t rdoff,
index_t len,
const Coord& coord,
SharedTempVars<index_t>& sharedVars,
EList<GenomeHit<index_t> >& genomeHits,
const Read& rd,
const GFM<index_t>& gfm,
const ALTDB<index_t>& altdb,
const BitPairReference& ref,
const GraphPolicy& gpol);
/**
* Adjust alignment with respect to SNPs, usually updating Edits
*
* Member overload operating on this hit.
*/
bool adjustWithALT(
const Read& rd,
const GFM<index_t>& gfm,
const ALTDB<index_t>& altdb,
const BitPairReference& ref,
const GraphPolicy& gpol);
/*
* NOTE(review): undocumented in the original. Judging by the name and the
* ssOffs out-parameter it collects splice-site offsets within [start, end);
* confirm against the definition.
*/
static void findSSOffs(
const GFM<index_t>& gfm,
const ALTDB<index_t>& altdb,
index_t start,
index_t end,
EList<pair<index_t, int> >& ssOffs);
/*
* Find offset differences due to deletions
*/
static index_t findOffDiffs(
const GFM<index_t>& gfm,
const ALTDB<index_t>& altdb,
index_t start,
index_t end,
EList<pair<index_t, int> >& offDiffs);
/*
* Entry point for aligning a stretch of the read while considering ALTs.
*
* Sets up the shared scratch state, delegates the search to
* alignWithALTs_recur(), and converts the best read offset it reports into
* an extension length, which is returned.
*
* - left selects the direction: for left == true the extension length is
*   rdoff - best_rdoff, otherwise best_rdoff - rdoff.
* - edits is both input and output; sharedVar.alt_edits is seeded with a copy
*   of it as the working list.
* - candidate_edits and ht_llist are cleared before the search; *numNs is
*   reset to 0 when numNs is non-NULL.
* - ht_list is not used by this function (only in the commented-out lines).
*
* After the search the extension is rejected (length forced to 0) if the
* first edit is a gap, a splice or an 'N' mismatch at the position tested
* below, or if the last edit is a gap at the position tested below. When
* the extension is rejected, edits added during this call are removed again.
*/
static index_t alignWithALTs(
const EList<ALT<index_t> >& alts,
const EList<Haplotype<index_t> >& haplotypes,
const EList<index_t>& haplotype_maxrights,
index_t joinedOff,
const BTDnaString& rdseq,
index_t base_rdoff,
index_t rdoff,
index_t rdlen,
const BitPairReference& ref,
SharedTempVars<index_t>& sharedVar,
index_t tidx,
int rfoff,
index_t rflen,
bool left,
const GraphPolicy& gpol,
EList<Edit>& edits,
ELList<pair<index_t, index_t> >& ht_llist,
EList<pair<index_t, index_t> >& ht_list,
Haplotype<index_t>& cmp_ht,
int cycle_3N,
ELList<Edit, 128, 4>* candidate_edits = NULL,
index_t mm = 0,
index_t* numNs = NULL)
{
// best_rdoff is updated by the recursive search
int best_rdoff = (int)rdoff;
if(numNs != NULL) *numNs = 0;
index_t numALTsTried = 0;
// working copy of the edits, kept in per-thread scratch space
EList<Edit>& alt_edits = sharedVar.alt_edits;
alt_edits = edits;
// remember how many edits the caller had, so additions can be undone
index_t nedits = (index_t)edits.size();
if(candidate_edits != NULL) candidate_edits->clear();
ht_llist.clear();
// ht_llist.expand();
// ht_llist[0] = ht_list;
alignWithALTs_recur(
alts,
haplotypes,
haplotype_maxrights,
joinedOff,
rdseq,
rdoff - base_rdoff,
rdoff,
rdlen,
ref,
sharedVar.raw_refbufs,
ASSERT_ONLY(sharedVar.destU32,)
alt_edits,
best_rdoff,
NULL, /* rfseq */
tidx,
rfoff,
rflen,
left,
edits,
mm,
ht_llist,
cmp_ht,
candidate_edits,
0, /* tmp_numNs */
numNs,
0, /* dep */
gpol,
numALTsTried,
cycle_3N);
// turn the best read offset into an extension length
index_t extlen = 0;
if(left) {
assert_geq(best_rdoff, -1);
assert_leq(best_rdoff, (int)rdoff);
extlen = rdoff - best_rdoff;
} else {
assert_leq(best_rdoff, (int)(rdoff + rdlen));
assert_geq(best_rdoff, (int)rdoff);
extlen = best_rdoff - rdoff;
}
if(extlen > 0 && edits.size() > 0) {
// reject the extension if the first edit at the tested position is a
// gap, a splice, or a mismatch against 'N'
const Edit& f = edits.front();
if(f.pos + extlen == base_rdoff + 1) {
if(f.type == EDIT_TYPE_READ_GAP ||
f.type == EDIT_TYPE_REF_GAP ||
f.type == EDIT_TYPE_SPL) {
extlen = 0;
}
if(f.type == EDIT_TYPE_MM && f.chr == 'N') {
extlen = 0;
}
}
// likewise reject it if the last edit at the tested position is a gap
const Edit& b = edits.back();
if(extlen > 0 && b.pos == rdoff - base_rdoff + extlen - 1) {
if(b.type == EDIT_TYPE_READ_GAP ||
b.type == EDIT_TYPE_REF_GAP) {
extlen = 0;
}
}
// a rejected extension must not leave its edits behind: drop the ones
// added at the front (left) or truncate back to the old size (right)
if(extlen == 0 && edits.size() > nedits) {
if(left) {
edits.erase(0, edits.size() - nedits);
} else {
edits.resize(nedits);
}
}
}
return extlen;
}
/*
* Recursive worker behind alignWithALTs(); defined outside this part of the
* file.
*
* As used by alignWithALTs(): best_rdoff is updated with the best read
* offset found, numALTsTried is a running counter owned by the caller, dep
* starts at 0 and tmp_numNs at 0, and rfseq may be NULL.
*
* NOTE(review): in assertion-enabled builds destU32 is taken by value, so
* the caller's buffer is copied on every call - confirm this is intended.
*/
static index_t alignWithALTs_recur(
const EList<ALT<index_t> >& alts,
const EList<Haplotype<index_t> >& haplotypes,
const EList<index_t>& haplotype_maxrights,
index_t joinedOff,
const BTDnaString& rdseq,
index_t rdoff_add,
index_t rdoff,
index_t rdlen,
const BitPairReference& ref,
EList<SStringExpandable<char> >& raw_refbufs,
ASSERT_ONLY(SStringExpandable<uint32_t> destU32,)
EList<Edit>& tmp_edits,
int& best_rdoff,
const char* rfseq,
index_t tidx,
int rfoff,
index_t rflen,
bool left,
EList<Edit>& edits,
index_t mm,
ELList<pair<index_t, index_t> >& ht_llist,
Haplotype<index_t>& cmp_ht,
ELList<Edit, 128, 4>* candidate_edits,
index_t tmp_numNs,
index_t* numNs,
index_t dep,
const GraphPolicy& gpol,
index_t& numALTsTried,
int cycle_3N,
ALT_TYPE prev_alt_type = ALT_NONE);
/**
* For alignment involving indel, move the indels
* to the left most possible position
*/
void leftAlign(const Read& rd);
// Plain read accessors for the corresponding fields.
index_t rdoff() const { return _rdoff; } // offset of the hit on the read
index_t len() const { return _len; } // length of the hit
index_t trim5() const { return _trim5; } // value recorded by the trim5() setter
index_t trim3() const { return _trim3; } // value recorded by the trim3() setter
/**
* Record a trim5 amount and recompute the score.
*
* Debug builds assert that the hit starts exactly at read offset trim5 and
* that no trim5 amount has been recorded before. All remaining parameters
* are forwarded unchanged to calculateScore().
*/
void trim5(index_t trim5,
const Read& rd,
SpliceSiteDB& ssdb,
const Scoring& sc,
index_t minK_local,
index_t minIntronLen,
index_t maxIntronLen,
index_t minAnchorLen,
index_t minAnchorLen_noncan,
const BitPairReference& ref)
{
assert_eq(_rdoff, trim5);
assert_eq(_trim5, 0);
_trim5 = trim5;
// the stored score depends on the trim, so refresh it
calculateScore(rd,
ssdb,
sc,
minK_local,
minIntronLen,
maxIntronLen,
minAnchorLen,
minAnchorLen_noncan,
ref);
}
/**
* Record a trim3 amount and recompute the score.
*
* Unlike the trim5() setter there are no precondition asserts here. All
* remaining parameters are forwarded unchanged to calculateScore().
*/
void trim3(index_t trim3,
const Read& rd,
SpliceSiteDB& ssdb,
const Scoring& sc,
index_t minK_local,
index_t minIntronLen,
index_t maxIntronLen,
index_t minAnchorLen,
index_t minAnchorLen_noncan,
const BitPairReference& ref)
{
_trim3 = trim3;
// the stored score depends on the trim, so refresh it
calculateScore(rd,
ssdb,
sc,
minK_local,
minIntronLen,
maxIntronLen,
minAnchorLen,
minAnchorLen_noncan,
ref);
}
// for repeat alignments
// reverse fw
void reverse(const Read& rd)
{
_fw = !_fw;
index_t end = _trim5 + _rdoff + _len;
assert_leq(end, rd.length());
_rdoff = rd.length() - end;
index_t tmp_trim = _trim5;
_trim5 = _trim3;
_trim3 = tmp_trim;
Edit::invertPoss(*_edits, rd.length());
// complements
Edit::complement(*_edits);
}
index_t ref() const { return _tidx; }
index_t refoff() const { return _toff; }
index_t fw() const { return _fw; }
bool repeat() const { return _repeat; }
void repeat(bool repeat) { _repeat = repeat;}
index_t hitcount() const { return _hitcount; }
/**
* Leftmost coordinate
* (reference index, reference offset and strand of this hit).
*/
Coord coord() const {
return Coord(_tidx, _toff, _fw);
}
// Plain read accessors for the stored scores and the edit list.
int64_t score() const { return _score; }
int64_t localscore() const { return _localscore; }
double splicescore() const { return _splicescore; }
// _edits must be non-NULL, i.e. the hit must have been init()-ed.
const EList<Edit>& edits() const { return *_edits; }
/**
 * Retrieve the partial alignment from the left until indel or intron.
 *
 * rdoff and toff receive the hit's own start; len receives the position of
 * the first splice, gap, or mismatch that carries a SNP id, or the full
 * length if there is none. When score is non-NULL (rd and sc are then
 * required), it receives the summed penalty of the plain mismatches that
 * precede that stopping edit.
 */
void getLeft(index_t& rdoff,
             index_t& len,
             index_t& toff,
             int64_t* score = NULL,
             const Read* rd = NULL,
             const Scoring* sc = NULL) const
{
    assert(inited());
    rdoff = _rdoff;
    toff  = _toff;
    len   = _len;

    const bool wantScore = (score != NULL);
    const BTString* qual = NULL;
    if(wantScore) {
        assert(rd != NULL);
        assert(sc != NULL);
        *score = 0;
        qual = &(_fw ? rd->qual : rd->qualRev);
    }

    const index_t nedits = (index_t)_edits->size();
    for(index_t ei = 0; ei < nedits; ei++) {
        const Edit& e = (*_edits)[ei];
        const bool snpMismatch =
            (e.type == EDIT_TYPE_MM && e.snpID != (index_t)INDEX_MAX);
        if(e.type == EDIT_TYPE_SPL ||
           e.type == EDIT_TYPE_READ_GAP ||
           e.type == EDIT_TYPE_REF_GAP ||
           snpMismatch) {
            // the left part ends right before this edit
            len = e.pos;
            break;
        }
        if(!wantScore) continue;
        if(e.type != EDIT_TYPE_MM) continue;
        assert(qual != NULL);
        if(e.snpID == (index_t)INDEX_MAX) {
            *score += sc->score(
                                dna2col[e.qchr] - '0',
                                asc2dnamask[e.chr],
                                (*qual)[this->_rdoff + e.pos] - 33);
        }
    }
    assert_geq(len, 0);
}
/**
 * Retrieve the partial alignment from the right until indel or intron.
 *
 * Edits are scanned from last to first. At the first splice, gap, or
 * mismatch that carries a SNP id, rdoff / len / toff are set to the part of
 * the hit to the right of that edit; for a reference gap or a mismatch the
 * edited read base itself is excluded. With no such edit the whole hit is
 * returned. When score is non-NULL (rd and sc are then required), it
 * receives the summed penalty of the plain mismatches scanned before
 * stopping.
 */
void getRight(index_t& rdoff,
              index_t& len,
              index_t& toff,
              int64_t* score = NULL,
              const Read* rd = NULL,
              const Scoring* sc = NULL) const
{
    assert(inited());
    rdoff = _rdoff;
    toff  = _toff;
    len   = _len;

    const bool wantScore = (score != NULL);
    const BTString* qual = NULL;
    if(wantScore) {
        assert(rd != NULL);
        assert(sc != NULL);
        *score = 0;
        qual = &(_fw ? rd->qual : rd->qualRev);
    }

    if(_edits->size() == 0) return;

    for(size_t k = _edits->size(); k-- > 0; ) {
        const Edit& e = (*_edits)[k];
        const bool snpMismatch =
            (e.type == EDIT_TYPE_MM && e.snpID != (index_t)INDEX_MAX);
        if(e.type == EDIT_TYPE_SPL ||
           e.type == EDIT_TYPE_READ_GAP ||
           e.type == EDIT_TYPE_REF_GAP ||
           snpMismatch) {
            rdoff = _rdoff + e.pos;
            assert_lt(e.pos, _len);
            len = _len - e.pos;
            const bool isRefGap   = (e.type == EDIT_TYPE_REF_GAP);
            const bool isMismatch = (e.type == EDIT_TYPE_MM);
            if(isRefGap) {
                assert_lt(e.pos + 1, _len);
                assert_gt(len, 1);
            } else if(isMismatch) {
                assert_leq(e.pos + 1, _len);
                assert_geq(len, 1);
            }
            if(isRefGap || isMismatch) {
                // step over the read base consumed by the edit itself
                rdoff++;
                len--;
            }
            toff = getRightOff() - len;
            break;
        }
        if(!wantScore) continue;
        if(e.type != EDIT_TYPE_MM) continue;
        assert(qual != NULL);
        if(e.snpID == (index_t)INDEX_MAX) {
            *score += sc->score(
                                dna2col[e.qchr] - '0',
                                asc2dnamask[e.chr],
                                (*qual)[this->_rdoff + e.pos] - 33);
        }
    }
    assert_geq(len, 0);
}
/**
 * Retrieve the genomic offset of the right end.
 *
 * Starts from _toff + _len and corrects for every edit: a splice adds its
 * length, a read gap adds one reference base, a reference gap removes one.
 */
index_t getRightOff() const {
    assert(inited());
    index_t rightOff = _toff + _len;
    const index_t nedits = (index_t)_edits->size();
    for(index_t ei = 0; ei < nedits; ei++) {
        const Edit& e = (*_edits)[ei];
        if(e.type == EDIT_TYPE_SPL) {
            rightOff += e.splLen;
            continue;
        }
        if(e.type == EDIT_TYPE_READ_GAP) {
            rightOff++;
            continue;
        }
        if(e.type == EDIT_TYPE_REF_GAP) {
            assert_gt(rightOff, 0);
            rightOff--;
        }
    }
    return rightOff;
}
/**
 * Retrieve left anchor length and number of edits in the anchor.
 *
 * The anchor runs from the left end of the hit to the first splice (or to
 * the end of the hit if there is none); nedits counts the mismatches and
 * gaps seen before that splice.
 */
void getLeftAnchor(index_t& leftanchor,
                   index_t& nedits) const
{
    assert(inited());
    nedits = 0;
    leftanchor = _len;
    const index_t total = (index_t)_edits->size();
    for(index_t ei = 0; ei < total; ei++) {
        const Edit& e = (*_edits)[ei];
        if(e.type == EDIT_TYPE_SPL) {
            leftanchor = e.pos;
            return;
        }
        const bool countsAsEdit = (e.type == EDIT_TYPE_MM ||
                                   e.type == EDIT_TYPE_READ_GAP ||
                                   e.type == EDIT_TYPE_REF_GAP);
        if(countsAsEdit) {
            nedits++;
        }
    }
}
/**
 * Retrieve right anchor length and number of edits in the anchor.
 *
 * Edits are scanned from last to first. The anchor length is
 * _len - pos - 1 for the last splice (or _len if there is none); nedits
 * counts the mismatches and gaps seen after that splice.
 */
void getRightAnchor(index_t& rightanchor,
                    index_t& nedits) const
{
    nedits = 0;
    rightanchor = _len;
    for(size_t k = _edits->size(); k-- > 0; ) {
        const Edit& e = (*_edits)[k];
        if(e.type == EDIT_TYPE_SPL) {
            rightanchor = _len - e.pos - 1;
            return;
        }
        const bool countsAsEdit = (e.type == EDIT_TYPE_MM ||
                                   e.type == EDIT_TYPE_READ_GAP ||
                                   e.type == EDIT_TYPE_REF_GAP);
        if(countsAsEdit) {
            nedits++;
        }
    }
}
/**
 * Is it spliced alignment?
 * Return: first is true if the hit has at least one splice edit; second is
 * true if, in addition, every splice edit is flagged knownSpl
 * (spliced-alignment to known transcripts).
 */
pair<bool, bool> spliced() const {
    pair<bool, bool> status(false, true);
    const index_t nedits = (index_t)_edits->size();
    for(index_t ei = 0; ei < nedits; ei++) {
        const Edit& e = (*_edits)[ei];
        if(e.type != EDIT_TYPE_SPL) continue;
        status.first = true;
        status.second &= e.knownSpl;
    }
    // "all known" is meaningless without at least one splice
    status.second &= status.first;
    return status;
}
/**
*
*/
bool spliced_consistently() const {
int splDir = SPL_UNKNOWN;
for(index_t i = 0; i < _edits->size(); i++) {
const Edit& edit = (*_edits)[i];
if(edit.type != EDIT_TYPE_SPL) continue;
if(splDir != SPL_UNKNOWN) {
if(edit.splDir != SPL_UNKNOWN) {
if(splDir == SPL_FW || splDir == SPL_SEMI_FW) {
if(edit.splDir != SPL_FW && edit.splDir != SPL_SEMI_FW)
return false;
}
if(splDir == SPL_RC || splDir == SPL_SEMI_RC) {
if(edit.splDir != SPL_RC && edit.splDir != SPL_SEMI_RC)
return false;
}
}
} else {
splDir = edit.splDir;
}
}
return true;
}
/**
* return one of EDIT_SPL_FW, EDIT_SPL_RC, EDIT_SPL_UNKNOWN
*/
int splicing_dir() const {
int splDir = SPL_UNKNOWN;
for(index_t i = 0; i < _edits->size(); i++) {
const Edit& edit = (*_edits)[i];
if(edit.type != EDIT_TYPE_SPL) continue;
if(splDir != SPL_UNKNOWN) {
if(edit.splDir != SPL_UNKNOWN) {
if(splDir == SPL_FW || splDir == SPL_SEMI_FW) {
if(edit.splDir != SPL_FW && edit.splDir != SPL_SEMI_FW)
return SPL_UNKNOWN;
}
if(splDir == SPL_RC || splDir == SPL_SEMI_RC) {
if(edit.splDir != SPL_RC && edit.splDir != SPL_SEMI_RC)
return SPL_UNKNOWN;
}
}
} else {
splDir = edit.splDir;
}
}
if(splDir == SPL_FW || splDir == SPL_SEMI_FW)
return SPL_FW;
else if(splDir == SPL_RC || splDir == SPL_SEMI_RC)
return SPL_RC;
else
return SPL_UNKNOWN;
}
/**
 * Two hits are equal when strand, read offset, length, reference index,
 * reference offset and both trims match and their edit lists have the same
 * size and agree element by element. Gap edits only need to be gaps of the
 * same kind; every other edit must compare equal. Scores are deliberately
 * not compared.
 */
bool operator== (const GenomeHit<index_t>& other) const {
    const bool samePlacement =
        _fw    == other._fw    &&
        _rdoff == other._rdoff &&
        _len   == other._len   &&
        _tidx  == other._tidx  &&
        _toff  == other._toff  &&
        _trim5 == other._trim5 &&
        _trim3 == other._trim3;
    if(!samePlacement) return false;

    if(_edits->size() != other._edits->size()) return false;
    const index_t nedits = (index_t)_edits->size();
    for(index_t ei = 0; ei < nedits; ei++) {
        const Edit& mine   = (*_edits)[ei];
        const Edit& theirs = (*other._edits)[ei];
        if(mine.isReadGap()) {
            if(!theirs.isReadGap()) return false;
        } else if(mine.isRefGap()) {
            if(!theirs.isRefGap()) return false;
        } else if(!(mine == theirs)) {
            return false;
        }
    }
    // daehwan - this may not be true when some splice sites are provided from outside
    // assert_eq(_score, other._score);
    return true;
}
// Currently identical to operator==: true only if the two hits compare equal.
bool contains(const GenomeHit<index_t>& other) const {
return (*this) == other;
}
/**
 * Return number of mismatches in the alignment.
 * Placeholder: always returns 0 (the former implementation was already
 * compiled out).
 */
int mms() const {
    return 0;
}
/**
 * Return the number of Ns involved in the alignment.
 * Placeholder: always returns 0 (the former implementation was already
 * compiled out).
 */
int ns() const {
    return 0;
}
// Number of gaps in the alignment. Placeholder: always returns 0.
int ngaps() const {
return 0;
}
#ifndef NDEBUG
/**
* Check that hit is sane w/r/t read.
* Debug builds only; defined outside this part of the file.
*/
bool repOk(const Read& rd, const BitPairReference& ref);
#endif
/**
* Tag edits that coincide with known ALTs, then recompute the score.
*
* Every edit that has no SNP id yet (snpID == INDEX_MAX) is looked up in
* alts by joined reference position. On a match the index of the ALT is
* stored as the edit's snpID:
*  - a mismatch matches a single-base SNP whose base equals the read base;
*  - a run of read gaps matches a deletion of the same length;
*  - a run of reference gaps matches an insertion of the same length and
*    the same 2-bit packed sequence.
* Does nothing if alts or the edit list is empty. Splice edits are not
* expected here (assert in debug builds).
*/
void replace_edits_with_alts(const Read& rd,
const EList<ALT<index_t> >& alts,
SpliceSiteDB& ssdb,
const Scoring& sc,
index_t minK_local,
index_t minIntronLen,
index_t maxIntronLen,
index_t minAnchorLen,
index_t minAnchorLen_noncan,
const BitPairReference& ref) {
assert(inited());
if(alts.size() <= 0)
return;
if(_edits->size() <= 0)
return;
index_t joinedOff = _joinedOff;
// running difference between reference and read positions caused by the
// gaps (and splices) already passed
int offset = 0;
size_t i = 0, next_i;
while(i < _edits->size()) {
next_i = i + 1;
Edit& ed = (*_edits)[i];
if(ed.type == EDIT_TYPE_SPL) {
assert(false);
} else if(ed.type == EDIT_TYPE_READ_GAP || ed.type == EDIT_TYPE_REF_GAP) {
// treat the following edits of the same gap type as one run [i, next_i);
// NOTE(review): only the type is compared, not whether positions are adjacent
for(; next_i < _edits->size(); next_i++) {
Edit& next_ed = (*_edits)[next_i];
if(ed.type != next_ed.type) break;
}
}
if(ed.snpID == (index_t)INDEX_MAX) {
// joined reference position of this edit
ALT<index_t> cmp_alt;
cmp_alt.pos = joinedOff + ed.pos + offset;
index_t alt_i = (index_t)alts.bsearchLoBound(cmp_alt);
for(; alt_i < alts.size(); alt_i++) {
const ALT<index_t>& alt = alts[alt_i];
// stop once the ALTs start beyond the edit's position
if(alt.left > cmp_alt.pos) break;
if(ed.type == EDIT_TYPE_MM) {
if(alt.type != ALT_SNP_SGL) continue;
if("ACGT"[alt.seq] == ed.qchr) {
ed.snpID = alt_i;
break;
}
} else {
// length of the gap run
size_t gap = next_i - i;
if(ed.type == EDIT_TYPE_READ_GAP) {
if(alt.type != ALT_SNP_DEL) continue;
if(alt.len == gap) {
for(size_t ii = i; ii < next_i; ii++) {
Edit& ii_ed = (*_edits)[ii];
ii_ed.snpID = alt_i;
}
break;
}
} else {
assert_eq(ed.type, EDIT_TYPE_REF_GAP);
if(alt.type != ALT_SNP_INS) continue;
if(alt.len == gap) {
// pack the inserted read bases, two bits each, for comparison
uint64_t seq = 0;
for(size_t ii = i; ii < next_i; ii++) {
Edit& ii_ed = (*_edits)[ii];
seq = (seq << 2) | asc2dna[ii_ed.qchr];
}
if(alt.seq == seq) {
for(size_t ii = i; ii < next_i; ii++) {
Edit& ii_ed = (*_edits)[ii];
ii_ed.snpID = alt_i;
}
break;
}
}
}
}
}
}
// account for the reference/read shift introduced by this edit (run)
if(ed.type == EDIT_TYPE_SPL) {
offset += ed.splLen;
} else if(ed.type == EDIT_TYPE_READ_GAP || ed.type == EDIT_TYPE_REF_GAP) {
size_t gap = next_i - i;
if(ed.type == EDIT_TYPE_READ_GAP) {
assert_gt(joinedOff, gap);
offset += gap;
} else {
offset -= gap;
}
}
i = next_i;
}
// snpIDs may have changed, so refresh the score
calculateScore(rd,
ssdb,
sc,
minK_local,
minIntronLen,
maxIntronLen,
minAnchorLen,
minAnchorLen_noncan,
ref);
}
private:
/**
* Calculate alignment score
* (defined outside this part of the file; called whenever the trims or the
* SNP ids of the edits change).
*/
int64_t calculateScore(
const Read& rd,
SpliceSiteDB& ssdb,
const Scoring& sc,
index_t minK_local,
index_t minIntronLen,
index_t maxIntronLen,
index_t minAnchorLen,
index_t minAnchorLen_noncan,
const BitPairReference& ref);
public:
bool _fw; // strand flag
index_t _rdoff; // offset of the hit on the read
index_t _len; // length of the hit; INDEX_MAX until init()
index_t _trim5; // amount recorded by the trim5() setter
index_t _trim3; // amount recorded by the trim3() setter
index_t _tidx; // reference sequence index
index_t _toff; // offset of the left end within the reference sequence
index_t _joinedOff; // base offset used to look edits up in the ALT list
bool _repeat; // repeat-alignment flag
EList<Edit>* _edits; // payload of _edits_node
EList<pair<index_t, index_t> >* _ht_list; // payload of _ht_list_node
int64_t _score;
int64_t _localscore;
double _splicescore;
index_t _hitcount; // for selection purposes
LinkedEListNode<EList<Edit> >* _edits_node; // node taken from _sharedVars->raw_edits
LinkedEListNode<EList<pair<index_t, index_t> > >* _ht_list_node; // node taken from _sharedVars->raw_ht_lists
SharedTempVars<index_t>* _sharedVars; // scratch storage this hit is bound to
};
/**
 * Check if it is compatible with another GenomeHit with respect to indels or introns.
 *
 * This hit has to be the left one of the pair. The check fails if the two
 * hits are the same object, lie on different strands or contigs, or if this
 * hit starts later or reaches further than otherHit on the read, or starts
 * later on the reference. The same ordering is then required between the
 * rightmost edit-free part of this hit and the leftmost edit-free part of
 * otherHit. Finally, unless no_spliced_alignment is set, the reference gap
 * between those parts may exceed the read gap by at most maxIntronLen.
 * minIntronLen is not used here.
 */
template <typename index_t>
bool GenomeHit<index_t>::compatibleWith(
                                        const GenomeHit<index_t>& otherHit,
                                        index_t minIntronLen,
                                        index_t maxIntronLen,
                                        bool no_spliced_alignment) const
{
    // a hit is never paired with itself
    if(&otherHit == this) return false;

    // same strand and same contig
    const bool sameStrand = (_fw == otherHit._fw);
    const bool sameContig = (_tidx == otherHit._tidx);
    if(!sameStrand || !sameContig) return false;

    // this hit must come first on the read ...
    if(otherHit._rdoff < _rdoff) return false;
    // ... must not reach beyond otherHit's read end ...
    if(otherHit._rdoff + otherHit._len < _rdoff + _len) return false;
    // ... and must come first on the reference
    if(otherHit._toff < _toff) return false;

    // right part of this hit
    index_t lhs_rdoff, lhs_len, lhs_toff;
    this->getRight(lhs_rdoff, lhs_len, lhs_toff);
    assert_geq(lhs_len, 0);

    // left part of the other hit
    index_t rhs_rdoff, rhs_len, rhs_toff;
    otherHit.getLeft(rhs_rdoff, rhs_len, rhs_toff);
    assert_geq(rhs_len, 0);

    // the same ordering must hold for the two parts
    if(rhs_rdoff < lhs_rdoff) return false;
    if(rhs_rdoff + rhs_len < lhs_rdoff + lhs_len) return false;
    if(rhs_toff < lhs_toff) return false;

    // a deletion, an insertion, or a potential intron between the two
    // parts is acceptable as long as it is not too long
    const index_t refdif = rhs_toff - lhs_toff;
    const index_t rddif  = rhs_rdoff - lhs_rdoff;
    if(no_spliced_alignment) return true;
    return !(refdif > rddif + maxIntronLen);
}
/**
 * Translate a reference base through a lookup table when running in 3N mode.
 *
 * @param threeN   non-zero to translate, zero to return base unchanged
 * @param mapping  lookup table indexed by the byte value of base; must cover
 *                 every byte value that can be passed in
 * @param base     the reference base
 * @return mapping[base] when threeN is non-zero, otherwise base
 */
static inline char get_ref_base(int threeN, int* mapping, char base)
{
    if(!threeN) {
        return base;
    }
    // char may be signed: go through unsigned char so that bytes >= 0x80
    // index forwards into the table instead of before its start.
    return (char)mapping[(unsigned char)base];
}
/**
* Combine itself with another GenomeHit
* while allowing mismatches, an insertion, a deletion, or an intron.
*
* The right-most stretch of this hit (getRight) and the left-most stretch of
* otherHit (getLeft) are compared against the reference to find the junction
* position (splice site, insertion, or deletion) with the best score. The
* edits of both hits are then merged into this hit, and _len, _trim3 and the
* score (via calculateScore) are updated.
*
* Within this function body, gfm is referenced only in the debug-only
* consistency check at the end, and repeatdb, swa, swm, rnd and maxAltsTried
* are not referenced at all.
*
* @param otherHit   hit to merge into this one; the asserts below expect it to
*                   be compatibleWith() this hit
* @param minsc      minimum alignment score; used to derive the score budget
*                   (remainsc) for the re-aligned region
* @param spliceSite when non-NULL, restricts the splice-junction search to the
*                   position given by spliceSite->left()
* @param no_spliced_alignment when true, a reference gap is never treated as
*                   an intron
* @return true if the two hits were merged into this hit; false otherwise.
*         NOTE(review): one of the false returns (in the indel branch) happens
*         after _edits has already been modified.
*/
template <typename index_t>
bool GenomeHit<index_t>::combineWith(
const GenomeHit& otherHit,
const Read& rd,
const GFM<index_t>& gfm,
const BitPairReference& ref,
const ALTDB<index_t>& altdb,
const RepeatDB<index_t>& repeatdb,
SpliceSiteDB& ssdb,
SwAligner& swa,
SwMetrics& swm,
const Scoring& sc,
TAlScore minsc,
RandomSource& rnd, // pseudo-random source
index_t minK_local,
index_t minIntronLen,
index_t maxIntronLen,
index_t minAnchorLen, // minimum anchor length for canonical splice site
index_t minAnchorLen_noncan, // minimum anchor length for non-canonical splice site
const index_t maxAltsTried,
const SpliceSite* spliceSite, // penalty for splice site
bool no_spliced_alignment)
{
// a hit cannot be combined with itself
if(this == &otherHit) return false;
assert(compatibleWith(otherHit, minIntronLen, maxIntronLen, no_spliced_alignment));
assert_eq(this->_tidx, otherHit._tidx);
assert_lt(this->_tidx, ref.numRefs());
// get the partial part of the alignment from the right
// until an indel or splice sites
index_t this_rdoff, this_len, this_toff;
int64_t this_score;
this->getRight(this_rdoff, this_len, this_toff, &this_score, &rd, &sc);
assert_geq(this_len, 0);
assert_leq(this_score, 0);
assert_geq(this_score, this->_score);
// get the partial part of the other alignment from the left
// until an indel or splice sites
index_t other_rdoff, other_len, other_toff;
int64_t other_score;
otherHit.getLeft(other_rdoff, other_len, other_toff, &other_score, &rd, &sc);
assert_geq(other_len, 0);
assert_leq(other_score, 0);
assert_geq(other_score, otherHit._score);
assert_leq(this_rdoff, other_rdoff);
// reject the case where this hit's right stretch extends past the end of
// the other hit's left stretch on the read
if(this_len != 0 &&
other_len != 0 &&
this_rdoff + this_len > other_rdoff + other_len) return false;
assert_leq(this_rdoff + this_len, other_rdoff + other_len);
// len: number of read positions from this_rdoff to the end of the other
// hit's left stretch, i.e. the region that is re-examined below
index_t len = other_rdoff - this_rdoff + other_len;
const index_t reflen = ref.approxLen(_tidx);
if(this_toff + len > reflen) return false;
assert_leq(this_toff + len, reflen);
// check if an indel or an intron is necessary
index_t refdif = other_toff - this_toff;
index_t rddif = other_rdoff - this_rdoff;
bool spliced = false, ins = false, del = false;
if(refdif != rddif) {
if(refdif > rddif) {
// the reference spans more than the read: an intron when spliced
// alignment is allowed and the difference reaches minIntronLen,
// otherwise a deletion
if(!no_spliced_alignment && refdif - rddif >= minIntronLen) {
assert_leq(refdif - rddif, maxIntronLen);
spliced = true;
} else {
del = true;
}
} else {
// the read spans more than the reference: an insertion
ins = true;
}
}
#ifndef NDEBUG
// at most one of spliced / ins / del may be set
if(ins) {
assert(!spliced && !del);
} else {
if(spliced) assert(!del);
else assert(!spliced);
}
#endif
if(no_spliced_alignment) {
if(spliced) return false;
}
// if the combination of the two alignments does not involve an indel or an intron,
// then simply combine them and return
if(!spliced && !ins && !del && this_rdoff + this_len == other_rdoff) {
// edit positions are relative to the start of their hit, so shift the
// other hit's edits by the distance between the two read offsets
index_t addoff = otherHit._rdoff - this->_rdoff;
for(index_t i = 0; i < otherHit._edits->size(); i++) {
_edits->push_back((*otherHit._edits)[i]);
_edits->back().pos += addoff;
}
_len += otherHit._len;
calculateScore(
rd,
ssdb,
sc,
minK_local,
minIntronLen,
maxIntronLen,
minAnchorLen,
minAnchorLen_noncan,
ref);
assert(repOk(rd, ref));
return true;
}
// calculate the maximum gap lengths based on the current score and the minimum alignment score to be reported
const BTDnaString& seq = this->_fw ? rd.patFw : rd.patRc;
const BTString& qual = this->_fw ? rd.qual : rd.qualRev;
index_t rdlen = (index_t)seq.length();
// remainsc: minsc minus the parts of both hits' scores that are not
// attributed to the stretches returned by getRight/getLeft; clamped so it
// is never positive
int64_t remainsc = minsc - (_score - this_score) - (otherHit._score - other_score);
if(remainsc > 0) remainsc = 0;
int read_gaps = 0, ref_gaps = 0;
if(!spliced) {
read_gaps = sc.maxReadGaps(remainsc + sc.canSpl(), rdlen);
ref_gaps = sc.maxRefGaps(remainsc + sc.canSpl(), rdlen);
}
// give up if the required indel is longer than the gap budget allows
if(ins) {
if(refdif + ref_gaps < rddif) {
return false;
}
} else if(del) {
if(rddif + read_gaps < refdif) {
return false;
}
}
// number of extra reference bases fetched to the right of the region on
// this hit's side (needed to read intronic bases next to a junction),
// capped so the fetch stays within the reference length
int this_ref_ext = read_gaps;
if(spliced) this_ref_ext += (int)intronic_len;
if(this_toff + len > reflen) return false;
if(this_toff + len + this_ref_ext > reflen) this_ref_ext = reflen - (this_toff + len);
assert(_sharedVars != NULL);
SStringExpandable<char>& raw_refbuf = _sharedVars->raw_refbuf;
EList<int64_t>& temp_scores = _sharedVars->temp_scores;
EList<int64_t>& temp_scores2 = _sharedVars->temp_scores2;
ASSERT_ONLY(SStringExpandable<uint32_t>& destU32 = _sharedVars->destU32);
// refbuf[i] is the reference base at this_toff + i, i.e. the base opposite
// read position this_rdoff + i when following this hit's diagonal
raw_refbuf.resize(len + this_ref_ext + 16);
int off = ref.getStretch(
reinterpret_cast<uint32_t*>(raw_refbuf.wbuf()),
(size_t)this->_tidx,
(size_t)this_toff,
len + this_ref_ext
ASSERT_ONLY(, destU32));
assert_lt(off, 16);
char *refbuf = raw_refbuf.wbuf() + off, *refbuf2 = NULL;
// discover a splice site, an insertion, or a deletion
index_t maxscorei = (index_t)INDEX_MAX;
int64_t maxscore = MIN_I64;
uint32_t maxspldir = SPL_UNKNOWN;
float maxsplscore = 0.0f;
// allow an indel near a splice site
// NOTE(review): splice_gap_maxscorei and splice_gap_off are never assigned
// again in this function, so every branch below that depends on them having
// other values is inactive as written.
index_t splice_gap_maxscorei = (index_t)INDEX_MAX;
int64_t donor_seq = 0, acceptor_seq = 0;
int splice_gap_off = 0;
// identity table by default; in 3N mode one entry is redirected so that the
// converted-from base maps to the converted-to base before the reference is
// compared with the read
int refConversion_3N[5] = {0, 1, 2, 3, 4};
if (threeN){
if (((rd.threeN_cycle == threeN_type1conversion_FW || rd.threeN_cycle == threeN_type2conversion_RC) && !rd.oppositeConversion_3N) ||
((rd.threeN_cycle == threeN_type1conversion_RC || rd.threeN_cycle == threeN_type2conversion_FW) && rd.oppositeConversion_3N)) {
// type 1 conversion
refConversion_3N[asc2dna[hs3N_convertedFrom]] = asc2dna[hs3N_convertedTo];
} else {
// type 2 conversion
refConversion_3N[asc2dna[hs3N_convertedFromComplement]] = asc2dna[hs3N_convertedToComplement];
}
}
if(spliced || ins || del) {
// refbuf2[i] is the reference base at other_toff + other_len - len + i,
// i.e. the base opposite read position this_rdoff + i when following the
// other hit's diagonal; other_ref_ext extra bases are fetched to the left
// and are reachable through negative indices
int other_ref_ext = min<int>(read_gaps + (int)intronic_len, other_toff + other_len - len);
SStringExpandable<char>& raw_refbuf2 = _sharedVars->raw_refbuf2;
raw_refbuf2.resize(len + other_ref_ext + 16);
int off2 = ref.getStretch(
reinterpret_cast<uint32_t*>(raw_refbuf2.wbuf()),
(size_t)otherHit._tidx,
(size_t)(other_toff + other_len - len - other_ref_ext),
len + other_ref_ext
ASSERT_ONLY(, destU32));
refbuf2 = raw_refbuf2.wbuf() + off2 + other_ref_ext;
temp_scores.resize(len);
temp_scores2.resize(len);
if(spliced) {
// two adjacent base codes packed as (first << 4) | second, matching
// how donor/acceptor are assembled below
static const char GT = 0x23, AG = 0x02;
static const char GTrc = 0x01, AGrc = 0x13;
static const char GC = 0x21, GCrc = 0x21;
static const char AT = 0x03, AC = 0x01;
static const char ATrc = 0x03, ACrc = 0x20;
static const char AA = 0x00, AArc = 0x33;
// temp_scores[i]: running sum of sc.score() over mismatching positions
// 0..i against refbuf; stop once the sum falls below remainsc
int i;
for(i = 0; i < (int)len; i++) {
int rdc = seq[this_rdoff + i], rfc = get_ref_base(threeN, refConversion_3N, refbuf[i]);
if(i > 0) {
temp_scores[i] = temp_scores[i-1];
} else {
temp_scores[i] = 0;
}
if(rdc != rfc) {
temp_scores[i] += sc.score(rdc, 1 << rfc, qual[this_rdoff + i] - 33);
}
if(temp_scores[i] < remainsc) {
break;
}
}
int i_limit = min<int>(i, len);
// temp_scores2[i2]: running sum of sc.score() over mismatching positions
// i2..len-1 against refbuf2, scanned from the right
int i2;
for(i2 = len - 1; i2 >= 0; i2--) {
int rdc = seq[this_rdoff + i2], rfc = get_ref_base(threeN, refConversion_3N, refbuf2[i2]);
if((index_t)(i2 + 1) < len) {
temp_scores2[i2] = temp_scores2[i2+1];
} else {
temp_scores2[i2] = 0;
}
if(rdc != rfc) {
temp_scores2[i2] += sc.score(rdc, 1 << rfc, qual[this_rdoff + i2] - 33);
}
if(temp_scores2[i2] < remainsc) {
break;
}
}
int i2_limit = max<int>(i2, 0);
// with a given splice site, narrow the search to at most the single
// position it dictates (or to nothing if that position is out of range)
if(spliceSite != NULL){
assert_leq(this_toff, (int)spliceSite->left());
if(i2_limit <= (int)(spliceSite->left() - this_toff)) {
i2_limit = (int)(spliceSite->left() - this_toff);
i_limit = i2_limit + 1;
} else {
i_limit = i2_limit;
}
}
// try each junction: read positions 0..i follow this hit's diagonal and
// positions i2 = i+1.. follow the other hit's diagonal
for(i = i2_limit, i2 = i2_limit + 1;
i < i_limit && i2 < (int)len;
i++, i2++) {
int64_t tempscore = temp_scores[i] + temp_scores2[i2];
// first two reference bases after position i (donor side) and the two
// reference bases before position i2 (acceptor side); 0xff when the
// bases are not available in the fetched buffers
char donor = 0xff, acceptor = 0xff;
if((index_t)(i + 2) < len + this_ref_ext) {
donor = refbuf[i + 1];
donor = (donor << 4) | refbuf[i + 2];
}
if(i2 - 2 >= -other_ref_ext) {
acceptor = refbuf2[i2 - 2];
acceptor = (acceptor << 4) | refbuf2[i2 - 1];
}
bool canonical = false, semi_canonical = false;
uint32_t spldir = SPL_UNKNOWN;
if((donor == GT && acceptor == AG) /* || (donor == AT && acceptor == AC) */) {
spldir = SPL_FW;
canonical = true;
} else if((donor == AGrc && acceptor == GTrc) /* || (donor == ACrc && acceptor == ATrc) */) {
spldir = SPL_RC;
canonical = true;
} else if((donor == GC && acceptor == AG) || (donor == AT && acceptor == AC)) {
spldir = SPL_SEMI_FW;
semi_canonical = true;
} else if((donor == AGrc && acceptor == GCrc) || (donor == ACrc && acceptor == ATrc)) {
spldir = SPL_SEMI_RC;
semi_canonical = true;
}
tempscore -= (canonical ? sc.canSpl() : sc.noncanSpl());
int64_t temp_donor_seq = 0, temp_acceptor_seq = 0;
float splscore = 0.0f;
if(canonical) {
// in case of canonical splice site, extract donor side sequence and acceptor side sequence
// to calculate a score of the splicing event.
if(spldir == SPL_FW) {
// only when the full donor/acceptor windows lie inside the buffers
if(i + 1 >= (int)donor_exonic_len &&
(int)(len + this_ref_ext) > i + (int)donor_intronic_len &&
i2 + (int)other_ref_ext >= (int)acceptor_intronic_len &&
(int)len > i2 + (int)acceptor_exonic_len - 1) {
int from = i + 1 - (int)donor_exonic_len;
int to = i + (int)donor_intronic_len;
// pack the window 2 bits per base; codes above 3 are stored as 0
for(int j = from; j <= to; j++) {
assert_geq(j, 0);
assert_lt(j, (int)(len + this_ref_ext));
int base = refbuf[j];
if(base > 3) base = 0;
temp_donor_seq = temp_donor_seq << 2 | base;
}
from = i2 - acceptor_intronic_len;
to = i2 + acceptor_exonic_len - 1;
for(int j = from; j <= to; j++) {
assert_geq(j, -(int)other_ref_ext);
assert_lt(j, (int)len);
int base = refbuf2[j];
if(base > 3) base = 0;
temp_acceptor_seq = temp_acceptor_seq << 2 | base;
}
}
} else if(spldir == SPL_RC) {
// reverse strand: the windows are read right-to-left and each base is
// complemented (base ^ 0x3); the acceptor is on this hit's side and
// the donor on the other hit's side
if(i + 1 >= (int)acceptor_exonic_len &&
(int)(len + this_ref_ext) > i + (int)acceptor_intronic_len &&
i2 + (int)other_ref_ext >= (int)donor_intronic_len &&
(int)len > i2 + (int)donor_exonic_len - 1) {
int from = i + 1 - (int)acceptor_exonic_len;
int to = i + (int)acceptor_intronic_len;
for(int j = to; j >= from; j--) {
assert_geq(j, 0);
assert_lt(j, (int)(len + this_ref_ext));
int base = refbuf[j];
if(base > 3) base = 0;
temp_acceptor_seq = temp_acceptor_seq << 2 | (base ^ 0x3);
}
from = i2 - donor_intronic_len;
to = i2 + donor_exonic_len - 1;
for(int j = to; j >= from; j--) {
assert_geq(j, -(int)other_ref_ext);
assert_lt(j, (int)len);
int base = refbuf2[j];
if(base > 3) base = 0;
temp_donor_seq = temp_donor_seq << 2 | (base ^ 0x3);
}
}
}
splscore = SpliceSiteDB::probscore(temp_donor_seq, temp_acceptor_seq);
}
// daehwan - for debugging purposes
// choose a splice site with the better score
// A candidate replaces the current best when:
//  - neither has a known direction and the candidate scores higher, or
//    ties while being semi-canonical (semi_canonical implies a known
//    direction, so that second clause looks unreachable — NOTE(review));
//  - both have a known direction and the candidate scores higher, or ties
//    with a higher splscore;
//  - the current best has no known direction but the candidate does.
if((maxspldir == SPL_UNKNOWN && spldir == SPL_UNKNOWN && maxscore < tempscore) ||
(maxspldir == SPL_UNKNOWN && spldir == SPL_UNKNOWN && maxscore == tempscore && semi_canonical) ||
(maxspldir != SPL_UNKNOWN && spldir != SPL_UNKNOWN && (maxscore < tempscore || (maxscore == tempscore && maxsplscore < splscore))) ||
(maxspldir == SPL_UNKNOWN && spldir != SPL_UNKNOWN)) {
maxscore = tempscore;
maxscorei = i;
maxspldir = spldir;
maxsplscore = splscore;
if(maxspldir != SPL_UNKNOWN) {
donor_seq = temp_donor_seq;
acceptor_seq = temp_acceptor_seq;
} else {
donor_seq = 0;
acceptor_seq = 0;
}
}
}
} else {
// discover an insertion or a deletion
assert(ins || del);
int inslen = (ins ? rddif - refdif : 0);
int dellen = (del ? refdif - rddif : 0);
// affine gap penalty (open + extend per additional base), negated
int64_t gap_penalty;
if(ins) {
gap_penalty = -(sc.refGapOpen() + sc.refGapExtend() * (inslen - 1));
} else {
assert(del);
gap_penalty = -(sc.readGapOpen() + sc.readGapExtend() * (dellen - 1));
}
// the gap alone already exceeds the score budget
if(gap_penalty < remainsc) return false;
// same left-to-right / right-to-left running sums as in the spliced
// case, but the cut-off also accounts for the gap penalty
int i;
for(i = 0; i < (int)len; i++) {
int rdc = seq[this_rdoff + i], rfc = get_ref_base(threeN, refConversion_3N, refbuf[i]);
if(i > 0) {
temp_scores[i] = temp_scores[i-1];
} else {
temp_scores[i] = 0;
}
if(rdc != rfc) {
temp_scores[i] += sc.score(rdc, 1 << rfc, qual[this_rdoff + i] - 33);
}
if(temp_scores[i] + gap_penalty < remainsc) {
break;
}
}
int i_limit = min<int>(i, len);
int i2;
for(i2 = len - 1; i2 >= 0; i2--) {
int rdc = seq[this_rdoff + i2], rfc = get_ref_base(threeN, refConversion_3N, refbuf2[i2]);
if((index_t)(i2 + 1) < len) {
temp_scores2[i2] = temp_scores2[i2+1];
} else {
temp_scores2[i2] = 0;
}
if(rdc != rfc) {
temp_scores2[i2] += sc.score(rdc, 1 << rfc, qual[this_rdoff + i2] - 33);
}
if(temp_scores2[i2] + gap_penalty < remainsc) {
break;
}
}
// for an insertion the right part resumes inslen read positions after
// the left part ends (those read bases have no reference counterpart)
int i2_limit = (i2 < inslen ? 0 : i2 - inslen);
for(i = i2_limit, i2 = i2_limit + 1 + inslen;
i < i_limit && i2 < (int)len;
i++, i2++) {
int64_t tempscore = temp_scores[i] + temp_scores2[i2] + gap_penalty;
if(maxscore < tempscore) {
maxscore = tempscore;
maxscorei = i;
}
}
}
// no junction position was acceptable
if(maxscore == MIN_I64) return false;
assert_lt(maxscorei, len);
// for a newly discovered splice site (none supplied by the caller), reject
// junctions whose shorter anchor is below the minimum anchor length when the
// intron-length probability exceeds 0.01
if(spliced && spliceSite == NULL) {
uint32_t shorter_anchor_len = min<uint32_t>(maxscorei + 1, len - maxscorei - 1);
assert_leq(this_toff, other_toff);
if(maxspldir == SPL_SEMI_FW || maxspldir == SPL_SEMI_RC || maxspldir == SPL_UNKNOWN) {
if(shorter_anchor_len < minAnchorLen_noncan) {
float intronLenProb = intronLen_prob_noncan(shorter_anchor_len, other_toff - this_toff, maxIntronLen);
if(intronLenProb > 0.01f)
return false;
}
} else {
if(shorter_anchor_len < minAnchorLen) {
float intronLenProb = intronLen_prob(shorter_anchor_len, other_toff - this_toff, maxIntronLen);
if(intronLenProb > 0.01f)
return false;
}
}
}
if(maxscore < remainsc)
return false;
}
// drop this hit's trailing plain mismatches (those after its last splice,
// gap, or SNP-annotated mismatch); they are regenerated below while the
// region is re-walked. If no such edit exists, drop all edits.
bool clear = true;
for(int i = (int)_edits->size() - 1; i >= 0; i--) {
const Edit& edit = (*_edits)[i];
if(edit.type == EDIT_TYPE_SPL ||
edit.type == EDIT_TYPE_READ_GAP ||
edit.type == EDIT_TYPE_REF_GAP ||
(edit.type == EDIT_TYPE_MM && edit.snpID != (index_t)INDEX_MAX)) {
_edits->resize(i+1);
clear = false;
break;
}
}
if(clear) this->_edits->clear();
// combine two alignments while updating edits
if(spliced) {
assert_geq(this_rdoff, this->_rdoff);
// addoff converts positions within the region into positions relative to
// the start of this hit
index_t addoff = this_rdoff - this->_rdoff;
// both are 0 here because splice_gap_off stays 0 in this function
int rd_gap_off = -min<int>(splice_gap_off, 0);
int ref_gap_off = max<int>(splice_gap_off, 0);
for(int i = 0; i < (int)len; i++) {
assert_lt(this_rdoff + i, rdlen);
int rdc = seq[this_rdoff + i];
assert_range(0, 4, rdc);
// pick the reference base from refbuf (up to the junction) or refbuf2
// (after it); the extra cases handle an indel next to the splice site
int rfc;
if(splice_gap_maxscorei <= maxscorei) {
if(i <= (int)splice_gap_maxscorei) {
rfc = get_ref_base(threeN, refConversion_3N, refbuf[i]);
} else if(i <= (int)maxscorei) {
rfc = get_ref_base(threeN, refConversion_3N, refbuf[i - ref_gap_off + rd_gap_off]);
} else {
rfc = get_ref_base(threeN, refConversion_3N, refbuf2[i]);
}
} else {
if(i <= (int)maxscorei) {
rfc = get_ref_base(threeN, refConversion_3N, refbuf[i]);
} else if(i <= (int)splice_gap_maxscorei) {
rfc = get_ref_base(threeN, refConversion_3N, refbuf2[i + ref_gap_off - rd_gap_off]);
} else {
rfc = get_ref_base(threeN, refConversion_3N, refbuf2[i]);
}
}
assert_range(0, 4, rfc);
if(rdc != rfc) {
Edit e((uint32_t)(i + addoff), rfc, rdc, EDIT_TYPE_MM, false);
_edits->push_back(e);
}
// emit the splice edit right after the last base on this hit's side;
// its length is the reference distance between the two sides
if(i == (int)maxscorei) {
index_t left = this_toff + i + 1;
if(splice_gap_maxscorei <= maxscorei) {
left = left - ref_gap_off + rd_gap_off;
}
index_t right = other_toff + other_len - (len - i - 1);
if(splice_gap_maxscorei > maxscorei) {
right = right + ref_gap_off - rd_gap_off;
}
index_t skipLen = 0;
assert_lt(left, right);
skipLen = right - left;
Edit e((uint32_t)(i + 1 + addoff), 0, 0, EDIT_TYPE_SPL, skipLen, maxspldir, spliceSite != NULL, false);
e.donor_seq = donor_seq;
e.acceptor_seq = acceptor_seq;
_edits->push_back(e);
}
// indel adjacent to the splice site (inactive while splice_gap_off is 0)
if(i == (int)splice_gap_maxscorei && splice_gap_off != 0) {
if(rd_gap_off > 0) {
// NOTE(review): the locals `left` and `right` declared in the preceding
// if-block are out of scope here; confirm what this assert refers to in
// debug builds.
assert_lt(left, right);
for(index_t j = 0; j < (index_t)rd_gap_off; j++) {
int temp_rfc_off = i + 1 + j;
int temp_rfc;
if(i < (int)maxscorei) {
temp_rfc = get_ref_base(threeN, refConversion_3N, refbuf[temp_rfc_off]);
} else {
temp_rfc = get_ref_base(threeN, refConversion_3N, refbuf2[temp_rfc_off - rd_gap_off]);
}
assert_range(0, 4, temp_rfc);
Edit e((uint32_t)(i + 1 + addoff), "ACGTN"[temp_rfc], '-', EDIT_TYPE_READ_GAP);
_edits->push_back(e);
}
} else {
assert_gt(ref_gap_off, 0);
for(index_t j = 0; j < (index_t)ref_gap_off; j++) {
assert_lt(this_rdoff + i + 1 + j, rdlen);
int temp_rdc = seq[this_rdoff + i + 1 + j];
assert_range(0, 4, temp_rdc);
Edit e((uint32_t)(i + 1 + j + addoff), '-', "ACGTN"[temp_rdc], EDIT_TYPE_REF_GAP);
_edits->push_back(e);
}
i += ref_gap_off;
}
}
}
} else {
// insertion or deletion: walk the region, taking reference bases from
// refbuf up to the junction and from refbuf2 after it
index_t ins_len = 0;
for(index_t i = 0; i < len; i++) {
char rdc = seq[this_rdoff + i];
char rfc = (i <= maxscorei ? get_ref_base(threeN, refConversion_3N, refbuf[i]) : get_ref_base(threeN, refConversion_3N, refbuf2[i]));
assert_geq(this_rdoff, this->_rdoff);
index_t addoff = this_rdoff - this->_rdoff;
if(rdc != rfc) {
// look for a single-nucleotide ALT whose sequence equals the read base,
// searching from the lower bound of the computed joined offset; its
// index (or the index_t maximum if none) is recorded in the mismatch
ALT<index_t> cmp_alt;
assert_geq(this_toff, this->_toff);
cmp_alt.pos = this->_joinedOff + i + (this_toff - this->_toff) - ins_len;
index_t alt_i = (index_t)altdb.alts().bsearchLoBound(cmp_alt);
index_t add_alt_i = std::numeric_limits<index_t>::max();
for(; alt_i < altdb.alts().size(); alt_i++) {
const ALT<index_t>& alt = altdb.alts()[alt_i];
if(alt.left > cmp_alt.pos) break;
if(alt.type != ALT_SNP_SGL) continue;
if(alt.seq == rdc) {
add_alt_i = alt_i;
break;
}
}
Edit e((uint32_t)(i + addoff), rfc, rdc, EDIT_TYPE_MM, false, add_alt_i);
_edits->push_back(e);
}
if(i == maxscorei) {
index_t left = this_toff + i + 1;
// NOTE(review): this returns false after _edits was trimmed and
// appended to above, leaving this hit partially modified.
if(other_toff + other_len < len - i - 1)
return false;
index_t right = other_toff + other_len - (len - i - 1);
index_t skipLen = 0;
if(del) {
// one READ_GAP edit per deleted reference base, all at the same read
// position; bases beyond the fetched buffer are read via ref.getBase
assert_lt(left, right);
skipLen = right - left;
for(index_t j = 0; j < skipLen; j++) {
int temp_rfc;
if(i + 1 + j < len) temp_rfc = get_ref_base(threeN, refConversion_3N, refbuf[i + 1 + j]);
else temp_rfc = get_ref_base(threeN, refConversion_3N, ref.getBase(this->_tidx, this_toff + i + 1 + j));
assert_range(0, 4, temp_rfc);
Edit e((uint32_t)(i + 1 + addoff), "ACGTN"[temp_rfc], '-', EDIT_TYPE_READ_GAP);
_edits->push_back(e);
}
} else {
// one REF_GAP edit per inserted read base; then skip those read
// positions and remember how many were inserted
assert(ins);
assert_lt(right, left);
skipLen = left - right;
for(index_t j = 0; j < skipLen; j++) {
assert_lt(this_rdoff + i + 1 + j, seq.length());
int temp_rdc = seq[this_rdoff + i + 1 + j];
assert_range(0, 4, temp_rdc);
Edit e((uint32_t)(i + 1 + j + addoff), '-', "ACGTN"[temp_rdc], EDIT_TYPE_REF_GAP);
_edits->push_back(e);
}
i += skipLen;
ins_len += skipLen;
}
}
}
}
// append the other hit's edits starting from its first splice, gap, or
// SNP-annotated mismatch (earlier plain mismatches lie in the region that was
// just re-walked), shifting positions to be relative to this hit
index_t fsi = (index_t)otherHit._edits->size();
for(index_t i = 0; i < otherHit._edits->size(); i++) {
const Edit& edit = (*otherHit._edits)[i];
if(edit.type == EDIT_TYPE_SPL ||
edit.type == EDIT_TYPE_READ_GAP ||
edit.type == EDIT_TYPE_REF_GAP ||
(edit.type == EDIT_TYPE_MM && edit.snpID != (index_t)INDEX_MAX)) {
fsi = i;
break;
}
}
assert_leq(this->_rdoff, otherHit._rdoff);
index_t addoff = otherHit._rdoff - this->_rdoff;
for(index_t i = fsi; i < otherHit._edits->size(); i++) {
_edits->push_back((*otherHit._edits)[i]);
_edits->back().pos += addoff;
}
// for alignment involving indel, left align so that
// indels go to the left most of the combined alignment
if(ins || del || (spliced && splice_gap_off != 0)) {
leftAlign(rd);
}
// update alignment score, trims
assert_leq(this->_rdoff + this->_len, otherHit._rdoff + otherHit._len);
_len = otherHit._rdoff + otherHit._len - this->_rdoff;
assert_eq(_trim3, 0);
_trim3 += otherHit._trim3;
calculateScore(
rd,
ssdb,
sc,
minK_local,
minIntronLen,
maxIntronLen,
minAnchorLen,
minAnchorLen_noncan,
ref);
#ifndef NDEBUG
// debug-only: the joined offset must map back to this hit's reference
// index and offset
if(_joinedOff != (index_t)INDEX_MAX) {
ASSERT_ONLY(bool straddled = false);
ASSERT_ONLY(index_t tmp_tidx = 0, tmp_toff = 0, tmp_tlen = 0);
gfm.joinedToTextOff(
0,
_joinedOff,
tmp_tidx,
tmp_toff,
tmp_tlen,
true, // reject straddlers?
straddled); // straddled?
assert_eq(tmp_tidx, _tidx);
assert_eq(tmp_toff, _toff);
}
#endif
assert(repOk(rd, ref));
return true;
}
/**
* Extend the partial alignment (GenomeHit) bidirectionally
*
* The hit is first extended to the left and then to the right with
* alignWithALTs, allowing up to 'mm' mismatches, and its score is recomputed
* with calculateScore.
*
* Within this function body, gfm is referenced only in the debug-only
* consistency check, minsc only in an assert, and repeatdb, swa, swm, prm and
* rnd are not referenced at all.
*
* @param tpol     supplies the intron-length and anchor-length limits passed
*                 to calculateScore
* @param leftext  in: maximum left extension requested (0 disables it);
*                 out: number of read bases actually added on the left
* @param rightext in: maximum right extension requested (0 disables it);
*                 out: number of read bases actually added on the right
* @param mm       number of mismatches allowed during extension
* @return true if the hit grew in at least one direction. Returns false
*         early if the hit starts at reference offset 0 when a left extension
*         is attempted, or if a zero-length hit with mm == 0 picked up edits.
*/
template <typename index_t>
bool GenomeHit<index_t>::extend(
const Read& rd,
const GFM<index_t>& gfm,
const BitPairReference& ref,
const ALTDB<index_t>& altdb,
const RepeatDB<index_t>& repeatdb,
SpliceSiteDB& ssdb,
SwAligner& swa,
SwMetrics& swm,
PerReadMetrics& prm,
const Scoring& sc,
TAlScore minsc,
RandomSource& rnd, // pseudo-random source
index_t minK_local,
const TranscriptomePolicy& tpol,
const GraphPolicy& gpol,
index_t& leftext,
index_t& rightext,
index_t mm)
{
assert_lt(this->_tidx, ref.numRefs());
// remember the requested limits, then reuse the arguments as outputs
index_t max_leftext = leftext, max_rightext = rightext;
assert(max_leftext > 0 || max_rightext > 0);
leftext = 0, rightext = 0;
index_t rdlen = (index_t)rd.length();
// never set to true in this function, so leftAlign() below is not reached
bool doLeftAlign = false;
assert(_sharedVars != NULL);
const index_t minIntronLen = tpol.minIntronLen();
const index_t maxIntronLen = tpol.maxIntronLen();
const index_t minAnchorLen = tpol.minAnchorLen();
const index_t minAnchorLen_noncan = tpol.minAnchorLen_noncan();
// extend the alignment further in the left direction
// with 'mm' mismatches allowed
const BTDnaString& seq = _fw ? rd.patFw : rd.patRc;
if(max_leftext > 0 && _rdoff > 0) {
assert_gt(_rdoff, 0);
index_t left_rdoff, left_len, left_toff;
this->getLeft(left_rdoff, left_len, left_toff);
assert_eq(left_rdoff, _rdoff);
assert_eq(left_toff, _toff);
// no reference sequence to the left of the hit
if(_toff <= 0) return false;
// reference window handed to alignWithALTs: it starts 10 bases before
// the position where the read's first base would fall (_toff - _rdoff) and
// is _rdoff + 10 long; if that start is negative it is clamped to 0 and
// the window is shortened by the same amount
int rl = (int)_toff - (int)_rdoff;
assert_geq(_score, minsc);
index_t reflen = _rdoff + 10;
rl -= (reflen - _rdoff);
if(rl < 0) {
reflen += rl;
rl = 0;
}
index_t numNs = 0;
index_t num_prev_edits = (index_t)_edits->size();
index_t best_ext = alignWithALTs(
altdb.alts(),
altdb.haplotypes(),
altdb.haplotype_maxrights(),
this->_joinedOff,
seq,
this->_rdoff - 1,
this->_rdoff - 1,
this->_rdoff,
ref,
*_sharedVars,
_tidx,
rl,
reflen,
true, /* left? */
gpol,
*this->_edits,
_sharedVars->ht_llist,
*this->_ht_list,
_sharedVars->cmp_ht,
rd.threeN_cycle,
NULL,
mm,
&numNs);
// Do not allow for any edits including known snps and splice sites when extending zero-length hit
if(_len == 0 && mm == 0 && _edits->size() > 0) {
_edits->clear();
return false;
}
if(best_ext > 0) {
leftext = best_ext;
assert_leq(num_prev_edits, _edits->size());
index_t added_edits = (index_t)_edits->size() - num_prev_edits;
// ref_ext: number of reference bases covered by the extension, i.e. the
// read extension corrected for gaps and splices among the new edits.
// The new edits are read from the front of _edits — presumably
// alignWithALTs inserts left-extension edits there; verify against
// alignWithALTs.
int ref_ext = (int)best_ext;
for(index_t i = 0; i < added_edits; i++) {
const Edit& edit = (*_edits)[i];
if(edit.type == EDIT_TYPE_REF_GAP) ref_ext--;
else if(edit.type == EDIT_TYPE_READ_GAP) ref_ext++;
else if(edit.type == EDIT_TYPE_SPL) ref_ext += edit.splLen;
}
// move the hit's start to the left on the read and on the reference
assert_leq(best_ext, _rdoff);
_rdoff -= best_ext;
assert_leq(ref_ext, _toff);
_toff -= ref_ext;
_len += best_ext;
assert_leq(_len, rdlen);
// the joined offset moves by ref_ext minus the numNs value reported by
// alignWithALTs
assert_leq((int)numNs, ref_ext);
assert_leq(ref_ext - (int)numNs, _joinedOff);
_joinedOff -= (ref_ext - (int)numNs);
// re-base edit positions on the new start of the hit: the new edits have
// the new _rdoff subtracted, the pre-existing ones shift right by best_ext
for(index_t i = 0; i < _edits->size(); i++) {
if(i < added_edits) {
assert_geq((*_edits)[i].pos, _rdoff);
(*_edits)[i].pos -= _rdoff;
} else {
(*_edits)[i].pos += best_ext;
}
}
}
}
// extend the alignment further in the right direction
// with 'mm' mismatches allowed
if(max_rightext > 0 && _rdoff + _len < rdlen) {
index_t right_rdoff, right_len, right_toff;
this->getRight(right_rdoff, right_len, right_toff);
// rl: reference offset just past the hit's right end;
// rr: number of read bases still unaligned on the right
index_t rl = right_toff + right_len;
assert_eq(_rdoff + _len, right_rdoff + right_len);
index_t rr = rdlen - (right_rdoff + right_len);
index_t tlen = ref.approxLen(_tidx);
if(rl < tlen) {
// reference window: remaining read length plus 10, truncated at the end
// of the reference sequence
index_t reflen = rr + 10;
if(rl + reflen > tlen) {
reflen = tlen - rl;
}
// ref_ext: _len corrected for gaps, splices and mismatches whose chr is
// 'N'; added to _joinedOff to give the joined offset at which the right
// extension starts
int ref_ext = (int)_len;
for(index_t ei = 0; ei < _edits->size(); ei++) {
const Edit& e = (*_edits)[ei];
if(e.type == EDIT_TYPE_REF_GAP) ref_ext--;
else if(e.type == EDIT_TYPE_READ_GAP) ref_ext++;
else if(e.type == EDIT_TYPE_SPL) ref_ext += e.splLen;
else if(e.type == EDIT_TYPE_MM && e.chr == 'N') ref_ext--;
}
index_t best_ext = alignWithALTs(
altdb.alts(),
altdb.haplotypes(),
altdb.haplotype_maxrights(),
this->_joinedOff + ref_ext,
seq,
this->_rdoff,
this->_rdoff + this->_len,
rdlen - (this->_rdoff + this->_len),
ref,
*_sharedVars,
_tidx,
(int)rl,
reflen,
false,
gpol,
*this->_edits,
_sharedVars->ht_llist,
*this->_ht_list,
_sharedVars->cmp_ht,
rd.threeN_cycle,
NULL,
mm);
// Do not allow for any edits including known snps and splice sites when extending zero-length hit
if(_len == 0 && mm == 0 && _edits->size() > 0) {
_edits->clear();
return false;
}
// a right extension only lengthens the hit; its start is unchanged
if(best_ext > 0) {
rightext = best_ext;
_len += best_ext;
}
}
}
#ifndef NDEBUG
// debug-only: when the first aligned read base is not N (code < 4), the
// joined offset must map back to this hit's reference offset (and reference
// index, unless the index is a repeat index)
if(_joinedOff != (index_t)INDEX_MAX && seq[_rdoff] < 4) {
ASSERT_ONLY(bool straddled = false);
ASSERT_ONLY(index_t tmp_tidx = 0, tmp_toff = 0, tmp_tlen = 0);
gfm.joinedToTextOff(
0,
_joinedOff,
tmp_tidx,
tmp_toff,
tmp_tlen,
true, // reject straddlers?
straddled); // straddled?
if(!gfm.repeat()) {
assert_eq(tmp_tidx, _tidx);
}
assert_eq(tmp_toff, _toff);
}
#endif
if(doLeftAlign) leftAlign(rd);
assert_leq(_rdoff + _len, rdlen);
// the score is recomputed even when no extension happened
calculateScore(
rd,
ssdb,
sc,
minK_local,
minIntronLen,
maxIntronLen,
minAnchorLen,
minAnchorLen_noncan,
ref);
assert(repOk(rd, ref));
return leftext > 0 || rightext > 0;
}
/**
 * Adjust alignment with respect to SNPs, usually updating Edits
 *
 * Builds candidate GenomeHits for a seed located at 'coord' and appends them
 * to 'genomeHits'. On a linear index exactly one hit is appended as-is.
 * Otherwise, for every splice-site offset reported by findSSOffs and every
 * indel offset reported by findOffDiffs, the seed is re-aligned with
 * alignWithALTs; the first indel offset that aligns the full seed length is
 * kept, plus one extra hit per alternative edit list in
 * sharedVars.candidate_edits. Hits equal to one already in 'genomeHits' are
 * not kept.
 *
 * @param rdoff      offset of the seed within the read
 * @param len        length of the seed
 * @param coord      reference coordinate of the seed (orientation, reference
 *                   index, offset, joined offset)
 * @param sharedVars scratch buffers (ssOffs, offDiffs, candidate_edits, ...)
 * @param genomeHits output list; new hits are appended
 * @return true if at least one hit was appended for any of the candidate
 *         splice-site offsets
 */
template <typename index_t>
bool GenomeHit<index_t>::adjustWithALT(
                                       index_t rdoff,
                                       index_t len,
                                       const Coord& coord,
                                       SharedTempVars<index_t>& sharedVars,
                                       EList<GenomeHit<index_t> >& genomeHits,
                                       const Read& rd,
                                       const GFM<index_t>& gfm,
                                       const ALTDB<index_t>& altdb,
                                       const BitPairReference& ref,
                                       const GraphPolicy& gpol)
{
    // Linear index: there are no ALTs to reconcile, take the coordinate as is.
    if(gfm.gh().linearFM()) {
        genomeHits.expand();
        genomeHits.back().init(
                               coord.orient(),
                               rdoff,
                               len,
                               0, // trim5
                               0, // trim3
                               (index_t)coord.ref(),
                               (index_t)coord.off(),
                               (index_t)coord.joinedOff(),
                               sharedVars);
        return true;
    }
    // Search window (in joined coordinates) around the seed position.
    index_t width = 1 << (gfm.gh()._offRate + 2);
    EList<pair<index_t, int> >& ssOffs = sharedVars.ssOffs;
    findSSOffs(gfm, altdb, (coord.joinedOff() >= width ? (index_t)(coord.joinedOff() - width) : 0), (index_t)(coord.joinedOff() + width), ssOffs);
    assert_gt(ssOffs.size(), 0);
    bool found = false;
    for(index_t s = 0; s < ssOffs.size(); s++) {
        index_t off = (index_t)coord.off();
        index_t joinedOff = (index_t)coord.joinedOff();
        pair<index_t, int>& ssOff = ssOffs[s];
        // Shift the seed by the splice-site offset; .second carries the sign.
        if(ssOff.first > 0) {
            assert_neq(ssOff.second, 0);
            if(ssOff.second > 0) {
                off += ssOff.first;
                joinedOff += ssOff.first;
            } else {
                // Offsets are unsigned: skip a shift that would move the seed
                // before the start of the reference instead of wrapping
                // around (same guard as the indel-offset branch below).
                if(off < ssOff.first) continue;
                assert_geq(joinedOff, ssOff.first);
                off -= ssOff.first;
                joinedOff -= ssOff.first;
            }
        }
        size_t numGenomeHits = genomeHits.size();
        genomeHits.expand();
        genomeHits.back().init(
                               coord.orient(),
                               rdoff,
                               len,
                               0, // trim5
                               0, // trim3
                               (index_t)coord.ref(),
                               off,
                               joinedOff,
                               sharedVars);
        // NOTE: this reference is only used until genomeHits is expanded
        // again (which happens once found2 is set and ends the loop below).
        GenomeHit<index_t>& genomeHit = genomeHits.back();
        EList<pair<index_t, int> >& offDiffs = sharedVars.offDiffs;
        const index_t single_offDiffs_size = findOffDiffs(gfm,
                                                          altdb,
                                                          (genomeHit._joinedOff >= width ? genomeHit._joinedOff - width : 0),
                                                          genomeHit._joinedOff + width,
                                                          offDiffs);
        assert_leq(single_offDiffs_size, offDiffs.size());
        const BTDnaString& seq = genomeHit._fw ? rd.patFw : rd.patRc;
        const EList<ALT<index_t> >& alts = altdb.alts();
        index_t orig_joinedOff = genomeHit._joinedOff;
        index_t orig_toff = genomeHit._toff;
        bool found2 = false;
        // maxAltsTried is not directly related to the size of offDiffs,
        // but let's make the size of offDiffs is determined by maxAltsTried
        const index_t max_offDiffs_size = max<index_t>(4, gpol.maxAltsTried() / 4);
        if(offDiffs.size() - single_offDiffs_size > max_offDiffs_size) offDiffs.resize(single_offDiffs_size + max_offDiffs_size);
        for(index_t o = 0; o < offDiffs.size() && !found2; o++) {
            const pair<index_t, int>& offDiff = offDiffs[o];
#ifndef NDEBUG
            // the first entry is always the "no shift" offset
            if(o == 0) {
                assert_eq(offDiff.first, 0);
                assert_eq(offDiff.second, 0);
            }
#endif
            if(offDiff.second >= 0) {
                genomeHit._joinedOff = orig_joinedOff + offDiff.first;
                genomeHit._toff = orig_toff + offDiff.first;
            } else {
                if(orig_toff < offDiff.first) continue;
                assert_geq(orig_joinedOff, offDiff.first);
                genomeHit._joinedOff = orig_joinedOff - offDiff.first;
                genomeHit._toff = orig_toff - offDiff.first;
            }
            genomeHit._edits->clear();
            ELList<Edit, 128, 4>& candidate_edits = sharedVars.candidate_edits;
            candidate_edits.clear();
            index_t reflen = genomeHit._len + 10;
            index_t alignedLen = alignWithALTs(
                                               alts,
                                               altdb.haplotypes(),
                                               altdb.haplotype_maxrights(),
                                               genomeHit._joinedOff,
                                               seq,
                                               genomeHit._rdoff,
                                               genomeHit._rdoff,
                                               genomeHit._len,
                                               ref,
                                               sharedVars,
                                               genomeHit._tidx,
                                               (int)genomeHit._toff,
                                               reflen,
                                               false, /* left? */
                                               gpol,
                                               *genomeHit._edits,
                                               sharedVars.ht_llist,
                                               *genomeHit._ht_list,
                                               sharedVars.cmp_ht,
                                               rd.threeN_cycle,
                                               &candidate_edits);
            if(alignedLen == genomeHit._len) {
                found2 = true;
                assert(genomeHit.repOk(rd, ref));
                // discard the hit if an identical one is already present
                for(index_t i = 0; i < genomeHits.size() - 1; i++) {
                    if(genomeHits[i] == genomeHits.back()) {
                        found2 = false;
                        break;
                    }
                }
                if(found2) {
                    // one additional hit per alternative edit list, again
                    // dropping exact duplicates
                    for(index_t e = 0; e < candidate_edits.size(); e++) {
                        genomeHits.expand();
                        genomeHits.back() = genomeHits[genomeHits.size() - 2];
                        *(genomeHits.back()._edits) = candidate_edits[e];
                        assert(genomeHits.back().repOk(rd, ref));
                        for(size_t i = 0; i < genomeHits.size() - 1; i++) {
                            if(genomeHits[i] == genomeHits.back()) {
                                genomeHits.pop_back();
                                break;
                            }
                        }
                    }
                }
            } else {
                genomeHit._edits->clear();
            }
        }
        if(!found2) genomeHits.pop_back();
        // Accumulate across splice-site offsets: a later offset that yields
        // nothing must not hide hits appended for an earlier one.
        if(genomeHits.size() > numGenomeHits) found = true;
    }
    return found;
}
/**
 * Adjust this alignment with respect to SNPs, usually updating Edits
 *
 * Tries each indel offset reported by findOffDiffs around this hit's joined
 * offset, shifting _joinedOff/_toff accordingly and re-aligning with
 * alignWithALTs, until one offset aligns the full length of the hit.
 *
 * @return true on a linear index (nothing to adjust) or when some offset
 *         aligned the whole hit; false otherwise
 */
template <typename index_t>
bool GenomeHit<index_t>::adjustWithALT(
                                       const Read& rd,
                                       const GFM<index_t>& gfm,
                                       const ALTDB<index_t>& altdb,
                                       const BitPairReference& ref,
                                       const GraphPolicy& gpol)
{
    if(gfm.gh().linearFM()) return true;
    assert_lt(this->_tidx, ref.numRefs());
    assert(_sharedVars != NULL);

    // Gather candidate shifts inside a window centred on the joined offset.
    const index_t window = 1 << (gfm.gh()._offRate + 2);
    const index_t windowBegin = (this->_joinedOff >= window ? this->_joinedOff - window : 0);
    const index_t windowEnd = this->_joinedOff + window;
    EList<pair<index_t, int> >& shifts = _sharedVars->offDiffs;
    const index_t numSingleShifts = findOffDiffs(gfm, altdb, windowBegin, windowEnd, shifts);
    assert_leq(numSingleShifts, shifts.size());

    // maxAltsTried is not directly tied to the number of shifts, but it is
    // used to cap how many combined shifts are attempted.
    const index_t maxCombinedShifts = max<index_t>(4, gpol.maxAltsTried() / 4);
    if(shifts.size() - numSingleShifts > maxCombinedShifts) {
        shifts.resize(numSingleShifts + maxCombinedShifts);
    }

    const BTDnaString& readSeq = _fw ? rd.patFw : rd.patRc;
    const EList<ALT<index_t> >& altList = altdb.alts();
    const index_t baseJoinedOff = this->_joinedOff;
    const index_t baseToff = this->_toff;

    bool aligned = false;
    for(index_t k = 0; k < shifts.size(); k++) {
        const pair<index_t, int>& shift = shifts[k];
#ifndef NDEBUG
        if(k == 0) {
            assert_eq(shift.first, 0);
            assert_eq(shift.second, 0);
        }
#endif
        if(shift.second < 0) {
            // a leftward shift must not move before the reference start
            if(baseToff < shift.first) continue;
            assert_geq(baseJoinedOff, shift.first);
            this->_joinedOff = baseJoinedOff - shift.first;
            this->_toff = baseToff - shift.first;
        } else {
            this->_joinedOff = baseJoinedOff + shift.first;
            this->_toff = baseToff + shift.first;
        }

        index_t refSpan = this->_len + 10;
        index_t coveredLen = alignWithALTs(
                                           altList,
                                           altdb.haplotypes(),
                                           altdb.haplotype_maxrights(),
                                           this->_joinedOff,
                                           readSeq,
                                           this->_rdoff,
                                           this->_rdoff,
                                           this->_len,
                                           ref,
                                           *_sharedVars,
                                           this->_tidx,
                                           (int)this->_toff,
                                           refSpan,
                                           false, /* left? */
                                           gpol,
                                           *this->_edits,
                                           _sharedVars->ht_llist,
                                           *this->_ht_list,
                                           _sharedVars->cmp_ht,
                                           rd.threeN_cycle,
                                           &_sharedVars->candidate_edits);
        if(coveredLen == this->_len) {
            // the whole hit aligned at this shift: stop searching
            aligned = true;
            break;
        }
        this->_edits->clear();
    }
#ifndef NDEBUG
    if(aligned) {
        assert(repOk(rd, ref));
    }
#endif
    return aligned;
}
/*
 * Find offset differences due to splice sites
 *
 * Fills 'ssOffs' with (magnitude, sign) pairs. The list always contains
 * (0, 0). For each splice-site ALT whose 'left' lies in [start, end), the
 * length spanned by the site is added with sign +1 when left < right and
 * sign -1 otherwise. For a site with left < right, further offsets are
 * derived from splice-site ALTs with left >= right found within 'relax'
 * bases of the site's right end. The result is sorted and de-duplicated.
 */
template <typename index_t>
void GenomeHit<index_t>::findSSOffs(
                                    const GFM<index_t>& gfm,
                                    const ALTDB<index_t>& altdb,
                                    index_t start,
                                    index_t end,
                                    EList<pair<index_t, int> >& ssOffs)
{
    // Start from the single "no shift" entry.
    ssOffs.clear();
    ssOffs.expand();
    ssOffs.back().second = 0;
    ssOffs.back().first = 0;

    if(gfm.gh().linearFM()) return;
    if(!altdb.hasSpliceSites()) return;

    const EList<ALT<index_t> >& altList = altdb.alts();
    const index_t relax = 5;

    // Find splice sites included in this region
    ALT<index_t> key;
    key.left = start;
    for(index_t a = (index_t)altList.bsearchLoBound(key); a < altList.size(); a++) {
        const ALT<index_t>& site = altList[a];
        if(site.left >= end) break;
        if(!site.splicesite()) continue;

        if(site.left >= site.right) {
            ssOffs.expand();
            ssOffs.back().first = site.left - site.right + 1;
            ssOffs.back().second = -1;
            continue;
        }

        ssOffs.expand();
        ssOffs.back().first = site.right - site.left + 1;
        ssOffs.back().second = 1;

        // Look around the right end of this site for nearby sites whose
        // left >= right and derive the resulting offsets from them.
        if(site.right > relax) {
            key.left = site.right - relax;
        } else {
            key.left = 0;
        }
        for(index_t b = (index_t)altList.bsearchLoBound(key); b < altList.size(); b++) {
            const ALT<index_t>& nearby = altList[b];
            if(!nearby.splicesite() || nearby.left < nearby.right) continue;
            if(nearby.left + nearby.right == site.left + site.right) continue;
            if(nearby.left > site.right + relax) break;
            ssOffs.expand();
            if(nearby.right >= site.left) {
                ssOffs.back().first = nearby.right - site.left;
                ssOffs.back().second = 1;
            } else {
                ssOffs.back().first = site.left - nearby.right;
                ssOffs.back().second = -1;
            }
        }
    }

    // Sort and drop duplicate offsets.
    if(ssOffs.size() > 1) {
        ssOffs.sort();
        const index_t uniqueCount = (index_t)(unique(ssOffs.begin(), ssOffs.end()) - ssOffs.begin());
        ssOffs.resize(uniqueCount);
    }
}
/*
 * Find offset differences due to indels
 *
 * Fills offDiffs with (magnitude, sign) pairs describing by how much a
 * coordinate can shift because of the insertions / deletions recorded in
 * altdb whose position lies in [start, end).  Entry 0 is always the
 * "no shift" pair (0, 0).  Shifts caused by one indel come first (sorted,
 * de-duplicated); shifts obtained by additionally accumulating indels that
 * lie to the left of it are appended afterwards.
 *
 * Returns the number of leading entries of offDiffs that were produced
 * from single indels (or offDiffs.size() on the early exits).
 */
template <typename index_t>
index_t GenomeHit<index_t>::findOffDiffs(
                                         const GFM<index_t>&         gfm,
                                         const ALTDB<index_t>&       altdb,
                                         index_t                     start,
                                         index_t                     end,
                                         EList<pair<index_t, int> >& offDiffs)
{
    // The unshifted candidate is always present
    offDiffs.clear();
    offDiffs.expand();
    offDiffs.back().second = 0;
    offDiffs.back().first = 0;
    if(gfm.gh().linearFM()) return offDiffs.size();

    const EList<ALT<index_t> >& alts = altdb.alts();

    // Locate the half-open index window [lo, hi) of ALTs inside the region
    ALT<index_t> probe;
    probe.pos = start;
    const index_t lo = (index_t)alts.bsearchLoBound(probe);
    index_t hi = lo;
    while(hi < alts.size()) {
        const ALT<index_t>& cur = alts[hi];
        // Reverse-direction splice sites and reversed deletions never
        // terminate the scan; they are simply stepped over
        const bool ignored = (cur.splicesite() && cur.left > cur.right) ||
                             (cur.deletion() && cur.reversed);
        if(!ignored && cur.pos >= end) break;
        hi++;
    }
    if(lo >= hi) return offDiffs.size();

    // Pass 1: shift contributed by each indel on its own
    for(index_t k = hi; k-- > lo; ) {
        assert_lt(k, alts.size());
        const ALT<index_t>& cur = alts[k];
        if(!cur.gap()) continue;
        if(cur.splicesite()) continue;
        if(cur.deletion() && cur.reversed) continue;
        assert(cur.type == ALT_SNP_DEL || cur.type == ALT_SNP_INS);
        // Deletions shift forward, insertions shift backward
        const int off = (cur.type == ALT_SNP_DEL ? cur.len : -cur.len);
        assert_neq(off, 0);
        offDiffs.expand();
        offDiffs.back().first = abs(off);
        offDiffs.back().second = (off > 0 ? 1 : -1);
    }
    if(offDiffs.size() > 1) {
        offDiffs.sort();
        index_t new_size = (index_t)(unique(offDiffs.begin(), offDiffs.end()) - offDiffs.begin());
        offDiffs.resize(new_size);
    }
    const index_t single_offDiffs_size = offDiffs.size();

    // Pass 2: running sums of an indel with the indels to its left
    for(index_t k = hi; k-- > lo; ) {
        assert_leq(hi, alts.size());
        const ALT<index_t>& outer = alts[k];
        if(!outer.gap() || outer.splicesite() || (outer.deletion() && outer.reversed))
            continue;
        assert(outer.type == ALT_SNP_DEL || outer.type == ALT_SNP_INS);
        int off = (outer.type == ALT_SNP_DEL ? outer.len : -outer.len);
        for(index_t k2 = k; k2-- > lo; ) {
            const ALT<index_t>& inner = alts[k2];
            if(!inner.gap() || inner.splicesite() || (inner.deletion() && inner.reversed))
                continue;
            if(inner.type == ALT_SNP_DEL) {
                // Skip deletions that reach the outer indel
                if(inner.pos + inner.len >= outer.pos)
                    continue;
                off += inner.len;
            } else {
                assert_eq(inner.type, ALT_SNP_INS);
                if(inner.pos >= outer.pos)
                    continue;
                off -= inner.len;
            }
            // Append the accumulated shift unless it is already recorded
            index_t i = 0;
            while(i < offDiffs.size()) {
                const int known = offDiffs[i].first * offDiffs[i].second;
                if(known == off) break;
                i++;
            }
            if(i >= offDiffs.size()) {
                offDiffs.expand();
                offDiffs.back().first = abs(off);
                offDiffs.back().second = (off > 0 ? 1 : -1);
            }
        }
    }
    return single_offDiffs_size;
}
/*
*
*/
template <typename index_t>
void add_haplotypes(
const EList<ALT<index_t> >& alts,
const EList<Haplotype<index_t> >& haplotypes,
const EList<index_t>& haplotype_maxrights,
Haplotype<index_t>& cmp_ht,
EList<pair<index_t, index_t> >& ht_list,
index_t rdlen,
bool left_ext = true,
bool initial = false)
{
pair<int, int> ht_range;
ht_range.first = ht_range.second = (int)haplotypes.bsearchLoBound(cmp_ht);
if(ht_range.first >= haplotypes.size())
return;
if(left_ext) {
for(; ht_range.first >= 0; ht_range.first--) {
const Haplotype<index_t>& ht = haplotypes[ht_range.first];
if(!initial) {
if(ht.right >= cmp_ht.left) continue;
}
index_t ht_maxright = haplotype_maxrights[ht_range.first];
assert_geq(ht_maxright, ht.right);
if(ht_maxright + rdlen - 1 < cmp_ht.left) break;
if(ht.alts.size() <= 0) continue;
bool added = false;
for(index_t h = 0; h < ht_list.size(); h++) {
if(ht_list[h].first == ht_range.first) {
added = true;
break;
}
}
if(added) continue;
ht_list.expand();
ht_list.back().first = ht_range.first;
assert_gt(ht.alts.size(), 0);
if(ht.right < cmp_ht.left) {
ht_list.back().second = ht.alts.size() - 1;
} else {
assert(initial);
ht_list.back().second = ht.alts.size();
for(int a = (int)ht.alts.size() - 1; a >= 0; a--) {
index_t alti = ht.alts[a];
assert_lt(alti, alts.size());
const ALT<index_t>& alt = alts[alti];
assert(alt.snp());
ht_list.back().second = (index_t)a;
if(cmp_ht.left > alt.pos) break;
}
if(ht_list.back().second == ht.alts.size()) {
ht_list.pop_back();
}
}
}
} else {
if(initial) {
for(; ht_range.first >= 0; ht_range.first--) {
const Haplotype<index_t>& ht = haplotypes[ht_range.first];
index_t ht_maxright = haplotype_maxrights[ht_range.first];
assert_geq(ht_maxright, ht.right);
if(ht_maxright < cmp_ht.left) break;
if(ht.right < cmp_ht.left || ht.left > cmp_ht.left) continue;
if(ht.alts.size() <= 0) continue;
bool added = false;
for(index_t h = 0; h < ht_list.size(); h++) {
if(ht_list[h].first == ht_range.first) {
added = true;
break;
}
}
if(added) continue;
ht_list.expand();
ht_list.back().first = ht_range.first;
assert_gt(ht.alts.size(), 0);
ht_list.back().second = ht.alts.size();
for(index_t a = 0; a < ht.alts.size(); a++) {
index_t alti = ht.alts[a];
assert_lt(alti, alts.size());
const ALT<index_t>& alt = alts[alti];
assert(alt.snp());
ht_list.back().second = a;
if(cmp_ht.left <= alt.pos) break;
}
if(ht_list.back().second == ht.alts.size()) {
ht_list.pop_back();
}
}
}
for(; ht_range.second < haplotypes.size(); ht_range.second++) {
const Haplotype<index_t>& ht = haplotypes[ht_range.second];
if(ht.left < cmp_ht.right) continue;
if(ht.left >= cmp_ht.right + rdlen) break;
if(ht.alts.size() <= 0) continue;
bool added = false;
for(index_t h = 0; h < ht_list.size(); h++) {
if(ht_list[h].first == ht_range.second) {
added = true;
break;
}
}
if(added) continue;
ht_list.expand();
ht_list.back().first = ht_range.second;
assert_gt(ht.alts.size(), 0);
ht_list.back().second = 0;
}
}
}
/*
* Recursively extend an alignment of the read against the reference while
* trying the known ALTs (single-base SNPs, deletions, insertions and splice
* sites) found in 'alts' around 'joinedOff'.
*
* left == true : extend towards lower read offsets, starting at read offset
*                rdoff and at the last base of the reference window.
* left == false: extend towards higher read offsets, starting at read offset
*                rdoff and at the first base of the reference window.
*
* Each call first extends base by base, tolerating up to 'mm' mismatches,
* then tries every compatible ALT in range and recurses (dep + 1) on the
* remainder of the read.  When haplotypes are enabled in gpol, a SNP is only
* tried if one of the haplotypes tracked in ht_llist[dep] supports it.
*
* In/out state shared across the recursion:
*   tmp_edits       edits of the path currently being explored; restored to
*                   its size at loop entry after every ALT attempt
*   best_rdoff      furthest read offset reached so far
*   edits           copy of tmp_edits taken when best_rdoff improves
*   candidate_edits (rightward only, may be NULL) edit lists of paths that
*                   reach best_rdoff
*   numNs           (may be NULL) receives the count of reference Ns seen by
*                   the base-by-base extension that set best_rdoff
*   numALTsTried    incremented for each compatible ALT; the search gives up
*                   once it exceeds gpol.maxAltsTried() + dep
*   raw_refbufs     per-depth scratch buffers for fetched reference sequence
*
* rfseq may be NULL, in which case the window [rfoff, rfoff + rflen) of
* reference tidx is fetched from 'ref'.  cycle_3N selects which 3N base
* conversion is applied to reference bases when threeN is set.
*
* Returns 0 when no full alignment was found on this path.  Otherwise:
* leftward returns rdlen; rightward returns the number of read bases
* aligned from rdoff.
*/
template <typename index_t>
index_t GenomeHit<index_t>::alignWithALTs_recur(
const EList<ALT<index_t> >& alts,
const EList<Haplotype<index_t> >& haplotypes,
const EList<index_t>& haplotype_maxrights,
index_t joinedOff,
const BTDnaString& rdseq,
index_t rdoff_add,
index_t rdoff,
index_t rdlen,
const BitPairReference& ref,
EList<SStringExpandable<char> >& raw_refbufs,
ASSERT_ONLY(SStringExpandable<uint32_t> destU32,)
EList<Edit>& tmp_edits,
int& best_rdoff,
const char* rfseq,
index_t tidx,
int rfoff,
index_t rflen,
bool left,
EList<Edit>& edits,
index_t mm,
ELList<pair<index_t, index_t> >& ht_llist,
Haplotype<index_t>& cmp_ht,
ELList<Edit, 128, 4>* candidate_edits,
index_t tmp_numNs,
index_t* numNs,
index_t dep,
const GraphPolicy& gpol,
index_t& numALTsTried,
int cycle_3N,
ALT_TYPE prev_alt_type)
{
// Stop once too many ALTs have been tried (the budget grows with depth)
if(numALTsTried > gpol.maxAltsTried() + dep) return 0;
assert_gt(rdlen, 0);
assert_gt(rflen, 0);
// Make sure the per-depth haplotype list and reference buffer exist
if(ht_llist.size() <= dep) ht_llist.expand();
if(raw_refbufs.size() <= dep) raw_refbufs.expand();
if(rfoff < -16) return 0;
// Clamp the reference window to the contig
size_t contig_len = ref.approxLen(tidx);
if(rfoff >= contig_len) return 0;
if(rfoff >= 0 && rfoff + rflen > contig_len) {
rflen = contig_len - rfoff;
} else if(rfoff < 0 && rflen > contig_len) {
rflen = contig_len;
}
if(rflen == 0) return 0;
// No reference sequence supplied: fetch it into this depth's buffer,
// which is pre-filled with 0x4 (N)
if(rfseq == NULL) {
SStringExpandable<char>& raw_refbuf = raw_refbufs[dep];
raw_refbuf.resize(rflen + 16 + 16);
raw_refbuf.fill(0x4);
int off = ref.getStretch(
reinterpret_cast<uint32_t*>(raw_refbuf.wbuf() + 16),
tidx,
max<int>(rfoff, 0),
rfoff > 0 ? rflen : rflen + rfoff
ASSERT_ONLY(, destU32));
assert_lt(off, 16);
rfseq = raw_refbuf.wbuf() + 16 + off + min<int>(rfoff, 0);
}
// Identity base mapping, overridden below by the 3N conversion in effect
int refConversion_3N[5] = {0, 1, 2, 3, 4};
if (threeN){
if (cycle_3N == 0 || cycle_3N == 3) {
// C to T conversion
refConversion_3N[asc2dna[hs3N_convertedFrom]] = asc2dna[hs3N_convertedTo];
} else {
//G to A conversion
refConversion_3N[asc2dna[hs3N_convertedFromComplement]] = asc2dna[hs3N_convertedToComplement];
}
}
if(left) {
// ---- Leftward extension ----
// min_rd_i   : read offset of the first mismatch (or where the scan ended
//              if there was none)
// mm_min_rd_i: read offset where the scan, mismatches included, ended
index_t tmp_mm = 0;
int min_rd_i = (int)rdoff;
int mm_min_rd_i = (int)rdoff;
index_t mm_tmp_numNs = 0;
for(int rf_i = (int)rflen - 1; rf_i >= 0 && mm_min_rd_i >= 0; rf_i--, mm_min_rd_i--) {
int rf_bp = get_ref_base(threeN, refConversion_3N, rfseq[rf_i]);
int rd_bp = rdseq[mm_min_rd_i];
if(rf_bp != rd_bp || rd_bp == 4) {
if(tmp_mm == 0) {
min_rd_i = mm_min_rd_i;
}
if(tmp_mm >= mm) break;
tmp_mm++;
Edit e(
mm_min_rd_i,
"ACGTN"[rf_bp],
"ACGTN"[rd_bp],
EDIT_TYPE_MM);
tmp_edits.insert(e, 0);
}
if(rf_bp == 4) {
if(tmp_mm == 0) tmp_numNs++;
mm_tmp_numNs++;
}
}
if(tmp_mm == 0) {
min_rd_i = mm_min_rd_i;
}
// Record this extension if it reaches further left than any before
if(mm_min_rd_i < best_rdoff) {
best_rdoff = mm_min_rd_i;
edits = tmp_edits;
if(numNs != NULL) *numNs = mm_tmp_numNs;
}
// Reached the start of the read
if(mm_min_rd_i < 0) return rdlen;
// Drop the mismatch edits added above before trying ALTs
if(tmp_mm > 0) {
tmp_edits.erase(0, tmp_mm);
tmp_mm = 0;
}
// Find SNPs included in this region
pair<int, int> alt_range(0, 0);
if(alts.size() > 0) {
ALT<index_t> cmp_alt;
const index_t minK = 16;
assert_leq(mm_min_rd_i, rdoff);
index_t rd_diff = rdoff - mm_min_rd_i;
rd_diff = (rd_diff > minK ? rd_diff - minK : 0);
if(gpol.enableCODIS()) {
rd_diff = 0;
}
if(rd_diff >= joinedOff) {
cmp_alt.pos = joinedOff;
} else {
cmp_alt.pos = joinedOff - rd_diff;
}
alt_range.first = alt_range.second = (int)alts.bsearchLoBound(cmp_alt);
if(alt_range.first >= alts.size()) {
assert_gt(alts.size(), 0);
alt_range.first = alt_range.second = alt_range.second - 1;
}
// Move alt_range.first down until ALTs are too far left of joinedOff
for(; alt_range.first >= 0; alt_range.first--) {
const ALT<index_t>& alt = alts[alt_range.first];
if(alt.snp()) {
if(alt.deletion() && !alt.reversed) continue;
if(alt.pos + rdlen < joinedOff) break;
} else if(alt.splicesite()) {
if(alt.left < alt.right) continue;
if(alt.left + rdlen - 1 < joinedOff) break;
} else {
assert(alt.exon());
continue;
}
}
}
// Update and find Haplotypes
EList<pair<index_t, index_t> >& ht_list = ht_llist[dep];
ht_list.clear();
if(gpol.useHaplotype() && haplotypes.size() > 0) {
if(dep > 0) {
// Carry over haplotypes from the previous depth whose current alt
// is the one that was just applied (first entry of tmp_edits)
EList<pair<index_t, index_t> >& ht_prev_list = ht_llist[dep-1];
for(index_t p = 0; p < ht_prev_list.size(); p++) {
const pair<index_t, index_t>& ht_ref = ht_prev_list[p];
const Haplotype<index_t>& ht = haplotypes[ht_ref.first];
assert_lt(ht_ref.second, ht.alts.size());
index_t alt_id = ht.alts[ht_ref.second];
assert_gt(tmp_edits.size(), 0);
const ALT<index_t>& alt = alts[tmp_edits[0].snpID];
const ALT<index_t>& ht_alt = alts[alt_id];
if(!alt.isSame(ht_alt)) continue;
if(ht_ref.second == 0) {
cmp_ht.left = cmp_ht.right = joinedOff;
add_haplotypes(alts,
haplotypes,
haplotype_maxrights,
cmp_ht,
ht_list,
rdlen);
} else {
ht_list.push_back(ht_ref);
ht_list.back().second--;
}
}
}
if(ht_list.size() <= 0) {
cmp_ht.left = cmp_ht.right = joinedOff;
add_haplotypes(alts,
haplotypes,
haplotype_maxrights,
cmp_ht,
ht_list,
rdlen,
true, // left_ext?
dep == 0); // initial?
}
}
assert_geq(rdoff, 0);
// Size to restore tmp_edits to after each ALT attempt
const index_t orig_nedits = (index_t)tmp_edits.size();
// Try ALTs from the highest index of the range downwards
for(; alt_range.second > alt_range.first; alt_range.second--) {
// Working copy: coordinates are adjusted below for the leftward direction
ALT<index_t> alt = alts[alt_range.second];
if(alt.pos >= joinedOff) continue;
if(alt.splicesite()) {
if(alt.left < alt.right) continue;
index_t tmp = alt.left;
alt.left = alt.right;
alt.right = tmp;
}
if(alt.deletion()) {
if(!alt.reversed) continue;
alt.pos = alt.pos - alt.len + 1;
}
if(alt.exon()) continue;
bool alt_compatible = false;
int rf_i = (int)rflen - 1, rd_i = (int)rdoff;
// diff: number of bases between joinedOff and the ALT
int diff = 0;
if(alt.type == ALT_SNP_SGL) {
diff = joinedOff - alt.pos - 1;
} else if(alt.type == ALT_SNP_DEL) {
if(alt.pos + alt.len >= joinedOff) continue;
diff = joinedOff - (alt.pos + alt.len);
} else if(alt.type == ALT_SNP_INS) {
diff = joinedOff - alt.pos;
} else {
assert(alt.splicesite());
diff = joinedOff - (alt.right + 1);
}
if(rf_i < diff || rd_i < diff) continue;
rf_i -= diff;
rd_i -= diff;
int rd_bp = rdseq[rd_i];
// The ALT lies beyond the first mismatch of the plain extension
if(rd_i < min_rd_i) {
if(alt.type == ALT_SNP_INS) {
if(rd_i + 1 >= min_rd_i) continue;
}
break;
}
// Check to see if there is a haplotype that supports this alt
if(ht_list.size() > 0 && alt.snp()) {
bool ht_found = false;
for(index_t h = 0; h < ht_list.size(); h++) {
const pair<index_t, index_t>& ht_ref = ht_list[h];
const Haplotype<index_t>& ht = haplotypes[ht_ref.first];
assert_lt(ht_ref.second, ht.alts.size());
index_t ht_alti = ht.alts[ht_ref.second];
const ALT<index_t>& ht_alt = alts[ht_alti];
if(alts[alt_range.second].isSame(ht_alt)) {
ht_found = true;
break;
}
}
if(!ht_found) continue;
}
if(alt.type == ALT_SNP_SGL) {
// Single-base SNP: the read base must equal the alternative base
if(rd_bp == (int)alt.seq) {
int rf_bp = get_ref_base(threeN, refConversion_3N, rfseq[rf_i]);
Edit e(
rd_i,
"ACGTN"[rf_bp],
"ACGTN"[rd_bp],
EDIT_TYPE_MM,
true, /* chars? */
alt_range.second);
tmp_edits.insert(e, 0);
rd_i--;
rf_i--;
alt_compatible = true;
}
} else if(alt.type == ALT_SNP_DEL) {
if(rfoff + rf_i > (int)alt.len) {
if(rf_i > (int)alt.len) {
// Deleted bases are available in the current reference window
for(index_t i = 0; i < alt.len; i++) {
int rf_bp = get_ref_base(threeN, refConversion_3N, rfseq[rf_i - i]);
Edit e(
rd_i + 1,
"ACGTN"[rf_bp],
'-',
EDIT_TYPE_READ_GAP,
true, /* chars? */
alt_range.second);
tmp_edits.insert(e, 0);
}
} else {
// long deletions
// The deletion extends left of the current window: fetch a window
// starting alt.len bases earlier into the next depth's buffer
int new_rfoff = rfoff - alt.len;
index_t new_rflen = rf_i + alt.len + 10;
if(raw_refbufs.size() <= dep + 1) raw_refbufs.expand();
SStringExpandable<char>& raw_refbuf = raw_refbufs[dep + 1];
raw_refbuf.resize(new_rflen + 16 + 16);
raw_refbuf.fill(0x4);
int off = ref.getStretch(
reinterpret_cast<uint32_t*>(raw_refbuf.wbuf() + 16),
tidx,
max<int>(new_rfoff, 0),
new_rfoff > 0 ? new_rflen : new_rflen + new_rfoff
ASSERT_ONLY(, destU32));
assert_lt(off, 16);
const char* new_rfseq = raw_refbuf.wbuf() + 16 + off + min<int>(new_rfoff, 0);
for(int i = 0; i < alt.len; i++) {
int rf_bp = get_ref_base(threeN, refConversion_3N, new_rfseq[rf_i - i + alt.len]);
Edit e(
rd_i + 1,
"ACGTN"[rf_bp],
'-',
EDIT_TYPE_READ_GAP,
true, /* chars? */
alt_range.second);
tmp_edits.insert(e, 0);
}
}
rf_i -= (int)alt.len;
alt_compatible = true;
}
} else if(alt.type == ALT_SNP_INS) {
if(rd_i > (int)alt.len) {
// Insertion: the read must spell the inserted sequence, which is
// stored two bits per base in alt.seq
bool same_seq = true;
for(index_t i = 0; i < alt.len; i++) {
rd_bp = rdseq[rd_i - i];
int snp_bp = (alt.seq >> (i << 1)) & 0x3;
if(rd_bp != snp_bp) {
same_seq = false;
break;
}
Edit e(
rd_i - i,
'-',
"ACGTN"[rd_bp],
EDIT_TYPE_REF_GAP,
true, /* chars? */
alt_range.second);
tmp_edits.insert(e, 0);
}
if(same_seq) {
rd_i -= (int)alt.len;
alt_compatible = true;
}
}
} else if(alt.type == ALT_SPLICESITE) {
// Do not place a splice site right where the previous call applied one
bool add_splicesite = true;
if(rd_i == rdoff && prev_alt_type == ALT_SPLICESITE) {
add_splicesite = false;
}
if(add_splicesite) {
assert_lt(rd_i, rflen);
assert_lt(alt.left, alt.right);
index_t intronLen = alt.right - alt.left + 1;
Edit e(rd_i + 1,
0,
0,
EDIT_TYPE_SPL,
intronLen,
alt.fw ? SPL_FW : SPL_RC,
true, /* known splice site? */
false); /* chrs? */
tmp_edits.insert(e, 0);
alt_compatible = true;
}
}
if(alt_compatible) {
numALTsTried++;
assert_leq(rd_i, (int)rdoff);
// The ALT consumed the rest of the read
if(rd_i < 0) {
best_rdoff = rd_i;
edits = tmp_edits;
return rdlen;
}
// Set up the remaining read / reference window for the recursive call
index_t next_joinedOff = alt.pos;
int next_rfoff = rfoff, next_rdoff = rd_i;
const char* next_rfseq = rfseq;
int next_rflen = rf_i + 1, next_rdlen = rd_i + 1;
if(alt.splicesite()) {
// Jump over the intron; the reference has to be fetched again
assert_lt(alt.left, alt.right);
next_joinedOff = alt.left;
index_t intronLen = alt.right - alt.left + 1;
assert_geq(next_rfoff, intronLen);
next_rfoff -= intronLen;
next_rfseq = NULL;
}
if(next_rflen < next_rdlen) {
// Not enough reference left for the remaining read: widen the
// window to the left (bounded by next_rfoff) and refetch
int add_len = next_rdlen + 10 - next_rflen;
if(next_rfoff < add_len) add_len = next_rfoff;
next_rfoff -= add_len;
next_rflen += add_len;
next_rfseq = NULL;
}
index_t alignedLen = alignWithALTs_recur(
alts,
haplotypes,
haplotype_maxrights,
next_joinedOff,
rdseq,
rdoff_add,
next_rdoff,
next_rdlen,
ref,
raw_refbufs,
ASSERT_ONLY(destU32,)
tmp_edits,
best_rdoff,
next_rfseq,
tidx,
next_rfoff,
next_rflen,
left,
edits,
mm,
ht_llist,
cmp_ht,
candidate_edits,
tmp_numNs,
numNs,
dep + 1,
gpol,
numALTsTried,
cycle_3N,
alt.type);
if(alignedLen == next_rdlen) return rdlen;
}
// Restore to the earlier state
assert_leq(orig_nedits, tmp_edits.size());
if(orig_nedits < tmp_edits.size()) tmp_edits.erase(0, tmp_edits.size() - orig_nedits);
}
return 0;
} else {
// ---- Rightward extension ----
// max_rd_i   : number of read bases matched before the first mismatch
// mm_max_rd_i: number of read bases covered, mismatches included
index_t tmp_mm = 0;
index_t max_rd_i = 0;
index_t mm_max_rd_i = 0;
index_t mm_tmp_numNs = 0;
for(index_t rf_i = 0; rf_i < rflen && mm_max_rd_i < rdlen; rf_i++, mm_max_rd_i++) {
int rf_bp = get_ref_base(threeN, refConversion_3N, rfseq[rf_i]);
int rd_bp = rdseq[rdoff + mm_max_rd_i];
if(rf_bp != rd_bp || rd_bp == 4) {
if(tmp_mm == 0) {
max_rd_i = mm_max_rd_i;
}
if(tmp_mm >= mm) break;
tmp_mm++;
Edit e(
mm_max_rd_i + rdoff_add,
"ACGTN"[rf_bp],
"ACGTN"[rd_bp],
EDIT_TYPE_MM);
tmp_edits.push_back(e);
}
if(rf_bp == 4) {
if(tmp_mm == 0) tmp_numNs++;
mm_tmp_numNs++;
}
}
if(tmp_mm == 0) {
max_rd_i = mm_max_rd_i;
}
// Record this extension if it reaches further right than any before;
// a tie is remembered as an additional candidate
if(mm_max_rd_i + rdoff > best_rdoff) {
best_rdoff = mm_max_rd_i + rdoff;
edits = tmp_edits;
if(numNs != NULL) *numNs = mm_tmp_numNs;
if(candidate_edits != NULL) candidate_edits->clear();
} else if(mm_max_rd_i + rdoff == best_rdoff) {
if(candidate_edits != NULL) {
candidate_edits->expand();
candidate_edits->back() = tmp_edits;
}
}
if(mm_max_rd_i == rflen) {
return mm_max_rd_i;
}
// Find SNPs included in this region
pair<index_t, index_t> alt_range;
{
ALT<index_t> cmp_alt;
const index_t minK = 16;
index_t rd_diff = (max_rd_i > minK ? max_rd_i - minK : 0);
if(gpol.enableCODIS()) {
rd_diff = 0;
}
cmp_alt.pos = joinedOff + rd_diff;
alt_range.first = alt_range.second = (index_t)alts.bsearchLoBound(cmp_alt);
if(alt_range.first >= alts.size()) return 0;
// Move alt_range.second up to the first ALT beyond the matched stretch
for(; alt_range.second < alts.size(); alt_range.second++) {
const ALT<index_t>& alt = alts[alt_range.second];
if(alt.splicesite()) {
if(alt.left > alt.right) continue;
}
if(alt.deletion()) {
if(alt.reversed) continue;
}
if(alt.left > joinedOff + max_rd_i) break;
}
}
// The whole read is covered: only keep searching if a splice site is in range
if(mm_max_rd_i == rdlen) {
bool further_search = false;
for(index_t s = alt_range.first; s < alt_range.second; s++) {
const ALT<index_t>& alt = alts[s];
if(alt.splicesite() && alt.left < alt.right) {
further_search = true;
break;
}
}
if(!further_search) return mm_max_rd_i;
}
// Drop the mismatch edits added above before trying ALTs
if(tmp_mm > 0) {
tmp_edits.resize(tmp_edits.size() - tmp_mm);
tmp_mm = 0;
}
// Update and find Haplotypes
EList<pair<index_t, index_t> >& ht_list = ht_llist[dep];
ht_list.clear();
if(gpol.useHaplotype() && haplotypes.size() > 0) {
if(dep > 0) {
// Carry over haplotypes from the previous depth whose current alt
// is the one that was just applied (last entry of tmp_edits)
EList<pair<index_t, index_t> >& ht_prev_list = ht_llist[dep-1];
for(index_t p = 0; p < ht_prev_list.size(); p++) {
const pair<index_t, index_t>& ht_ref = ht_prev_list[p];
const Haplotype<index_t>& ht = haplotypes[ht_ref.first];
if(ht_ref.second < ht.alts.size()) {
index_t alt_id = ht.alts[ht_ref.second];
assert_gt(tmp_edits.size(), 0);
const ALT<index_t>& alt = alts[tmp_edits.back().snpID];
const ALT<index_t>& ht_alt = alts[alt_id];
if(!alt.isSame(ht_alt)) continue;
}
if(ht_ref.second + 1 >= ht.alts.size() && joinedOff > ht.right) {
cmp_ht.left = cmp_ht.right = joinedOff;
add_haplotypes(alts,
haplotypes,
haplotype_maxrights,
cmp_ht,
ht_list,
rdlen,
false); // left_ext?
} else {
ht_list.push_back(ht_ref);
ht_list.back().second++;
}
}
}
if(ht_list.size() <= 0) {
cmp_ht.left = cmp_ht.right = joinedOff;
add_haplotypes(alts,
haplotypes,
haplotype_maxrights,
cmp_ht,
ht_list,
rdlen,
false, // left_ext?
dep == 0 && rdoff_add == 0); // initial?
}
}
// Size to restore tmp_edits to after each ALT attempt
const index_t orig_nedits = (index_t)tmp_edits.size();
// Try ALTs from the lowest index of the range upwards
for(; alt_range.first < alt_range.second; alt_range.first++) {
const ALT<index_t>& alt = alts[alt_range.first];
if(alt.splicesite()) {
if(alt.left > alt.right) continue;
}
if(alt.exon()) continue;
if(alt.deletion()) {
if(alt.reversed) continue;
}
bool alt_compatible = false;
assert_leq(joinedOff, alt.pos);
// Offsets of the ALT relative to the start of this call's window
index_t rf_i, rd_i;
rf_i = rd_i = alt.pos - joinedOff;
if(rd_i >= rdlen) continue;
assert_leq(rd_i, max_rd_i);
int rf_bp = get_ref_base(threeN, refConversion_3N, rfseq[rf_i]);
int rd_bp = rdseq[rdoff + rd_i];
// Check to see if there is a haplotype that supports this alt
if(ht_list.size() > 0 && alt.snp()) {
bool ht_found = false;
for(index_t h = 0; h < ht_list.size(); h++) {
const pair<index_t, index_t>& ht_ref = ht_list[h];
const Haplotype<index_t>& ht = haplotypes[ht_ref.first];
if(ht_ref.second >= ht.alts.size())
continue;
index_t ht_alti = ht.alts[ht_ref.second];
const ALT<index_t>& ht_alt = alts[ht_alti];
if(alts[alt_range.first].isSame(ht_alt)) {
ht_found = true;
break;
}
}
if(!ht_found) continue;
}
if(alt.type == ALT_SNP_SGL) {
// Single-base SNP: the read base must equal the alternative base
if(rd_bp == (int)alt.seq) {
Edit e(
rd_i + rdoff_add,
"ACGTN"[rf_bp],
"ACGTN"[rd_bp],
EDIT_TYPE_MM,
true, /* chars? */
alt_range.first);
tmp_edits.push_back(e);
rd_i++;
rf_i++;
alt_compatible = true;
}
} else if(alt.type == ALT_SNP_DEL) {
bool try_del = rd_i > 0;
if(rd_i == 0 && dep > 0) {
// Avoid consecutive deletions
assert_gt(tmp_edits.size(), 0);
const Edit& e = tmp_edits.back();
if(e.type != EDIT_TYPE_READ_GAP) {
try_del = true;
}
}
if(try_del) {
if(rf_i + alt.len <= rflen) {
// Deleted bases are available in the current reference window
for(index_t i = 0; i < alt.len; i++) {
rf_bp = get_ref_base(threeN, refConversion_3N, rfseq[rf_i + i]);
Edit e(
rd_i + rdoff_add,
"ACGTN"[rf_bp],
'-',
EDIT_TYPE_READ_GAP,
true, /* chars? */
alt_range.first);
tmp_edits.push_back(e);
}
rf_i += alt.len;
alt_compatible = true;
} else {
// long deletions
// The deletion runs past the current window: fetch a longer window
// from the same start into the next depth's buffer
index_t new_rflen = rf_i + alt.len + 10;
if(raw_refbufs.size() <= dep + 1) raw_refbufs.expand();
SStringExpandable<char>& raw_refbuf = raw_refbufs[dep + 1];
raw_refbuf.resize(new_rflen + 16 + 16);
raw_refbuf.fill(0x4);
int off = ref.getStretch(
reinterpret_cast<uint32_t*>(raw_refbuf.wbuf() + 16),
tidx,
max<int>(rfoff, 0),
rfoff > 0 ? new_rflen : new_rflen + rfoff
ASSERT_ONLY(, destU32));
assert_lt(off, 16);
const char* new_rfseq = raw_refbuf.wbuf() + 16 + off + min<int>(rfoff, 0);
for(index_t i = 0; i < alt.len; i++) {
rf_bp = get_ref_base(threeN, refConversion_3N, new_rfseq[rf_i + i]);
Edit e(
rd_i + rdoff_add,
"ACGTN"[rf_bp],
'-',
EDIT_TYPE_READ_GAP,
true, /* chars? */
alt_range.first);
tmp_edits.push_back(e);
}
rf_i += alt.len;
alt_compatible = true;
}
}
} else if(alt.type == ALT_SNP_INS) {
if(rd_i + alt.len <= rdlen && rf_i > 0) {
// Insertion: the read must spell the inserted sequence, which is
// stored two bits per base in alt.seq (first base in the highest bits)
bool same_seq = true;
for(index_t i = 0; i < alt.len; i++) {
rd_bp = rdseq[rdoff + rd_i + i];
int snp_bp = (alt.seq >> ((alt.len - i - 1) << 1)) & 0x3;
if(rd_bp != snp_bp) {
same_seq = false;
break;
}
Edit e(
rd_i + i + rdoff_add,
'-',
"ACGTN"[rd_bp],
EDIT_TYPE_REF_GAP,
true, /* chars? */
alt_range.first);
tmp_edits.push_back(e);
}
if(same_seq) {
rd_i += alt.len;
alt_compatible = true;
}
}
} else if(alt.type == ALT_SPLICESITE) {
bool try_splice = rd_i > 0;
if(rd_i == 0 && dep > 0) {
// Avoid consecutive introns
assert_gt(tmp_edits.size(), 0);
const Edit& e = tmp_edits.back();
if(e.type != EDIT_TYPE_SPL) {
try_splice = true;
}
}
if(try_splice) {
assert_lt(rd_i, rflen);
index_t intronLen = alt.right - alt.left + 1;
Edit e(rd_i + rdoff_add,
0,
0,
EDIT_TYPE_SPL,
intronLen,
alt.fw ? SPL_FW : SPL_RC,
true, /* known splice site? */
false); /* chrs? */
tmp_edits.push_back(e);
alt_compatible = true;
}
}
if(alt_compatible) {
numALTsTried++;
// The ALT consumed the rest of the read
if(rd_i == rdlen) {
assert_leq(best_rdoff, rdoff + rd_i);
if(best_rdoff < rdoff + rd_i) {
if(candidate_edits != NULL) candidate_edits->clear();
}
if(candidate_edits != NULL) {
candidate_edits->expand();
candidate_edits->back() = tmp_edits;
}
best_rdoff = rdoff + rd_i;
edits = tmp_edits;
return rd_i;
}
// Set up the remaining read / reference window for the recursive call
index_t next_joinedOff = 0;
int next_rfoff = rfoff + rf_i, next_rdoff = rdoff + rd_i;
const char* next_rfseq = rfseq + rf_i;
index_t next_rflen = rflen - rf_i, next_rdlen = rdlen - rd_i;
if(alt.type == ALT_SNP_SGL) {
next_joinedOff = alt.pos + 1;
} else if(alt.type == ALT_SNP_DEL) {
next_joinedOff = alt.pos + alt.len;
if(rflen <= rf_i) {
next_rflen = 0; // Will reset next_rfseq and next_rflen below
}
} else if(alt.type == ALT_SNP_INS) {
next_joinedOff = alt.pos;
} else if(alt.type == ALT_SPLICESITE) {
// Jump over the intron; the reference has to be fetched again
next_joinedOff = alt.right + 1;
index_t intronLen = alt.right - alt.left + 1;
next_rfoff += intronLen;
next_rfseq = NULL;
} else {
assert(false);
}
if(next_rflen < next_rdlen) {
// Not enough reference left for the remaining read: refetch a
// window slightly longer than the remaining read
next_rflen = next_rdlen + 10;
next_rfseq = NULL;
}
index_t alignedLen = alignWithALTs_recur(
alts,
haplotypes,
haplotype_maxrights,
next_joinedOff,
rdseq,
rdoff_add + rd_i,
next_rdoff,
next_rdlen,
ref,
raw_refbufs,
ASSERT_ONLY(destU32,)
tmp_edits,
best_rdoff,
next_rfseq,
tidx,
next_rfoff,
next_rflen,
left,
edits,
mm,
ht_llist,
cmp_ht,
candidate_edits,
tmp_numNs,
numNs,
dep + 1,
gpol,
numALTsTried,
cycle_3N,
alt.type);
if(alignedLen > 0) {
assert_leq(rdoff + rd_i + alignedLen, best_rdoff);
// After a splice site, keep going if another forward splice site
// remains in range, even when the read is fully aligned
bool search_further = false;
if(alt.splicesite()) {
for(index_t sf = alt_range.first + 1; sf < alt_range.second; sf++) {
const ALT<index_t>& alt2 = alts[sf];
if(alt2.splicesite() && alt2.left < alt2.right) {
search_further = true;
break;
}
}
}
if(!search_further) {
if(rd_i + alignedLen == rdlen) {
return rd_i + alignedLen;
}
}
}
}
// Restore to the earlier state
assert_leq(orig_nedits, tmp_edits.size());
if(orig_nedits < tmp_edits.size()) tmp_edits.resize(orig_nedits);
}
return 0;
}
}
/**
 * For alignment involving indel, move the indels
 * to the left most possible position
 *
 * Only gap edits that do not come from a known ALT (snpID == INDEX_MAX)
 * are moved.  A run of consecutive gap edits of the same kind is shifted
 * as a unit, one read position at a time, for as long as the read base
 * just left of the run equals the last gap character of the run, and never
 * past the position of the edit preceding the run.
 */
template <typename index_t>
void GenomeHit<index_t>::leftAlign(const Read& rd)
{
    ASSERT_ONLY(const index_t rdlen = (index_t)rd.length());
    const BTDnaString& seq = _fw ? rd.patFw : rd.patRc;
    for(index_t first = 0; first < _edits->size(); first++) {
        Edit& head = (*_edits)[first];
        const bool read_gap = (head.type == EDIT_TYPE_READ_GAP);
        if(!read_gap && head.type != EDIT_TYPE_REF_GAP)
            continue;
        // Gaps that stem from a known ALT stay where they are
        if(head.snpID != (index_t)INDEX_MAX)
            continue;

        // Find the last edit belonging to the same gap run
        index_t last = first;
        for(; last + 1 < _edits->size(); last++) {
            const Edit& nxt = (*_edits)[last + 1];
            if(nxt.type != head.type) break;
            if(read_gap) {
                // Read gaps of one run share a read position
                if(head.pos != nxt.pos) break;
            } else {
                assert_eq(head.type, EDIT_TYPE_REF_GAP);
                // Reference gaps of one run occupy consecutive read positions
                if(head.pos + (last + 1) - first != nxt.pos) break;
            }
        }
        Edit& tail = (*_edits)[last];

        // The run may not be moved to or past the preceding edit
        int floor_pos = 0;
        if(first > 0) {
            floor_pos = (*_edits)[first - 1].pos;
        }
        for(int l = head.pos - 1; l > floor_pos; l--) {
            assert_lt(l, (int)rdlen);
            const int rdc = seq[_rdoff + l];
            assert_range(0, 4, rdc);
            const char base = "ACGTN"[rdc];
            const char rfc = (read_gap ? tail.chr : tail.qchr);
            if(rfc != base) break;
            // Rotate the gap characters one slot to the right and move every
            // edit after the head one position to the left
            for(int k = last; k > (int)first; k--) {
                if(read_gap) {
                    (*_edits)[k].chr = (*_edits)[k - 1].chr;
                } else {
                    (*_edits)[k].qchr = (*_edits)[k - 1].qchr;
                }
                (*_edits)[k].pos -= 1;
            }
            // The head takes over the read base it just moved onto
            if(read_gap) {
                head.chr = base;
            } else {
                head.qchr = base;
            }
            head.pos -= 1;
        }
        // Resume after this run
        first = last;
    }
}
#ifndef NDEBUG
/**
 * Check that hit is sane w/r/t read.
 *
 * Applies the hit's edits to the aligned part of the read to reconstruct
 * the reference sequence it implies, then fetches the actual reference
 * (one stretch per exon, i.e. per segment separated by splice edits),
 * applies the 3N base conversion that is in effect for this read, and
 * compares the two.  On a mismatch, diagnostic output is written to cerr
 * and an assertion fires.
 *
 * Uses the scratch buffers in _sharedVars; returns true for empty hits
 * and for hits that pass the check.
 */
template <typename index_t>
bool GenomeHit<index_t>::repOk(const Read& rd, const BitPairReference& ref)
{
    if(_len <= 0) return true;
    assert(_sharedVars != NULL);
    SStringExpandable<char>&     raw_refbuf = _sharedVars->raw_refbuf;
    SStringExpandable<uint32_t>& destU32    = _sharedVars->destU32;

    BTDnaString& editstr    = _sharedVars->editstr;
    BTDnaString& partialseq = _sharedVars->partialseq;
    BTDnaString& refstr     = _sharedVars->refstr;
    EList<index_t>& reflens = _sharedVars->reflens;
    EList<index_t>& refoffs = _sharedVars->refoffs;

    editstr.clear(); partialseq.clear(); refstr.clear();
    reflens.clear(); refoffs.clear();

    // Reference sequence implied by the read plus its edits
    const BTDnaString& seq = _fw ? rd.patFw : rd.patRc;
    partialseq.install(seq.buf() + this->_rdoff, (size_t)this->_len);
    Edit::toRef(partialseq, *_edits, editstr);

    // Walk the read to work out offset and length of every exon segment
    index_t refallen = 0;
    int64_t reflen = 0;
    int64_t refoff = this->_toff;
    refoffs.push_back((index_t)refoff);
    size_t eidx = 0;
    for(size_t i = 0; i < _len; i++, reflen++, refoff++) {
        while(eidx < _edits->size() && (*_edits)[eidx].pos == i) {
            const Edit& edit = (*_edits)[eidx];
            if(edit.isReadGap()) {
                // Deleted reference base: consumes reference only
                reflen++;
                refoff++;
            } else if(edit.isRefGap()) {
                // Inserted read base: consumes read only
                reflen--;
                refoff--;
            }
            if(edit.isSpliced()) {
                // Close the current segment and start the next one after the intron
                assert_gt(reflen, 0);
                refallen += reflen;
                reflens.push_back((index_t)reflen);
                reflen = 0;
                refoff += edit.splLen;
                assert_gt(refoff, 0);
                refoffs.push_back((index_t)refoff);
            }
            eidx++;
        }
    }
    assert_gt(reflen, 0);
    refallen += (index_t)reflen;
    reflens.push_back((index_t)reflen);
    assert_gt(reflens.size(), 0);
    assert_gt(refoffs.size(), 0);
    assert_eq(reflens.size(), refoffs.size());
    refstr.clear();

    // Identity base mapping, overridden below by the 3N conversion in effect
    int refConversion_3N[5] = {0, 1, 2, 3, 4};
    if (threeN){
        if (((rd.threeN_cycle == threeN_type1conversion_FW || rd.threeN_cycle == threeN_type2conversion_RC) && !rd.oppositeConversion_3N) ||
            ((rd.threeN_cycle == threeN_type1conversion_RC || rd.threeN_cycle == threeN_type2conversion_FW) && rd.oppositeConversion_3N)) {
            // type 1 conversion
            refConversion_3N[asc2dna[hs3N_convertedFrom]] = asc2dna[hs3N_convertedTo];
        } else {
            // type 2 conversion
            refConversion_3N[asc2dna[hs3N_convertedFromComplement]] = asc2dna[hs3N_convertedToComplement];
        }
    }

    // Fetch every segment from the reference and append its converted bases
    for(index_t i = 0; i < reflens.size(); i++) {
        assert_gt(reflens[i], 0);
        if(i > 0) {
            assert_gt(refoffs[i], refoffs[i-1]);
        }
        raw_refbuf.resize(reflens[i] + 16);
        raw_refbuf.clear();
        int off = ref.getStretch(
                                 reinterpret_cast<uint32_t*>(raw_refbuf.wbuf()),
                                 (size_t)this->_tidx,
                                 (size_t)max<TRefOff>(refoffs[i], 0),
                                 reflens[i],
                                 destU32);
        assert_leq(off, 16);
        for(index_t j = 0; j < reflens[i]; j++) {
            char rfc = refConversion_3N[*(raw_refbuf.buf()+off+j)];
            refstr.append(rfc);
        }
    }
    if(refstr != editstr) {
        cerr << "Decoded nucleotides and edits don't match reference:" << endl;
        //cerr << "           score: " << score.score()
        //<< " (" << gaps << " gaps)" << endl;
        cerr << "           edits: ";
        Edit::print(cerr, *_edits);
        cerr << endl;
        cerr << "    decoded nucs: " << partialseq << endl;
        cerr << "     edited nucs: " << editstr << endl;
        cerr << "  reference nucs: " << refstr << endl;
        assert(0);
    }
    return true;
}
#endif
/**
 * Calculate alignment score
 *
 * Walks the hit's edits and accumulates:
 *   - mismatch penalties (sc.score) for mismatches that are not known SNPs,
 *   - splice penalties (sc.canSpl / sc.noncanSpl) for novel splice sites,
 *     plus a MIN_I32 penalty each time a novel splice site fails one of the
 *     anchor-length / intron-length / splice-site-probability checks,
 *   - gap open / extend penalties for indels that are not known ALTs,
 *   - soft-clipping penalties for the _trim5 leftmost and _trim3 rightmost
 *     read bases,
 *   - sc.conflictSpl() when splice sites of conflicting strands are used,
 *   - a match bonus for every aligned base that is not a novel mismatch.
 *
 * Edits whose snpID is not the "no ALT" sentinel (all bits set) are known
 * variants and are not penalised.
 *
 * @param rd                   read the hit belongs to
 * @param ssdb                 splice site database, used for probscore()
 * @param sc                   scoring scheme
 * @param minK_local           not used by this function
 * @param minIntronLen         not used by this function
 * @param maxIntronLen         upper bound on intron length; the anchor-based
 *                             checks only apply when the anchor-derived
 *                             threshold is below it
 * @param minAnchorLen         minimum anchor length, canonical splice sites
 * @param minAnchorLen_noncan  minimum anchor length, non-canonical sites
 * @param ref                  not used by this function
 *
 * Stores the result in _score (and sets _splicescore / _localscore) and
 * returns the score.
 */
template <typename index_t>
int64_t GenomeHit<index_t>::calculateScore(
                                           const Read&             rd,
                                           SpliceSiteDB&           ssdb,
                                           const Scoring&          sc,
                                           index_t                 minK_local,
                                           index_t                 minIntronLen,
                                           index_t                 maxIntronLen,
                                           index_t                 minAnchorLen,
                                           index_t                 minAnchorLen_noncan,
                                           const BitPairReference& ref)
{
    int64_t score = 0;
    double splicescore = 0;
    int64_t localscore = 0;
    index_t numsplices = 0;
    index_t mm = 0;     // novel mismatches seen so far
    const BTDnaString& seq = _fw ? rd.patFw : rd.patRc;
    const BTString& qual = _fw ? rd.qual : rd.qualRev;
    index_t rdlen = (index_t)seq.length();
    // Only maintained for the disabled debugging code in the splice branch
    int64_t toff_base = _toff;
    bool conflict_splicesites = false;
    uint8_t whichsense = SPL_UNKNOWN;
    for(index_t i = 0; i < _edits->size(); i++) {
        const Edit& edit = (*_edits)[i];
        assert_lt(edit.pos, _len);
        if(edit.type == EDIT_TYPE_MM) {
            if(edit.snpID == std::numeric_limits<uint32_t>::max()) {
                int pen = sc.score(
                                   dna2col[edit.qchr] - '0',
                                   asc2dnamask[edit.chr],
                                   qual[this->_rdoff + edit.pos] - 33);
                score += pen;
                mm++;
            }
        } else if(edit.type == EDIT_TYPE_SPL) {
            // int left = toff_base + edit.pos - 1;
            // assert_geq(left, 0);
            // int right = left + edit.splLen + 1;
            // assert_geq(right, 0);
            if(!edit.knownSpl) {
                // Anchor lengths on either side of the splice site, each
                // reduced by two bases per mismatch / indel on that side
                int left_anchor_len = _rdoff + edit.pos;
                assert_gt(left_anchor_len, 0);
                assert_lt(left_anchor_len, (int)rdlen);
                int right_anchor_len = rdlen - left_anchor_len;
                index_t mm2 = 0;
                for(index_t j = i + 1; j < _edits->size(); j++) {
                    const Edit& edit2 = (*_edits)[j];
                    if(edit2.type == EDIT_TYPE_MM ||
                       edit2.type == EDIT_TYPE_READ_GAP ||
                       edit2.type == EDIT_TYPE_REF_GAP) mm2++;
                }
                left_anchor_len -= (mm * 2);
                right_anchor_len -= (mm2 * 2);
                int shorter_anchor_len = min<int>(left_anchor_len, right_anchor_len);
                if(shorter_anchor_len <= 0) shorter_anchor_len = 1;
                assert_gt(shorter_anchor_len, 0);
                uint32_t intronLen_thresh = ((edit.splDir == SPL_FW || edit.splDir == SPL_RC) ?
                                             MaxIntronLen(shorter_anchor_len, minAnchorLen) :
                                             MaxIntronLen_noncan(shorter_anchor_len, minAnchorLen_noncan));
                if(intronLen_thresh < maxIntronLen) {
                    // Intron too long for such a short anchor
                    if(edit.splLen > intronLen_thresh) {
                        score += MIN_I32;
                    }
                    if(edit.splDir == SPL_FW || edit.splDir == SPL_RC) {
                        // The longer the intron, the higher the required
                        // splice-site probability score
                        float probscore = ssdb.probscore(edit.donor_seq, edit.acceptor_seq);
                        float probscore_thresh = 0.8f;
                        if(edit.splLen >> 16) probscore_thresh = 0.99f;
                        else if(edit.splLen >> 15) probscore_thresh = 0.97f;
                        else if(edit.splLen >> 14) probscore_thresh = 0.94f;
                        else if(edit.splLen >> 13) probscore_thresh = 0.91f;
                        else if(edit.splLen >> 12) probscore_thresh = 0.88f;
                        if(probscore < probscore_thresh) score += MIN_I32;
                    }
                    // The shorter anchor must be free of trimming and of
                    // mismatches / indels
                    if(shorter_anchor_len == left_anchor_len) {
                        if(_trim5 > 0) score += MIN_I32;
                        for(int j = (int)i - 1; j >= 0; j--) {
                            if((*_edits)[j].type == EDIT_TYPE_MM ||
                               (*_edits)[j].type == EDIT_TYPE_READ_GAP ||
                               (*_edits)[j].type == EDIT_TYPE_REF_GAP)
                                score += MIN_I32;
                        }
                    } else {
                        if(_trim3 > 0) score += MIN_I32;
                        for(index_t j = i + 1; j < _edits->size(); j++) {
                            if((*_edits)[j].type == EDIT_TYPE_MM ||
                               (*_edits)[j].type == EDIT_TYPE_READ_GAP ||
                               (*_edits)[j].type == EDIT_TYPE_REF_GAP)
                                score += MIN_I32;
                        }
                    }
                }
                if(edit.snpID == std::numeric_limits<uint32_t>::max()) {
                    if(edit.splDir == SPL_FW || edit.splDir == SPL_RC) {
                        score -= sc.canSpl((int)edit.splLen);
                    } else {
                        score -= sc.noncanSpl((int)edit.splLen);
                    }
                }
                // daehwan - for debugging purposes
                if(shorter_anchor_len <= 15) {
                    numsplices += 1;
                    splicescore += (double)edit.splLen;
                }
            }
            // Track whether all splice sites agree on the strand
            if(!conflict_splicesites) {
                if(whichsense == SPL_UNKNOWN) {
                    whichsense = edit.splDir;
                } else if(edit.splDir != SPL_UNKNOWN) {
                    assert_neq(whichsense, SPL_UNKNOWN);
                    if(edit.splDir == SPL_FW || edit.splDir == SPL_SEMI_FW) {
                        if(whichsense != SPL_FW && whichsense != SPL_SEMI_FW) {
                            conflict_splicesites = true;
                        }
                    }
                    if(edit.splDir == SPL_RC || edit.splDir == SPL_SEMI_RC) {
                        if(whichsense != SPL_RC && whichsense != SPL_SEMI_RC) {
                            conflict_splicesites = true;
                        }
                    }
                }
            }
            toff_base += edit.splLen;
        } else if(edit.type == EDIT_TYPE_READ_GAP) {
            // A read gap at the same read position as the previous one
            // extends that gap rather than opening a new one
            bool open = true;
            if(i > 0 &&
               (*_edits)[i-1].type == EDIT_TYPE_READ_GAP &&
               (*_edits)[i-1].pos == edit.pos) {
                open = false;
            }
            if(edit.snpID == std::numeric_limits<uint32_t>::max()) {
                if(open) score -= sc.readGapOpen();
                else     score -= sc.readGapExtend();
            }
            toff_base++;
        } else if(edit.type == EDIT_TYPE_REF_GAP) {
            // A reference gap at the read position right after the previous
            // one extends that gap rather than opening a new one
            bool open = true;
            if(i > 0 &&
               (*_edits)[i-1].type == EDIT_TYPE_REF_GAP &&
               (*_edits)[i-1].pos + 1 == edit.pos) {
                open = false;
            }
            if(edit.snpID == std::numeric_limits<uint32_t>::max()) {
                if(open) score -= sc.refGapOpen();
                else     score -= sc.refGapExtend();
            }
            toff_base--;
        }
#ifndef NDEBUG
        else {
            assert(false);
        }
#endif
    }

    // Penalty for soft-clipping
    // _trim5 bases are clipped from the left end of the oriented read ...
    for(index_t i = 0; i < _trim5; i++) {
        score -= sc.sc(qual[i]);
    }
    // ... and _trim3 bases from its right end, so their qualities are the
    // last _trim3 entries of qual (not the first ones again)
    for(index_t i = 0; i < _trim3; i++) {
        assert_lt(i, rdlen);
        score -= sc.sc(qual[rdlen - i - 1]);
    }

    if(conflict_splicesites) {
        score -= sc.conflictSpl();
    }

    // Average intron length over the short-anchored novel splice sites
    if (numsplices > 1) splicescore /= (double)numsplices;
    score += (_len - mm) * sc.match();
    _score = score;
    _splicescore = splicescore;
    _localscore = localscore;

    return score;
}
/**
 * Work counters for hierarchical indexing.
 *
 * Each field is a plain 64-bit tally.  A per-thread instance can be
 * folded into a shared instance with merge().
 */
struct HIMetrics {

    /// Build an instance whose counters all start at zero.
    HIMetrics() : mutex_m() {
        reset();
    }

    /// Set every counter back to zero (no locking is performed here).
    void reset() {
        init(0, 0, 0, 0, 0, 0, 0);
    }

    /**
     * Overwrite every counter with the supplied value.
     * Note the argument order: localatts_ comes before anchoratts_.
     */
    void init(
              uint64_t localatts_,
              uint64_t anchoratts_,
              uint64_t localindexatts_,
              uint64_t localextatts_,
              uint64_t localsearchrecur_,
              uint64_t globalgenomecoords_,
              uint64_t localgenomecoords_)
    {
        localgenomecoords  = localgenomecoords_;
        globalgenomecoords = globalgenomecoords_;
        localsearchrecur   = localsearchrecur_;
        localextatts       = localextatts_;
        localindexatts     = localindexatts_;
        anchoratts         = anchoratts_;
        localatts          = localatts_;
    }

    /**
     * Add the counters of r into this object.  This is the only safe
     * way to update a HIMetrics shared by multiple threads; getLock is
     * forwarded to the ThreadSafe guard built on mutex_m.
     */
    void merge(const HIMetrics& r, bool getLock = false) {
        ThreadSafe ts(&mutex_m, getLock);
        localgenomecoords  += r.localgenomecoords;
        globalgenomecoords += r.globalgenomecoords;
        localsearchrecur   += r.localsearchrecur;
        localextatts       += r.localextatts;
        localindexatts     += r.localindexatts;
        anchoratts         += r.anchoratts;
        localatts          += r.localatts;
    }

    uint64_t localatts;          // # attempts of local search
    uint64_t anchoratts;         // # attempts of anchor search
    uint64_t localindexatts;     // # attempts of local index search
    uint64_t localextatts;       // # attempts of extension search
    uint64_t localsearchrecur;   // presumably # of local search recursions - TODO confirm
    uint64_t globalgenomecoords; // presumably # of global genome-coordinate look-ups - TODO confirm
    uint64_t localgenomecoords;  // presumably # of local genome-coordinate look-ups - TODO confirm
    MUTEX_T  mutex_m;            // guard handed to ThreadSafe in merge()
};
/**
 * With hierarchical indexing, HI_Aligner provides several alignment
 * strategies that enable effective alignment of RNA-seq reads.
 */
template <typename index_t, typename local_index_t>
class HI_Aligner {
public:
/**
* Initialize with index.
*/
HI_Aligner(
const GFM<index_t>& gfm,
bool anchorStop = true,
uint64_t threads_rids_mindist = 0) :
_anchorStop(anchorStop),
_gwstate(GW_CAT),
_gwstate_local(GW_CAT),
_thread_rids_mindist(threads_rids_mindist)
{
index_t genomeLen = gfm.gh().len();
_minK = 0;
while(genomeLen > 0) {
genomeLen >>= 2;
_minK++;
}
_minK_local = 8;
}
/**
 * Default constructor.  The body is empty: unlike the index-taking
 * constructor, nothing (e.g. _minK, _minK_local) is assigned here.
 */
HI_Aligner() {
}
/**
 * Prepare the aligner for one unpaired read.
 *
 * Slot 0 receives the read and its settings; slot 1 is switched off
 * (NULL read, both strands disabled, INDEX_MAX thresholds).  Per-read
 * hit containers for slot 0 are re-initialized or cleared.
 */
void initRead(Read *rd, bool nofw, bool norc, TAlScore minsc, TAlScore maxpen, bool rightendonly = false) {
    assert(rd != NULL);

    // Read pointers
    _rds[0] = rd;
    _rds[1] = NULL;

    // Mode flags
    _paired = false;
    _rightendonly = rightendonly;

    // Strand switches: the second slot never gets searched
    _nofw[0] = nofw;   _norc[0] = norc;
    _nofw[1] = true;   _norc[1] = true;

    // Score / penalty limits
    _minsc[0]  = minsc;   _minsc[1]  = INDEX_MAX;
    _maxpen[0] = maxpen;  _maxpen[1] = INDEX_MAX;

    // Fresh BWT hit holders for the forward strand and its reverse complement
    _hits[0][0].init(true,  (index_t)_rds[0]->length());
    _hits[0][1].init(false, (index_t)_rds[0]->length());

    // Drop results left over from the previous read
    _genomeHits.clear();
    _genomeHits_rep[0].clear();
    _hits_searched[0].clear();

    assert(!_paired);
}
/**
 * Prepare the aligner for a read pair.
 *
 * Every two-element argument is copied per mate, the BWT hit holders
 * of both strands of both mates are re-initialized, and containers
 * holding results of the previous pair are cleared.
 */
void initReads(Read *rds[2], bool nofw[2], bool norc[2], TAlScore minsc[2], TAlScore maxpen[2]) {
    assert(rds[0] != NULL && rds[1] != NULL);
    _rightendonly = false;
    _paired = true;

    for(size_t mate = 0; mate < 2; ++mate) {
        _rds[mate]    = rds[mate];
        _nofw[mate]   = nofw[mate];
        _norc[mate]   = norc[mate];
        _minsc[mate]  = minsc[mate];
        _maxpen[mate] = maxpen[mate];

        // forward strand first, then reverse complement
        _hits[mate][0].init(true,  (index_t)_rds[mate]->length());
        _hits[mate][1].init(false, (index_t)_rds[mate]->length());

        _hits_searched[mate].clear();
    }

    _genomeHits.clear();
    _genomeHits_rep[0].clear();
    _genomeHits_rep[1].clear();

    _concordantIdxInspected.second = 0;
    _concordantIdxInspected.first  = _concordantIdxInspected.second;

    assert(_paired);
    assert(!_rightendonly);
}
/**
 * Aligns a read or a pair.
 * This function is called once per read or pair.
 *
 * The work proceeds in four phases:
 *  1. While nextBWT() yields a read/strand with a partial alignment,
 *     extend it with align() and, for pairs, combine mates with
 *     pairReads().
 *  2. For a pair with no paired alignment but at least one acceptable
 *     unpaired alignment, use every unpaired alignment as an anchor
 *     for its mate (alignMate()).
 *  3. If a non-empty repeat index (rgfm) is supplied, search the repeat
 *     index for the read/strand combinations that lack a long unique
 *     partial hit and flag those that hit a repeat.
 *  4. For flagged combinations, collect repeat hits (getRepeatHits()),
 *     project them onto genomic positions consistent with existing
 *     alignments and extend them with hybridSearch(); optionally align
 *     directly against the repeat sequences.
 *
 * rref is dereferenced whenever the repeat index is used, so it must
 * be non-NULL when rgfm is non-NULL and non-empty.
 *
 * @return always EXTEND_POLICY_FULFILLED
 */
virtual
int go(
       const Scoring&                   sc,
       const PairedEndPolicy&           pepol, // paired-end policy
       const TranscriptomePolicy&       tpol,
       const GraphPolicy&               gpol,
       const GFM<index_t>&              gfm,
       const GFM<index_t>*              rgfm,
       const ALTDB<index_t>&            altdb,
       const RepeatDB<index_t>&         repeatdb,
       const ALTDB<index_t>&            raltdb,
       const BitPairReference&          ref,
       const BitPairReference*          rref,
       SwAligner&                       swa,
       SpliceSiteDB&                    ssdb,
       WalkMetrics&                     wlm,
       PerReadMetrics&                  prm,
       SwMetrics&                       swm,
       HIMetrics&                       him,
       RandomSource&                    rnd,
       AlnSinkWrap<index_t>&            sink)
{
    const ReportingParams& rp = sink.reportingParams();
    index_t rdi;
    bool fw;
    // found[rdi][fwi]: result of the latest align() call for that
    // read/strand; entries for the second mate start false when unpaired
    bool found[2][2] = {{true, true}, {this->_paired, this->_paired}};
    // given read and its reverse complement
    //  (and mate and the reverse complement of mate in case of pair alignment),
    // pick up one with best partial alignment
    while(nextBWT(sc, pepol, tpol, gpol, gfm, altdb, ref, rdi, fw, wlm, prm, him, rnd, sink)) {
        // given the partial alignment, try to extend it to full alignments
        index_t fwi = (fw == true ? 0 : 1);
        found[rdi][fwi] = align(sc, pepol, tpol, gpol, gfm, altdb, repeatdb, ref, swa, ssdb, rdi, fw, wlm, prm, swm, him, rnd, sink);
        if(!found[0][0] && !found[0][1] && !found[1][0] && !found[1][1]) {
            break;
        }

        // try to combine this alignment with some of mate alignments
        // to produce pair alignment
        if(this->_paired) {
            pairReads(sc, pepol, tpol, gpol, gfm, altdb, repeatdb, ref, wlm, prm, him, rnd, sink);
            // if(sink.bestPair() >= _minsc[0] + _minsc[1]) break;
        }
    }

    // if no concordant pair is found, try to use alignment of one-end
    // as an anchor to align the other-end
    if(this->_paired) {
        if(sink.numPair() == 0 &&
           (sink.bestUnp1() >= _minsc[0] || sink.bestUnp2() >= _minsc[1])) {
            bool mate_found = false;
            const EList<AlnRes> *rs[2] = {NULL, NULL};
            sink.getUnp1(rs[0]); assert(rs[0] != NULL);
            sink.getUnp2(rs[1]); assert(rs[1] != NULL);
            // Sizes are captured up front so that alignments added while
            // iterating are not themselves used as anchors
            index_t rs_size[2] = {(index_t)rs[0]->size(), (index_t)rs[1]->size()};
            for(index_t i = 0; i < 2; i++) {
                for(index_t j = 0; j < rs_size[i]; j++) {
                    const AlnRes& res = (*rs[i])[j];
                    bool fw = (res.orient() == 1);
                    mate_found |= alignMate(
                                            sc,
                                            pepol,
                                            tpol,
                                            gpol,
                                            gfm,
                                            altdb,
                                            repeatdb,
                                            ref,
                                            swa,
                                            ssdb,
                                            i,
                                            fw,
                                            wlm,
                                            prm,
                                            swm,
                                            him,
                                            rnd,
                                            sink,
                                            (index_t)res.refid(),
                                            (index_t)res.refoff());
                }
            }

            if(mate_found) {
                pairReads(
                          sc,
                          pepol,
                          tpol,
                          gpol,
                          gfm,
                          altdb,
                          repeatdb,
                          ref,
                          wlm,
                          prm,
                          him,
                          rnd,
                          sink);
            }
        }
    }

    // Determine whether reads map to repetitive sequences
    bool repeat[2][2] = {{false, false}, {false, false}};
    bool perform_repeat_alignment = false;
    index_t indexIdx[2] = {0, 0};
#if 1
    if(rgfm != NULL && !((RFM<index_t>*)rgfm)->empty()) {
        // use repeat index to decide whether a read or a pair is from repetitive sequences
        // NOTE: _rds is an array of Read pointers, so each mate must be
        // reached through its own pointer; (*_rds)[1] would instead address
        // the object following the first read in memory.
        indexIdx[0] = ((RFM<index_t>*)rgfm)->getLocalRFM_idx(_rds[0]->length());
        if(_paired) {
            indexIdx[1] = ((RFM<index_t>*)rgfm)->getLocalRFM_idx(_rds[1]->length());
        }

        LocalRFM<index_t>& rfm = ((RFM<index_t>*)rgfm)->getLocalRFM(indexIdx[0]);

        // skip_repeat[rdi][fwi]: this read/strand already has a long unique
        // partial hit, so it is not searched against the repeat index
        bool skip_repeat[2][2] = {{false, false}, {false, false}};
        if(_paired) {
            const EList<AlnRes> *rs[2] = {NULL, NULL};
            sink.getPair(rs[0], rs[1]);
            assert_eq(rs[0]->size(), rs[1]->size());
            // Best combined pair score per (mate1 strand, mate2 strand)
            TAlScore bestScore[2][2] = {{_minsc[rdi], _minsc[rdi]}, {_minsc[rdi], _minsc[rdi]}};
            for(size_t r = 0; r < rs[0]->size(); r++) {
                const AlnRes& rs1 = (*rs[0])[r];
                const AlnRes& rs2 = (*rs[1])[r];
                TAlScore score = rs1.score().score() + rs2.score().score();
                int fwi[2] = {rs1.fw() ? 0 : 1, rs2.fw() ? 0 : 1};
                if(score > bestScore[fwi[0]][fwi[1]]) {
                    bestScore[fwi[0]][fwi[1]] = score;
                }
            }

            for(index_t fwi = 0; fwi < 2; fwi++) {
                for(index_t fwi2 = 0; fwi2 < 2; fwi2++) {
                    if(bestScore[fwi][fwi2] < 0)
                        continue;

                    // Both mates need a unique partial hit of at least
                    // _minK + 8 bases for the combination to be skipped
                    ReadBWTHit<index_t>& hit = _hits[0][fwi];
                    bool unique = false;
                    for(size_t hi = 0; hi < hit.offsetSize(); hi++) {
                        BWTHit<index_t>& partialHit = hit.getPartialHit(hi);
                        if(partialHit.len() >= _minK + 8 && partialHit.size() == 1) {
                            unique = true;
                            break;
                        }
                    }
                    if(!unique)
                        continue;

                    bool unique2 = false;
                    ReadBWTHit<index_t>& hit2 = _hits[1][fwi2];
                    for(size_t hi = 0; hi < hit2.offsetSize(); hi++) {
                        BWTHit<index_t>& partialHit = hit2.getPartialHit(hi);
                        if(partialHit.len() >= _minK + 8 && partialHit.size() == 1) {
                            unique2 = true;
                            break;
                        }
                    }
                    if(!unique2)
                        continue;

                    skip_repeat[0][fwi] = skip_repeat[1][fwi2] = true;
                }
            }
        } else {
            const EList<AlnRes> *rs = NULL;
            if(rdi == 0) sink.getUnp1(rs);
            else         sink.getUnp2(rs);
            // Best unpaired score per strand (index 0: forward, 1: reverse complement)
            TAlScore bestScore[2] = {_minsc[rdi], _minsc[rdi]};
            for(index_t r = 0; r < rs->size(); r++) {
                TAlScore score = (*rs)[r].score().score();
                if((*rs)[r].fw()) {
                    if(score > bestScore[0]) {
                        bestScore[0] = score;
                    }
                } else {
                    if(score > bestScore[1]) {
                        bestScore[1] = score;
                    }
                }
            }

            for(index_t fwi = 0; fwi < 2; fwi++) {
                if(bestScore[fwi] < 0)
                    continue;

                ReadBWTHit<index_t>& hit = _hits[rdi][fwi];
                index_t offsetSize = hit.offsetSize();
                for(size_t hi = 0; hi < offsetSize; hi++) {
                    BWTHit<index_t>& partialHit = hit.getPartialHit(hi);
                    if(partialHit.len() >= _minK + 8 && partialHit.size() == 1) {
                        skip_repeat[rdi][fwi] = true;
                        break;
                    }
                }
                if(skip_repeat[rdi][fwi]) break;
            }
        }

        // Reset the hit holders that will be searched against the repeat index
        for(size_t rdi = 0; rdi < (_paired ? 2 : 1); rdi++) {
            for(size_t fwi = 0; fwi < 2; fwi++) {
                if(skip_repeat[rdi][fwi]) continue;

                bool fw = (fwi == 0);
                _hits[rdi][fwi].init(fw, (index_t)_rds[rdi]->length());
            }
        }

        // Run the partial search against the repeat index until exhausted
        while(nextBWT(sc, pepol, tpol, gpol, rfm, altdb, *rref, rdi, fw, wlm, prm, him, rnd, sink));

        // A read/strand is treated as repetitive when it has a partial hit
        // at least twice as long as rref->getMinK()
        for(size_t rdi = 0; rdi < (_paired ? 2 : 1); rdi++) {
            for(size_t fwi = 0; fwi < 2; fwi++) {
                if(skip_repeat[rdi][fwi]) continue;

                ReadBWTHit<index_t>& hit = _hits[rdi][fwi];
                index_t offsetSize = hit.offsetSize();
                //assert_gt(offsetSize, 0);
                for(size_t hi = 0; hi < offsetSize; hi++) {
                    BWTHit<index_t>& partialHit = hit.getPartialHit(hi);
                    if(partialHit.len() >= (rref->getMinK() << 1)) {
                        repeat[rdi][fwi] = true;
                        perform_repeat_alignment = true;
                        break;
                    }
                }
            }
        }
    }
#else
    // use minimizer to decide whether a read or a pair is from repetitive sequences
    perform_repeat_alignment = false;
    for(size_t rdi = 0; rdi < (_paired ? 2 : 1); rdi++) {
        Read& read = *_rds[rdi];
        for(size_t fwi = 0; fwi < 2; fwi++) {
            const BTDnaString& seq = (fwi == 0 ? read.patFw : read.patRc);
            repeat[rdi][fwi] = repeat_kmertable.isRepeat(seq, _tmp_minimizers);
            perform_repeat_alignment |= repeat[rdi][fwi];
        }
    }
#endif

    // Handle alignment to repetitive regions
    if(rgfm != NULL &&
       perform_repeat_alignment) {
        LocalRFM<index_t>& rfm = ((RFM<index_t>*)rgfm)->getLocalRFM(indexIdx[0]);
        RB_KmerTable& repeatKmertable = ((RFM<index_t>*)rgfm)->getKmertable(indexIdx[0]);

        _repeatConcordant.clear();

        // Number of unpaired alignments per mate before any repeat work;
        // only these are used as anchors below
        index_t prev_align_size[2] = {0, 0};
        for(size_t rdi = 0; rdi < (_paired ? 2 : 1); rdi++) {
            const EList<AlnRes> *rs = NULL;
            if(rdi == 0) sink.getUnp1(rs);
            else         sink.getUnp2(rs);
            prev_align_size[rdi] = rs->size();
        }

        for(size_t rdi = 0; rdi < (_paired ? 2 : 1); rdi++) {
            for(size_t fwi = 0; fwi < 2; fwi++) {
                if(!repeat[rdi][fwi]) continue;

                // choose candidate partial alignments for further alignment
                index_t maxsize = max<index_t>(rp.khits, rp.kseeds);
#if 0
                ReadBWTHit<index_t>& hit = _hits[rdi][fwi];
                if(!hit.done()) continue;
                getAnchorHits(rfm,
                              pepol,
                              tpol,
                              gpol,
                              altdb,
                              repeatdb,
                              *rref,
                              rnd,
                              rdi,
                              fwi == 0, // fw
                              _genomeHits_rep[rdi],
                              _genomeHits_rep[rdi].size() + maxsize,
                              _sharedVars,
                              wlm,
                              prm,
                              him,
                              true); // repeat?
#else
                getRepeatHits(rfm,
                              pepol,
                              tpol,
                              gpol,
                              raltdb,
                              repeatdb,
                              repeatKmertable,
                              *rref,
                              rnd,
                              rdi,
                              fwi == 0, // fw
                              indexIdx[0],
                              _genomeHits_rep[rdi],
                              _genomeHits_rep[rdi].size() + maxsize,
                              _sharedVars,
                              sc,
                              swa,
                              ssdb,
                              swm,
                              wlm,
                              prm,
                              him,
                              sink);
#endif
            }
        }

        EList<pair<RepeatCoord<index_t>, RepeatCoord<index_t> > >& positions = _positions;
        for(size_t rdi = 0; rdi < (_paired ? 2 : 1); rdi++) {
            for(size_t i = 0; i < _genomeHits_rep[rdi].size(); i++) {
                if(_genomeHits_rep[rdi][i].len() < (_minK << 1)) continue;

                // DK - debugging purposes
#if 0
                positions.clear();
                repeatdb.getCoords(_genomeHits_rep[rdi][i]._tidx,
                                   _genomeHits_rep[rdi][i]._joinedOff,
                                   _genomeHits_rep[rdi][i]._joinedOff + _genomeHits_rep[rdi][i].len(),
                                   _snpIDs,
                                   raltdb,
                                   positions,
                                   rp.khits * 1000);
#endif

                // Existing alignments of the OTHER mate act as anchors
                const EList<AlnRes> *rs = NULL;
                if(rdi == 0) sink.getUnp2(rs);
                else         sink.getUnp1(rs);
                assert(rs != NULL);
                bool candidate_found = false;
                for(size_t j = 0; j < prev_align_size[1-rdi]; j++) {
                    const AlnRes& res = (*rs)[j];
                    if(res.repeat())
                        continue;

                    // Stop once enough pairs at least this good are known
                    TAlScore estScore = res.score().score() + _genomeHits_rep[rdi][i].score();
                    if(sink.bestPair() >= estScore && sink.numBestPair().first > rp.khits)
                        break;

                    positions.clear();
                    index_t joinedOff = 0;
                    gfm.textOffToJoined(res.refid(), res.refoff(), joinedOff);
                    repeatdb.findCoords(joinedOff,
                                        joinedOff + res.refExtent(),
                                        _genomeHits_rep[rdi][i]._tidx,
                                        _genomeHits_rep[rdi][i]._joinedOff,
                                        _genomeHits_rep[rdi][i]._joinedOff + _genomeHits_rep[rdi][i].len(),
                                        _snpIDs,
                                        raltdb,
                                        positions,
                                        rp.khits * 10);
                    if(positions.size() <= 0)
                        continue;

                    for(size_t p = 0; p < positions.size(); p++) {
                        // Keep only positions on the anchor's reference and
                        // within 1000 bases of the anchor's offset
                        if(positions[p].first.tid != res.refid()) continue;
                        if(positions[p].first.toff + 1000 < res.refoff() ||
                           res.refoff() + 1000 < positions[p].first.toff) continue;

                        if(sink.bestPair() >= estScore && sink.numBestPair().first > rp.khits)
                            break;

                        candidate_found = true;

                        _genomeHits.clear();
                        _genomeHits.expand();
                        _genomeHits.back() = _genomeHits_rep[rdi][i];
                        _genomeHits.back()._tidx = positions[p].first.tid;
                        _genomeHits.back()._toff = positions[p].first.toff;
                        _genomeHits.back()._joinedOff = positions[p].first.joinedOff;
                        if(!positions[p].first.fw) {
                            _genomeHits.back().reverse(*_rds[rdi]);
                            _rds[rdi]->oppositeConversion_3N = true;
                        } else {
                            _rds[rdi]->oppositeConversion_3N = false;
                        }

                        // extend the partial alignments bidirectionally using
                        // local search, extension, and (less often) global search
                        hybridSearch(sc,
                                     pepol,
                                     tpol,
                                     gpol,
                                     gfm,
                                     altdb,
                                     repeatdb,
                                     ref,
                                     swa,
                                     ssdb,
                                     rdi,
                                     _genomeHits.back()._fw,
                                     wlm,
                                     prm,
                                     swm,
                                     him,
                                     rnd,
                                     sink);
                    }

                    if(candidate_found) {
                        pairReads(
                                  sc,
                                  pepol,
                                  tpol,
                                  gpol,
                                  gfm,
                                  altdb,
                                  repeatdb,
                                  ref,
                                  wlm,
                                  prm,
                                  him,
                                  rnd,
                                  sink);
                    }
                }

                // Both mates hit repeats: look for positions common to a
                // repeat hit of mate 1 and a repeat hit of mate 2
                if(rdi == 0 && _paired) {
                    for(size_t j = 0; j < _genomeHits_rep[1].size(); j++) {
                        if(_genomeHits_rep[1][j].len() < (_minK << 1)) continue;

                        TAlScore estScore = _genomeHits_rep[0][i].score() + _genomeHits_rep[1][j].score();
                        // if(sink.bestPair() >= estScore && sink.numBestPair().first > rp.khits)
                        //    break;

                        positions.clear();
                        repeatdb.findCommonCoords(_genomeHits_rep[0][i]._tidx,
                                                  _genomeHits_rep[0][i]._joinedOff,
                                                  _genomeHits_rep[0][i]._joinedOff + _genomeHits_rep[0][i].len(),
                                                  _snpIDs,
                                                  _genomeHits_rep[1][j]._tidx,
                                                  _genomeHits_rep[1][j]._joinedOff,
                                                  _genomeHits_rep[1][j]._joinedOff + _genomeHits_rep[1][j].len(),
                                                  _snpIDs2,
                                                  raltdb,
                                                  positions,
                                                  rp.khits * 10);
                        if(positions.size() <= 0) continue;

                        _repeatConcordant.expand();
                        _repeatConcordant.back().first = _genomeHits_rep[0][i]._joinedOff;
                        _repeatConcordant.back().second = _genomeHits_rep[1][j]._joinedOff;

                        for(size_t p = 0; p < positions.size(); p++) {
                            if(sink.bestPair() >= estScore && sink.numBestPair().first > rp.khits)
                                break;

                            // First mate at the shared position
                            _genomeHits.clear();
                            _genomeHits.expand();
                            _genomeHits.back() = _genomeHits_rep[0][i];
                            _genomeHits.back()._tidx = positions[p].first.tid;
                            _genomeHits.back()._toff = positions[p].first.toff;
                            _genomeHits.back()._joinedOff = positions[p].first.joinedOff;
                            if(!positions[p].first.fw) {
                                _genomeHits.back().reverse(*_rds[0]);
                                _rds[0]->oppositeConversion_3N = true;
                            } else {
                                _rds[0]->oppositeConversion_3N = false;
                            }

                            // extend the partial alignments bidirectionally using
                            // local search, extension, and (less often) global search
                            hybridSearch(sc,
                                         pepol,
                                         tpol,
                                         gpol,
                                         gfm,
                                         altdb,
                                         repeatdb,
                                         ref,
                                         swa,
                                         ssdb,
                                         0,
                                         _genomeHits.back()._fw,
                                         wlm,
                                         prm,
                                         swm,
                                         him,
                                         rnd,
                                         sink);

                            // Second mate at the shared position
                            _genomeHits.clear();
                            _genomeHits.expand();
                            _genomeHits.back() = _genomeHits_rep[1][j];
                            _genomeHits.back()._tidx = positions[p].second.tid;
                            _genomeHits.back()._toff = positions[p].second.toff;
                            _genomeHits.back()._joinedOff = positions[p].second.joinedOff;
                            if(!positions[p].second.fw) {
                                _genomeHits.back().reverse(*_rds[1]);
                                _rds[1]->oppositeConversion_3N = true;
                            } else {
                                _rds[1]->oppositeConversion_3N = false;
                            }

                            // extend the partial alignments bidirectionally using
                            // local search, extension, and (less often) global search
                            hybridSearch(sc,
                                         pepol,
                                         tpol,
                                         gpol,
                                         gfm,
                                         altdb,
                                         repeatdb,
                                         ref,
                                         swa,
                                         ssdb,
                                         1,
                                         _genomeHits.back()._fw,
                                         wlm,
                                         prm,
                                         swm,
                                         him,
                                         rnd,
                                         sink);
                        }

                        if(positions.size() > 0) {
                            pairReads(
                                      sc,
                                      pepol,
                                      tpol,
                                      gpol,
                                      gfm,
                                      altdb,
                                      repeatdb,
                                      ref,
                                      wlm,
                                      prm,
                                      him,
                                      rnd,
                                      sink);
                        }
                    }
                } // if(rdi == 0)
            } // for(size_t i = 0; i < _genomeHits_rep[rdi].size()

            // Align directly against the repeat sequences when there is
            // either no alignment at all or more best alignments than khits
            bool align2repeat = false;
            if(_paired) {
                index_t numBestPair = sink.numBestPair().first;
                align2repeat = (numBestPair == 0 || numBestPair > rp.khits);
            } else {
                const EList<AlnRes> *rs = NULL;
                if(rdi == 0) sink.getUnp1(rs);
                else         sink.getUnp2(rs);
                assert(rs != NULL);
                align2repeat = (rs->size() == 0 || sink.numBestUnp(rdi).first > rp.khits);
            }

            // Clear the per-read flag set during the genome projection above
            _rds[0]->oppositeConversion_3N = false;
            if (_paired) {
                _rds[1]->oppositeConversion_3N = false;
            }

            if(align2repeat) {
                for(size_t i = 0; i < _genomeHits_rep[rdi].size(); i++) {
                    _genomeHits.clear();
                    _genomeHits.expand();
                    _genomeHits.back() = _genomeHits_rep[rdi][i];
                    _genomeHits.back()._repeat = true;
                    hybridSearch(sc,
                                 pepol,
                                 tpol,
                                 gpol,
                                 rfm,
                                 altdb,
                                 repeatdb,
                                 *rref,
                                 swa,
                                 ssdb,
                                 rdi,
                                 _genomeHits.back()._fw,
                                 wlm,
                                 prm,
                                 swm,
                                 him,
                                 rnd,
                                 sink);
                }

                if(_paired && rdi == 1) {
                    if(sink.numBestUnp(rdi).first > rp.khits) {
                        pairReads(
                                  sc,
                                  pepol,
                                  tpol,
                                  gpol,
                                  gfm,
                                  altdb,
                                  repeatdb,
                                  ref,
                                  wlm,
                                  prm,
                                  him,
                                  rnd,
                                  sink);
                    }
                }
            }
        } // for(size_t rdi = 0
    } // repeat

    return EXTEND_POLICY_FULFILLED;
}
/**
 * Given a read or its reverse complement (or mate),
 * align the unmapped portion using the global FM index.
 *
 * Candidates are taken from pickNextReadToSearch().  The chosen
 * read/strand is written to rdi / fw.
 *
 * @return true  when partialSearch() was run for the chosen candidate
 *               and either finished the hit or anchorStop is in effect;
 *         false when no candidate is left or the search was pruned
 *               because an existing alignment is already good enough.
 */
virtual
bool nextBWT(
const Scoring& sc,
const PairedEndPolicy& pepol, // paired-end policy
const TranscriptomePolicy& tpol,
const GraphPolicy& gpol,
const GFM<index_t>& gfm,
const ALTDB<index_t>& altdb,
const BitPairReference& ref,
index_t& rdi,
bool& fw,
WalkMetrics& wlm,
PerReadMetrics& prm,
HIMetrics& him,
RandomSource& rnd,
AlnSinkWrap<index_t>& sink)
{
const ReportingParams& rp = sink.reportingParams();
// Pick up a candidate from a read or its reverse complement
// (for pair, also consider mate and its reverse complement)
while(pickNextReadToSearch(rdi, fw)) {
size_t mineFw = 0, mineRc = 0;
index_t fwi = (fw ? 0 : 1);
ReadBWTHit<index_t>& hit = _hits[rdi][fwi];
assert(!hit.done());
// Both flags are passed by reference to partialSearch() below
bool pseudogeneStop = gfm.gh().linearFM() && !tpol.no_spliced_alignment();
bool anchorStop = _anchorStop && !gfm.repeat();
// Pruning is applied only when secondary alignments are not requested
if(!rp.secondary) {
index_t numSearched = hit.numActualPartialSearch();
int64_t bestScore = 0;
if(rdi == 0) {
bestScore = sink.bestUnp1();
if(bestScore >= _minsc[rdi]) {
// do not further align this candidate
// unless it may be at least as good as the alignment of its reverse complement
// maxmm: ceil(-bestScore / sc.mmpMax)
index_t maxmm = (index_t)((-bestScore + sc.mmpMax - 1) / sc.mmpMax);
if(numSearched > maxmm + sink.bestSplicedUnp1() + 1) {
hit.done(true);
if(_paired) {
// Stop entirely only if the mate is aligned and a pair exists
if(sink.bestUnp2() >= _minsc[1-rdi] &&
sink.numPair() > 0) return false;
else continue;
} else {
return false;
}
}
}
} else {
assert(_paired);
assert_eq(rdi, 1);
bestScore = sink.bestUnp2();
if(bestScore >= _minsc[rdi]) {
// Do not further extend this alignment
// unless it may be at least as good as the previous alignment
index_t maxmm = (index_t)((-bestScore + sc.mmpMax - 1) / sc.mmpMax);
if(numSearched > maxmm + sink.bestSplicedUnp2() + 1) {
hit.done(true);
if(_paired) {
if(sink.bestUnp1() >= _minsc[1-rdi] &&
sink.numPair() > 0) return false;
else continue;
} else {
return false;
}
}
}
}
// If the opposite strand is finished without an acceptable alignment,
// give up on this strand once it has been searched more often
ReadBWTHit<index_t>& rchit = _hits[rdi][1-fwi];
if(rchit.done() && bestScore < _minsc[rdi]) {
if(numSearched > rchit.numActualPartialSearch() + (anchorStop ? 1 : 0)) {
hit.done(true);
return false;
}
}
}
// Align this read beginning from previously stopped base
// stops when it is uniquely mapped with at least 28bp or
// it may involve processed pseudogene
partialSearch(
gfm,
*_rds[rdi],
sc,
sink.reportingParams(),
fw,
0,
mineFw,
mineRc,
hit,
rnd,
pseudogeneStop,
anchorStop);
assert(hit.repOk());
if(hit.done()) return true;
// Advance hit._cur by 1
if(!pseudogeneStop) {
if(hit._cur + 1 < hit._len) hit._cur++;
}
if(anchorStop) {
hit.done(true);
return true;
}
// hit.adjustOffset(_minK);
}
// No searchable read/strand remains
return false;
}
/**
 * Given partial alignments of a read, try to further extend
 * the alignment bidirectionally.
 *
 * Declaration only; the definition is not part of this section.
 * rdi selects the read (0 or 1) and fw its strand.  go() stores the
 * returned flag per read/strand and stops searching once it is false
 * for all four combinations.
 */
virtual
bool align(
const Scoring& sc,
const PairedEndPolicy& pepol, // paired-end policy
const TranscriptomePolicy& tpol,
const GraphPolicy& gpol,
const GFM<index_t>& gfm,
const ALTDB<index_t>& altdb,
const RepeatDB<index_t>& repeatdb,
const BitPairReference& ref,
SwAligner& swa,
SpliceSiteDB& ssdb,
index_t rdi,
bool fw,
WalkMetrics& wlm,
PerReadMetrics& prm,
SwMetrics& swm,
HIMetrics& him,
RandomSource& rnd,
AlnSinkWrap<index_t>& sink);
/**
 * Given the alignment of its mate as an anchor,
 * align the read.
 *
 * Declaration only; the definition is not part of this section.
 * go() calls this once per existing unpaired alignment, passing that
 * alignment's read index and orientation as rdi / fw and its reference
 * id and offset as tidx / toff; the returned flags are OR-ed together
 * to decide whether pairing is attempted afterwards.
 */
virtual
bool alignMate(
const Scoring& sc,
const PairedEndPolicy& pepol, // paired-end policy
const TranscriptomePolicy& tpol,
const GraphPolicy& gpol,
const GFM<index_t>& gfm,
const ALTDB<index_t>& altdb,
const RepeatDB<index_t>& repeatdb,
const BitPairReference& ref,
SwAligner& swa,
SpliceSiteDB& ssdb,
index_t rdi,
bool fw,
WalkMetrics& wlm,
PerReadMetrics& prm,
SwMetrics& swm,
HIMetrics& him,
RandomSource& rnd,
AlnSinkWrap<index_t>& sink,
index_t tidx,
index_t toff);
/**
 * Given a partial alignment of a read, try to further extend
 * the alignment bidirectionally using a combination of
 * local search, extension, and global search.
 *
 * This base-class version has an empty body; being virtual, it is
 * presumably meant to be overridden by a derived aligner - TODO confirm.
 */
virtual
void hybridSearch(
const Scoring& sc,
const PairedEndPolicy& pepol, // paired-end policy
const TranscriptomePolicy& tpol,
const GraphPolicy& gpol,
const GFM<index_t>& gfm,
const ALTDB<index_t>& altdb,
const RepeatDB<index_t>& repeatdb,
const BitPairReference& ref,
SwAligner& swa,
SpliceSiteDB& ssdb,
index_t rdi,
bool fw,
WalkMetrics& wlm,
PerReadMetrics& prm,
SwMetrics& swm,
HIMetrics& him,
RandomSource& rnd,
AlnSinkWrap<index_t>& sink)
{}
/**
 * Given a partial alignment of a read, try to further extend
 * the alignment bidirectionally using a combination of
 * local search, extension, and global search.
 *
 * This base-class version performs no search and always returns the
 * smallest int64_t value; being virtual, it is presumably meant to be
 * overridden by a derived aligner - TODO confirm.
 */
virtual
int64_t hybridSearch_recur(
const Scoring& sc,
const PairedEndPolicy& pepol, // paired-end policy
const TranscriptomePolicy& tpol,
const GraphPolicy& gpol,
const GFM<index_t>& gfm,
const ALTDB<index_t>& altdb,
const RepeatDB<index_t>& repeatdb,
const BitPairReference& ref,
SwAligner& swa,
SpliceSiteDB& ssdb,
index_t rdi,
const GenomeHit<index_t>& hit,
index_t hitoff,
index_t hitlen,
WalkMetrics& wlm,
PerReadMetrics& prm,
SwMetrics& swm,
HIMetrics& him,
RandomSource& rnd,
AlnSinkWrap<index_t>& sink,
bool alignMate = false,
index_t dep = 0)
{ return numeric_limits<int64_t>::min(); }
/**
 * Choose a candidate for alignment from a read or its reverse complement
 * (also from a mate or its reverse complement for pair).
 *
 * Among the enabled, unfinished read/strand combinations the one with
 * the highest search score wins; a combination whose cursor is still
 * at 0 is ranked with the largest possible score.  Ties keep the
 * earliest combination in (read, strand) order.
 *
 * @param rdi  receives the chosen read index (0 when nothing is chosen)
 * @param fw   receives the chosen strand (true when nothing is chosen)
 * @return true if a combination was chosen
 */
bool pickNextReadToSearch(index_t& rdi, bool& fw) {
    // Values reported when nothing can be selected
    rdi = 0;
    fw = true;

    const int64_t lowest = std::numeric_limits<int64_t>::min();
    const index_t numReads = (_paired ? 2 : 1);
    int64_t bestSoFar = lowest;
    bool anyPicked = false;

    for(index_t mate = 0; mate < numReads; mate++) {
        assert(_rds[mate] != NULL);
        for(index_t strand = 0; strand < 2; strand++) {
            // strand 0 is the forward read, strand 1 its reverse complement
            const bool strandDisabled = (strand == 0 ? _nofw[mate] : _norc[mate]);
            if(strandDisabled || _hits[mate][strand].done()) continue;

            int64_t score = _hits[mate][strand].searchScore((index_t)_minK);
            if(_hits[mate][strand].cur() == 0) {
                // Not searched yet: always preferred
                score = std::numeric_limits<int64_t>::max();
            }
            assert_gt(score, lowest);

            if(score <= bestSoFar) continue;
            bestSoFar = score;
            rdi = mate;
            fw = (strand == 0);
            anyPicked = true;
        }
    }
    return anyPicked;
}
/**
 * Align a part of a read without any edits.
 *
 * Declaration only; the definition is not part of this section.
 * pseudogeneStop and anchorStop are taken by non-const reference.
 */
index_t partialSearch(
const GFM<index_t>& gfm, // GFM index
const Read& read, // read to align
const Scoring& sc, // scoring scheme
const ReportingParams& rp,
// NOTE(review): the original comment on fw ("don't align forward read")
// looks stale; nextBWT() passes the strand being searched - confirm
bool fw, // don't align forward read
size_t mineMax, // don't care about edit bounds > this
size_t& mineFw, // minimum # edits for forward read
size_t& mineRc, // minimum # edits for revcomp read
ReadBWTHit<index_t>& hit, // holds all the seed hits (and exact hit)
RandomSource& rnd,
bool& pseudogeneStop, // stop if mapped to multiple locations due to processed pseudogenes
bool& anchorStop,
index_t maxHitLen = (index_t)INDEX_MAX);
/**
 * Global FM index search.
 *
 * Declaration only; the definition is not part of this section.
 * hitlen, top, bot, node_top, node_bot, node_iedge_count and
 * uniqueStop are taken by non-const reference.
 */
index_t globalGFMSearch(
const GFM<index_t>& gfm, // GFM index
const Read& read, // read to align
const Scoring& sc, // scoring scheme
const ReportingParams& rp,
bool fw,
index_t hitoff,
index_t& hitlen,
index_t& top,
index_t& bot,
index_t& node_top,
index_t& node_bot,
EList<pair<index_t, index_t> >& node_iedge_count,
RandomSource& rnd,
bool& uniqueStop,
index_t maxHitLen = (index_t)INDEX_MAX);
/**
 * Local FM index search.
 *
 * Declaration only; the definition is not part of this section.
 * Mirrors globalGFMSearch() but operates on a LocalGFM and uses
 * local_index_t for the index ranges; it additionally takes
 * minUniqueLen and maxHits.
 */
index_t localGFMSearch(
const LocalGFM<local_index_t, index_t>& gfm, // GFM index
const Read& read, // read to align
const Scoring& sc, // scoring scheme
const ReportingParams& rp,
bool fw,
index_t rdoff,
index_t& hitlen,
local_index_t& top,
local_index_t& bot,
local_index_t& node_top,
local_index_t& node_bot,
EList<pair<local_index_t, local_index_t> >& local_node_iedge_count,
RandomSource& rnd,
bool& uniqueStop,
local_index_t minUniqueLen,
local_index_t maxHitLen = (local_index_t)INDEX_MAX,
local_index_t maxHits = 0);
/**
 * Convert FM offsets to the corresponding genomic offset (chromosome id, offset).
 *
 * Declaration only; the definition is not part of this section.
 * Results are delivered through coords; straddled is an output flag.
 * getAnchorHits() calls this with rejectStraddle = false.
 **/
bool getGenomeCoords(
const GFM<index_t>& gfm,
const ALTDB<index_t>& altdb,
const BitPairReference& ref,
RandomSource& rnd,
index_t top,
index_t bot,
index_t node_top,
index_t node_bot,
const EList<pair<index_t, index_t> >& node_iedge_count,
bool fw,
index_t maxelt,
index_t rdoff,
index_t rdlen,
EList<Coord>& coords,
WalkMetrics& met,
PerReadMetrics& prm,
HIMetrics& him,
bool rejectStraddle,
bool& straddled);
/**
 * Convert FM offsets to the corresponding genomic offset (chromosome id, offset).
 *
 * Local-index counterpart of getGenomeCoords(): the index ranges use
 * local_index_t and there is no maxelt parameter.
 * Declaration only; the definition is not part of this section.
 **/
bool getGenomeCoords_local(
const GFM<local_index_t>& gfm,
const ALTDB<index_t>& altdb,
const BitPairReference& ref,
RandomSource& rnd,
local_index_t top,
local_index_t bot,
local_index_t node_top,
local_index_t node_bot,
const EList<pair<local_index_t, local_index_t> >& node_iedge_count,
bool fw,
index_t rdoff,
index_t rdlen,
EList<Coord>& coords,
WalkMetrics& met,
PerReadMetrics& prm,
HIMetrics& him,
bool rejectStraddle,
bool& straddled);
/**
 * Given a set of partial alignments for a read,
 * choose some that are longer and mapped to fewer places.
 *
 * Each round picks the best partial hit that has no genome coordinates
 * yet, resolves its coordinates (sampling when there are more than the
 * remaining budget) and appends the resulting hits to genomeHits unless
 * an existing hit on the same reference/strand is close enough, in
 * which case that hit's _hitcount is incremented instead.
 *
 * @param genomeHits        list that receives the anchor hits
 * @param maxGenomeHitSize  upper bound on genomeHits.size()
 * @param sharedVars        not referenced in this body (the member
 *                          _sharedVars is used instead)
 * @param repeat            when true, coordinates for which
 *                          repeatdb.repeatExist() is false are dropped
 * @return the size of genomeHits on exit
 */
index_t getAnchorHits(
const GFM<index_t>& gfm,
const PairedEndPolicy& pepol, // paired-end policy
const TranscriptomePolicy& tpol,
const GraphPolicy& gpol,
const ALTDB<index_t>& altdb,
const RepeatDB<index_t>& repeatdb,
const BitPairReference& ref,
RandomSource& rnd,
index_t rdi,
bool fw,
EList<GenomeHit<index_t> >& genomeHits,
index_t maxGenomeHitSize,
SharedTempVars<index_t>& sharedVars,
WalkMetrics& wlm,
PerReadMetrics& prm,
HIMetrics& him,
bool repeat = false)
{
index_t fwi = (fw ? 0 : 1);
assert_lt(rdi, 2);
assert(_rds[rdi] != NULL);
ReadBWTHit<index_t>& hit = _hits[rdi][fwi];
assert(hit.done());
index_t offsetSize = hit.offsetSize();
assert_gt(offsetSize, 0);
// At most one partial hit is processed per iteration of this loop
for(size_t hi = 0; hi < offsetSize; hi++) {
// Find the first usable partial hit: non-empty, without genome
// coordinates, and longer than _minK + 2
index_t hj = 0;
for(; hj < offsetSize; hj++) {
BWTHit<index_t>& partialHit_j = hit.getPartialHit(hj);
if(partialHit_j.empty() ||
partialHit_j.hasGenomeCoords() ||
partialHit_j.len() <= _minK + 2) continue;
else break;
}
if(hj >= offsetSize) break;
// Among the remaining usable hits prefer a higher _hit_type; within
// the same type prefer fewer occurrences, then greater length
for(index_t hk = hj + 1; hk < offsetSize; hk++) {
BWTHit<index_t>& partialHit_j = hit.getPartialHit(hj);
BWTHit<index_t>& partialHit_k = hit.getPartialHit(hk);
if(partialHit_k.empty() ||
partialHit_k.hasGenomeCoords() ||
partialHit_k.len() <= _minK + 2) continue;
if(partialHit_j._hit_type == partialHit_k._hit_type) {
if((partialHit_j.size() > partialHit_k.size()) ||
(partialHit_j.size() == partialHit_k.size() && partialHit_j.len() < partialHit_k.len())) {
hj = hk;
}
} else {
if(partialHit_k._hit_type > partialHit_j._hit_type) {
hj = hk;
}
}
}
BWTHit<index_t>& partialHit = hit.getPartialHit(hj);
assert(!partialHit.hasGenomeCoords());
// Retrieve genomic coordinates
// If there are too many genomic coordinates to get,
// then we randomly choose and retrieve a small set of them
assert_leq(genomeHits.size(), maxGenomeHitSize);
index_t remainedGenomeHitSize = maxGenomeHitSize - genomeHits.size();
if(remainedGenomeHitSize <= 0)
break;
index_t expectedNumCoords = partialHit._node_bot - partialHit._node_top;
bool straddled = false;
if(expectedNumCoords <= remainedGenomeHitSize) {
// Everything fits: resolve the whole range at once
getGenomeCoords(
gfm,
altdb,
ref,
rnd,
partialHit._top,
partialHit._bot,
partialHit._node_top,
partialHit._node_bot,
partialHit._node_iedge_count,
fw,
partialHit._bot - partialHit._top,
hit._len - partialHit._bwoff - partialHit._len,
partialHit._len,
partialHit._coords,
wlm,
prm,
him,
false, // reject straddled
straddled);
} else {
// Too many nodes: walk them one by one and resolve each with
// probability (budget left) / (nodes left)
index_t edgeIdx = 0;
index_t top = partialHit._top;
index_t added = 0;
for(index_t node = partialHit._node_top; node < partialHit._node_bot; node++, expectedNumCoords--) {
index_t bot = top + 1;
_tmp_node_iedge_count.clear();
// Widen the row range by the extra incoming edges recorded for this node
if(edgeIdx < partialHit._node_iedge_count.size()) {
assert_leq(node - partialHit._node_top, partialHit._node_iedge_count[edgeIdx].first);
if(node - partialHit._node_top == partialHit._node_iedge_count[edgeIdx].first) {
bot += partialHit._node_iedge_count[edgeIdx].second;
_tmp_node_iedge_count.expand();
_tmp_node_iedge_count.back().first = 0;
_tmp_node_iedge_count.back().second = partialHit._node_iedge_count[edgeIdx].second;
edgeIdx++;
}
}
assert_lt(added, remainedGenomeHitSize);
uint32_t rndi = rnd.nextU32() % expectedNumCoords;
if(rndi < remainedGenomeHitSize - added) {
getGenomeCoords(
gfm,
altdb,
ref,
rnd,
top,
bot,
node,
node + 1,
_tmp_node_iedge_count,
fw,
partialHit._bot - partialHit._top,
hit._len - partialHit._bwoff - partialHit._len,
partialHit._len,
partialHit._coords,
wlm,
prm,
him,
false, // reject straddled
straddled);
added++;
if(added >= remainedGenomeHitSize) break;
}
top = bot;
}
}
if(!partialHit.hasGenomeCoords()) continue;
EList<Coord>& coords = partialHit._coords;
assert_gt(coords.size(), 0);
// Size before this round; only these earlier hits are checked for overlap
const index_t genomeHit_size = (index_t)genomeHits.size();
if(genomeHit_size + coords.size() > maxGenomeHitSize) {
coords.shufflePortion(0, coords.size(), rnd);
}
for(index_t k = 0; k < coords.size(); k++) {
const Coord& coord = coords[k];
if(coord.ref() == numeric_limits<index_t>::max())
continue;
index_t len = partialHit._len;
index_t rdoff = hit._len - partialHit._bwoff - len;
bool overlapped = false;
for(index_t l = 0; l < genomeHit_size; l++) {
GenomeHit<index_t>& genomeHit = genomeHits[l];
if(genomeHit.ref() != (index_t)coord.ref() || genomeHit.fw() != coord.fw()) continue;
assert_lt(genomeHit.rdoff(), hit._len);
assert_lt(rdoff, hit._len);
// Compare both hits after shifting each by (read length - read offset)
index_t hitoff = genomeHit.refoff() + hit._len - genomeHit.rdoff();
index_t hitoff2 = (index_t)coord.off() + hit._len - rdoff;
// Allowed distance: 0 without spliced alignment, else the maximum intron length
int64_t hitoff_diff = (tpol.no_spliced_alignment() ? 0 : tpol.maxIntronLen());
if(abs((int64_t)hitoff - (int64_t)hitoff2) <= hitoff_diff) {
overlapped = true;
genomeHit._hitcount++;
break;
}
}
if(repeat) {
if(!repeatdb.repeatExist(coord.ref(), coord.off(), coord.off() + len)) {
continue;
}
}
if(!overlapped) {
GenomeHit<index_t>::adjustWithALT(
rdoff,
len,
coord,
_sharedVars,
genomeHits,
*_rds[rdi],
gfm,
altdb,
ref,
gpol);
}
// The size cap is enforced only for CANDIDATE_HIT partial hits
if(partialHit._hit_type == CANDIDATE_HIT && genomeHits.size() >= maxGenomeHitSize) break;
}
if(partialHit._hit_type == CANDIDATE_HIT && genomeHits.size() >= maxGenomeHitSize) break;
}
return (index_t)genomeHits.size();
}
/**
 * Collect full-length alignments of one mate against repeat sequence `repID`.
 *
 * Candidate positions come from repeat_kmertable.findAlignments() on the
 * read (or its reverse complement when fw is false). Each candidate that
 * lies inside a known repeat is turned into a GenomeHit, extended over the
 * whole read, and kept only if it spans the full read and scores within
 * `sc.mmpMax` of the best hit found so far (and no lower than _minsc[rdi]).
 * New hits are appended to genomeHits; entries that were already present on
 * entry are never removed.
 *
 * @return total number of entries in genomeHits after the call
 */
index_t getRepeatHits(
    const GFM<index_t>&          gfm,
    const PairedEndPolicy&       pepol, // paired-end policy
    const TranscriptomePolicy&   tpol,
    const GraphPolicy&           gpol,
    const ALTDB<index_t>&        altdb,
    const RepeatDB<index_t>&     repeatdb,
    const RB_KmerTable&          repeat_kmertable,
    const BitPairReference&      ref,
    RandomSource&                rnd,
    index_t                      rdi,
    bool                         fw,
    index_t                      repID,
    EList<GenomeHit<index_t> >&  genomeHits,
    index_t                      maxGenomeHitSize,
    SharedTempVars<index_t>&     sharedVars,
    const Scoring&               sc,
    SwAligner&                   swa,
    SpliceSiteDB&                ssdb,
    SwMetrics&                   swm,
    WalkMetrics&                 wlm,
    PerReadMetrics&              prm,
    HIMetrics&                   him,
    AlnSinkWrap<index_t>&        sink)
{
    assert_lt(rdi, 2);
    assert(_rds[rdi] != NULL);
    Read& read = *_rds[rdi];
    const BTDnaString& readSeq = fw ? read.patFw : read.patRc;

    // Look up candidate positions of the read within the repeat sequences
    repeat_kmertable.findAlignments(readSeq,
                                    _tmp_minimizers,
                                    _tmp_position2D,
                                    _tmp_alignments);

    // Hits scoring more than `cushion` below the best one are discarded
    const TAlScore cushion = sc.mmpMax;
    TAlScore bestScore = _minsc[rdi];
    // Only hits at or beyond this index were added by this call
    const size_t firstNewHit = genomeHits.size();
    const index_t readLen = readSeq.length();

    for(index_t ai = 0; ai < _tmp_alignments.size(); ai++) {
        const RB_Alignment& cand = _tmp_alignments[ai];
        if(!repeatdb.repeatExist(repID, cand.pos, cand.pos + readLen)) {
            continue;
        }

        // Seed an empty hit at the candidate position ...
        index_t readOff = 0;
        genomeHits.expand();
        GenomeHit<index_t>& newHit = genomeHits.back();
        newHit.init(fw,
                    readOff,
                    0,
                    0,        // trim5
                    0,        // trim3
                    repID,    // ref,
                    cand.pos,
                    cand.pos,
                    this->_sharedVars);

        // ... and extend it to the right across the entire read
        index_t maxmm = (index_t)(-bestScore / sc.mmpMax);
        index_t leftext = 0, rightext = readLen;
        newHit.extend(read,
                      gfm,
                      ref,
                      altdb,
                      repeatdb,
                      ssdb,
                      swa,
                      swm,
                      prm,
                      sc,
                      this->_minsc[rdi],
                      rnd,
                      (index_t)this->_minK_local,
                      tpol,
                      gpol,
                      leftext,
                      rightext,
                      maxmm);

        // Partial alignments are of no use here
        if(newHit.len() < readLen) {
            genomeHits.pop_back();
            continue;
        }

        if(newHit.score() > bestScore) {
            // New best score: compact the hits added by this call, keeping
            // only those still within the cushion of the new best
            bestScore = newHit.score();
            const TAlScore floorScore = max(_minsc[rdi], bestScore - cushion);
            size_t keep = firstNewHit;
            for(size_t j = firstNewHit; j < genomeHits.size(); j++) {
                if(genomeHits[j].score() < floorScore) {
                    continue;
                }
                assert_leq(keep, j);
                if(keep != j) {
                    genomeHits[keep] = genomeHits[j];
                }
                keep++;
            }
            assert_leq(firstNewHit, keep);
            assert_leq(keep, genomeHits.size());
            if(keep < genomeHits.size()) {
                genomeHits.resize(keep);
            }
        } else if(newHit.score() < max(_minsc[rdi], bestScore - cushion)) {
            // Too far below the current best
            genomeHits.pop_back();
        }
    }
    return (index_t)genomeHits.size();
}
/**
 * Examine the unpaired alignments of the left and right reads held by the
 * sink and report the combinations that form an acceptable pair.
 * Declared here; defined out of class further down in this header.
 */
bool pairReads(
const Scoring& sc,
const PairedEndPolicy& pepol, // paired-end policy
const TranscriptomePolicy& tpol,
const GraphPolicy& gpol,
const GFM<index_t>& gfm,
const ALTDB<index_t>& altdb,
const RepeatDB<index_t>& repeatdb,
const BitPairReference& ref,
WalkMetrics& wlm,
PerReadMetrics& prm,
HIMetrics& him,
RandomSource& rnd,
AlnSinkWrap<index_t>& sink);
/**
 * Report a read alignment (or, when ohit is given, a pair alignment) to
 * the sink.
 *
 * rdi selects the read that `hit` belongs to; ohit, when non-NULL, is the
 * alignment of the other read. alignMate is forwarded to the sink when a
 * single read is reported.
 **/
bool reportHit(
const Scoring& sc,
const PairedEndPolicy& pepol, // paired-end policy
const TranscriptomePolicy& tpol,
const GraphPolicy& gpol,
const GFM<index_t>& gfm,
const ALTDB<index_t>& altdb,
const RepeatDB<index_t>& repeatdb,
const BitPairReference& ref,
const SpliceSiteDB& ssdb,
AlnSinkWrap<index_t>& sink,
index_t rdi,
const GenomeHit<index_t>& hit,
bool alignMate = false,
const GenomeHit<index_t>* ohit = NULL);
/**
 * Check whether a position has already been examined: returns true if
 * (tidx, toff) falls inside the reference span of an alignment already
 * reported for read rdi.
 **/
bool redundant(
AlnSinkWrap<index_t>& sink,
index_t rdi,
index_t tidx,
index_t toff);
/**
 * Check whether an alignment has already been examined: returns true if
 * an alignment already reported for read rdi has the same leftmost
 * reference coordinate and the same edits as `hit`.
 **/
bool redundant(
AlnSinkWrap<index_t>& sink,
index_t rdi,
const GenomeHit<index_t>& hit);
/**
 * NOTE(review): the definition is not in this part of the file. Judging
 * from the name and the _hits_searched member, this presumably tells
 * whether `hit` was already recorded as searched for read rdi -- confirm
 * against the definition.
 **/
bool isSearched(
const GenomeHit<index_t>& hit,
index_t rdi);
/**
 * NOTE(review): the definition is not in this part of the file. Judging
 * from the name and the _hits_searched member, this presumably records
 * `hit` as searched for read rdi -- confirm against the definition.
 **/
void addSearched(const GenomeHit<index_t>& hit,
index_t rdi);
protected:
// Per-mate state; arrays of size 2 are indexed by the read index (rdi)
Read * _rds[2];
bool _paired;
bool _rightendonly;
bool _nofw[2];
bool _norc[2];
// Minimum alignment score accepted for each mate
TAlScore _minsc[2];
TAlScore _maxpen[2];
bool _anchorStop;
// Partial (BWT) hits, indexed as [rdi][fwi] where fwi is 0 for the forward
// strand and 1 for the reverse complement
ReadBWTHit<index_t> _hits[2][2];
// Scratch space used by getGenomeCoords() to resolve global index
// offsets into reference coordinates
EList<index_t, 16> _offs;
SARangeWithOffs<EListSlice<index_t, 16>, index_t> _sas;
GroupWalk2S<index_t, EListSlice<index_t, 16>, 16> _gws;
GroupWalkState<index_t> _gwstate;
// Same as above, used by getGenomeCoords_local() for local indexes
EList<local_index_t, 16> _offs_local;
SARangeWithOffs<EListSlice<local_index_t, 16>, local_index_t> _sas_local;
GroupWalk2S<local_index_t, EListSlice<local_index_t, 16>, 16> _gws_local;
GroupWalkState<local_index_t> _gwstate_local;
// temporary and shared variables used for GenomeHit
// this should be defined before _genomeHits and _hits_searched
SharedTempVars<index_t> _sharedVars;
// temporary and shared variables for AlnRes
LinkedEList<EList<Edit> > _rawEdits;
// temporary
EList<GenomeHit<index_t> > _genomeHits;
EList<GenomeHit<index_t> > _genomeHits_rep[2];
EList<index_t> _snpIDs;
EList<index_t> _snpIDs2;
EList<bool> _genomeHits_done;
// alignMate() uses the first list as scratch space for anchor coordinates
ELList<Coord> _coords;
EList<pair<RepeatCoord<index_t>, RepeatCoord<index_t> > > _positions;
ELList<SpliceSite> _spliceSites;
// Sizes of the two unpaired result lists at the last pairReads() call, so
// combinations that were already inspected are not inspected again
pair<index_t, index_t> _concordantIdxInspected;
// (left offset of mate 1, left offset of mate 2) combinations that
// pairReads() accepts when both alignments are repeat alignments
EList<pair<index_t, index_t> > _repeatConcordant;
size_t _minK; // log4 of the size of a genome
size_t _minK_local; // log4 of the size of a local index (8)
ELList<GenomeHit<index_t> > _local_genomeHits;
EList<uint8_t> _anchors_added;
// Upper bound on him.localindexatts for the current read; set in align()
uint64_t max_localindexatts;
uint64_t bwops_; // Burrows-Wheeler operations
uint64_t bwedits_; // Burrows-Wheeler edits
//
EList<GenomeHit<index_t> > _hits_searched[2];
uint64_t _thread_rids_mindist;
//
EList<pair<index_t, index_t> > _node_iedge_count;
EList<pair<index_t, index_t> > _tmp_node_iedge_count;
EList<pair<local_index_t, local_index_t> > _local_node_iedge_count;
EList<pair<local_index_t, local_index_t> > _tmp_local_node_iedge_count;
// Scratch space handed to RB_KmerTable::findAlignments() in getRepeatHits()
EList<pair<uint64_t, size_t> > _tmp_minimizers;
ELList<RB_Alignment> _tmp_position2D;
EList<RB_Alignment> _tmp_alignments;
// For AlnRes::matchesRef
ASSERT_ONLY(EList<bool> raw_matches_);
ASSERT_ONLY(BTDnaString tmp_rf_);
ASSERT_ONLY(BTDnaString tmp_rdseq_);
ASSERT_ONLY(BTString tmp_qseq_);
};
/**
 * Initialize the SideLocus pair (tloc, bloc) for the global-index range
 * [top, bot). A range holding a single row needs only tloc, so bloc is
 * invalidated; otherwise both loci are initialized together.
 *
 * `e` is the index object providing gh() and gfm(). The macro expects a
 * type named index_t to be in scope at the point of use.
 * Every parameter is parenthesized in the expansion so that arbitrary
 * expressions can be passed without operator-precedence surprises.
 */
#define HIER_INIT_LOCS(top, bot, tloc, bloc, e) { \
    if((bot) - (top) == 1) { \
        (tloc).initFromRow((top), (e).gh(), (e).gfm()); \
        (bloc).invalidate(); \
    } else { \
        SideLocus<index_t>::initFromTopBot((top), (bot), (e).gh(), (e).gfm(), (tloc), (bloc)); \
        assert((bloc).valid()); \
    } \
}
/**
 * Debug-only check that two 4-element (top, bottom) tuples cover the same
 * total number of elements: sum(b[i] - t[i]) must equal sum(bp[i] - tp[i]).
 * The macro expects a type named cur_index_t to be in scope at the point
 * of use. Parameters are parenthesized in the expansion so that arbitrary
 * expressions can be passed safely.
 */
#define HIER_SANITY_CHECK_4TUP(t, b, tp, bp) { \
    ASSERT_ONLY(cur_index_t tot = ((b)[0]-(t)[0])+((b)[1]-(t)[1])+((b)[2]-(t)[2])+((b)[3]-(t)[3])); \
    ASSERT_ONLY(cur_index_t totp = ((bp)[0]-(tp)[0])+((bp)[1]-(tp)[1])+((bp)[2]-(tp)[2])+((bp)[3]-(tp)[3])); \
    assert_eq(tot, totp); \
}
/**
 * Local-index counterpart of HIER_INIT_LOCS: initialize the SideLocus pair
 * (tloc, bloc) for the range [top, bot) using SideLocus<local_index_t>.
 * A range holding a single row needs only tloc, so bloc is invalidated.
 *
 * `e` is the index object providing gh() and gfm(). The macro expects a
 * type named local_index_t to be in scope at the point of use.
 * Every parameter is parenthesized in the expansion so that arbitrary
 * expressions can be passed without operator-precedence surprises.
 */
#define LOCAL_INIT_LOCS(top, bot, tloc, bloc, e) { \
    if((bot) - (top) == 1) { \
        (tloc).initFromRow((top), (e).gh(), (e).gfm()); \
        (bloc).invalidate(); \
    } else { \
        SideLocus<local_index_t>::initFromTopBot((top), (bot), (e).gh(), (e).gfm(), (tloc), (bloc)); \
        assert((bloc).valid()); \
    } \
}
/**
 * Starting from the partial (BWT) hits already collected for read `rdi` on
 * strand `fw`, pick anchor hits and extend them in both directions via
 * hybridSearch().
 *
 * @return false when the read has no usable partial hit or no anchor hit
 *         could be obtained; true otherwise (including the case where the
 *         search is skipped because this strand cannot beat the best
 *         alignment found so far)
 */
template <typename index_t, typename local_index_t>
bool HI_Aligner<index_t, local_index_t>::align(
    const Scoring&             sc,
    const PairedEndPolicy&     pepol, // paired-end policy
    const TranscriptomePolicy& tpol,
    const GraphPolicy&         gpol,
    const GFM<index_t>&        gfm,
    const ALTDB<index_t>&      altdb,
    const RepeatDB<index_t>&   repeatdb,
    const BitPairReference&    ref,
    SwAligner&                 swa,
    SpliceSiteDB&              ssdb,
    index_t                    rdi,
    bool                       fw,
    WalkMetrics&               wlm,
    PerReadMetrics&            prm,
    SwMetrics&                 swm,
    HIMetrics&                 him,
    RandomSource&              rnd,
    AlnSinkWrap<index_t>&      sink)
{
    const ReportingParams& rp = sink.reportingParams();
    const index_t strandIdx = (fw ? 0 : 1);
    assert_lt(rdi, 2);
    assert(_rds[rdi] != NULL);
    ReadBWTHit<index_t>& bwtHit = _hits[rdi][strandIdx];
    assert(bwtHit.done());

    // Nothing to extend if no partial hit has a usable width
    index_t minOff = 0;
    if(bwtHit.minWidth(minOff) == std::numeric_limits<index_t>::max()) {
        return false;
    }

    // Don't try to align if the potential alignment for this read might be
    // worse than the best alignment of its reverse complement
    int64_t bestScore;
    index_t num_spliced;
    if(rdi == 0) {
        bestScore   = sink.bestUnp1();
        num_spliced = sink.bestSplicedUnp1();
    } else {
        bestScore   = sink.bestUnp2();
        num_spliced = sink.bestSplicedUnp2();
    }
    if(bestScore < _minsc[rdi]) {
        bestScore = _minsc[rdi];
    }
    const index_t maxmm = (index_t)((-bestScore + sc.mmpMax - 1) / sc.mmpMax);
    const index_t numActualPartialSearch = bwtHit.numActualPartialSearch();
    if(!rp.secondary && numActualPartialSearch > maxmm + num_spliced + 1) {
        return true;
    }

    // choose candidate partial alignments for further alignment
    const index_t maxsize = max<index_t>(rp.khits, rp.kseeds);
    _genomeHits.clear();
    const index_t numHits = getAnchorHits(
        gfm, pepol, tpol, gpol, altdb, repeatdb, ref, rnd,
        rdi, fw,
        _genomeHits, maxsize, _sharedVars,
        wlm, prm, him,
        gfm.repeat());
    if(numHits <= 0) {
        return false;
    }

    // limit the number of local index searches used for alignment of the read
    // (the budget is doubled when secondary alignments are requested)
    const uint64_t add = rp.secondary
        ? (-_minsc[rdi] / sc.mmpMax) * numHits * 2
        : (-_minsc[rdi] / sc.mmpMax) * numHits;
    max_localindexatts = him.localindexatts + max<uint64_t>(10, add);

    // extend the partial alignments bidirectionally using
    // local search, extension, and (less often) global search
    hybridSearch(
        sc, pepol, tpol, gpol,
        gfm, altdb, repeatdb, ref,
        swa, ssdb,
        rdi, fw,
        wlm, prm, swm, him,
        rnd, sink);
    return true;
}
/**
 * Align the mate of read `rdi`, using the position (tidx, toff) of the
 * already-aligned read as a hint.
 *
 * The local index covering (tidx, toff) is searched for exact matches of
 * the mate; if that yields no anchor, the adjacent local index (next when
 * fw, previous otherwise) is tried once. The anchors with the longest
 * match are kept (at most rp.kseeds, chosen at random), extended, and
 * handed to hybridSearch_recur() with alignMate set.
 *
 * @return always true
 */
template <typename index_t, typename local_index_t>
bool HI_Aligner<index_t, local_index_t>::alignMate(
    const Scoring&             sc,
    const PairedEndPolicy&     pepol, // paired-end policy
    const TranscriptomePolicy& tpol,
    const GraphPolicy&         gpol,
    const GFM<index_t>&        gfm,
    const ALTDB<index_t>&      altdb,
    const RepeatDB<index_t>&   repeatdb,
    const BitPairReference&    ref,
    SwAligner&                 swa,
    SpliceSiteDB&              ssdb,
    index_t                    rdi,
    bool                       fw,
    WalkMetrics&               wlm,
    PerReadMetrics&            prm,
    SwMetrics&                 swm,
    HIMetrics&                 him,
    RandomSource&              rnd,
    AlnSinkWrap<index_t>&      sink,
    index_t                    tidx,
    index_t                    toff)
{
    const ReportingParams& rp = sink.reportingParams();
    assert_lt(rdi, 2);

    // The read aligned here is the *other* mate, on the strand implied by
    // the mate orientation settings
    index_t ordi = 1 - rdi;
    bool ofw = (fw == gMate2fw ? gMate1fw : gMate2fw);
    assert(_rds[ordi] != NULL);
    const Read& ord = *_rds[ordi];
    index_t rdlen = (index_t)ord.length();
    assert_gt(rdlen, 0);

    _genomeHits.clear();
    if(_coords.size() == 0) {
        _coords.expand();
    }
    EList<Coord>& coords = _coords.front();

    // local search to find anchors
    const HGFM<index_t, local_index_t>* hGFM = (const HGFM<index_t, local_index_t>*)(&gfm);
    const LocalGFM<local_index_t, index_t>* lGFM = hGFM->getLocalGFM(tidx, toff);
    index_t max_hitlen = 0;
    for(index_t attempt = 0; attempt < 2; attempt++) {
        if(attempt > 0) {
            // Second attempt only if the first local index gave nothing:
            // move on to the neighbouring local index
            if(_genomeHits.size() > 0) break;
            lGFM = fw ? hGFM->nextLocalGFM(lGFM) : hGFM->prevLocalGFM(lGFM);
            if(lGFM == NULL || lGFM->empty()) break;
        }

        // Walk from the end of the read towards its start, one exact
        // match at a time
        index_t hitoff = rdlen - 1;
        while(hitoff >= _minK_local - 1) {
            index_t hitlen = 0;
            local_index_t top = (local_index_t)INDEX_MAX, bot = (local_index_t)INDEX_MAX;
            local_index_t node_top = (local_index_t)INDEX_MAX, node_bot = (local_index_t)INDEX_MAX;
            _local_node_iedge_count.clear();
            bool uniqueStop = false;
            index_t nelt = localGFMSearch(
                *lGFM,                  // GFM index
                ord,                    // read to align
                sc,                     // scoring scheme
                sink.reportingParams(),
                ofw,
                hitoff,
                hitlen,
                top,
                bot,
                node_top,
                node_bot,
                _local_node_iedge_count,
                rnd,
                uniqueStop,
                _minK_local);
            assert_leq(top, bot);
            assert_eq(nelt, (index_t)(node_bot - node_top));
            assert_leq(hitlen, hitoff + 1);

            // Only a strictly longer match with few enough occurrences
            // replaces the anchors gathered so far
            if(nelt > 0 && nelt <= rp.kseeds && hitlen > max_hitlen) {
                const index_t anchorOff = hitoff - hitlen + 1;
                coords.clear();
                bool straddled = false;
                getGenomeCoords_local(
                    *lGFM,
                    altdb,
                    ref,
                    rnd,
                    top,
                    bot,
                    node_top,
                    node_bot,
                    _local_node_iedge_count,
                    ofw,
                    anchorOff,
                    hitlen,
                    coords,
                    wlm,
                    prm,
                    him,
                    true, // reject straddled?
                    straddled);
                assert_leq(coords.size(), nelt);
                _genomeHits.clear();
                for(index_t ri = 0; ri < coords.size(); ri++) {
                    const Coord& coord = coords[ri];
                    if(tpol.no_spliced_alignment()) {
                        // Without splicing, an anchor further than twice
                        // the maximum fragment length away is of no use
                        if(coord.off() + pepol.maxFragLen() * 2 < toff || toff + pepol.maxFragLen() * 2 < coord.off())
                            continue;
                    }
                    GenomeHit<index_t>::adjustWithALT(
                        anchorOff,
                        hitlen,
                        coord,
                        _sharedVars,
                        _genomeHits,
                        *this->_rds[ordi],
                        gfm,
                        altdb,
                        ref,
                        gpol);
                }
                max_hitlen = hitlen;
            }

            // Step past the matched stretch, plus one more base
            assert_leq(hitlen, hitoff + 1);
            if(hitlen > 0) hitoff -= (hitlen - 1);
            if(hitoff > 0) hitoff -= 1;
        }
    }

    // randomly select
    const index_t maxsize = rp.kseeds;
    if(_genomeHits.size() > maxsize) {
        _genomeHits.shufflePortion(0, _genomeHits.size(), rnd);
        _genomeHits.resize(maxsize);
    }

    // local search using the anchor
    for(index_t hi = 0; hi < _genomeHits.size(); hi++) {
        him.anchoratts++;
        GenomeHit<index_t>& anchor = _genomeHits[hi];
        index_t leftext = (index_t)INDEX_MAX, rightext = (index_t)INDEX_MAX;
        anchor.extend(
            ord,
            gfm,
            ref,
            altdb,
            repeatdb,
            ssdb,
            swa,
            swm,
            prm,
            sc,
            _minsc[ordi],
            rnd,
            (index_t)_minK_local,
            tpol,
            gpol,
            leftext,
            rightext);
        hybridSearch_recur(
            sc,
            pepol,
            tpol,
            gpol,
            gfm,
            altdb,
            repeatdb,
            ref,
            swa,
            ssdb,
            ordi,
            anchor,
            anchor.rdoff(),
            anchor.len(),
            wlm,
            prm,
            swm,
            him,
            rnd,
            sink,
            true); // alignMate?
    }
    return true;
}
/**
 * Convert FM-index offsets of the range [top, bot) into genomic
 * coordinates (chromosome id, offset) and append them to `coords`.
 *
 * At most `maxelt` of the node_bot - node_top elements are resolved.
 * `straddled` is set when any resolved element straddles a reference
 * boundary; such an element is stored with a reference id of
 * numeric_limits<index_t>::max().
 *
 * @return false as soon as an element cannot be mapped to a reference
 *         (tidx comes back as INDEX_MAX); true otherwise
 **/
template <typename index_t, typename local_index_t>
bool HI_Aligner<index_t, local_index_t>::getGenomeCoords(
    const GFM<index_t>&                     gfm,
    const ALTDB<index_t>&                   altdb,
    const BitPairReference&                 ref,
    RandomSource&                           rnd,
    index_t                                 top,
    index_t                                 bot,
    index_t                                 node_top,
    index_t                                 node_bot,
    const EList<pair<index_t, index_t> >&   node_iedge_count,
    bool                                    fw,
    index_t                                 maxelt,
    index_t                                 rdoff,
    index_t                                 rdlen,
    EList<Coord>&                           coords,
    WalkMetrics&                            met,
    PerReadMetrics&                         prm,
    HIMetrics&                              him,
    bool                                    rejectStraddle,
    bool&                                   straddled)
{
    straddled = false;
    assert_gt(bot, top);
    assert_leq(node_bot - node_top, bot - top);

    // Number of elements to resolve, capped by the caller's limit
    const index_t nelt = min<index_t>(node_bot - node_top, maxelt);
    him.globalgenomecoords += nelt;

    // Set up the walker over the suffix-array range
    _offs.resize(nelt);
    _offs.fill((index_t)INDEX_MAX);
    _sas.init(top,
              bot,
              node_top,
              node_bot,
              node_iedge_count,
              rdlen,
              EListSlice<index_t, 16>(_offs, 0, nelt));
    _gws.init(gfm, ref, _sas, rnd, met);

    for(index_t elt = 0; elt < nelt; elt++) {
        WalkResult<index_t> wr;
        _gws.advanceElement(
            elt,
            gfm,      // forward Bowtie index for walking left
            ref,      // bitpair-encoded reference
            _sas,     // SA range with offsets
            _gwstate, // GroupWalk state; scratch space
            wr,       // put the result here
            met,      // metrics
            prm);     // per-read metrics
        assert_neq(wr.toff, (index_t)INDEX_MAX);

        // Translate the joined-text offset into (reference, offset)
        index_t tidx = 0, toff = 0, tlen = 0;
        bool eltStraddled = false;
        gfm.joinedToTextOff(
            wr.elt.len,
            wr.toff,
            tidx,
            toff,
            tlen,
            rejectStraddle, // reject straddlers?
            eltStraddled);  // straddled?
        if(eltStraddled) {
            straddled = true;
        }
        if(tidx == (index_t)INDEX_MAX) {
            // The seed hit straddled a reference boundary so the seed
            // hit isn't valid
            return false;
        }

        // Coordinate of the seed hit w/r/t the pasted reference string;
        // a straddling hit is stored with an invalid reference id
        const index_t refId = eltStraddled ? numeric_limits<index_t>::max() : tidx;
        coords.expand();
        coords.back().init(refId, (int64_t)toff, fw, wr.toff);
    }
    return true;
}
/**
 * Convert offsets within a local FM index (range [top, bot)) into genomic
 * coordinates (chromosome id, offset) and append them to `coords`.
 *
 * `gfm` is treated as a LocalGFM: its _tidx, _localOffset and
 * _joinedOffset turn local offsets into global ones. Elements that
 * joinedToTextOff() rejects, or whose global offset is smaller than
 * `rdoff`, are skipped.
 *
 * @return false as soon as an element cannot be mapped to a reference
 *         (tidx comes back as INDEX_MAX); true otherwise
 **/
template <typename index_t, typename local_index_t>
bool HI_Aligner<index_t, local_index_t>::getGenomeCoords_local(
    const GFM<local_index_t>&                           gfm,
    const ALTDB<index_t>&                               altdb,
    const BitPairReference&                             ref,
    RandomSource&                                       rnd,
    local_index_t                                       top,
    local_index_t                                       bot,
    local_index_t                                       node_top,
    local_index_t                                       node_bot,
    const EList<pair<local_index_t, local_index_t> >&   node_iedge_count,
    bool                                                fw,
    index_t                                             rdoff,
    index_t                                             rdlen,
    EList<Coord>&                                       coords,
    WalkMetrics&                                        met,
    PerReadMetrics&                                     prm,
    HIMetrics&                                          him,
    bool                                                rejectStraddle,
    bool&                                               straddled)
{
    straddled = false;
    assert_gt(bot, top);
    assert_leq(node_bot - node_top, bot - top);
    index_t nelt = node_bot - node_top;
    him.localgenomecoords += nelt;

    // Set up the walker over the local suffix-array range
    _offs_local.resize(nelt);
    _offs_local.fill((local_index_t)INDEX_MAX);
    _sas_local.init(top,
                    bot,
                    node_top,
                    node_bot,
                    node_iedge_count,
                    rdlen,
                    EListSlice<local_index_t, 16>(_offs_local, 0, nelt));
    _gws_local.init(gfm, ref, _sas_local, rnd, met);

    // The local index knows where it sits within the global reference
    const LocalGFM<local_index_t, index_t>* localGFM = (const LocalGFM<local_index_t, index_t>*)&gfm;

    for(local_index_t elt = 0; elt < nelt; elt++) {
        WalkResult<local_index_t> wr;
        _gws_local.advanceElement(
            elt,
            gfm,            // forward Bowtie index for walking left
            ref,            // bitpair-encoded reference
            _sas_local,     // SA range with offsets
            _gwstate_local, // GroupWalk state; scratch space
            wr,             // put the result here
            met,            // metrics
            prm);           // per-read metrics
        assert_neq(wr.toff, (local_index_t)INDEX_MAX);

        local_index_t tidx = 0, toff = 0, tlen = 0;
        bool eltStraddled = false;
        const bool resolved = gfm.joinedToTextOff(
            wr.elt.len,
            wr.toff,
            tidx,
            toff,
            tlen,
            rejectStraddle, // reject straddlers?
            eltStraddled);  // straddled?
        if(!resolved) {
            continue;
        }
        if(eltStraddled) {
            straddled = true;
        }
        if(tidx == (local_index_t)INDEX_MAX) {
            // The seed hit straddled a reference boundary so the seed
            // hit isn't valid
            return false;
        }

        // Shift the local offset into global coordinates
        index_t global_toff = toff + localGFM->_localOffset;
        if(global_toff < rdoff) {
            continue;
        }
        index_t global_tidx = localGFM->_tidx;
        index_t joinedOff = wr.toff + localGFM->_joinedOffset;

        // Coordinate of the seed hit w/r/t the pasted reference string
        coords.expand();
        coords.back().init(global_tidx, (int64_t)global_toff, fw, joinedOff);
    }
    return true;
}
/**
 * Examine the unpaired alignments of the left and right reads held by the
 * sink and report every combination that qualifies as a pair.
 *
 * Combinations that were already inspected by an earlier call (tracked in
 * _concordantIdxInspected) are not inspected again. Repeat alignments only
 * pair with repeat alignments, and only when the combination is listed in
 * _repeatConcordant. For non-repeat alignments the two mates must be on
 * the same reference, in compatible orientation and order, and no further
 * apart than the maximum intron length; when spliced alignment is disabled
 * the paired-end policy must additionally not classify them as discordant.
 *
 * @return always true (returns early once the sink is done with
 *         concordant alignments)
 **/
template <typename index_t, typename local_index_t>
bool HI_Aligner<index_t, local_index_t>::pairReads(
    const Scoring&             sc,
    const PairedEndPolicy&     pepol, // paired-end policy
    const TranscriptomePolicy& tpol,
    const GraphPolicy&         gpol,
    const GFM<index_t>&        gfm,
    const ALTDB<index_t>&      altdb,
    const RepeatDB<index_t>&   repeatdb,
    const BitPairReference&    ref,
    WalkMetrics&               wlm,
    PerReadMetrics&            prm,
    HIMetrics&                 him,
    RandomSource&              rnd,
    AlnSinkWrap<index_t>&      sink)
{
    const ReportingParams& rp = sink.reportingParams();
    assert(_paired);
    const EList<AlnRes> *rs1 = NULL, *rs2 = NULL;
    sink.getUnp1(rs1); assert(rs1 != NULL);
    sink.getUnp2(rs2); assert(rs2 != NULL);

    // Remember how far the previous call got, then record the new extent
    const index_t start_i = _concordantIdxInspected.first;
    const index_t start_j = _concordantIdxInspected.second;
    _concordantIdxInspected.first = rs1->size();
    _concordantIdxInspected.second = rs2->size();

    for(index_t i = 0; i < rs1->size(); i++) {
        // For mate-1 alignments seen before, only new mate-2 alignments matter
        for(index_t j = (i >= start_i ? 0 : start_j); j < rs2->size(); j++) {
            if(sink.state().doneConcordant()) {
                return true;
            }
            const AlnRes& r1 = (*rs1)[i];
            Coord left = r1.refcoord(), right = r1.refcoord_right();
            assert_eq(left.ref(), right.ref());
            const AlnRes& r2 = (*rs2)[j];
            Coord left2 = r2.refcoord(), right2 = r2.refcoord_right();
            assert_eq(left2.ref(), right2.ref());

            // A repeat alignment never pairs with a non-repeat alignment
            if(r1.repeat() != r2.repeat()) {
                continue;
            }

            bool dna_frag_pass = true;
            if(r1.repeat() && r2.repeat()) {
                // Repeat pairs must have been registered as concordant
                bool listed = false;
                for(size_t r = 0; r < _repeatConcordant.size() && !listed; r++) {
                    listed = (_repeatConcordant[r].first == left.off() &&
                              _repeatConcordant[r].second == left2.off());
                }
                dna_frag_pass = listed;
            } else {
                if(left.ref() != left2.ref()) continue;
                assert_eq(left.orient(), right.orient());
                assert_eq(left2.orient(), right2.orient());

                // Check mate orientations; afterwards (left, right) is the
                // mate expected upstream of (left2, right2)
                if(left.orient() == gMate1fw) {
                    if(left2.orient() != gMate2fw) continue;
                } else {
                    if(left2.orient() == gMate2fw) continue;
                    Coord swapTmp = left;
                    left = left2;
                    left2 = swapTmp;
                    swapTmp = right;
                    right = right2;
                    right2 = swapTmp;
                }
                if(left.off() > left2.off()) continue;
                if(right.off() > right2.off()) continue;
                if(right.off() + (int)tpol.maxIntronLen() < left2.off()) continue;
                assert_geq(r1.score().score(), _minsc[0]);
                assert_geq(r2.score().score(), _minsc[1]);

                if(tpol.no_spliced_alignment()) {
                    assert_eq(r1.refid(), r2.refid());
                    // Order the two alignments by reference offset
                    const bool r1Upstream = r1.refoff() < r2.refoff();
                    const AlnRes& upstream   = r1Upstream ? r1 : r2;
                    const AlnRes& downstream = r1Upstream ? r2 : r1;
                    index_t off1 = upstream.refoff(),      off2 = downstream.refoff();
                    index_t len1 = upstream.refExtent(),   len2 = downstream.refExtent();
                    bool    fw1  = upstream.fw(),          fw2  = downstream.fw();
                    // Check that final mate alignments are consistent with
                    // paired-end fragment constraints
                    const int pairCl = pepol.peClassifyPair(
                        off1,
                        len1,
                        fw1,
                        off2,
                        len2,
                        fw2);
                    dna_frag_pass = (pairCl != PE_ALS_DISCORD);
                }
            }

            if(tpol.no_spliced_alignment() && !dna_frag_pass) {
                continue;
            }

            // Score needed for reporting: the best pair so far, raised to
            // the sum of the best unpaired scores minus a read-length
            // proportional allowance when both mates have a valid alignment
            TAlScore threshold = sink.bestPair();
            if(sink.bestUnp1() >= _minsc[0] && sink.bestUnp2() >= _minsc[1]) {
                TAlScore tmp = sink.bestUnp1() + sink.bestUnp2() - (r1.readLength() + r2.readLength()) * 0.03 * sc.mm(255);
                if(tmp > threshold) {
                    threshold = tmp;
                }
            }
            if(r1.score().score() + r2.score().score() >= threshold || rp.secondary) {
                sink.report(0, &r1, &r2);
            }
        }
    }
    return true;
}
/**
 * Report a read alignment, or a pair alignment when `ohit` is given.
 *
 * `hit` belongs to read `rdi`; `ohit`, when non-NULL, belongs to the other
 * read. A hit is rejected (false is returned, nothing is reported) when it
 * does not cover the whole read (allowing for soft trimming), scores below
 * _minsc, is a repeat hit while repeat reporting is off, or fails the
 * transcriptome policy checks (xs_only / transcriptome_mapping_only).
 *
 * The edit lists of `hit` and `ohit` are temporarily rewritten in place
 * (positions shifted by trim5 and, for reverse-strand hits, inverted) while
 * the AlnRes objects are built. They are restored before this function
 * returns on every path, including the policy rejections.
 *
 * @return false when the alignment was rejected; otherwise the value
 *         returned by sink.report()
 **/
template <typename index_t, typename local_index_t>
bool HI_Aligner<index_t, local_index_t>::reportHit(
    const Scoring& sc,
    const PairedEndPolicy& pepol, // paired-end policy
    const TranscriptomePolicy& tpol,
    const GraphPolicy& gpol,
    const GFM<index_t>& gfm,
    const ALTDB<index_t>& altdb,
    const RepeatDB<index_t>& repeatdb,
    const BitPairReference& ref,
    const SpliceSiteDB& ssdb,
    AlnSinkWrap<index_t>& sink,
    index_t rdi,
    const GenomeHit<index_t>& hit,
    bool alignMate,
    const GenomeHit<index_t>* ohit)
{
    assert_lt(rdi, 2);
    assert(_rds[rdi] != NULL);
    const Read& rd = *_rds[rdi];
    index_t rdlen = (index_t)rd.length();
    // The hit must span the whole read once soft-trimmed bases are counted
    if(hit.rdoff() - hit.trim5() > 0 || hit.len() + hit.trim5() + hit.trim3() < rdlen) return false;
    if(hit.score() < _minsc[rdi]) return false;
    if(!sink.reportingParams().repeat && hit.repeat()) return false;

    // Edits are represented from 5' end of read to 3' end, not an alignment of read
    // NOTE: this rewrites the hit's own edit list; it is undone below.
    EList<Edit>& edits = const_cast<EList<Edit>&>(hit.edits());
    if(hit.trim5() > 0) {
        for(size_t i = 0; i < edits.size(); i++) {
            edits[i].pos += hit.trim5();
        }
    }
    if(!hit.fw()) {
        Edit::invertPoss(edits, rdlen, false);
    }

    // From here on a rejection must not return directly: the edits have to
    // be restored first, so the decision is recorded in `rejected`.
    bool rejected = false;

    // in case of multiple exonic alignments, choose the ones near (known) splice sites
    // this helps eliminate cases of reads being mapped to pseudogenes
    pair<bool, bool> spliced = hit.spliced(); // pair<spliced, spliced_to_known>
    if(tpol.xs_only() && spliced.first) {
        if(hit.splicing_dir() == SPL_UNKNOWN) {
            rejected = true;
        }
    }
    if(!rejected && !tpol.no_spliced_alignment() && tpol.avoid_pseudogene()) {
        if(!spliced.first) {
            assert(!spliced.second);
            const index_t max_exon_size = 10000;
            index_t left = 0;
            if(hit.refoff() > max_exon_size) {
                left = hit.refoff() - max_exon_size;
            }
            index_t right = hit.refoff() + hit.len() + max_exon_size;
            spliced.first = ssdb.hasSpliceSites(
                hit.ref(),
                left,
                right,
                left,
                right,
                true); // include novel splice sites
            if(altdb.hasExons()) {
                spliced.second = ssdb.insideExon(hit.ref(), hit.refoff(), hit.refoff() + hit.len() - 1);
            }
        }
    }
    if(!rejected && tpol.transcriptome_mapping_only() && !spliced.second) {
        rejected = true;
    }

    AlnRes rs;
    if(!rejected) {
        AlnScore asc(
            hit.score(),        // numeric score
            hit.ns(),           // # Ns
            hit.ngaps(),        // # gaps
            hit.repeat(),
            hit.splicescore(),  // splice scorehit
            spliced.second,     // mapped to known transcripts?
            spliced.first,      // spliced alignment or near splice sites (novel)?
            hit.trim5(),        // left trim length
            hit.trim3());       // right trim length
        bool softTrim = hit.trim5() > 0 || hit.trim3() > 0;
        rs.init(
            rdlen,                      // # chars after hard trimming
            rd.rdid,                    // read ID
            asc,                        // alignment score
            &hit.edits(),               // nucleotide edits array
            0,                          // nucleotide edits first pos
            hit.edits().size(),         // nucleotide edits last pos
            NULL,                       // ambig base array
            0,                          // ambig base first pos
            0,                          // ambig base last pos
            hit.coord(),                // coord of leftmost aligned char in ref
            hit.repeat() ? gfm.plen()[0] : gfm.plen()[hit.ref()], // length of reference aligned to
            &_rawEdits,
            -1,                         // # seed mms allowed
            -1,                         // seed length
            -1,                         // seed interval
            0,                          // minimum score for valid alignment (daehwan)
            -1,                         // nuc5p (for colorspace)
            -1,                         // nuc3p (for colorspace)
            false,                      // soft pre-trimming?
            0,                          // 5p pre-trimming
            0,                          // 3p pre-trimming
            softTrim,                   // soft trimming?
            hit.fw() ? hit.trim5() : hit.trim3(),  // 5p trimming
            hit.fw() ? hit.trim3() : hit.trim5(),  // 3p trimming
            hit.repeat());              // repeat?
    }

    // Put the hit's edits back the way they were (reverse order of the
    // rewrite above), whether or not the hit was accepted
    if(!hit.fw()) {
        Edit::invertPoss(edits, rdlen, false);
    }
    if(hit.trim5() > 0) {
        for(size_t i = 0; i < edits.size(); i++) {
            edits[i].pos -= hit.trim5();
        }
    }
    if(rejected) {
        return false;
    }

    //rs.setRefNs(nrefn);
    /*assert(rs.matchesRef(
     rd,
     ref,
     tmp_rf_,
     tmp_rdseq_,
     tmp_qseq_,
     _sharedVars.raw_refbuf,
     _sharedVars.destU32,
     raw_matches_,
     _sharedVars.raw_refbuf2,
     _sharedVars.reflens,
     _sharedVars.refoffs));*/

    // Single-read report
    if(ohit == NULL) {
        bool done;
        if(rdi == 0 && !_rightendonly) {
            done = sink.report(0, &rs, NULL, alignMate);
        } else {
            done = sink.report(0, NULL, &rs, alignMate);
        }
        return done;
    }

    // Pair report: build the result for the other read the same way
    assert(ohit != NULL);
    const Read& ord = *_rds[1-rdi];
    index_t ordlen = (index_t)ord.length();
    if(ohit->rdoff() - ohit->trim5() > 0 || ohit->len() + ohit->trim5() + ohit->trim3() < ordlen) return false;
    if(ohit->score() < _minsc[1-rdi]) return false;
    EList<Edit>& oedits = const_cast<EList<Edit>&>(ohit->edits());
    if(ohit->trim5() > 0) {
        for(size_t i = 0; i < oedits.size(); i++) {
            oedits[i].pos += ohit->trim5();
        }
    }
    if(!ohit->fw()) {
        Edit::invertPoss(oedits, ordlen, false);
    }
    AlnScore oasc(
        ohit->score(),  // numeric score
        ohit->ns(),     // # Ns
        ohit->ngaps(),  // # gaps
        ohit->repeat()); // repeat?
    bool osoftTrim = ohit->trim5() > 0 || ohit->trim3() > 0;
    AlnRes ors;
    ors.init(
        ordlen,                     // # chars after hard trimming
        ord.rdid,                   // read ID
        oasc,                       // alignment score
        &ohit->edits(),             // nucleotide edits array
        0,                          // nucleotide edits first pos
        ohit->edits().size(),       // nucleotide edits last pos
        NULL,                       // ambig base array
        0,                          // ambig base first pos
        0,                          // ambig base last pos
        ohit->coord(),              // coord of leftmost aligned char in ref
        // same repeat guard as for `hit` above: ref() of a repeat hit is
        // not used to index plen()
        ohit->repeat() ? gfm.plen()[0] : gfm.plen()[ohit->ref()], // length of reference aligned to
        &_rawEdits,
        -1,                         // # seed mms allowed
        -1,                         // seed length
        -1,                         // seed interval
        0,                          // minimum score for valid alignment (daehwan)
        -1,                         // nuc5p (for colorspace)
        -1,                         // nuc3p (for colorspace)
        false,                      // soft pre-trimming?
        0,                          // 5p pre-trimming
        0,                          // 3p pre-trimming
        osoftTrim,                  // soft trimming?
        ohit->fw() ? ohit->trim5() : ohit->trim3(),  // 5p trimming
        ohit->fw() ? ohit->trim3() : ohit->trim5(),  // 3p trimming
        ohit->repeat());            // repeat?
    // Restore the other hit's edits
    if(!ohit->fw()) {
        Edit::invertPoss(oedits, ordlen, false);
    }
    if(ohit->trim5() > 0) {
        for(size_t i = 0; i < oedits.size(); i++) {
            oedits[i].pos -= ohit->trim5();
        }
    }
    //rs.setRefNs(nrefn);
    assert(ors.matchesRef(
        ord,
        ref,
        tmp_rf_,
        tmp_rdseq_,
        tmp_qseq_,
        _sharedVars.raw_refbuf,
        _sharedVars.destU32,
        raw_matches_,
        _sharedVars.raw_refbuf2,
        _sharedVars.reflens,
        _sharedVars.refoffs));
    // Mate 1 always goes first in the report
    bool done;
    if(rdi == 0) {
        done = sink.report(0, &rs, &ors);
    } else {
        done = sink.report(0, &ors, &rs);
    }
    return done;
}
/**
 * Check whether the position (tidx, toff) has already been examined, i.e.
 * whether it falls inside the reference span of an alignment already
 * reported for read `rdi` (both ends of the span included).
 **/
template <typename index_t, typename local_index_t>
bool HI_Aligner<index_t, local_index_t>::redundant(
    AlnSinkWrap<index_t>& sink,
    index_t               rdi,
    index_t               tidx,
    index_t               toff)
{
    assert_lt(rdi, 2);

    // Unpaired alignments reported so far for this read
    const EList<AlnRes>* reported = NULL;
    if(rdi == 0) {
        sink.getUnp1(reported);
    } else {
        sink.getUnp2(reported);
    }
    assert(reported != NULL);

    for(index_t ai = 0; ai < reported->size(); ai++) {
        Coord spanBegin = (*reported)[ai].refcoord();
        Coord spanEnd   = (*reported)[ai].refcoord_right();
        assert_eq(spanBegin.ref(), spanEnd.ref());
        assert_lt(spanBegin.off(), spanEnd.off());
        assert_eq(spanBegin.orient(), spanEnd.orient());
        const bool sameRef = !(tidx != spanBegin.ref());
        if(sameRef && toff >= spanBegin.off() && toff <= spanEnd.off()) {
            return true;
        }
    }
    return false;
}
/**
 * Check whether the alignment `hit` has already been examined, i.e.
 * whether an alignment already reported for read `rdi` starts at the same
 * reference coordinate and carries the same list of edits.
 *
 * The edits of a reverse-strand hit are inverted in place for the
 * comparison and inverted back before moving on.
 **/
template <typename index_t, typename local_index_t>
bool HI_Aligner<index_t, local_index_t>::redundant(
    AlnSinkWrap<index_t>&     sink,
    index_t                   rdi,
    const GenomeHit<index_t>& hit)
{
    assert_lt(rdi, 2);
    assert(_rds[rdi] != NULL);
    index_t rdlen = (index_t)_rds[rdi]->length();

    // Unpaired alignments reported so far for this read
    const EList<AlnRes>* reported = NULL;
    if(rdi == 0) {
        sink.getUnp1(reported);
    } else {
        sink.getUnp2(reported);
    }
    assert(reported != NULL);

    for(index_t ai = 0; ai < reported->size(); ai++) {
        const AlnRes& prev = (*reported)[ai];
        if(!(prev.refcoord() == hit.coord())) {
            continue;
        }
        const EList<Edit>& prevEdits = prev.ned();
        const EList<Edit>& hitEdits = hit.edits();
        if(prevEdits.size() != hitEdits.size()) {
            continue;
        }

        // Bring the hit's edits into the orientation used by AlnRes
        if(!hit.fw()) {
            Edit::invertPoss(const_cast<EList<Edit>&>(hitEdits), rdlen, false);
        }
        // daehwan: this is a temporary solution to compare edits
        size_t eidx = 0;
        while(eidx < prevEdits.size() && prevEdits[eidx] == hitEdits[eidx]) {
            eidx++;
        }
        // ... and put them back
        if(!hit.fw()) {
            Edit::invertPoss(const_cast<EList<Edit>&>(hitEdits), rdlen, false);
        }

        if(eidx >= prevEdits.size()) {
            assert_eq(eidx, prevEdits.size());
            return true;
        }
    }
    return false;
}
/**
* Sweep right-to-left and left-to-right using exact matching. Remember all
* the SA ranges encountered along the way. Report exact matches if there are
* any. Calculate a lower bound on the number of edits in an end-to-end
* alignment.
*/
template <typename index_t, typename local_index_t>
index_t HI_Aligner<index_t, local_index_t>::partialSearch(
const GFM<index_t>& gfm, // BWT index
const Read& read, // read to align
const Scoring& sc, // scoring scheme
const ReportingParams& rp,
bool fw,
size_t mineMax, // don't care about edit bounds > this
size_t& mineFw, // minimum # edits for forward read
size_t& mineRc, // minimum # edits for revcomp read
ReadBWTHit<index_t>& hit, // holds all the seed hits (and exact hit)
RandomSource& rnd, // pseudo-random source
bool& pseudogeneStop,
bool& anchorStop,
index_t maxHitLen)
{
bool pseudogeneStop_ = pseudogeneStop, anchorStop_ = anchorStop;
pseudogeneStop = anchorStop = false;
const index_t ftabLen = gfm.gh().ftabChars();
const bool linearFM = gfm.gh().linearFM();
SideLocus<index_t> tloc, bloc;
const index_t len = (index_t)read.length();
const BTDnaString& seq = fw ? read.patFw : read.patRc;
assert(!seq.empty());
size_t nelt = 0;
EList<BWTHit<index_t> >& partialHits = hit._partialHits;
index_t& cur = hit._cur;
assert_lt(cur, hit._len);
hit._numPartialSearch++;
index_t offset = cur;
index_t dep = offset;
pair<index_t, index_t> range(0, 0);
pair<index_t, index_t> rangeTemp(0, 0);
pair<index_t, index_t> node_range(0, 0);
pair<index_t, index_t> node_rangeTemp(0, 0);
_node_iedge_count.clear();
_tmp_node_iedge_count.clear();
index_t left = len - dep;
assert_gt(left, 0);
if(left < ftabLen + 1) {
cur = hit._len;
partialHits.expand();
partialHits.back().init((index_t)INDEX_MAX,
(index_t)INDEX_MAX,
(index_t)INDEX_MAX,
(index_t)INDEX_MAX,
_node_iedge_count,
fw,
(index_t)offset,
(index_t)(cur - offset));
hit.done(true);
return 0;
}
// Does N interfere with use of Ftab?
for(index_t i = 0; i < ftabLen; i++) {
int c = seq[len-dep-1-i];
if(c > 3) {
cur += (i+1);
partialHits.expand();
partialHits.back().init((index_t)INDEX_MAX,
(index_t)INDEX_MAX,
(index_t)INDEX_MAX,
(index_t)INDEX_MAX,
_node_iedge_count,
fw,
(index_t)offset,
(index_t)(cur - offset));
if(cur >= hit._len) {
hit.done(true);
}
return 0;
}
}
// Use ftab
gfm.ftabLoHi(seq, len - dep - ftabLen, false, range.first, range.second);
dep += ftabLen;
if(range.first >= range.second) {
cur = dep;
partialHits.expand();
partialHits.back().init((index_t)INDEX_MAX,
(index_t)INDEX_MAX,
(index_t)INDEX_MAX,
(index_t)INDEX_MAX,
_node_iedge_count,
fw,
(index_t)offset,
(index_t)(cur - offset));
if(cur >= hit._len) {
hit.done(true);
}
return 0;
}
index_t same_range = 0, similar_range = 0;
HIER_INIT_LOCS(range.first, range.second, tloc, bloc, gfm);
// Keep going
while(dep < len && dep - offset < maxHitLen) {
int c = seq[len-dep-1];
if(c > 3) {
rangeTemp.first = rangeTemp.second = 0;
node_rangeTemp.first = node_rangeTemp.second = 0;
_tmp_node_iedge_count.clear();
} else {
if(bloc.valid()) {
bwops_ += 2;
if(linearFM) {
rangeTemp = gfm.mapLF(tloc, bloc, c, &node_rangeTemp);
} else {
rangeTemp = gfm.mapGLF(tloc, bloc, c, &node_rangeTemp, &_tmp_node_iedge_count, (index_t)rp.kseeds);
}
} else {
bwops_++;
rangeTemp = gfm.mapGLF1(range.first, tloc, c, &node_rangeTemp);
if(rangeTemp.first + 1 < rangeTemp.second) {
assert_eq(node_rangeTemp.first + 1, node_rangeTemp.second);
_tmp_node_iedge_count.clear();
_tmp_node_iedge_count.expand();
_tmp_node_iedge_count.back().first = 0;
_tmp_node_iedge_count.back().second = rangeTemp.second - rangeTemp.first - 1;
}
}
}
if(rangeTemp.first >= rangeTemp.second) {
break;
}
if(pseudogeneStop_) {
if(node_rangeTemp.second - node_rangeTemp.first < node_range.second - node_range.first && node_range.second - node_range.first <= min<index_t>(5, (index_t)rp.khits)) {
static const index_t minLenForPseudogene = (index_t)_minK + 6;
if(dep - offset >= minLenForPseudogene && similar_range >= 5) {
hit._numUniqueSearch++;
pseudogeneStop = true;
break;
}
}
if(node_rangeTemp.second - node_rangeTemp.first != 1) {
if(node_rangeTemp.second - node_rangeTemp.first + 2 >= node_range.second - node_range.first) similar_range++;
else if(node_rangeTemp.second - node_rangeTemp.first + 4 < node_range.second - node_range.first) similar_range = 0;
} else {
pseudogeneStop_ = false;
}
}
if(anchorStop_) {
if(node_rangeTemp.second - node_rangeTemp.first != 1 && node_range.second - node_range.first == node_rangeTemp.second - node_rangeTemp.first) {
same_range++;
if(same_range >= 5) {
anchorStop_ = false;
}
} else {
same_range = 0;
}
if(dep - offset >= _minK + 8 && node_rangeTemp.second - node_rangeTemp.first >= 4) {
anchorStop_ = false;
}
}
range = rangeTemp;
node_range = node_rangeTemp;
if(_tmp_node_iedge_count.size() > 0) {
_node_iedge_count = _tmp_node_iedge_count;
_tmp_node_iedge_count.clear();
} else {
_node_iedge_count.clear();
}
dep++;
if(anchorStop_) {
if(dep - offset >= _minK + 12 && range.second - range.first == 1) {
hit._numUniqueSearch++;
anchorStop = true;
break;
}
}
HIER_INIT_LOCS(range.first, range.second, tloc, bloc, gfm);
}
// Done
if(range.first < range.second) {
assert_leq(node_range.second - node_range.first, range.second - range.first);
assert_gt(dep, offset);
assert_leq(dep, len);
partialHits.expand();
index_t hit_type = CANDIDATE_HIT;
if(anchorStop) hit_type = ANCHOR_HIT;
else if(pseudogeneStop) hit_type = PSEUDOGENE_HIT;
bool report = node_range.first < node_range.second;
if(node_range.second - node_range.first < range.second - range.first) {
if(_node_iedge_count.size() == 0) report = false;
}
if(report) {
#ifndef NDEBUG
if(node_range.second - node_range.first < range.second - range.first) {
ASSERT_ONLY(index_t add = 0);
for(index_t e = 0; e < _node_iedge_count.size(); e++) {
if(e > 0) {
assert_lt(_node_iedge_count[e-1].first, _node_iedge_count[e].first);
}
assert_gt(_node_iedge_count[e].second, 0);
add += _node_iedge_count[e].second;
}
assert_eq(node_range.second - node_range.first + add, range.second - range.first);
} else {
assert(_node_iedge_count.empty());
}
#endif
partialHits.back().init(range.first,
range.second,
node_range.first,
node_range.second,
_node_iedge_count,
fw,
(index_t)offset,
(index_t)(dep - offset),
hit_type);
} else {
_node_iedge_count.clear();
partialHits.back().init(INDEX_MAX,
INDEX_MAX,
INDEX_MAX,
INDEX_MAX,
_node_iedge_count,
fw,
(index_t)offset,
(index_t)(dep - offset),
hit_type);
}
nelt += (node_range.second - node_range.first);
cur = dep;
if(cur >= hit._len) {
if(hit_type == CANDIDATE_HIT) hit._numUniqueSearch++;
hit.done(true);
}
}
return (index_t)nelt;
}
/**
 * Exact-match search of part of a read against the global graph FM index.
 *
 * The strand sequence is read.patFw when fw is true, read.patRc otherwise.
 * Matching starts at sequence position hitoff and consumes characters at
 * decreasing positions (hitoff, hitoff-1, ...): the first ftabLen
 * characters are resolved with a single gfm.ftabLoHi() lookup, after which
 * the match is extended one character at a time with mapLF / mapGLF /
 * mapGLF1 until the range becomes empty, a character code > 3 is met, the
 * start of the sequence is reached, or the unique-stop condition fires.
 *
 * @param gfm              global index to search
 * @param read             read providing patFw / patRc
 * @param sc               not read by this function
 * @param rp               reporting parameters; rp.kseeds bounds both the
 *                         mapGLF call and the accepted number of node hits
 * @param fw               true: search read.patFw, false: read.patRc
 * @param hitoff           sequence position of the first character matched
 * @param hitlen           out: number of characters consumed (see below)
 * @param top, bot         out: [top, bot) index range of the final match
 * @param node_top, node_bot
 *                         out: [node_top, node_bot) node range of the match
 * @param node_iedge_count out: per-node extra-edge counts for the final
 *                         range; cleared on entry
 * @param rnd              not read by this function
 * @param uniqueStop       in: request stopping as soon as the range holds
 *                         exactly one element and at least _minK characters
 *                         have been matched; out: true iff that stop fired
 * @param maxHitLen        not read by this function
 *
 * @return node_bot - node_top when the final node range is non-empty and no
 *         larger than rp.kseeds; 0 otherwise.
 *
 * On the early-exit paths (too few characters left for the ftab, a
 * character code > 3 inside the ftab window, empty ftab range) hitlen is
 * set and 0 is returned without touching top/bot/node_top/node_bot.
 * When the final acceptance test fails (for example because the very first
 * extension after the ftab lookup yields an empty range, leaving node_range
 * at its initial (0, 0)), 0 is returned and none of hitlen, top, bot,
 * node_top, node_bot is written.
 *
 * NOTE(review): unlike localGFMSearch, the outputs are not zeroed on entry
 * and maxHitLen is ignored -- confirm callers initialise them and do not
 * rely on a length cap here.
 */
template <typename index_t, typename local_index_t>
index_t HI_Aligner<index_t, local_index_t>::globalGFMSearch(
const GFM<index_t>& gfm, // BWT index
const Read& read, // read to align
const Scoring& sc, // scoring scheme
const ReportingParams& rp,
bool fw,
index_t hitoff,
index_t& hitlen,
index_t& top,
index_t& bot,
index_t& node_top,
index_t& node_bot,
EList<pair<index_t, index_t> >& node_iedge_count,
RandomSource& rnd,
bool& uniqueStop,
index_t maxHitLen)
{
// Remember the caller's request, then reuse the reference as an output flag.
bool uniqueStop_ = uniqueStop;
uniqueStop = false;
const index_t ftabLen = gfm.gh().ftabChars();
const bool linearFM = gfm.gh().linearFM();
SideLocus<index_t> tloc, bloc;
const index_t len = (index_t)read.length();
size_t nelt = 0;
const BTDnaString& seq = fw ? read.patFw : read.patRc;
assert(!seq.empty());
// dep counts characters from the end of seq; seq[len-dep-1] is the next
// character to consume, so the first one consumed is seq[hitoff].
index_t offset = len - hitoff - 1;
index_t dep = offset;
// range / node_range hold the last committed match; the *Temp variants hold
// the candidate produced by the current extension step.
pair<index_t, index_t> range(0, 0);
pair<index_t, index_t> rangeTemp(0, 0);
pair<index_t, index_t> node_range(0, 0);
pair<index_t, index_t> node_rangeTemp(0, 0);
node_iedge_count.clear();
_tmp_node_iedge_count.clear();
// Characters still available to match (equals hitoff + 1).
index_t left = len - dep;
assert_gt(left, 0);
// Need the ftab window plus at least one further character.
if(left < ftabLen + 1) {
hitlen = left;
return 0;
}
// Does N interfere with use of Ftab?
for(index_t i = 0; i < ftabLen; i++) {
int c = seq[len-dep-1-i];
if(c > 3) {
// Report how many characters were looked at up to and including this one.
hitlen = (i+1);
return 0;
}
}
// Use ftab
gfm.ftabLoHi(seq, len - dep - ftabLen, false, range.first, range.second);
dep += ftabLen;
if(range.first >= range.second) {
hitlen = ftabLen;
return 0;
}
// HIER_INIT_LOCS is defined elsewhere; presumably it positions tloc/bloc for
// the given range (bloc is tested with valid() below) -- confirm.
HIER_INIT_LOCS(range.first, range.second, tloc, bloc, gfm);
// Keep going
while(dep < len) {
int c = seq[len-dep-1];
if(c > 3) {
// Character code > 3: force an empty candidate so the loop terminates below.
rangeTemp.first = rangeTemp.second = 0;
node_rangeTemp.first = node_rangeTemp.second = 0;
_tmp_node_iedge_count.clear();
} else {
if(bloc.valid()) {
// Two-locus step: extend using both tloc and bloc.
bwops_ += 2;
if(linearFM) {
rangeTemp = gfm.mapLF(tloc, bloc, c, &node_rangeTemp);
} else {
rangeTemp = gfm.mapGLF(tloc, bloc, c, &node_rangeTemp, &_tmp_node_iedge_count, (index_t)rp.kseeds);
}
} else {
// Single-locus step: only tloc is used.
bwops_++;
rangeTemp = gfm.mapGLF1(range.first, tloc, c, &node_rangeTemp);
if(rangeTemp.first + 1 < rangeTemp.second) {
// More than one index entry for a single node: record the surplus entries
// as one edge-count record for node offset 0.
assert_eq(node_rangeTemp.first + 1, node_rangeTemp.second);
_tmp_node_iedge_count.clear();
_tmp_node_iedge_count.expand();
_tmp_node_iedge_count.back().first = 0;
_tmp_node_iedge_count.back().second = rangeTemp.second - rangeTemp.first - 1;
}
}
}
// Extension failed: keep the previously committed range and stop.
if(rangeTemp.first >= rangeTemp.second) {
break;
}
// Commit the candidate as the current match.
range = rangeTemp;
node_range = node_rangeTemp;
if(_tmp_node_iedge_count.size() > 0) {
node_iedge_count = _tmp_node_iedge_count;
_tmp_node_iedge_count.clear();
} else {
node_iedge_count.clear();
}
dep++;
if(uniqueStop_) {
// Stop early once the match is a single index entry of length >= _minK.
if(range.second - range.first == 1 && dep - offset >= _minK) {
uniqueStop = true;
break;
}
}
HIER_INIT_LOCS(range.first, range.second, tloc, bloc, gfm);
}
// Done
// Only report when at least one node matched and the node count does not
// exceed rp.kseeds; otherwise the outputs are left unwritten.
if(node_range.first < node_range.second && node_range.second - node_range.first <= rp.kseeds) {
assert_leq(node_range.second - node_range.first, range.second - range.first);
#ifndef NDEBUG
// Debug-only consistency check: the edge-count records must be strictly
// ordered, positive, and account for the gap between range and node_range.
if(node_range.second - node_range.first < range.second - range.first) {
ASSERT_ONLY(index_t add = 0);
for(index_t e = 0; e < node_iedge_count.size(); e++) {
if(e > 0) {
assert_lt(node_iedge_count[e-1].first, node_iedge_count[e].first);
}
assert_gt(node_iedge_count[e].second, 0);
add += node_iedge_count[e].second;
}
assert_eq(node_range.second - node_range.first + add, range.second - range.first);
} else {
assert(node_iedge_count.empty());
}
#endif
assert_gt(dep, offset);
assert_leq(dep, len);
top = range.first; bot = range.second;
node_top = node_range.first; node_bot = node_range.second;
nelt += (node_bot - node_top);
hitlen = dep - offset;
}
return (index_t)nelt;
}
/**
 * Exact-match search of part of a read against a local graph FM index.
 *
 * The strand sequence is read.patFw when fw is true, read.patRc otherwise.
 * Matching starts at sequence position rdoff and consumes characters at
 * decreasing positions: the first ftabLen characters are resolved with one
 * gfm.ftabLoHi() lookup, then the match is extended one character at a
 * time with mapLF / mapGLF / mapGLF1 until the range becomes empty, a
 * character code > 3 is met, the start of the sequence is reached, the
 * unique-stop condition fires, or maxHitLen characters have been matched.
 *
 * @param gfm                    local index to search
 * @param read                   read providing patFw / patRc
 * @param sc                     not read by this function
 * @param rp                     reporting parameters; rp.kseeds is passed to
 *                               mapGLF and is the lower bound for maxHits
 * @param fw                     true: search read.patFw, false: read.patRc
 * @param rdoff                  sequence position of the first character
 *                               matched
 * @param hitlen                 out: number of characters consumed
 * @param top, bot               out: [top, bot) index range; zeroed on entry
 * @param node_top, node_bot     out: [node_top, node_bot) node range; zeroed
 *                               on entry
 * @param local_node_iedge_count out: per-node extra-edge counts for the
 *                               final range; cleared on entry
 * @param rnd                    not read by this function
 * @param uniqueStop             in: request stopping once the range holds
 *                               exactly one element and at least
 *                               minUniqueLen characters are matched;
 *                               out: true iff that stop fired
 * @param minUniqueLen           minimum match length for the unique stop
 * @param maxHitLen              stop extending once this many characters
 *                               have been matched
 * @param maxHits                largest accepted node count; raised to
 *                               rp.kseeds when smaller
 *
 * @return node_bot - node_top when the final node range is non-empty and no
 *         larger than maxHits; 0 otherwise.
 *
 * On the early-exit paths hitlen is set and 0 is returned. When the final
 * acceptance test fails, 0 is returned, hitlen is not written, and
 * top/bot/node_top/node_bot keep the zero written on entry.
 **/
template <typename index_t, typename local_index_t>
index_t HI_Aligner<index_t, local_index_t>::localGFMSearch(
const LocalGFM<local_index_t, index_t>& gfm, // GFM index
const Read& read, // read to align
const Scoring& sc, // scoring scheme
const ReportingParams& rp,
bool fw,
index_t rdoff,
index_t& hitlen,
local_index_t& top,
local_index_t& bot,
local_index_t& node_top,
local_index_t& node_bot,
EList<pair<local_index_t, local_index_t> >& local_node_iedge_count,
RandomSource& rnd,
bool& uniqueStop,
local_index_t minUniqueLen,
local_index_t maxHitLen,
local_index_t maxHits)
{
// Never accept fewer node hits than rp.kseeds allows.
maxHits = max<local_index_t>(maxHits, rp.kseeds);
// Remember the caller's request, then reuse the reference as an output flag.
bool uniqueStop_ = uniqueStop;
uniqueStop = false;
const local_index_t ftabLen = (local_index_t)gfm.gh().ftabChars();
const bool linearFM = gfm.gh().linearFM();
SideLocus<local_index_t> tloc, bloc;
const local_index_t len = (local_index_t)read.length();
size_t nelt = 0;
const BTDnaString& seq = fw ? read.patFw : read.patRc;
assert(!seq.empty());
// dep counts characters from the end of seq; seq[len-dep-1] is the next
// character to consume, so the first one consumed is seq[rdoff].
local_index_t offset = len - rdoff - 1;
local_index_t dep = offset;
// range / node_range hold the last committed match; the *Temp variants hold
// the candidate produced by the current extension step.
pair<local_index_t, local_index_t> range(0, 0);
pair<local_index_t, local_index_t> rangeTemp(0, 0);
pair<local_index_t, local_index_t> node_range(0, 0);
pair<local_index_t, local_index_t> node_rangeTemp(0, 0);
// Give the outputs a defined "no hit" value up front.
top = bot = node_top = node_bot = 0;
local_node_iedge_count.clear();
_tmp_local_node_iedge_count.clear();
// Characters still available to match (equals rdoff + 1).
local_index_t left = len - dep;
assert_gt(left, 0);
// Need the ftab window plus at least one further character.
if(left < ftabLen + 1) {
hitlen = left;
return 0;
}
// Does N interfere with use of Ftab?
for(local_index_t i = 0; i < ftabLen; i++) {
int c = seq[len-dep-1-i];
if(c > 3) {
// Report how many characters were looked at up to and including this one.
hitlen = i + 1;
return 0;
}
}
// Resolve the first ftabLen characters with a single table lookup.
gfm.ftabLoHi(seq, len - dep - ftabLen, false, range.first, range.second);
dep += ftabLen;
if(range.first >= range.second) {
hitlen = ftabLen;
return 0;
}
// LOCAL_INIT_LOCS is defined elsewhere; presumably it positions tloc/bloc for
// the given range (bloc is tested with valid() below) -- confirm.
LOCAL_INIT_LOCS(range.first, range.second, tloc, bloc, gfm);
// Keep going
while(dep < len) {
int c = seq[len-dep-1];
if(c > 3) {
// Character code > 3: force an empty candidate so the loop terminates below.
rangeTemp.first = rangeTemp.second = 0;
node_rangeTemp.first = node_rangeTemp.second = 0;
_tmp_local_node_iedge_count.clear();
} else {
if(bloc.valid()) {
// Two-locus step: extend using both tloc and bloc.
bwops_ += 2;
if(linearFM) {
rangeTemp = gfm.mapLF(tloc, bloc, c, &node_rangeTemp);
} else {
rangeTemp = gfm.mapGLF(tloc, bloc, c, &node_rangeTemp, &_tmp_local_node_iedge_count, rp.kseeds);
}
} else {
// Single-locus step: only tloc is used.
bwops_++;
rangeTemp = gfm.mapGLF1(range.first, tloc, c, &node_rangeTemp);
if(rangeTemp.first + 1 < rangeTemp.second) {
// More than one index entry for a single node: record the surplus entries
// as one edge-count record for node offset 0.
assert_eq(node_rangeTemp.first + 1, node_rangeTemp.second);
_tmp_local_node_iedge_count.clear();
_tmp_local_node_iedge_count.expand();
_tmp_local_node_iedge_count.back().first = 0;
_tmp_local_node_iedge_count.back().second = rangeTemp.second - rangeTemp.first - 1;
}
}
}
// Extension failed: keep the previously committed range and stop.
if(rangeTemp.first >= rangeTemp.second) {
break;
}
// Commit the candidate as the current match.
range = rangeTemp;
node_range = node_rangeTemp;
if(_tmp_local_node_iedge_count.size() > 0) {
local_node_iedge_count = _tmp_local_node_iedge_count;
_tmp_local_node_iedge_count.clear();
} else {
local_node_iedge_count.clear();
}
dep++;
if(uniqueStop_) {
// Stop early once the match is a single index entry of length >= minUniqueLen.
if(range.second - range.first == 1 && dep - offset >= minUniqueLen) {
uniqueStop = true;
break;
}
}
// Length cap: the character just matched is kept (range already committed).
if(dep - offset >= maxHitLen) break;
LOCAL_INIT_LOCS(range.first, range.second, tloc, bloc, gfm);
}
// Done
// Only report when at least one node matched and the node count does not
// exceed maxHits; otherwise the outputs keep their entry values.
if(node_range.first < node_range.second && node_range.second - node_range.first <= maxHits) {
assert_leq(node_range.second - node_range.first, range.second - range.first);
#ifndef NDEBUG
// Debug-only consistency check: the edge-count records must be strictly
// ordered, positive, and account for the gap between range and node_range.
// NOTE(review): `add` and `e` are index_t here although every other local
// in this function is local_index_t -- confirm this is intentional.
if(node_range.second - node_range.first < range.second - range.first) {
ASSERT_ONLY(index_t add = 0);
for(index_t e = 0; e < local_node_iedge_count.size(); e++) {
if(e > 0) {
assert_lt(local_node_iedge_count[e-1].first, local_node_iedge_count[e].first);
}
assert_gt(local_node_iedge_count[e].second, 0);
add += local_node_iedge_count[e].second;
}
assert_eq(node_range.second - node_range.first + add, range.second - range.first);
} else {
assert(local_node_iedge_count.empty());
}
#endif
assert_gt(dep, offset);
assert_leq(dep, len);
top = range.first; bot = range.second;
node_top = node_range.first; node_bot = node_range.second;
nelt += (node_bot - node_top);
hitlen = dep - offset;
}
return (index_t)nelt;
}
/**
 * Tell whether `hit` is covered by a hit already recorded for read slot
 * `rdi` (asserted to be less than 2).
 *
 * Walks _hits_searched[rdi] in order and stops at the first recorded hit
 * whose contains(hit) is true.
 *
 * @return true if such a recorded hit exists, false otherwise.
 **/
template <typename index_t, typename local_index_t>
bool HI_Aligner<index_t, local_index_t>::isSearched(
                                                    const GenomeHit<index_t>& hit,
                                                    index_t                   rdi)
{
    assert_lt(rdi, 2);
    EList<GenomeHit<index_t> >& recorded = _hits_searched[rdi];
    bool covered = false;
    index_t pos = 0;
    // Leave the loop as soon as one recorded hit covers the query.
    while(!covered && pos < recorded.size()) {
        covered = recorded[pos].contains(hit);
        ++pos;
    }
    return covered;
}
/**
 * Record `hit` as searched for read slot `rdi` (asserted to be less than 2).
 *
 * In assert-enabled builds the hit must not already be covered according
 * to isSearched(); the hit is then appended to _hits_searched[rdi].
 **/
template <typename index_t, typename local_index_t>
void HI_Aligner<index_t, local_index_t>::addSearched(
                                                     const GenomeHit<index_t>& hit,
                                                     index_t                   rdi)
{
    assert_lt(rdi, 2);
    assert(!isSearched(hit, rdi));
    _hits_searched[rdi].push_back(hit);
}
#endif /*HI_ALIGNER_H_*/