4772 lines
161 KiB
C++
4772 lines
161 KiB
C++
|
|
/*
|
|
* Copyright 2018, Chanhee Park <parkchanhee@gmail.com> and Daehwan Kim <infphilo@gmail.com>
|
|
*
|
|
* This file is part of HISAT 2.
|
|
*
|
|
* HISAT 2 is free software: you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation, either version 3 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* HISAT 2 is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with HISAT 2. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
#include <iostream>
|
|
#include <vector>
|
|
#include <algorithm>
|
|
#include <cmath>
|
|
#include "timer.h"
|
|
#include "aligner_sw.h"
|
|
#include "aligner_result.h"
|
|
#include "scoring.h"
|
|
#include "sstring.h"
|
|
|
|
#include "repeat_builder.h"
|
|
#include "repeat_kmer.h"
|
|
#include "bit_packed_array.h"
|
|
|
|
unsigned int levenshtein_distance(const std::string& s1, const std::string& s2)
|
|
{
|
|
const std::size_t len1 = s1.size(), len2 = s2.size();
|
|
std::vector<unsigned int> col(len2+1), prevCol(len2+1);
|
|
|
|
for (unsigned int i = 0; i < prevCol.size(); i++)
|
|
prevCol[i] = i;
|
|
for (unsigned int i = 0; i < len1; i++) {
|
|
col[0] = i+1;
|
|
for (unsigned int j = 0; j < len2; j++)
|
|
// note that std::min({arg1, arg2, arg3}) works only in C++11,
|
|
// for C++98 use std::min(std::min(arg1, arg2), arg3)
|
|
col[j+1] = std::min( std::min(prevCol[1 + j] + 1, col[j] + 1), prevCol[j] + (s1[i]==s2[j] ? 0 : 1) );
|
|
col.swap(prevCol);
|
|
}
|
|
return prevCol[len2];
|
|
}
|
|
|
|
|
|
unsigned int hamming_distance(const string& s1, const string& s2)
|
|
{
|
|
assert_eq(s1.length(), s2.length());
|
|
|
|
uint32_t cnt = 0;
|
|
|
|
for(size_t i = 0; i < s1.length(); i++) {
|
|
if(s1[i] != s2[i]) {
|
|
cnt++;
|
|
}
|
|
}
|
|
|
|
return cnt;
|
|
}
|
|
|
|
string reverse(const string& str)
|
|
{
|
|
string rev = str;
|
|
size_t str_len = str.length();
|
|
|
|
for(size_t i = 0; i < str_len; i++) {
|
|
rev[i] = str[str_len - i - 1];
|
|
}
|
|
|
|
return rev;
|
|
}
|
|
|
|
string reverse_complement(const string& str)
|
|
{
|
|
string rev = str;
|
|
size_t str_len = str.length();
|
|
|
|
for(size_t i = 0; i < str_len; i++) {
|
|
char nt = str[str_len - i - 1];
|
|
rev[i] = asc2dnacomp[nt];
|
|
}
|
|
|
|
return rev;
|
|
}
|
|
|
|
template<typename TStr>
|
|
TIndexOffU getLCP(const TStr& s,
|
|
CoordHelper& coordHelper,
|
|
TIndexOffU a,
|
|
TIndexOffU b,
|
|
TIndexOffU max_len = 1000)
|
|
{
|
|
TIndexOffU a_end = coordHelper.getEnd(a);
|
|
TIndexOffU b_end = coordHelper.getEnd(b);
|
|
|
|
assert_leq(a_end, s.length());
|
|
assert_leq(b_end, s.length());
|
|
|
|
TIndexOffU k = 0;
|
|
while((a + k) < a_end && (b + k) < b_end && k < max_len) {
|
|
if(s[a + k] != s[b + k]) {
|
|
break;
|
|
}
|
|
k++;
|
|
}
|
|
|
|
return k;
|
|
}
|
|
|
|
template<typename TStr>
|
|
bool isSameSequenceUpto(const TStr& s,
|
|
CoordHelper& coordHelper,
|
|
TIndexOffU a,
|
|
TIndexOffU b,
|
|
TIndexOffU upto)
|
|
{
|
|
TIndexOffU a_end = coordHelper.getEnd(a);
|
|
TIndexOffU b_end = coordHelper.getEnd(b);
|
|
assert_leq(a_end, s.length());
|
|
assert_leq(b_end, s.length());
|
|
|
|
if(a + upto > a_end || b + upto > b_end)
|
|
return false;
|
|
|
|
for(int k = upto - 1; k >= 0; k--) {
|
|
if(s[a + k] != s[b + k]) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool isSenseDominant(CoordHelper& coordHelper,
|
|
const EList<TIndexOffU>& positions,
|
|
size_t seed_len)
|
|
{
|
|
// count the number of seeds on the sense strand
|
|
size_t sense_mer_count = 0;
|
|
for(size_t i = 0; i < positions.size(); i++) {
|
|
if(positions[i] < coordHelper.forward_length())
|
|
sense_mer_count++;
|
|
}
|
|
|
|
// there exists two sets of seeds that are essentially the same
|
|
// due to sense and antisense strands except palindromic sequences
|
|
// choose one and skip the other one
|
|
assert_leq(sense_mer_count, positions.size());
|
|
size_t antisense_mer_count = positions.size() - sense_mer_count;
|
|
if(sense_mer_count < antisense_mer_count) {
|
|
return false;
|
|
} else if(sense_mer_count == antisense_mer_count) {
|
|
assert_geq(positions.back(), coordHelper.forward_length());
|
|
TIndexOffU sense_pos = coordHelper.length() - positions.back() - seed_len;
|
|
if(positions[0] > sense_pos) return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
static const Range EMPTY_RANGE = Range(1, 0);
|
|
|
|
struct RepeatRange {
|
|
RepeatRange() {
|
|
forward = true;
|
|
};
|
|
RepeatRange(Range r, int id) :
|
|
range(r), rg_id(id), forward(true) {};
|
|
RepeatRange(Range r, int id, bool fw) :
|
|
range(r), rg_id(id), forward(fw) {};
|
|
|
|
Range range;
|
|
int rg_id;
|
|
bool forward;
|
|
};
|
|
|
|
Range reverseRange(const Range& r, TIndexOffU size)
|
|
{
|
|
size_t len = r.second - r.first;
|
|
Range rc;
|
|
|
|
rc.first = size - r.second;
|
|
rc.second = rc.first + len;
|
|
|
|
return rc;
|
|
}
|
|
|
|
string reverseComplement(const string& str)
|
|
{
|
|
string rc;
|
|
for(TIndexOffU si = str.length(); si > 0; si--) {
|
|
rc.push_back(asc2dnacomp[str[si - 1]]);
|
|
}
|
|
return rc;
|
|
}
|
|
|
|
|
|
string toMDZ(const EList<Edit>& edits, const string& read)
|
|
{
|
|
StackedAln stk;
|
|
BTDnaString btread;
|
|
BTString buf;
|
|
|
|
btread.install(read.c_str(), true);
|
|
|
|
stk.init(btread, edits, 0, 0, 0, 0);
|
|
stk.buildMdz();
|
|
stk.writeMdz(&buf, NULL);
|
|
|
|
return string(buf.toZBuf());
|
|
}
|
|
|
|
string applyEdits(const string& ref, size_t read_len, const EList<Edit>& edits, const Coord& coord)
|
|
{
|
|
string read;
|
|
size_t ref_pos = coord.off();
|
|
size_t read_pos = 0;
|
|
|
|
for(size_t i = 0; i < edits.size(); i++) {
|
|
for(; read_pos < edits[i].pos; read_pos++, ref_pos++) {
|
|
read.push_back(ref[ref_pos]);
|
|
}
|
|
|
|
if(edits[i].type == EDIT_TYPE_READ_GAP) {
|
|
// delete on read
|
|
ref_pos++;
|
|
} else if(edits[i].type == EDIT_TYPE_REF_GAP) {
|
|
// insert on read
|
|
read.push_back(edits[i].qchr);
|
|
read_pos++;
|
|
} else if(edits[i].type == EDIT_TYPE_MM) {
|
|
assert_eq(ref[ref_pos], edits[i].chr);
|
|
read.push_back(edits[i].qchr);
|
|
|
|
read_pos++;
|
|
ref_pos++;
|
|
} else {
|
|
assert(false);
|
|
}
|
|
}
|
|
|
|
for(; read_pos < read_len; read_pos++, ref_pos++) {
|
|
read.push_back(ref[ref_pos]);
|
|
}
|
|
|
|
return read;
|
|
}
|
|
|
|
template<typename index_t>
|
|
static bool compareRepeatCoordByJoinedOff(const RepeatCoord<index_t>& a, const RepeatCoord<index_t>& b)
|
|
{
|
|
return a.joinedOff < b.joinedOff;
|
|
}
|
|
|
|
|
|
template<typename TStr>
|
|
string getString(const TStr& ref, TIndexOffU start, size_t len)
|
|
{
|
|
ASSERT_ONLY(const size_t ref_len = ref.length());
|
|
assert_leq(start + len, ref_len);
|
|
|
|
string s;
|
|
for(size_t i = 0; i < len; i++) {
|
|
char nt = "ACGT"[ref[start + i]];
|
|
s.push_back(nt);
|
|
}
|
|
|
|
return s;
|
|
}
|
|
|
|
template<typename TStr>
|
|
void getString(const TStr& ref, TIndexOffU start, size_t len, string& s)
|
|
{
|
|
s.clear();
|
|
ASSERT_ONLY(const size_t ref_len = ref.length());
|
|
assert_leq(start + len, ref_len);
|
|
for(size_t i = 0; i < len; i++) {
|
|
char nt = "ACGT"[ref[start + i]];
|
|
s.push_back(nt);
|
|
}
|
|
}
|
|
|
|
|
|
template<typename TStr>
|
|
inline uint8_t getSequenceBase(const TStr& ref, TIndexOffU pos)
|
|
{
|
|
assert_lt(pos, ref.length());
|
|
return (uint8_t)ref[pos];
|
|
}
|
|
|
|
|
|
template<typename TStr>
|
|
void dump_tstr(const TStr& s)
|
|
{
|
|
static int print_width = 60;
|
|
|
|
size_t s_len = s.length();
|
|
|
|
for(size_t i = 0; i < s_len; i += print_width) {
|
|
string buf;
|
|
for(size_t j = 0; (j < print_width) && (i + j < s_len); j++) {
|
|
buf.push_back("ACGTN"[s[i + j]]);
|
|
}
|
|
cerr << buf << endl;
|
|
}
|
|
cerr << endl;
|
|
}
|
|
|
|
|
|
size_t getMaxMatchLen(const EList<Edit>& edits, const size_t read_len)
|
|
{
|
|
size_t max_length = 0;
|
|
uint32_t last_edit_pos = 0;
|
|
uint32_t len = 0;
|
|
|
|
if (edits.size() == 0) {
|
|
// no edits --> exact match
|
|
return read_len;
|
|
}
|
|
|
|
for(size_t i = 0; i < edits.size(); i++) {
|
|
if (last_edit_pos > edits[i].pos) {
|
|
continue;
|
|
}
|
|
|
|
len = edits[i].pos - last_edit_pos;
|
|
if(len > max_length) {
|
|
max_length = len;
|
|
}
|
|
|
|
last_edit_pos = edits[i].pos + 1;
|
|
}
|
|
|
|
if (last_edit_pos < read_len) {
|
|
len = read_len - last_edit_pos;
|
|
if(len > max_length) {
|
|
max_length = len;
|
|
}
|
|
}
|
|
|
|
return max_length;
|
|
}
|
|
|
|
int max_index(size_t array[4])
|
|
{
|
|
int max_idx = 0;
|
|
|
|
for(size_t i = 1; i < 4; i++) {
|
|
if(array[max_idx] < array[i]) {
|
|
max_idx = i;
|
|
}
|
|
}
|
|
|
|
return max_idx;
|
|
}
|
|
|
|
CoordHelper::CoordHelper(TIndexOffU length,
|
|
TIndexOffU forward_length,
|
|
const EList<RefRecord>& szs,
|
|
const EList<string>& ref_names) :
|
|
length_(length),
|
|
forward_length_(forward_length),
|
|
szs_(szs),
|
|
ref_namelines_(ref_names)
|
|
{
|
|
// build ref_names_ from ref_namelines_
|
|
buildNames();
|
|
buildJoinedFragment();
|
|
}
|
|
|
|
CoordHelper::~CoordHelper()
|
|
{
|
|
}
|
|
|
|
void CoordHelper::buildNames()
|
|
{
|
|
ref_names_.resize(ref_namelines_.size());
|
|
for(size_t i = 0; i < ref_namelines_.size(); i++) {
|
|
string& nameline = ref_namelines_[i];
|
|
|
|
for(size_t j = 0; j < nameline.length(); j++) {
|
|
char n = nameline[j];
|
|
if(n == ' ') {
|
|
break;
|
|
}
|
|
ref_names_[i].push_back(n);
|
|
}
|
|
}
|
|
}
|
|
|
|
int CoordHelper::mapJoinedOffToSeq(TIndexOffU joinedOff)
|
|
{
|
|
/* search from cached_list */
|
|
if(num_cached_ > 0) {
|
|
for(size_t i = 0; i < num_cached_; i++) {
|
|
Fragments *frag = &cached_[i];
|
|
if(frag->contain(joinedOff)) {
|
|
return frag->frag_id;
|
|
}
|
|
}
|
|
/* fall through */
|
|
}
|
|
|
|
/* search list */
|
|
int top = 0;
|
|
int bot = fraglist_.size() - 1;
|
|
int pos = 0;
|
|
|
|
Fragments *frag = &fraglist_[pos];
|
|
while((bot - top) > 1) {
|
|
pos = top + ((bot - top) >> 1);
|
|
frag = &fraglist_[pos];
|
|
|
|
if (joinedOff < frag->joinedOff) {
|
|
bot = pos;
|
|
} else {
|
|
top = pos;
|
|
}
|
|
}
|
|
|
|
frag = &fraglist_[top];
|
|
if (frag->contain(joinedOff)) {
|
|
// update cache
|
|
if (num_cached_ < CACHE_SIZE_JOINEDFRG) {
|
|
cached_[num_cached_] = *frag;
|
|
num_cached_++;
|
|
} else {
|
|
cached_[victim_] = *frag;
|
|
victim_++; // round-robin
|
|
victim_ %= CACHE_SIZE_JOINEDFRG;
|
|
}
|
|
|
|
return top;
|
|
}
|
|
|
|
return -1;
|
|
}
|
|
|
|
int CoordHelper::getGenomeCoord(TIndexOffU joinedOff,
|
|
string& chr_name,
|
|
TIndexOffU& pos_in_chr)
|
|
{
|
|
int seq_id = mapJoinedOffToSeq(joinedOff);
|
|
if (seq_id < 0) {
|
|
return -1;
|
|
}
|
|
|
|
Fragments *frag = &fraglist_[seq_id];
|
|
TIndexOffU offset = joinedOff - frag->joinedOff;
|
|
|
|
pos_in_chr = frag->seqOff + offset;
|
|
chr_name = ref_names_[frag->seq_id];
|
|
|
|
return 0;
|
|
}
|
|
|
|
void CoordHelper::buildJoinedFragment()
|
|
{
|
|
TIndexOffU acc_joinedOff = 0;
|
|
TIndexOffU acc_seqOff = 0;
|
|
TIndexOffU seq_id = 0;
|
|
TIndexOffU frag_id = 0;
|
|
for(size_t i = 0; i < szs_.size(); i++) {
|
|
const RefRecord& rec = szs_[i];
|
|
if(rec.len == 0) {
|
|
continue;
|
|
}
|
|
if (rec.first) {
|
|
acc_seqOff = 0;
|
|
seq_id++;
|
|
}
|
|
|
|
fraglist_.expand();
|
|
|
|
fraglist_.back().joinedOff = acc_joinedOff;
|
|
fraglist_.back().length = rec.len;
|
|
|
|
fraglist_.back().frag_id = frag_id++;
|
|
fraglist_.back().seq_id = seq_id - 1;
|
|
fraglist_.back().first = rec.first;
|
|
|
|
acc_seqOff += rec.off;
|
|
fraglist_.back().seqOff = acc_seqOff;
|
|
|
|
acc_joinedOff += rec.len;
|
|
acc_seqOff += rec.len;
|
|
}
|
|
|
|
// Add Last Fragment(empty)
|
|
fraglist_.expand();
|
|
fraglist_.back().joinedOff = acc_joinedOff;
|
|
fraglist_.back().length = 0;
|
|
fraglist_.back().seqOff = acc_seqOff;
|
|
fraglist_.back().first = false;
|
|
fraglist_.back().frag_id = frag_id;
|
|
fraglist_.back().seq_id = seq_id;
|
|
}
|
|
|
|
TIndexOffU CoordHelper::getEnd(TIndexOffU e) {
|
|
assert_lt(e, length_)
|
|
|
|
TIndexOffU end = 0;
|
|
if(e < forward_length_) {
|
|
int frag_id = mapJoinedOffToSeq(e);
|
|
assert_geq(frag_id, 0);
|
|
end = fraglist_[frag_id].joinedOff + fraglist_[frag_id].length;
|
|
} else {
|
|
// ReverseComplement
|
|
// a, b are positions w.r.t reverse complement string.
|
|
// fragment map is based on forward string
|
|
int frag_id = mapJoinedOffToSeq(length_ - e - 1);
|
|
assert_geq(frag_id, 0);
|
|
end = length_ - fraglist_[frag_id].joinedOff;
|
|
}
|
|
|
|
assert_leq(end, length_);
|
|
return end;
|
|
}
|
|
|
|
TIndexOffU CoordHelper::getStart(TIndexOffU e) {
|
|
assert_lt(e, length_)
|
|
|
|
TIndexOffU start = 0;
|
|
if(e < forward_length_) {
|
|
int frag_id = mapJoinedOffToSeq(e);
|
|
assert_geq(frag_id, 0);
|
|
start = fraglist_[frag_id].joinedOff;
|
|
} else {
|
|
// ReverseComplement
|
|
// a, b are positions w.r.t reverse complement string.
|
|
// fragment map is based on forward string
|
|
int frag_id = mapJoinedOffToSeq(length_ - e - 1);
|
|
assert_geq(frag_id, 0);
|
|
start = length_ - (fraglist_[frag_id].joinedOff + fraglist_[frag_id].length);
|
|
}
|
|
|
|
assert_leq(start, length_);
|
|
return start;
|
|
}
|
|
|
|
template<typename TStr>
|
|
void SeedExt::getExtendedSeedSequence(const TStr& s,
|
|
string& seq) const
|
|
{
|
|
seq.clear();
|
|
TIndexOffU prev_end = orig_pos.first;
|
|
for(size_t j = 0; j < left_gaps.size(); j++) {
|
|
TIndexOffU curr_end = orig_pos.first - left_gaps[j].first;
|
|
assert_leq(curr_end, prev_end);
|
|
if(curr_end < prev_end) {
|
|
seq = getString(s, curr_end, prev_end - curr_end) + seq;
|
|
}
|
|
int gap_len = left_gaps[j].second;
|
|
assert_neq(gap_len, 0);
|
|
if(gap_len > 0) { // deletion
|
|
string gap_str(gap_len, '-');
|
|
seq = gap_str + seq;
|
|
} else {
|
|
curr_end += gap_len;
|
|
}
|
|
prev_end = curr_end;
|
|
}
|
|
assert_leq(pos.first, prev_end);
|
|
if(pos.first < prev_end) {
|
|
seq = getString(s, pos.first, prev_end - pos.first) + seq;
|
|
}
|
|
if(orig_pos.second - orig_pos.first > 0)
|
|
seq += getString(s, orig_pos.first, orig_pos.second - orig_pos.first);
|
|
TIndexOffU prev_begin = orig_pos.second;
|
|
for(size_t j = 0; j < right_gaps.size(); j++) {
|
|
TIndexOffU curr_begin = orig_pos.second + right_gaps[j].first;
|
|
assert_leq(prev_begin, curr_begin);
|
|
if(prev_begin < curr_begin) {
|
|
seq += getString(s, prev_begin, curr_begin - prev_begin);
|
|
}
|
|
int gap_len = right_gaps[j].second;
|
|
assert_neq(gap_len, 0);
|
|
if(gap_len > 0) { // deletion
|
|
string gap_str(gap_len, '-');
|
|
seq += gap_str;
|
|
} else {
|
|
curr_begin -= gap_len;
|
|
}
|
|
prev_begin = curr_begin;
|
|
}
|
|
assert_leq(prev_begin, pos.second);
|
|
if(prev_begin < pos.second) {
|
|
seq += getString(s, prev_begin, pos.second - prev_begin);
|
|
}
|
|
}
|
|
|
|
SeedSNP *lookup_add_SNP(EList<SeedSNP *>& repeat_snps, SeedSNP& snp)
|
|
{
|
|
for(size_t i = 0; i < repeat_snps.size(); i++) {
|
|
if(*repeat_snps[i] == snp) {
|
|
return repeat_snps[i];
|
|
}
|
|
}
|
|
|
|
repeat_snps.expand();
|
|
repeat_snps.back() = new SeedSNP();
|
|
*(repeat_snps.back()) = snp;
|
|
return repeat_snps.back();
|
|
}
|
|
|
|
template<typename TStr>
|
|
void SeedExt::generateSNPs(const TStr &s, const string& consensus, EList<SeedSNP *>& repeat_snps)
|
|
{
|
|
EList<pair<TIndexOffU, int> > gaps;
|
|
|
|
// Merge Gaps
|
|
{
|
|
TIndexOffU left_ext_len = getLeftExtLength();
|
|
for(size_t g = 0; g < left_gaps.size(); g++) {
|
|
gaps.expand();
|
|
gaps.back().first = left_ext_len - left_gaps[g].first + left_gaps[g].second;
|
|
gaps.back().second = left_gaps[g].second;
|
|
}
|
|
TIndexOffU right_base = orig_pos.second - pos.first;
|
|
for(size_t g = 0; g < right_gaps.size(); g++) {
|
|
gaps.expand();
|
|
gaps.back().first = right_base + right_gaps[g].first;
|
|
gaps.back().second = right_gaps[g].second;
|
|
}
|
|
gaps.sort();
|
|
}
|
|
|
|
//
|
|
string con_ref;
|
|
string seq_read;
|
|
TIndexOffU prev_con_pos = consensus_pos.first;
|
|
TIndexOffU prev_seq_pos = pos.first;
|
|
|
|
for(size_t g = 0; g < gaps.size(); g++) {
|
|
TIndexOffU curr_seq_pos = pos.first + gaps[g].first;
|
|
TIndexOffU curr_con_pos = prev_con_pos + (curr_seq_pos - prev_seq_pos);
|
|
|
|
con_ref = consensus.substr(prev_con_pos, curr_con_pos - prev_con_pos);
|
|
seq_read = getString(s, prev_seq_pos, curr_seq_pos - prev_seq_pos);
|
|
for(size_t l = 0; l < con_ref.length(); l++) {
|
|
if(seq_read[l] != con_ref[l]) {
|
|
// Single
|
|
SeedSNP snp;
|
|
snp.init(EDIT_TYPE_MM, prev_con_pos + l, 1, seq_read[l]);
|
|
|
|
// lookup repeat_snp
|
|
snps.expand();
|
|
snps.back() = lookup_add_SNP(repeat_snps, snp);
|
|
}
|
|
}
|
|
|
|
int gap_len = gaps[g].second;
|
|
assert_neq(gap_len, 0);
|
|
if(gap_len > 0) {
|
|
// Deletion (gap on read)
|
|
|
|
string gap_str(gap_len, '-');
|
|
|
|
SeedSNP snp;
|
|
snp.init(EDIT_TYPE_READ_GAP, curr_con_pos, gap_len, consensus.substr(curr_con_pos, gap_len));
|
|
|
|
snps.expand();
|
|
snps.back() = lookup_add_SNP(repeat_snps, snp);
|
|
|
|
curr_con_pos += gap_len;
|
|
|
|
} else {
|
|
// Insertion (gap on reference)
|
|
string gap_str(abs(gap_len), '-');
|
|
|
|
SeedSNP snp;
|
|
snp.init(EDIT_TYPE_REF_GAP, curr_con_pos, abs(gap_len), getString(s, curr_seq_pos, abs(gap_len)));
|
|
|
|
snps.expand();
|
|
snps.back() = lookup_add_SNP(repeat_snps, snp);
|
|
|
|
curr_seq_pos += abs(gap_len);
|
|
}
|
|
|
|
prev_con_pos = curr_con_pos;
|
|
prev_seq_pos = curr_seq_pos;
|
|
}
|
|
|
|
assert_eq(consensus_pos.second - prev_con_pos, pos.second - prev_seq_pos);
|
|
con_ref = consensus.substr(prev_con_pos, consensus_pos.second - prev_con_pos);
|
|
seq_read = getString(s, prev_seq_pos, pos.second - prev_seq_pos);
|
|
|
|
for(size_t l = 0; l < con_ref.length(); l++) {
|
|
if(seq_read[l] != con_ref[l]) {
|
|
// Single
|
|
|
|
SeedSNP snp;
|
|
snp.init(EDIT_TYPE_MM, prev_con_pos + l, 1, seq_read[l]);
|
|
|
|
snps.expand();
|
|
snps.back() = lookup_add_SNP(repeat_snps, snp);
|
|
}
|
|
}
|
|
}
|
|
|
|
bool seedCmp(const SeedExt& s, const SeedExt& s2)
|
|
{
|
|
if(s.getLength() != s2.getLength())
|
|
return s.getLength() > s2.getLength();
|
|
if(s.pos.first != s2.pos.first)
|
|
return s.pos.first < s2.pos.first;
|
|
if(s.pos.second != s2.pos.second)
|
|
return s.pos.second < s2.pos.second;
|
|
return false;
|
|
}
|
|
|
|
template<typename TStr>
|
|
void RB_Repeat::init(const RepeatParameter& rp,
|
|
const TStr& s,
|
|
CoordHelper& coordHelper,
|
|
const RB_SubSA& subSA,
|
|
const RB_RepeatBase& repeatBase)
|
|
{
|
|
consensus_ = repeatBase.seq;
|
|
assert_geq(consensus_.length(), rp.min_repeat_len);
|
|
assert_eq(consensus_.length(), rp.min_repeat_len + repeatBase.nodes.size() - 1);
|
|
|
|
EList<TIndexOffU> positions, next_positions;
|
|
|
|
const EList<TIndexOffU>& repeat_index = subSA.getRepeatIndex();
|
|
seeds_.clear();
|
|
for(size_t n = 0; n < repeatBase.nodes.size(); n++) {
|
|
TIndexOffU idx = repeatBase.nodes[n];
|
|
TIndexOffU saBegin = repeat_index[idx];
|
|
TIndexOffU saEnd = (idx + 1 < repeat_index.size() ? repeat_index[idx+1] : subSA.size());
|
|
|
|
next_positions.clear();
|
|
for(size_t sa = saBegin; sa < saEnd; sa++) {
|
|
TIndexOffU left = subSA.get(sa);
|
|
TIndexOffU right = left + subSA.seed_len();
|
|
next_positions.push_back(left);
|
|
|
|
if(left > 0) {
|
|
size_t idx = positions.bsearchLoBound(left - 1);
|
|
if(idx < positions.size() && positions[idx] == left - 1)
|
|
continue;
|
|
}
|
|
|
|
seeds_.expand();
|
|
SeedExt& seed = seeds_.back();
|
|
seed.reset();
|
|
seed.orig_pos = pair<TIndexOffU, TIndexOffU>(left, right);
|
|
seed.pos = seed.orig_pos;
|
|
seed.consensus_pos.first = n;
|
|
seed.consensus_pos.second = n + rp.min_repeat_len;
|
|
seed.bound = pair<TIndexOffU, TIndexOffU>(coordHelper.getStart(left), coordHelper.getEnd(left));
|
|
|
|
#ifndef NDEBUG
|
|
string tmp_str = getString(s, seed.pos.first, seed.pos.second - seed.pos.first);
|
|
assert_eq(tmp_str, consensus_.substr(n, seed.pos.second - seed.pos.first));
|
|
#endif
|
|
|
|
for(size_t p = seed.consensus_pos.second; p < consensus_.length(); p++) {
|
|
size_t pos = seed.pos.second;
|
|
if(pos >= seed.bound.second)
|
|
break;
|
|
int nt = getSequenceBase(s, pos);
|
|
if("ACGT"[nt] != consensus_[p])
|
|
break;
|
|
seed.pos.second++;
|
|
seed.consensus_pos.second++;
|
|
}
|
|
}
|
|
positions = next_positions;
|
|
}
|
|
|
|
internal_update();
|
|
}
|
|
|
|
template<typename TStr>
|
|
void RB_Repeat::extendConsensus(const RepeatParameter& rp,
|
|
const TStr& s)
|
|
{
|
|
size_t remain = seeds_.size();
|
|
EList<string> left_consensuses, right_consensuses, empty_consensuses;
|
|
EList<size_t> ed_seed_nums;
|
|
|
|
const TIndexOffU default_max_ext_len = (rp.max_repeat_len - rp.seed_len) / 2;
|
|
const TIndexOffU seed_mm = 1;
|
|
string left_ext_consensus = "", right_ext_consensus = "";
|
|
|
|
empty_consensuses.resize(seed_mm + 1);
|
|
empty_consensuses.fill("");
|
|
|
|
while(remain >= rp.repeat_count) {
|
|
for(size_t i = 0; i < remain; i++) {
|
|
seeds_[i].done = false;
|
|
seeds_[i].curr_ext_len = 0;
|
|
}
|
|
|
|
// extend seeds in left or right direction
|
|
TIndexOffU max_ext_len = min(default_max_ext_len, (TIndexOffU)(rp.max_repeat_len - consensus_.length()));
|
|
get_consensus_seq(s,
|
|
seeds_,
|
|
0, // seed begin
|
|
remain, // seed end
|
|
max_ext_len,
|
|
max_ext_len,
|
|
seed_mm,
|
|
rp,
|
|
ed_seed_nums,
|
|
&left_consensuses,
|
|
&right_consensuses);
|
|
|
|
size_t allowed_seed_mm = 0;
|
|
left_ext_consensus.clear();
|
|
right_ext_consensus.clear();
|
|
for(int i = 0 /* (int)seed_mm */; i >= 0; i--) {
|
|
size_t extlen = (ed_seed_nums[i] < rp.repeat_count ? 0 : (left_consensuses[i].length() + right_consensuses[i].length()));
|
|
// if(extlen <= 0 || extlen < max_ext_len * i / seed_mm)
|
|
//if(extlen < max_ext_len * 2)
|
|
// continue;
|
|
if(extlen <= 0)
|
|
continue;
|
|
|
|
left_ext_consensus = left_consensuses[i];
|
|
right_ext_consensus = right_consensuses[i];
|
|
allowed_seed_mm = (size_t)i;
|
|
assert_gt(left_ext_consensus.length(), 0);
|
|
assert_gt(right_ext_consensus.length(), 0);
|
|
break;
|
|
}
|
|
size_t num_passed_seeds = 0;
|
|
if(left_ext_consensus.length() > 0 && right_ext_consensus.length()) {
|
|
consensus_ = reverse(left_ext_consensus) + consensus_;
|
|
consensus_ += right_ext_consensus;
|
|
|
|
#if 0
|
|
calc_edit_dist(s,
|
|
seeds_,
|
|
0,
|
|
remain,
|
|
left ? consensuses : empty_consensuses,
|
|
left ? empty_consensuses : consensuses,
|
|
allowed_seed_mm);
|
|
#endif
|
|
|
|
// update seeds
|
|
for(size_t i = 0; i < seeds_.size(); i++) {
|
|
SeedExt& seed = seeds_[i];
|
|
|
|
if(i < remain) {
|
|
if(seed.ed <= allowed_seed_mm) {
|
|
num_passed_seeds++;
|
|
seed.done = true;
|
|
seed.total_ed += seed.ed;
|
|
seed.pos.first -= left_ext_consensus.length();
|
|
seed.pos.second += right_ext_consensus.length();
|
|
seed.consensus_pos.first = 0;
|
|
seed.consensus_pos.second = consensus_.length();
|
|
|
|
if(seed.left_gaps.size() > 0 &&
|
|
seed.left_gaps.back().first >= seed.orig_pos.first - seed.pos.first) {
|
|
int gap_len = seed.left_gaps.back().second;
|
|
seed.pos.first += gap_len;
|
|
}
|
|
if(seed.right_gaps.size() > 0 &&
|
|
seed.right_gaps.back().first >= seed.pos.second - seed.orig_pos.second) {
|
|
int gap_len = seed.right_gaps.back().second;
|
|
seed.pos.second -= gap_len;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// move up "done" seeds
|
|
size_t j = 0;
|
|
for(size_t i = 0; i < remain; i++) {
|
|
if(!seeds_[i].done) continue;
|
|
assert_geq(i, j);
|
|
if(i > j) {
|
|
SeedExt temp = seeds_[j];
|
|
seeds_[j] = seeds_[i];
|
|
seeds_[i] = temp;
|
|
// Find next "undone" seed
|
|
j++;
|
|
while(j < i && seeds_[j].done) {
|
|
j++;
|
|
}
|
|
assert(j < remain && !seeds_[j].done);
|
|
} else {
|
|
j = i + 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
remain = num_passed_seeds;
|
|
break;
|
|
} // while(remain >= rp.repeat_count)
|
|
|
|
#ifndef NDEBUG
|
|
// make sure seed positions are unique
|
|
EList<pair<TIndexOffU, TIndexOffU> > seed_poses;
|
|
for(size_t i = 0; i < seeds_.size(); i++) {
|
|
seed_poses.expand();
|
|
seed_poses.back() = seeds_[i].orig_pos;
|
|
}
|
|
seed_poses.sort();
|
|
for(size_t i = 0; i + 1 < seed_poses.size(); i++) {
|
|
if(seed_poses[i].first == seed_poses[i+1].first) {
|
|
assert_lt(seed_poses[i].second, seed_poses[i+1].second);
|
|
} else {
|
|
assert_lt(seed_poses[i].first, seed_poses[i+1].first);
|
|
}
|
|
}
|
|
|
|
for(size_t i = 0; i < seeds_.size(); i++) {
|
|
assert(seeds_[i].valid());
|
|
}
|
|
#endif
|
|
|
|
RB_Repeat::internal_update();
|
|
|
|
// check repeats within a repeat sequence
|
|
self_repeat_ = false;
|
|
for(size_t i = 0; i + 1 < seed_ranges_.size(); i++) {
|
|
const RB_AlleleCoord& range = seed_ranges_[i];
|
|
for(size_t j = i + 1; j < seed_ranges_.size(); j++) {
|
|
RB_AlleleCoord& range2 = seed_ranges_[j];
|
|
if(range.right <= range2.left)
|
|
break;
|
|
self_repeat_ = true;
|
|
}
|
|
}
|
|
}
|
|
|
|
template<typename TStr>
|
|
void RB_Repeat::getNextRepeat(const RepeatParameter& rp,
|
|
const TStr& s,
|
|
RB_Repeat& o)
|
|
{
|
|
o.reset();
|
|
|
|
size_t i = 0;
|
|
for(i = 0; i < seeds_.size() && seeds_[i].done; i++);
|
|
if(i >= seeds_.size())
|
|
return;
|
|
|
|
for(size_t j = i; j < seeds_.size(); j++) {
|
|
assert(!seeds_[j].done);
|
|
o.seeds_.expand();
|
|
o.seeds_.back() = seeds_[j];
|
|
|
|
}
|
|
seeds_.resizeExact(i);
|
|
internal_update();
|
|
|
|
assert_gt(o.seeds_.size(), 0);
|
|
o.consensus_ = getString(s, o.seeds_[0].orig_pos.first, o.seeds_[0].orig_pos.second - o.seeds_[0].orig_pos.first);
|
|
for(size_t j = 0; j < o.seeds_.size(); j++) {
|
|
o.seeds_[j].pos = o.seeds_[j].orig_pos;
|
|
o.seeds_[j].left_gaps.clear();
|
|
o.seeds_[j].right_gaps.clear();
|
|
o.seeds_[j].ed = o.seeds_[j].total_ed = 0;
|
|
}
|
|
}
|
|
|
|
void RB_Repeat::internal_update()
|
|
{
|
|
sort(seeds_.begin(), seeds_.end(), seedCmp);
|
|
|
|
seed_ranges_.resizeExact(seeds_.size());
|
|
for(size_t i = 0; i < seeds_.size(); i++) {
|
|
seed_ranges_[i].left = seeds_[i].pos.first;
|
|
seed_ranges_[i].right = seeds_[i].pos.second;
|
|
seed_ranges_[i].idx = i;
|
|
}
|
|
seed_ranges_.sort();
|
|
|
|
// check repeats within a repeat sequence
|
|
size_t remove_count = 0;
|
|
for(size_t i = 0; i + 1 < seed_ranges_.size(); i++) {
|
|
const RB_AlleleCoord& range = seed_ranges_[i];
|
|
if(range.left == numeric_limits<size_t>::max())
|
|
continue;
|
|
for(size_t j = i + 1; j < seed_ranges_.size(); j++) {
|
|
RB_AlleleCoord& range2 = seed_ranges_[j];
|
|
if(range2.left == numeric_limits<size_t>::max())
|
|
continue;
|
|
if(range.right <= range2.left)
|
|
break;
|
|
if(range.right >= range2.right) {
|
|
range2.left = numeric_limits<size_t>::max();
|
|
remove_count++;
|
|
}
|
|
}
|
|
}
|
|
|
|
if(remove_count <= 0)
|
|
return;
|
|
|
|
for(size_t i = 0; i < seed_ranges_.size(); i++) {
|
|
if(seed_ranges_[i].left == numeric_limits<size_t>::max())
|
|
seeds_[seed_ranges_[i].idx].reset();
|
|
}
|
|
sort(seeds_.begin(), seeds_.end(), seedCmp);
|
|
|
|
#ifndef NDEBUG
|
|
for(size_t i = 0; i < seeds_.size(); i++) {
|
|
if(i < seeds_.size() - remove_count) {
|
|
assert_lt(seeds_[i].pos.first, seeds_[i].pos.second);
|
|
} else {
|
|
assert_eq(seeds_[i].pos.first, seeds_[i].pos.second);
|
|
}
|
|
}
|
|
#endif
|
|
|
|
seeds_.resize(seeds_.size() - remove_count);
|
|
seed_ranges_.resize(seeds_.size());
|
|
for(size_t i = 0; i < seeds_.size(); i++) {
|
|
seed_ranges_[i].left = seeds_[i].pos.first;
|
|
seed_ranges_[i].right = seeds_[i].pos.second;
|
|
seed_ranges_[i].idx = i;
|
|
}
|
|
seed_ranges_.sort();
|
|
}
|
|
|
|
bool RB_Repeat::contain(TIndexOffU left, TIndexOffU right) const
|
|
{
|
|
size_t l = 0, r = seed_ranges_.size();
|
|
while(l < r) {
|
|
size_t m = (l + r) / 2;
|
|
const RB_AlleleCoord& coord = seed_ranges_[m];
|
|
if(right <= coord.left) {
|
|
r = m;
|
|
} else if(left >= coord.right) {
|
|
l = m + 1;
|
|
} else {
|
|
return coord.left <= left && right <= coord.right;
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
template<typename TStr>
|
|
void RB_Repeat::saveSeedExtension(const RepeatParameter& rp,
|
|
const TStr& s,
|
|
CoordHelper& coordHelper,
|
|
TIndexOffU grp_id,
|
|
ostream& fp,
|
|
size_t& total_repeat_seq_len,
|
|
size_t& total_allele_seq_len) const
|
|
{
|
|
// apply color, which is compatible with linux commands such as cat and less -r
|
|
#if 1
|
|
const string red = "\033[31m", reset = "\033[0m", resetline = "\033[4m", redline = "\033[4;31m";
|
|
#else
|
|
const string red = "", reset = "", resetline = "", redline = "";
|
|
#endif
|
|
bool show_exterior_seq = true;
|
|
const size_t max_show_seq_len = 700;
|
|
|
|
size_t total_count = 0;
|
|
for(size_t i = 0; i < seeds_.size(); i++) {
|
|
const SeedExt& seed = seeds_[i];
|
|
size_t ext_len = seed.getLength();
|
|
if(ext_len < rp.min_repeat_len) continue;
|
|
total_allele_seq_len += ext_len;
|
|
total_count++;
|
|
bool sense_strand = seed.pos.first < coordHelper.forward_length();
|
|
|
|
fp << setw(6) << repeat_id_ << " " << setw(5) << seeds_.size();
|
|
fp << " " << setw(4) << i;
|
|
fp << " " << setw(4) << ext_len;
|
|
fp << " " << (sense_strand ? '+' : '-');
|
|
fp << " " << setw(10) << seed.pos.first << " " << setw(10) << seed.pos.second;
|
|
fp << " " << setw(10) << seed.orig_pos.first << " " << setw(10) << seed.orig_pos.second;
|
|
|
|
string chr_name;
|
|
TIndexOffU pos_in_chr;
|
|
if(sense_strand) {
|
|
coordHelper.getGenomeCoord(seed.pos.first, chr_name, pos_in_chr);
|
|
} else {
|
|
coordHelper.getGenomeCoord(s.length() - seed.pos.first - (seed.pos.second - seed.pos.first), chr_name, pos_in_chr);
|
|
}
|
|
fp << " " << setw(5) << chr_name << ":" << setw(10) << std::left << pos_in_chr << std::right;
|
|
|
|
if(sense_strand) {
|
|
coordHelper.getGenomeCoord(seed.pos.second, chr_name, pos_in_chr);
|
|
} else {
|
|
coordHelper.getGenomeCoord(s.length() - seed.pos.second - (seed.pos.second - seed.pos.first), chr_name, pos_in_chr);
|
|
}
|
|
fp << " " << setw(5) << chr_name << ":" << setw(10) << std::left << pos_in_chr << std::right;
|
|
|
|
if(!seed.aligned) {
|
|
fp << endl;
|
|
continue;
|
|
}
|
|
|
|
string deststr = "";
|
|
seed.getExtendedSeedSequence(s, deststr);
|
|
|
|
// add exterior sequences
|
|
if(seed.consensus_pos.first > 0) {
|
|
TIndexOffU seq_pos, seq_len;
|
|
if(seed.pos.first >= seed.consensus_pos.first) {
|
|
seq_pos = seed.pos.first - seed.consensus_pos.first;
|
|
seq_len = seed.consensus_pos.first;
|
|
} else {
|
|
seq_pos = 0;
|
|
seq_len = seed.pos.first;
|
|
}
|
|
deststr = getString(s, seq_pos, seq_len) + deststr;
|
|
if(seq_len < seed.consensus_pos.first) {
|
|
deststr = string(seed.consensus_pos.first - seq_len, 'N') + deststr;
|
|
}
|
|
}
|
|
if(seed.consensus_pos.second < consensus_.length()) {
|
|
deststr += getString(s, seed.pos.second, consensus_.length() - seed.consensus_pos.second);
|
|
}
|
|
|
|
assert_eq(consensus_.length(), deststr.length());
|
|
fp << " ";
|
|
|
|
// print sequence w.r.t. the current group
|
|
for(size_t j = 0; j < min(consensus_.length(), max_show_seq_len); j++) {
|
|
bool outside = j < seed.consensus_pos.first || j >= seed.consensus_pos.second;
|
|
bool different = (consensus_[j] != deststr[j]);
|
|
if(outside) {
|
|
if(show_exterior_seq) {
|
|
if(different) {
|
|
fp << redline;
|
|
fp << deststr[j];
|
|
fp << reset;
|
|
} else {
|
|
fp << resetline;
|
|
fp << deststr[j];
|
|
fp << reset;
|
|
}
|
|
} else {
|
|
if(j < seed.consensus_pos.first)
|
|
fp << " ";
|
|
}
|
|
} else {
|
|
if(different) fp << red;
|
|
fp << deststr[j];
|
|
if(different) fp << reset;
|
|
}
|
|
}
|
|
|
|
#if 0
|
|
fp << "\t";
|
|
for(size_t ei = 0; ei < seed.edits.size(); ei++) {
|
|
const Edit& edit = seed.edits[ei];
|
|
if(ei > 0) fp << ",";
|
|
fp << edit;
|
|
if (edit.snpID != std::numeric_limits<uint32_t>::max()) {
|
|
fp << "@" << edit.snpID;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
fp << endl;
|
|
}
|
|
|
|
if(total_count > 0) fp << setw(5) << total_count << endl << endl;
|
|
total_repeat_seq_len += consensus_.length();
|
|
}
|
|
|
|
template<typename TStr>
|
|
size_t calc_edit_dist(const TStr& s,
|
|
const SeedExt& seed,
|
|
const SeedExt& seed2,
|
|
size_t left_ext,
|
|
size_t right_ext,
|
|
size_t max_ed)
|
|
{
|
|
if(seed.bound.first + left_ext > seed.pos.first ||
|
|
seed.pos.second + right_ext > seed.bound.second ||
|
|
seed2.bound.first + left_ext > seed2.pos.first ||
|
|
seed2.pos.second + right_ext > seed2.bound.second)
|
|
return max_ed + 1;
|
|
|
|
size_t ed = 0;
|
|
for(size_t i = 0; i < left_ext; i++) {
|
|
int ch = getSequenceBase(s, seed.pos.first - i - 1);
|
|
assert_range(0, 3, ch);
|
|
int ch2 = getSequenceBase(s, seed2.pos.first - i - 1);
|
|
assert_range(0, 3, ch2);
|
|
if(ch != ch2) ed++;
|
|
if(ed > max_ed)
|
|
return ed;
|
|
|
|
}
|
|
for(size_t i = 0; i < right_ext; i++) {
|
|
int ch = getSequenceBase(s, seed.pos.second + i);
|
|
assert_range(0, 3, ch);
|
|
int ch2 = getSequenceBase(s, seed2.pos.second + i);
|
|
assert_range(0, 3, ch2);
|
|
if(ch != ch2) ed++;
|
|
if(ed > max_ed)
|
|
return ed;
|
|
}
|
|
|
|
return ed;
|
|
}
|
|
|
|
size_t extract_kmer(const string& seq, size_t offset, size_t k = 5)
|
|
{
|
|
assert_leq(offset + k, seq.length());
|
|
size_t kmer = 0;
|
|
for(size_t i = 0; i < k; i++) {
|
|
kmer = (kmer << 2) | asc2dna[seq[offset + i]];
|
|
}
|
|
return kmer;
|
|
}
|
|
|
|
size_t next_kmer(size_t kmer, char base, size_t k = 5)
|
|
{
|
|
kmer &= ((1 << ((k-1)*2))) - 1;
|
|
kmer = (kmer << 2) | asc2dna[base];
|
|
return kmer;
|
|
}
|
|
|
|
void build_kmer_table(const string& consensus, EList<pair<size_t, size_t> >& kmer_table, size_t k = 5)
|
|
{
|
|
kmer_table.clear();
|
|
if(consensus.length() < k)
|
|
return;
|
|
size_t kmer = 0;
|
|
for(size_t i = 0; i + k <= consensus.length(); i++) {
|
|
if(i == 0) {
|
|
kmer = extract_kmer(consensus, i, k);
|
|
} else {
|
|
kmer = next_kmer(kmer, consensus[i+k-1], k);
|
|
}
|
|
kmer_table.expand();
|
|
kmer_table.back().first = kmer;
|
|
kmer_table.back().second = i;
|
|
}
|
|
kmer_table.sort();
|
|
}
|
|
|
|
void find_gap_pos(const string& s,
|
|
const string& s2,
|
|
EList<size_t>& ed,
|
|
EList<size_t>& ed2,
|
|
bool del,
|
|
size_t gap_len,
|
|
size_t max_mm,
|
|
size_t& gap_pos,
|
|
size_t& mm,
|
|
bool debug = false)
|
|
{
|
|
assert_eq(s.length(), s2.length());
|
|
size_t seq_len = s.length();
|
|
ed.resizeExact(seq_len); ed.fill(max_mm + 1);
|
|
ed2.resizeExact(seq_len); ed2.fill(max_mm + 1);
|
|
|
|
// from left to right
|
|
for(size_t i = 0; i < seq_len; i++) {
|
|
size_t add = (s[i] == s2[i] ? 0 : 1);
|
|
if(i == 0) ed[i] = add;
|
|
else ed[i] = ed[i-1] + add;
|
|
if(ed[i] >= max_mm + 1) break;
|
|
}
|
|
|
|
// from right to left
|
|
size_t s_sub = del ? 0 : gap_len;
|
|
size_t s2_sub = del ? gap_len : 0;
|
|
for(int i = seq_len - 1; i >= gap_len; i--) {
|
|
size_t add = (s[i - s_sub] == s2[i - s2_sub] ? 0 : 1);
|
|
if(i == seq_len - 1) ed2[i] = add;
|
|
else ed2[i] = ed2[i+1] + add;
|
|
if(ed2[i] > max_mm) break;
|
|
}
|
|
|
|
if(debug) {
|
|
cout << s << endl << s2 << endl;
|
|
for(size_t i = 0; i < ed.size(); i++) {
|
|
cout << (ed[i] % 10);
|
|
}
|
|
cout << endl;
|
|
for(size_t i = 0; i < ed2.size(); i++) {
|
|
cout << (ed2[i] % 10);
|
|
}
|
|
cout << endl;
|
|
}
|
|
|
|
size_t min_mm = ed2[gap_len];
|
|
int min_mm_i = -1;
|
|
assert_eq(ed.size(), ed2.size());
|
|
for(size_t i = 0; i + gap_len + 1 < ed.size(); i++) {
|
|
if(ed[i] > max_mm) break;
|
|
size_t cur_mm = ed[i] + ed2[i + gap_len + 1];
|
|
if(cur_mm < min_mm) {
|
|
min_mm = cur_mm;
|
|
min_mm_i = i;
|
|
}
|
|
}
|
|
|
|
mm = min_mm;
|
|
if(mm > max_mm)
|
|
return;
|
|
gap_pos = (size_t)(min_mm_i + 1);
|
|
}
|
|
|
|
void align_with_one_gap(const string& s,
|
|
const EList<pair<size_t, size_t> >& s_kmer_table,
|
|
const string& s2,
|
|
EList<size_t>& counts,
|
|
size_t max_gap,
|
|
size_t max_mm,
|
|
size_t& mm,
|
|
size_t& gap_pos,
|
|
int& gap_len,
|
|
size_t k = 5)
|
|
{
|
|
mm = max_mm + 1;
|
|
assert_eq(s.length(), s2.length());
|
|
counts.resizeExact(max_gap * 2 + 1);
|
|
counts.fillZero();
|
|
size_t max_count = 0, max_count_i = 0;
|
|
for(size_t i = 0; i + k <= s2.length(); i += 1) {
|
|
pair<size_t, size_t> kmer(0, 0);
|
|
kmer.first = extract_kmer(s2, i, k);
|
|
size_t lb = s_kmer_table.bsearchLoBound(kmer);
|
|
while(lb < s_kmer_table.size() && kmer.first == s_kmer_table[lb].first) {
|
|
int gap = (int)s_kmer_table[lb].second - (int)i;
|
|
if(gap != 0 && abs(gap) < max_gap) {
|
|
size_t gap_i = (size_t)(gap + max_gap);
|
|
counts[gap_i]++;
|
|
if(counts[gap_i] > max_count) {
|
|
max_count = counts[gap_i];
|
|
max_count_i = gap_i;
|
|
}
|
|
}
|
|
lb++;
|
|
}
|
|
}
|
|
|
|
if(max_count <= 0)
|
|
return;
|
|
|
|
assert_lt(max_count_i, counts.size());
|
|
int gap = (int)max_count_i - (int)max_gap;
|
|
assert_neq(gap, 0);
|
|
size_t abs_gap = abs(gap);
|
|
bool del = gap > 0;
|
|
|
|
EList<size_t> ed, ed2;
|
|
find_gap_pos(s,
|
|
s2,
|
|
ed,
|
|
ed2,
|
|
del,
|
|
abs_gap,
|
|
max_mm,
|
|
gap_pos,
|
|
mm);
|
|
if(mm > max_mm)
|
|
return;
|
|
|
|
gap_len = del ? abs_gap : -abs_gap;
|
|
|
|
#ifndef NDEBUG
|
|
assert_leq(mm, max_mm);
|
|
string ds = s, ds2 = s2;
|
|
size_t debug_mm = 0;
|
|
if(del) ds.erase(gap_pos, abs_gap);
|
|
else ds2.erase(gap_pos, abs_gap);
|
|
|
|
for(size_t i = 0; i < min(ds.length(), ds2.length()); i++) {
|
|
if(ds[i] != ds2[i])
|
|
debug_mm++;
|
|
}
|
|
assert_eq(debug_mm, mm);
|
|
#endif
|
|
}
|
|
|
|
template<typename TStr>
|
|
void calc_edit_dist(const TStr& s,
|
|
EList<SeedExt>& seeds,
|
|
size_t sb,
|
|
size_t se,
|
|
const EList<string>& left_consensuses,
|
|
const EList<string>& right_consensuses,
|
|
uint32_t max_ed)
|
|
{
|
|
string left_consensus = left_consensuses[max_ed];
|
|
string right_consensus = right_consensuses[max_ed];
|
|
|
|
EList<pair<size_t, size_t> > left_kmer_table, right_kmer_table;
|
|
build_kmer_table(left_consensus, left_kmer_table);
|
|
build_kmer_table(right_consensus, right_kmer_table);
|
|
|
|
string left_seq, right_seq;
|
|
const size_t max_gap = 10;
|
|
EList<size_t> counts;
|
|
|
|
size_t left_ext = left_consensus.length();
|
|
size_t right_ext = right_consensus.length();
|
|
for(size_t i = sb; i < se; i++) {
|
|
SeedExt& seed = seeds[i];
|
|
if(seed.bound.first + left_ext > seed.pos.first ||
|
|
seed.pos.second + right_ext > seed.bound.second) {
|
|
seed.ed = max_ed + 1;
|
|
continue;
|
|
}
|
|
|
|
size_t left_ed = 0;
|
|
if(left_ext > 0) {
|
|
getString(s, seed.pos.first - left_ext, left_ext, left_seq);
|
|
reverse(left_seq.begin(), left_seq.end());
|
|
for(size_t j = 0; j < left_ext; j++) {
|
|
if(left_seq[j] != left_consensus[j]) left_ed++;
|
|
if(left_ed <= max_ed && j < left_consensuses[left_ed].length()) {
|
|
seed.curr_ext_len = j + 1;
|
|
}
|
|
}
|
|
|
|
if(left_ed > max_ed) {
|
|
size_t gap_pos = 0;
|
|
int gap_len = 0;
|
|
align_with_one_gap(left_consensus,
|
|
left_kmer_table,
|
|
left_seq,
|
|
counts,
|
|
min(left_ed - max_ed, max_gap),
|
|
max_ed,
|
|
left_ed,
|
|
gap_pos,
|
|
gap_len);
|
|
if(left_ed <= max_ed) {
|
|
seed.left_gaps.expand();
|
|
seed.left_gaps.back().first = seed.getLeftExtLength() + gap_pos;
|
|
seed.left_gaps.back().second = gap_len;
|
|
}
|
|
}
|
|
} else {
|
|
left_seq.clear();
|
|
}
|
|
|
|
size_t right_ed = 0;
|
|
if(right_ext > 0) {
|
|
getString(s, seed.pos.second, right_ext, right_seq);
|
|
for(size_t j = 0; j < right_ext; j++) {
|
|
if(right_seq[j] != right_consensus[j]) right_ed++;
|
|
if(right_ed <= max_ed && j < right_consensuses[right_ed].length()) {
|
|
seed.curr_ext_len = j + 1;
|
|
}
|
|
}
|
|
if(right_ed > max_ed) {
|
|
size_t gap_pos = 0;
|
|
int gap_len = 0;
|
|
align_with_one_gap(right_consensus,
|
|
right_kmer_table,
|
|
right_seq,
|
|
counts,
|
|
min(right_ed - max_ed, max_gap),
|
|
max_ed,
|
|
right_ed,
|
|
gap_pos,
|
|
gap_len);
|
|
if(right_ed <= max_ed) {
|
|
seed.right_gaps.expand();
|
|
seed.right_gaps.back().first = seed.getRightExtLength() + gap_pos;
|
|
seed.right_gaps.back().second = gap_len;
|
|
}
|
|
}
|
|
} else {
|
|
right_seq.clear();
|
|
}
|
|
|
|
seed.ed = left_ed + right_ed;
|
|
}
|
|
}
|
|
|
|
template<typename TStr>
|
|
void RB_Repeat::get_consensus_seq(const TStr& s,
|
|
EList<SeedExt>& seeds,
|
|
size_t sb,
|
|
size_t se,
|
|
size_t min_left_ext,
|
|
size_t min_right_ext,
|
|
size_t max_ed,
|
|
const RepeatParameter& rp,
|
|
EList<size_t>& ed_seed_nums,
|
|
EList<string>* left_consensuses,
|
|
EList<string>* right_consensuses) const
|
|
{
|
|
assert_lt(sb, se);
|
|
assert_geq(se - sb, rp.seed_count);
|
|
assert_leq(se, seeds.size());
|
|
if(left_consensuses != NULL) {
|
|
left_consensuses->clear(); left_consensuses->resize(max_ed + 1);
|
|
for(size_t i = 0; i < max_ed + 1; i++) {
|
|
(*left_consensuses)[i].clear();
|
|
}
|
|
}
|
|
if(right_consensuses != NULL) {
|
|
right_consensuses->clear(); right_consensuses->resize(max_ed + 1);
|
|
for(size_t i = 0; i < max_ed + 1; i++) {
|
|
(*right_consensuses)[i].clear();
|
|
}
|
|
}
|
|
|
|
assert_eq(min_left_ext, min_right_ext);
|
|
|
|
EList<string> seqs;
|
|
seqs.reserveExact(seeds.size());
|
|
size_t max_i = 0, max_count = 0;
|
|
while(min_left_ext > 0) {
|
|
seqs.clear();
|
|
max_i = 0;
|
|
max_count = 0;
|
|
for(size_t i = sb; i < se; i++) {
|
|
const SeedExt& seed = seeds[i];
|
|
if(seeds[i].bound.first + min_left_ext > seed.pos.first ||
|
|
seed.pos.second + min_right_ext > seed.bound.second)
|
|
continue;
|
|
|
|
seqs.expand();
|
|
if(min_left_ext > 0) {
|
|
getString(s, seed.pos.first - min_left_ext, min_left_ext, seqs.back());
|
|
}
|
|
if(min_right_ext > 0) {
|
|
seqs.back() += getString(s, seed.pos.second, min_right_ext);
|
|
}
|
|
}
|
|
seqs.sort();
|
|
for(size_t i = 0; i + max_count < seqs.size();) {
|
|
size_t count = 1;
|
|
for(size_t j = i + 1; j < seqs.size(); j++) {
|
|
if(seqs[i] == seqs[j]) count++;
|
|
else break;
|
|
}
|
|
if(count >= max_count) {
|
|
max_count = count;
|
|
max_i = i;
|
|
}
|
|
i = i + count;
|
|
}
|
|
if(max_count >= rp.seed_count)
|
|
break;
|
|
|
|
min_left_ext--;
|
|
min_right_ext--;
|
|
}
|
|
|
|
// update ed
|
|
// extend consensus string
|
|
ed_seed_nums.resize(max_ed + 1); ed_seed_nums.fillZero();
|
|
EList<size_t> next_ed_seed_nums; next_ed_seed_nums.resize(max_ed + 1);
|
|
|
|
if(max_count < rp.seed_count)
|
|
return;
|
|
|
|
for(size_t i = sb; i < se; i++) seeds[i].ed = 0;
|
|
size_t seed_ext_len = 0;
|
|
while(seed_ext_len < max(min_left_ext, min_right_ext)) {
|
|
// select base to be used for extension
|
|
uint8_t left_ext_base = 0;
|
|
if(seed_ext_len < min_left_ext) left_ext_base = seqs[max_i][min_left_ext - seed_ext_len - 1];
|
|
uint8_t right_ext_base = 0;
|
|
if(seed_ext_len < min_right_ext) right_ext_base = seqs[max_i][min_left_ext + seed_ext_len];
|
|
|
|
// estimate extended ed
|
|
next_ed_seed_nums.fillZero();
|
|
for(size_t i = sb; i < se; i++) {
|
|
uint32_t est_ed = seeds[i].ed;
|
|
if(seed_ext_len < min_left_ext) {
|
|
if(seeds[i].bound.first + seed_ext_len + 1 <= seeds[i].pos.first) {
|
|
TIndexOffU left_pos = seeds[i].pos.first - seed_ext_len - 1;
|
|
int ch = getSequenceBase(s, left_pos);
|
|
assert_range(0, 3, ch);
|
|
if ("ACGT"[ch] != left_ext_base) {
|
|
est_ed++;
|
|
}
|
|
} else {
|
|
est_ed = max_ed + 1;
|
|
}
|
|
}
|
|
|
|
if(seed_ext_len < min_right_ext) {
|
|
TIndexOffU right_pos = seeds[i].pos.second + seed_ext_len;
|
|
if(right_pos >= seeds[i].bound.second) {
|
|
est_ed = max_ed + 1;
|
|
} else {
|
|
int ch = getSequenceBase(s, right_pos);
|
|
assert_range(0, 3, ch);
|
|
if ("ACGT"[ch] != right_ext_base) {
|
|
est_ed++;
|
|
}
|
|
}
|
|
}
|
|
|
|
if(est_ed <= max_ed) {
|
|
next_ed_seed_nums[est_ed]++;
|
|
}
|
|
}
|
|
|
|
for(size_t i = 1; i < next_ed_seed_nums.size(); i++) next_ed_seed_nums[i] += next_ed_seed_nums[i-1];
|
|
if(next_ed_seed_nums[max_ed] < rp.repeat_count) {
|
|
break;
|
|
}
|
|
|
|
for(size_t i = sb; i < se; i++) {
|
|
if(seed_ext_len < min_left_ext) {
|
|
if(seeds[i].bound.first + seed_ext_len + 1 <= seeds[i].pos.first) {
|
|
TIndexOffU left_pos = seeds[i].pos.first - seed_ext_len - 1;
|
|
int ch = getSequenceBase(s, left_pos);
|
|
assert_range(0, 3, ch);
|
|
if ("ACGT"[ch] != left_ext_base) {
|
|
seeds[i].ed++;
|
|
}
|
|
} else {
|
|
seeds[i].ed = max_ed + 1;
|
|
}
|
|
}
|
|
|
|
if(seed_ext_len < min_right_ext) {
|
|
TIndexOffU right_pos = seeds[i].pos.second + seed_ext_len;
|
|
if(right_pos >= seeds[i].bound.second) {
|
|
seeds[i].ed = max_ed + 1;
|
|
} else {
|
|
int ch = getSequenceBase(s, right_pos);
|
|
assert_range(0, 3, ch);
|
|
if ("ACGT"[ch] != right_ext_base) {
|
|
seeds[i].ed++;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
for(size_t i = 0; i < next_ed_seed_nums.size(); i++) {
|
|
if(next_ed_seed_nums[i] < rp.repeat_count)
|
|
continue;
|
|
ed_seed_nums[i] = next_ed_seed_nums[i];
|
|
if(seed_ext_len < min_left_ext) {
|
|
assert(left_consensuses != NULL);
|
|
(*left_consensuses)[i] += left_ext_base;
|
|
}
|
|
if(seed_ext_len < min_right_ext) {
|
|
assert(right_consensuses != NULL);
|
|
(*right_consensuses)[i] += right_ext_base;
|
|
}
|
|
}
|
|
|
|
seed_ext_len++;
|
|
}
|
|
}
|
|
|
|
template<typename TStr>
|
|
void RB_RepeatExt::get_consensus_seq(const TStr& s,
|
|
EList<SeedExt>& seeds,
|
|
size_t sb,
|
|
size_t se,
|
|
size_t min_left_ext,
|
|
size_t min_right_ext,
|
|
size_t max_ed,
|
|
const RepeatParameter& rp,
|
|
EList<size_t>& ed_seed_nums,
|
|
EList<string>* left_consensuses,
|
|
EList<string>* right_consensuses) const
|
|
{
|
|
assert_lt(sb, se);
|
|
assert_leq(se, seeds.size());
|
|
if(left_consensuses != NULL) {
|
|
left_consensuses->clear(); left_consensuses->resize(max_ed + 1);
|
|
for(size_t i = 0; i < max_ed + 1; i++) {
|
|
(*left_consensuses)[i].clear();
|
|
}
|
|
}
|
|
if(right_consensuses != NULL) {
|
|
right_consensuses->clear(); right_consensuses->resize(max_ed + 1);
|
|
for(size_t i = 0; i < max_ed + 1; i++) {
|
|
(*right_consensuses)[i].clear();
|
|
}
|
|
}
|
|
|
|
// cluster sequences
|
|
EList<size_t> belongto;
|
|
belongto.resizeExact(se - sb);
|
|
for(size_t i = 0; i < belongto.size(); i++) belongto[i] = i;
|
|
|
|
for(size_t i = 0; i + 1 < belongto.size(); i++) {
|
|
for(size_t j = i + 1; j < belongto.size(); j++) {
|
|
if(belongto[j] != j) continue;
|
|
size_t ed = calc_edit_dist(s,
|
|
seeds[sb + i],
|
|
seeds[sb + j],
|
|
min_left_ext,
|
|
min_right_ext,
|
|
max_ed + 1);
|
|
if(ed <= max_ed + 1) {
|
|
belongto[j] = belongto[i];
|
|
}
|
|
|
|
}
|
|
}
|
|
|
|
// find the maximum group that has the most sequences
|
|
EList<size_t> vote;
|
|
vote.resizeExact(seeds.size());
|
|
vote.fillZero();
|
|
size_t max_group = 0;
|
|
for(size_t i = 0; i < belongto.size(); i++) {
|
|
size_t cur_group = belongto[i];
|
|
assert_lt(cur_group, vote.size());
|
|
vote[cur_group]++;
|
|
if(cur_group != max_group && vote[cur_group] > vote[max_group]) {
|
|
max_group = cur_group;
|
|
}
|
|
}
|
|
|
|
// reuse vote to store seeds
|
|
EList<size_t>& consensus_group = vote;
|
|
consensus_group.clear();
|
|
for(size_t i = 0; i < belongto.size(); i++) {
|
|
if(belongto[i] == max_group)
|
|
consensus_group.push_back(i);
|
|
}
|
|
|
|
// update ed
|
|
// extend consensus string
|
|
ed_seed_nums.resize(max_ed + 1); ed_seed_nums.fillZero();
|
|
EList<size_t> next_ed_seed_nums; next_ed_seed_nums.resize(max_ed + 1);
|
|
for(size_t i = sb; i < se; i++) seeds[i].ed = 0;
|
|
size_t seed_ext_len = 0;
|
|
while(seed_ext_len < max(min_left_ext, min_right_ext)) {
|
|
// count base
|
|
size_t l_count[4] = {0, };
|
|
size_t r_count[4] = {0, };
|
|
|
|
for(size_t i = 0; i < consensus_group.size(); i++) {
|
|
size_t si = sb + consensus_group[i];
|
|
int ch;
|
|
if(seed_ext_len < min_left_ext) {
|
|
if(seeds[i].bound.first + seed_ext_len + 1 <= seeds[i].pos.first) {
|
|
ch = getSequenceBase(s, seeds[si].pos.first - seed_ext_len - 1);
|
|
assert_range(0, 3, ch);
|
|
l_count[ch]++;
|
|
}
|
|
}
|
|
if(seed_ext_len < min_right_ext) {
|
|
if(seeds[i].bound.second + seed_ext_len) {
|
|
ch = getSequenceBase(s, seeds[si].pos.second + seed_ext_len);
|
|
assert_range(0, 3, ch);
|
|
r_count[ch]++;
|
|
}
|
|
}
|
|
}
|
|
|
|
// select base to be used for extension
|
|
uint8_t left_ext_base = 0;
|
|
if(seed_ext_len < min_left_ext) left_ext_base = max_index(l_count);
|
|
uint8_t right_ext_base = 0;
|
|
if(seed_ext_len < min_right_ext) right_ext_base = max_index(r_count);
|
|
|
|
// estimate extended ed
|
|
next_ed_seed_nums.fillZero();
|
|
for(size_t i = sb; i < se; i++) {
|
|
uint32_t est_ed = seeds[i].ed;
|
|
if(seed_ext_len < min_left_ext) {
|
|
if(seeds[i].bound.first + seed_ext_len + 1 <= seeds[i].pos.first) {
|
|
TIndexOffU left_pos = seeds[i].pos.first - seed_ext_len - 1;
|
|
int ch = getSequenceBase(s, left_pos);
|
|
assert_range(0, 3, ch);
|
|
if (ch != left_ext_base) {
|
|
est_ed++;
|
|
}
|
|
} else {
|
|
est_ed = max_ed + 1;
|
|
}
|
|
}
|
|
|
|
if(seed_ext_len < min_right_ext) {
|
|
TIndexOffU right_pos = seeds[i].pos.second + seed_ext_len;
|
|
if(right_pos >= seeds[i].bound.second) {
|
|
est_ed = max_ed + 1;
|
|
} else {
|
|
int ch = getSequenceBase(s, right_pos);
|
|
assert_range(0, 3, ch);
|
|
if (ch != right_ext_base) {
|
|
est_ed++;
|
|
}
|
|
}
|
|
}
|
|
|
|
if(est_ed <= max_ed) {
|
|
next_ed_seed_nums[est_ed]++;
|
|
}
|
|
}
|
|
|
|
for(size_t i = 1; i < next_ed_seed_nums.size(); i++) next_ed_seed_nums[i] += next_ed_seed_nums[i-1];
|
|
if(next_ed_seed_nums[max_ed] < rp.repeat_count) {
|
|
break;
|
|
}
|
|
|
|
for(size_t i = sb; i < se; i++) {
|
|
if(seed_ext_len < min_left_ext) {
|
|
if(seeds[i].bound.first + seed_ext_len + 1 <= seeds[i].pos.first) {
|
|
TIndexOffU left_pos = seeds[i].pos.first - seed_ext_len - 1;
|
|
int ch = getSequenceBase(s, left_pos);
|
|
assert_range(0, 3, ch);
|
|
if (ch != left_ext_base) {
|
|
seeds[i].ed++;
|
|
}
|
|
} else {
|
|
seeds[i].ed = max_ed + 1;
|
|
}
|
|
}
|
|
|
|
if(seed_ext_len < min_right_ext) {
|
|
TIndexOffU right_pos = seeds[i].pos.second + seed_ext_len;
|
|
if(right_pos >= seeds[i].bound.second) {
|
|
seeds[i].ed = max_ed + 1;
|
|
} else {
|
|
int ch = getSequenceBase(s, right_pos);
|
|
assert_range(0, 3, ch);
|
|
if (ch != right_ext_base) {
|
|
seeds[i].ed++;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
for(size_t i = 0; i < next_ed_seed_nums.size(); i++) {
|
|
if(next_ed_seed_nums[i] < rp.repeat_count)
|
|
continue;
|
|
ed_seed_nums[i] = next_ed_seed_nums[i];
|
|
if(seed_ext_len < min_left_ext) {
|
|
assert(left_consensuses != NULL);
|
|
(*left_consensuses)[i] += (char)("ACGT"[left_ext_base]);
|
|
}
|
|
if(seed_ext_len < min_right_ext) {
|
|
assert(right_consensuses != NULL);
|
|
(*right_consensuses)[i] += (char)("ACGT"[right_ext_base]);
|
|
}
|
|
}
|
|
|
|
seed_ext_len++;
|
|
}
|
|
}
|
|
|
|
EList<size_t> RB_Repeat::ca_ed_;
|
|
EList<size_t> RB_Repeat::ca_ed2_;
|
|
string RB_Repeat::ca_s_;
|
|
string RB_Repeat::ca_s2_;
|
|
|
|
size_t RB_Repeat::seed_merge_tried = 0;
|
|
size_t RB_Repeat::seed_merged = 0;
|
|
|
|
template<typename TStr>
|
|
void RB_RepeatExt::extendConsensus(const RepeatParameter& rp,
|
|
const TStr& s)
|
|
{
|
|
size_t remain = seeds_.size();
|
|
EList<string> consensuses, empty_consensuses;
|
|
EList<size_t> ed_seed_nums;
|
|
bool left = true;
|
|
|
|
const TIndexOffU default_max_ext_len = 100;
|
|
const TIndexOffU seed_mm = 4;
|
|
string ext_consensus = "";
|
|
|
|
empty_consensuses.resize(seed_mm + 1);
|
|
empty_consensuses.fill("");
|
|
|
|
while(remain >= rp.repeat_count) {
|
|
for(size_t i = 0; i < remain; i++) {
|
|
seeds_[i].done = false;
|
|
seeds_[i].curr_ext_len = 0;
|
|
}
|
|
|
|
// extend seeds in left or right direction
|
|
TIndexOffU max_ext_len = min(default_max_ext_len, (TIndexOffU)(rp.max_repeat_len - consensus_.length()));
|
|
get_consensus_seq(s,
|
|
seeds_,
|
|
0, // seed begin
|
|
remain, // seed end
|
|
left ? max_ext_len : 0,
|
|
left ? 0 : max_ext_len,
|
|
seed_mm,
|
|
rp,
|
|
ed_seed_nums,
|
|
left ? &consensuses : NULL,
|
|
left ? NULL : &consensuses);
|
|
|
|
size_t allowed_seed_mm = 0;
|
|
ext_consensus.clear();
|
|
for(int i = (int)seed_mm; i >= 0; i--) {
|
|
size_t extlen = (ed_seed_nums[i] < rp.repeat_count ? 0 : consensuses[i].length());
|
|
if(extlen <= 0 || extlen < max_ext_len * i / seed_mm)
|
|
continue;
|
|
|
|
if(i > 0 && consensuses[i].length() <= consensuses[i-1].length() + 5)
|
|
continue;
|
|
|
|
ext_consensus = consensuses[i];
|
|
allowed_seed_mm = (size_t)i;
|
|
assert(ext_consensus.length() > 0);
|
|
break;
|
|
}
|
|
size_t num_passed_seeds = 0;
|
|
if(ext_consensus.length() > 0) {
|
|
if(left) consensus_ = reverse(ext_consensus) + consensus_;
|
|
else consensus_ += ext_consensus;
|
|
|
|
calc_edit_dist(s,
|
|
seeds_,
|
|
0,
|
|
remain,
|
|
left ? consensuses : empty_consensuses,
|
|
left ? empty_consensuses : consensuses,
|
|
allowed_seed_mm);
|
|
|
|
// update seeds
|
|
for(size_t i = 0; i < seeds_.size(); i++) {
|
|
SeedExt& seed = seeds_[i];
|
|
|
|
if(i < remain) {
|
|
if(seed.ed <= allowed_seed_mm) {
|
|
num_passed_seeds++;
|
|
seed.done = true;
|
|
seed.total_ed += seed.ed;
|
|
if(left) {
|
|
if(seed.left_gaps.size() > 0 &&
|
|
seed.left_gaps.back().first >= seed.orig_pos.first - seed.pos.first) {
|
|
int gap_len = seed.left_gaps.back().second;
|
|
seed.pos.first += gap_len;
|
|
}
|
|
seed.pos.first -= ext_consensus.length();
|
|
seed.consensus_pos.first = 0;
|
|
seed.consensus_pos.second = consensus_.length();
|
|
} else {
|
|
if(seed.right_gaps.size() > 0 &&
|
|
seed.right_gaps.back().first >= seed.pos.second - seed.orig_pos.second) {
|
|
int gap_len = seed.right_gaps.back().second;
|
|
seed.pos.second -= gap_len;
|
|
}
|
|
seed.pos.second += ext_consensus.length();
|
|
seed.consensus_pos.second = consensus_.length();
|
|
}
|
|
} else {
|
|
if(left) {
|
|
assert_leq(seed.curr_ext_len, ext_consensus.length());
|
|
TIndexOffU adjust = ext_consensus.length() - seed.curr_ext_len;
|
|
seed.consensus_pos.first += adjust;
|
|
seed.consensus_pos.second += ext_consensus.length();
|
|
assert_leq(seed.curr_ext_len, seed.pos.first);
|
|
seed.pos.first -= seed.curr_ext_len;
|
|
} else {
|
|
assert_leq(seed.curr_ext_len, ext_consensus.length());
|
|
seed.consensus_pos.second += seed.curr_ext_len;
|
|
seed.pos.second += seed.curr_ext_len;
|
|
}
|
|
}
|
|
} else {
|
|
if(left) {
|
|
seed.consensus_pos.first += ext_consensus.length();
|
|
seed.consensus_pos.second += ext_consensus.length();
|
|
}
|
|
}
|
|
}
|
|
|
|
// move up "done" seeds
|
|
size_t j = 0;
|
|
for(size_t i = 0; i < remain; i++) {
|
|
if(!seeds_[i].done) continue;
|
|
assert_geq(i, j);
|
|
if(i > j) {
|
|
SeedExt temp = seeds_[j];
|
|
seeds_[j] = seeds_[i];
|
|
seeds_[i] = temp;
|
|
// Find next "undone" seed
|
|
j++;
|
|
while(j < i && seeds_[j].done) {
|
|
j++;
|
|
}
|
|
assert(j < remain && !seeds_[j].done);
|
|
} else {
|
|
j = i + 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
remain = num_passed_seeds;
|
|
if(remain < rp.repeat_count) {
|
|
if(left) {
|
|
left = false;
|
|
remain = seeds_.size();
|
|
}
|
|
}
|
|
} // while(remain >= rp.repeat_count)
|
|
|
|
#ifndef NDEBUG
|
|
// make sure seed positions are unique
|
|
EList<pair<TIndexOffU, TIndexOffU> > seed_poses;
|
|
for(size_t i = 0; i < seeds_.size(); i++) {
|
|
seed_poses.expand();
|
|
seed_poses.back() = seeds_[i].orig_pos;
|
|
}
|
|
seed_poses.sort();
|
|
for(size_t i = 0; i + 1 < seed_poses.size(); i++) {
|
|
if(seed_poses[i].first == seed_poses[i+1].first) {
|
|
assert_lt(seed_poses[i].second, seed_poses[i+1].second);
|
|
} else {
|
|
assert_lt(seed_poses[i].first, seed_poses[i+1].first);
|
|
}
|
|
}
|
|
|
|
for(size_t i = 0; i < seeds_.size(); i++) {
|
|
assert(seeds_[i].valid());
|
|
}
|
|
#endif
|
|
|
|
internal_update();
|
|
|
|
// check repeats within a repeat sequence
|
|
self_repeat_ = false;
|
|
for(size_t i = 0; i + 1 < seed_ranges_.size(); i++) {
|
|
const RB_AlleleCoord& range = seed_ranges_[i];
|
|
for(size_t j = i + 1; j < seed_ranges_.size(); j++) {
|
|
RB_AlleleCoord& range2 = seed_ranges_[j];
|
|
if(range.right <= range2.left)
|
|
break;
|
|
self_repeat_ = true;
|
|
}
|
|
}
|
|
}
|
|
|
|
bool RB_Repeat::overlap(const RB_Repeat& o,
|
|
bool& contain,
|
|
bool& left,
|
|
size_t& seed_i,
|
|
size_t& seed_j,
|
|
bool debug) const
|
|
{
|
|
contain = left = false;
|
|
seed_i = seed_j = 0;
|
|
size_t p = 0, p2 = 0;
|
|
while(p < seed_ranges_.size() && p2 < o.seed_ranges_.size()) {
|
|
const RB_AlleleCoord& range = seed_ranges_[p];
|
|
const SeedExt& seed = seeds_[range.idx];
|
|
Range range_ = seed.getExtendedRange(consensus_.length());
|
|
RB_AlleleCoord ex_range(range_.first, range_.second, p);
|
|
bool representative = (float)range.len() >= consensus_.length() * 0.95f;
|
|
|
|
const RB_AlleleCoord& range2 = o.seed_ranges_[p2];
|
|
const SeedExt& seed2 = o.seeds_[range2.idx];
|
|
Range range2_ = seed2.getExtendedRange(o.consensus().length());
|
|
RB_AlleleCoord ex_range2(range2_.first, range2_.second, p2);
|
|
bool representative2 = (float)range2.len() >= o.consensus_.length() * 0.95f;
|
|
|
|
seed_i = range.idx;
|
|
seed_j = range2.idx;
|
|
|
|
const size_t relax = 5;
|
|
if(representative && representative2) {
|
|
if(ex_range.overlap_len(ex_range2) > 0) {
|
|
if(ex_range.contain(ex_range2, relax)) {
|
|
contain = true;
|
|
left = true;
|
|
} else if(ex_range2.contain(ex_range, relax)) {
|
|
contain = true;
|
|
left = false;
|
|
} else {
|
|
left = ex_range.left <= ex_range2.left;
|
|
}
|
|
return true;
|
|
}
|
|
} else if(representative) {
|
|
if(range2.contain(ex_range, relax)) {
|
|
contain = true;
|
|
left = false;
|
|
return true;
|
|
}
|
|
} else if(representative2) {
|
|
if(range.contain(ex_range2, relax)) {
|
|
contain = true;
|
|
left = true;
|
|
return true;
|
|
}
|
|
}
|
|
|
|
if(range.right <= range2.right) p++;
|
|
if(range2.right <= range.right) p2++;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
void RB_Repeat::showInfo(const RepeatParameter& rp,
|
|
CoordHelper& coordHelper) const
|
|
{
|
|
cerr << "\trepeat id: " << repeat_id_ << endl;
|
|
cerr << "\tnumber of alleles: " << seeds_.size() << endl;
|
|
cerr << "\tconsensus length: " << consensus_.length() << endl;
|
|
cerr << consensus_ << endl;
|
|
for(size_t i = 0; i < seeds_.size(); i++) {
|
|
const SeedExt& seed = seeds_[i];
|
|
size_t ext_len = seed.getLength();
|
|
if(ext_len < rp.min_repeat_len) continue;
|
|
bool sense_strand = seed.pos.first < coordHelper.forward_length();
|
|
|
|
cerr << "\t\t" << setw(4) << i << " " << setw(4) << ext_len;
|
|
cerr << " " << (sense_strand ? '+' : '-');
|
|
cerr << " " << setw(10) << seed.pos.first << " " << setw(10) << seed.pos.second;
|
|
cerr << " " << setw(4) << seed.consensus_pos.first << " " << setw(4) << seed.consensus_pos.second;
|
|
cerr << " " << (seed.aligned ? "aligned" : "unaligned");
|
|
cerr << endl;
|
|
}
|
|
}
|
|
|
|
template<typename TStr>
|
|
void RB_Repeat::generateSNPs(const RepeatParameter& rp, const TStr& s, TIndexOffU grp_id) {
|
|
EList<SeedExt>& seeds = this->seeds();
|
|
|
|
for(size_t i = 0; i < seeds.size(); i++) {
|
|
SeedExt& seed = seeds[i];
|
|
|
|
if(!seed.aligned) {
|
|
continue;
|
|
}
|
|
|
|
if(seed.getLength() < rp.min_repeat_len) {
|
|
continue;
|
|
}
|
|
assert_eq(seed.getLength(), seed.pos.second - seed.pos.first);
|
|
seed.generateSNPs(s, consensus(), snps());
|
|
}
|
|
if(snps().size() > 0) {
|
|
sort(snps_.begin(), snps().end(), SeedSNP::cmpSeedSNPByPos);
|
|
}
|
|
}
|
|
|
|
void RB_Repeat::saveSNPs(ofstream &fp, TIndexOffU grp_id, TIndexOffU& snp_id_base)
|
|
{
|
|
string chr_name = "rep" + to_string(repeat_id_);
|
|
|
|
for(size_t j = 0; j < snps_.size(); j++) {
|
|
SeedSNP& snp = *snps_[j];
|
|
// assign SNPid
|
|
snp.id = (snp_id_base++);
|
|
|
|
fp << "rps" << snp.id;
|
|
fp << "\t";
|
|
|
|
if(snp.type == EDIT_TYPE_MM) {
|
|
fp << "single";
|
|
} else if(snp.type == EDIT_TYPE_READ_GAP) {
|
|
fp << "deletion";
|
|
} else if(snp.type == EDIT_TYPE_REF_GAP) {
|
|
fp << "insertion";
|
|
} else {
|
|
assert(false);
|
|
}
|
|
fp << "\t";
|
|
|
|
fp << chr_name;
|
|
fp << "\t";
|
|
|
|
fp << snp.pos;
|
|
fp << "\t";
|
|
if(snp.type == EDIT_TYPE_MM || snp.type == EDIT_TYPE_REF_GAP) {
|
|
fp << snp.base;
|
|
} else if(snp.type == EDIT_TYPE_READ_GAP) {
|
|
fp << snp.len;
|
|
} else {
|
|
assert(false);
|
|
}
|
|
fp << endl;
|
|
}
|
|
}
|
|
|
|
float RB_RepeatExt::mergeable(const RB_Repeat& o) const
|
|
{
|
|
const EList<RB_AlleleCoord>& ranges = seed_ranges();
|
|
const EList<RB_AlleleCoord>& ranges2 = o.seed_ranges();
|
|
size_t num_overlap_bp = 0;
|
|
size_t p = 0, p2 = 0;
|
|
while(p < ranges.size() && p2 < ranges2.size()) {
|
|
const RB_AlleleCoord& range = ranges[p];
|
|
const RB_AlleleCoord& range2 = ranges2[p2];
|
|
TIndexOffU overlap = range.overlap_len(range2);
|
|
num_overlap_bp += overlap;
|
|
if(range.right <= range2.right) p++;
|
|
else p2++;
|
|
}
|
|
size_t num_total_bp = 0, num_total_bp2 = 0;
|
|
for(size_t r = 0; r < ranges.size(); r++) num_total_bp += (ranges[r].right - ranges[r].left);
|
|
for(size_t r = 0; r < ranges2.size(); r++) num_total_bp2 += (ranges2[r].right - ranges2[r].left);
|
|
float portion = float(num_overlap_bp) / float(min(num_total_bp, num_total_bp2));
|
|
return portion;
|
|
}
|
|
|
|
inline void get_next_range(const EList<int>& offsets, size_t i, Range& r, float& avg)
|
|
{
|
|
r.first = i;
|
|
for(; r.first < offsets.size() && offsets[r.first] < 0; r.first++);
|
|
r.second = r.first + 1;
|
|
avg = 0.0f;
|
|
if(r.first < offsets.size()) {
|
|
float diff = (float)offsets[r.first] - (float)r.first;
|
|
avg += diff;
|
|
}
|
|
for(; r.second < offsets.size() &&
|
|
offsets[r.second] >= 0 &&
|
|
(r.second == 0 || offsets[r.second] >= offsets[r.second - 1]);
|
|
r.second++) {
|
|
float diff = (float)offsets[r.second] - (float)r.second;
|
|
avg += diff;
|
|
}
|
|
avg /= (float)(r.second - r.first);
|
|
return;
|
|
}
|
|
|
|
template<typename TStr>
|
|
bool RB_RepeatExt::align(const RepeatParameter& rp,
|
|
const TStr& ref,
|
|
const string& s,
|
|
const EList<pair<size_t, size_t> >& s_kmer_table,
|
|
const string& s2,
|
|
EList<int>& offsets,
|
|
size_t k,
|
|
SeedExt& seed,
|
|
int consensus_approx_left,
|
|
int consensus_approx_right,
|
|
size_t left,
|
|
size_t right,
|
|
bool debug)
|
|
{
|
|
RB_Repeat::seed_merge_tried++;
|
|
|
|
seed.reset();
|
|
seed.pos.first = left;
|
|
seed.pos.second = right;
|
|
seed.orig_pos.first = seed.pos.first;
|
|
seed.orig_pos.second = seed.orig_pos.first;
|
|
seed.consensus_pos.first = 0;
|
|
seed.consensus_pos.second = consensus_.length();
|
|
seed.aligned = false;
|
|
|
|
{
|
|
const int query_len = right - left;
|
|
const int consensus_len = consensus_approx_right - consensus_approx_left;
|
|
const int abs_gap_len = max<int>(abs(consensus_len - query_len), 5);
|
|
|
|
offsets.resize(s2.length());
|
|
offsets.fill(-1);
|
|
pair<size_t, size_t> kmer(0, 0);
|
|
for(size_t i = 0; i + k <= s2.length(); i++) {
|
|
if(i == 0) {
|
|
kmer.first = extract_kmer(s2, i, k);
|
|
} else {
|
|
kmer.first = next_kmer(kmer.first, s2[i+k-1], k);
|
|
}
|
|
size_t lb = s_kmer_table.bsearchLoBound(kmer);
|
|
while(lb < s_kmer_table.size() && kmer.first == s_kmer_table[lb].first) {
|
|
int expected = (int)i + consensus_approx_left;
|
|
int real = (int)s_kmer_table[lb].second;
|
|
int abs_diff = abs(expected - real);
|
|
if(abs_diff <= abs_gap_len * 2 || debug) {
|
|
if(offsets[i] == -1) {
|
|
offsets[i] = (int)s_kmer_table[lb].second;
|
|
} else if(offsets[i] >= 0) {
|
|
offsets[i] = -2;
|
|
} else {
|
|
assert_lt(offsets[i], -1);
|
|
offsets[i] -= 1;
|
|
}
|
|
}
|
|
lb++;
|
|
}
|
|
if(offsets[i] > 0 && i + k == s2.length()) {
|
|
for(size_t j = i + 1; j < s2.length(); j++) {
|
|
offsets[j] = offsets[j-1] + 1;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if(debug) {
|
|
cerr << "initial offsets" << endl;
|
|
for(size_t j = 0; j < offsets.size(); j++) {
|
|
cout << j << ": " << offsets[j] << " " << s2[j] << ": " << (offsets[j] < 0 ? ' ' : s[offsets[j]]) << endl;
|
|
}
|
|
}
|
|
|
|
// remove inconsistent positions
|
|
Range range; float range_avg;
|
|
get_next_range(offsets, 0, range, range_avg);
|
|
while(range.second < offsets.size()) {
|
|
Range range2; float range_avg2;
|
|
get_next_range(offsets, range.second, range2, range_avg2);
|
|
if(range2.first >= offsets.size())
|
|
break;
|
|
|
|
assert_leq(range.second, range2.first);
|
|
|
|
float abs_diff = abs(range_avg - range_avg2);
|
|
if(offsets[range.second - 1] > offsets[range2.first] ||
|
|
(abs_diff > 10.0f && (range.second - range.first < 5 || range2.second - range2.first < 5)) ||
|
|
abs_diff > 20.0f) {
|
|
if(range.second - range.first < range2.second - range2.first) {
|
|
for(size_t i = range.first; i < range.second; i++) {
|
|
offsets[i] = -1;
|
|
}
|
|
range = range2;
|
|
range_avg = range_avg2;
|
|
} else {
|
|
for(size_t i = range2.first; i < range2.second; i++) {
|
|
offsets[i] = -1;
|
|
}
|
|
}
|
|
} else {
|
|
range = range2;
|
|
range_avg = range_avg2;
|
|
}
|
|
}
|
|
|
|
bool weighted_avg_inited = false;
|
|
float weighted_avg = -1.0f;
|
|
for(size_t i = 0; i < offsets.size(); i++) {
|
|
if(offsets[i] < 0)
|
|
continue;
|
|
|
|
float diff = (float)offsets[i] - (float)i;
|
|
if(weighted_avg_inited) {
|
|
if(abs(diff - weighted_avg) > 20.0f) {
|
|
offsets[i] = -1;
|
|
continue;
|
|
}
|
|
}
|
|
|
|
if(weighted_avg < 0.0f) {
|
|
weighted_avg = diff;
|
|
weighted_avg_inited = true;
|
|
} else {
|
|
weighted_avg = 0.8f * weighted_avg + 0.2f * diff;
|
|
}
|
|
}
|
|
|
|
if(debug) {
|
|
cerr << "after filtering" << endl;
|
|
for(size_t j = 0; j < offsets.size(); j++) {
|
|
cout << j << ": " << offsets[j] << " " << s2[j] << ": " << (offsets[j] < 0 ? ' ' : s[offsets[j]]) << endl;
|
|
}
|
|
}
|
|
|
|
int i = 0;
|
|
while(i < offsets.size()) {
|
|
// skip non-negative offsets
|
|
for(; i < offsets.size() && offsets[i] >= 0; i++);
|
|
if(i >= offsets.size()) break;
|
|
int j = i;
|
|
for(; j < offsets.size(); j++) {
|
|
if(offsets[j] >= 0)
|
|
break;
|
|
}
|
|
assert(i >= offsets.size() || offsets[i] < 0);
|
|
assert(j >= offsets.size() || offsets[j] >= 0);
|
|
if(i > 0 && j < offsets.size()) {
|
|
i -= 1;
|
|
int left = offsets[i], right = offsets[j];
|
|
assert_geq(left, 0);
|
|
if(left > right) return false;
|
|
assert_leq(left, right);
|
|
int ref_len = right - left + 1;
|
|
int query_len = j - i + 1;
|
|
if(query_len == ref_len) { // match
|
|
for(size_t i2 = i + 1; i2 < j; i2++)
|
|
offsets[i2] = offsets[i2-1] + 1;
|
|
} else { // deletion or insertion
|
|
bool del = query_len < ref_len;
|
|
size_t gap_len = del ? ref_len - query_len : query_len - ref_len;
|
|
size_t max_len = max(ref_len, query_len);
|
|
const size_t max_mm = max_len / 25 + 1;
|
|
const size_t very_max_mm = max<size_t>(max_len / 2, max_mm);
|
|
ca_s_ = s.substr(left, max_len);
|
|
ca_s2_ = s2.substr(i, max_len);
|
|
|
|
size_t gap_pos = 0, mm = very_max_mm + 1;
|
|
find_gap_pos(ca_s_,
|
|
ca_s2_,
|
|
ca_ed_,
|
|
ca_ed2_,
|
|
del,
|
|
gap_len,
|
|
very_max_mm,
|
|
gap_pos,
|
|
mm,
|
|
debug);
|
|
|
|
if(mm > very_max_mm) {
|
|
return false;
|
|
}
|
|
|
|
assert_lt(gap_pos, query_len);
|
|
if(del) {
|
|
for(size_t i2 = i + 1; i2 < j; i2++) {
|
|
if(i2 - i == gap_pos) {
|
|
offsets[i2] = offsets[i2-1] + gap_len;
|
|
} else {
|
|
offsets[i2] = offsets[i2-1] + 1;
|
|
}
|
|
}
|
|
} else {
|
|
for(size_t i2 = i + 1; i2 < j; i2++) {
|
|
if(i2 - i >= gap_pos && i2 - i < gap_pos + gap_len) {
|
|
offsets[i2] = offsets[i2-1];
|
|
} else {
|
|
offsets[i2] = offsets[i2-1] + 1;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
i = j;
|
|
}
|
|
|
|
if(debug) {
|
|
cerr << "final offsets" << endl;
|
|
for(size_t j = 0; j < offsets.size(); j++) {
|
|
cout << j << ": " << offsets[j] << " " << s2[j] << ": " << (offsets[j] < 0 ? ' ' : s[offsets[j]]) << endl;
|
|
}
|
|
}
|
|
|
|
#ifndef NDEBUG
|
|
for(size_t i = 1; i < offsets.size(); i++) {
|
|
if(offsets[i-1] < 0 || offsets[i] < 0) continue;
|
|
assert_leq(offsets[i-1], offsets[i]);
|
|
}
|
|
#endif
|
|
|
|
size_t b, e;
|
|
for(b = 0; b < offsets.size() && offsets[b] < 0; b++);
|
|
if(b >= offsets.size()) return false;
|
|
assert_lt((size_t)b, offsets.size());
|
|
for(e = offsets.size() - 1; e > b && offsets[e] < 0; e--);
|
|
if(b == e) return false;
|
|
for(size_t p = b; p <= e; p++) {
|
|
if(offsets[p] < 0) return false;
|
|
}
|
|
|
|
// try to fill the ends
|
|
if(b > 0) {
|
|
int mm = 0, pb = (int)b;
|
|
for(int i = pb - 1; i >= 0; i--) {
|
|
assert_geq(offsets[i+1], 0);
|
|
if(offsets[i+1] == 0) break;
|
|
if(s2[i] != s[offsets[i+1] - 1])
|
|
mm++;
|
|
if(pb - i < 25 * (mm - 1))
|
|
break;
|
|
offsets[i] = offsets[i+1] - 1;
|
|
b = (size_t)i;
|
|
}
|
|
}
|
|
if(e + 1 < offsets.size()) {
|
|
int mm = 0, prev_end = (int)e;
|
|
for(int i = prev_end + 1; i < offsets.size(); i++) {
|
|
assert_geq(offsets[i-1], 0);
|
|
if(offsets[i-1] + 1 >= s.length()) break;
|
|
if(s2[i] != s[offsets[i-1] + 1])
|
|
mm++;
|
|
if(i - prev_end < 25 * (mm - 1))
|
|
break;
|
|
offsets[i] = offsets[i-1] + 1;
|
|
e = (size_t)i;
|
|
}
|
|
}
|
|
|
|
assert_geq(seed.pos.first, (TIndexOffU)b);
|
|
seed.pos.first += (TIndexOffU)b;
|
|
seed.pos.second = seed.pos.first + e - b + 1;
|
|
seed.orig_pos.first = seed.pos.first;
|
|
seed.orig_pos.second = seed.pos.first;
|
|
seed.consensus_pos.first = offsets[b];
|
|
seed.consensus_pos.second = offsets[e] + 1;
|
|
seed.aligned = true;
|
|
|
|
for(int p = b; p < e; p++) {
|
|
assert_geq(offsets[p], 0);
|
|
assert_leq(offsets[p], offsets[p+1]);
|
|
if(offsets[p] + 1 == offsets[p+1]) { // match
|
|
continue;
|
|
} else if(offsets[p] + 1 < offsets[p+1]) { // deletion
|
|
seed.right_gaps.expand();
|
|
seed.right_gaps.back().first = p + 1 - b;
|
|
seed.right_gaps.back().second = offsets[p+1] - offsets[p] - 1;
|
|
} else {
|
|
assert_eq(offsets[p], offsets[p+1]);
|
|
int p2 = p + 1;
|
|
for(; offsets[p2] == offsets[p2+1]; p2++);
|
|
seed.right_gaps.expand();
|
|
seed.right_gaps.back().first = p + 1 - b;
|
|
seed.right_gaps.back().second = (int)p - (int)p2;
|
|
p = p2;
|
|
}
|
|
}
|
|
|
|
if(debug) {
|
|
string consensus_str = consensus_.substr(seed.consensus_pos.first, seed.consensus_pos.second - seed.consensus_pos.first);
|
|
string seed_str;
|
|
seed.getExtendedSeedSequence(ref, seed_str);
|
|
assert_eq(consensus_str.length(), seed_str.length());
|
|
cerr << "consensus: " << consensus_str << endl;
|
|
cerr << "seed : " << seed_str << endl;
|
|
}
|
|
|
|
RB_Repeat::seed_merged++;
|
|
|
|
return true;
|
|
}
|
|
|
|
bool RB_RepeatExt::isSelfRepeat(const RepeatParameter& rp,
|
|
const string& s,
|
|
const EList<pair<size_t, size_t> >& s_kmer_table,
|
|
EList<int>& offsets,
|
|
size_t k,
|
|
bool debug)
|
|
{
|
|
offsets.resize(s.length());
|
|
offsets.fill(-1);
|
|
pair<size_t, size_t> kmer(0, 0);
|
|
for(size_t i = 0; i + k <= s.length(); i++) {
|
|
if(i == 0) {
|
|
kmer.first = extract_kmer(s, i, k);
|
|
} else {
|
|
kmer.first = next_kmer(kmer.first, s[i+k-1], k);
|
|
}
|
|
size_t lb = s_kmer_table.bsearchLoBound(kmer);
|
|
while(lb < s_kmer_table.size() && kmer.first == s_kmer_table[lb].first) {
|
|
if(offsets[i] == -1) {
|
|
offsets[i] = (int)s_kmer_table[lb].second;
|
|
} else if(offsets[i] >= 0) {
|
|
offsets[i] = -2;
|
|
} else {
|
|
assert_lt(offsets[i], -1);
|
|
offsets[i] -= 1;
|
|
}
|
|
lb++;
|
|
}
|
|
if(offsets[i] > 0 && i + k == s.length()) {
|
|
for(size_t j = i + 1; j < s.length(); j++) {
|
|
offsets[j] = offsets[j-1] + 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
if(debug) {
|
|
cerr << "offsets" << endl;
|
|
for(size_t j = 0; j < offsets.size(); j++) {
|
|
cout << j << ": " << offsets[j] << " " << s[j] << ": " << (offsets[j] < 0 ? ' ' : s[offsets[j]]) << endl;
|
|
}
|
|
}
|
|
|
|
const size_t min_repeat_size = 100;
|
|
size_t repeat_count = 0;
|
|
for(size_t i = 0; i + min_repeat_size < offsets.size(); i++) {
|
|
if(offsets[i] >= -1)
|
|
continue;
|
|
size_t j = i + 1;
|
|
for(; j < offsets.size() && offsets[j] <= -2; j++);
|
|
if(j - i >= min_repeat_size)
|
|
repeat_count++;
|
|
i = j;
|
|
}
|
|
|
|
return repeat_count >= 2;
|
|
}
|
|
|
|
template<typename TStr>
|
|
bool RB_RepeatExt::merge(const RepeatParameter& rp,
|
|
const TStr& s,
|
|
RB_SWAligner& swalginer,
|
|
const RB_RepeatExt& o,
|
|
bool contain,
|
|
size_t seed_i,
|
|
size_t seed_j,
|
|
bool debug)
|
|
{
|
|
// construct a new consensus sequence
|
|
string prev_consensus = consensus_;
|
|
size_t consensus_add_len = 0;
|
|
{
|
|
assert_lt(seed_i, seeds_.size());
|
|
assert_lt(seed_j, o.seeds_.size());
|
|
const SeedExt& seed = seeds_[seed_i];
|
|
const SeedExt& oseed = o.seeds_[seed_j];
|
|
|
|
Range range = seed.getExtendedRange(consensus_.length());
|
|
Range orange = oseed.getExtendedRange(o.consensus_.length());
|
|
assert_leq(range.first, orange.first + 10);
|
|
|
|
consensus_add_len = (int)orange.first - (int)range.first;
|
|
if(!contain) {
|
|
if(range.second <= orange.first) {
|
|
cerr << "something wrong: " << range.first << "-" << range.second << " ";
|
|
cerr << orange.first << "-" << orange.second << " " << o.consensus_.length() << endl;
|
|
assert(false);
|
|
}
|
|
if(range.second > orange.first &&
|
|
range.second - orange.first < o.consensus_.length()) {
|
|
consensus_ += o.consensus_.substr(range.second - orange.first);
|
|
}
|
|
}
|
|
}
|
|
|
|
EList<pair<size_t, size_t> > merge_list;
|
|
merge_list.reserveExact(o.seed_ranges_.size());
|
|
size_t p = 0, p2 = 0;
|
|
const size_t relax = 5;
|
|
while(p < seed_ranges_.size() && p2 < o.seed_ranges_.size()) {
|
|
const RB_AlleleCoord& range = seed_ranges_[p];
|
|
assert_lt(range.idx, seeds_.size());
|
|
const RB_AlleleCoord& range2 = o.seed_ranges_[p2];
|
|
assert_lt(range2.idx, o.seeds_.size());
|
|
if(range.contain(range2, relax)) {
|
|
merge_list.expand();
|
|
merge_list.back().first = p;
|
|
merge_list.back().second = p2;
|
|
if(debug) {
|
|
cerr << p << ":" << range.left << "-" << range.right << " > ";
|
|
cerr << p2 << ":" << range2.left << "-" << range2.right << endl;
|
|
}
|
|
} else if(range2.contain(range, relax)) {
|
|
merge_list.expand();
|
|
merge_list.back().first = p;
|
|
merge_list.back().second = p2;
|
|
if(debug) {
|
|
cerr << p << ":" << range.left << "-" << range.right << " < ";
|
|
cerr << p2 << ":" << range2.left << "-" << range2.right << endl;
|
|
}
|
|
} else {
|
|
TIndexOffU overlap_len = range.overlap_len(range2);
|
|
bool stored = !merge_list.empty() && merge_list.back().first == p;
|
|
bool stored2 = !merge_list.empty() && merge_list.back().second == p2;
|
|
if(overlap_len > 0) {
|
|
if(!stored && !stored2) {
|
|
merge_list.expand();
|
|
merge_list.back().first = p;
|
|
merge_list.back().second = p2;
|
|
|
|
if(debug) {
|
|
cerr << p << ":" << range.left << "-" << range.right << " ol ";
|
|
cerr << p2 << ":" << range2.left << "-" << range2.right << endl;
|
|
}
|
|
}
|
|
} else {
|
|
if(range2.right <= range.left) {
|
|
if(!stored2) {
|
|
merge_list.expand();
|
|
merge_list.back().first = numeric_limits<size_t>::max();
|
|
merge_list.back().second = p2;
|
|
|
|
if(debug) {
|
|
cerr << p << ":" << range.left << "-" << range.right << " <> ";
|
|
cerr << p2 << ":" << range2.left << "-" << range2.right << endl;
|
|
}
|
|
}
|
|
} else {
|
|
assert_leq(range.right, range2.left);
|
|
if(!stored) {
|
|
merge_list.expand();
|
|
merge_list.back().first = p;
|
|
merge_list.back().second = numeric_limits<size_t>::max();
|
|
|
|
if(debug) {
|
|
cerr << p << ":" << range.left << "-" << range.right << " <> ";
|
|
cerr << p2 << ":" << range2.left << "-" << range2.right << endl;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if(range.right <= range2.right) p++;
|
|
if(range2.right <= range.right) p2++;
|
|
}
|
|
assert(p == seeds_.size() || p2 == o.seeds_.size());
|
|
|
|
while(p < seed_ranges_.size()) {
|
|
bool stored = !merge_list.empty() && merge_list.back().first == p;
|
|
if(!stored) {
|
|
const RB_AlleleCoord& range = seed_ranges_[p];
|
|
merge_list.expand();
|
|
merge_list.back().first = p;
|
|
merge_list.back().second = numeric_limits<size_t>::max();
|
|
|
|
if(debug) {
|
|
cerr << p << ":" << range.left << "-" << range.right << " : <> " << endl;
|
|
}
|
|
}
|
|
|
|
p++;
|
|
}
|
|
|
|
while(p2 < o.seed_ranges_.size()) {
|
|
bool stored2 = !merge_list.empty() && merge_list.back().second == p2;
|
|
if(!stored2) {
|
|
const RB_AlleleCoord& range2 = o.seed_ranges_[p2];
|
|
merge_list.expand();
|
|
merge_list.back().first = numeric_limits<size_t>::max();
|
|
merge_list.back().second = p2;
|
|
|
|
if(debug) {
|
|
cerr << ": <> " << p2 << ":" << range2.left << "-" << range2.right << endl;
|
|
}
|
|
}
|
|
|
|
p2++;
|
|
}
|
|
|
|
assert(!merge_list.empty());
|
|
|
|
if(debug) {
|
|
for(size_t i = 0; i < merge_list.size(); i++) {
|
|
cerr << "merge list:" << endl;
|
|
cerr << "\t" << (merge_list[i].first < seed_ranges_.size() ? (int)merge_list[i].first : -1);
|
|
cerr << " " << (merge_list[i].second < o.seed_ranges_.size() ? (int)merge_list[i].second : -1) << endl;
|
|
}
|
|
}
|
|
|
|
const size_t kmer_len = 12;
|
|
EList<pair<size_t, size_t> > kmer_table;
|
|
build_kmer_table(consensus_, kmer_table, kmer_len);
|
|
EList<int> offsets;
|
|
|
|
bool self_repeat = isSelfRepeat(rp,
|
|
consensus_,
|
|
kmer_table,
|
|
offsets,
|
|
kmer_len,
|
|
debug);
|
|
if(self_repeat) {
|
|
consensus_ = prev_consensus;
|
|
return false;
|
|
}
|
|
|
|
string seq;
|
|
for(size_t i = 0; i < merge_list.size(); i++) {
|
|
size_t seed_id = (merge_list[i].first < seed_ranges_.size() ? seed_ranges_[merge_list[i].first].idx : merge_list[i].first);
|
|
size_t oseed_id = (merge_list[i].second < o.seed_ranges_.size() ? o.seed_ranges_[merge_list[i].second].idx : merge_list[i].second);
|
|
|
|
if(seed_id < seeds_.size()) {
|
|
if(oseed_id >= o.seeds_.size())
|
|
continue;
|
|
else if(seed_ranges_[merge_list[i].first].contain(o.seed_ranges_[merge_list[i].second]))
|
|
continue;
|
|
}
|
|
|
|
const SeedExt* seed = (seed_id < seeds_.size() ? &seeds_[seed_id] : NULL);
|
|
const SeedExt* oseed = (oseed_id < o.seeds_.size() ? &o.seeds_[oseed_id] : NULL);
|
|
assert(seed != NULL || oseed != NULL);
|
|
|
|
size_t left = (seed != NULL ? seed->pos.first : numeric_limits<size_t>::max());
|
|
size_t right = (seed != NULL ? seed->pos.second : 0);
|
|
int consensus_approx_left = (seed != NULL ? seed->consensus_pos.first : numeric_limits<int>::max());
|
|
int consensus_approx_right = (seed != NULL ? seed->consensus_pos.second : 0);
|
|
if(oseed != NULL) {
|
|
if(oseed->pos.first < left) {
|
|
left = oseed->pos.first;
|
|
}
|
|
if(oseed->pos.second > right) {
|
|
right = oseed->pos.second;
|
|
}
|
|
|
|
int oconsensus_approx_left = oseed->consensus_pos.first + consensus_add_len;
|
|
if(oconsensus_approx_left < consensus_approx_left) {
|
|
consensus_approx_left = oconsensus_approx_left;
|
|
}
|
|
int oconsensus_approx_right = oseed->consensus_pos.second + consensus_add_len;
|
|
if(oconsensus_approx_right > consensus_approx_right) {
|
|
consensus_approx_right = oconsensus_approx_right;
|
|
}
|
|
}
|
|
|
|
getString(s, left, right - left, seq);
|
|
if(seed_id >= seeds_.size()) {
|
|
seed_id = seeds_.size();
|
|
seeds_.expand();
|
|
}
|
|
|
|
/* bool succ = */ align(rp,
|
|
s,
|
|
consensus_,
|
|
kmer_table,
|
|
seq,
|
|
offsets,
|
|
kmer_len,
|
|
seeds_[seed_id],
|
|
consensus_approx_left,
|
|
consensus_approx_right,
|
|
left,
|
|
right,
|
|
debug && right - left == 1080);
|
|
}
|
|
|
|
while(true) {
|
|
internal_update();
|
|
|
|
size_t remove_count = 0;
|
|
for(size_t i = 0; i + 1 < seed_ranges_.size(); i++) {
|
|
RB_AlleleCoord& range = seed_ranges_[i];
|
|
if(range.left == numeric_limits<size_t>::max())
|
|
break;
|
|
for(size_t j = i + 1; j < seed_ranges_.size(); j++) {
|
|
RB_AlleleCoord& range2 = seed_ranges_[j];
|
|
if(range2.left == numeric_limits<size_t>::max())
|
|
break;
|
|
if(range.right <= range2.left)
|
|
break;
|
|
|
|
getString(s, range.left, range2.right - range.left, seq);
|
|
|
|
/* bool succ = */ align(rp,
|
|
s,
|
|
consensus_,
|
|
kmer_table,
|
|
seq,
|
|
offsets,
|
|
kmer_len,
|
|
seeds_[range.idx],
|
|
seeds_[range.idx].consensus_pos.first,
|
|
seeds_[range2.idx].consensus_pos.second,
|
|
range.left,
|
|
range2.right,
|
|
debug && range.left == 692422);
|
|
|
|
range.left = seeds_[range.idx].pos.first;
|
|
range.right = seeds_[range.idx].pos.second;
|
|
seeds_[range2.idx].reset();
|
|
range2.left = numeric_limits<size_t>::max();
|
|
remove_count++;
|
|
}
|
|
}
|
|
|
|
if(remove_count <= 0) break;
|
|
|
|
sort(seeds_.begin(), seeds_.end(), seedCmp);
|
|
seeds_.resize(seeds_.size() - remove_count);
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
#define DMAX std::numeric_limits<double>::max()
|
|
|
|
RB_SWAligner::RB_SWAligner()
|
|
{
|
|
rnd_.init(0);
|
|
}
|
|
|
|
RB_SWAligner::~RB_SWAligner()
|
|
{
|
|
if(sc_) {
|
|
delete sc_;
|
|
}
|
|
}
|
|
|
|
void RB_SWAligner::init_dyn(const RepeatParameter& rp)
|
|
{
|
|
const int MM_PEN = 3;
|
|
// const int MM_PEN = 6;
|
|
const int GAP_PEN_LIN = 2;
|
|
// const int GAP_PEN_LIN = (((MM_PEN) * rpt_edit_ + 1) * 1.0);
|
|
const int GAP_PEN_CON = 4;
|
|
// const int GAP_PEN_CON = (((MM_PEN) * rpt_edit_ + 1) * 1.0);
|
|
const int MAX_PEN = MAX_I16;
|
|
|
|
scoreMin_.init(SIMPLE_FUNC_LINEAR, rp.max_edit * MM_PEN * -1.0, 0.0);
|
|
nCeil_.init(SIMPLE_FUNC_LINEAR, 0.0, 0.0);
|
|
|
|
penCanIntronLen_.init(SIMPLE_FUNC_LOG, -8, 1);
|
|
penNoncanIntronLen_.init(SIMPLE_FUNC_LOG, -8, 1);
|
|
|
|
sc_ = new Scoring(
|
|
DEFAULT_MATCH_BONUS, // constant reward for match
|
|
DEFAULT_MM_PENALTY_TYPE, // how to penalize mismatches
|
|
MM_PEN, // max mm penalty
|
|
MM_PEN, // min mm penalty
|
|
MAX_PEN, // max sc penalty
|
|
MAX_PEN, // min sc penalty
|
|
scoreMin_, // min score as function of read len
|
|
nCeil_, // max # Ns as function of read len
|
|
DEFAULT_N_PENALTY_TYPE, // how to penalize Ns in the read
|
|
DEFAULT_N_PENALTY, // constant if N pelanty is a constant
|
|
DEFAULT_N_CAT_PAIR, // whether to concat mates before N filtering
|
|
|
|
|
|
GAP_PEN_CON, // constant coeff for read gap cost
|
|
GAP_PEN_CON, // constant coeff for ref gap cost
|
|
GAP_PEN_LIN, // linear coeff for read gap cost
|
|
GAP_PEN_LIN, // linear coeff for ref gap cost
|
|
1 /* gGapBarrier */ // # rows at top/bot only entered diagonally
|
|
);
|
|
}
|
|
|
|
void RB_SWAligner::makePadString(const string& ref,
|
|
const string& read,
|
|
string& pad,
|
|
size_t len)
|
|
{
|
|
pad.resize(len);
|
|
|
|
for(size_t i = 0; i < len; i++) {
|
|
// shift A->C, C->G, G->T, T->A
|
|
pad[i] = "CGTA"[asc2dna[ref[i]]];
|
|
|
|
if(read[i] == pad[i]) {
|
|
// shift
|
|
pad[i] = "CGTA"[asc2dna[pad[i]]];
|
|
}
|
|
}
|
|
|
|
int head_len = len / 2;
|
|
size_t pad_start = len - head_len;
|
|
|
|
for(size_t i = 0; i < head_len; i++) {
|
|
if(read[i] == pad[pad_start + i]) {
|
|
// shift
|
|
pad[pad_start + i] = "CGTA"[asc2dna[pad[pad_start + i]]];
|
|
}
|
|
}
|
|
}
|
|
|
|
int RB_SWAligner::alignStrings(const string &ref,
|
|
const string &read,
|
|
EList<Edit>& edits,
|
|
Coord& coord)
|
|
{
|
|
// Prepare Strings
|
|
|
|
// Read -> BTDnaString
|
|
// Ref -> bit-encoded string
|
|
|
|
//SwAligner swa;
|
|
|
|
BTDnaString btread;
|
|
BTString btqual;
|
|
BTString btref;
|
|
BTString btref2;
|
|
|
|
BTDnaString btreadrc;
|
|
BTString btqualrc;
|
|
|
|
|
|
string qual = "";
|
|
for(size_t i = 0; i < read.length(); i++) {
|
|
qual.push_back('I');
|
|
}
|
|
|
|
#if 0
|
|
cerr << "REF : " << ref << endl;
|
|
cerr << "READ: " << read << endl;
|
|
cerr << "QUAL: " << qual << endl;
|
|
#endif
|
|
|
|
btread.install(read.c_str(), true);
|
|
btreadrc = btread;
|
|
btreadrc.reverseComp();
|
|
|
|
btqual.install(qual.c_str());
|
|
btqualrc = btqual;
|
|
|
|
btref.install(ref.c_str());
|
|
|
|
TAlScore min_score = sc_->scoreMin.f<TAlScore >((double)btread.length());
|
|
|
|
btref2 = btref;
|
|
|
|
size_t nceil = 0;
|
|
// size_t nrow = btread.length();
|
|
|
|
// Convert reference string to mask
|
|
for(size_t i = 0; i < btref2.length(); i++) {
|
|
if(toupper(btref2[i]) == 'N') {
|
|
btref2.set(16, i);
|
|
} else {
|
|
int num = 0;
|
|
int alts[] = {4, 4, 4, 4};
|
|
decodeNuc(toupper(btref2[i]), num, alts);
|
|
assert_leq(num, 4);
|
|
assert_gt(num, 0);
|
|
btref2.set(0, i);
|
|
for(int j = 0; j < num; j++) {
|
|
btref2.set(btref2[i] | (1 << alts[j]), i);
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
bool fw = true;
|
|
uint32_t refidx = 0;
|
|
|
|
swa.initRead(
|
|
btread, // read sequence
|
|
btreadrc,
|
|
btqual, // read qualities
|
|
btqualrc,
|
|
0, // offset of first character within 'read' to consider
|
|
btread.length(), // offset of last char (exclusive) in 'read' to consider
|
|
*sc_); // local-alignment score floor
|
|
|
|
DynProgFramer dpframe(false);
|
|
size_t readgaps = 0;
|
|
size_t refgaps = 0;
|
|
size_t maxhalf = 0;
|
|
|
|
DPRect rect;
|
|
dpframe.frameSeedExtensionRect(
|
|
0, // ref offset implied by seed hit assuming no gaps
|
|
btread.length(), // length of read sequence used in DP table
|
|
btref2.length(), // length of reference
|
|
readgaps, // max # of read gaps permitted in opp mate alignment
|
|
refgaps, // max # of ref gaps permitted in opp mate alignment
|
|
(size_t)nceil, // # Ns permitted
|
|
maxhalf, // max width in either direction
|
|
rect); // DP Rectangle
|
|
|
|
assert(rect.repOk());
|
|
|
|
size_t cminlen = 2000, cpow2 = 4;
|
|
|
|
swa.initRef(
|
|
fw, // whether to align forward or revcomp read
|
|
refidx, // reference ID
|
|
rect, // DP rectangle
|
|
btref2.wbuf(), // reference strings
|
|
0, // offset of first reference char to align to
|
|
btref2.length(), // offset of last reference char to align to
|
|
btref2.length(), // length of reference sequence
|
|
*sc_, // scoring scheme
|
|
min_score, // minimum score
|
|
true, // use 8-bit SSE if positions
|
|
cminlen, // minimum length for using checkpointing scheme
|
|
cpow2, // interval b/t checkpointed diags; 1 << this
|
|
false, // triangular mini-fills?
|
|
false // is this a seed extension?
|
|
);
|
|
|
|
|
|
TAlScore best = std::numeric_limits<TAlScore>::min();
|
|
bool found = swa.align(rnd_, best);
|
|
|
|
#ifdef DEBUGLOG
|
|
cerr << "found: " << found << "\t" << best << "\t" << "minsc: " << min_score << endl;
|
|
#endif
|
|
|
|
if (found) {
|
|
#ifdef DEBUGLOG
|
|
//cerr << "CP " << "found: " << found << "\t" << best << "\t" << "minsc: " << min_score << endl;
|
|
cerr << "REF : " << ref << endl;
|
|
cerr << "READ: " << read << endl;
|
|
#endif
|
|
|
|
SwResult res;
|
|
int max_match_len = 0;
|
|
res.reset();
|
|
res.alres.init_raw_edits(&rawEdits_);
|
|
|
|
found = swa.nextAlignment(res, best, rnd_);
|
|
if (found) {
|
|
edits = res.alres.ned();
|
|
//const TRefOff ref_off = res.alres.refoff();
|
|
//const Coord& coord = res.alres.refcoord();
|
|
coord = res.alres.refcoord();
|
|
//assert_geq(genomeHit._joinedOff + coord.off(), genomeHit.refoff());
|
|
|
|
#ifdef DEBUGLOG
|
|
cerr << "num edits: " << edits.size() << endl;
|
|
cerr << "coord: " << coord.off();
|
|
cerr << ", " << coord.ref();
|
|
cerr << ", " << coord.orient();
|
|
cerr << ", " << coord.joinedOff();
|
|
cerr << endl;
|
|
Edit::print(cerr, edits); cerr << endl;
|
|
Edit::printQAlign(cerr, btread, edits);
|
|
#endif
|
|
|
|
max_match_len = getMaxMatchLen(edits, btread.length());
|
|
#ifdef DEBUGLOG
|
|
cerr << "max match length: " << max_match_len << endl;
|
|
#endif
|
|
}
|
|
#ifdef DEBUGLOG
|
|
cerr << "nextAlignment: " << found << endl;
|
|
cerr << "-------------------------" << endl;
|
|
#endif
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
void RB_SWAligner::doTest(const RepeatParameter& rp,
|
|
const string& refstr,
|
|
const string& readstr)
|
|
{
|
|
init_dyn(rp);
|
|
|
|
doTestCase1(refstr,
|
|
readstr,
|
|
rp.max_edit);
|
|
}
|
|
|
|
void RB_SWAligner::doTestCase1(const string& refstr,
|
|
const string& readstr,
|
|
TIndexOffU rpt_edit)
|
|
{
|
|
cerr << "doTestCase1----------------" << endl;
|
|
EList<Edit> edits;
|
|
Coord coord;
|
|
|
|
if (refstr.length() == 0 ||
|
|
readstr.length() == 0) {
|
|
return;
|
|
}
|
|
|
|
EList<Edit> ed;
|
|
|
|
string pad;
|
|
makePadString(refstr, readstr, pad, 5);
|
|
|
|
string ref2 = pad + refstr + pad;
|
|
string read2 = pad + readstr + pad;
|
|
alignStrings(refstr, readstr, edits, coord);
|
|
|
|
size_t left = pad.length();
|
|
size_t right = left + readstr.length();
|
|
|
|
edits.reserveExact(ed.size());
|
|
for(size_t i = 0; i < ed.size(); i++) {
|
|
if(ed[i].pos >= left && ed[i].pos <= right) {
|
|
edits.push_back(ed[i]);
|
|
edits.back().pos -= left;
|
|
}
|
|
}
|
|
|
|
|
|
#if 0
|
|
RepeatGroup rg;
|
|
|
|
rg.edits = edits;
|
|
rg.coord = coord;
|
|
rg.seq = readstr;
|
|
rg.base_offset = 0;
|
|
|
|
string chr_name = "rep";
|
|
|
|
cerr << "REF : " << refstr << endl;
|
|
cerr << "READ: " << readstr << endl;
|
|
size_t snpids = 0;
|
|
rg.buildSNPs(snpids);
|
|
rg.writeSNPs(cerr, chr_name); cerr << endl;
|
|
#endif
|
|
}
|
|
|
|
|
|
template<typename TStr>
|
|
RepeatBuilder<TStr>::RepeatBuilder(TStr& s,
|
|
TStr& sOriginal,
|
|
const EList<RefRecord>& szs,
|
|
const EList<string>& ref_names,
|
|
bool forward_only,
|
|
const string& filename) :
|
|
s_(s),
|
|
sOriginal_(sOriginal),
|
|
coordHelper_(s.length(), forward_only ? s.length() : s.length() / 2, szs, ref_names),
|
|
forward_only_(forward_only),
|
|
filename_(filename),
|
|
forward_length_(forward_only ? s.length() : s.length() / 2)
|
|
{
|
|
cerr << "RepeatBuilder: " << filename_ << endl;
|
|
}
|
|
|
|
template<typename TStr>
|
|
RepeatBuilder<TStr>::~RepeatBuilder()
|
|
{
|
|
for(map<size_t, RB_Repeat*>::iterator it = repeat_map_.begin(); it != repeat_map_.end(); it++) {
|
|
delete it->second;
|
|
}
|
|
repeat_map_.clear();
|
|
}
|
|
|
|
template<typename TStr>
|
|
void RepeatBuilder<TStr>::readSA(const RepeatParameter& rp,
|
|
BlockwiseSA<TStr>& sa)
|
|
{
|
|
TIndexOffU count = 0;
|
|
subSA_.init(s_.length() + 1, rp.min_repeat_len, rp.repeat_count);
|
|
|
|
while(count < s_.length() + 1) {
|
|
TIndexOffU saElt = sa.nextSuffix();
|
|
count++;
|
|
|
|
if(count && (count % 10000000 == 0)) {
|
|
cerr << "SA count " << count << endl;
|
|
}
|
|
|
|
if(saElt == s_.length()) {
|
|
assert_eq(count, s_.length() + 1);
|
|
break;
|
|
}
|
|
|
|
subSA_.push_back(s_, coordHelper_, saElt, count == s_.length());
|
|
}
|
|
|
|
cerr << "subSA size is " << subSA_.size() << endl;
|
|
subSA_.dump();
|
|
#if 0
|
|
for(size_t i = 0; i < subSA_.size(); i++) {
|
|
TIndexOffU joinedOff = subSA_[i];
|
|
fp << setw(10) << joinedOff << " " << getString(s_, joinedOff, rp.seed_len) << endl;
|
|
}
|
|
#endif
|
|
cerr << "subSA mem Usage: " << subSA_.getMemUsage() << endl << endl;
|
|
}
|
|
|
|
template<typename TStr>
|
|
void RepeatBuilder<TStr>::readSA(const RepeatParameter &rp,
|
|
const BitPackedArray &sa)
|
|
{
|
|
TIndexOffU count = 0;
|
|
|
|
subSA_.init(s_.length() + 1, rp.min_repeat_len, rp.repeat_count);
|
|
|
|
for(size_t i = 0; i < sa.size(); i++) {
|
|
TIndexOffU saElt = sa[i];
|
|
count++;
|
|
|
|
if(count && (count % 10000000 == 0)) {
|
|
cerr << "RB count " << count << endl;
|
|
}
|
|
|
|
if(saElt == s_.length()) {
|
|
assert_eq(count, s_.length() + 1);
|
|
break;
|
|
}
|
|
|
|
subSA_.push_back(s_, coordHelper_, saElt, count == s_.length());
|
|
}
|
|
|
|
cerr << "subSA size: " << endl;
|
|
subSA_.dump();
|
|
#if 0
|
|
for(size_t i = 0; i < subSA_.size(); i++) {
|
|
TIndexOffU joinedOff = subSA_[i];
|
|
fp << setw(10) << joinedOff << " " << getString(s_, joinedOff, rp.seed_len) << endl;
|
|
}
|
|
#endif
|
|
cerr << "subSA mem Usage: " << subSA_.getMemUsage() << endl << endl;
|
|
}
|
|
|
|
template<typename TStr>
|
|
void RepeatBuilder<TStr>::build(const RepeatParameter& rp)
|
|
{
|
|
string rpt_len_str;
|
|
rpt_len_str = to_string(rp.min_repeat_len) + "-" + to_string(rp.max_repeat_len);
|
|
|
|
string seed_filename = filename_ + ".rep." + rpt_len_str + ".seed";
|
|
ofstream fp(seed_filename.c_str());
|
|
|
|
swaligner_.init_dyn(rp);
|
|
|
|
RB_RepeatManager* repeat_manager = new RB_RepeatManager;
|
|
|
|
EList<RB_RepeatBase> repeatBases;
|
|
subSA_.buildRepeatBase(s_,
|
|
coordHelper_,
|
|
rp.max_repeat_len,
|
|
repeatBases);
|
|
|
|
size_t repeat_id = 0;
|
|
for(size_t i = 0; i < repeatBases.size(); i++) {
|
|
addRepeatGroup(rp,
|
|
repeat_id,
|
|
*repeat_manager,
|
|
repeatBases[i],
|
|
fp);
|
|
}
|
|
|
|
{
|
|
// Build and test minimizer-based k-mer table
|
|
const size_t window = RB_Minimizer<TStr>::default_w;
|
|
const size_t k = RB_Minimizer<TStr>::default_k;
|
|
RB_KmerTable kmer_table;
|
|
EList<string> seqs;
|
|
seqs.reserveExact(repeat_map_.size());
|
|
for(map<size_t, RB_Repeat*>::const_iterator it = repeat_map_.begin(); it != repeat_map_.end(); it++) {
|
|
const RB_Repeat& repeat = *(it->second);
|
|
assert(repeat.satisfy(rp));
|
|
seqs.expand();
|
|
seqs.back() = repeat.consensus();
|
|
}
|
|
kmer_table.build(seqs,
|
|
window,
|
|
k);
|
|
kmer_table.dump(cerr);
|
|
cerr << endl;
|
|
|
|
string query, rc_query;
|
|
string queryOriginal;
|
|
EList<pair<uint64_t, size_t> > minimizers;
|
|
size_t total = 0, num_repeat = 0, correct = 0, false_positive = 0, false_negative = 0;
|
|
for(size_t i = 0; i + rp.min_repeat_len <= forward_length_; i += 1000) {
|
|
if(coordHelper_.getEnd(i) != coordHelper_.getEnd(i + rp.min_repeat_len))
|
|
continue;
|
|
query = getString(s_, i, rp.min_repeat_len);
|
|
queryOriginal = getString(sOriginal_, i, rp.min_repeat_len);
|
|
rc_query = reverseComplement(queryOriginal);
|
|
|
|
TIndexOffU idx = subSA_.find_repeat_idx(s_, query);
|
|
const EList<TIndexOffU>& test_repeat_index = subSA_.getRepeatIndex();
|
|
bool repeat = (idx < test_repeat_index.size());
|
|
bool est_repeat = kmer_table.isRepeat(query,
|
|
rc_query,
|
|
minimizers);
|
|
|
|
total++;
|
|
if(repeat) num_repeat++;
|
|
if(repeat == est_repeat) {
|
|
correct++;
|
|
} else {
|
|
if(est_repeat) {
|
|
false_positive++;
|
|
} else {
|
|
false_negative++;
|
|
//assert(false);
|
|
}
|
|
}
|
|
}
|
|
|
|
cerr << "total: " << total << endl;
|
|
cerr << "repeat: " << num_repeat << endl;
|
|
cerr << "correct: " << correct << endl;
|
|
cerr << "false positive: " << false_positive << endl;
|
|
cerr << "false negative: " << false_negative << endl;
|
|
cerr << endl;
|
|
|
|
ELList<RB_Alignment> position2D; EList<RB_Alignment> alignments;
|
|
size_t repeat_total = 0, repeat_aligned = 0;
|
|
const EList<TIndexOffU>& test_repeat_index = subSA_.getRepeatIndex();
|
|
size_t interval = 1;
|
|
if(test_repeat_index.size() >= 1000) {
|
|
interval = test_repeat_index.size() / 1000;
|
|
}
|
|
size_t total_alignments = 0, max_alignments = 0;
|
|
string query2, rc_query2;
|
|
string queryOriginal2, rc_queryOriginal2;
|
|
for(size_t i = 0; i < test_repeat_index.size(); i += interval) {
|
|
TIndexOffU saElt_idx = test_repeat_index[i];
|
|
TIndexOffU saElt_idx_end = (i + 1 < test_repeat_index.size() ? test_repeat_index[i+1] : subSA_.size());
|
|
TIndexOffU saElt_size = saElt_idx_end - saElt_idx;
|
|
TIndexOffU saElt = subSA_[saElt_idx];
|
|
query = getString(s_, saElt, rp.min_repeat_len);
|
|
query2 = query;
|
|
|
|
queryOriginal = getString(sOriginal_, saElt, rp.min_repeat_len);
|
|
queryOriginal2 = queryOriginal;
|
|
|
|
// introduce three mismatches into a query when the query is at lest 100-bp
|
|
if(rp.min_repeat_len >= 100) {
|
|
const size_t mid_pos1 = (size_t)(rp.min_repeat_len * 0.1);
|
|
if(query2[mid_pos1] == 'A') {
|
|
query2[mid_pos1] = 'C';
|
|
queryOriginal2[mid_pos1] ='C';
|
|
} else {
|
|
query2[mid_pos1] = 'A';
|
|
queryOriginal2[mid_pos1] = 'A';
|
|
}
|
|
const size_t mid_pos2 = (size_t)(rp.min_repeat_len * 0.5);
|
|
if(query2[mid_pos2] == 'C') {
|
|
query2[mid_pos2] = 'G';
|
|
queryOriginal2[mid_pos2] = 'G';
|
|
} else {
|
|
query2[mid_pos2] = 'C';
|
|
queryOriginal2[mid_pos1] = 'G';
|
|
}
|
|
const size_t mid_pos3 = (size_t)(rp.min_repeat_len * 0.9);
|
|
if(query2[mid_pos3] == 'G') {
|
|
query2[mid_pos3] = 'T';
|
|
queryOriginal2[mid_pos3] = 'T';
|
|
} else {
|
|
query2[mid_pos3] = 'G';
|
|
queryOriginal2[mid_pos3] = 'G';
|
|
}
|
|
}
|
|
|
|
repeat_total += saElt_size;
|
|
|
|
size_t found = 0;
|
|
size_t cur_alignments = 0;
|
|
if(kmer_table.isRepeat(query2, minimizers)) {
|
|
kmer_table.findAlignments(query2,
|
|
minimizers,
|
|
position2D,
|
|
alignments);
|
|
total_alignments += (alignments.size() * saElt_size);
|
|
cur_alignments = alignments.size();
|
|
TIndexOffU baseoff = 0;
|
|
for(size_t s = 0; s < seqs.size(); s++) {
|
|
int spos = seqs[s].find(query);
|
|
if(spos != string::npos) {
|
|
for(size_t a = 0; a < alignments.size(); a++) {
|
|
if(alignments[a].pos == baseoff + spos) {
|
|
found++;
|
|
}
|
|
}
|
|
}
|
|
baseoff += seqs[s].length();
|
|
}
|
|
|
|
assert_leq(found, 1);
|
|
}
|
|
|
|
rc_query = reverseComplement(queryOriginal);
|
|
rc_query2 = reverseComplement(queryOriginal2);
|
|
size_t rc_found = 0;
|
|
if(kmer_table.isRepeat(rc_query2, minimizers)) {
|
|
kmer_table.findAlignments(rc_query2,
|
|
minimizers,
|
|
position2D,
|
|
alignments);
|
|
total_alignments += (alignments.size() * saElt_size);
|
|
cur_alignments += alignments.size();
|
|
if(cur_alignments > max_alignments) {
|
|
max_alignments = cur_alignments;
|
|
}
|
|
|
|
TIndexOffU baseoff = 0;
|
|
for(size_t s = 0; s < seqs.size(); s++) {
|
|
int spos = seqs[s].find(rc_query);
|
|
if(spos != string::npos) {
|
|
for(size_t a = 0; a < alignments.size(); a++) {
|
|
if(alignments[a].pos == baseoff + spos) {
|
|
rc_found++;
|
|
}
|
|
}
|
|
}
|
|
baseoff += seqs[s].length();
|
|
}
|
|
assert_leq(rc_found, 1);
|
|
}
|
|
|
|
if(found + rc_found == 1) {
|
|
repeat_aligned += saElt_size;
|
|
}
|
|
}
|
|
|
|
cerr << "num repeats: " << repeat_total << endl;
|
|
cerr << "repeat aligned using minimizers: " << repeat_aligned << endl;
|
|
cerr << "average number of alignments: " << (float)total_alignments / repeat_total << endl;
|
|
cerr << "max alignment: " << max_alignments << endl;
|
|
cerr << endl;
|
|
}
|
|
|
|
cerr << "number of repeats is " << repeat_map_.size() << endl;
|
|
|
|
size_t total_rep_seq_len = 0, total_allele_seq_len = 0;
|
|
size_t i = 0;
|
|
for(map<size_t, RB_Repeat*>::iterator it = repeat_map_.begin(); it != repeat_map_.end(); it++, i++) {
|
|
RB_Repeat& repeat = *(it->second);
|
|
if(!repeat.satisfy(rp))
|
|
continue;
|
|
|
|
repeat.saveSeedExtension(rp,
|
|
s_,
|
|
coordHelper_,
|
|
i,
|
|
fp,
|
|
total_rep_seq_len,
|
|
total_allele_seq_len);
|
|
}
|
|
|
|
size_t total_qual_seeds = 0;
|
|
for(map<size_t, RB_Repeat*>::iterator it = repeat_map_.begin(); it != repeat_map_.end(); it++, i++) {
|
|
RB_Repeat& repeat = *(it->second);
|
|
EList<SeedExt>& seeds = repeat.seeds();
|
|
for(size_t i = 0; i < seeds.size(); i++) {
|
|
if(seeds[i].getLength() < rp.min_repeat_len)
|
|
continue;
|
|
total_qual_seeds++;
|
|
}
|
|
}
|
|
|
|
cerr << "total repeat sequence length: " << total_rep_seq_len << endl;
|
|
cerr << "total allele sequence length: " << total_allele_seq_len << endl;
|
|
cerr << "total number of seeds including those that are position-wise different, but sequence-wise identical: " << total_qual_seeds << endl;
|
|
cerr << endl;
|
|
fp << "total repeat sequence length: " << total_rep_seq_len << endl;
|
|
fp << "total allele sequence length: " << total_allele_seq_len << endl;
|
|
fp << "total number of seeds including those that are position-wise different, but sequence-wise identical: " << total_qual_seeds << endl;
|
|
fp.close();
|
|
|
|
delete repeat_manager;
|
|
repeat_manager = NULL;
|
|
|
|
// remove non-qualifying repeats
|
|
// and update repeat IDs for those remaining
|
|
{
|
|
map<size_t, RB_Repeat*> temp_repeat_map;
|
|
size_t i = 0;
|
|
for(map<size_t, RB_Repeat*>::iterator it = repeat_map_.begin(); it != repeat_map_.end(); it++) {
|
|
RB_Repeat* repeat = it->second;
|
|
if(!repeat->satisfy(rp)) {
|
|
delete repeat;
|
|
continue;
|
|
}
|
|
repeat->repeat_id(i);
|
|
temp_repeat_map[repeat->repeat_id()] = repeat;
|
|
i++;
|
|
}
|
|
repeat_map_ = temp_repeat_map;
|
|
}
|
|
|
|
const bool sanity_check = true;
|
|
if(sanity_check) {
|
|
EList<pair<size_t, size_t> > kmer_table;
|
|
string query;
|
|
string queryOriginal;
|
|
size_t total = 0, match = 0;
|
|
EList<TIndexOffU> positions;
|
|
const EList<TIndexOffU>& test_repeat_index = subSA_.getRepeatIndex();
|
|
size_t interval = 1;
|
|
if(test_repeat_index.size() >= 10000) {
|
|
interval = test_repeat_index.size() / 10000;
|
|
}
|
|
for(size_t i = 0; i < test_repeat_index.size(); i += interval) {
|
|
TIndexOffU saElt_idx = test_repeat_index[i];
|
|
TIndexOffU saElt_idx_end = (i + 1 < test_repeat_index.size() ? test_repeat_index[i+1] : subSA_.size());
|
|
positions.clear();
|
|
for(size_t j = saElt_idx; j < saElt_idx_end; j++) {
|
|
positions.push_back(subSA_[j]);
|
|
#ifndef NDEBUG
|
|
if(j > saElt_idx) {
|
|
TIndexOffU lcp_len = getLCP(s_,
|
|
coordHelper_,
|
|
positions[0],
|
|
positions.back(),
|
|
rp.min_repeat_len);
|
|
assert_eq(lcp_len, rp.min_repeat_len);
|
|
}
|
|
|
|
TIndexOffU saElt = subSA_[j];
|
|
TIndexOffU start = coordHelper_.getStart(saElt);
|
|
TIndexOffU start2 = coordHelper_.getStart(saElt + rp.min_repeat_len - 1);
|
|
assert_eq(start, start2);
|
|
#endif
|
|
}
|
|
|
|
TIndexOffU saElt = subSA_[saElt_idx];
|
|
size_t true_count = saElt_idx_end - saElt_idx;
|
|
getString(s_, saElt, rp.min_repeat_len, query);
|
|
getString(sOriginal_, saElt, rp.min_repeat_len, queryOriginal);
|
|
total++;
|
|
|
|
size_t count = 0, rc_count = 0;
|
|
string rc_query = reverse_complement(queryOriginal);
|
|
for(map<size_t, RB_Repeat*>::iterator it = repeat_map_.begin(); it != repeat_map_.end(); it++) {
|
|
RB_Repeat& repeat = *(it->second);
|
|
int pos = repeat.consensus().find(query);
|
|
if(pos != string::npos) {
|
|
for(size_t s = 0; s < repeat.seeds().size(); s++) {
|
|
SeedExt& seed = repeat.seeds()[s];
|
|
string seq;
|
|
seed.getExtendedSeedSequence(s_, seq);
|
|
if(seq.find(query) != string::npos)
|
|
count++;
|
|
}
|
|
}
|
|
pos = repeat.consensus().find(rc_query);
|
|
if(pos != string::npos) {
|
|
for(size_t s = 0; s < repeat.seeds().size(); s++) {
|
|
SeedExt& seed = repeat.seeds()[s];
|
|
string seq;
|
|
seed.getExtendedSeedSequence(s_, seq);
|
|
if(seq.find(rc_query) != string::npos)
|
|
rc_count++;
|
|
}
|
|
}
|
|
}
|
|
|
|
if(count == true_count || rc_count == true_count) {
|
|
match++;
|
|
} else if(total - match <= 10) {
|
|
cerr << " query: " << query << endl;
|
|
cerr << "rc_query: " << rc_query << endl;
|
|
cerr << "true count: " << true_count << endl;
|
|
cerr << "found count: " << count << endl;
|
|
cerr << "rc found count: " << rc_count << endl;
|
|
cerr << endl;
|
|
}
|
|
}
|
|
|
|
cerr << "RepeatBuilder: sanity check: " << match << " passed (out of " << total << ")" << endl << endl;
|
|
}
|
|
}
|
|
|
|
template<typename TStr>
|
|
void RepeatBuilder<TStr>::writeHaploType(const EList<SeedHP>& haplo_lists,
|
|
const EList<SeedExt>& seeds,
|
|
TIndexOffU& hapl_id_base,
|
|
const string& seq_name,
|
|
ostream &fp)
|
|
{
|
|
if(haplo_lists.size() == 0) {
|
|
return;
|
|
}
|
|
|
|
for(size_t i = 0; i < haplo_lists.size(); i++) {
|
|
const SeedHP &haploType = haplo_lists[i];
|
|
|
|
fp << "rpht" << hapl_id_base++;
|
|
fp << "\t" << seq_name;
|
|
fp << "\t" << haploType.range.first;
|
|
fp << "\t" << haploType.range.second;
|
|
fp << "\t";
|
|
|
|
assert_gt(haploType.snpIDs.size(), 0);
|
|
|
|
for (size_t j = 0; j < haploType.snpIDs.size(); j++) {
|
|
if(j) {
|
|
fp << ",";
|
|
}
|
|
fp << haploType.snpIDs[j];
|
|
}
|
|
fp << endl;
|
|
}
|
|
}
|
|
|
|
template<typename TStr>
|
|
void RepeatBuilder<TStr>::writeAllele(TIndexOffU grp_id,
|
|
TIndexOffU allele_id,
|
|
Range range,
|
|
const string& seq_name,
|
|
TIndexOffU baseoff,
|
|
const EList<SeedExt>& seeds,
|
|
ostream &fp)
|
|
{
|
|
// >rpt_name*0\trep\trep_pos\trep_len\tpos_count\t0
|
|
// chr_name:pos:direction chr_name:pos:direction
|
|
//
|
|
// >rep1*0 rep 0 100 470 0
|
|
// 22:112123123:+ 22:1232131113:+
|
|
//
|
|
size_t snp_size = seeds[range.first].snps.size();
|
|
size_t pos_size = range.second - range.first;
|
|
fp << ">";
|
|
fp << "rpt_" << grp_id << "*" << allele_id;
|
|
fp << "\t" << seq_name;
|
|
fp << "\t" << baseoff + seeds[range.first].consensus_pos.first;
|
|
fp << "\t" << seeds[range.first].consensus_pos.second - seeds[range.first].consensus_pos.first;
|
|
fp << "\t" << pos_size;
|
|
fp << "\t" << snp_size;
|
|
|
|
fp << "\t";
|
|
for(size_t i = 0; i < snp_size; i++) {
|
|
if(i > 0) {fp << ",";}
|
|
fp << "rps" << seeds[range.first].snps[i]->id;
|
|
}
|
|
fp << endl;
|
|
|
|
// print positions
|
|
for(size_t i = 0; i < pos_size; i++) {
|
|
if(i > 0 && (i % 10 == 0)) {
|
|
fp << endl;
|
|
}
|
|
|
|
if(i % 10 != 0) {
|
|
fp << " ";
|
|
}
|
|
string chr_name;
|
|
TIndexOffU pos_in_chr;
|
|
TIndexOffU joinedOff = seeds[range.first + i].pos.first;
|
|
|
|
bool fw = true;
|
|
if(joinedOff < forward_length_) {
|
|
fw = true;
|
|
} else {
|
|
fw = false;
|
|
joinedOff = s_.length() - joinedOff - seeds[range.first].getLength();
|
|
}
|
|
|
|
coordHelper_.getGenomeCoord(joinedOff, chr_name, pos_in_chr);
|
|
|
|
char direction = fw ? '+' : '-';
|
|
fp << chr_name << ":" << pos_in_chr << ":" << direction;
|
|
}
|
|
|
|
fp << endl;
|
|
}
|
|
|
|
template<typename TStr>
|
|
void RepeatBuilder<TStr>::writeSNPs(ostream& fp,
|
|
const string& rep_chr_name,
|
|
const ESet<Edit>& editset)
|
|
{
|
|
for(size_t i = 0; i < editset.size(); i++) {
|
|
const Edit& ed = editset[i];
|
|
|
|
if(ed.isMismatch()) {
|
|
fp << "rps" << ed.snpID;
|
|
fp << "\t" << "single";
|
|
fp << "\t" << rep_chr_name;
|
|
fp << "\t" << ed.pos;
|
|
fp << "\t" << ed.qchr;
|
|
fp << endl;
|
|
} else {
|
|
assert(false);
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
template<typename TStr>
|
|
void RepeatBuilder<TStr>::saveFile(const RepeatParameter& rp)
|
|
{
|
|
saveRepeats(rp);
|
|
}
|
|
|
|
bool RB_RepeatManager::checkRedundant(const RepeatParameter& rp,
|
|
const map<size_t, RB_Repeat*>& repeat_map,
|
|
const EList<TIndexOffU>& positions,
|
|
EList<size_t>& to_remove) const
|
|
{
|
|
to_remove.clear();
|
|
bool replace = false;
|
|
for(size_t i = 0; i < positions.size(); i++) {
|
|
TIndexOffU seed_pos = positions[i];
|
|
Range seed_range(seed_pos + rp.seed_len, 0);
|
|
map<Range, EList<size_t> >::const_iterator it = range_to_repeats_.upper_bound(seed_range);
|
|
if(it == range_to_repeats_.end())
|
|
continue;
|
|
|
|
for(map<Range, EList<size_t> >::const_reverse_iterator rit(it); rit != range_to_repeats_.rend(); rit++) {
|
|
Range repeat_range = rit->first;
|
|
if(repeat_range.first + rp.max_repeat_len <= seed_pos)
|
|
break;
|
|
|
|
const EList<size_t>& repeat_ids = rit->second;
|
|
assert_gt(repeat_ids.size(), 0);
|
|
for(size_t r = 0; r < repeat_ids.size(); r++) {
|
|
size_t repeat_id = repeat_ids[r];
|
|
size_t idx = to_remove.bsearchLoBound(repeat_id);
|
|
if(idx < to_remove.size() && to_remove[idx] == repeat_id)
|
|
continue;
|
|
|
|
bool overlap = seed_pos < repeat_range.second && seed_pos + rp.seed_len > repeat_range.first;
|
|
if(!overlap)
|
|
continue;
|
|
|
|
map<size_t, RB_Repeat*>::const_iterator it2 = repeat_map.find(repeat_id);
|
|
assert(it2 != repeat_map.end());
|
|
const RB_Repeat& repeat = *(it2->second);
|
|
const EList<RB_AlleleCoord>& allele_ranges = repeat.seed_ranges();
|
|
size_t num_contain = 0, num_overlap = 0, num_close = 0, num_overlap_bp = 0;
|
|
size_t p = 0, p2 = 0;
|
|
while(p < positions.size() && p2 < allele_ranges.size()) {
|
|
RB_AlleleCoord range;
|
|
range.left = positions[p];
|
|
range.right = positions[p] + rp.seed_len;
|
|
RB_AlleleCoord range2 = allele_ranges[p2];
|
|
if(range2.contain(range)) {
|
|
num_contain++;
|
|
num_overlap_bp += rp.seed_len;
|
|
} else {
|
|
TIndexOffU overlap = range2.overlap_len(range);
|
|
if(overlap > 0) {
|
|
num_overlap++;
|
|
num_overlap_bp += (range2.right - range.left);
|
|
} else if(range.right + 10 > range2.left && range2.right + 10 > range.left) {
|
|
num_close++;
|
|
}
|
|
}
|
|
if(range.right <= range2.right) p++;
|
|
else p2++;
|
|
}
|
|
|
|
// if the number of matches is >= 90% of positions in the smaller group
|
|
if((num_contain + num_overlap) * 10 + num_close * 8 >= min(positions.size(), allele_ranges.size()) * 9) {
|
|
if(positions.size() <= allele_ranges.size()) {
|
|
return true;
|
|
} else {
|
|
replace = true;
|
|
to_remove.push_back(repeat_id);
|
|
to_remove.sort();
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// DK - check this out
|
|
if(replace)
|
|
break;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
void RB_RepeatManager::addRepeat(const RB_Repeat* repeat)
|
|
{
|
|
const EList<RB_AlleleCoord>& allele_ranges = repeat->seed_ranges();
|
|
for(size_t i = 0; i < allele_ranges.size(); i++) {
|
|
Range allele_range(allele_ranges[i].left, allele_ranges[i].right);
|
|
addRepeat(allele_range, repeat->repeat_id());
|
|
}
|
|
}
|
|
|
|
void RB_RepeatManager::addRepeat(Range range, size_t repeat_id)
|
|
{
|
|
if(range_to_repeats_.find(range) == range_to_repeats_.end()) {
|
|
range_to_repeats_[range] = EList<size_t>();
|
|
}
|
|
EList<size_t>& repeat_ids = range_to_repeats_[range];
|
|
size_t idx = repeat_ids.bsearchLoBound(repeat_id);
|
|
if(idx < repeat_ids.size() && repeat_ids[idx] == repeat_id)
|
|
return;
|
|
repeat_ids.push_back(repeat_id);
|
|
repeat_ids.sort();
|
|
}
|
|
|
|
void RB_RepeatManager::removeRepeat(const RB_Repeat* repeat)
|
|
{
|
|
const EList<RB_AlleleCoord>& allele_ranges = repeat->seed_ranges();
|
|
for(size_t p = 0; p < allele_ranges.size(); p++) {
|
|
Range range(allele_ranges[p].left, allele_ranges[p].right);
|
|
removeRepeat(range, repeat->repeat_id());
|
|
}
|
|
}
|
|
|
|
void RB_RepeatManager::removeRepeat(Range range, size_t repeat_id)
|
|
{
|
|
EList<size_t>& repeat_ids = range_to_repeats_[range];
|
|
TIndexOffU idx = repeat_ids.bsearchLoBound(repeat_id);
|
|
if(idx < repeat_ids.size() && repeat_id == repeat_ids[idx]) {
|
|
repeat_ids.erase(idx);
|
|
if(repeat_ids.empty())
|
|
range_to_repeats_.erase(range);
|
|
}
|
|
}
|
|
|
|
void RB_RepeatManager::showInfo(const RepeatParameter& rp,
|
|
CoordHelper& coordHelper,
|
|
const map<size_t, RB_Repeat*>& repeat_map,
|
|
size_t num_case) const
|
|
{
|
|
size_t count = 0;
|
|
for(map<Range, EList<size_t> >::const_iterator it = range_to_repeats_.begin();
|
|
it != range_to_repeats_.end(); it++) {
|
|
const Range& range = it->first;
|
|
if(range.second - range.first < rp.min_repeat_len) continue;
|
|
map<Range, EList<size_t> >::const_iterator jt = it; jt++;
|
|
for(; jt != range_to_repeats_.end(); jt++) {
|
|
const Range& range2 = jt->first;
|
|
if(range2.second - range2.first < rp.min_repeat_len) continue;
|
|
if(range.second <= range2.first)
|
|
break;
|
|
if(count < num_case) {
|
|
cerr << "range (" << range.first << ", " << range.second << ") vs. range2 (";
|
|
cerr << range2.first << ", " << range2.second << ")" << endl;
|
|
for(size_t i = 0; i < it->second.size(); i++) {
|
|
cerr << "\t1 " << it->second[i] << endl;
|
|
const RB_Repeat* repeat = repeat_map.find(it->second[i])->second;
|
|
repeat->showInfo(rp, coordHelper);
|
|
}
|
|
for(size_t i = 0; i < jt->second.size(); i++) {
|
|
cerr << "\t2 " << jt->second[i] << endl;
|
|
const RB_Repeat* repeat = repeat_map.find(jt->second[i])->second;
|
|
repeat->showInfo(rp, coordHelper);
|
|
}
|
|
cerr << endl << endl;
|
|
}
|
|
count++;
|
|
}
|
|
}
|
|
cerr << "ShowInfo - count: " << count << endl;
|
|
}
|
|
|
|
template<typename TStr>
|
|
void RepeatBuilder<TStr>::reassignSeeds(const RepeatParameter& rp,
|
|
size_t repeat_bid, // repeat begin id
|
|
size_t repeat_eid, // repeat end id
|
|
EList<SeedExt>& seeds)
|
|
{
|
|
assert_lt(repeat_bid, repeat_eid);
|
|
EList<bool> updated;
|
|
updated.resizeExact(repeat_eid - repeat_bid);
|
|
updated.fillZero();
|
|
|
|
string seq;
|
|
for(size_t s = 0; s < seeds.size(); s++) {
|
|
SeedExt& seed = seeds[s];
|
|
size_t max_repeat_id = repeat_bid;
|
|
size_t max_ext_len = 0;
|
|
for(size_t i = repeat_bid; i < repeat_eid; i++) {
|
|
map<size_t, RB_Repeat*>::iterator it = repeat_map_.find(i);
|
|
assert(it != repeat_map_.end());
|
|
const RB_Repeat& repeat = *(it->second);
|
|
const string& consensus = repeat.consensus();
|
|
const size_t ext_len = (consensus.length() - rp.seed_len) / 2;
|
|
|
|
if(seed.bound.first + ext_len > seed.pos.first)
|
|
continue;
|
|
if(seed.pos.second + ext_len > seed.bound.second)
|
|
continue;
|
|
|
|
getString(s_, seed.pos.first - ext_len, rp.seed_len + ext_len * 2, seq);
|
|
assert_eq(consensus.length(), seq.length());
|
|
size_t tmp_ext_len = 0;
|
|
for(size_t j = 0; j < ext_len; j++, tmp_ext_len++) {
|
|
if(seq[ext_len - j - 1] != consensus[ext_len - j - 1] ||
|
|
seq[ext_len + rp.seed_len + j] != consensus[ext_len + rp.seed_len + j]) {
|
|
break;
|
|
}
|
|
}
|
|
|
|
if(tmp_ext_len > max_ext_len) {
|
|
max_repeat_id = i;
|
|
max_ext_len = tmp_ext_len;
|
|
}
|
|
}
|
|
if(rp.seed_len + max_ext_len * 2 >= rp.min_repeat_len) {
|
|
seed.pos.first -= max_ext_len;
|
|
seed.pos.second += max_ext_len;
|
|
map<size_t, RB_Repeat*>::iterator it = repeat_map_.find(max_repeat_id);
|
|
assert(it != repeat_map_.end());
|
|
RB_Repeat& repeat = *(it->second);
|
|
const string& consensus = repeat.consensus();
|
|
const size_t ext_len = (consensus.length() - rp.seed_len) / 2;
|
|
assert_leq(max_ext_len, ext_len);
|
|
seed.consensus_pos.first = ext_len - max_ext_len;
|
|
seed.consensus_pos.second = consensus.length() - (ext_len - max_ext_len);
|
|
assert_leq(seed.consensus_pos.second - seed.consensus_pos.first, consensus.length());
|
|
repeat.addSeed(seed);
|
|
updated[max_repeat_id - repeat_bid] = true;
|
|
}
|
|
}
|
|
|
|
for(size_t i = repeat_bid; i < repeat_eid; i++) {
|
|
if(!updated[i - repeat_bid])
|
|
continue;
|
|
map<size_t, RB_Repeat*>::iterator it = repeat_map_.find(i);
|
|
assert(it != repeat_map_.end());
|
|
RB_Repeat& repeat = *(it->second);
|
|
repeat.update();
|
|
}
|
|
}
|
|
|
|
/**
|
|
* TODO
|
|
* @brief
|
|
*
|
|
* @param rpt_seq
|
|
* @param rpt_range
|
|
*/
|
|
template<typename TStr>
|
|
void RepeatBuilder<TStr>::addRepeatGroup(const RepeatParameter& rp,
|
|
size_t& repeat_id,
|
|
RB_RepeatManager& repeat_manager,
|
|
const RB_RepeatBase& repeatBase,
|
|
ostream& fp)
|
|
{
|
|
RB_Repeat* repeat = new RB_Repeat;
|
|
repeat->repeat_id(repeat_id);
|
|
repeat->init(rp,
|
|
s_,
|
|
coordHelper_,
|
|
subSA_,
|
|
repeatBase);
|
|
|
|
assert(repeat_map_.find(repeat->repeat_id()) == repeat_map_.end());
|
|
repeat_map_[repeat->repeat_id()] = repeat;
|
|
repeat_id++;
|
|
}
|
|
|
|
template<typename TStr>
|
|
bool RepeatBuilder<TStr>::checkSequenceMergeable(const string& ref,
|
|
const string& read,
|
|
EList<Edit>& edits,
|
|
Coord& coord,
|
|
TIndexOffU rpt_len,
|
|
TIndexOffU max_edit)
|
|
{
|
|
size_t max_matchlen = 0;
|
|
EList<Edit> ed;
|
|
|
|
string pad;
|
|
swaligner_.makePadString(ref, read, pad, 5);
|
|
|
|
string ref2 = pad + ref;
|
|
string read2 = pad + read;
|
|
|
|
swaligner_.alignStrings(ref2, read2, ed, coord);
|
|
|
|
// match should start from pad string
|
|
if(coord.off() != 0) {
|
|
return false;
|
|
}
|
|
|
|
// no edits on pad string
|
|
if(ed.size() > 0 && ed[0].pos < pad.length()) {
|
|
return false;
|
|
}
|
|
|
|
size_t left = pad.length();
|
|
size_t right = left + read.length();
|
|
|
|
edits.clear();
|
|
edits.reserveExact(ed.size());
|
|
for(size_t i = 0; i < ed.size(); i++) {
|
|
if(ed[i].pos >= left && ed[i].pos <= right) {
|
|
edits.push_back(ed[i]);
|
|
edits.back().pos -= left;
|
|
}
|
|
}
|
|
|
|
max_matchlen = getMaxMatchLen(edits, read.length());
|
|
|
|
#ifdef DEBUGLOG
|
|
{
|
|
cerr << "After pad removed" << endl;
|
|
BTDnaString btread;
|
|
btread.install(read.c_str(), true);
|
|
Edit::print(cerr, edits); cerr << endl;
|
|
Edit::printQAlign(cerr, btread, edits);
|
|
}
|
|
#endif
|
|
|
|
return (max_matchlen >= rpt_len);
|
|
}
|
|
|
|
|
|
template<typename TStr>
|
|
void RepeatBuilder<TStr>::saveRepeats(const RepeatParameter &rp)
|
|
{
|
|
ios_base::openmode mode = ios_base::out;
|
|
if(rp.append_result) {
|
|
mode |= ios_base::app;
|
|
} else {
|
|
mode |= ios_base::trunc;
|
|
}
|
|
// Generate SNPs
|
|
size_t i = 0;
|
|
for(map<size_t, RB_Repeat*>::iterator it = repeat_map_.begin(); it != repeat_map_.end(); it++, i++) {
|
|
RB_Repeat& repeat = *(it->second);
|
|
if(!repeat.satisfy(rp))
|
|
continue;
|
|
|
|
// for each repeats
|
|
repeat.generateSNPs(rp, s_, i);
|
|
}
|
|
|
|
// save snp, consensus sequenuce, info
|
|
string snp_fname = filename_ + ".rep.snp";
|
|
string info_fname = filename_ + ".rep.info";
|
|
string hapl_fname = filename_ + ".rep.haplotype";
|
|
|
|
ofstream snp_fp(snp_fname.c_str(), mode);
|
|
ofstream info_fp(info_fname.c_str(), mode);
|
|
ofstream hapl_fp(hapl_fname.c_str(), mode);
|
|
|
|
const string repName = "rep" + to_string(rp.min_repeat_len) + "-" + to_string(rp.max_repeat_len);
|
|
|
|
i = 0;
|
|
TIndexOffU consensus_baseoff = 0;
|
|
TIndexOffU snp_id_base = 0;
|
|
TIndexOffU hapl_id_base = 0;
|
|
for(map<size_t, RB_Repeat*>::iterator it = repeat_map_.begin(); it != repeat_map_.end(); it++, i++) {
|
|
RB_Repeat& repeat = *(it->second);
|
|
if(!repeat.satisfy(rp))
|
|
continue;
|
|
|
|
// for each repeats
|
|
repeat.saveSNPs(snp_fp, i, snp_id_base);
|
|
saveAlleles(rp,
|
|
repName,
|
|
repeat,
|
|
info_fp,
|
|
hapl_fp,
|
|
i,
|
|
hapl_id_base,
|
|
consensus_baseoff);
|
|
|
|
consensus_baseoff += repeat.consensus().length();
|
|
}
|
|
|
|
snp_fp.close();
|
|
info_fp.close();
|
|
hapl_fp.close();
|
|
|
|
// save all consensus sequence
|
|
saveConsensus(rp, repName);
|
|
}
|
|
|
|
template<typename TStr>
|
|
void RepeatBuilder<TStr>::saveConsensus(const RepeatParameter &rp,
|
|
const string& repName) {
|
|
ios_base::openmode mode = ios_base::out;
|
|
if(rp.append_result) {
|
|
mode |= ios_base::app;
|
|
} else {
|
|
mode |= ios_base::trunc;
|
|
}
|
|
|
|
string fa_fname = filename_ + ".rep.fa";
|
|
ofstream fa_fp(fa_fname.c_str(), mode);
|
|
|
|
fa_fp << ">" << repName << endl;
|
|
|
|
size_t oskip = 0;
|
|
for(map<size_t, RB_Repeat*>::iterator it = repeat_map_.begin(); it != repeat_map_.end(); it++) {
|
|
RB_Repeat& repeat = *(it->second);
|
|
if(!repeat.satisfy(rp))
|
|
continue;
|
|
|
|
// for each repeats
|
|
const string& constr = repeat.consensus();
|
|
size_t si = 0;
|
|
size_t constr_len = constr.length();
|
|
while(si < constr_len) {
|
|
size_t out_len = std::min((size_t)(output_width - oskip), (size_t)(constr_len - si));
|
|
fa_fp << constr.substr(si, out_len);
|
|
|
|
if((oskip + out_len) == output_width) {
|
|
fa_fp << endl;
|
|
oskip = 0;
|
|
} else {
|
|
// last line
|
|
oskip = oskip + out_len;
|
|
}
|
|
|
|
si += out_len;
|
|
}
|
|
}
|
|
if(oskip) {
|
|
fa_fp << endl;
|
|
}
|
|
|
|
fa_fp.close();
|
|
}
|
|
|
|
template<typename TStr>
|
|
void RepeatBuilder<TStr>::saveAlleles(
|
|
const RepeatParameter& rp,
|
|
const string& repName,
|
|
RB_Repeat& repeat,
|
|
ofstream& fp,
|
|
ofstream& hapl_fp,
|
|
TIndexOffU grp_id,
|
|
TIndexOffU& hapl_id_base,
|
|
TIndexOffU baseoff)
|
|
{
|
|
const EList<SeedExt>& seeds = repeat.seeds();
|
|
EList<SeedHP> haplo_lists;
|
|
Range range(0, seeds.size());
|
|
|
|
int allele_id = 0;
|
|
|
|
for(size_t sb = range.first; sb < range.second;) {
|
|
size_t se = sb + 1;
|
|
for(; se < range.second; se++) {
|
|
if(!(SeedExt::isSameConsensus(seeds[sb], seeds[se])
|
|
&& SeedExt::isSameSNPs(seeds[sb], seeds[se])
|
|
&& seeds[sb].aligned == seeds[se].aligned)) {
|
|
break;
|
|
}
|
|
}
|
|
|
|
if(!seeds[sb].aligned) {
|
|
sb = se;
|
|
continue;
|
|
}
|
|
|
|
if(seeds[sb].getLength() < rp.min_repeat_len) {
|
|
sb = se;
|
|
continue;
|
|
}
|
|
|
|
// [sb, se) are same alleles
|
|
writeAllele(grp_id, allele_id, Range(sb, se),
|
|
repName, baseoff, seeds, fp);
|
|
generateHaploType(Range(sb, se), seeds, haplo_lists);
|
|
|
|
allele_id++;
|
|
sb = se;
|
|
}
|
|
|
|
// sort HaploType List by pos
|
|
haplo_lists.sort();
|
|
writeHaploType(haplo_lists, seeds, hapl_id_base, repName, hapl_fp);
|
|
}
|
|
|
|
template<typename TStr>
|
|
void RepeatBuilder<TStr>::generateHaploType(Range range, const EList<SeedExt> &seeds, EList<SeedHP> &haplo_list)
|
|
{
|
|
const EList<SeedSNP *>& snps = seeds[range.first].snps;
|
|
if(snps.size() == 0)
|
|
return;
|
|
|
|
// right-most position
|
|
TIndexOffU max_right_pos = seeds[range.first].consensus_pos.second - 1;
|
|
|
|
#ifndef NDEBUG
|
|
for(size_t i = 0; i < snps.size(); i++) {
|
|
assert_leq(snps[i]->pos, max_right_pos);
|
|
}
|
|
#endif
|
|
|
|
// create haplotypes of at least 16 bp long to prevent combinations of SNPs
|
|
// break a list of SNPs into several haplotypes if a SNP is far from the next SNP in the list
|
|
const TIndexOffU min_ht_len = 16;
|
|
size_t eb = 0, ee = 1;
|
|
while(ee < snps.size() + 1) {
|
|
if(ee == snps.size() ||
|
|
snps[eb]->pos + (min_ht_len << 1) < snps[ee]->pos) {
|
|
TIndexOffU left_pos = snps[eb]->pos;
|
|
TIndexOffU right_pos = snps[ee-1]->pos;
|
|
|
|
if(snps[ee-1]->type == EDIT_TYPE_READ_GAP) {
|
|
right_pos += snps[ee-1]->len;
|
|
}
|
|
if(left_pos + min_ht_len - 1 > right_pos) {
|
|
right_pos = left_pos + min_ht_len - 1;
|
|
}
|
|
right_pos = min<TIndexOffU>(max_right_pos, right_pos);
|
|
assert_leq(left_pos, right_pos);
|
|
|
|
SeedHP seedHP;
|
|
|
|
seedHP.range.first = left_pos;
|
|
seedHP.range.second = right_pos;
|
|
for(size_t i = eb; i < ee; i++) {
|
|
string snp_ids = "rps" + to_string(snps[i]->id);
|
|
seedHP.snpIDs.push_back(snp_ids);
|
|
}
|
|
|
|
// Add to haplo_list
|
|
bool found = false;
|
|
for(size_t i = 0; i < haplo_list.size(); i++) {
|
|
if(haplo_list[i] == seedHP) {
|
|
found = true;
|
|
break;
|
|
}
|
|
}
|
|
if(!found) {
|
|
// Add
|
|
haplo_list.push_back(seedHP);
|
|
}
|
|
|
|
eb = ee;
|
|
}
|
|
ee++;
|
|
}
|
|
}
|
|
|
|
|
|
void RB_SubSA::init(TIndexOffU sa_size,
|
|
TIndexOffU seed_len,
|
|
TIndexOffU seed_count)
|
|
{
|
|
assert_gt(sa_size, 1);
|
|
sa_size_ = sa_size;
|
|
assert_gt(seed_len, 0);
|
|
seed_len_ = seed_len;
|
|
assert_gt(seed_count, 1);
|
|
seed_count_ = seed_count;
|
|
temp_suffixes_.clear();
|
|
|
|
repeat_list_.clear();
|
|
repeat_index_.clear();
|
|
}
|
|
|
|
template<typename TStr>
|
|
void RB_SubSA::push_back(const TStr& s,
|
|
CoordHelper& coordHelper,
|
|
TIndexOffU saElt,
|
|
bool lastInput)
|
|
{
|
|
if(saElt + seed_len() <= coordHelper.getEnd(saElt)) {
|
|
if(seed_count_ == 1) {
|
|
repeat_list_.push_back(saElt);
|
|
} else {
|
|
assert_gt(seed_count_, 1);
|
|
|
|
if(temp_suffixes_.empty()) {
|
|
temp_suffixes_.push_back(saElt);
|
|
return;
|
|
}
|
|
|
|
TIndexOffU prev_saElt = temp_suffixes_.back();
|
|
|
|
// calculate common prefix length between two text.
|
|
// text1 is started from prev_saElt and text2 is started from saElt
|
|
bool same = isSameSequenceUpto(s,
|
|
coordHelper,
|
|
prev_saElt,
|
|
saElt,
|
|
seed_len_);
|
|
|
|
if(same) {
|
|
temp_suffixes_.push_back(saElt);
|
|
}
|
|
|
|
if(!same || lastInput) {
|
|
if(temp_suffixes_.size() >= seed_count_) {
|
|
repeat_index_.push_back(repeat_list_.size());
|
|
temp_suffixes_.sort();
|
|
for(size_t pi = 0; pi < temp_suffixes_.size(); pi++) {
|
|
repeat_list_.push_back(temp_suffixes_[pi]);
|
|
}
|
|
}
|
|
temp_suffixes_.clear();
|
|
if(!lastInput) {
|
|
temp_suffixes_.push_back(saElt);
|
|
} else {
|
|
temp_suffixes_.nullify();
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if(lastInput) {
|
|
size_t bit = sizeof(uint32_t) * 8;
|
|
size_t num = (repeat_list_.size() + bit - 1) / bit;
|
|
done_.resizeExact(num);
|
|
done_.fillZero();
|
|
|
|
#ifndef NDEBUG
|
|
string prev_seq = "";
|
|
for(size_t i = 0; i < repeat_index_.size(); i++) {
|
|
TIndexOffU saBegin = repeat_index_[i];
|
|
TIndexOffU saEnd = (i + 1 < repeat_index_.size() ? repeat_index_[i+1] : repeat_list_.size());
|
|
string seq = getString(s, repeat_list_[saBegin], seed_len());
|
|
for(size_t j = saBegin + 1; j < saEnd; j++) {
|
|
string tmp_seq = getString(s, repeat_list_[j], seed_len());
|
|
assert_eq(seq, tmp_seq);
|
|
}
|
|
if(prev_seq != "" ) {
|
|
assert_lt(prev_seq, seq);
|
|
}
|
|
prev_seq = seq;
|
|
}
|
|
#endif
|
|
}
|
|
}
|
|
|
|
template<typename TStr>
|
|
Range RB_SubSA::find(const TStr& s,
|
|
const string& seq) const
|
|
{
|
|
TIndexOffU i = find_repeat_idx(s, seq);
|
|
if(i >= repeat_index_.size())
|
|
return Range(0, 0);
|
|
|
|
Range range(repeat_index_[i],
|
|
i + 1 < repeat_index_.size() ? repeat_index_[i+1] : repeat_list_.size());
|
|
return range;
|
|
}
|
|
|
|
template<typename TStr>
|
|
TIndexOffU RB_SubSA::find_repeat_idx(const TStr& s,
|
|
const string& seq) const
|
|
{
|
|
assert_eq(seq.length(), seed_len_);
|
|
|
|
string temp;
|
|
size_t l = 0, r = repeat_index_.size();
|
|
while(l < r) {
|
|
size_t m = (r + l) >> 1;
|
|
TIndexOffU saElt = repeat_list_[repeat_index_[m]];
|
|
getString(s, saElt, seed_len_, temp);
|
|
if(seq == temp) {
|
|
return m;
|
|
} else if(seq < temp) {
|
|
r = m;
|
|
} else {
|
|
assert(seq > temp);
|
|
l = m + 1;
|
|
}
|
|
}
|
|
return repeat_index_.size();
|
|
}
|
|
|
|
void RB_SubSA::setDone(TIndexOffU off, TIndexOffU len)
|
|
{
|
|
assert_leq(off + len, repeat_list_.size());
|
|
const TIndexOffU bit = sizeof(uint32_t) * 8;
|
|
for(TIndexOffU i = off; i < off + len; i++) {
|
|
TIndexOffU quotient = i / bit;
|
|
TIndexOffU remainder = i % bit;
|
|
assert_lt(quotient, done_.size());
|
|
uint32_t num = done_[quotient];
|
|
num = num | (1 << remainder);
|
|
done_[quotient] = num;
|
|
}
|
|
}
|
|
|
|
bool RB_SubSA::isDone(TIndexOffU off, TIndexOffU len) const
|
|
{
|
|
assert_leq(off + len, repeat_list_.size());
|
|
const TIndexOffU bit = sizeof(uint32_t) * 8;
|
|
for(TIndexOffU i = off; i < off + len; i++) {
|
|
TIndexOffU quotient = i / bit;
|
|
TIndexOffU remainder = i % bit;
|
|
assert_lt(quotient, done_.size());
|
|
uint32_t num = done_[quotient];
|
|
num = num & (1 << remainder);
|
|
if(num == 0)
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
template<typename TStr>
|
|
void RB_SubSA::buildRepeatBase(const TStr& s,
|
|
CoordHelper& coordHelper,
|
|
const size_t max_len,
|
|
EList<RB_RepeatBase>& repeatBases)
|
|
{
|
|
if(repeat_index_.empty())
|
|
return;
|
|
|
|
done_.fillZero();
|
|
|
|
EList<uint8_t> senseDominant;
|
|
senseDominant.resizeExact(repeat_index_.size());
|
|
senseDominant.fillZero();
|
|
|
|
EList<size_t> repeatStack;
|
|
EList<pair<TIndexOffU, TIndexOffU> > size_table;
|
|
size_table.reserveExact(repeat_index_.size() / 2 + 1);
|
|
for(size_t i = 0; i < repeat_index_.size(); i++) {
|
|
TIndexOffU begin = repeat_index_[i];
|
|
TIndexOffU end = (i + 1 < repeat_index_.size() ? repeat_index_[i+1] : repeat_list_.size());
|
|
assert_lt(begin, end);
|
|
EList<TIndexOffU> positions; positions.reserveExact(end - begin);
|
|
for(size_t j = begin; j < end; j++) positions.push_back(repeat_list_[j]);
|
|
|
|
if(!isSenseDominant(coordHelper, positions, seed_len_) && !threeN)
|
|
continue;
|
|
|
|
senseDominant[i] = 1;
|
|
size_table.expand();
|
|
size_table.back().first = end - begin;
|
|
size_table.back().second = i;
|
|
}
|
|
size_table.sort();
|
|
|
|
size_t bundle_count = 0;
|
|
string tmp_str;
|
|
EList<Range> tmp_ranges; tmp_ranges.resizeExact(4);
|
|
EList<pair<size_t, size_t> > tmp_sort_ranges; tmp_sort_ranges.resizeExact(4);
|
|
for(int64_t i = (int64_t)size_table.size() - 1; i >= 0; i--) {
|
|
TIndexOffU idx = size_table[i].second;
|
|
ASSERT_ONLY(TIndexOffU num = size_table[i].first);
|
|
assert_lt(idx, repeat_index_.size());
|
|
#ifndef NDEBUG
|
|
if(idx + 1 < repeat_index_.size()) {
|
|
assert_eq(repeat_index_[idx] + num, repeat_index_[idx+1]);
|
|
} else {
|
|
assert_eq(repeat_index_[idx] + num, repeat_list_.size());
|
|
}
|
|
#endif
|
|
TIndexOffU saBegin = repeat_index_[idx];
|
|
if(isDone(saBegin)) {
|
|
assert(isDone(saBegin, num));
|
|
continue;
|
|
}
|
|
|
|
ASSERT_ONLY(size_t rb_done = 0);
|
|
assert_lt(saBegin, repeat_list_.size());
|
|
repeatStack.push_back(idx);
|
|
while(!repeatStack.empty()) {
|
|
TIndexOffU idx = repeatStack.back();
|
|
assert(senseDominant[idx]);
|
|
repeatStack.pop_back();
|
|
assert_lt(idx, repeat_index_.size());
|
|
TIndexOffU saBegin = repeat_index_[idx];
|
|
TIndexOffU saEnd = (idx + 1 < repeat_index_.size() ? repeat_index_[idx + 1] : repeat_list_.size());
|
|
if(isDone(saBegin)) {
|
|
assert(isDone(saBegin, saEnd - saBegin));
|
|
continue;
|
|
}
|
|
|
|
TIndexOffU saElt = repeat_list_[saBegin];
|
|
size_t ri = repeatBases.size();
|
|
repeatBases.expand();
|
|
repeatBases.back().seq = getString(s, saElt, seed_len_);
|
|
repeatBases.back().nodes.clear();
|
|
repeatBases.back().nodes.push_back(idx);
|
|
ASSERT_ONLY(rb_done++);
|
|
setDone(saBegin, saEnd - saBegin);
|
|
bool left = true;
|
|
while(repeatBases[ri].seq.length() <= max_len) {
|
|
if(left) {
|
|
tmp_str = "N";
|
|
tmp_str += repeatBases[ri].seq.substr(0, seed_len_ - 1);
|
|
} else {
|
|
tmp_str = repeatBases[ri].seq.substr(repeatBases[ri].seq.length() - seed_len_ + 1, seed_len_ - 1);
|
|
tmp_str.push_back('N');
|
|
}
|
|
assert_eq(tmp_str.length(), seed_len_);
|
|
|
|
for(size_t c = 0; c < 4; c++) {
|
|
if(left) tmp_str[0] = "ACGT"[c];
|
|
else tmp_str.back() = "ACGT"[c];
|
|
assert_eq(tmp_str.length(), seed_len_);
|
|
TIndexOffU idx = find_repeat_idx(s, tmp_str);
|
|
size_t num = 0;
|
|
if(idx < repeat_index_.size()) {
|
|
if(idx + 1 < repeat_index_.size()) {
|
|
num = repeat_index_[idx+1] - repeat_index_[idx];
|
|
} else {
|
|
num = repeat_list_.size() - repeat_index_[idx];
|
|
}
|
|
}
|
|
tmp_ranges[c].first = idx;
|
|
tmp_ranges[c].second = num;
|
|
assert(num == 0 || num >= seed_count_);
|
|
tmp_sort_ranges[c].first = num;
|
|
tmp_sort_ranges[c].second = c;
|
|
if(idx == repeat_index_.size() ||
|
|
isDone(repeat_index_[idx]) ||
|
|
!senseDominant[idx]) {
|
|
#ifndef NDEBUG
|
|
if(idx < repeat_index_.size()) {
|
|
assert(isDone(repeat_index_[idx], num) ||
|
|
!senseDominant[idx]);
|
|
}
|
|
#endif
|
|
tmp_sort_ranges[c].first = 0;
|
|
tmp_ranges[c].second = 0;
|
|
}
|
|
}
|
|
tmp_sort_ranges.sort();
|
|
if(tmp_sort_ranges[3].first < seed_count_) {
|
|
if(left) {
|
|
left = false;
|
|
continue;
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
|
|
for(size_t cc = 0; cc < 3; cc++) {
|
|
assert_leq(tmp_sort_ranges[cc].first, tmp_sort_ranges[cc+1].first);
|
|
if(tmp_sort_ranges[cc].first < seed_count_)
|
|
continue;
|
|
|
|
size_t c = tmp_sort_ranges[cc].second;
|
|
repeatStack.push_back(tmp_ranges[c].first);
|
|
}
|
|
|
|
size_t c = tmp_sort_ranges[3].second;
|
|
if(repeatBases[ri].seq.length() >= max_len) {
|
|
assert_eq(repeatBases[ri].seq.length(), max_len);
|
|
TIndexOffU idx = tmp_ranges[c].first;
|
|
assert(!isDone(repeat_index_[idx]));
|
|
repeatStack.push_back(idx);
|
|
if(left) {
|
|
left = false;
|
|
continue;
|
|
} else {
|
|
break;
|
|
}
|
|
} else {
|
|
TIndexOffU idx = tmp_ranges[c].first;
|
|
TIndexOffU num = tmp_ranges[c].second;
|
|
setDone(repeat_index_[idx], num);
|
|
if(left) {
|
|
repeatBases[ri].seq.insert(0, 1, "ACGT"[c]);
|
|
repeatBases[ri].nodes.insert(idx, 0);
|
|
} else {
|
|
repeatBases[ri].seq.push_back("ACGT"[c]);
|
|
repeatBases[ri].nodes.push_back(idx);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
assert_gt(rb_done, 0);
|
|
bundle_count++;
|
|
}
|
|
|
|
cerr << "Bundle count: " << bundle_count << endl;
|
|
|
|
#ifndef NDEBUG
|
|
{
|
|
set<TIndexOffU> idx_set;
|
|
for(size_t i = 0; i < repeatBases.size(); i++) {
|
|
const RB_RepeatBase& repeatBase = repeatBases[i];
|
|
for(size_t j = 0; j < repeatBase.nodes.size(); j++) {
|
|
assert(idx_set.find(repeatBase.nodes[j]) == idx_set.end());
|
|
idx_set.insert(repeatBase.nodes[j]);
|
|
}
|
|
}
|
|
}
|
|
#endif
|
|
|
|
#if 0
|
|
{
|
|
EList<pair<size_t, size_t> > kmer_table;
|
|
string query;
|
|
size_t total = 0, match = 0;
|
|
size_t interval = 1;
|
|
if(repeat_index_.size() >= 10000) {
|
|
interval = repeat_index_.size() / 10000;
|
|
}
|
|
EList<TIndexOffU> positions;
|
|
for(size_t i = 0; i < repeat_index_.size(); i += interval) {
|
|
TIndexOffU saElt_idx = repeat_index_[i];
|
|
TIndexOffU saElt_idx_end = (i + 1 < repeat_index_.size() ? repeat_index_[i+1] : repeat_list_.size());
|
|
positions.clear();
|
|
for(size_t j = saElt_idx; j < saElt_idx_end; j++) {
|
|
positions.push_back(repeat_list_[j]);
|
|
#ifndef NDEBUG
|
|
if(j > saElt_idx) {
|
|
TIndexOffU lcp_len = getLCP(s,
|
|
coordHelper,
|
|
positions[0],
|
|
positions.back(),
|
|
seed_len_);
|
|
assert_geq(lcp_len, seed_len_);
|
|
}
|
|
|
|
TIndexOffU saElt = repeat_list_[j];
|
|
TIndexOffU start = coordHelper.getStart(saElt);
|
|
TIndexOffU start2 = coordHelper.getStart(saElt + seed_len_ - 1);
|
|
assert_eq(start, start2);
|
|
#endif
|
|
}
|
|
|
|
TIndexOffU saElt = repeat_list_[saElt_idx];
|
|
size_t true_count = saElt_idx_end - saElt_idx;
|
|
getString(s, saElt, seed_len_, query);
|
|
|
|
total++;
|
|
|
|
size_t count = 0;
|
|
for(size_t r = 0; r < repeatBases.size(); r++) {
|
|
const RB_RepeatBase& repeat = repeatBases[r];
|
|
int pos = repeat.seq.find(query);
|
|
if(pos != string::npos) {
|
|
for(size_t j = 0; j < repeat.nodes.size(); j++) {
|
|
TIndexOffU _node = repeat.nodes[j];
|
|
TIndexOffU _saElt_idx = repeat_index_[_node];
|
|
TIndexOffU _saElt_idx_end = (_node + 1 < repeat_index_.size() ? repeat_index_[_node+1] : repeat_list_.size());
|
|
TIndexOffU _saElt = repeat_list_[_saElt_idx];
|
|
string seq = getString(s, _saElt, seed_len());
|
|
if(query == seq) {
|
|
count += (_saElt_idx_end - _saElt_idx);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
size_t rc_count = 0;
|
|
string rc_query = reverse_complement(query);
|
|
for(size_t r = 0; r < repeatBases.size(); r++) {
|
|
const RB_RepeatBase& repeat = repeatBases[r];
|
|
int pos = repeat.seq.find(rc_query);
|
|
if(pos != string::npos) {
|
|
for(size_t j = 0; j < repeat.nodes.size(); j++) {
|
|
TIndexOffU _node = repeat.nodes[j];
|
|
TIndexOffU _saElt_idx = repeat_index_[_node];
|
|
TIndexOffU _saElt_idx_end = (_node + 1 < repeat_index_.size() ? repeat_index_[_node+1] : repeat_list_.size());
|
|
TIndexOffU _saElt = repeat_list_[_saElt_idx];
|
|
string seq = getString(s, _saElt, seed_len());
|
|
if(rc_query == seq) {
|
|
rc_count += (_saElt_idx_end - _saElt_idx);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if(count == true_count || rc_count == true_count) {
|
|
match++;
|
|
} else if(total - match <= 10) {
|
|
cerr << "query: " << query << endl;
|
|
cerr << "true count: " << true_count << endl;
|
|
cerr << "found count: " << count << endl;
|
|
cerr << "rc found count: " << rc_count << endl;
|
|
cerr << endl;
|
|
}
|
|
}
|
|
|
|
cerr << "RB_SubSA: sanity check: " << match << " passed (out of " << total << ")" << endl << endl;
|
|
}
|
|
#endif
|
|
}
|
|
|
|
|
|
#define write_fp(x) fp.write((const char *)&(x), sizeof((x)))
|
|
|
|
void RB_SubSA::writeFile(ofstream& fp)
|
|
{
|
|
write_fp(sa_size_);
|
|
write_fp(seed_len_);
|
|
write_fp(seed_count_);
|
|
|
|
size_t sz = temp_suffixes_.size();
|
|
write_fp(sz);
|
|
for(size_t i = 0; i < sz; i++) {
|
|
write_fp(temp_suffixes_[i]);
|
|
}
|
|
|
|
sz = repeat_index_.size();
|
|
write_fp(sz);
|
|
for(size_t i = 0; i < sz; i++) {
|
|
write_fp(repeat_index_[i]);
|
|
}
|
|
|
|
sz = done_.size();
|
|
write_fp(sz);
|
|
for(size_t i = 0; i < sz; i++) {
|
|
write_fp(done_[i]);
|
|
}
|
|
}
|
|
|
|
#define read_fp(x) fp.read((char *)&(x), sizeof((x)))
|
|
|
|
void RB_SubSA::readFile(ifstream& fp)
|
|
{
|
|
TIndexOffU val;
|
|
|
|
read_fp(val);
|
|
rt_assert_eq(val, sa_size_);
|
|
|
|
read_fp(val);
|
|
rt_assert_eq(val, seed_len_);
|
|
|
|
read_fp(val);
|
|
rt_assert_eq(val, seed_count_);
|
|
|
|
size_t sz;
|
|
read_fp(sz);
|
|
temp_suffixes_.resizeExact(sz);
|
|
for(size_t i = 0; i < sz; i++) {
|
|
read_fp(temp_suffixes_[i]);
|
|
}
|
|
|
|
size_t val_sz;
|
|
|
|
// repeat_index_
|
|
read_fp(val_sz);
|
|
repeat_index_.resizeExact(val_sz);
|
|
for(size_t i = 0; i < val_sz; i++) {
|
|
read_fp(repeat_index_[i]);
|
|
}
|
|
|
|
// done
|
|
read_fp(val_sz);
|
|
done_.resizeExact(val_sz);
|
|
for(size_t i = 0; i < val_sz; i++) {
|
|
read_fp(done_[i]);
|
|
}
|
|
|
|
}
|
|
|
|
|
|
/****************************/
|
|
template class RepeatBuilder<SString<char> >;
|
|
template void dump_tstr(const SString<char>& );
|
|
template bool compareRepeatCoordByJoinedOff(const RepeatCoord<TIndexOffU>& , const RepeatCoord<TIndexOffU>&);
|