965 lines
27 KiB
C
965 lines
27 KiB
C
|
/*
|
||
|
* Copyright 2018, Chanhee Park <parkchanhee@gmail.com> and Daehwan Kim <infphilo@gmail.com>
|
||
|
*
|
||
|
* This file is part of HISAT 2.
|
||
|
*
|
||
|
* HISAT 2 is free software: you can redistribute it and/or modify
|
||
|
* it under the terms of the GNU General Public License as published by
|
||
|
* the Free Software Foundation, either version 3 of the License, or
|
||
|
* (at your option) any later version.
|
||
|
*
|
||
|
* HISAT 2 is distributed in the hope that it will be useful,
|
||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
|
* GNU General Public License for more details.
|
||
|
*
|
||
|
* You should have received a copy of the GNU General Public License
|
||
|
* along with HISAT 2. If not, see <http://www.gnu.org/licenses/>.
|
||
|
*/
|
||
|
|
||
|
#ifndef __REPEAT_BUILDER_H__
|
||
|
#define __REPEAT_BUILDER_H__
|
||
|
|
||
|
#include <iostream>
|
||
|
#include <fstream>
|
||
|
#include <limits>
|
||
|
#include <map>
|
||
|
#include "assert_helpers.h"
|
||
|
#include "word_io.h"
|
||
|
#include "mem_ids.h"
|
||
|
#include "ref_coord.h"
|
||
|
#include "ref_read.h"
|
||
|
#include "edit.h"
|
||
|
#include "ds.h"
|
||
|
#include "repeat.h"
|
||
|
#include "blockwise_sa.h"
|
||
|
#include "simple_func.h"
|
||
|
#include "scoring.h"
|
||
|
#include "aligner_sw.h"
|
||
|
#include "bit_packed_array.h"
|
||
|
|
||
|
//#define DEBUGLOG
|
||
|
|
||
|
using namespace std;
|
||
|
|
||
|
/**
|
||
|
* Encapsulates repeat parameters.
|
||
|
*/
|
||
|
class RepeatParameter {
|
||
|
public:
|
||
|
TIndexOffU seed_len; // seed length
|
||
|
TIndexOffU seed_count; // seed count
|
||
|
TIndexOffU seed_mm; // maximum edit distance allowed during initial seed extension
|
||
|
TIndexOffU repeat_count; // repeat count
|
||
|
TIndexOffU min_repeat_len; // minimum repeat length
|
||
|
TIndexOffU max_repeat_len; // maximum repeat length
|
||
|
TIndexOffU max_edit; // maximum edit distance allowed
|
||
|
bool symmetric_extend; // extend symmetrically
|
||
|
TIndexOffU extend_unit_len; // extend seeds no longer than this length at a time
|
||
|
bool append_result;
|
||
|
};
|
||
|
|
||
|
typedef pair<TIndexOffU, TIndexOffU> Range;
|
||
|
|
||
|
struct Fragments {
|
||
|
bool contain(TIndexOffU pos) {
|
||
|
if (pos >= joinedOff && pos < (joinedOff + length)) {
|
||
|
return true;
|
||
|
}
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
TIndexOffU joinedOff; // index within joined text
|
||
|
TIndexOffU length;
|
||
|
|
||
|
int frag_id;
|
||
|
int seq_id;
|
||
|
TIndexOffU seqOff; // index within sequence
|
||
|
bool first;
|
||
|
};
|
||
|
|
||
|
class CoordHelper {
|
||
|
public:
|
||
|
CoordHelper(TIndexOffU length,
|
||
|
TIndexOffU forward_length,
|
||
|
const EList<RefRecord>& szs,
|
||
|
const EList<string>& ref_names);
|
||
|
~CoordHelper();
|
||
|
int mapJoinedOffToSeq(TIndexOffU joined_pos);
|
||
|
int getGenomeCoord(TIndexOffU joined_pos,
|
||
|
string& chr_name,
|
||
|
TIndexOffU& pos_in_chr);
|
||
|
|
||
|
TIndexOffU getEnd(TIndexOffU e);
|
||
|
TIndexOffU getStart(TIndexOffU e);
|
||
|
TIndexOffU forward_length() const { return forward_length_; }
|
||
|
TIndexOffU length() const { return length_; }
|
||
|
|
||
|
private:
|
||
|
void buildNames();
|
||
|
void buildJoinedFragment();
|
||
|
|
||
|
private:
|
||
|
TIndexOffU length_;
|
||
|
TIndexOffU forward_length_;
|
||
|
EList<RefRecord> szs_;
|
||
|
EList<string> ref_names_;
|
||
|
EList<string> ref_namelines_;
|
||
|
|
||
|
// mapping info from joined string to genome
|
||
|
EList<Fragments> fraglist_;
|
||
|
|
||
|
// Fragments Cache
|
||
|
#define CACHE_SIZE_JOINEDFRG 10
|
||
|
Fragments cached_[CACHE_SIZE_JOINEDFRG];
|
||
|
int num_cached_ = 0;
|
||
|
int victim_ = 0; /* round-robin */
|
||
|
};
|
||
|
|
||
|
struct SeedHP {
|
||
|
bool operator==(const SeedHP &rhs) const {
|
||
|
return range == rhs.range &&
|
||
|
snpIDs == rhs.snpIDs;
|
||
|
}
|
||
|
|
||
|
bool operator!=(const SeedHP &rhs) const {
|
||
|
return !(rhs == *this);
|
||
|
}
|
||
|
|
||
|
bool operator<(const SeedHP &rhs) const {
|
||
|
return range < rhs.range;
|
||
|
}
|
||
|
|
||
|
bool operator>(const SeedHP &rhs) const {
|
||
|
return rhs < *this;
|
||
|
}
|
||
|
|
||
|
bool operator<=(const SeedHP &rhs) const {
|
||
|
return !(rhs < *this);
|
||
|
}
|
||
|
|
||
|
bool operator>=(const SeedHP &rhs) const {
|
||
|
return !(*this < rhs);
|
||
|
}
|
||
|
|
||
|
Range range;
|
||
|
EList<string> snpIDs;
|
||
|
};
|
||
|
|
||
|
struct SeedSNP {
|
||
|
int type; // EDIT_TYPE_MM, EDIT_TYPE_READ_GAP, EDIT_TYPE_REF_GAP
|
||
|
TIndexOffU pos;
|
||
|
size_t len;
|
||
|
string base; // valid when ty = MM, REF_GAP
|
||
|
TIndexOffU id;
|
||
|
|
||
|
SeedSNP() {}
|
||
|
|
||
|
void init(int ty, TIndexOffU po, size_t ln, string bs) {
|
||
|
type = ty;
|
||
|
pos = po;
|
||
|
len = ln;
|
||
|
base = bs;
|
||
|
}
|
||
|
void init(int ty, TIndexOffU po, size_t ln, char bs) {
|
||
|
type = ty;
|
||
|
pos = po;
|
||
|
len = ln;
|
||
|
base.assign(1, bs);
|
||
|
}
|
||
|
|
||
|
friend ostream &operator<<(ostream &os, const SeedSNP &snp) {
|
||
|
|
||
|
if(snp.type == EDIT_TYPE_MM) {
|
||
|
os << "single";
|
||
|
} else if(snp.type == EDIT_TYPE_REF_GAP) {
|
||
|
os << "insertion";
|
||
|
} else if(snp.type == EDIT_TYPE_READ_GAP) {
|
||
|
os << "deletion";
|
||
|
}
|
||
|
os << " ";
|
||
|
os << snp.pos << " ";
|
||
|
|
||
|
if(snp.type == EDIT_TYPE_MM || snp.type == EDIT_TYPE_REF_GAP) {
|
||
|
os << snp.base;
|
||
|
} else if(snp.type == EDIT_TYPE_READ_GAP) {
|
||
|
os << snp.len;
|
||
|
}
|
||
|
return os;
|
||
|
}
|
||
|
|
||
|
bool operator==(const SeedSNP &rhs) const {
|
||
|
return type == rhs.type &&
|
||
|
pos == rhs.pos &&
|
||
|
len == rhs.len &&
|
||
|
base == rhs.base;
|
||
|
}
|
||
|
|
||
|
bool operator!=(const SeedSNP &rhs) const {
|
||
|
return !(rhs == *this);
|
||
|
}
|
||
|
|
||
|
static bool cmpSeedSNPByPos( SeedSNP * const& a, SeedSNP * const& b) {
|
||
|
return a->pos < b->pos;
|
||
|
}
|
||
|
|
||
|
};
|
||
|
|
||
|
class SeedExt {
|
||
|
public:
|
||
|
SeedExt() {
|
||
|
reset();
|
||
|
};
|
||
|
|
||
|
void reset() {
|
||
|
done = false;
|
||
|
curr_ext_len = 0;
|
||
|
ed = total_ed = 0;
|
||
|
orig_pos.first = 0;
|
||
|
orig_pos.second = 0;
|
||
|
pos.first = 0;
|
||
|
pos.second = 0;
|
||
|
bound.first = 0;
|
||
|
bound.second = 0;
|
||
|
consensus_pos.first = 0;
|
||
|
consensus_pos.second = 0;
|
||
|
left_gaps.clear();
|
||
|
right_gaps.clear();
|
||
|
aligned = true;
|
||
|
};
|
||
|
|
||
|
TIndexOffU getLeftExtLength() const {
|
||
|
assert_leq(pos.first, orig_pos.first);
|
||
|
TIndexOffU len = orig_pos.first - pos.first;
|
||
|
return len;
|
||
|
}
|
||
|
|
||
|
TIndexOffU getRightExtLength() const {
|
||
|
assert_geq(pos.second, orig_pos.second);
|
||
|
return pos.second - orig_pos.second;
|
||
|
}
|
||
|
|
||
|
TIndexOffU getLength() const {
|
||
|
TIndexOffU len = orig_pos.second - orig_pos.first;
|
||
|
len += getLeftExtLength();
|
||
|
len += getRightExtLength();
|
||
|
return len;
|
||
|
}
|
||
|
|
||
|
template<typename TStr>
|
||
|
void getLeftExtString(const TStr& s, string& seq)
|
||
|
{
|
||
|
seq.clear();
|
||
|
seq = getString(s, pos.first, orig_pos.first - pos.first);
|
||
|
}
|
||
|
|
||
|
template<typename TStr>
|
||
|
void getRightExtString(const TStr& s, string& seq)
|
||
|
{
|
||
|
seq.clear();
|
||
|
seq = getString(s, orig_pos.second, pos.second - orig_pos.second);
|
||
|
}
|
||
|
|
||
|
template<typename TStr>
|
||
|
void getExtendedSeedSequence(const TStr& s,
|
||
|
string& seq) const;
|
||
|
|
||
|
template<typename TStr>
|
||
|
void generateSNPs(const TStr& s, const string& consensus, EList<SeedSNP *>& repeat_snps);
|
||
|
|
||
|
static bool isSameConsensus(const SeedExt& a, const SeedExt& b) {
|
||
|
return (a.consensus_pos == b.consensus_pos);
|
||
|
//&& (a.getLength() == b.getLength());
|
||
|
}
|
||
|
|
||
|
static bool isSameSNPs(const SeedExt& a, const SeedExt& b) {
|
||
|
const EList<SeedSNP *>& a_edit = a.snps;
|
||
|
const EList<SeedSNP *>& b_edit = b.snps;
|
||
|
|
||
|
if(a_edit.size() != b_edit.size()) {
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
for(size_t i = 0; i < a_edit.size(); i++) {
|
||
|
if(!(*a_edit[i] == *b_edit[i])) {
|
||
|
return false;
|
||
|
}
|
||
|
}
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
static bool isSameAllele(const SeedExt& a, const SeedExt& b) {
|
||
|
const EList<Edit>& a_edit = a.edits;
|
||
|
const EList<Edit>& b_edit = b.edits;
|
||
|
|
||
|
if(a_edit.size() != b_edit.size()) {
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
for(size_t i = 0; i < a_edit.size(); i++) {
|
||
|
if(!(a_edit[i] == b_edit[i])) {
|
||
|
return false;
|
||
|
}
|
||
|
}
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
static bool sort_by_edits(const SeedExt& a, const SeedExt& b) {
|
||
|
const EList<Edit>& a_edit = a.edits;
|
||
|
const EList<Edit>& b_edit = b.edits;
|
||
|
|
||
|
size_t i = 0;
|
||
|
|
||
|
while((i < a_edit.size())
|
||
|
&& (i < b_edit.size())) {
|
||
|
|
||
|
if(!(a_edit[i] == b_edit[i])) {
|
||
|
if(a_edit[i] < b_edit[i]) {
|
||
|
return true;
|
||
|
} else {
|
||
|
return false;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
i++;
|
||
|
}
|
||
|
|
||
|
if(a_edit.size() < b_edit.size()) {
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
if((a_edit.size() == b_edit.size())
|
||
|
&& (a.pos.first < b.pos.first)) {
|
||
|
return true;
|
||
|
}
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
void generateEdits(const string& consensus_merged, const string& seed_ext)
|
||
|
{
|
||
|
size_t ed_done = 0;
|
||
|
size_t ext_len = getLength();
|
||
|
assert_eq(ext_len, seed_ext.length());
|
||
|
|
||
|
edits.clear();
|
||
|
|
||
|
for(size_t i = 0; i < ext_len && ed_done < total_ed; i++) {
|
||
|
char con_base = consensus_merged[consensus_pos.first + i];
|
||
|
char seed_base = seed_ext[i];
|
||
|
|
||
|
if (con_base != seed_base) {
|
||
|
edits.expand();
|
||
|
edits.back().init(consensus_pos.first + i,
|
||
|
con_base, seed_base, EDIT_TYPE_MM);
|
||
|
ed_done++;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
Range getExtendedRange(size_t consensus_len) const
|
||
|
{
|
||
|
assert_leq(consensus_pos.second, consensus_len);
|
||
|
Range range;
|
||
|
range.first = (pos.first < consensus_pos.first ? 0 : pos.first - consensus_pos.first);
|
||
|
range.second = pos.second + (consensus_len - consensus_pos.second);
|
||
|
return range;
|
||
|
}
|
||
|
|
||
|
#ifndef NDEBUG
|
||
|
bool valid() const {
|
||
|
assert_leq(consensus_pos.first, consensus_pos.second);
|
||
|
TIndexOffU constr_len = consensus_pos.second - consensus_pos.first;
|
||
|
TIndexOffU allele_len = getLength();
|
||
|
|
||
|
TIndexOffU cur_off = 0;
|
||
|
for(size_t i = 0; i < left_gaps.size(); i++) {
|
||
|
assert_geq(left_gaps[i].first, cur_off);
|
||
|
cur_off = left_gaps[i].first;
|
||
|
int gap_len = left_gaps[i].second;
|
||
|
assert_neq(gap_len, 0);
|
||
|
if(gap_len > 0) { // deletion
|
||
|
allele_len += gap_len;
|
||
|
} else {
|
||
|
allele_len += gap_len;
|
||
|
cur_off += (-gap_len);
|
||
|
}
|
||
|
}
|
||
|
cur_off = 0;
|
||
|
for(size_t i = 0; i < right_gaps.size(); i++) {
|
||
|
assert_geq(right_gaps[i].first, cur_off);
|
||
|
cur_off = right_gaps[i].first;
|
||
|
int gap_len = right_gaps[i].second;
|
||
|
assert_neq(gap_len, 0);
|
||
|
if(gap_len > 0) { // deletion
|
||
|
allele_len += gap_len;
|
||
|
} else {
|
||
|
allele_len += gap_len;
|
||
|
cur_off += (-gap_len);
|
||
|
}
|
||
|
}
|
||
|
assert_eq(constr_len, allele_len);
|
||
|
return true;
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
public:
|
||
|
// seed extended position [first, second)
|
||
|
pair<TIndexOffU, TIndexOffU> orig_pos;
|
||
|
pair<TIndexOffU, TIndexOffU> pos;
|
||
|
|
||
|
// extension bound. the seed must be placed on same fragment
|
||
|
// [first, second)
|
||
|
pair<TIndexOffU, TIndexOffU> bound;
|
||
|
|
||
|
// positions relative to consensus sequence
|
||
|
pair<TIndexOffU, TIndexOffU> consensus_pos;
|
||
|
|
||
|
// a list of gaps (deletions and insertions) in both directions
|
||
|
// offsets from seed's left and right ("pos" above)
|
||
|
// positive and negative values indicate deletions and insertions, resp.
|
||
|
EList<pair<TIndexOffU, int> > left_gaps;
|
||
|
EList<pair<TIndexOffU, int> > right_gaps;
|
||
|
|
||
|
uint32_t ed; // edit distance
|
||
|
uint32_t total_ed; // total edit distance
|
||
|
bool done; // done flag
|
||
|
uint32_t curr_ext_len; //
|
||
|
|
||
|
bool aligned;
|
||
|
|
||
|
EList<Edit> edits; // edits w.r.t. consensus_merged
|
||
|
|
||
|
EList<SeedSNP *> snps;
|
||
|
};
|
||
|
|
||
|
class RB_AlleleCoord {
|
||
|
public:
|
||
|
RB_AlleleCoord() :
|
||
|
left(0),
|
||
|
right(0),
|
||
|
idx(0)
|
||
|
{}
|
||
|
|
||
|
RB_AlleleCoord(TIndexOffU l, TIndexOffU r, size_t i) :
|
||
|
left(l),
|
||
|
right(r),
|
||
|
idx(i)
|
||
|
{}
|
||
|
|
||
|
TIndexOffU len() const { return right - left; }
|
||
|
|
||
|
bool operator<(const RB_AlleleCoord& o) const
|
||
|
{
|
||
|
if(left != o.left)
|
||
|
return left < o.left;
|
||
|
if(right != o.right)
|
||
|
return right > o.right;
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
bool contain(const RB_AlleleCoord& o, size_t relax = 0) const
|
||
|
{
|
||
|
if(o.left + relax >= left && o.right <= right + relax)
|
||
|
return true;
|
||
|
else
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
bool contained(const RB_AlleleCoord& o) const
|
||
|
{
|
||
|
return o.contain(*this);
|
||
|
}
|
||
|
|
||
|
TIndexOffU overlap_len(const RB_AlleleCoord& o) const
|
||
|
{
|
||
|
if(contain(o)) return o.len();
|
||
|
else if(o.contain(*this)) return len();
|
||
|
else if(left < o.right && right > o.left) {
|
||
|
if(left <= o.left) {
|
||
|
return right - o.left;
|
||
|
} else {
|
||
|
return o.right - left;
|
||
|
}
|
||
|
}
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
public:
|
||
|
TIndexOffU left;
|
||
|
TIndexOffU right;
|
||
|
size_t idx;
|
||
|
};
|
||
|
|
||
|
class RB_RepeatManager;
|
||
|
class RB_SWAligner;
|
||
|
class RB_SubSA;
|
||
|
|
||
|
class RB_RepeatBase {
|
||
|
public:
|
||
|
string seq;
|
||
|
EList<TIndexOffU> nodes;
|
||
|
};
|
||
|
|
||
|
class RB_Repeat {
|
||
|
public:
|
||
|
RB_Repeat() {}
|
||
|
~RB_Repeat() {
|
||
|
if(snps_.size() > 0) {
|
||
|
for(size_t i = 0; i < snps_.size(); i++) {
|
||
|
delete snps_[i];
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void repeat_id(size_t repeat_id) { repeat_id_ = repeat_id; }
|
||
|
size_t repeat_id() const { return repeat_id_; }
|
||
|
|
||
|
void parent_id(size_t parent_id) { parent_id_ = parent_id; }
|
||
|
size_t parent_id() const { return parent_id_; }
|
||
|
|
||
|
string& consensus() { return consensus_; }
|
||
|
const string& consensus() const { return consensus_; }
|
||
|
|
||
|
EList<SeedExt>& seeds() { return seeds_; }
|
||
|
const EList<SeedExt>& seeds() const { return seeds_; }
|
||
|
|
||
|
EList<RB_AlleleCoord>& seed_ranges() { return seed_ranges_; }
|
||
|
const EList<RB_AlleleCoord>& seed_ranges() const { return seed_ranges_; }
|
||
|
|
||
|
EList<SeedSNP *>& snps() { return snps_; }
|
||
|
const EList<SeedSNP *>& snps() const { return snps_; }
|
||
|
|
||
|
template<typename TStr>
|
||
|
void init(const RepeatParameter& rp,
|
||
|
const TStr& s,
|
||
|
CoordHelper& coordHelper,
|
||
|
const RB_SubSA& subSA,
|
||
|
const RB_RepeatBase& repeatBase);
|
||
|
|
||
|
template<typename TStr>
|
||
|
void extendConsensus(const RepeatParameter& rp,
|
||
|
const TStr& s);
|
||
|
|
||
|
template<typename TStr>
|
||
|
void getNextRepeat(const RepeatParameter& rp,
|
||
|
const TStr& s,
|
||
|
RB_Repeat& o);
|
||
|
|
||
|
|
||
|
template<typename TStr>
|
||
|
void saveSeedExtension(const RepeatParameter& rp,
|
||
|
const TStr& s,
|
||
|
CoordHelper& coordHelper,
|
||
|
TIndexOffU seed_grp_id,
|
||
|
ostream& fp,
|
||
|
size_t& total_repeat_seq_len,
|
||
|
size_t& total_allele_seq_len) const;
|
||
|
|
||
|
void saveSNPs(ofstream& fp,
|
||
|
TIndexOffU grp_id,
|
||
|
TIndexOffU& snp_id_base);
|
||
|
void saveConsensus(ofstream& fp,
|
||
|
TIndexOffU grp_id);
|
||
|
|
||
|
bool overlap(const RB_Repeat& o,
|
||
|
bool& contain,
|
||
|
bool& left,
|
||
|
size_t& seed_i,
|
||
|
size_t& seed_j,
|
||
|
bool debug = false) const;
|
||
|
|
||
|
bool satisfy(const RepeatParameter& rp) const
|
||
|
{
|
||
|
if(consensus_.length() < rp.min_repeat_len)
|
||
|
return false;
|
||
|
if(seeds_.size() < rp.repeat_count)
|
||
|
return false;
|
||
|
if(seeds_[rp.repeat_count - 1].getLength() < rp.min_repeat_len)
|
||
|
return false;
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
void reset() {
|
||
|
consensus_.clear();
|
||
|
for(size_t i = 0; i < seeds_.size(); i++)
|
||
|
seeds_[i].reset();
|
||
|
seeds_.clear();
|
||
|
seed_ranges_.clear();
|
||
|
}
|
||
|
|
||
|
void showInfo(const RepeatParameter& rp,
|
||
|
CoordHelper& coordHelper) const;
|
||
|
|
||
|
template<typename TStr>
|
||
|
void generateSNPs(const RepeatParameter&, const TStr& s, TIndexOffU grp_id);
|
||
|
|
||
|
bool self_repeat() const { return self_repeat_; }
|
||
|
|
||
|
void update() { internal_update(); }
|
||
|
void addSeed(const SeedExt& seed)
|
||
|
{
|
||
|
seeds_.expand();
|
||
|
seeds_.back() = seed;
|
||
|
}
|
||
|
|
||
|
bool contain(TIndexOffU left, TIndexOffU right) const;
|
||
|
|
||
|
protected:
|
||
|
template<typename TStr>
|
||
|
void get_consensus_seq(const TStr& s,
|
||
|
EList<SeedExt>& seeds,
|
||
|
size_t sb,
|
||
|
size_t se,
|
||
|
size_t min_left_ext,
|
||
|
size_t min_right_ext,
|
||
|
size_t max_ed,
|
||
|
const RepeatParameter& rp,
|
||
|
EList<size_t>& ed_seed_nums,
|
||
|
EList<string>* left_consensuses,
|
||
|
EList<string>* right_consensuses) const;
|
||
|
|
||
|
void internal_update();
|
||
|
|
||
|
|
||
|
protected:
|
||
|
size_t repeat_id_;
|
||
|
size_t parent_id_;
|
||
|
string consensus_;
|
||
|
EList<SeedExt> seeds_;
|
||
|
EList<RB_AlleleCoord> seed_ranges_;
|
||
|
EList<SeedSNP*> snps_;
|
||
|
bool self_repeat_;
|
||
|
|
||
|
static EList<size_t> ca_ed_;
|
||
|
static EList<size_t> ca_ed2_;
|
||
|
static string ca_s_;
|
||
|
static string ca_s2_;
|
||
|
|
||
|
public:
|
||
|
static size_t seed_merge_tried;
|
||
|
static size_t seed_merged;
|
||
|
};
|
||
|
|
||
|
class RB_RepeatExt : public RB_Repeat {
|
||
|
public:
|
||
|
RB_RepeatExt() {}
|
||
|
~RB_RepeatExt() {}
|
||
|
|
||
|
template<typename TStr>
|
||
|
void extendConsensus(const RepeatParameter& rp,
|
||
|
const TStr& s);
|
||
|
|
||
|
float mergeable(const RB_Repeat& o) const;
|
||
|
|
||
|
template<typename TStr>
|
||
|
bool merge(const RepeatParameter& rp,
|
||
|
const TStr& s,
|
||
|
RB_SWAligner& swalginer,
|
||
|
const RB_RepeatExt& o,
|
||
|
bool contain,
|
||
|
size_t seed_i,
|
||
|
size_t seed_j,
|
||
|
bool debug = false);
|
||
|
|
||
|
protected:
|
||
|
template<typename TStr>
|
||
|
void get_consensus_seq(const TStr& s,
|
||
|
EList<SeedExt>& seeds,
|
||
|
size_t sb,
|
||
|
size_t se,
|
||
|
size_t min_left_ext,
|
||
|
size_t min_right_ext,
|
||
|
size_t max_ed,
|
||
|
const RepeatParameter& rp,
|
||
|
EList<size_t>& ed_seed_nums,
|
||
|
EList<string>* left_consensuses,
|
||
|
EList<string>* right_consensuses) const;
|
||
|
|
||
|
template<typename TStr>
|
||
|
bool align(const RepeatParameter& rp,
|
||
|
const TStr& ref,
|
||
|
const string& s,
|
||
|
const EList<pair<size_t, size_t> >& s_kmer_table,
|
||
|
const string& s2,
|
||
|
EList<int>& offsets,
|
||
|
size_t k,
|
||
|
SeedExt& seed,
|
||
|
int consensus_approx_left,
|
||
|
int consensus_approx_right,
|
||
|
size_t left,
|
||
|
size_t right,
|
||
|
bool debug);
|
||
|
|
||
|
bool isSelfRepeat(const RepeatParameter& rp,
|
||
|
const string& s,
|
||
|
const EList<pair<size_t, size_t> >& s_kmer_table,
|
||
|
EList<int>& offsets,
|
||
|
size_t k,
|
||
|
bool debug);
|
||
|
|
||
|
};
|
||
|
|
||
|
// check if a set of seeds are already processed
|
||
|
class RB_RepeatManager {
|
||
|
public:
|
||
|
size_t numCoords() const { return range_to_repeats_.size(); }
|
||
|
|
||
|
bool checkRedundant(const RepeatParameter& rp,
|
||
|
const map<size_t, RB_Repeat*>& repeat_map,
|
||
|
const EList<TIndexOffU>& positions,
|
||
|
EList<size_t>& to_remove) const;
|
||
|
|
||
|
void addRepeat(const RB_Repeat* repeat);
|
||
|
void addRepeat(Range range, size_t repeat_id);
|
||
|
void removeRepeat(const RB_Repeat* repeat);
|
||
|
void removeRepeat(Range range, size_t repeat_id);
|
||
|
|
||
|
public:
|
||
|
void showInfo(const RepeatParameter& rp,
|
||
|
CoordHelper& coordHelper,
|
||
|
const map<size_t, RB_Repeat*>& repeat_map,
|
||
|
size_t num_case = 5) const;
|
||
|
|
||
|
private:
|
||
|
map<Range, EList<size_t> > range_to_repeats_;
|
||
|
};
|
||
|
|
||
|
class RB_SWAligner {
|
||
|
public:
|
||
|
RB_SWAligner();
|
||
|
~RB_SWAligner();
|
||
|
|
||
|
void init_dyn(const RepeatParameter& rp);
|
||
|
|
||
|
int alignStrings(const string &ref,
|
||
|
const string &read,
|
||
|
EList<Edit>& edits,
|
||
|
Coord& coord);
|
||
|
|
||
|
void makePadString(const string& ref,
|
||
|
const string& read,
|
||
|
string& pad,
|
||
|
size_t len);
|
||
|
|
||
|
void doTest(const RepeatParameter& rp,
|
||
|
const string& refstr,
|
||
|
const string& readstr);
|
||
|
|
||
|
void doTestCase1(const string& refstr,
|
||
|
const string& readstr,
|
||
|
TIndexOffU rpt_edit);
|
||
|
|
||
|
private:
|
||
|
//
|
||
|
SimpleFunc scoreMin_;
|
||
|
SimpleFunc nCeil_;
|
||
|
SimpleFunc penCanIntronLen_;
|
||
|
SimpleFunc penNoncanIntronLen_;
|
||
|
|
||
|
Scoring *sc_;
|
||
|
SwAligner swa;
|
||
|
LinkedEList<EList<Edit> > rawEdits_;
|
||
|
RandomSource rnd_;
|
||
|
};
|
||
|
|
||
|
// SA Subset
|
||
|
class RB_SubSA {
|
||
|
public:
|
||
|
RB_SubSA() {}
|
||
|
~RB_SubSA() {}
|
||
|
|
||
|
void writeFile(ofstream& fp);
|
||
|
void readFile(ifstream &fp);
|
||
|
|
||
|
void init(TIndexOffU sa_size,
|
||
|
TIndexOffU seed_len,
|
||
|
TIndexOffU seed_count);
|
||
|
|
||
|
template<typename TStr>
|
||
|
void push_back(const TStr& s,
|
||
|
CoordHelper& coordHelper,
|
||
|
TIndexOffU saElt,
|
||
|
bool lastInput = false);
|
||
|
|
||
|
inline TIndexOffU seed_len() const { return seed_len_; }
|
||
|
inline TIndexOffU seed_count() const { return seed_count_; }
|
||
|
|
||
|
inline size_t size() const { return repeat_list_.size(); }
|
||
|
TIndexOffU get(size_t idx) const { return repeat_list_[idx]; }
|
||
|
inline TIndexOffU operator[](size_t i) const { return get(i); }
|
||
|
|
||
|
const EList<TIndexOffU>& getRepeatIndex() const { return repeat_index_; }
|
||
|
|
||
|
template<typename TStr>
|
||
|
Range find(const TStr& s,
|
||
|
const string& seq) const;
|
||
|
|
||
|
template<typename TStr>
|
||
|
TIndexOffU find_repeat_idx(const TStr& s,
|
||
|
const string& seq) const;
|
||
|
|
||
|
void setDone(TIndexOffU off, TIndexOffU len = 1);
|
||
|
bool isDone(TIndexOffU off, TIndexOffU len = 1) const;
|
||
|
|
||
|
void dump() const
|
||
|
{
|
||
|
cerr << "seed length: " << seed_len_ << endl;
|
||
|
cerr << "minimum seed count: " << seed_count_ << endl;
|
||
|
cerr << "number of seed groups: " << repeat_index_.size() << endl;
|
||
|
cerr << "number of seeds: " << repeat_list_.size() << endl;
|
||
|
}
|
||
|
|
||
|
size_t getMemUsage() const
|
||
|
{
|
||
|
size_t tot = 0;
|
||
|
|
||
|
tot += repeat_list_.totalCapacityBytes();
|
||
|
tot += repeat_index_.totalCapacityBytes();
|
||
|
tot += done_.totalCapacityBytes();
|
||
|
tot += temp_suffixes_.totalCapacityBytes();
|
||
|
|
||
|
return tot;
|
||
|
}
|
||
|
|
||
|
template<typename TStr>
|
||
|
void buildRepeatBase(const TStr& s,
|
||
|
CoordHelper& coordHelper,
|
||
|
const size_t max_len,
|
||
|
EList<RB_RepeatBase>& repeatBases);
|
||
|
|
||
|
private:
|
||
|
TIndexOffU sa_size_;
|
||
|
TIndexOffU seed_len_;
|
||
|
TIndexOffU seed_count_;
|
||
|
EList<TIndexOffU> temp_suffixes_;
|
||
|
|
||
|
// repeat index
|
||
|
EList<TIndexOffU> repeat_list_;
|
||
|
EList<TIndexOffU> repeat_index_;
|
||
|
EList<uint32_t> done_;
|
||
|
};
|
||
|
|
||
|
|
||
|
// find and write repeats
|
||
|
template<typename TStr>
|
||
|
class RepeatBuilder {
|
||
|
|
||
|
public:
|
||
|
RepeatBuilder(TStr& s,
|
||
|
TStr& sOriginal,
|
||
|
const EList<RefRecord>& szs,
|
||
|
const EList<string>& ref_names,
|
||
|
bool forward_only,
|
||
|
const string& filename);
|
||
|
~RepeatBuilder();
|
||
|
|
||
|
|
||
|
public:
|
||
|
void readSA(const RepeatParameter& rp,
|
||
|
BlockwiseSA<TStr>& sa);
|
||
|
|
||
|
void readSA(const RepeatParameter& rp,
|
||
|
const string& filename);
|
||
|
|
||
|
void readSA(const RepeatParameter& rp,
|
||
|
const BitPackedArray& sa);
|
||
|
|
||
|
void writeSA(const RepeatParameter& rp,
|
||
|
const string& filename);
|
||
|
|
||
|
void build(const RepeatParameter& rp);
|
||
|
|
||
|
void sortRepeatGroup();
|
||
|
|
||
|
void saveRepeats(const RepeatParameter& rp);
|
||
|
void saveConsensus(const RepeatParameter& rp,
|
||
|
const string& repName);
|
||
|
void saveFile(const RepeatParameter& rp);
|
||
|
|
||
|
void addRepeatGroup(const RepeatParameter& rp,
|
||
|
size_t& repeat_id,
|
||
|
RB_RepeatManager& repeat_manger,
|
||
|
const RB_RepeatBase& repeatBase,
|
||
|
ostream& fp);
|
||
|
|
||
|
void reassignSeeds(const RepeatParameter& rp,
|
||
|
size_t repeat_bid,
|
||
|
size_t repeat_eid,
|
||
|
EList<SeedExt>& seeds);
|
||
|
|
||
|
bool checkSequenceMergeable(const string& ref,
|
||
|
const string& read,
|
||
|
EList<Edit>& edits,
|
||
|
Coord& coord,
|
||
|
TIndexOffU rpt_len,
|
||
|
TIndexOffU max_edit = 10);
|
||
|
|
||
|
void doTest(const RepeatParameter& rp,
|
||
|
const string& refstr,
|
||
|
const string& readstr)
|
||
|
{
|
||
|
swaligner_.doTest(rp, refstr, readstr);
|
||
|
}
|
||
|
|
||
|
private:
|
||
|
void saveAlleles(const RepeatParameter& rp,
|
||
|
const string& repName,
|
||
|
RB_Repeat& repeat,
|
||
|
ofstream&,
|
||
|
ofstream&,
|
||
|
TIndexOffU grp_id,
|
||
|
TIndexOffU&,
|
||
|
TIndexOffU baseoff);
|
||
|
|
||
|
void writeAllele(TIndexOffU grp_id,
|
||
|
TIndexOffU allele_id,
|
||
|
Range range,
|
||
|
const string& seq_name,
|
||
|
TIndexOffU baseoff,
|
||
|
const EList<SeedExt>& seeds,
|
||
|
ostream &fp);
|
||
|
|
||
|
|
||
|
void writeSNPs(ostream& fp,
|
||
|
const string& rep_chr_name,
|
||
|
const ESet<Edit>& editset);
|
||
|
|
||
|
void writeHaploType(const EList<SeedHP>& haplo_list,
|
||
|
const EList<SeedExt>& seeds,
|
||
|
TIndexOffU& hapl_id_base,
|
||
|
const string& seq_name,
|
||
|
ostream& fp);
|
||
|
|
||
|
void generateHaploType(Range range,
|
||
|
const EList<SeedExt>& seeds,
|
||
|
EList<SeedHP>& haplo_list);
|
||
|
|
||
|
private:
|
||
|
const int output_width = 60;
|
||
|
|
||
|
TStr& s_;
|
||
|
TStr& sOriginal_;
|
||
|
bool forward_only_;
|
||
|
string filename_;
|
||
|
|
||
|
RB_SubSA subSA_;
|
||
|
|
||
|
TIndexOffU forward_length_;
|
||
|
|
||
|
CoordHelper coordHelper_;
|
||
|
|
||
|
//
|
||
|
RB_SWAligner swaligner_;
|
||
|
|
||
|
// Seeds
|
||
|
EList<string> consensus_all_;
|
||
|
ELList<SeedExt> seeds_;
|
||
|
map<size_t, RB_Repeat*> repeat_map_;
|
||
|
};
|
||
|
|
||
|
|
||
|
int strcmpPos(const string&, const string&, TIndexOffU&);
|
||
|
template<typename TStr> void dump_tstr(TStr& s);
|
||
|
|
||
|
#endif /* __REPEAT_BUILDER_H__ */
|