hisat-3n/repeat_builder.h

965 lines
27 KiB
C
Raw Permalink Normal View History

2025-01-18 13:09:52 +00:00
/*
* Copyright 2018, Chanhee Park <parkchanhee@gmail.com> and Daehwan Kim <infphilo@gmail.com>
*
* This file is part of HISAT 2.
*
* HISAT 2 is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* HISAT 2 is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with HISAT 2. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __REPEAT_BUILDER_H__
#define __REPEAT_BUILDER_H__
#include <iostream>
#include <fstream>
#include <limits>
#include <map>
#include "assert_helpers.h"
#include "word_io.h"
#include "mem_ids.h"
#include "ref_coord.h"
#include "ref_read.h"
#include "edit.h"
#include "ds.h"
#include "repeat.h"
#include "blockwise_sa.h"
#include "simple_func.h"
#include "scoring.h"
#include "aligner_sw.h"
#include "bit_packed_array.h"
//#define DEBUGLOG
using namespace std;
/**
* Encapsulates repeat parameters.
*/
class RepeatParameter {
public:
TIndexOffU seed_len; // seed length
TIndexOffU seed_count; // seed count
TIndexOffU seed_mm; // maximum edit distance allowed during initial seed extension
TIndexOffU repeat_count; // repeat count
TIndexOffU min_repeat_len; // minimum repeat length
TIndexOffU max_repeat_len; // maximum repeat length
TIndexOffU max_edit; // maximum edit distance allowed
bool symmetric_extend; // extend symmetrically
TIndexOffU extend_unit_len; // extend seeds no longer than this length at a time
bool append_result;
};
typedef pair<TIndexOffU, TIndexOffU> Range;
struct Fragments {
bool contain(TIndexOffU pos) {
if (pos >= joinedOff && pos < (joinedOff + length)) {
return true;
}
return false;
}
TIndexOffU joinedOff; // index within joined text
TIndexOffU length;
int frag_id;
int seq_id;
TIndexOffU seqOff; // index within sequence
bool first;
};
class CoordHelper {
public:
CoordHelper(TIndexOffU length,
TIndexOffU forward_length,
const EList<RefRecord>& szs,
const EList<string>& ref_names);
~CoordHelper();
int mapJoinedOffToSeq(TIndexOffU joined_pos);
int getGenomeCoord(TIndexOffU joined_pos,
string& chr_name,
TIndexOffU& pos_in_chr);
TIndexOffU getEnd(TIndexOffU e);
TIndexOffU getStart(TIndexOffU e);
TIndexOffU forward_length() const { return forward_length_; }
TIndexOffU length() const { return length_; }
private:
void buildNames();
void buildJoinedFragment();
private:
TIndexOffU length_;
TIndexOffU forward_length_;
EList<RefRecord> szs_;
EList<string> ref_names_;
EList<string> ref_namelines_;
// mapping info from joined string to genome
EList<Fragments> fraglist_;
// Fragments Cache
#define CACHE_SIZE_JOINEDFRG 10
Fragments cached_[CACHE_SIZE_JOINEDFRG];
int num_cached_ = 0;
int victim_ = 0; /* round-robin */
};
struct SeedHP {
bool operator==(const SeedHP &rhs) const {
return range == rhs.range &&
snpIDs == rhs.snpIDs;
}
bool operator!=(const SeedHP &rhs) const {
return !(rhs == *this);
}
bool operator<(const SeedHP &rhs) const {
return range < rhs.range;
}
bool operator>(const SeedHP &rhs) const {
return rhs < *this;
}
bool operator<=(const SeedHP &rhs) const {
return !(rhs < *this);
}
bool operator>=(const SeedHP &rhs) const {
return !(*this < rhs);
}
Range range;
EList<string> snpIDs;
};
struct SeedSNP {
int type; // EDIT_TYPE_MM, EDIT_TYPE_READ_GAP, EDIT_TYPE_REF_GAP
TIndexOffU pos;
size_t len;
string base; // valid when ty = MM, REF_GAP
TIndexOffU id;
SeedSNP() {}
void init(int ty, TIndexOffU po, size_t ln, string bs) {
type = ty;
pos = po;
len = ln;
base = bs;
}
void init(int ty, TIndexOffU po, size_t ln, char bs) {
type = ty;
pos = po;
len = ln;
base.assign(1, bs);
}
friend ostream &operator<<(ostream &os, const SeedSNP &snp) {
if(snp.type == EDIT_TYPE_MM) {
os << "single";
} else if(snp.type == EDIT_TYPE_REF_GAP) {
os << "insertion";
} else if(snp.type == EDIT_TYPE_READ_GAP) {
os << "deletion";
}
os << " ";
os << snp.pos << " ";
if(snp.type == EDIT_TYPE_MM || snp.type == EDIT_TYPE_REF_GAP) {
os << snp.base;
} else if(snp.type == EDIT_TYPE_READ_GAP) {
os << snp.len;
}
return os;
}
bool operator==(const SeedSNP &rhs) const {
return type == rhs.type &&
pos == rhs.pos &&
len == rhs.len &&
base == rhs.base;
}
bool operator!=(const SeedSNP &rhs) const {
return !(rhs == *this);
}
static bool cmpSeedSNPByPos( SeedSNP * const& a, SeedSNP * const& b) {
return a->pos < b->pos;
}
};
class SeedExt {
public:
SeedExt() {
reset();
};
void reset() {
done = false;
curr_ext_len = 0;
ed = total_ed = 0;
orig_pos.first = 0;
orig_pos.second = 0;
pos.first = 0;
pos.second = 0;
bound.first = 0;
bound.second = 0;
consensus_pos.first = 0;
consensus_pos.second = 0;
left_gaps.clear();
right_gaps.clear();
aligned = true;
};
TIndexOffU getLeftExtLength() const {
assert_leq(pos.first, orig_pos.first);
TIndexOffU len = orig_pos.first - pos.first;
return len;
}
TIndexOffU getRightExtLength() const {
assert_geq(pos.second, orig_pos.second);
return pos.second - orig_pos.second;
}
TIndexOffU getLength() const {
TIndexOffU len = orig_pos.second - orig_pos.first;
len += getLeftExtLength();
len += getRightExtLength();
return len;
}
template<typename TStr>
void getLeftExtString(const TStr& s, string& seq)
{
seq.clear();
seq = getString(s, pos.first, orig_pos.first - pos.first);
}
template<typename TStr>
void getRightExtString(const TStr& s, string& seq)
{
seq.clear();
seq = getString(s, orig_pos.second, pos.second - orig_pos.second);
}
template<typename TStr>
void getExtendedSeedSequence(const TStr& s,
string& seq) const;
template<typename TStr>
void generateSNPs(const TStr& s, const string& consensus, EList<SeedSNP *>& repeat_snps);
static bool isSameConsensus(const SeedExt& a, const SeedExt& b) {
return (a.consensus_pos == b.consensus_pos);
//&& (a.getLength() == b.getLength());
}
static bool isSameSNPs(const SeedExt& a, const SeedExt& b) {
const EList<SeedSNP *>& a_edit = a.snps;
const EList<SeedSNP *>& b_edit = b.snps;
if(a_edit.size() != b_edit.size()) {
return false;
}
for(size_t i = 0; i < a_edit.size(); i++) {
if(!(*a_edit[i] == *b_edit[i])) {
return false;
}
}
return true;
}
static bool isSameAllele(const SeedExt& a, const SeedExt& b) {
const EList<Edit>& a_edit = a.edits;
const EList<Edit>& b_edit = b.edits;
if(a_edit.size() != b_edit.size()) {
return false;
}
for(size_t i = 0; i < a_edit.size(); i++) {
if(!(a_edit[i] == b_edit[i])) {
return false;
}
}
return true;
}
static bool sort_by_edits(const SeedExt& a, const SeedExt& b) {
const EList<Edit>& a_edit = a.edits;
const EList<Edit>& b_edit = b.edits;
size_t i = 0;
while((i < a_edit.size())
&& (i < b_edit.size())) {
if(!(a_edit[i] == b_edit[i])) {
if(a_edit[i] < b_edit[i]) {
return true;
} else {
return false;
}
}
i++;
}
if(a_edit.size() < b_edit.size()) {
return true;
}
if((a_edit.size() == b_edit.size())
&& (a.pos.first < b.pos.first)) {
return true;
}
return false;
}
void generateEdits(const string& consensus_merged, const string& seed_ext)
{
size_t ed_done = 0;
size_t ext_len = getLength();
assert_eq(ext_len, seed_ext.length());
edits.clear();
for(size_t i = 0; i < ext_len && ed_done < total_ed; i++) {
char con_base = consensus_merged[consensus_pos.first + i];
char seed_base = seed_ext[i];
if (con_base != seed_base) {
edits.expand();
edits.back().init(consensus_pos.first + i,
con_base, seed_base, EDIT_TYPE_MM);
ed_done++;
}
}
}
Range getExtendedRange(size_t consensus_len) const
{
assert_leq(consensus_pos.second, consensus_len);
Range range;
range.first = (pos.first < consensus_pos.first ? 0 : pos.first - consensus_pos.first);
range.second = pos.second + (consensus_len - consensus_pos.second);
return range;
}
#ifndef NDEBUG
bool valid() const {
assert_leq(consensus_pos.first, consensus_pos.second);
TIndexOffU constr_len = consensus_pos.second - consensus_pos.first;
TIndexOffU allele_len = getLength();
TIndexOffU cur_off = 0;
for(size_t i = 0; i < left_gaps.size(); i++) {
assert_geq(left_gaps[i].first, cur_off);
cur_off = left_gaps[i].first;
int gap_len = left_gaps[i].second;
assert_neq(gap_len, 0);
if(gap_len > 0) { // deletion
allele_len += gap_len;
} else {
allele_len += gap_len;
cur_off += (-gap_len);
}
}
cur_off = 0;
for(size_t i = 0; i < right_gaps.size(); i++) {
assert_geq(right_gaps[i].first, cur_off);
cur_off = right_gaps[i].first;
int gap_len = right_gaps[i].second;
assert_neq(gap_len, 0);
if(gap_len > 0) { // deletion
allele_len += gap_len;
} else {
allele_len += gap_len;
cur_off += (-gap_len);
}
}
assert_eq(constr_len, allele_len);
return true;
}
#endif
public:
// seed extended position [first, second)
pair<TIndexOffU, TIndexOffU> orig_pos;
pair<TIndexOffU, TIndexOffU> pos;
// extension bound. the seed must be placed on same fragment
// [first, second)
pair<TIndexOffU, TIndexOffU> bound;
// positions relative to consensus sequence
pair<TIndexOffU, TIndexOffU> consensus_pos;
// a list of gaps (deletions and insertions) in both directions
// offsets from seed's left and right ("pos" above)
// positive and negative values indicate deletions and insertions, resp.
EList<pair<TIndexOffU, int> > left_gaps;
EList<pair<TIndexOffU, int> > right_gaps;
uint32_t ed; // edit distance
uint32_t total_ed; // total edit distance
bool done; // done flag
uint32_t curr_ext_len; //
bool aligned;
EList<Edit> edits; // edits w.r.t. consensus_merged
EList<SeedSNP *> snps;
};
class RB_AlleleCoord {
public:
RB_AlleleCoord() :
left(0),
right(0),
idx(0)
{}
RB_AlleleCoord(TIndexOffU l, TIndexOffU r, size_t i) :
left(l),
right(r),
idx(i)
{}
TIndexOffU len() const { return right - left; }
bool operator<(const RB_AlleleCoord& o) const
{
if(left != o.left)
return left < o.left;
if(right != o.right)
return right > o.right;
return false;
}
bool contain(const RB_AlleleCoord& o, size_t relax = 0) const
{
if(o.left + relax >= left && o.right <= right + relax)
return true;
else
return false;
}
bool contained(const RB_AlleleCoord& o) const
{
return o.contain(*this);
}
TIndexOffU overlap_len(const RB_AlleleCoord& o) const
{
if(contain(o)) return o.len();
else if(o.contain(*this)) return len();
else if(left < o.right && right > o.left) {
if(left <= o.left) {
return right - o.left;
} else {
return o.right - left;
}
}
return 0;
}
public:
TIndexOffU left;
TIndexOffU right;
size_t idx;
};
class RB_RepeatManager;
class RB_SWAligner;
class RB_SubSA;
class RB_RepeatBase {
public:
string seq;
EList<TIndexOffU> nodes;
};
class RB_Repeat {
public:
RB_Repeat() {}
~RB_Repeat() {
if(snps_.size() > 0) {
for(size_t i = 0; i < snps_.size(); i++) {
delete snps_[i];
}
}
}
void repeat_id(size_t repeat_id) { repeat_id_ = repeat_id; }
size_t repeat_id() const { return repeat_id_; }
void parent_id(size_t parent_id) { parent_id_ = parent_id; }
size_t parent_id() const { return parent_id_; }
string& consensus() { return consensus_; }
const string& consensus() const { return consensus_; }
EList<SeedExt>& seeds() { return seeds_; }
const EList<SeedExt>& seeds() const { return seeds_; }
EList<RB_AlleleCoord>& seed_ranges() { return seed_ranges_; }
const EList<RB_AlleleCoord>& seed_ranges() const { return seed_ranges_; }
EList<SeedSNP *>& snps() { return snps_; }
const EList<SeedSNP *>& snps() const { return snps_; }
template<typename TStr>
void init(const RepeatParameter& rp,
const TStr& s,
CoordHelper& coordHelper,
const RB_SubSA& subSA,
const RB_RepeatBase& repeatBase);
template<typename TStr>
void extendConsensus(const RepeatParameter& rp,
const TStr& s);
template<typename TStr>
void getNextRepeat(const RepeatParameter& rp,
const TStr& s,
RB_Repeat& o);
template<typename TStr>
void saveSeedExtension(const RepeatParameter& rp,
const TStr& s,
CoordHelper& coordHelper,
TIndexOffU seed_grp_id,
ostream& fp,
size_t& total_repeat_seq_len,
size_t& total_allele_seq_len) const;
void saveSNPs(ofstream& fp,
TIndexOffU grp_id,
TIndexOffU& snp_id_base);
void saveConsensus(ofstream& fp,
TIndexOffU grp_id);
bool overlap(const RB_Repeat& o,
bool& contain,
bool& left,
size_t& seed_i,
size_t& seed_j,
bool debug = false) const;
bool satisfy(const RepeatParameter& rp) const
{
if(consensus_.length() < rp.min_repeat_len)
return false;
if(seeds_.size() < rp.repeat_count)
return false;
if(seeds_[rp.repeat_count - 1].getLength() < rp.min_repeat_len)
return false;
return true;
}
void reset() {
consensus_.clear();
for(size_t i = 0; i < seeds_.size(); i++)
seeds_[i].reset();
seeds_.clear();
seed_ranges_.clear();
}
void showInfo(const RepeatParameter& rp,
CoordHelper& coordHelper) const;
template<typename TStr>
void generateSNPs(const RepeatParameter&, const TStr& s, TIndexOffU grp_id);
bool self_repeat() const { return self_repeat_; }
void update() { internal_update(); }
void addSeed(const SeedExt& seed)
{
seeds_.expand();
seeds_.back() = seed;
}
bool contain(TIndexOffU left, TIndexOffU right) const;
protected:
template<typename TStr>
void get_consensus_seq(const TStr& s,
EList<SeedExt>& seeds,
size_t sb,
size_t se,
size_t min_left_ext,
size_t min_right_ext,
size_t max_ed,
const RepeatParameter& rp,
EList<size_t>& ed_seed_nums,
EList<string>* left_consensuses,
EList<string>* right_consensuses) const;
void internal_update();
protected:
size_t repeat_id_;
size_t parent_id_;
string consensus_;
EList<SeedExt> seeds_;
EList<RB_AlleleCoord> seed_ranges_;
EList<SeedSNP*> snps_;
bool self_repeat_;
static EList<size_t> ca_ed_;
static EList<size_t> ca_ed2_;
static string ca_s_;
static string ca_s2_;
public:
static size_t seed_merge_tried;
static size_t seed_merged;
};
class RB_RepeatExt : public RB_Repeat {
public:
RB_RepeatExt() {}
~RB_RepeatExt() {}
template<typename TStr>
void extendConsensus(const RepeatParameter& rp,
const TStr& s);
float mergeable(const RB_Repeat& o) const;
template<typename TStr>
bool merge(const RepeatParameter& rp,
const TStr& s,
RB_SWAligner& swalginer,
const RB_RepeatExt& o,
bool contain,
size_t seed_i,
size_t seed_j,
bool debug = false);
protected:
template<typename TStr>
void get_consensus_seq(const TStr& s,
EList<SeedExt>& seeds,
size_t sb,
size_t se,
size_t min_left_ext,
size_t min_right_ext,
size_t max_ed,
const RepeatParameter& rp,
EList<size_t>& ed_seed_nums,
EList<string>* left_consensuses,
EList<string>* right_consensuses) const;
template<typename TStr>
bool align(const RepeatParameter& rp,
const TStr& ref,
const string& s,
const EList<pair<size_t, size_t> >& s_kmer_table,
const string& s2,
EList<int>& offsets,
size_t k,
SeedExt& seed,
int consensus_approx_left,
int consensus_approx_right,
size_t left,
size_t right,
bool debug);
bool isSelfRepeat(const RepeatParameter& rp,
const string& s,
const EList<pair<size_t, size_t> >& s_kmer_table,
EList<int>& offsets,
size_t k,
bool debug);
};
// check if a set of seeds are already processed
class RB_RepeatManager {
public:
size_t numCoords() const { return range_to_repeats_.size(); }
bool checkRedundant(const RepeatParameter& rp,
const map<size_t, RB_Repeat*>& repeat_map,
const EList<TIndexOffU>& positions,
EList<size_t>& to_remove) const;
void addRepeat(const RB_Repeat* repeat);
void addRepeat(Range range, size_t repeat_id);
void removeRepeat(const RB_Repeat* repeat);
void removeRepeat(Range range, size_t repeat_id);
public:
void showInfo(const RepeatParameter& rp,
CoordHelper& coordHelper,
const map<size_t, RB_Repeat*>& repeat_map,
size_t num_case = 5) const;
private:
map<Range, EList<size_t> > range_to_repeats_;
};
class RB_SWAligner {
public:
RB_SWAligner();
~RB_SWAligner();
void init_dyn(const RepeatParameter& rp);
int alignStrings(const string &ref,
const string &read,
EList<Edit>& edits,
Coord& coord);
void makePadString(const string& ref,
const string& read,
string& pad,
size_t len);
void doTest(const RepeatParameter& rp,
const string& refstr,
const string& readstr);
void doTestCase1(const string& refstr,
const string& readstr,
TIndexOffU rpt_edit);
private:
//
SimpleFunc scoreMin_;
SimpleFunc nCeil_;
SimpleFunc penCanIntronLen_;
SimpleFunc penNoncanIntronLen_;
Scoring *sc_;
SwAligner swa;
LinkedEList<EList<Edit> > rawEdits_;
RandomSource rnd_;
};
// SA Subset
class RB_SubSA {
public:
RB_SubSA() {}
~RB_SubSA() {}
void writeFile(ofstream& fp);
void readFile(ifstream &fp);
void init(TIndexOffU sa_size,
TIndexOffU seed_len,
TIndexOffU seed_count);
template<typename TStr>
void push_back(const TStr& s,
CoordHelper& coordHelper,
TIndexOffU saElt,
bool lastInput = false);
inline TIndexOffU seed_len() const { return seed_len_; }
inline TIndexOffU seed_count() const { return seed_count_; }
inline size_t size() const { return repeat_list_.size(); }
TIndexOffU get(size_t idx) const { return repeat_list_[idx]; }
inline TIndexOffU operator[](size_t i) const { return get(i); }
const EList<TIndexOffU>& getRepeatIndex() const { return repeat_index_; }
template<typename TStr>
Range find(const TStr& s,
const string& seq) const;
template<typename TStr>
TIndexOffU find_repeat_idx(const TStr& s,
const string& seq) const;
void setDone(TIndexOffU off, TIndexOffU len = 1);
bool isDone(TIndexOffU off, TIndexOffU len = 1) const;
void dump() const
{
cerr << "seed length: " << seed_len_ << endl;
cerr << "minimum seed count: " << seed_count_ << endl;
cerr << "number of seed groups: " << repeat_index_.size() << endl;
cerr << "number of seeds: " << repeat_list_.size() << endl;
}
size_t getMemUsage() const
{
size_t tot = 0;
tot += repeat_list_.totalCapacityBytes();
tot += repeat_index_.totalCapacityBytes();
tot += done_.totalCapacityBytes();
tot += temp_suffixes_.totalCapacityBytes();
return tot;
}
template<typename TStr>
void buildRepeatBase(const TStr& s,
CoordHelper& coordHelper,
const size_t max_len,
EList<RB_RepeatBase>& repeatBases);
private:
TIndexOffU sa_size_;
TIndexOffU seed_len_;
TIndexOffU seed_count_;
EList<TIndexOffU> temp_suffixes_;
// repeat index
EList<TIndexOffU> repeat_list_;
EList<TIndexOffU> repeat_index_;
EList<uint32_t> done_;
};
// find and write repeats
template<typename TStr>
class RepeatBuilder {
public:
RepeatBuilder(TStr& s,
TStr& sOriginal,
const EList<RefRecord>& szs,
const EList<string>& ref_names,
bool forward_only,
const string& filename);
~RepeatBuilder();
public:
void readSA(const RepeatParameter& rp,
BlockwiseSA<TStr>& sa);
void readSA(const RepeatParameter& rp,
const string& filename);
void readSA(const RepeatParameter& rp,
const BitPackedArray& sa);
void writeSA(const RepeatParameter& rp,
const string& filename);
void build(const RepeatParameter& rp);
void sortRepeatGroup();
void saveRepeats(const RepeatParameter& rp);
void saveConsensus(const RepeatParameter& rp,
const string& repName);
void saveFile(const RepeatParameter& rp);
void addRepeatGroup(const RepeatParameter& rp,
size_t& repeat_id,
RB_RepeatManager& repeat_manger,
const RB_RepeatBase& repeatBase,
ostream& fp);
void reassignSeeds(const RepeatParameter& rp,
size_t repeat_bid,
size_t repeat_eid,
EList<SeedExt>& seeds);
bool checkSequenceMergeable(const string& ref,
const string& read,
EList<Edit>& edits,
Coord& coord,
TIndexOffU rpt_len,
TIndexOffU max_edit = 10);
void doTest(const RepeatParameter& rp,
const string& refstr,
const string& readstr)
{
swaligner_.doTest(rp, refstr, readstr);
}
private:
void saveAlleles(const RepeatParameter& rp,
const string& repName,
RB_Repeat& repeat,
ofstream&,
ofstream&,
TIndexOffU grp_id,
TIndexOffU&,
TIndexOffU baseoff);
void writeAllele(TIndexOffU grp_id,
TIndexOffU allele_id,
Range range,
const string& seq_name,
TIndexOffU baseoff,
const EList<SeedExt>& seeds,
ostream &fp);
void writeSNPs(ostream& fp,
const string& rep_chr_name,
const ESet<Edit>& editset);
void writeHaploType(const EList<SeedHP>& haplo_list,
const EList<SeedExt>& seeds,
TIndexOffU& hapl_id_base,
const string& seq_name,
ostream& fp);
void generateHaploType(Range range,
const EList<SeedExt>& seeds,
EList<SeedHP>& haplo_list);
private:
const int output_width = 60;
TStr& s_;
TStr& sOriginal_;
bool forward_only_;
string filename_;
RB_SubSA subSA_;
TIndexOffU forward_length_;
CoordHelper coordHelper_;
//
RB_SWAligner swaligner_;
// Seeds
EList<string> consensus_all_;
ELList<SeedExt> seeds_;
map<size_t, RB_Repeat*> repeat_map_;
};
int strcmpPos(const string&, const string&, TIndexOffU&);
template<typename TStr> void dump_tstr(TStr& s);
#endif /* __REPEAT_BUILDER_H__ */