/* * Copyright 2018, Chanhee Park and Daehwan Kim * * This file is part of HISAT 2. * * HISAT 2 is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * HISAT 2 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with HISAT 2. If not, see . */ #ifndef __REPEAT_BUILDER_H__ #define __REPEAT_BUILDER_H__ #include #include #include #include #include "assert_helpers.h" #include "word_io.h" #include "mem_ids.h" #include "ref_coord.h" #include "ref_read.h" #include "edit.h" #include "ds.h" #include "repeat.h" #include "blockwise_sa.h" #include "simple_func.h" #include "scoring.h" #include "aligner_sw.h" #include "bit_packed_array.h" //#define DEBUGLOG using namespace std; /** * Encapsulates repeat parameters. */ class RepeatParameter { public: TIndexOffU seed_len; // seed length TIndexOffU seed_count; // seed count TIndexOffU seed_mm; // maximum edit distance allowed during initial seed extension TIndexOffU repeat_count; // repeat count TIndexOffU min_repeat_len; // minimum repeat length TIndexOffU max_repeat_len; // maximum repeat length TIndexOffU max_edit; // maximum edit distance allowed bool symmetric_extend; // extend symmetrically TIndexOffU extend_unit_len; // extend seeds no longer than this length at a time bool append_result; }; typedef pair Range; struct Fragments { bool contain(TIndexOffU pos) { if (pos >= joinedOff && pos < (joinedOff + length)) { return true; } return false; } TIndexOffU joinedOff; // index within joined text TIndexOffU length; int frag_id; int seq_id; TIndexOffU seqOff; // index within sequence bool first; }; class CoordHelper { public: CoordHelper(TIndexOffU length, TIndexOffU forward_length, const EList& szs, const EList& ref_names); ~CoordHelper(); int mapJoinedOffToSeq(TIndexOffU joined_pos); int getGenomeCoord(TIndexOffU joined_pos, string& chr_name, TIndexOffU& pos_in_chr); TIndexOffU getEnd(TIndexOffU e); TIndexOffU getStart(TIndexOffU e); TIndexOffU forward_length() const { return forward_length_; } TIndexOffU length() const { return length_; } private: void buildNames(); void buildJoinedFragment(); private: TIndexOffU length_; TIndexOffU forward_length_; EList szs_; EList ref_names_; EList ref_namelines_; // mapping info from joined string to genome EList fraglist_; // Fragments Cache #define CACHE_SIZE_JOINEDFRG 10 Fragments cached_[CACHE_SIZE_JOINEDFRG]; int num_cached_ = 0; int victim_ = 0; /* round-robin */ }; struct SeedHP { bool operator==(const SeedHP &rhs) const { return range == rhs.range && snpIDs == rhs.snpIDs; } bool operator!=(const SeedHP &rhs) const { return !(rhs == *this); } bool operator<(const SeedHP &rhs) const { return range < rhs.range; } bool operator>(const SeedHP &rhs) const { return rhs < *this; } bool operator<=(const SeedHP &rhs) const { return !(rhs < *this); } bool operator>=(const SeedHP &rhs) const { return !(*this < rhs); } Range range; EList snpIDs; }; struct SeedSNP { int type; // EDIT_TYPE_MM, EDIT_TYPE_READ_GAP, EDIT_TYPE_REF_GAP TIndexOffU pos; size_t len; string base; // valid when ty = MM, REF_GAP TIndexOffU id; SeedSNP() {} void init(int ty, TIndexOffU po, size_t ln, string bs) { type = ty; pos = po; len = ln; base = bs; } void init(int ty, TIndexOffU po, size_t ln, char bs) { type = ty; pos = po; len = ln; base.assign(1, bs); } friend ostream &operator<<(ostream &os, const SeedSNP &snp) { if(snp.type == EDIT_TYPE_MM) { os << "single"; } else if(snp.type == EDIT_TYPE_REF_GAP) { os << "insertion"; } else if(snp.type == EDIT_TYPE_READ_GAP) { os << "deletion"; } os << " "; os << snp.pos << " "; if(snp.type == EDIT_TYPE_MM || snp.type == EDIT_TYPE_REF_GAP) { os << snp.base; } else if(snp.type == EDIT_TYPE_READ_GAP) { os << snp.len; } return os; } bool operator==(const SeedSNP &rhs) const { return type == rhs.type && pos == rhs.pos && len == rhs.len && base == rhs.base; } bool operator!=(const SeedSNP &rhs) const { return !(rhs == *this); } static bool cmpSeedSNPByPos( SeedSNP * const& a, SeedSNP * const& b) { return a->pos < b->pos; } }; class SeedExt { public: SeedExt() { reset(); }; void reset() { done = false; curr_ext_len = 0; ed = total_ed = 0; orig_pos.first = 0; orig_pos.second = 0; pos.first = 0; pos.second = 0; bound.first = 0; bound.second = 0; consensus_pos.first = 0; consensus_pos.second = 0; left_gaps.clear(); right_gaps.clear(); aligned = true; }; TIndexOffU getLeftExtLength() const { assert_leq(pos.first, orig_pos.first); TIndexOffU len = orig_pos.first - pos.first; return len; } TIndexOffU getRightExtLength() const { assert_geq(pos.second, orig_pos.second); return pos.second - orig_pos.second; } TIndexOffU getLength() const { TIndexOffU len = orig_pos.second - orig_pos.first; len += getLeftExtLength(); len += getRightExtLength(); return len; } template void getLeftExtString(const TStr& s, string& seq) { seq.clear(); seq = getString(s, pos.first, orig_pos.first - pos.first); } template void getRightExtString(const TStr& s, string& seq) { seq.clear(); seq = getString(s, orig_pos.second, pos.second - orig_pos.second); } template void getExtendedSeedSequence(const TStr& s, string& seq) const; template void generateSNPs(const TStr& s, const string& consensus, EList& repeat_snps); static bool isSameConsensus(const SeedExt& a, const SeedExt& b) { return (a.consensus_pos == b.consensus_pos); //&& (a.getLength() == b.getLength()); } static bool isSameSNPs(const SeedExt& a, const SeedExt& b) { const EList& a_edit = a.snps; const EList& b_edit = b.snps; if(a_edit.size() != b_edit.size()) { return false; } for(size_t i = 0; i < a_edit.size(); i++) { if(!(*a_edit[i] == *b_edit[i])) { return false; } } return true; } static bool isSameAllele(const SeedExt& a, const SeedExt& b) { const EList& a_edit = a.edits; const EList& b_edit = b.edits; if(a_edit.size() != b_edit.size()) { return false; } for(size_t i = 0; i < a_edit.size(); i++) { if(!(a_edit[i] == b_edit[i])) { return false; } } return true; } static bool sort_by_edits(const SeedExt& a, const SeedExt& b) { const EList& a_edit = a.edits; const EList& b_edit = b.edits; size_t i = 0; while((i < a_edit.size()) && (i < b_edit.size())) { if(!(a_edit[i] == b_edit[i])) { if(a_edit[i] < b_edit[i]) { return true; } else { return false; } } i++; } if(a_edit.size() < b_edit.size()) { return true; } if((a_edit.size() == b_edit.size()) && (a.pos.first < b.pos.first)) { return true; } return false; } void generateEdits(const string& consensus_merged, const string& seed_ext) { size_t ed_done = 0; size_t ext_len = getLength(); assert_eq(ext_len, seed_ext.length()); edits.clear(); for(size_t i = 0; i < ext_len && ed_done < total_ed; i++) { char con_base = consensus_merged[consensus_pos.first + i]; char seed_base = seed_ext[i]; if (con_base != seed_base) { edits.expand(); edits.back().init(consensus_pos.first + i, con_base, seed_base, EDIT_TYPE_MM); ed_done++; } } } Range getExtendedRange(size_t consensus_len) const { assert_leq(consensus_pos.second, consensus_len); Range range; range.first = (pos.first < consensus_pos.first ? 0 : pos.first - consensus_pos.first); range.second = pos.second + (consensus_len - consensus_pos.second); return range; } #ifndef NDEBUG bool valid() const { assert_leq(consensus_pos.first, consensus_pos.second); TIndexOffU constr_len = consensus_pos.second - consensus_pos.first; TIndexOffU allele_len = getLength(); TIndexOffU cur_off = 0; for(size_t i = 0; i < left_gaps.size(); i++) { assert_geq(left_gaps[i].first, cur_off); cur_off = left_gaps[i].first; int gap_len = left_gaps[i].second; assert_neq(gap_len, 0); if(gap_len > 0) { // deletion allele_len += gap_len; } else { allele_len += gap_len; cur_off += (-gap_len); } } cur_off = 0; for(size_t i = 0; i < right_gaps.size(); i++) { assert_geq(right_gaps[i].first, cur_off); cur_off = right_gaps[i].first; int gap_len = right_gaps[i].second; assert_neq(gap_len, 0); if(gap_len > 0) { // deletion allele_len += gap_len; } else { allele_len += gap_len; cur_off += (-gap_len); } } assert_eq(constr_len, allele_len); return true; } #endif public: // seed extended position [first, second) pair orig_pos; pair pos; // extension bound. the seed must be placed on same fragment // [first, second) pair bound; // positions relative to consensus sequence pair consensus_pos; // a list of gaps (deletions and insertions) in both directions // offsets from seed's left and right ("pos" above) // positive and negative values indicate deletions and insertions, resp. EList > left_gaps; EList > right_gaps; uint32_t ed; // edit distance uint32_t total_ed; // total edit distance bool done; // done flag uint32_t curr_ext_len; // bool aligned; EList edits; // edits w.r.t. consensus_merged EList snps; }; class RB_AlleleCoord { public: RB_AlleleCoord() : left(0), right(0), idx(0) {} RB_AlleleCoord(TIndexOffU l, TIndexOffU r, size_t i) : left(l), right(r), idx(i) {} TIndexOffU len() const { return right - left; } bool operator<(const RB_AlleleCoord& o) const { if(left != o.left) return left < o.left; if(right != o.right) return right > o.right; return false; } bool contain(const RB_AlleleCoord& o, size_t relax = 0) const { if(o.left + relax >= left && o.right <= right + relax) return true; else return false; } bool contained(const RB_AlleleCoord& o) const { return o.contain(*this); } TIndexOffU overlap_len(const RB_AlleleCoord& o) const { if(contain(o)) return o.len(); else if(o.contain(*this)) return len(); else if(left < o.right && right > o.left) { if(left <= o.left) { return right - o.left; } else { return o.right - left; } } return 0; } public: TIndexOffU left; TIndexOffU right; size_t idx; }; class RB_RepeatManager; class RB_SWAligner; class RB_SubSA; class RB_RepeatBase { public: string seq; EList nodes; }; class RB_Repeat { public: RB_Repeat() {} ~RB_Repeat() { if(snps_.size() > 0) { for(size_t i = 0; i < snps_.size(); i++) { delete snps_[i]; } } } void repeat_id(size_t repeat_id) { repeat_id_ = repeat_id; } size_t repeat_id() const { return repeat_id_; } void parent_id(size_t parent_id) { parent_id_ = parent_id; } size_t parent_id() const { return parent_id_; } string& consensus() { return consensus_; } const string& consensus() const { return consensus_; } EList& seeds() { return seeds_; } const EList& seeds() const { return seeds_; } EList& seed_ranges() { return seed_ranges_; } const EList& seed_ranges() const { return seed_ranges_; } EList& snps() { return snps_; } const EList& snps() const { return snps_; } template void init(const RepeatParameter& rp, const TStr& s, CoordHelper& coordHelper, const RB_SubSA& subSA, const RB_RepeatBase& repeatBase); template void extendConsensus(const RepeatParameter& rp, const TStr& s); template void getNextRepeat(const RepeatParameter& rp, const TStr& s, RB_Repeat& o); template void saveSeedExtension(const RepeatParameter& rp, const TStr& s, CoordHelper& coordHelper, TIndexOffU seed_grp_id, ostream& fp, size_t& total_repeat_seq_len, size_t& total_allele_seq_len) const; void saveSNPs(ofstream& fp, TIndexOffU grp_id, TIndexOffU& snp_id_base); void saveConsensus(ofstream& fp, TIndexOffU grp_id); bool overlap(const RB_Repeat& o, bool& contain, bool& left, size_t& seed_i, size_t& seed_j, bool debug = false) const; bool satisfy(const RepeatParameter& rp) const { if(consensus_.length() < rp.min_repeat_len) return false; if(seeds_.size() < rp.repeat_count) return false; if(seeds_[rp.repeat_count - 1].getLength() < rp.min_repeat_len) return false; return true; } void reset() { consensus_.clear(); for(size_t i = 0; i < seeds_.size(); i++) seeds_[i].reset(); seeds_.clear(); seed_ranges_.clear(); } void showInfo(const RepeatParameter& rp, CoordHelper& coordHelper) const; template void generateSNPs(const RepeatParameter&, const TStr& s, TIndexOffU grp_id); bool self_repeat() const { return self_repeat_; } void update() { internal_update(); } void addSeed(const SeedExt& seed) { seeds_.expand(); seeds_.back() = seed; } bool contain(TIndexOffU left, TIndexOffU right) const; protected: template void get_consensus_seq(const TStr& s, EList& seeds, size_t sb, size_t se, size_t min_left_ext, size_t min_right_ext, size_t max_ed, const RepeatParameter& rp, EList& ed_seed_nums, EList* left_consensuses, EList* right_consensuses) const; void internal_update(); protected: size_t repeat_id_; size_t parent_id_; string consensus_; EList seeds_; EList seed_ranges_; EList snps_; bool self_repeat_; static EList ca_ed_; static EList ca_ed2_; static string ca_s_; static string ca_s2_; public: static size_t seed_merge_tried; static size_t seed_merged; }; class RB_RepeatExt : public RB_Repeat { public: RB_RepeatExt() {} ~RB_RepeatExt() {} template void extendConsensus(const RepeatParameter& rp, const TStr& s); float mergeable(const RB_Repeat& o) const; template bool merge(const RepeatParameter& rp, const TStr& s, RB_SWAligner& swalginer, const RB_RepeatExt& o, bool contain, size_t seed_i, size_t seed_j, bool debug = false); protected: template void get_consensus_seq(const TStr& s, EList& seeds, size_t sb, size_t se, size_t min_left_ext, size_t min_right_ext, size_t max_ed, const RepeatParameter& rp, EList& ed_seed_nums, EList* left_consensuses, EList* right_consensuses) const; template bool align(const RepeatParameter& rp, const TStr& ref, const string& s, const EList >& s_kmer_table, const string& s2, EList& offsets, size_t k, SeedExt& seed, int consensus_approx_left, int consensus_approx_right, size_t left, size_t right, bool debug); bool isSelfRepeat(const RepeatParameter& rp, const string& s, const EList >& s_kmer_table, EList& offsets, size_t k, bool debug); }; // check if a set of seeds are already processed class RB_RepeatManager { public: size_t numCoords() const { return range_to_repeats_.size(); } bool checkRedundant(const RepeatParameter& rp, const map& repeat_map, const EList& positions, EList& to_remove) const; void addRepeat(const RB_Repeat* repeat); void addRepeat(Range range, size_t repeat_id); void removeRepeat(const RB_Repeat* repeat); void removeRepeat(Range range, size_t repeat_id); public: void showInfo(const RepeatParameter& rp, CoordHelper& coordHelper, const map& repeat_map, size_t num_case = 5) const; private: map > range_to_repeats_; }; class RB_SWAligner { public: RB_SWAligner(); ~RB_SWAligner(); void init_dyn(const RepeatParameter& rp); int alignStrings(const string &ref, const string &read, EList& edits, Coord& coord); void makePadString(const string& ref, const string& read, string& pad, size_t len); void doTest(const RepeatParameter& rp, const string& refstr, const string& readstr); void doTestCase1(const string& refstr, const string& readstr, TIndexOffU rpt_edit); private: // SimpleFunc scoreMin_; SimpleFunc nCeil_; SimpleFunc penCanIntronLen_; SimpleFunc penNoncanIntronLen_; Scoring *sc_; SwAligner swa; LinkedEList > rawEdits_; RandomSource rnd_; }; // SA Subset class RB_SubSA { public: RB_SubSA() {} ~RB_SubSA() {} void writeFile(ofstream& fp); void readFile(ifstream &fp); void init(TIndexOffU sa_size, TIndexOffU seed_len, TIndexOffU seed_count); template void push_back(const TStr& s, CoordHelper& coordHelper, TIndexOffU saElt, bool lastInput = false); inline TIndexOffU seed_len() const { return seed_len_; } inline TIndexOffU seed_count() const { return seed_count_; } inline size_t size() const { return repeat_list_.size(); } TIndexOffU get(size_t idx) const { return repeat_list_[idx]; } inline TIndexOffU operator[](size_t i) const { return get(i); } const EList& getRepeatIndex() const { return repeat_index_; } template Range find(const TStr& s, const string& seq) const; template TIndexOffU find_repeat_idx(const TStr& s, const string& seq) const; void setDone(TIndexOffU off, TIndexOffU len = 1); bool isDone(TIndexOffU off, TIndexOffU len = 1) const; void dump() const { cerr << "seed length: " << seed_len_ << endl; cerr << "minimum seed count: " << seed_count_ << endl; cerr << "number of seed groups: " << repeat_index_.size() << endl; cerr << "number of seeds: " << repeat_list_.size() << endl; } size_t getMemUsage() const { size_t tot = 0; tot += repeat_list_.totalCapacityBytes(); tot += repeat_index_.totalCapacityBytes(); tot += done_.totalCapacityBytes(); tot += temp_suffixes_.totalCapacityBytes(); return tot; } template void buildRepeatBase(const TStr& s, CoordHelper& coordHelper, const size_t max_len, EList& repeatBases); private: TIndexOffU sa_size_; TIndexOffU seed_len_; TIndexOffU seed_count_; EList temp_suffixes_; // repeat index EList repeat_list_; EList repeat_index_; EList done_; }; // find and write repeats template class RepeatBuilder { public: RepeatBuilder(TStr& s, TStr& sOriginal, const EList& szs, const EList& ref_names, bool forward_only, const string& filename); ~RepeatBuilder(); public: void readSA(const RepeatParameter& rp, BlockwiseSA& sa); void readSA(const RepeatParameter& rp, const string& filename); void readSA(const RepeatParameter& rp, const BitPackedArray& sa); void writeSA(const RepeatParameter& rp, const string& filename); void build(const RepeatParameter& rp); void sortRepeatGroup(); void saveRepeats(const RepeatParameter& rp); void saveConsensus(const RepeatParameter& rp, const string& repName); void saveFile(const RepeatParameter& rp); void addRepeatGroup(const RepeatParameter& rp, size_t& repeat_id, RB_RepeatManager& repeat_manger, const RB_RepeatBase& repeatBase, ostream& fp); void reassignSeeds(const RepeatParameter& rp, size_t repeat_bid, size_t repeat_eid, EList& seeds); bool checkSequenceMergeable(const string& ref, const string& read, EList& edits, Coord& coord, TIndexOffU rpt_len, TIndexOffU max_edit = 10); void doTest(const RepeatParameter& rp, const string& refstr, const string& readstr) { swaligner_.doTest(rp, refstr, readstr); } private: void saveAlleles(const RepeatParameter& rp, const string& repName, RB_Repeat& repeat, ofstream&, ofstream&, TIndexOffU grp_id, TIndexOffU&, TIndexOffU baseoff); void writeAllele(TIndexOffU grp_id, TIndexOffU allele_id, Range range, const string& seq_name, TIndexOffU baseoff, const EList& seeds, ostream &fp); void writeSNPs(ostream& fp, const string& rep_chr_name, const ESet& editset); void writeHaploType(const EList& haplo_list, const EList& seeds, TIndexOffU& hapl_id_base, const string& seq_name, ostream& fp); void generateHaploType(Range range, const EList& seeds, EList& haplo_list); private: const int output_width = 60; TStr& s_; TStr& sOriginal_; bool forward_only_; string filename_; RB_SubSA subSA_; TIndexOffU forward_length_; CoordHelper coordHelper_; // RB_SWAligner swaligner_; // Seeds EList consensus_all_; ELList seeds_; map repeat_map_; }; int strcmpPos(const string&, const string&, TIndexOffU&); template void dump_tstr(TStr& s); #endif /* __REPEAT_BUILDER_H__ */