/* * Copyright 2018, Chanhee Park and Daehwan Kim * * This file is part of HISAT 2. * * HISAT 2 is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * HISAT 2 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with HISAT 2. If not, see . */ #include #include #include #include #include "timer.h" #include "aligner_sw.h" #include "aligner_result.h" #include "scoring.h" #include "sstring.h" #include "repeat_builder.h" #include "repeat_kmer.h" #include "bit_packed_array.h" unsigned int levenshtein_distance(const std::string& s1, const std::string& s2) { const std::size_t len1 = s1.size(), len2 = s2.size(); std::vector col(len2+1), prevCol(len2+1); for (unsigned int i = 0; i < prevCol.size(); i++) prevCol[i] = i; for (unsigned int i = 0; i < len1; i++) { col[0] = i+1; for (unsigned int j = 0; j < len2; j++) // note that std::min({arg1, arg2, arg3}) works only in C++11, // for C++98 use std::min(std::min(arg1, arg2), arg3) col[j+1] = std::min( std::min(prevCol[1 + j] + 1, col[j] + 1), prevCol[j] + (s1[i]==s2[j] ? 0 : 1) ); col.swap(prevCol); } return prevCol[len2]; } unsigned int hamming_distance(const string& s1, const string& s2) { assert_eq(s1.length(), s2.length()); uint32_t cnt = 0; for(size_t i = 0; i < s1.length(); i++) { if(s1[i] != s2[i]) { cnt++; } } return cnt; } string reverse(const string& str) { string rev = str; size_t str_len = str.length(); for(size_t i = 0; i < str_len; i++) { rev[i] = str[str_len - i - 1]; } return rev; } string reverse_complement(const string& str) { string rev = str; size_t str_len = str.length(); for(size_t i = 0; i < str_len; i++) { char nt = str[str_len - i - 1]; rev[i] = asc2dnacomp[nt]; } return rev; } template TIndexOffU getLCP(const TStr& s, CoordHelper& coordHelper, TIndexOffU a, TIndexOffU b, TIndexOffU max_len = 1000) { TIndexOffU a_end = coordHelper.getEnd(a); TIndexOffU b_end = coordHelper.getEnd(b); assert_leq(a_end, s.length()); assert_leq(b_end, s.length()); TIndexOffU k = 0; while((a + k) < a_end && (b + k) < b_end && k < max_len) { if(s[a + k] != s[b + k]) { break; } k++; } return k; } template bool isSameSequenceUpto(const TStr& s, CoordHelper& coordHelper, TIndexOffU a, TIndexOffU b, TIndexOffU upto) { TIndexOffU a_end = coordHelper.getEnd(a); TIndexOffU b_end = coordHelper.getEnd(b); assert_leq(a_end, s.length()); assert_leq(b_end, s.length()); if(a + upto > a_end || b + upto > b_end) return false; for(int k = upto - 1; k >= 0; k--) { if(s[a + k] != s[b + k]) { return false; } } return true; } bool isSenseDominant(CoordHelper& coordHelper, const EList& positions, size_t seed_len) { // count the number of seeds on the sense strand size_t sense_mer_count = 0; for(size_t i = 0; i < positions.size(); i++) { if(positions[i] < coordHelper.forward_length()) sense_mer_count++; } // there exists two sets of seeds that are essentially the same // due to sense and antisense strands except palindromic sequences // choose one and skip the other one assert_leq(sense_mer_count, positions.size()); size_t antisense_mer_count = positions.size() - sense_mer_count; if(sense_mer_count < antisense_mer_count) { return false; } else if(sense_mer_count == antisense_mer_count) { assert_geq(positions.back(), coordHelper.forward_length()); TIndexOffU sense_pos = coordHelper.length() - positions.back() - seed_len; if(positions[0] > sense_pos) return false; } return true; } static const Range EMPTY_RANGE = Range(1, 0); struct RepeatRange { RepeatRange() { forward = true; }; RepeatRange(Range r, int id) : range(r), rg_id(id), forward(true) {}; RepeatRange(Range r, int id, bool fw) : range(r), rg_id(id), forward(fw) {}; Range range; int rg_id; bool forward; }; Range reverseRange(const Range& r, TIndexOffU size) { size_t len = r.second - r.first; Range rc; rc.first = size - r.second; rc.second = rc.first + len; return rc; } string reverseComplement(const string& str) { string rc; for(TIndexOffU si = str.length(); si > 0; si--) { rc.push_back(asc2dnacomp[str[si - 1]]); } return rc; } string toMDZ(const EList& edits, const string& read) { StackedAln stk; BTDnaString btread; BTString buf; btread.install(read.c_str(), true); stk.init(btread, edits, 0, 0, 0, 0); stk.buildMdz(); stk.writeMdz(&buf, NULL); return string(buf.toZBuf()); } string applyEdits(const string& ref, size_t read_len, const EList& edits, const Coord& coord) { string read; size_t ref_pos = coord.off(); size_t read_pos = 0; for(size_t i = 0; i < edits.size(); i++) { for(; read_pos < edits[i].pos; read_pos++, ref_pos++) { read.push_back(ref[ref_pos]); } if(edits[i].type == EDIT_TYPE_READ_GAP) { // delete on read ref_pos++; } else if(edits[i].type == EDIT_TYPE_REF_GAP) { // insert on read read.push_back(edits[i].qchr); read_pos++; } else if(edits[i].type == EDIT_TYPE_MM) { assert_eq(ref[ref_pos], edits[i].chr); read.push_back(edits[i].qchr); read_pos++; ref_pos++; } else { assert(false); } } for(; read_pos < read_len; read_pos++, ref_pos++) { read.push_back(ref[ref_pos]); } return read; } template static bool compareRepeatCoordByJoinedOff(const RepeatCoord& a, const RepeatCoord& b) { return a.joinedOff < b.joinedOff; } template string getString(const TStr& ref, TIndexOffU start, size_t len) { ASSERT_ONLY(const size_t ref_len = ref.length()); assert_leq(start + len, ref_len); string s; for(size_t i = 0; i < len; i++) { char nt = "ACGT"[ref[start + i]]; s.push_back(nt); } return s; } template void getString(const TStr& ref, TIndexOffU start, size_t len, string& s) { s.clear(); ASSERT_ONLY(const size_t ref_len = ref.length()); assert_leq(start + len, ref_len); for(size_t i = 0; i < len; i++) { char nt = "ACGT"[ref[start + i]]; s.push_back(nt); } } template inline uint8_t getSequenceBase(const TStr& ref, TIndexOffU pos) { assert_lt(pos, ref.length()); return (uint8_t)ref[pos]; } template void dump_tstr(const TStr& s) { static int print_width = 60; size_t s_len = s.length(); for(size_t i = 0; i < s_len; i += print_width) { string buf; for(size_t j = 0; (j < print_width) && (i + j < s_len); j++) { buf.push_back("ACGTN"[s[i + j]]); } cerr << buf << endl; } cerr << endl; } size_t getMaxMatchLen(const EList& edits, const size_t read_len) { size_t max_length = 0; uint32_t last_edit_pos = 0; uint32_t len = 0; if (edits.size() == 0) { // no edits --> exact match return read_len; } for(size_t i = 0; i < edits.size(); i++) { if (last_edit_pos > edits[i].pos) { continue; } len = edits[i].pos - last_edit_pos; if(len > max_length) { max_length = len; } last_edit_pos = edits[i].pos + 1; } if (last_edit_pos < read_len) { len = read_len - last_edit_pos; if(len > max_length) { max_length = len; } } return max_length; } int max_index(size_t array[4]) { int max_idx = 0; for(size_t i = 1; i < 4; i++) { if(array[max_idx] < array[i]) { max_idx = i; } } return max_idx; } CoordHelper::CoordHelper(TIndexOffU length, TIndexOffU forward_length, const EList& szs, const EList& ref_names) : length_(length), forward_length_(forward_length), szs_(szs), ref_namelines_(ref_names) { // build ref_names_ from ref_namelines_ buildNames(); buildJoinedFragment(); } CoordHelper::~CoordHelper() { } void CoordHelper::buildNames() { ref_names_.resize(ref_namelines_.size()); for(size_t i = 0; i < ref_namelines_.size(); i++) { string& nameline = ref_namelines_[i]; for(size_t j = 0; j < nameline.length(); j++) { char n = nameline[j]; if(n == ' ') { break; } ref_names_[i].push_back(n); } } } int CoordHelper::mapJoinedOffToSeq(TIndexOffU joinedOff) { /* search from cached_list */ if(num_cached_ > 0) { for(size_t i = 0; i < num_cached_; i++) { Fragments *frag = &cached_[i]; if(frag->contain(joinedOff)) { return frag->frag_id; } } /* fall through */ } /* search list */ int top = 0; int bot = fraglist_.size() - 1; int pos = 0; Fragments *frag = &fraglist_[pos]; while((bot - top) > 1) { pos = top + ((bot - top) >> 1); frag = &fraglist_[pos]; if (joinedOff < frag->joinedOff) { bot = pos; } else { top = pos; } } frag = &fraglist_[top]; if (frag->contain(joinedOff)) { // update cache if (num_cached_ < CACHE_SIZE_JOINEDFRG) { cached_[num_cached_] = *frag; num_cached_++; } else { cached_[victim_] = *frag; victim_++; // round-robin victim_ %= CACHE_SIZE_JOINEDFRG; } return top; } return -1; } int CoordHelper::getGenomeCoord(TIndexOffU joinedOff, string& chr_name, TIndexOffU& pos_in_chr) { int seq_id = mapJoinedOffToSeq(joinedOff); if (seq_id < 0) { return -1; } Fragments *frag = &fraglist_[seq_id]; TIndexOffU offset = joinedOff - frag->joinedOff; pos_in_chr = frag->seqOff + offset; chr_name = ref_names_[frag->seq_id]; return 0; } void CoordHelper::buildJoinedFragment() { TIndexOffU acc_joinedOff = 0; TIndexOffU acc_seqOff = 0; TIndexOffU seq_id = 0; TIndexOffU frag_id = 0; for(size_t i = 0; i < szs_.size(); i++) { const RefRecord& rec = szs_[i]; if(rec.len == 0) { continue; } if (rec.first) { acc_seqOff = 0; seq_id++; } fraglist_.expand(); fraglist_.back().joinedOff = acc_joinedOff; fraglist_.back().length = rec.len; fraglist_.back().frag_id = frag_id++; fraglist_.back().seq_id = seq_id - 1; fraglist_.back().first = rec.first; acc_seqOff += rec.off; fraglist_.back().seqOff = acc_seqOff; acc_joinedOff += rec.len; acc_seqOff += rec.len; } // Add Last Fragment(empty) fraglist_.expand(); fraglist_.back().joinedOff = acc_joinedOff; fraglist_.back().length = 0; fraglist_.back().seqOff = acc_seqOff; fraglist_.back().first = false; fraglist_.back().frag_id = frag_id; fraglist_.back().seq_id = seq_id; } TIndexOffU CoordHelper::getEnd(TIndexOffU e) { assert_lt(e, length_) TIndexOffU end = 0; if(e < forward_length_) { int frag_id = mapJoinedOffToSeq(e); assert_geq(frag_id, 0); end = fraglist_[frag_id].joinedOff + fraglist_[frag_id].length; } else { // ReverseComplement // a, b are positions w.r.t reverse complement string. // fragment map is based on forward string int frag_id = mapJoinedOffToSeq(length_ - e - 1); assert_geq(frag_id, 0); end = length_ - fraglist_[frag_id].joinedOff; } assert_leq(end, length_); return end; } TIndexOffU CoordHelper::getStart(TIndexOffU e) { assert_lt(e, length_) TIndexOffU start = 0; if(e < forward_length_) { int frag_id = mapJoinedOffToSeq(e); assert_geq(frag_id, 0); start = fraglist_[frag_id].joinedOff; } else { // ReverseComplement // a, b are positions w.r.t reverse complement string. // fragment map is based on forward string int frag_id = mapJoinedOffToSeq(length_ - e - 1); assert_geq(frag_id, 0); start = length_ - (fraglist_[frag_id].joinedOff + fraglist_[frag_id].length); } assert_leq(start, length_); return start; } template void SeedExt::getExtendedSeedSequence(const TStr& s, string& seq) const { seq.clear(); TIndexOffU prev_end = orig_pos.first; for(size_t j = 0; j < left_gaps.size(); j++) { TIndexOffU curr_end = orig_pos.first - left_gaps[j].first; assert_leq(curr_end, prev_end); if(curr_end < prev_end) { seq = getString(s, curr_end, prev_end - curr_end) + seq; } int gap_len = left_gaps[j].second; assert_neq(gap_len, 0); if(gap_len > 0) { // deletion string gap_str(gap_len, '-'); seq = gap_str + seq; } else { curr_end += gap_len; } prev_end = curr_end; } assert_leq(pos.first, prev_end); if(pos.first < prev_end) { seq = getString(s, pos.first, prev_end - pos.first) + seq; } if(orig_pos.second - orig_pos.first > 0) seq += getString(s, orig_pos.first, orig_pos.second - orig_pos.first); TIndexOffU prev_begin = orig_pos.second; for(size_t j = 0; j < right_gaps.size(); j++) { TIndexOffU curr_begin = orig_pos.second + right_gaps[j].first; assert_leq(prev_begin, curr_begin); if(prev_begin < curr_begin) { seq += getString(s, prev_begin, curr_begin - prev_begin); } int gap_len = right_gaps[j].second; assert_neq(gap_len, 0); if(gap_len > 0) { // deletion string gap_str(gap_len, '-'); seq += gap_str; } else { curr_begin -= gap_len; } prev_begin = curr_begin; } assert_leq(prev_begin, pos.second); if(prev_begin < pos.second) { seq += getString(s, prev_begin, pos.second - prev_begin); } } SeedSNP *lookup_add_SNP(EList& repeat_snps, SeedSNP& snp) { for(size_t i = 0; i < repeat_snps.size(); i++) { if(*repeat_snps[i] == snp) { return repeat_snps[i]; } } repeat_snps.expand(); repeat_snps.back() = new SeedSNP(); *(repeat_snps.back()) = snp; return repeat_snps.back(); } template void SeedExt::generateSNPs(const TStr &s, const string& consensus, EList& repeat_snps) { EList > gaps; // Merge Gaps { TIndexOffU left_ext_len = getLeftExtLength(); for(size_t g = 0; g < left_gaps.size(); g++) { gaps.expand(); gaps.back().first = left_ext_len - left_gaps[g].first + left_gaps[g].second; gaps.back().second = left_gaps[g].second; } TIndexOffU right_base = orig_pos.second - pos.first; for(size_t g = 0; g < right_gaps.size(); g++) { gaps.expand(); gaps.back().first = right_base + right_gaps[g].first; gaps.back().second = right_gaps[g].second; } gaps.sort(); } // string con_ref; string seq_read; TIndexOffU prev_con_pos = consensus_pos.first; TIndexOffU prev_seq_pos = pos.first; for(size_t g = 0; g < gaps.size(); g++) { TIndexOffU curr_seq_pos = pos.first + gaps[g].first; TIndexOffU curr_con_pos = prev_con_pos + (curr_seq_pos - prev_seq_pos); con_ref = consensus.substr(prev_con_pos, curr_con_pos - prev_con_pos); seq_read = getString(s, prev_seq_pos, curr_seq_pos - prev_seq_pos); for(size_t l = 0; l < con_ref.length(); l++) { if(seq_read[l] != con_ref[l]) { // Single SeedSNP snp; snp.init(EDIT_TYPE_MM, prev_con_pos + l, 1, seq_read[l]); // lookup repeat_snp snps.expand(); snps.back() = lookup_add_SNP(repeat_snps, snp); } } int gap_len = gaps[g].second; assert_neq(gap_len, 0); if(gap_len > 0) { // Deletion (gap on read) string gap_str(gap_len, '-'); SeedSNP snp; snp.init(EDIT_TYPE_READ_GAP, curr_con_pos, gap_len, consensus.substr(curr_con_pos, gap_len)); snps.expand(); snps.back() = lookup_add_SNP(repeat_snps, snp); curr_con_pos += gap_len; } else { // Insertion (gap on reference) string gap_str(abs(gap_len), '-'); SeedSNP snp; snp.init(EDIT_TYPE_REF_GAP, curr_con_pos, abs(gap_len), getString(s, curr_seq_pos, abs(gap_len))); snps.expand(); snps.back() = lookup_add_SNP(repeat_snps, snp); curr_seq_pos += abs(gap_len); } prev_con_pos = curr_con_pos; prev_seq_pos = curr_seq_pos; } assert_eq(consensus_pos.second - prev_con_pos, pos.second - prev_seq_pos); con_ref = consensus.substr(prev_con_pos, consensus_pos.second - prev_con_pos); seq_read = getString(s, prev_seq_pos, pos.second - prev_seq_pos); for(size_t l = 0; l < con_ref.length(); l++) { if(seq_read[l] != con_ref[l]) { // Single SeedSNP snp; snp.init(EDIT_TYPE_MM, prev_con_pos + l, 1, seq_read[l]); snps.expand(); snps.back() = lookup_add_SNP(repeat_snps, snp); } } } bool seedCmp(const SeedExt& s, const SeedExt& s2) { if(s.getLength() != s2.getLength()) return s.getLength() > s2.getLength(); if(s.pos.first != s2.pos.first) return s.pos.first < s2.pos.first; if(s.pos.second != s2.pos.second) return s.pos.second < s2.pos.second; return false; } template void RB_Repeat::init(const RepeatParameter& rp, const TStr& s, CoordHelper& coordHelper, const RB_SubSA& subSA, const RB_RepeatBase& repeatBase) { consensus_ = repeatBase.seq; assert_geq(consensus_.length(), rp.min_repeat_len); assert_eq(consensus_.length(), rp.min_repeat_len + repeatBase.nodes.size() - 1); EList positions, next_positions; const EList& repeat_index = subSA.getRepeatIndex(); seeds_.clear(); for(size_t n = 0; n < repeatBase.nodes.size(); n++) { TIndexOffU idx = repeatBase.nodes[n]; TIndexOffU saBegin = repeat_index[idx]; TIndexOffU saEnd = (idx + 1 < repeat_index.size() ? repeat_index[idx+1] : subSA.size()); next_positions.clear(); for(size_t sa = saBegin; sa < saEnd; sa++) { TIndexOffU left = subSA.get(sa); TIndexOffU right = left + subSA.seed_len(); next_positions.push_back(left); if(left > 0) { size_t idx = positions.bsearchLoBound(left - 1); if(idx < positions.size() && positions[idx] == left - 1) continue; } seeds_.expand(); SeedExt& seed = seeds_.back(); seed.reset(); seed.orig_pos = pair(left, right); seed.pos = seed.orig_pos; seed.consensus_pos.first = n; seed.consensus_pos.second = n + rp.min_repeat_len; seed.bound = pair(coordHelper.getStart(left), coordHelper.getEnd(left)); #ifndef NDEBUG string tmp_str = getString(s, seed.pos.first, seed.pos.second - seed.pos.first); assert_eq(tmp_str, consensus_.substr(n, seed.pos.second - seed.pos.first)); #endif for(size_t p = seed.consensus_pos.second; p < consensus_.length(); p++) { size_t pos = seed.pos.second; if(pos >= seed.bound.second) break; int nt = getSequenceBase(s, pos); if("ACGT"[nt] != consensus_[p]) break; seed.pos.second++; seed.consensus_pos.second++; } } positions = next_positions; } internal_update(); } template void RB_Repeat::extendConsensus(const RepeatParameter& rp, const TStr& s) { size_t remain = seeds_.size(); EList left_consensuses, right_consensuses, empty_consensuses; EList ed_seed_nums; const TIndexOffU default_max_ext_len = (rp.max_repeat_len - rp.seed_len) / 2; const TIndexOffU seed_mm = 1; string left_ext_consensus = "", right_ext_consensus = ""; empty_consensuses.resize(seed_mm + 1); empty_consensuses.fill(""); while(remain >= rp.repeat_count) { for(size_t i = 0; i < remain; i++) { seeds_[i].done = false; seeds_[i].curr_ext_len = 0; } // extend seeds in left or right direction TIndexOffU max_ext_len = min(default_max_ext_len, (TIndexOffU)(rp.max_repeat_len - consensus_.length())); get_consensus_seq(s, seeds_, 0, // seed begin remain, // seed end max_ext_len, max_ext_len, seed_mm, rp, ed_seed_nums, &left_consensuses, &right_consensuses); size_t allowed_seed_mm = 0; left_ext_consensus.clear(); right_ext_consensus.clear(); for(int i = 0 /* (int)seed_mm */; i >= 0; i--) { size_t extlen = (ed_seed_nums[i] < rp.repeat_count ? 0 : (left_consensuses[i].length() + right_consensuses[i].length())); // if(extlen <= 0 || extlen < max_ext_len * i / seed_mm) //if(extlen < max_ext_len * 2) // continue; if(extlen <= 0) continue; left_ext_consensus = left_consensuses[i]; right_ext_consensus = right_consensuses[i]; allowed_seed_mm = (size_t)i; assert_gt(left_ext_consensus.length(), 0); assert_gt(right_ext_consensus.length(), 0); break; } size_t num_passed_seeds = 0; if(left_ext_consensus.length() > 0 && right_ext_consensus.length()) { consensus_ = reverse(left_ext_consensus) + consensus_; consensus_ += right_ext_consensus; #if 0 calc_edit_dist(s, seeds_, 0, remain, left ? consensuses : empty_consensuses, left ? empty_consensuses : consensuses, allowed_seed_mm); #endif // update seeds for(size_t i = 0; i < seeds_.size(); i++) { SeedExt& seed = seeds_[i]; if(i < remain) { if(seed.ed <= allowed_seed_mm) { num_passed_seeds++; seed.done = true; seed.total_ed += seed.ed; seed.pos.first -= left_ext_consensus.length(); seed.pos.second += right_ext_consensus.length(); seed.consensus_pos.first = 0; seed.consensus_pos.second = consensus_.length(); if(seed.left_gaps.size() > 0 && seed.left_gaps.back().first >= seed.orig_pos.first - seed.pos.first) { int gap_len = seed.left_gaps.back().second; seed.pos.first += gap_len; } if(seed.right_gaps.size() > 0 && seed.right_gaps.back().first >= seed.pos.second - seed.orig_pos.second) { int gap_len = seed.right_gaps.back().second; seed.pos.second -= gap_len; } } } } // move up "done" seeds size_t j = 0; for(size_t i = 0; i < remain; i++) { if(!seeds_[i].done) continue; assert_geq(i, j); if(i > j) { SeedExt temp = seeds_[j]; seeds_[j] = seeds_[i]; seeds_[i] = temp; // Find next "undone" seed j++; while(j < i && seeds_[j].done) { j++; } assert(j < remain && !seeds_[j].done); } else { j = i + 1; } } } remain = num_passed_seeds; break; } // while(remain >= rp.repeat_count) #ifndef NDEBUG // make sure seed positions are unique EList > seed_poses; for(size_t i = 0; i < seeds_.size(); i++) { seed_poses.expand(); seed_poses.back() = seeds_[i].orig_pos; } seed_poses.sort(); for(size_t i = 0; i + 1 < seed_poses.size(); i++) { if(seed_poses[i].first == seed_poses[i+1].first) { assert_lt(seed_poses[i].second, seed_poses[i+1].second); } else { assert_lt(seed_poses[i].first, seed_poses[i+1].first); } } for(size_t i = 0; i < seeds_.size(); i++) { assert(seeds_[i].valid()); } #endif RB_Repeat::internal_update(); // check repeats within a repeat sequence self_repeat_ = false; for(size_t i = 0; i + 1 < seed_ranges_.size(); i++) { const RB_AlleleCoord& range = seed_ranges_[i]; for(size_t j = i + 1; j < seed_ranges_.size(); j++) { RB_AlleleCoord& range2 = seed_ranges_[j]; if(range.right <= range2.left) break; self_repeat_ = true; } } } template void RB_Repeat::getNextRepeat(const RepeatParameter& rp, const TStr& s, RB_Repeat& o) { o.reset(); size_t i = 0; for(i = 0; i < seeds_.size() && seeds_[i].done; i++); if(i >= seeds_.size()) return; for(size_t j = i; j < seeds_.size(); j++) { assert(!seeds_[j].done); o.seeds_.expand(); o.seeds_.back() = seeds_[j]; } seeds_.resizeExact(i); internal_update(); assert_gt(o.seeds_.size(), 0); o.consensus_ = getString(s, o.seeds_[0].orig_pos.first, o.seeds_[0].orig_pos.second - o.seeds_[0].orig_pos.first); for(size_t j = 0; j < o.seeds_.size(); j++) { o.seeds_[j].pos = o.seeds_[j].orig_pos; o.seeds_[j].left_gaps.clear(); o.seeds_[j].right_gaps.clear(); o.seeds_[j].ed = o.seeds_[j].total_ed = 0; } } void RB_Repeat::internal_update() { sort(seeds_.begin(), seeds_.end(), seedCmp); seed_ranges_.resizeExact(seeds_.size()); for(size_t i = 0; i < seeds_.size(); i++) { seed_ranges_[i].left = seeds_[i].pos.first; seed_ranges_[i].right = seeds_[i].pos.second; seed_ranges_[i].idx = i; } seed_ranges_.sort(); // check repeats within a repeat sequence size_t remove_count = 0; for(size_t i = 0; i + 1 < seed_ranges_.size(); i++) { const RB_AlleleCoord& range = seed_ranges_[i]; if(range.left == numeric_limits::max()) continue; for(size_t j = i + 1; j < seed_ranges_.size(); j++) { RB_AlleleCoord& range2 = seed_ranges_[j]; if(range2.left == numeric_limits::max()) continue; if(range.right <= range2.left) break; if(range.right >= range2.right) { range2.left = numeric_limits::max(); remove_count++; } } } if(remove_count <= 0) return; for(size_t i = 0; i < seed_ranges_.size(); i++) { if(seed_ranges_[i].left == numeric_limits::max()) seeds_[seed_ranges_[i].idx].reset(); } sort(seeds_.begin(), seeds_.end(), seedCmp); #ifndef NDEBUG for(size_t i = 0; i < seeds_.size(); i++) { if(i < seeds_.size() - remove_count) { assert_lt(seeds_[i].pos.first, seeds_[i].pos.second); } else { assert_eq(seeds_[i].pos.first, seeds_[i].pos.second); } } #endif seeds_.resize(seeds_.size() - remove_count); seed_ranges_.resize(seeds_.size()); for(size_t i = 0; i < seeds_.size(); i++) { seed_ranges_[i].left = seeds_[i].pos.first; seed_ranges_[i].right = seeds_[i].pos.second; seed_ranges_[i].idx = i; } seed_ranges_.sort(); } bool RB_Repeat::contain(TIndexOffU left, TIndexOffU right) const { size_t l = 0, r = seed_ranges_.size(); while(l < r) { size_t m = (l + r) / 2; const RB_AlleleCoord& coord = seed_ranges_[m]; if(right <= coord.left) { r = m; } else if(left >= coord.right) { l = m + 1; } else { return coord.left <= left && right <= coord.right; } } return false; } template void RB_Repeat::saveSeedExtension(const RepeatParameter& rp, const TStr& s, CoordHelper& coordHelper, TIndexOffU grp_id, ostream& fp, size_t& total_repeat_seq_len, size_t& total_allele_seq_len) const { // apply color, which is compatible with linux commands such as cat and less -r #if 1 const string red = "\033[31m", reset = "\033[0m", resetline = "\033[4m", redline = "\033[4;31m"; #else const string red = "", reset = "", resetline = "", redline = ""; #endif bool show_exterior_seq = true; const size_t max_show_seq_len = 700; size_t total_count = 0; for(size_t i = 0; i < seeds_.size(); i++) { const SeedExt& seed = seeds_[i]; size_t ext_len = seed.getLength(); if(ext_len < rp.min_repeat_len) continue; total_allele_seq_len += ext_len; total_count++; bool sense_strand = seed.pos.first < coordHelper.forward_length(); fp << setw(6) << repeat_id_ << " " << setw(5) << seeds_.size(); fp << " " << setw(4) << i; fp << " " << setw(4) << ext_len; fp << " " << (sense_strand ? '+' : '-'); fp << " " << setw(10) << seed.pos.first << " " << setw(10) << seed.pos.second; fp << " " << setw(10) << seed.orig_pos.first << " " << setw(10) << seed.orig_pos.second; string chr_name; TIndexOffU pos_in_chr; if(sense_strand) { coordHelper.getGenomeCoord(seed.pos.first, chr_name, pos_in_chr); } else { coordHelper.getGenomeCoord(s.length() - seed.pos.first - (seed.pos.second - seed.pos.first), chr_name, pos_in_chr); } fp << " " << setw(5) << chr_name << ":" << setw(10) << std::left << pos_in_chr << std::right; if(sense_strand) { coordHelper.getGenomeCoord(seed.pos.second, chr_name, pos_in_chr); } else { coordHelper.getGenomeCoord(s.length() - seed.pos.second - (seed.pos.second - seed.pos.first), chr_name, pos_in_chr); } fp << " " << setw(5) << chr_name << ":" << setw(10) << std::left << pos_in_chr << std::right; if(!seed.aligned) { fp << endl; continue; } string deststr = ""; seed.getExtendedSeedSequence(s, deststr); // add exterior sequences if(seed.consensus_pos.first > 0) { TIndexOffU seq_pos, seq_len; if(seed.pos.first >= seed.consensus_pos.first) { seq_pos = seed.pos.first - seed.consensus_pos.first; seq_len = seed.consensus_pos.first; } else { seq_pos = 0; seq_len = seed.pos.first; } deststr = getString(s, seq_pos, seq_len) + deststr; if(seq_len < seed.consensus_pos.first) { deststr = string(seed.consensus_pos.first - seq_len, 'N') + deststr; } } if(seed.consensus_pos.second < consensus_.length()) { deststr += getString(s, seed.pos.second, consensus_.length() - seed.consensus_pos.second); } assert_eq(consensus_.length(), deststr.length()); fp << " "; // print sequence w.r.t. the current group for(size_t j = 0; j < min(consensus_.length(), max_show_seq_len); j++) { bool outside = j < seed.consensus_pos.first || j >= seed.consensus_pos.second; bool different = (consensus_[j] != deststr[j]); if(outside) { if(show_exterior_seq) { if(different) { fp << redline; fp << deststr[j]; fp << reset; } else { fp << resetline; fp << deststr[j]; fp << reset; } } else { if(j < seed.consensus_pos.first) fp << " "; } } else { if(different) fp << red; fp << deststr[j]; if(different) fp << reset; } } #if 0 fp << "\t"; for(size_t ei = 0; ei < seed.edits.size(); ei++) { const Edit& edit = seed.edits[ei]; if(ei > 0) fp << ","; fp << edit; if (edit.snpID != std::numeric_limits::max()) { fp << "@" << edit.snpID; } } #endif fp << endl; } if(total_count > 0) fp << setw(5) << total_count << endl << endl; total_repeat_seq_len += consensus_.length(); } template size_t calc_edit_dist(const TStr& s, const SeedExt& seed, const SeedExt& seed2, size_t left_ext, size_t right_ext, size_t max_ed) { if(seed.bound.first + left_ext > seed.pos.first || seed.pos.second + right_ext > seed.bound.second || seed2.bound.first + left_ext > seed2.pos.first || seed2.pos.second + right_ext > seed2.bound.second) return max_ed + 1; size_t ed = 0; for(size_t i = 0; i < left_ext; i++) { int ch = getSequenceBase(s, seed.pos.first - i - 1); assert_range(0, 3, ch); int ch2 = getSequenceBase(s, seed2.pos.first - i - 1); assert_range(0, 3, ch2); if(ch != ch2) ed++; if(ed > max_ed) return ed; } for(size_t i = 0; i < right_ext; i++) { int ch = getSequenceBase(s, seed.pos.second + i); assert_range(0, 3, ch); int ch2 = getSequenceBase(s, seed2.pos.second + i); assert_range(0, 3, ch2); if(ch != ch2) ed++; if(ed > max_ed) return ed; } return ed; } size_t extract_kmer(const string& seq, size_t offset, size_t k = 5) { assert_leq(offset + k, seq.length()); size_t kmer = 0; for(size_t i = 0; i < k; i++) { kmer = (kmer << 2) | asc2dna[seq[offset + i]]; } return kmer; } size_t next_kmer(size_t kmer, char base, size_t k = 5) { kmer &= ((1 << ((k-1)*2))) - 1; kmer = (kmer << 2) | asc2dna[base]; return kmer; } void build_kmer_table(const string& consensus, EList >& kmer_table, size_t k = 5) { kmer_table.clear(); if(consensus.length() < k) return; size_t kmer = 0; for(size_t i = 0; i + k <= consensus.length(); i++) { if(i == 0) { kmer = extract_kmer(consensus, i, k); } else { kmer = next_kmer(kmer, consensus[i+k-1], k); } kmer_table.expand(); kmer_table.back().first = kmer; kmer_table.back().second = i; } kmer_table.sort(); } void find_gap_pos(const string& s, const string& s2, EList& ed, EList& ed2, bool del, size_t gap_len, size_t max_mm, size_t& gap_pos, size_t& mm, bool debug = false) { assert_eq(s.length(), s2.length()); size_t seq_len = s.length(); ed.resizeExact(seq_len); ed.fill(max_mm + 1); ed2.resizeExact(seq_len); ed2.fill(max_mm + 1); // from left to right for(size_t i = 0; i < seq_len; i++) { size_t add = (s[i] == s2[i] ? 0 : 1); if(i == 0) ed[i] = add; else ed[i] = ed[i-1] + add; if(ed[i] >= max_mm + 1) break; } // from right to left size_t s_sub = del ? 0 : gap_len; size_t s2_sub = del ? gap_len : 0; for(int i = seq_len - 1; i >= gap_len; i--) { size_t add = (s[i - s_sub] == s2[i - s2_sub] ? 0 : 1); if(i == seq_len - 1) ed2[i] = add; else ed2[i] = ed2[i+1] + add; if(ed2[i] > max_mm) break; } if(debug) { cout << s << endl << s2 << endl; for(size_t i = 0; i < ed.size(); i++) { cout << (ed[i] % 10); } cout << endl; for(size_t i = 0; i < ed2.size(); i++) { cout << (ed2[i] % 10); } cout << endl; } size_t min_mm = ed2[gap_len]; int min_mm_i = -1; assert_eq(ed.size(), ed2.size()); for(size_t i = 0; i + gap_len + 1 < ed.size(); i++) { if(ed[i] > max_mm) break; size_t cur_mm = ed[i] + ed2[i + gap_len + 1]; if(cur_mm < min_mm) { min_mm = cur_mm; min_mm_i = i; } } mm = min_mm; if(mm > max_mm) return; gap_pos = (size_t)(min_mm_i + 1); } void align_with_one_gap(const string& s, const EList >& s_kmer_table, const string& s2, EList& counts, size_t max_gap, size_t max_mm, size_t& mm, size_t& gap_pos, int& gap_len, size_t k = 5) { mm = max_mm + 1; assert_eq(s.length(), s2.length()); counts.resizeExact(max_gap * 2 + 1); counts.fillZero(); size_t max_count = 0, max_count_i = 0; for(size_t i = 0; i + k <= s2.length(); i += 1) { pair kmer(0, 0); kmer.first = extract_kmer(s2, i, k); size_t lb = s_kmer_table.bsearchLoBound(kmer); while(lb < s_kmer_table.size() && kmer.first == s_kmer_table[lb].first) { int gap = (int)s_kmer_table[lb].second - (int)i; if(gap != 0 && abs(gap) < max_gap) { size_t gap_i = (size_t)(gap + max_gap); counts[gap_i]++; if(counts[gap_i] > max_count) { max_count = counts[gap_i]; max_count_i = gap_i; } } lb++; } } if(max_count <= 0) return; assert_lt(max_count_i, counts.size()); int gap = (int)max_count_i - (int)max_gap; assert_neq(gap, 0); size_t abs_gap = abs(gap); bool del = gap > 0; EList ed, ed2; find_gap_pos(s, s2, ed, ed2, del, abs_gap, max_mm, gap_pos, mm); if(mm > max_mm) return; gap_len = del ? abs_gap : -abs_gap; #ifndef NDEBUG assert_leq(mm, max_mm); string ds = s, ds2 = s2; size_t debug_mm = 0; if(del) ds.erase(gap_pos, abs_gap); else ds2.erase(gap_pos, abs_gap); for(size_t i = 0; i < min(ds.length(), ds2.length()); i++) { if(ds[i] != ds2[i]) debug_mm++; } assert_eq(debug_mm, mm); #endif } template void calc_edit_dist(const TStr& s, EList& seeds, size_t sb, size_t se, const EList& left_consensuses, const EList& right_consensuses, uint32_t max_ed) { string left_consensus = left_consensuses[max_ed]; string right_consensus = right_consensuses[max_ed]; EList > left_kmer_table, right_kmer_table; build_kmer_table(left_consensus, left_kmer_table); build_kmer_table(right_consensus, right_kmer_table); string left_seq, right_seq; const size_t max_gap = 10; EList counts; size_t left_ext = left_consensus.length(); size_t right_ext = right_consensus.length(); for(size_t i = sb; i < se; i++) { SeedExt& seed = seeds[i]; if(seed.bound.first + left_ext > seed.pos.first || seed.pos.second + right_ext > seed.bound.second) { seed.ed = max_ed + 1; continue; } size_t left_ed = 0; if(left_ext > 0) { getString(s, seed.pos.first - left_ext, left_ext, left_seq); reverse(left_seq.begin(), left_seq.end()); for(size_t j = 0; j < left_ext; j++) { if(left_seq[j] != left_consensus[j]) left_ed++; if(left_ed <= max_ed && j < left_consensuses[left_ed].length()) { seed.curr_ext_len = j + 1; } } if(left_ed > max_ed) { size_t gap_pos = 0; int gap_len = 0; align_with_one_gap(left_consensus, left_kmer_table, left_seq, counts, min(left_ed - max_ed, max_gap), max_ed, left_ed, gap_pos, gap_len); if(left_ed <= max_ed) { seed.left_gaps.expand(); seed.left_gaps.back().first = seed.getLeftExtLength() + gap_pos; seed.left_gaps.back().second = gap_len; } } } else { left_seq.clear(); } size_t right_ed = 0; if(right_ext > 0) { getString(s, seed.pos.second, right_ext, right_seq); for(size_t j = 0; j < right_ext; j++) { if(right_seq[j] != right_consensus[j]) right_ed++; if(right_ed <= max_ed && j < right_consensuses[right_ed].length()) { seed.curr_ext_len = j + 1; } } if(right_ed > max_ed) { size_t gap_pos = 0; int gap_len = 0; align_with_one_gap(right_consensus, right_kmer_table, right_seq, counts, min(right_ed - max_ed, max_gap), max_ed, right_ed, gap_pos, gap_len); if(right_ed <= max_ed) { seed.right_gaps.expand(); seed.right_gaps.back().first = seed.getRightExtLength() + gap_pos; seed.right_gaps.back().second = gap_len; } } } else { right_seq.clear(); } seed.ed = left_ed + right_ed; } } template void RB_Repeat::get_consensus_seq(const TStr& s, EList& seeds, size_t sb, size_t se, size_t min_left_ext, size_t min_right_ext, size_t max_ed, const RepeatParameter& rp, EList& ed_seed_nums, EList* left_consensuses, EList* right_consensuses) const { assert_lt(sb, se); assert_geq(se - sb, rp.seed_count); assert_leq(se, seeds.size()); if(left_consensuses != NULL) { left_consensuses->clear(); left_consensuses->resize(max_ed + 1); for(size_t i = 0; i < max_ed + 1; i++) { (*left_consensuses)[i].clear(); } } if(right_consensuses != NULL) { right_consensuses->clear(); right_consensuses->resize(max_ed + 1); for(size_t i = 0; i < max_ed + 1; i++) { (*right_consensuses)[i].clear(); } } assert_eq(min_left_ext, min_right_ext); EList seqs; seqs.reserveExact(seeds.size()); size_t max_i = 0, max_count = 0; while(min_left_ext > 0) { seqs.clear(); max_i = 0; max_count = 0; for(size_t i = sb; i < se; i++) { const SeedExt& seed = seeds[i]; if(seeds[i].bound.first + min_left_ext > seed.pos.first || seed.pos.second + min_right_ext > seed.bound.second) continue; seqs.expand(); if(min_left_ext > 0) { getString(s, seed.pos.first - min_left_ext, min_left_ext, seqs.back()); } if(min_right_ext > 0) { seqs.back() += getString(s, seed.pos.second, min_right_ext); } } seqs.sort(); for(size_t i = 0; i + max_count < seqs.size();) { size_t count = 1; for(size_t j = i + 1; j < seqs.size(); j++) { if(seqs[i] == seqs[j]) count++; else break; } if(count >= max_count) { max_count = count; max_i = i; } i = i + count; } if(max_count >= rp.seed_count) break; min_left_ext--; min_right_ext--; } // update ed // extend consensus string ed_seed_nums.resize(max_ed + 1); ed_seed_nums.fillZero(); EList next_ed_seed_nums; next_ed_seed_nums.resize(max_ed + 1); if(max_count < rp.seed_count) return; for(size_t i = sb; i < se; i++) seeds[i].ed = 0; size_t seed_ext_len = 0; while(seed_ext_len < max(min_left_ext, min_right_ext)) { // select base to be used for extension uint8_t left_ext_base = 0; if(seed_ext_len < min_left_ext) left_ext_base = seqs[max_i][min_left_ext - seed_ext_len - 1]; uint8_t right_ext_base = 0; if(seed_ext_len < min_right_ext) right_ext_base = seqs[max_i][min_left_ext + seed_ext_len]; // estimate extended ed next_ed_seed_nums.fillZero(); for(size_t i = sb; i < se; i++) { uint32_t est_ed = seeds[i].ed; if(seed_ext_len < min_left_ext) { if(seeds[i].bound.first + seed_ext_len + 1 <= seeds[i].pos.first) { TIndexOffU left_pos = seeds[i].pos.first - seed_ext_len - 1; int ch = getSequenceBase(s, left_pos); assert_range(0, 3, ch); if ("ACGT"[ch] != left_ext_base) { est_ed++; } } else { est_ed = max_ed + 1; } } if(seed_ext_len < min_right_ext) { TIndexOffU right_pos = seeds[i].pos.second + seed_ext_len; if(right_pos >= seeds[i].bound.second) { est_ed = max_ed + 1; } else { int ch = getSequenceBase(s, right_pos); assert_range(0, 3, ch); if ("ACGT"[ch] != right_ext_base) { est_ed++; } } } if(est_ed <= max_ed) { next_ed_seed_nums[est_ed]++; } } for(size_t i = 1; i < next_ed_seed_nums.size(); i++) next_ed_seed_nums[i] += next_ed_seed_nums[i-1]; if(next_ed_seed_nums[max_ed] < rp.repeat_count) { break; } for(size_t i = sb; i < se; i++) { if(seed_ext_len < min_left_ext) { if(seeds[i].bound.first + seed_ext_len + 1 <= seeds[i].pos.first) { TIndexOffU left_pos = seeds[i].pos.first - seed_ext_len - 1; int ch = getSequenceBase(s, left_pos); assert_range(0, 3, ch); if ("ACGT"[ch] != left_ext_base) { seeds[i].ed++; } } else { seeds[i].ed = max_ed + 1; } } if(seed_ext_len < min_right_ext) { TIndexOffU right_pos = seeds[i].pos.second + seed_ext_len; if(right_pos >= seeds[i].bound.second) { seeds[i].ed = max_ed + 1; } else { int ch = getSequenceBase(s, right_pos); assert_range(0, 3, ch); if ("ACGT"[ch] != right_ext_base) { seeds[i].ed++; } } } } for(size_t i = 0; i < next_ed_seed_nums.size(); i++) { if(next_ed_seed_nums[i] < rp.repeat_count) continue; ed_seed_nums[i] = next_ed_seed_nums[i]; if(seed_ext_len < min_left_ext) { assert(left_consensuses != NULL); (*left_consensuses)[i] += left_ext_base; } if(seed_ext_len < min_right_ext) { assert(right_consensuses != NULL); (*right_consensuses)[i] += right_ext_base; } } seed_ext_len++; } } template void RB_RepeatExt::get_consensus_seq(const TStr& s, EList& seeds, size_t sb, size_t se, size_t min_left_ext, size_t min_right_ext, size_t max_ed, const RepeatParameter& rp, EList& ed_seed_nums, EList* left_consensuses, EList* right_consensuses) const { assert_lt(sb, se); assert_leq(se, seeds.size()); if(left_consensuses != NULL) { left_consensuses->clear(); left_consensuses->resize(max_ed + 1); for(size_t i = 0; i < max_ed + 1; i++) { (*left_consensuses)[i].clear(); } } if(right_consensuses != NULL) { right_consensuses->clear(); right_consensuses->resize(max_ed + 1); for(size_t i = 0; i < max_ed + 1; i++) { (*right_consensuses)[i].clear(); } } // cluster sequences EList belongto; belongto.resizeExact(se - sb); for(size_t i = 0; i < belongto.size(); i++) belongto[i] = i; for(size_t i = 0; i + 1 < belongto.size(); i++) { for(size_t j = i + 1; j < belongto.size(); j++) { if(belongto[j] != j) continue; size_t ed = calc_edit_dist(s, seeds[sb + i], seeds[sb + j], min_left_ext, min_right_ext, max_ed + 1); if(ed <= max_ed + 1) { belongto[j] = belongto[i]; } } } // find the maximum group that has the most sequences EList vote; vote.resizeExact(seeds.size()); vote.fillZero(); size_t max_group = 0; for(size_t i = 0; i < belongto.size(); i++) { size_t cur_group = belongto[i]; assert_lt(cur_group, vote.size()); vote[cur_group]++; if(cur_group != max_group && vote[cur_group] > vote[max_group]) { max_group = cur_group; } } // reuse vote to store seeds EList& consensus_group = vote; consensus_group.clear(); for(size_t i = 0; i < belongto.size(); i++) { if(belongto[i] == max_group) consensus_group.push_back(i); } // update ed // extend consensus string ed_seed_nums.resize(max_ed + 1); ed_seed_nums.fillZero(); EList next_ed_seed_nums; next_ed_seed_nums.resize(max_ed + 1); for(size_t i = sb; i < se; i++) seeds[i].ed = 0; size_t seed_ext_len = 0; while(seed_ext_len < max(min_left_ext, min_right_ext)) { // count base size_t l_count[4] = {0, }; size_t r_count[4] = {0, }; for(size_t i = 0; i < consensus_group.size(); i++) { size_t si = sb + consensus_group[i]; int ch; if(seed_ext_len < min_left_ext) { if(seeds[i].bound.first + seed_ext_len + 1 <= seeds[i].pos.first) { ch = getSequenceBase(s, seeds[si].pos.first - seed_ext_len - 1); assert_range(0, 3, ch); l_count[ch]++; } } if(seed_ext_len < min_right_ext) { if(seeds[i].bound.second + seed_ext_len) { ch = getSequenceBase(s, seeds[si].pos.second + seed_ext_len); assert_range(0, 3, ch); r_count[ch]++; } } } // select base to be used for extension uint8_t left_ext_base = 0; if(seed_ext_len < min_left_ext) left_ext_base = max_index(l_count); uint8_t right_ext_base = 0; if(seed_ext_len < min_right_ext) right_ext_base = max_index(r_count); // estimate extended ed next_ed_seed_nums.fillZero(); for(size_t i = sb; i < se; i++) { uint32_t est_ed = seeds[i].ed; if(seed_ext_len < min_left_ext) { if(seeds[i].bound.first + seed_ext_len + 1 <= seeds[i].pos.first) { TIndexOffU left_pos = seeds[i].pos.first - seed_ext_len - 1; int ch = getSequenceBase(s, left_pos); assert_range(0, 3, ch); if (ch != left_ext_base) { est_ed++; } } else { est_ed = max_ed + 1; } } if(seed_ext_len < min_right_ext) { TIndexOffU right_pos = seeds[i].pos.second + seed_ext_len; if(right_pos >= seeds[i].bound.second) { est_ed = max_ed + 1; } else { int ch = getSequenceBase(s, right_pos); assert_range(0, 3, ch); if (ch != right_ext_base) { est_ed++; } } } if(est_ed <= max_ed) { next_ed_seed_nums[est_ed]++; } } for(size_t i = 1; i < next_ed_seed_nums.size(); i++) next_ed_seed_nums[i] += next_ed_seed_nums[i-1]; if(next_ed_seed_nums[max_ed] < rp.repeat_count) { break; } for(size_t i = sb; i < se; i++) { if(seed_ext_len < min_left_ext) { if(seeds[i].bound.first + seed_ext_len + 1 <= seeds[i].pos.first) { TIndexOffU left_pos = seeds[i].pos.first - seed_ext_len - 1; int ch = getSequenceBase(s, left_pos); assert_range(0, 3, ch); if (ch != left_ext_base) { seeds[i].ed++; } } else { seeds[i].ed = max_ed + 1; } } if(seed_ext_len < min_right_ext) { TIndexOffU right_pos = seeds[i].pos.second + seed_ext_len; if(right_pos >= seeds[i].bound.second) { seeds[i].ed = max_ed + 1; } else { int ch = getSequenceBase(s, right_pos); assert_range(0, 3, ch); if (ch != right_ext_base) { seeds[i].ed++; } } } } for(size_t i = 0; i < next_ed_seed_nums.size(); i++) { if(next_ed_seed_nums[i] < rp.repeat_count) continue; ed_seed_nums[i] = next_ed_seed_nums[i]; if(seed_ext_len < min_left_ext) { assert(left_consensuses != NULL); (*left_consensuses)[i] += (char)("ACGT"[left_ext_base]); } if(seed_ext_len < min_right_ext) { assert(right_consensuses != NULL); (*right_consensuses)[i] += (char)("ACGT"[right_ext_base]); } } seed_ext_len++; } } EList RB_Repeat::ca_ed_; EList RB_Repeat::ca_ed2_; string RB_Repeat::ca_s_; string RB_Repeat::ca_s2_; size_t RB_Repeat::seed_merge_tried = 0; size_t RB_Repeat::seed_merged = 0; template void RB_RepeatExt::extendConsensus(const RepeatParameter& rp, const TStr& s) { size_t remain = seeds_.size(); EList consensuses, empty_consensuses; EList ed_seed_nums; bool left = true; const TIndexOffU default_max_ext_len = 100; const TIndexOffU seed_mm = 4; string ext_consensus = ""; empty_consensuses.resize(seed_mm + 1); empty_consensuses.fill(""); while(remain >= rp.repeat_count) { for(size_t i = 0; i < remain; i++) { seeds_[i].done = false; seeds_[i].curr_ext_len = 0; } // extend seeds in left or right direction TIndexOffU max_ext_len = min(default_max_ext_len, (TIndexOffU)(rp.max_repeat_len - consensus_.length())); get_consensus_seq(s, seeds_, 0, // seed begin remain, // seed end left ? max_ext_len : 0, left ? 0 : max_ext_len, seed_mm, rp, ed_seed_nums, left ? &consensuses : NULL, left ? NULL : &consensuses); size_t allowed_seed_mm = 0; ext_consensus.clear(); for(int i = (int)seed_mm; i >= 0; i--) { size_t extlen = (ed_seed_nums[i] < rp.repeat_count ? 0 : consensuses[i].length()); if(extlen <= 0 || extlen < max_ext_len * i / seed_mm) continue; if(i > 0 && consensuses[i].length() <= consensuses[i-1].length() + 5) continue; ext_consensus = consensuses[i]; allowed_seed_mm = (size_t)i; assert(ext_consensus.length() > 0); break; } size_t num_passed_seeds = 0; if(ext_consensus.length() > 0) { if(left) consensus_ = reverse(ext_consensus) + consensus_; else consensus_ += ext_consensus; calc_edit_dist(s, seeds_, 0, remain, left ? consensuses : empty_consensuses, left ? empty_consensuses : consensuses, allowed_seed_mm); // update seeds for(size_t i = 0; i < seeds_.size(); i++) { SeedExt& seed = seeds_[i]; if(i < remain) { if(seed.ed <= allowed_seed_mm) { num_passed_seeds++; seed.done = true; seed.total_ed += seed.ed; if(left) { if(seed.left_gaps.size() > 0 && seed.left_gaps.back().first >= seed.orig_pos.first - seed.pos.first) { int gap_len = seed.left_gaps.back().second; seed.pos.first += gap_len; } seed.pos.first -= ext_consensus.length(); seed.consensus_pos.first = 0; seed.consensus_pos.second = consensus_.length(); } else { if(seed.right_gaps.size() > 0 && seed.right_gaps.back().first >= seed.pos.second - seed.orig_pos.second) { int gap_len = seed.right_gaps.back().second; seed.pos.second -= gap_len; } seed.pos.second += ext_consensus.length(); seed.consensus_pos.second = consensus_.length(); } } else { if(left) { assert_leq(seed.curr_ext_len, ext_consensus.length()); TIndexOffU adjust = ext_consensus.length() - seed.curr_ext_len; seed.consensus_pos.first += adjust; seed.consensus_pos.second += ext_consensus.length(); assert_leq(seed.curr_ext_len, seed.pos.first); seed.pos.first -= seed.curr_ext_len; } else { assert_leq(seed.curr_ext_len, ext_consensus.length()); seed.consensus_pos.second += seed.curr_ext_len; seed.pos.second += seed.curr_ext_len; } } } else { if(left) { seed.consensus_pos.first += ext_consensus.length(); seed.consensus_pos.second += ext_consensus.length(); } } } // move up "done" seeds size_t j = 0; for(size_t i = 0; i < remain; i++) { if(!seeds_[i].done) continue; assert_geq(i, j); if(i > j) { SeedExt temp = seeds_[j]; seeds_[j] = seeds_[i]; seeds_[i] = temp; // Find next "undone" seed j++; while(j < i && seeds_[j].done) { j++; } assert(j < remain && !seeds_[j].done); } else { j = i + 1; } } } remain = num_passed_seeds; if(remain < rp.repeat_count) { if(left) { left = false; remain = seeds_.size(); } } } // while(remain >= rp.repeat_count) #ifndef NDEBUG // make sure seed positions are unique EList > seed_poses; for(size_t i = 0; i < seeds_.size(); i++) { seed_poses.expand(); seed_poses.back() = seeds_[i].orig_pos; } seed_poses.sort(); for(size_t i = 0; i + 1 < seed_poses.size(); i++) { if(seed_poses[i].first == seed_poses[i+1].first) { assert_lt(seed_poses[i].second, seed_poses[i+1].second); } else { assert_lt(seed_poses[i].first, seed_poses[i+1].first); } } for(size_t i = 0; i < seeds_.size(); i++) { assert(seeds_[i].valid()); } #endif internal_update(); // check repeats within a repeat sequence self_repeat_ = false; for(size_t i = 0; i + 1 < seed_ranges_.size(); i++) { const RB_AlleleCoord& range = seed_ranges_[i]; for(size_t j = i + 1; j < seed_ranges_.size(); j++) { RB_AlleleCoord& range2 = seed_ranges_[j]; if(range.right <= range2.left) break; self_repeat_ = true; } } } bool RB_Repeat::overlap(const RB_Repeat& o, bool& contain, bool& left, size_t& seed_i, size_t& seed_j, bool debug) const { contain = left = false; seed_i = seed_j = 0; size_t p = 0, p2 = 0; while(p < seed_ranges_.size() && p2 < o.seed_ranges_.size()) { const RB_AlleleCoord& range = seed_ranges_[p]; const SeedExt& seed = seeds_[range.idx]; Range range_ = seed.getExtendedRange(consensus_.length()); RB_AlleleCoord ex_range(range_.first, range_.second, p); bool representative = (float)range.len() >= consensus_.length() * 0.95f; const RB_AlleleCoord& range2 = o.seed_ranges_[p2]; const SeedExt& seed2 = o.seeds_[range2.idx]; Range range2_ = seed2.getExtendedRange(o.consensus().length()); RB_AlleleCoord ex_range2(range2_.first, range2_.second, p2); bool representative2 = (float)range2.len() >= o.consensus_.length() * 0.95f; seed_i = range.idx; seed_j = range2.idx; const size_t relax = 5; if(representative && representative2) { if(ex_range.overlap_len(ex_range2) > 0) { if(ex_range.contain(ex_range2, relax)) { contain = true; left = true; } else if(ex_range2.contain(ex_range, relax)) { contain = true; left = false; } else { left = ex_range.left <= ex_range2.left; } return true; } } else if(representative) { if(range2.contain(ex_range, relax)) { contain = true; left = false; return true; } } else if(representative2) { if(range.contain(ex_range2, relax)) { contain = true; left = true; return true; } } if(range.right <= range2.right) p++; if(range2.right <= range.right) p2++; } return false; } void RB_Repeat::showInfo(const RepeatParameter& rp, CoordHelper& coordHelper) const { cerr << "\trepeat id: " << repeat_id_ << endl; cerr << "\tnumber of alleles: " << seeds_.size() << endl; cerr << "\tconsensus length: " << consensus_.length() << endl; cerr << consensus_ << endl; for(size_t i = 0; i < seeds_.size(); i++) { const SeedExt& seed = seeds_[i]; size_t ext_len = seed.getLength(); if(ext_len < rp.min_repeat_len) continue; bool sense_strand = seed.pos.first < coordHelper.forward_length(); cerr << "\t\t" << setw(4) << i << " " << setw(4) << ext_len; cerr << " " << (sense_strand ? '+' : '-'); cerr << " " << setw(10) << seed.pos.first << " " << setw(10) << seed.pos.second; cerr << " " << setw(4) << seed.consensus_pos.first << " " << setw(4) << seed.consensus_pos.second; cerr << " " << (seed.aligned ? "aligned" : "unaligned"); cerr << endl; } } template void RB_Repeat::generateSNPs(const RepeatParameter& rp, const TStr& s, TIndexOffU grp_id) { EList& seeds = this->seeds(); for(size_t i = 0; i < seeds.size(); i++) { SeedExt& seed = seeds[i]; if(!seed.aligned) { continue; } if(seed.getLength() < rp.min_repeat_len) { continue; } assert_eq(seed.getLength(), seed.pos.second - seed.pos.first); seed.generateSNPs(s, consensus(), snps()); } if(snps().size() > 0) { sort(snps_.begin(), snps().end(), SeedSNP::cmpSeedSNPByPos); } } void RB_Repeat::saveSNPs(ofstream &fp, TIndexOffU grp_id, TIndexOffU& snp_id_base) { string chr_name = "rep" + to_string(repeat_id_); for(size_t j = 0; j < snps_.size(); j++) { SeedSNP& snp = *snps_[j]; // assign SNPid snp.id = (snp_id_base++); fp << "rps" << snp.id; fp << "\t"; if(snp.type == EDIT_TYPE_MM) { fp << "single"; } else if(snp.type == EDIT_TYPE_READ_GAP) { fp << "deletion"; } else if(snp.type == EDIT_TYPE_REF_GAP) { fp << "insertion"; } else { assert(false); } fp << "\t"; fp << chr_name; fp << "\t"; fp << snp.pos; fp << "\t"; if(snp.type == EDIT_TYPE_MM || snp.type == EDIT_TYPE_REF_GAP) { fp << snp.base; } else if(snp.type == EDIT_TYPE_READ_GAP) { fp << snp.len; } else { assert(false); } fp << endl; } } float RB_RepeatExt::mergeable(const RB_Repeat& o) const { const EList& ranges = seed_ranges(); const EList& ranges2 = o.seed_ranges(); size_t num_overlap_bp = 0; size_t p = 0, p2 = 0; while(p < ranges.size() && p2 < ranges2.size()) { const RB_AlleleCoord& range = ranges[p]; const RB_AlleleCoord& range2 = ranges2[p2]; TIndexOffU overlap = range.overlap_len(range2); num_overlap_bp += overlap; if(range.right <= range2.right) p++; else p2++; } size_t num_total_bp = 0, num_total_bp2 = 0; for(size_t r = 0; r < ranges.size(); r++) num_total_bp += (ranges[r].right - ranges[r].left); for(size_t r = 0; r < ranges2.size(); r++) num_total_bp2 += (ranges2[r].right - ranges2[r].left); float portion = float(num_overlap_bp) / float(min(num_total_bp, num_total_bp2)); return portion; } inline void get_next_range(const EList& offsets, size_t i, Range& r, float& avg) { r.first = i; for(; r.first < offsets.size() && offsets[r.first] < 0; r.first++); r.second = r.first + 1; avg = 0.0f; if(r.first < offsets.size()) { float diff = (float)offsets[r.first] - (float)r.first; avg += diff; } for(; r.second < offsets.size() && offsets[r.second] >= 0 && (r.second == 0 || offsets[r.second] >= offsets[r.second - 1]); r.second++) { float diff = (float)offsets[r.second] - (float)r.second; avg += diff; } avg /= (float)(r.second - r.first); return; } template bool RB_RepeatExt::align(const RepeatParameter& rp, const TStr& ref, const string& s, const EList >& s_kmer_table, const string& s2, EList& offsets, size_t k, SeedExt& seed, int consensus_approx_left, int consensus_approx_right, size_t left, size_t right, bool debug) { RB_Repeat::seed_merge_tried++; seed.reset(); seed.pos.first = left; seed.pos.second = right; seed.orig_pos.first = seed.pos.first; seed.orig_pos.second = seed.orig_pos.first; seed.consensus_pos.first = 0; seed.consensus_pos.second = consensus_.length(); seed.aligned = false; { const int query_len = right - left; const int consensus_len = consensus_approx_right - consensus_approx_left; const int abs_gap_len = max(abs(consensus_len - query_len), 5); offsets.resize(s2.length()); offsets.fill(-1); pair kmer(0, 0); for(size_t i = 0; i + k <= s2.length(); i++) { if(i == 0) { kmer.first = extract_kmer(s2, i, k); } else { kmer.first = next_kmer(kmer.first, s2[i+k-1], k); } size_t lb = s_kmer_table.bsearchLoBound(kmer); while(lb < s_kmer_table.size() && kmer.first == s_kmer_table[lb].first) { int expected = (int)i + consensus_approx_left; int real = (int)s_kmer_table[lb].second; int abs_diff = abs(expected - real); if(abs_diff <= abs_gap_len * 2 || debug) { if(offsets[i] == -1) { offsets[i] = (int)s_kmer_table[lb].second; } else if(offsets[i] >= 0) { offsets[i] = -2; } else { assert_lt(offsets[i], -1); offsets[i] -= 1; } } lb++; } if(offsets[i] > 0 && i + k == s2.length()) { for(size_t j = i + 1; j < s2.length(); j++) { offsets[j] = offsets[j-1] + 1; } } } } if(debug) { cerr << "initial offsets" << endl; for(size_t j = 0; j < offsets.size(); j++) { cout << j << ": " << offsets[j] << " " << s2[j] << ": " << (offsets[j] < 0 ? ' ' : s[offsets[j]]) << endl; } } // remove inconsistent positions Range range; float range_avg; get_next_range(offsets, 0, range, range_avg); while(range.second < offsets.size()) { Range range2; float range_avg2; get_next_range(offsets, range.second, range2, range_avg2); if(range2.first >= offsets.size()) break; assert_leq(range.second, range2.first); float abs_diff = abs(range_avg - range_avg2); if(offsets[range.second - 1] > offsets[range2.first] || (abs_diff > 10.0f && (range.second - range.first < 5 || range2.second - range2.first < 5)) || abs_diff > 20.0f) { if(range.second - range.first < range2.second - range2.first) { for(size_t i = range.first; i < range.second; i++) { offsets[i] = -1; } range = range2; range_avg = range_avg2; } else { for(size_t i = range2.first; i < range2.second; i++) { offsets[i] = -1; } } } else { range = range2; range_avg = range_avg2; } } bool weighted_avg_inited = false; float weighted_avg = -1.0f; for(size_t i = 0; i < offsets.size(); i++) { if(offsets[i] < 0) continue; float diff = (float)offsets[i] - (float)i; if(weighted_avg_inited) { if(abs(diff - weighted_avg) > 20.0f) { offsets[i] = -1; continue; } } if(weighted_avg < 0.0f) { weighted_avg = diff; weighted_avg_inited = true; } else { weighted_avg = 0.8f * weighted_avg + 0.2f * diff; } } if(debug) { cerr << "after filtering" << endl; for(size_t j = 0; j < offsets.size(); j++) { cout << j << ": " << offsets[j] << " " << s2[j] << ": " << (offsets[j] < 0 ? ' ' : s[offsets[j]]) << endl; } } int i = 0; while(i < offsets.size()) { // skip non-negative offsets for(; i < offsets.size() && offsets[i] >= 0; i++); if(i >= offsets.size()) break; int j = i; for(; j < offsets.size(); j++) { if(offsets[j] >= 0) break; } assert(i >= offsets.size() || offsets[i] < 0); assert(j >= offsets.size() || offsets[j] >= 0); if(i > 0 && j < offsets.size()) { i -= 1; int left = offsets[i], right = offsets[j]; assert_geq(left, 0); if(left > right) return false; assert_leq(left, right); int ref_len = right - left + 1; int query_len = j - i + 1; if(query_len == ref_len) { // match for(size_t i2 = i + 1; i2 < j; i2++) offsets[i2] = offsets[i2-1] + 1; } else { // deletion or insertion bool del = query_len < ref_len; size_t gap_len = del ? ref_len - query_len : query_len - ref_len; size_t max_len = max(ref_len, query_len); const size_t max_mm = max_len / 25 + 1; const size_t very_max_mm = max(max_len / 2, max_mm); ca_s_ = s.substr(left, max_len); ca_s2_ = s2.substr(i, max_len); size_t gap_pos = 0, mm = very_max_mm + 1; find_gap_pos(ca_s_, ca_s2_, ca_ed_, ca_ed2_, del, gap_len, very_max_mm, gap_pos, mm, debug); if(mm > very_max_mm) { return false; } assert_lt(gap_pos, query_len); if(del) { for(size_t i2 = i + 1; i2 < j; i2++) { if(i2 - i == gap_pos) { offsets[i2] = offsets[i2-1] + gap_len; } else { offsets[i2] = offsets[i2-1] + 1; } } } else { for(size_t i2 = i + 1; i2 < j; i2++) { if(i2 - i >= gap_pos && i2 - i < gap_pos + gap_len) { offsets[i2] = offsets[i2-1]; } else { offsets[i2] = offsets[i2-1] + 1; } } } } } i = j; } if(debug) { cerr << "final offsets" << endl; for(size_t j = 0; j < offsets.size(); j++) { cout << j << ": " << offsets[j] << " " << s2[j] << ": " << (offsets[j] < 0 ? ' ' : s[offsets[j]]) << endl; } } #ifndef NDEBUG for(size_t i = 1; i < offsets.size(); i++) { if(offsets[i-1] < 0 || offsets[i] < 0) continue; assert_leq(offsets[i-1], offsets[i]); } #endif size_t b, e; for(b = 0; b < offsets.size() && offsets[b] < 0; b++); if(b >= offsets.size()) return false; assert_lt((size_t)b, offsets.size()); for(e = offsets.size() - 1; e > b && offsets[e] < 0; e--); if(b == e) return false; for(size_t p = b; p <= e; p++) { if(offsets[p] < 0) return false; } // try to fill the ends if(b > 0) { int mm = 0, pb = (int)b; for(int i = pb - 1; i >= 0; i--) { assert_geq(offsets[i+1], 0); if(offsets[i+1] == 0) break; if(s2[i] != s[offsets[i+1] - 1]) mm++; if(pb - i < 25 * (mm - 1)) break; offsets[i] = offsets[i+1] - 1; b = (size_t)i; } } if(e + 1 < offsets.size()) { int mm = 0, prev_end = (int)e; for(int i = prev_end + 1; i < offsets.size(); i++) { assert_geq(offsets[i-1], 0); if(offsets[i-1] + 1 >= s.length()) break; if(s2[i] != s[offsets[i-1] + 1]) mm++; if(i - prev_end < 25 * (mm - 1)) break; offsets[i] = offsets[i-1] + 1; e = (size_t)i; } } assert_geq(seed.pos.first, (TIndexOffU)b); seed.pos.first += (TIndexOffU)b; seed.pos.second = seed.pos.first + e - b + 1; seed.orig_pos.first = seed.pos.first; seed.orig_pos.second = seed.pos.first; seed.consensus_pos.first = offsets[b]; seed.consensus_pos.second = offsets[e] + 1; seed.aligned = true; for(int p = b; p < e; p++) { assert_geq(offsets[p], 0); assert_leq(offsets[p], offsets[p+1]); if(offsets[p] + 1 == offsets[p+1]) { // match continue; } else if(offsets[p] + 1 < offsets[p+1]) { // deletion seed.right_gaps.expand(); seed.right_gaps.back().first = p + 1 - b; seed.right_gaps.back().second = offsets[p+1] - offsets[p] - 1; } else { assert_eq(offsets[p], offsets[p+1]); int p2 = p + 1; for(; offsets[p2] == offsets[p2+1]; p2++); seed.right_gaps.expand(); seed.right_gaps.back().first = p + 1 - b; seed.right_gaps.back().second = (int)p - (int)p2; p = p2; } } if(debug) { string consensus_str = consensus_.substr(seed.consensus_pos.first, seed.consensus_pos.second - seed.consensus_pos.first); string seed_str; seed.getExtendedSeedSequence(ref, seed_str); assert_eq(consensus_str.length(), seed_str.length()); cerr << "consensus: " << consensus_str << endl; cerr << "seed : " << seed_str << endl; } RB_Repeat::seed_merged++; return true; } bool RB_RepeatExt::isSelfRepeat(const RepeatParameter& rp, const string& s, const EList >& s_kmer_table, EList& offsets, size_t k, bool debug) { offsets.resize(s.length()); offsets.fill(-1); pair kmer(0, 0); for(size_t i = 0; i + k <= s.length(); i++) { if(i == 0) { kmer.first = extract_kmer(s, i, k); } else { kmer.first = next_kmer(kmer.first, s[i+k-1], k); } size_t lb = s_kmer_table.bsearchLoBound(kmer); while(lb < s_kmer_table.size() && kmer.first == s_kmer_table[lb].first) { if(offsets[i] == -1) { offsets[i] = (int)s_kmer_table[lb].second; } else if(offsets[i] >= 0) { offsets[i] = -2; } else { assert_lt(offsets[i], -1); offsets[i] -= 1; } lb++; } if(offsets[i] > 0 && i + k == s.length()) { for(size_t j = i + 1; j < s.length(); j++) { offsets[j] = offsets[j-1] + 1; } } } if(debug) { cerr << "offsets" << endl; for(size_t j = 0; j < offsets.size(); j++) { cout << j << ": " << offsets[j] << " " << s[j] << ": " << (offsets[j] < 0 ? ' ' : s[offsets[j]]) << endl; } } const size_t min_repeat_size = 100; size_t repeat_count = 0; for(size_t i = 0; i + min_repeat_size < offsets.size(); i++) { if(offsets[i] >= -1) continue; size_t j = i + 1; for(; j < offsets.size() && offsets[j] <= -2; j++); if(j - i >= min_repeat_size) repeat_count++; i = j; } return repeat_count >= 2; } template bool RB_RepeatExt::merge(const RepeatParameter& rp, const TStr& s, RB_SWAligner& swalginer, const RB_RepeatExt& o, bool contain, size_t seed_i, size_t seed_j, bool debug) { // construct a new consensus sequence string prev_consensus = consensus_; size_t consensus_add_len = 0; { assert_lt(seed_i, seeds_.size()); assert_lt(seed_j, o.seeds_.size()); const SeedExt& seed = seeds_[seed_i]; const SeedExt& oseed = o.seeds_[seed_j]; Range range = seed.getExtendedRange(consensus_.length()); Range orange = oseed.getExtendedRange(o.consensus_.length()); assert_leq(range.first, orange.first + 10); consensus_add_len = (int)orange.first - (int)range.first; if(!contain) { if(range.second <= orange.first) { cerr << "something wrong: " << range.first << "-" << range.second << " "; cerr << orange.first << "-" << orange.second << " " << o.consensus_.length() << endl; assert(false); } if(range.second > orange.first && range.second - orange.first < o.consensus_.length()) { consensus_ += o.consensus_.substr(range.second - orange.first); } } } EList > merge_list; merge_list.reserveExact(o.seed_ranges_.size()); size_t p = 0, p2 = 0; const size_t relax = 5; while(p < seed_ranges_.size() && p2 < o.seed_ranges_.size()) { const RB_AlleleCoord& range = seed_ranges_[p]; assert_lt(range.idx, seeds_.size()); const RB_AlleleCoord& range2 = o.seed_ranges_[p2]; assert_lt(range2.idx, o.seeds_.size()); if(range.contain(range2, relax)) { merge_list.expand(); merge_list.back().first = p; merge_list.back().second = p2; if(debug) { cerr << p << ":" << range.left << "-" << range.right << " > "; cerr << p2 << ":" << range2.left << "-" << range2.right << endl; } } else if(range2.contain(range, relax)) { merge_list.expand(); merge_list.back().first = p; merge_list.back().second = p2; if(debug) { cerr << p << ":" << range.left << "-" << range.right << " < "; cerr << p2 << ":" << range2.left << "-" << range2.right << endl; } } else { TIndexOffU overlap_len = range.overlap_len(range2); bool stored = !merge_list.empty() && merge_list.back().first == p; bool stored2 = !merge_list.empty() && merge_list.back().second == p2; if(overlap_len > 0) { if(!stored && !stored2) { merge_list.expand(); merge_list.back().first = p; merge_list.back().second = p2; if(debug) { cerr << p << ":" << range.left << "-" << range.right << " ol "; cerr << p2 << ":" << range2.left << "-" << range2.right << endl; } } } else { if(range2.right <= range.left) { if(!stored2) { merge_list.expand(); merge_list.back().first = numeric_limits::max(); merge_list.back().second = p2; if(debug) { cerr << p << ":" << range.left << "-" << range.right << " <> "; cerr << p2 << ":" << range2.left << "-" << range2.right << endl; } } } else { assert_leq(range.right, range2.left); if(!stored) { merge_list.expand(); merge_list.back().first = p; merge_list.back().second = numeric_limits::max(); if(debug) { cerr << p << ":" << range.left << "-" << range.right << " <> "; cerr << p2 << ":" << range2.left << "-" << range2.right << endl; } } } } } if(range.right <= range2.right) p++; if(range2.right <= range.right) p2++; } assert(p == seeds_.size() || p2 == o.seeds_.size()); while(p < seed_ranges_.size()) { bool stored = !merge_list.empty() && merge_list.back().first == p; if(!stored) { const RB_AlleleCoord& range = seed_ranges_[p]; merge_list.expand(); merge_list.back().first = p; merge_list.back().second = numeric_limits::max(); if(debug) { cerr << p << ":" << range.left << "-" << range.right << " : <> " << endl; } } p++; } while(p2 < o.seed_ranges_.size()) { bool stored2 = !merge_list.empty() && merge_list.back().second == p2; if(!stored2) { const RB_AlleleCoord& range2 = o.seed_ranges_[p2]; merge_list.expand(); merge_list.back().first = numeric_limits::max(); merge_list.back().second = p2; if(debug) { cerr << ": <> " << p2 << ":" << range2.left << "-" << range2.right << endl; } } p2++; } assert(!merge_list.empty()); if(debug) { for(size_t i = 0; i < merge_list.size(); i++) { cerr << "merge list:" << endl; cerr << "\t" << (merge_list[i].first < seed_ranges_.size() ? (int)merge_list[i].first : -1); cerr << " " << (merge_list[i].second < o.seed_ranges_.size() ? (int)merge_list[i].second : -1) << endl; } } const size_t kmer_len = 12; EList > kmer_table; build_kmer_table(consensus_, kmer_table, kmer_len); EList offsets; bool self_repeat = isSelfRepeat(rp, consensus_, kmer_table, offsets, kmer_len, debug); if(self_repeat) { consensus_ = prev_consensus; return false; } string seq; for(size_t i = 0; i < merge_list.size(); i++) { size_t seed_id = (merge_list[i].first < seed_ranges_.size() ? seed_ranges_[merge_list[i].first].idx : merge_list[i].first); size_t oseed_id = (merge_list[i].second < o.seed_ranges_.size() ? o.seed_ranges_[merge_list[i].second].idx : merge_list[i].second); if(seed_id < seeds_.size()) { if(oseed_id >= o.seeds_.size()) continue; else if(seed_ranges_[merge_list[i].first].contain(o.seed_ranges_[merge_list[i].second])) continue; } const SeedExt* seed = (seed_id < seeds_.size() ? &seeds_[seed_id] : NULL); const SeedExt* oseed = (oseed_id < o.seeds_.size() ? &o.seeds_[oseed_id] : NULL); assert(seed != NULL || oseed != NULL); size_t left = (seed != NULL ? seed->pos.first : numeric_limits::max()); size_t right = (seed != NULL ? seed->pos.second : 0); int consensus_approx_left = (seed != NULL ? seed->consensus_pos.first : numeric_limits::max()); int consensus_approx_right = (seed != NULL ? seed->consensus_pos.second : 0); if(oseed != NULL) { if(oseed->pos.first < left) { left = oseed->pos.first; } if(oseed->pos.second > right) { right = oseed->pos.second; } int oconsensus_approx_left = oseed->consensus_pos.first + consensus_add_len; if(oconsensus_approx_left < consensus_approx_left) { consensus_approx_left = oconsensus_approx_left; } int oconsensus_approx_right = oseed->consensus_pos.second + consensus_add_len; if(oconsensus_approx_right > consensus_approx_right) { consensus_approx_right = oconsensus_approx_right; } } getString(s, left, right - left, seq); if(seed_id >= seeds_.size()) { seed_id = seeds_.size(); seeds_.expand(); } /* bool succ = */ align(rp, s, consensus_, kmer_table, seq, offsets, kmer_len, seeds_[seed_id], consensus_approx_left, consensus_approx_right, left, right, debug && right - left == 1080); } while(true) { internal_update(); size_t remove_count = 0; for(size_t i = 0; i + 1 < seed_ranges_.size(); i++) { RB_AlleleCoord& range = seed_ranges_[i]; if(range.left == numeric_limits::max()) break; for(size_t j = i + 1; j < seed_ranges_.size(); j++) { RB_AlleleCoord& range2 = seed_ranges_[j]; if(range2.left == numeric_limits::max()) break; if(range.right <= range2.left) break; getString(s, range.left, range2.right - range.left, seq); /* bool succ = */ align(rp, s, consensus_, kmer_table, seq, offsets, kmer_len, seeds_[range.idx], seeds_[range.idx].consensus_pos.first, seeds_[range2.idx].consensus_pos.second, range.left, range2.right, debug && range.left == 692422); range.left = seeds_[range.idx].pos.first; range.right = seeds_[range.idx].pos.second; seeds_[range2.idx].reset(); range2.left = numeric_limits::max(); remove_count++; } } if(remove_count <= 0) break; sort(seeds_.begin(), seeds_.end(), seedCmp); seeds_.resize(seeds_.size() - remove_count); } return true; } #define DMAX std::numeric_limits::max() RB_SWAligner::RB_SWAligner() { rnd_.init(0); } RB_SWAligner::~RB_SWAligner() { if(sc_) { delete sc_; } } void RB_SWAligner::init_dyn(const RepeatParameter& rp) { const int MM_PEN = 3; // const int MM_PEN = 6; const int GAP_PEN_LIN = 2; // const int GAP_PEN_LIN = (((MM_PEN) * rpt_edit_ + 1) * 1.0); const int GAP_PEN_CON = 4; // const int GAP_PEN_CON = (((MM_PEN) * rpt_edit_ + 1) * 1.0); const int MAX_PEN = MAX_I16; scoreMin_.init(SIMPLE_FUNC_LINEAR, rp.max_edit * MM_PEN * -1.0, 0.0); nCeil_.init(SIMPLE_FUNC_LINEAR, 0.0, 0.0); penCanIntronLen_.init(SIMPLE_FUNC_LOG, -8, 1); penNoncanIntronLen_.init(SIMPLE_FUNC_LOG, -8, 1); sc_ = new Scoring( DEFAULT_MATCH_BONUS, // constant reward for match DEFAULT_MM_PENALTY_TYPE, // how to penalize mismatches MM_PEN, // max mm penalty MM_PEN, // min mm penalty MAX_PEN, // max sc penalty MAX_PEN, // min sc penalty scoreMin_, // min score as function of read len nCeil_, // max # Ns as function of read len DEFAULT_N_PENALTY_TYPE, // how to penalize Ns in the read DEFAULT_N_PENALTY, // constant if N pelanty is a constant DEFAULT_N_CAT_PAIR, // whether to concat mates before N filtering GAP_PEN_CON, // constant coeff for read gap cost GAP_PEN_CON, // constant coeff for ref gap cost GAP_PEN_LIN, // linear coeff for read gap cost GAP_PEN_LIN, // linear coeff for ref gap cost 1 /* gGapBarrier */ // # rows at top/bot only entered diagonally ); } void RB_SWAligner::makePadString(const string& ref, const string& read, string& pad, size_t len) { pad.resize(len); for(size_t i = 0; i < len; i++) { // shift A->C, C->G, G->T, T->A pad[i] = "CGTA"[asc2dna[ref[i]]]; if(read[i] == pad[i]) { // shift pad[i] = "CGTA"[asc2dna[pad[i]]]; } } int head_len = len / 2; size_t pad_start = len - head_len; for(size_t i = 0; i < head_len; i++) { if(read[i] == pad[pad_start + i]) { // shift pad[pad_start + i] = "CGTA"[asc2dna[pad[pad_start + i]]]; } } } int RB_SWAligner::alignStrings(const string &ref, const string &read, EList& edits, Coord& coord) { // Prepare Strings // Read -> BTDnaString // Ref -> bit-encoded string //SwAligner swa; BTDnaString btread; BTString btqual; BTString btref; BTString btref2; BTDnaString btreadrc; BTString btqualrc; string qual = ""; for(size_t i = 0; i < read.length(); i++) { qual.push_back('I'); } #if 0 cerr << "REF : " << ref << endl; cerr << "READ: " << read << endl; cerr << "QUAL: " << qual << endl; #endif btread.install(read.c_str(), true); btreadrc = btread; btreadrc.reverseComp(); btqual.install(qual.c_str()); btqualrc = btqual; btref.install(ref.c_str()); TAlScore min_score = sc_->scoreMin.f((double)btread.length()); btref2 = btref; size_t nceil = 0; // size_t nrow = btread.length(); // Convert reference string to mask for(size_t i = 0; i < btref2.length(); i++) { if(toupper(btref2[i]) == 'N') { btref2.set(16, i); } else { int num = 0; int alts[] = {4, 4, 4, 4}; decodeNuc(toupper(btref2[i]), num, alts); assert_leq(num, 4); assert_gt(num, 0); btref2.set(0, i); for(int j = 0; j < num; j++) { btref2.set(btref2[i] | (1 << alts[j]), i); } } } bool fw = true; uint32_t refidx = 0; swa.initRead( btread, // read sequence btreadrc, btqual, // read qualities btqualrc, 0, // offset of first character within 'read' to consider btread.length(), // offset of last char (exclusive) in 'read' to consider *sc_); // local-alignment score floor DynProgFramer dpframe(false); size_t readgaps = 0; size_t refgaps = 0; size_t maxhalf = 0; DPRect rect; dpframe.frameSeedExtensionRect( 0, // ref offset implied by seed hit assuming no gaps btread.length(), // length of read sequence used in DP table btref2.length(), // length of reference readgaps, // max # of read gaps permitted in opp mate alignment refgaps, // max # of ref gaps permitted in opp mate alignment (size_t)nceil, // # Ns permitted maxhalf, // max width in either direction rect); // DP Rectangle assert(rect.repOk()); size_t cminlen = 2000, cpow2 = 4; swa.initRef( fw, // whether to align forward or revcomp read refidx, // reference ID rect, // DP rectangle btref2.wbuf(), // reference strings 0, // offset of first reference char to align to btref2.length(), // offset of last reference char to align to btref2.length(), // length of reference sequence *sc_, // scoring scheme min_score, // minimum score true, // use 8-bit SSE if positions cminlen, // minimum length for using checkpointing scheme cpow2, // interval b/t checkpointed diags; 1 << this false, // triangular mini-fills? false // is this a seed extension? ); TAlScore best = std::numeric_limits::min(); bool found = swa.align(rnd_, best); #ifdef DEBUGLOG cerr << "found: " << found << "\t" << best << "\t" << "minsc: " << min_score << endl; #endif if (found) { #ifdef DEBUGLOG //cerr << "CP " << "found: " << found << "\t" << best << "\t" << "minsc: " << min_score << endl; cerr << "REF : " << ref << endl; cerr << "READ: " << read << endl; #endif SwResult res; int max_match_len = 0; res.reset(); res.alres.init_raw_edits(&rawEdits_); found = swa.nextAlignment(res, best, rnd_); if (found) { edits = res.alres.ned(); //const TRefOff ref_off = res.alres.refoff(); //const Coord& coord = res.alres.refcoord(); coord = res.alres.refcoord(); //assert_geq(genomeHit._joinedOff + coord.off(), genomeHit.refoff()); #ifdef DEBUGLOG cerr << "num edits: " << edits.size() << endl; cerr << "coord: " << coord.off(); cerr << ", " << coord.ref(); cerr << ", " << coord.orient(); cerr << ", " << coord.joinedOff(); cerr << endl; Edit::print(cerr, edits); cerr << endl; Edit::printQAlign(cerr, btread, edits); #endif max_match_len = getMaxMatchLen(edits, btread.length()); #ifdef DEBUGLOG cerr << "max match length: " << max_match_len << endl; #endif } #ifdef DEBUGLOG cerr << "nextAlignment: " << found << endl; cerr << "-------------------------" << endl; #endif } return 0; } void RB_SWAligner::doTest(const RepeatParameter& rp, const string& refstr, const string& readstr) { init_dyn(rp); doTestCase1(refstr, readstr, rp.max_edit); } void RB_SWAligner::doTestCase1(const string& refstr, const string& readstr, TIndexOffU rpt_edit) { cerr << "doTestCase1----------------" << endl; EList edits; Coord coord; if (refstr.length() == 0 || readstr.length() == 0) { return; } EList ed; string pad; makePadString(refstr, readstr, pad, 5); string ref2 = pad + refstr + pad; string read2 = pad + readstr + pad; alignStrings(refstr, readstr, edits, coord); size_t left = pad.length(); size_t right = left + readstr.length(); edits.reserveExact(ed.size()); for(size_t i = 0; i < ed.size(); i++) { if(ed[i].pos >= left && ed[i].pos <= right) { edits.push_back(ed[i]); edits.back().pos -= left; } } #if 0 RepeatGroup rg; rg.edits = edits; rg.coord = coord; rg.seq = readstr; rg.base_offset = 0; string chr_name = "rep"; cerr << "REF : " << refstr << endl; cerr << "READ: " << readstr << endl; size_t snpids = 0; rg.buildSNPs(snpids); rg.writeSNPs(cerr, chr_name); cerr << endl; #endif } template RepeatBuilder::RepeatBuilder(TStr& s, TStr& sOriginal, const EList& szs, const EList& ref_names, bool forward_only, const string& filename) : s_(s), sOriginal_(sOriginal), coordHelper_(s.length(), forward_only ? s.length() : s.length() / 2, szs, ref_names), forward_only_(forward_only), filename_(filename), forward_length_(forward_only ? s.length() : s.length() / 2) { cerr << "RepeatBuilder: " << filename_ << endl; } template RepeatBuilder::~RepeatBuilder() { for(map::iterator it = repeat_map_.begin(); it != repeat_map_.end(); it++) { delete it->second; } repeat_map_.clear(); } template void RepeatBuilder::readSA(const RepeatParameter& rp, BlockwiseSA& sa) { TIndexOffU count = 0; subSA_.init(s_.length() + 1, rp.min_repeat_len, rp.repeat_count); while(count < s_.length() + 1) { TIndexOffU saElt = sa.nextSuffix(); count++; if(count && (count % 10000000 == 0)) { cerr << "SA count " << count << endl; } if(saElt == s_.length()) { assert_eq(count, s_.length() + 1); break; } subSA_.push_back(s_, coordHelper_, saElt, count == s_.length()); } cerr << "subSA size is " << subSA_.size() << endl; subSA_.dump(); #if 0 for(size_t i = 0; i < subSA_.size(); i++) { TIndexOffU joinedOff = subSA_[i]; fp << setw(10) << joinedOff << " " << getString(s_, joinedOff, rp.seed_len) << endl; } #endif cerr << "subSA mem Usage: " << subSA_.getMemUsage() << endl << endl; } template void RepeatBuilder::readSA(const RepeatParameter &rp, const BitPackedArray &sa) { TIndexOffU count = 0; subSA_.init(s_.length() + 1, rp.min_repeat_len, rp.repeat_count); for(size_t i = 0; i < sa.size(); i++) { TIndexOffU saElt = sa[i]; count++; if(count && (count % 10000000 == 0)) { cerr << "RB count " << count << endl; } if(saElt == s_.length()) { assert_eq(count, s_.length() + 1); break; } subSA_.push_back(s_, coordHelper_, saElt, count == s_.length()); } cerr << "subSA size: " << endl; subSA_.dump(); #if 0 for(size_t i = 0; i < subSA_.size(); i++) { TIndexOffU joinedOff = subSA_[i]; fp << setw(10) << joinedOff << " " << getString(s_, joinedOff, rp.seed_len) << endl; } #endif cerr << "subSA mem Usage: " << subSA_.getMemUsage() << endl << endl; } template void RepeatBuilder::build(const RepeatParameter& rp) { string rpt_len_str; rpt_len_str = to_string(rp.min_repeat_len) + "-" + to_string(rp.max_repeat_len); string seed_filename = filename_ + ".rep." + rpt_len_str + ".seed"; ofstream fp(seed_filename.c_str()); swaligner_.init_dyn(rp); RB_RepeatManager* repeat_manager = new RB_RepeatManager; EList repeatBases; subSA_.buildRepeatBase(s_, coordHelper_, rp.max_repeat_len, repeatBases); size_t repeat_id = 0; for(size_t i = 0; i < repeatBases.size(); i++) { addRepeatGroup(rp, repeat_id, *repeat_manager, repeatBases[i], fp); } { // Build and test minimizer-based k-mer table const size_t window = RB_Minimizer::default_w; const size_t k = RB_Minimizer::default_k; RB_KmerTable kmer_table; EList seqs; seqs.reserveExact(repeat_map_.size()); for(map::const_iterator it = repeat_map_.begin(); it != repeat_map_.end(); it++) { const RB_Repeat& repeat = *(it->second); assert(repeat.satisfy(rp)); seqs.expand(); seqs.back() = repeat.consensus(); } kmer_table.build(seqs, window, k); kmer_table.dump(cerr); cerr << endl; string query, rc_query; string queryOriginal; EList > minimizers; size_t total = 0, num_repeat = 0, correct = 0, false_positive = 0, false_negative = 0; for(size_t i = 0; i + rp.min_repeat_len <= forward_length_; i += 1000) { if(coordHelper_.getEnd(i) != coordHelper_.getEnd(i + rp.min_repeat_len)) continue; query = getString(s_, i, rp.min_repeat_len); queryOriginal = getString(sOriginal_, i, rp.min_repeat_len); rc_query = reverseComplement(queryOriginal); TIndexOffU idx = subSA_.find_repeat_idx(s_, query); const EList& test_repeat_index = subSA_.getRepeatIndex(); bool repeat = (idx < test_repeat_index.size()); bool est_repeat = kmer_table.isRepeat(query, rc_query, minimizers); total++; if(repeat) num_repeat++; if(repeat == est_repeat) { correct++; } else { if(est_repeat) { false_positive++; } else { false_negative++; //assert(false); } } } cerr << "total: " << total << endl; cerr << "repeat: " << num_repeat << endl; cerr << "correct: " << correct << endl; cerr << "false positive: " << false_positive << endl; cerr << "false negative: " << false_negative << endl; cerr << endl; ELList position2D; EList alignments; size_t repeat_total = 0, repeat_aligned = 0; const EList& test_repeat_index = subSA_.getRepeatIndex(); size_t interval = 1; if(test_repeat_index.size() >= 1000) { interval = test_repeat_index.size() / 1000; } size_t total_alignments = 0, max_alignments = 0; string query2, rc_query2; string queryOriginal2, rc_queryOriginal2; for(size_t i = 0; i < test_repeat_index.size(); i += interval) { TIndexOffU saElt_idx = test_repeat_index[i]; TIndexOffU saElt_idx_end = (i + 1 < test_repeat_index.size() ? test_repeat_index[i+1] : subSA_.size()); TIndexOffU saElt_size = saElt_idx_end - saElt_idx; TIndexOffU saElt = subSA_[saElt_idx]; query = getString(s_, saElt, rp.min_repeat_len); query2 = query; queryOriginal = getString(sOriginal_, saElt, rp.min_repeat_len); queryOriginal2 = queryOriginal; // introduce three mismatches into a query when the query is at lest 100-bp if(rp.min_repeat_len >= 100) { const size_t mid_pos1 = (size_t)(rp.min_repeat_len * 0.1); if(query2[mid_pos1] == 'A') { query2[mid_pos1] = 'C'; queryOriginal2[mid_pos1] ='C'; } else { query2[mid_pos1] = 'A'; queryOriginal2[mid_pos1] = 'A'; } const size_t mid_pos2 = (size_t)(rp.min_repeat_len * 0.5); if(query2[mid_pos2] == 'C') { query2[mid_pos2] = 'G'; queryOriginal2[mid_pos2] = 'G'; } else { query2[mid_pos2] = 'C'; queryOriginal2[mid_pos1] = 'G'; } const size_t mid_pos3 = (size_t)(rp.min_repeat_len * 0.9); if(query2[mid_pos3] == 'G') { query2[mid_pos3] = 'T'; queryOriginal2[mid_pos3] = 'T'; } else { query2[mid_pos3] = 'G'; queryOriginal2[mid_pos3] = 'G'; } } repeat_total += saElt_size; size_t found = 0; size_t cur_alignments = 0; if(kmer_table.isRepeat(query2, minimizers)) { kmer_table.findAlignments(query2, minimizers, position2D, alignments); total_alignments += (alignments.size() * saElt_size); cur_alignments = alignments.size(); TIndexOffU baseoff = 0; for(size_t s = 0; s < seqs.size(); s++) { int spos = seqs[s].find(query); if(spos != string::npos) { for(size_t a = 0; a < alignments.size(); a++) { if(alignments[a].pos == baseoff + spos) { found++; } } } baseoff += seqs[s].length(); } assert_leq(found, 1); } rc_query = reverseComplement(queryOriginal); rc_query2 = reverseComplement(queryOriginal2); size_t rc_found = 0; if(kmer_table.isRepeat(rc_query2, minimizers)) { kmer_table.findAlignments(rc_query2, minimizers, position2D, alignments); total_alignments += (alignments.size() * saElt_size); cur_alignments += alignments.size(); if(cur_alignments > max_alignments) { max_alignments = cur_alignments; } TIndexOffU baseoff = 0; for(size_t s = 0; s < seqs.size(); s++) { int spos = seqs[s].find(rc_query); if(spos != string::npos) { for(size_t a = 0; a < alignments.size(); a++) { if(alignments[a].pos == baseoff + spos) { rc_found++; } } } baseoff += seqs[s].length(); } assert_leq(rc_found, 1); } if(found + rc_found == 1) { repeat_aligned += saElt_size; } } cerr << "num repeats: " << repeat_total << endl; cerr << "repeat aligned using minimizers: " << repeat_aligned << endl; cerr << "average number of alignments: " << (float)total_alignments / repeat_total << endl; cerr << "max alignment: " << max_alignments << endl; cerr << endl; } cerr << "number of repeats is " << repeat_map_.size() << endl; size_t total_rep_seq_len = 0, total_allele_seq_len = 0; size_t i = 0; for(map::iterator it = repeat_map_.begin(); it != repeat_map_.end(); it++, i++) { RB_Repeat& repeat = *(it->second); if(!repeat.satisfy(rp)) continue; repeat.saveSeedExtension(rp, s_, coordHelper_, i, fp, total_rep_seq_len, total_allele_seq_len); } size_t total_qual_seeds = 0; for(map::iterator it = repeat_map_.begin(); it != repeat_map_.end(); it++, i++) { RB_Repeat& repeat = *(it->second); EList& seeds = repeat.seeds(); for(size_t i = 0; i < seeds.size(); i++) { if(seeds[i].getLength() < rp.min_repeat_len) continue; total_qual_seeds++; } } cerr << "total repeat sequence length: " << total_rep_seq_len << endl; cerr << "total allele sequence length: " << total_allele_seq_len << endl; cerr << "total number of seeds including those that are position-wise different, but sequence-wise identical: " << total_qual_seeds << endl; cerr << endl; fp << "total repeat sequence length: " << total_rep_seq_len << endl; fp << "total allele sequence length: " << total_allele_seq_len << endl; fp << "total number of seeds including those that are position-wise different, but sequence-wise identical: " << total_qual_seeds << endl; fp.close(); delete repeat_manager; repeat_manager = NULL; // remove non-qualifying repeats // and update repeat IDs for those remaining { map temp_repeat_map; size_t i = 0; for(map::iterator it = repeat_map_.begin(); it != repeat_map_.end(); it++) { RB_Repeat* repeat = it->second; if(!repeat->satisfy(rp)) { delete repeat; continue; } repeat->repeat_id(i); temp_repeat_map[repeat->repeat_id()] = repeat; i++; } repeat_map_ = temp_repeat_map; } const bool sanity_check = true; if(sanity_check) { EList > kmer_table; string query; string queryOriginal; size_t total = 0, match = 0; EList positions; const EList& test_repeat_index = subSA_.getRepeatIndex(); size_t interval = 1; if(test_repeat_index.size() >= 10000) { interval = test_repeat_index.size() / 10000; } for(size_t i = 0; i < test_repeat_index.size(); i += interval) { TIndexOffU saElt_idx = test_repeat_index[i]; TIndexOffU saElt_idx_end = (i + 1 < test_repeat_index.size() ? test_repeat_index[i+1] : subSA_.size()); positions.clear(); for(size_t j = saElt_idx; j < saElt_idx_end; j++) { positions.push_back(subSA_[j]); #ifndef NDEBUG if(j > saElt_idx) { TIndexOffU lcp_len = getLCP(s_, coordHelper_, positions[0], positions.back(), rp.min_repeat_len); assert_eq(lcp_len, rp.min_repeat_len); } TIndexOffU saElt = subSA_[j]; TIndexOffU start = coordHelper_.getStart(saElt); TIndexOffU start2 = coordHelper_.getStart(saElt + rp.min_repeat_len - 1); assert_eq(start, start2); #endif } TIndexOffU saElt = subSA_[saElt_idx]; size_t true_count = saElt_idx_end - saElt_idx; getString(s_, saElt, rp.min_repeat_len, query); getString(sOriginal_, saElt, rp.min_repeat_len, queryOriginal); total++; size_t count = 0, rc_count = 0; string rc_query = reverse_complement(queryOriginal); for(map::iterator it = repeat_map_.begin(); it != repeat_map_.end(); it++) { RB_Repeat& repeat = *(it->second); int pos = repeat.consensus().find(query); if(pos != string::npos) { for(size_t s = 0; s < repeat.seeds().size(); s++) { SeedExt& seed = repeat.seeds()[s]; string seq; seed.getExtendedSeedSequence(s_, seq); if(seq.find(query) != string::npos) count++; } } pos = repeat.consensus().find(rc_query); if(pos != string::npos) { for(size_t s = 0; s < repeat.seeds().size(); s++) { SeedExt& seed = repeat.seeds()[s]; string seq; seed.getExtendedSeedSequence(s_, seq); if(seq.find(rc_query) != string::npos) rc_count++; } } } if(count == true_count || rc_count == true_count) { match++; } else if(total - match <= 10) { cerr << " query: " << query << endl; cerr << "rc_query: " << rc_query << endl; cerr << "true count: " << true_count << endl; cerr << "found count: " << count << endl; cerr << "rc found count: " << rc_count << endl; cerr << endl; } } cerr << "RepeatBuilder: sanity check: " << match << " passed (out of " << total << ")" << endl << endl; } } template void RepeatBuilder::writeHaploType(const EList& haplo_lists, const EList& seeds, TIndexOffU& hapl_id_base, const string& seq_name, ostream &fp) { if(haplo_lists.size() == 0) { return; } for(size_t i = 0; i < haplo_lists.size(); i++) { const SeedHP &haploType = haplo_lists[i]; fp << "rpht" << hapl_id_base++; fp << "\t" << seq_name; fp << "\t" << haploType.range.first; fp << "\t" << haploType.range.second; fp << "\t"; assert_gt(haploType.snpIDs.size(), 0); for (size_t j = 0; j < haploType.snpIDs.size(); j++) { if(j) { fp << ","; } fp << haploType.snpIDs[j]; } fp << endl; } } template void RepeatBuilder::writeAllele(TIndexOffU grp_id, TIndexOffU allele_id, Range range, const string& seq_name, TIndexOffU baseoff, const EList& seeds, ostream &fp) { // >rpt_name*0\trep\trep_pos\trep_len\tpos_count\t0 // chr_name:pos:direction chr_name:pos:direction // // >rep1*0 rep 0 100 470 0 // 22:112123123:+ 22:1232131113:+ // size_t snp_size = seeds[range.first].snps.size(); size_t pos_size = range.second - range.first; fp << ">"; fp << "rpt_" << grp_id << "*" << allele_id; fp << "\t" << seq_name; fp << "\t" << baseoff + seeds[range.first].consensus_pos.first; fp << "\t" << seeds[range.first].consensus_pos.second - seeds[range.first].consensus_pos.first; fp << "\t" << pos_size; fp << "\t" << snp_size; fp << "\t"; for(size_t i = 0; i < snp_size; i++) { if(i > 0) {fp << ",";} fp << "rps" << seeds[range.first].snps[i]->id; } fp << endl; // print positions for(size_t i = 0; i < pos_size; i++) { if(i > 0 && (i % 10 == 0)) { fp << endl; } if(i % 10 != 0) { fp << " "; } string chr_name; TIndexOffU pos_in_chr; TIndexOffU joinedOff = seeds[range.first + i].pos.first; bool fw = true; if(joinedOff < forward_length_) { fw = true; } else { fw = false; joinedOff = s_.length() - joinedOff - seeds[range.first].getLength(); } coordHelper_.getGenomeCoord(joinedOff, chr_name, pos_in_chr); char direction = fw ? '+' : '-'; fp << chr_name << ":" << pos_in_chr << ":" << direction; } fp << endl; } template void RepeatBuilder::writeSNPs(ostream& fp, const string& rep_chr_name, const ESet& editset) { for(size_t i = 0; i < editset.size(); i++) { const Edit& ed = editset[i]; if(ed.isMismatch()) { fp << "rps" << ed.snpID; fp << "\t" << "single"; fp << "\t" << rep_chr_name; fp << "\t" << ed.pos; fp << "\t" << ed.qchr; fp << endl; } else { assert(false); } } } template void RepeatBuilder::saveFile(const RepeatParameter& rp) { saveRepeats(rp); } bool RB_RepeatManager::checkRedundant(const RepeatParameter& rp, const map& repeat_map, const EList& positions, EList& to_remove) const { to_remove.clear(); bool replace = false; for(size_t i = 0; i < positions.size(); i++) { TIndexOffU seed_pos = positions[i]; Range seed_range(seed_pos + rp.seed_len, 0); map >::const_iterator it = range_to_repeats_.upper_bound(seed_range); if(it == range_to_repeats_.end()) continue; for(map >::const_reverse_iterator rit(it); rit != range_to_repeats_.rend(); rit++) { Range repeat_range = rit->first; if(repeat_range.first + rp.max_repeat_len <= seed_pos) break; const EList& repeat_ids = rit->second; assert_gt(repeat_ids.size(), 0); for(size_t r = 0; r < repeat_ids.size(); r++) { size_t repeat_id = repeat_ids[r]; size_t idx = to_remove.bsearchLoBound(repeat_id); if(idx < to_remove.size() && to_remove[idx] == repeat_id) continue; bool overlap = seed_pos < repeat_range.second && seed_pos + rp.seed_len > repeat_range.first; if(!overlap) continue; map::const_iterator it2 = repeat_map.find(repeat_id); assert(it2 != repeat_map.end()); const RB_Repeat& repeat = *(it2->second); const EList& allele_ranges = repeat.seed_ranges(); size_t num_contain = 0, num_overlap = 0, num_close = 0, num_overlap_bp = 0; size_t p = 0, p2 = 0; while(p < positions.size() && p2 < allele_ranges.size()) { RB_AlleleCoord range; range.left = positions[p]; range.right = positions[p] + rp.seed_len; RB_AlleleCoord range2 = allele_ranges[p2]; if(range2.contain(range)) { num_contain++; num_overlap_bp += rp.seed_len; } else { TIndexOffU overlap = range2.overlap_len(range); if(overlap > 0) { num_overlap++; num_overlap_bp += (range2.right - range.left); } else if(range.right + 10 > range2.left && range2.right + 10 > range.left) { num_close++; } } if(range.right <= range2.right) p++; else p2++; } // if the number of matches is >= 90% of positions in the smaller group if((num_contain + num_overlap) * 10 + num_close * 8 >= min(positions.size(), allele_ranges.size()) * 9) { if(positions.size() <= allele_ranges.size()) { return true; } else { replace = true; to_remove.push_back(repeat_id); to_remove.sort(); } } } } // DK - check this out if(replace) break; } return false; } void RB_RepeatManager::addRepeat(const RB_Repeat* repeat) { const EList& allele_ranges = repeat->seed_ranges(); for(size_t i = 0; i < allele_ranges.size(); i++) { Range allele_range(allele_ranges[i].left, allele_ranges[i].right); addRepeat(allele_range, repeat->repeat_id()); } } void RB_RepeatManager::addRepeat(Range range, size_t repeat_id) { if(range_to_repeats_.find(range) == range_to_repeats_.end()) { range_to_repeats_[range] = EList(); } EList& repeat_ids = range_to_repeats_[range]; size_t idx = repeat_ids.bsearchLoBound(repeat_id); if(idx < repeat_ids.size() && repeat_ids[idx] == repeat_id) return; repeat_ids.push_back(repeat_id); repeat_ids.sort(); } void RB_RepeatManager::removeRepeat(const RB_Repeat* repeat) { const EList& allele_ranges = repeat->seed_ranges(); for(size_t p = 0; p < allele_ranges.size(); p++) { Range range(allele_ranges[p].left, allele_ranges[p].right); removeRepeat(range, repeat->repeat_id()); } } void RB_RepeatManager::removeRepeat(Range range, size_t repeat_id) { EList& repeat_ids = range_to_repeats_[range]; TIndexOffU idx = repeat_ids.bsearchLoBound(repeat_id); if(idx < repeat_ids.size() && repeat_id == repeat_ids[idx]) { repeat_ids.erase(idx); if(repeat_ids.empty()) range_to_repeats_.erase(range); } } void RB_RepeatManager::showInfo(const RepeatParameter& rp, CoordHelper& coordHelper, const map& repeat_map, size_t num_case) const { size_t count = 0; for(map >::const_iterator it = range_to_repeats_.begin(); it != range_to_repeats_.end(); it++) { const Range& range = it->first; if(range.second - range.first < rp.min_repeat_len) continue; map >::const_iterator jt = it; jt++; for(; jt != range_to_repeats_.end(); jt++) { const Range& range2 = jt->first; if(range2.second - range2.first < rp.min_repeat_len) continue; if(range.second <= range2.first) break; if(count < num_case) { cerr << "range (" << range.first << ", " << range.second << ") vs. range2 ("; cerr << range2.first << ", " << range2.second << ")" << endl; for(size_t i = 0; i < it->second.size(); i++) { cerr << "\t1 " << it->second[i] << endl; const RB_Repeat* repeat = repeat_map.find(it->second[i])->second; repeat->showInfo(rp, coordHelper); } for(size_t i = 0; i < jt->second.size(); i++) { cerr << "\t2 " << jt->second[i] << endl; const RB_Repeat* repeat = repeat_map.find(jt->second[i])->second; repeat->showInfo(rp, coordHelper); } cerr << endl << endl; } count++; } } cerr << "ShowInfo - count: " << count << endl; } template void RepeatBuilder::reassignSeeds(const RepeatParameter& rp, size_t repeat_bid, // repeat begin id size_t repeat_eid, // repeat end id EList& seeds) { assert_lt(repeat_bid, repeat_eid); EList updated; updated.resizeExact(repeat_eid - repeat_bid); updated.fillZero(); string seq; for(size_t s = 0; s < seeds.size(); s++) { SeedExt& seed = seeds[s]; size_t max_repeat_id = repeat_bid; size_t max_ext_len = 0; for(size_t i = repeat_bid; i < repeat_eid; i++) { map::iterator it = repeat_map_.find(i); assert(it != repeat_map_.end()); const RB_Repeat& repeat = *(it->second); const string& consensus = repeat.consensus(); const size_t ext_len = (consensus.length() - rp.seed_len) / 2; if(seed.bound.first + ext_len > seed.pos.first) continue; if(seed.pos.second + ext_len > seed.bound.second) continue; getString(s_, seed.pos.first - ext_len, rp.seed_len + ext_len * 2, seq); assert_eq(consensus.length(), seq.length()); size_t tmp_ext_len = 0; for(size_t j = 0; j < ext_len; j++, tmp_ext_len++) { if(seq[ext_len - j - 1] != consensus[ext_len - j - 1] || seq[ext_len + rp.seed_len + j] != consensus[ext_len + rp.seed_len + j]) { break; } } if(tmp_ext_len > max_ext_len) { max_repeat_id = i; max_ext_len = tmp_ext_len; } } if(rp.seed_len + max_ext_len * 2 >= rp.min_repeat_len) { seed.pos.first -= max_ext_len; seed.pos.second += max_ext_len; map::iterator it = repeat_map_.find(max_repeat_id); assert(it != repeat_map_.end()); RB_Repeat& repeat = *(it->second); const string& consensus = repeat.consensus(); const size_t ext_len = (consensus.length() - rp.seed_len) / 2; assert_leq(max_ext_len, ext_len); seed.consensus_pos.first = ext_len - max_ext_len; seed.consensus_pos.second = consensus.length() - (ext_len - max_ext_len); assert_leq(seed.consensus_pos.second - seed.consensus_pos.first, consensus.length()); repeat.addSeed(seed); updated[max_repeat_id - repeat_bid] = true; } } for(size_t i = repeat_bid; i < repeat_eid; i++) { if(!updated[i - repeat_bid]) continue; map::iterator it = repeat_map_.find(i); assert(it != repeat_map_.end()); RB_Repeat& repeat = *(it->second); repeat.update(); } } /** * TODO * @brief * * @param rpt_seq * @param rpt_range */ template void RepeatBuilder::addRepeatGroup(const RepeatParameter& rp, size_t& repeat_id, RB_RepeatManager& repeat_manager, const RB_RepeatBase& repeatBase, ostream& fp) { RB_Repeat* repeat = new RB_Repeat; repeat->repeat_id(repeat_id); repeat->init(rp, s_, coordHelper_, subSA_, repeatBase); assert(repeat_map_.find(repeat->repeat_id()) == repeat_map_.end()); repeat_map_[repeat->repeat_id()] = repeat; repeat_id++; } template bool RepeatBuilder::checkSequenceMergeable(const string& ref, const string& read, EList& edits, Coord& coord, TIndexOffU rpt_len, TIndexOffU max_edit) { size_t max_matchlen = 0; EList ed; string pad; swaligner_.makePadString(ref, read, pad, 5); string ref2 = pad + ref; string read2 = pad + read; swaligner_.alignStrings(ref2, read2, ed, coord); // match should start from pad string if(coord.off() != 0) { return false; } // no edits on pad string if(ed.size() > 0 && ed[0].pos < pad.length()) { return false; } size_t left = pad.length(); size_t right = left + read.length(); edits.clear(); edits.reserveExact(ed.size()); for(size_t i = 0; i < ed.size(); i++) { if(ed[i].pos >= left && ed[i].pos <= right) { edits.push_back(ed[i]); edits.back().pos -= left; } } max_matchlen = getMaxMatchLen(edits, read.length()); #ifdef DEBUGLOG { cerr << "After pad removed" << endl; BTDnaString btread; btread.install(read.c_str(), true); Edit::print(cerr, edits); cerr << endl; Edit::printQAlign(cerr, btread, edits); } #endif return (max_matchlen >= rpt_len); } template void RepeatBuilder::saveRepeats(const RepeatParameter &rp) { ios_base::openmode mode = ios_base::out; if(rp.append_result) { mode |= ios_base::app; } else { mode |= ios_base::trunc; } // Generate SNPs size_t i = 0; for(map::iterator it = repeat_map_.begin(); it != repeat_map_.end(); it++, i++) { RB_Repeat& repeat = *(it->second); if(!repeat.satisfy(rp)) continue; // for each repeats repeat.generateSNPs(rp, s_, i); } // save snp, consensus sequenuce, info string snp_fname = filename_ + ".rep.snp"; string info_fname = filename_ + ".rep.info"; string hapl_fname = filename_ + ".rep.haplotype"; ofstream snp_fp(snp_fname.c_str(), mode); ofstream info_fp(info_fname.c_str(), mode); ofstream hapl_fp(hapl_fname.c_str(), mode); const string repName = "rep" + to_string(rp.min_repeat_len) + "-" + to_string(rp.max_repeat_len); i = 0; TIndexOffU consensus_baseoff = 0; TIndexOffU snp_id_base = 0; TIndexOffU hapl_id_base = 0; for(map::iterator it = repeat_map_.begin(); it != repeat_map_.end(); it++, i++) { RB_Repeat& repeat = *(it->second); if(!repeat.satisfy(rp)) continue; // for each repeats repeat.saveSNPs(snp_fp, i, snp_id_base); saveAlleles(rp, repName, repeat, info_fp, hapl_fp, i, hapl_id_base, consensus_baseoff); consensus_baseoff += repeat.consensus().length(); } snp_fp.close(); info_fp.close(); hapl_fp.close(); // save all consensus sequence saveConsensus(rp, repName); } template void RepeatBuilder::saveConsensus(const RepeatParameter &rp, const string& repName) { ios_base::openmode mode = ios_base::out; if(rp.append_result) { mode |= ios_base::app; } else { mode |= ios_base::trunc; } string fa_fname = filename_ + ".rep.fa"; ofstream fa_fp(fa_fname.c_str(), mode); fa_fp << ">" << repName << endl; size_t oskip = 0; for(map::iterator it = repeat_map_.begin(); it != repeat_map_.end(); it++) { RB_Repeat& repeat = *(it->second); if(!repeat.satisfy(rp)) continue; // for each repeats const string& constr = repeat.consensus(); size_t si = 0; size_t constr_len = constr.length(); while(si < constr_len) { size_t out_len = std::min((size_t)(output_width - oskip), (size_t)(constr_len - si)); fa_fp << constr.substr(si, out_len); if((oskip + out_len) == output_width) { fa_fp << endl; oskip = 0; } else { // last line oskip = oskip + out_len; } si += out_len; } } if(oskip) { fa_fp << endl; } fa_fp.close(); } template void RepeatBuilder::saveAlleles( const RepeatParameter& rp, const string& repName, RB_Repeat& repeat, ofstream& fp, ofstream& hapl_fp, TIndexOffU grp_id, TIndexOffU& hapl_id_base, TIndexOffU baseoff) { const EList& seeds = repeat.seeds(); EList haplo_lists; Range range(0, seeds.size()); int allele_id = 0; for(size_t sb = range.first; sb < range.second;) { size_t se = sb + 1; for(; se < range.second; se++) { if(!(SeedExt::isSameConsensus(seeds[sb], seeds[se]) && SeedExt::isSameSNPs(seeds[sb], seeds[se]) && seeds[sb].aligned == seeds[se].aligned)) { break; } } if(!seeds[sb].aligned) { sb = se; continue; } if(seeds[sb].getLength() < rp.min_repeat_len) { sb = se; continue; } // [sb, se) are same alleles writeAllele(grp_id, allele_id, Range(sb, se), repName, baseoff, seeds, fp); generateHaploType(Range(sb, se), seeds, haplo_lists); allele_id++; sb = se; } // sort HaploType List by pos haplo_lists.sort(); writeHaploType(haplo_lists, seeds, hapl_id_base, repName, hapl_fp); } template void RepeatBuilder::generateHaploType(Range range, const EList &seeds, EList &haplo_list) { const EList& snps = seeds[range.first].snps; if(snps.size() == 0) return; // right-most position TIndexOffU max_right_pos = seeds[range.first].consensus_pos.second - 1; #ifndef NDEBUG for(size_t i = 0; i < snps.size(); i++) { assert_leq(snps[i]->pos, max_right_pos); } #endif // create haplotypes of at least 16 bp long to prevent combinations of SNPs // break a list of SNPs into several haplotypes if a SNP is far from the next SNP in the list const TIndexOffU min_ht_len = 16; size_t eb = 0, ee = 1; while(ee < snps.size() + 1) { if(ee == snps.size() || snps[eb]->pos + (min_ht_len << 1) < snps[ee]->pos) { TIndexOffU left_pos = snps[eb]->pos; TIndexOffU right_pos = snps[ee-1]->pos; if(snps[ee-1]->type == EDIT_TYPE_READ_GAP) { right_pos += snps[ee-1]->len; } if(left_pos + min_ht_len - 1 > right_pos) { right_pos = left_pos + min_ht_len - 1; } right_pos = min(max_right_pos, right_pos); assert_leq(left_pos, right_pos); SeedHP seedHP; seedHP.range.first = left_pos; seedHP.range.second = right_pos; for(size_t i = eb; i < ee; i++) { string snp_ids = "rps" + to_string(snps[i]->id); seedHP.snpIDs.push_back(snp_ids); } // Add to haplo_list bool found = false; for(size_t i = 0; i < haplo_list.size(); i++) { if(haplo_list[i] == seedHP) { found = true; break; } } if(!found) { // Add haplo_list.push_back(seedHP); } eb = ee; } ee++; } } void RB_SubSA::init(TIndexOffU sa_size, TIndexOffU seed_len, TIndexOffU seed_count) { assert_gt(sa_size, 1); sa_size_ = sa_size; assert_gt(seed_len, 0); seed_len_ = seed_len; assert_gt(seed_count, 1); seed_count_ = seed_count; temp_suffixes_.clear(); repeat_list_.clear(); repeat_index_.clear(); } template void RB_SubSA::push_back(const TStr& s, CoordHelper& coordHelper, TIndexOffU saElt, bool lastInput) { if(saElt + seed_len() <= coordHelper.getEnd(saElt)) { if(seed_count_ == 1) { repeat_list_.push_back(saElt); } else { assert_gt(seed_count_, 1); if(temp_suffixes_.empty()) { temp_suffixes_.push_back(saElt); return; } TIndexOffU prev_saElt = temp_suffixes_.back(); // calculate common prefix length between two text. // text1 is started from prev_saElt and text2 is started from saElt bool same = isSameSequenceUpto(s, coordHelper, prev_saElt, saElt, seed_len_); if(same) { temp_suffixes_.push_back(saElt); } if(!same || lastInput) { if(temp_suffixes_.size() >= seed_count_) { repeat_index_.push_back(repeat_list_.size()); temp_suffixes_.sort(); for(size_t pi = 0; pi < temp_suffixes_.size(); pi++) { repeat_list_.push_back(temp_suffixes_[pi]); } } temp_suffixes_.clear(); if(!lastInput) { temp_suffixes_.push_back(saElt); } else { temp_suffixes_.nullify(); } } } } if(lastInput) { size_t bit = sizeof(uint32_t) * 8; size_t num = (repeat_list_.size() + bit - 1) / bit; done_.resizeExact(num); done_.fillZero(); #ifndef NDEBUG string prev_seq = ""; for(size_t i = 0; i < repeat_index_.size(); i++) { TIndexOffU saBegin = repeat_index_[i]; TIndexOffU saEnd = (i + 1 < repeat_index_.size() ? repeat_index_[i+1] : repeat_list_.size()); string seq = getString(s, repeat_list_[saBegin], seed_len()); for(size_t j = saBegin + 1; j < saEnd; j++) { string tmp_seq = getString(s, repeat_list_[j], seed_len()); assert_eq(seq, tmp_seq); } if(prev_seq != "" ) { assert_lt(prev_seq, seq); } prev_seq = seq; } #endif } } template Range RB_SubSA::find(const TStr& s, const string& seq) const { TIndexOffU i = find_repeat_idx(s, seq); if(i >= repeat_index_.size()) return Range(0, 0); Range range(repeat_index_[i], i + 1 < repeat_index_.size() ? repeat_index_[i+1] : repeat_list_.size()); return range; } template TIndexOffU RB_SubSA::find_repeat_idx(const TStr& s, const string& seq) const { assert_eq(seq.length(), seed_len_); string temp; size_t l = 0, r = repeat_index_.size(); while(l < r) { size_t m = (r + l) >> 1; TIndexOffU saElt = repeat_list_[repeat_index_[m]]; getString(s, saElt, seed_len_, temp); if(seq == temp) { return m; } else if(seq < temp) { r = m; } else { assert(seq > temp); l = m + 1; } } return repeat_index_.size(); } void RB_SubSA::setDone(TIndexOffU off, TIndexOffU len) { assert_leq(off + len, repeat_list_.size()); const TIndexOffU bit = sizeof(uint32_t) * 8; for(TIndexOffU i = off; i < off + len; i++) { TIndexOffU quotient = i / bit; TIndexOffU remainder = i % bit; assert_lt(quotient, done_.size()); uint32_t num = done_[quotient]; num = num | (1 << remainder); done_[quotient] = num; } } bool RB_SubSA::isDone(TIndexOffU off, TIndexOffU len) const { assert_leq(off + len, repeat_list_.size()); const TIndexOffU bit = sizeof(uint32_t) * 8; for(TIndexOffU i = off; i < off + len; i++) { TIndexOffU quotient = i / bit; TIndexOffU remainder = i % bit; assert_lt(quotient, done_.size()); uint32_t num = done_[quotient]; num = num & (1 << remainder); if(num == 0) return false; } return true; } template void RB_SubSA::buildRepeatBase(const TStr& s, CoordHelper& coordHelper, const size_t max_len, EList& repeatBases) { if(repeat_index_.empty()) return; done_.fillZero(); EList senseDominant; senseDominant.resizeExact(repeat_index_.size()); senseDominant.fillZero(); EList repeatStack; EList > size_table; size_table.reserveExact(repeat_index_.size() / 2 + 1); for(size_t i = 0; i < repeat_index_.size(); i++) { TIndexOffU begin = repeat_index_[i]; TIndexOffU end = (i + 1 < repeat_index_.size() ? repeat_index_[i+1] : repeat_list_.size()); assert_lt(begin, end); EList positions; positions.reserveExact(end - begin); for(size_t j = begin; j < end; j++) positions.push_back(repeat_list_[j]); if(!isSenseDominant(coordHelper, positions, seed_len_) && !threeN) continue; senseDominant[i] = 1; size_table.expand(); size_table.back().first = end - begin; size_table.back().second = i; } size_table.sort(); size_t bundle_count = 0; string tmp_str; EList tmp_ranges; tmp_ranges.resizeExact(4); EList > tmp_sort_ranges; tmp_sort_ranges.resizeExact(4); for(int64_t i = (int64_t)size_table.size() - 1; i >= 0; i--) { TIndexOffU idx = size_table[i].second; ASSERT_ONLY(TIndexOffU num = size_table[i].first); assert_lt(idx, repeat_index_.size()); #ifndef NDEBUG if(idx + 1 < repeat_index_.size()) { assert_eq(repeat_index_[idx] + num, repeat_index_[idx+1]); } else { assert_eq(repeat_index_[idx] + num, repeat_list_.size()); } #endif TIndexOffU saBegin = repeat_index_[idx]; if(isDone(saBegin)) { assert(isDone(saBegin, num)); continue; } ASSERT_ONLY(size_t rb_done = 0); assert_lt(saBegin, repeat_list_.size()); repeatStack.push_back(idx); while(!repeatStack.empty()) { TIndexOffU idx = repeatStack.back(); assert(senseDominant[idx]); repeatStack.pop_back(); assert_lt(idx, repeat_index_.size()); TIndexOffU saBegin = repeat_index_[idx]; TIndexOffU saEnd = (idx + 1 < repeat_index_.size() ? repeat_index_[idx + 1] : repeat_list_.size()); if(isDone(saBegin)) { assert(isDone(saBegin, saEnd - saBegin)); continue; } TIndexOffU saElt = repeat_list_[saBegin]; size_t ri = repeatBases.size(); repeatBases.expand(); repeatBases.back().seq = getString(s, saElt, seed_len_); repeatBases.back().nodes.clear(); repeatBases.back().nodes.push_back(idx); ASSERT_ONLY(rb_done++); setDone(saBegin, saEnd - saBegin); bool left = true; while(repeatBases[ri].seq.length() <= max_len) { if(left) { tmp_str = "N"; tmp_str += repeatBases[ri].seq.substr(0, seed_len_ - 1); } else { tmp_str = repeatBases[ri].seq.substr(repeatBases[ri].seq.length() - seed_len_ + 1, seed_len_ - 1); tmp_str.push_back('N'); } assert_eq(tmp_str.length(), seed_len_); for(size_t c = 0; c < 4; c++) { if(left) tmp_str[0] = "ACGT"[c]; else tmp_str.back() = "ACGT"[c]; assert_eq(tmp_str.length(), seed_len_); TIndexOffU idx = find_repeat_idx(s, tmp_str); size_t num = 0; if(idx < repeat_index_.size()) { if(idx + 1 < repeat_index_.size()) { num = repeat_index_[idx+1] - repeat_index_[idx]; } else { num = repeat_list_.size() - repeat_index_[idx]; } } tmp_ranges[c].first = idx; tmp_ranges[c].second = num; assert(num == 0 || num >= seed_count_); tmp_sort_ranges[c].first = num; tmp_sort_ranges[c].second = c; if(idx == repeat_index_.size() || isDone(repeat_index_[idx]) || !senseDominant[idx]) { #ifndef NDEBUG if(idx < repeat_index_.size()) { assert(isDone(repeat_index_[idx], num) || !senseDominant[idx]); } #endif tmp_sort_ranges[c].first = 0; tmp_ranges[c].second = 0; } } tmp_sort_ranges.sort(); if(tmp_sort_ranges[3].first < seed_count_) { if(left) { left = false; continue; } else { break; } } for(size_t cc = 0; cc < 3; cc++) { assert_leq(tmp_sort_ranges[cc].first, tmp_sort_ranges[cc+1].first); if(tmp_sort_ranges[cc].first < seed_count_) continue; size_t c = tmp_sort_ranges[cc].second; repeatStack.push_back(tmp_ranges[c].first); } size_t c = tmp_sort_ranges[3].second; if(repeatBases[ri].seq.length() >= max_len) { assert_eq(repeatBases[ri].seq.length(), max_len); TIndexOffU idx = tmp_ranges[c].first; assert(!isDone(repeat_index_[idx])); repeatStack.push_back(idx); if(left) { left = false; continue; } else { break; } } else { TIndexOffU idx = tmp_ranges[c].first; TIndexOffU num = tmp_ranges[c].second; setDone(repeat_index_[idx], num); if(left) { repeatBases[ri].seq.insert(0, 1, "ACGT"[c]); repeatBases[ri].nodes.insert(idx, 0); } else { repeatBases[ri].seq.push_back("ACGT"[c]); repeatBases[ri].nodes.push_back(idx); } } } } assert_gt(rb_done, 0); bundle_count++; } cerr << "Bundle count: " << bundle_count << endl; #ifndef NDEBUG { set idx_set; for(size_t i = 0; i < repeatBases.size(); i++) { const RB_RepeatBase& repeatBase = repeatBases[i]; for(size_t j = 0; j < repeatBase.nodes.size(); j++) { assert(idx_set.find(repeatBase.nodes[j]) == idx_set.end()); idx_set.insert(repeatBase.nodes[j]); } } } #endif #if 0 { EList > kmer_table; string query; size_t total = 0, match = 0; size_t interval = 1; if(repeat_index_.size() >= 10000) { interval = repeat_index_.size() / 10000; } EList positions; for(size_t i = 0; i < repeat_index_.size(); i += interval) { TIndexOffU saElt_idx = repeat_index_[i]; TIndexOffU saElt_idx_end = (i + 1 < repeat_index_.size() ? repeat_index_[i+1] : repeat_list_.size()); positions.clear(); for(size_t j = saElt_idx; j < saElt_idx_end; j++) { positions.push_back(repeat_list_[j]); #ifndef NDEBUG if(j > saElt_idx) { TIndexOffU lcp_len = getLCP(s, coordHelper, positions[0], positions.back(), seed_len_); assert_geq(lcp_len, seed_len_); } TIndexOffU saElt = repeat_list_[j]; TIndexOffU start = coordHelper.getStart(saElt); TIndexOffU start2 = coordHelper.getStart(saElt + seed_len_ - 1); assert_eq(start, start2); #endif } TIndexOffU saElt = repeat_list_[saElt_idx]; size_t true_count = saElt_idx_end - saElt_idx; getString(s, saElt, seed_len_, query); total++; size_t count = 0; for(size_t r = 0; r < repeatBases.size(); r++) { const RB_RepeatBase& repeat = repeatBases[r]; int pos = repeat.seq.find(query); if(pos != string::npos) { for(size_t j = 0; j < repeat.nodes.size(); j++) { TIndexOffU _node = repeat.nodes[j]; TIndexOffU _saElt_idx = repeat_index_[_node]; TIndexOffU _saElt_idx_end = (_node + 1 < repeat_index_.size() ? repeat_index_[_node+1] : repeat_list_.size()); TIndexOffU _saElt = repeat_list_[_saElt_idx]; string seq = getString(s, _saElt, seed_len()); if(query == seq) { count += (_saElt_idx_end - _saElt_idx); } } } } size_t rc_count = 0; string rc_query = reverse_complement(query); for(size_t r = 0; r < repeatBases.size(); r++) { const RB_RepeatBase& repeat = repeatBases[r]; int pos = repeat.seq.find(rc_query); if(pos != string::npos) { for(size_t j = 0; j < repeat.nodes.size(); j++) { TIndexOffU _node = repeat.nodes[j]; TIndexOffU _saElt_idx = repeat_index_[_node]; TIndexOffU _saElt_idx_end = (_node + 1 < repeat_index_.size() ? repeat_index_[_node+1] : repeat_list_.size()); TIndexOffU _saElt = repeat_list_[_saElt_idx]; string seq = getString(s, _saElt, seed_len()); if(rc_query == seq) { rc_count += (_saElt_idx_end - _saElt_idx); } } } } if(count == true_count || rc_count == true_count) { match++; } else if(total - match <= 10) { cerr << "query: " << query << endl; cerr << "true count: " << true_count << endl; cerr << "found count: " << count << endl; cerr << "rc found count: " << rc_count << endl; cerr << endl; } } cerr << "RB_SubSA: sanity check: " << match << " passed (out of " << total << ")" << endl << endl; } #endif } #define write_fp(x) fp.write((const char *)&(x), sizeof((x))) void RB_SubSA::writeFile(ofstream& fp) { write_fp(sa_size_); write_fp(seed_len_); write_fp(seed_count_); size_t sz = temp_suffixes_.size(); write_fp(sz); for(size_t i = 0; i < sz; i++) { write_fp(temp_suffixes_[i]); } sz = repeat_index_.size(); write_fp(sz); for(size_t i = 0; i < sz; i++) { write_fp(repeat_index_[i]); } sz = done_.size(); write_fp(sz); for(size_t i = 0; i < sz; i++) { write_fp(done_[i]); } } #define read_fp(x) fp.read((char *)&(x), sizeof((x))) void RB_SubSA::readFile(ifstream& fp) { TIndexOffU val; read_fp(val); rt_assert_eq(val, sa_size_); read_fp(val); rt_assert_eq(val, seed_len_); read_fp(val); rt_assert_eq(val, seed_count_); size_t sz; read_fp(sz); temp_suffixes_.resizeExact(sz); for(size_t i = 0; i < sz; i++) { read_fp(temp_suffixes_[i]); } size_t val_sz; // repeat_index_ read_fp(val_sz); repeat_index_.resizeExact(val_sz); for(size_t i = 0; i < val_sz; i++) { read_fp(repeat_index_[i]); } // done read_fp(val_sz); done_.resizeExact(val_sz); for(size_t i = 0; i < val_sz; i++) { read_fp(done_[i]); } } /****************************/ template class RepeatBuilder >; template void dump_tstr(const SString& ); template bool compareRepeatCoordByJoinedOff(const RepeatCoord& , const RepeatCoord&);