/* * Copyright 2018, Chanhee Park and Daehwan Kim * * This file is part of HISAT 2. * * HISAT 2 is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * HISAT 2 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with HISAT 2. If not, see . */ #ifndef REPEAT_H_ #define REPEAT_H_ #include #include #include #include #include "assert_helpers.h" #include "word_io.h" #include "mem_ids.h" #include "ref_coord.h" #include "alt.h" using namespace std; template class RepeatCoord { public: bool operator< (const RepeatCoord& o) const { if(joinedOff != o.joinedOff) return joinedOff < o.joinedOff; if(fw != o.fw) return fw; if(alleleID != o.alleleID) return alleleID < o.alleleID; return false; } public: RepeatCoord() {}; RepeatCoord(index_t l_tid, index_t l_toff, index_t l_joinedOff, bool l_fw, index_t l_alleleID) : tid(l_tid), toff(l_toff), joinedOff(l_joinedOff), fw(l_fw) {}; index_t tid; index_t toff; index_t joinedOff; bool fw; index_t alleleID; }; template class RepeatAllele { public: RepeatAllele() { reset(); } void init(index_t allelePos_, index_t alleleLen_) { allelePos = allelePos_; alleleLen = alleleLen_; } void reset() { allelePos = 0; alleleLen = 0; } bool operator< (const RepeatAllele& o) const { if(allelePos != o.allelePos) return allelePos < o.allelePos; return alleleLen < o.alleleLen; } #ifndef NDEBUG bool repOk() const { return true; } #endif bool write(ofstream& f_out, bool bigEndian) const { writeU16(f_out, allelePos, bigEndian); writeU16(f_out, alleleLen, bigEndian); return true; } bool read(ifstream& f_in, bool bigEndian) { allelePos = readU16(f_in, bigEndian); alleleLen = readU16(f_in, bigEndian); return true; } bool compatible(index_t left, index_t right) const { if(left < allelePos || right > allelePos + alleleLen) return false; return true; } public: uint16_t allelePos; uint16_t alleleLen; }; // sorting functions template struct sort_pair_loci { bool operator()(const pair, index_t>& a, const pair, index_t>& b) { return a.first.joinedOff < b.first.joinedOff; } }; template struct sort_pair_loci_by_index { bool operator()(const pair, index_t>& a, const pair, index_t>& b) { return a.second < b.second; } }; template class Repeat { public: void init(const string& repName_, index_t repID_, index_t repPos_, index_t repLen_) { repName = repName_; repID = repID_; repPos = repPos_; repLen = repLen_; } bool write(ofstream& f_out, bool bigEndian) const { writeIndex(f_out, repID, bigEndian); writeIndex(f_out, repPos, bigEndian); writeIndex(f_out, repLen, bigEndian); writeIndex(f_out, alleles.size(), bigEndian); for(index_t i = 0; i < alleles.size(); i++) { alleles[i].write(f_out, bigEndian); } writeIndex(f_out, positions.size(), bigEndian); for(index_t i = 0; i < positions.size(); i++) { writeIndex(f_out, positions[i].joinedOff, bigEndian); writeU8(f_out, positions[i].fw); writeIndex(f_out, positions[i].alleleID, bigEndian); } return true; } bool read(ifstream& f_in, bool bigEndian) { repID = readIndex(f_in, bigEndian); repPos = readIndex(f_in, bigEndian); repLen = readIndex(f_in, bigEndian); index_t numAlleles = readIndex(f_in, bigEndian); alleles.resizeExact(numAlleles); for(index_t i = 0; i < numAlleles; i++) { alleles[i].read(f_in, bigEndian); } index_t numPositions = readIndex(f_in, bigEndian); positions.resizeExact(numPositions); for(index_t i = 0; i < numPositions; i++) { positions[i].tid = 0; positions[i].toff = 0; positions[i].joinedOff = readIndex(f_in, bigEndian); positions[i].fw = readU8(f_in); positions[i].alleleID = readIndex(f_in, bigEndian); assert_lt(positions[i].alleleID, alleles.size()); } return true; } public: string repName; index_t repID; index_t repPos; index_t repLen; EList > alleles; EList > positions; }; template class RepeatDB { public: RepeatDB() {} virtual ~RepeatDB() {} bool empty() const { return _repeats.size() == 0; } EList >& repeats() { return _repeats; } const EList >& repeats() const { return _repeats; } const ELList >& repeatMap() const { return _repeatMap; } void write(ofstream& f_out, bool bigEndian) const { if(_repeats.size() <= 0) { writeIndex(f_out, 0, bigEndian); return; } EList repeatGroup; for(index_t i = 0; i < _repeats.size(); i++) { #ifndef NDEBUG if(i + 1 < _repeats.size()) { assert_leq(_repeats[i].repID, _repeats[i+1].repID); } #endif if(_repeats[i].repID > repeatGroup.size()) { repeatGroup.push_back(i); assert_eq(_repeats[i].repID, repeatGroup.size()); } } repeatGroup.push_back(_repeats.size()); assert_eq(_repeats.back().repID + 1, repeatGroup.size()); writeIndex(f_out, repeatGroup.size(), bigEndian); streampos filepos = f_out.tellp(); EList repeatFilePos; for(index_t i = 0; i < repeatGroup.size(); i++) { writeIndex(f_out, 0, bigEndian); } for(index_t i = 0; i < repeatGroup.size(); i++) { index_t begin = (i == 0 ? 0 : repeatGroup[i-1]); index_t end = repeatGroup[i]; writeIndex(f_out, end - begin, bigEndian); for(index_t j = begin; j < end; j++) { _repeats[j].write(f_out, bigEndian); } repeatFilePos.push_back(f_out.tellp()); } assert_eq(repeatFilePos.size(), repeatGroup.size()); streampos origpos = f_out.tellp(); f_out.seekp(filepos); for(index_t i = 0; i < repeatFilePos.size(); i++) { writeIndex(f_out, repeatFilePos[i], bigEndian); } f_out.seekp(origpos); } void read(ifstream& f_in, bool bigEndian, const EList& includeRepeat) { index_t numRepeatGroup = readIndex(f_in, bigEndian); EList filePos; filePos.resizeExact(numRepeatGroup); for(index_t i = 0; i < numRepeatGroup; i++) { filePos[i] = readIndex(f_in, bigEndian); } assert_eq(numRepeatGroup, includeRepeat.size()); for(index_t i = 0, repID = 0; i < numRepeatGroup; i++) { if(!includeRepeat[i]) continue; if(i > 0) { f_in.seekg(filePos[i-1]); } index_t numRepeats = readIndex(f_in, bigEndian); index_t repeat_size = _repeats.size(); _repeats.resizeExact(repeat_size + numRepeats); for(index_t j = 0; j < numRepeats; j++) { _repeats[repeat_size+j].read(f_in, bigEndian); _repeats[repeat_size+j].repID = repID; } repID++; } f_in.seekg(filePos.back()); } // Build an internal table to enable rapid search of repeats // and converts joined offsets to chromosome IDs (tid) and loci (toff) void construct(const index_t* rstarts, index_t rlen) { _repeatMap.clear(); if(_repeats.empty()) return; for(index_t r = 0; r < _repeats.size(); r++) { if(_repeats[r].repID >= _repeatMap.size()) { _repeatMap.expand(); _repeatMap.back().clear(); } EList >& repeatMap = _repeatMap.back(); repeatMap.expand(); if(repeatMap.size() == 1) { repeatMap.back().first = _repeats[r].repLen; } else { repeatMap.back().first = repeatMap[repeatMap.size() - 2].first + _repeats[r].repLen; } repeatMap.back().second = r; } EList, index_t> > joinedOffList; for(index_t r = 0; r < _repeats.size(); r++) { Repeat& repeat = _repeats[r]; EList >& positions = repeat.positions; for(index_t p = 0; p < positions.size(); p++) { joinedOffList.expand(); joinedOffList.back().first.joinedOff = positions[p].joinedOff; joinedOffList.back().first.tid = 0; joinedOffList.back().first.toff = 0; joinedOffList.back().first.fw = positions[p].fw; joinedOffList.back().first.alleleID = positions[p].alleleID; joinedOffList.back().second = joinedOffList.size() - 1; } } sort(joinedOffList.begin(), joinedOffList.end(), sort_pair_loci()); index_t j = 0, r = 0; while(j < joinedOffList.size() && r < rlen) { index_t off = joinedOffList[j].first.joinedOff; index_t lower = rstarts[r*3]; index_t upper; if(r == rlen - 1) { upper = numeric_limits::max(); } else { upper = rstarts[(r+1)*3]; } assert_gt(upper, lower); if(off >= upper) { r++; continue; } assert_geq(off, lower); joinedOffList[j].first.tid = rstarts[(r*3)+1]; joinedOffList[j].first.toff = off - lower + rstarts[(r*3)+2]; j++; } sort(joinedOffList.begin(), joinedOffList.end(), sort_pair_loci_by_index()); index_t count = 0; for(index_t r = 0; r < _repeats.size(); r++) { Repeat& repeat = _repeats[r]; EList >& positions = _repeats[r].positions; for(index_t p = 0; p < positions.size(); p++) { assert_lt(count, joinedOffList.size()); assert_eq(positions[p].joinedOff, joinedOffList[count].first.joinedOff); positions[p] = joinedOffList[count].first; RepeatAllele& allele = repeat.alleles[positions[p].alleleID]; if(positions[p].fw) { positions[p].joinedOff -= allele.allelePos; positions[p].toff -= allele.allelePos; } else { assert_leq(allele.allelePos + allele.alleleLen, repeat.repLen); index_t subLen = repeat.repLen - allele.allelePos - allele.alleleLen; positions[p].joinedOff -= subLen; positions[p].toff -= subLen; } count++; } } } bool repeatExist(index_t repID, index_t left, index_t right) const { if(repID >= _repeatMap.size()) return false; // Find a repeat corresponding to a given location (left, right) const EList >& repeatMap = _repeatMap[repID]; pair repeat(left, numeric_limits::max()); index_t repeatIdx = repeatMap.bsearchLoBound(repeat); assert_lt(repeatIdx, repeatMap.size()); if(right > repeatMap[repeatIdx].first) return false; return true; } bool getCoords(index_t repID, index_t left, // left offset in the repeat sequence index_t right, // right offset const EList& snpIDs, // SNP IDs const ALTDB& altdb, EList, RepeatCoord > >& near_positions, index_t max_positions = numeric_limits::max()) const { near_positions.clear(); if(repID >= _repeatMap.size()) return false; // Find a repeat corresponding to a given location (left, right) const EList >& repeatMap = _repeatMap[repID]; pair repeat(left, numeric_limits::max()); index_t repeatIdx = repeatMap.bsearchLoBound(repeat); assert_lt(repeatIdx, repeatMap.size()); if(right > repeatMap[repeatIdx].first) return false; index_t repeatIdx_ = repeatMap[repeatIdx].second; assert_lt(repeatIdx_, _repeats.size()); const EList >& alleles = _repeats[repeatIdx_].alleles; index_t adjLeft = left, adjRight = right; if(repeatIdx > 0) { adjLeft -= repeatMap[repeatIdx-1].first; adjRight -= repeatMap[repeatIdx-1].first; } const EList >& positions = _repeats[repeatIdx_].positions; for(index_t p = 0; p < positions.size(); p++) { const RepeatCoord& position = positions[p]; assert_lt(position.alleleID, alleles.size()); const RepeatAllele& allele = alleles[position.alleleID]; if(!allele.compatible(adjLeft, adjRight)) continue; near_positions.expand(); near_positions.back().first = position; if(positions[p].fw) { near_positions.back().first.joinedOff += adjLeft; near_positions.back().first.toff += adjLeft; } else { const index_t len = right - left; assert_leq(adjLeft + len, _repeats[repeatIdx_].repLen); index_t rc_adjLeft = _repeats[repeatIdx_].repLen - adjLeft - len; near_positions.back().first.joinedOff += rc_adjLeft; near_positions.back().first.toff += rc_adjLeft; } if(near_positions.size() >= max_positions) break; } return near_positions.size() > 0; } bool findCoords(index_t anchor_left, index_t anchor_right, index_t repID, index_t left, // left offset in the repeat sequence index_t right, // right offset const EList& snpIDs, // SNP IDs const ALTDB& altdb, EList, RepeatCoord > >& near_positions, index_t max_positions = numeric_limits::max(), index_t dist = 1000) const { near_positions.clear(); if(repID >= _repeatMap.size()) return false; // Find a repeat corresponding to a given location (left, right) const EList >& repeatMap = _repeatMap[repID]; pair repeat(left, numeric_limits::max()); index_t repeatIdx = repeatMap.bsearchLoBound(repeat); assert_lt(repeatIdx, repeatMap.size()); if(right > repeatMap[repeatIdx].first) return false; index_t repeatIdx_ = repeatMap[repeatIdx].second; assert_lt(repeatIdx_, _repeats.size()); const EList >& alleles = _repeats[repeatIdx_].alleles; index_t adjLeft = left, adjRight = right; if(repeatIdx > 0) { adjLeft -= repeatMap[repeatIdx-1].first; adjRight -= repeatMap[repeatIdx-1].first; } const EList >& positions = _repeats[repeatIdx_].positions; RepeatCoord cmp; cmp.joinedOff = (anchor_left >= dist ? anchor_left - dist : 0); index_t p = positions.bsearchLoBound(cmp); for(; p < positions.size(); p++) { const RepeatCoord& position = positions[p]; index_t pos = positions[p].joinedOff + adjLeft; if(pos + dist < anchor_left) continue; if(anchor_right + dist < pos) break; assert_lt(position.alleleID, alleles.size()); const RepeatAllele& allele = alleles[position.alleleID]; if(!allele.compatible(adjLeft, adjRight)) continue; near_positions.expand(); near_positions.back().first = position; if(positions[p].fw) { near_positions.back().first.joinedOff += adjLeft; near_positions.back().first.toff += adjLeft; } else { const index_t len = right - left; assert_leq(adjLeft + len, _repeats[repeatIdx_].repLen); index_t rc_adjLeft = _repeats[repeatIdx_].repLen - adjLeft - len; near_positions.back().first.joinedOff += rc_adjLeft; near_positions.back().first.toff += rc_adjLeft; } if(near_positions.size() >= max_positions) break; } return near_positions.size() > 0; } bool findCommonCoords(index_t repID, index_t left, // left offset in the repeat sequence index_t right, // right offset const EList& snpIDs, // SNP IDs index_t repID2, index_t left2, // left offset 2 in the repeat sequence index_t right2, // right offset 2 const EList& snpIDs2, // SNP IDs const ALTDB& altdb, EList, RepeatCoord > >& common_positions, index_t max_positions = numeric_limits::max(), index_t dist = 1000) const { common_positions.clear(); if(repID >= _repeatMap.size() || repID2 >= _repeatMap.size()) return false; // Find a repeat corresponding to a given location (left, right) const EList >& repeatMap = _repeatMap[repID]; assert_lt(left, right); pair repeat(left, numeric_limits::max()); index_t repeatIdx = repeatMap.bsearchLoBound(repeat); assert_lt(repeatIdx, repeatMap.size()); if(right > repeatMap[repeatIdx].first) return false; index_t repeatIdx_ = repeatMap[repeatIdx].second; assert_lt(repeatIdx_, _repeats.size()); const EList >& alleles = _repeats[repeatIdx_].alleles; index_t adjLeft = left, adjRight = right; if(repeatIdx > 0) { adjLeft -= repeatMap[repeatIdx-1].first; adjRight -= repeatMap[repeatIdx-1].first; } // Find a repeat cooresponding to a given location (left2, right2) const EList >& repeatMap2 = _repeatMap[repID2]; assert_lt(left2, right2); pair repeat2(left2, numeric_limits::max()); index_t repeatIdx2 = repeatMap2.bsearchLoBound(repeat2); assert_lt(repeatIdx2, repeatMap2.size()); if(right2 > repeatMap2[repeatIdx2].first) return false; index_t repeatIdx2_ = repeatMap2[repeatIdx2].second; assert_lt(repeatIdx2_, _repeats.size()); const EList >& alleles2 = _repeats[repeatIdx2_].alleles; index_t adjLeft2 = left2, adjRight2 = right2; if(repeatIdx2 > 0) { adjLeft2 -= repeatMap2[repeatIdx2-1].first; adjRight2 -= repeatMap2[repeatIdx2-1].first; } const EList >& positions = _repeats[repeatIdx_].positions; const EList >& positions2 = _repeats[repeatIdx2_].positions; index_t jsave = 0; for(index_t i = 0; i < positions.size(); i++) { const RepeatAllele& allele = alleles[positions[i].alleleID]; if(!allele.compatible(adjLeft, adjRight)) continue; index_t i_pos = positions[i].joinedOff + adjLeft; for(index_t j = jsave; j < positions2.size(); j++) { index_t j_pos = positions2[j].joinedOff + adjLeft2; if(j_pos + dist < i_pos) { jsave = j + 1; continue; } if(i_pos + dist < j_pos) break; const RepeatAllele& allele2 = alleles2[positions2[j].alleleID]; if(!allele2.compatible(adjLeft2, adjRight2)) continue; common_positions.expand(); common_positions.back().first = positions[i]; if(positions[i].fw) { common_positions.back().first.joinedOff += adjLeft; common_positions.back().first.toff += adjLeft; } else { const index_t len = right - left; assert_leq(adjLeft + len, _repeats[repeatIdx_].repLen); index_t rc_adjLeft = _repeats[repeatIdx_].repLen - adjLeft - len; common_positions.back().first.joinedOff += rc_adjLeft; common_positions.back().first.toff += rc_adjLeft; } common_positions.back().second = positions2[j]; if(positions2[j].fw) { common_positions.back().second.toff += adjLeft2; common_positions.back().second.joinedOff += adjLeft2; } else { const index_t len = right2 - left2; assert_leq(adjLeft2 + len, _repeats[repeatIdx2_].repLen); index_t rc_adjLeft2 = _repeats[repeatIdx2_].repLen - adjLeft2 - len; common_positions.back().second.joinedOff += rc_adjLeft2; common_positions.back().second.toff += rc_adjLeft2; } if(common_positions.size() >= max_positions) break; } if(common_positions.size() >= max_positions) break; } return common_positions.size() > 0; } private: pair get_alt_range(const ALTDB& altdb, index_t left, index_t right) const { pair alt_range; ALT cmp_alt; cmp_alt.pos = left; alt_range.first = alt_range.second = (index_t)altdb.alts().bsearchLoBound(cmp_alt); for(; alt_range.second < altdb.alts().size(); alt_range.second++) { const ALT& alt = altdb.alts()[alt_range.second]; if(alt.left > right) break; } return alt_range; } private: EList > _repeats; ELList > _repeatMap; // pos to repeat id }; #endif /*ifndef REPEAT_H_*/