hisat-3n/repeat_kmer.h
2025-01-18 21:09:52 +08:00

607 lines
20 KiB
C++

/*
* Copyright 2018, Chanhee Park <parkchanhee@gmail.com> and Daehwan Kim <infphilo@gmail.com>
*
* This file is part of HISAT 2.
*
* HISAT 2 is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* HISAT 2 is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with HISAT 2. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __REPEAT_KMER_H__
#define __REPEAT_KMER_H__
#include <iostream>
#include <fstream>
#include <limits>
#include <map>
#include <unordered_set>
#include "assert_helpers.h"
#include "word_io.h"
#include "mem_ids.h"
#include "ds.h"
template<typename TStr>
class RB_Minimizer {
public:
static const size_t default_w = 5;
static const size_t default_k = 31;
public:
/**
 * Find the minimizer of one window of `seq`.
 *
 * The window covers the `window` consecutive k-mers that start at
 * offsets off, off+1, ..., off+window-1.  K-mers are compared with
 * minimizer_leq(); on a tie the later k-mer replaces the earlier one.
 *
 * @param seq     sequence to scan
 * @param off     offset of the first k-mer of the window
 * @param window  number of k-mers in the window
 * @param k       k-mer length (at most 32, so a k-mer fits in 64 bits)
 * @return (k-mer value, offset of that k-mer in seq)
 */
static pair<uint64_t, size_t>
get_minimizer(const TStr& seq,
              size_t off,
              size_t window,
              size_t k)
{
	assert_leq(k, 32);
	assert_leq(off + window + k - 1, seq.length());
	uint64_t rolling = get_kmer(seq, off, k);
	uint64_t best_kmer = rolling;
	size_t best_pos = off;
	const size_t window_end = off + window;
	for(size_t pos = off + 1; pos < window_end; pos++) {
		// Slide by one base: drop the oldest base, append seq[pos+k-1].
		rolling = get_next_kmer(rolling, seq[pos + k - 1], k);
		if(!minimizer_leq(rolling, best_kmer))
			continue;
		best_kmer = rolling;
		best_pos = pos;
	}
	return pair<uint64_t, size_t>(best_kmer, best_pos);
}
static void
get_minimizer(const TStr& seq,
size_t window,
size_t k,
EList<pair<uint64_t, size_t> >& minimizers)
{
assert_leq(k, 32);
assert_leq(window + k - 1, seq.length());
minimizers.clear();
pair<uint64_t, size_t> minimizer = get_minimizer(seq, 0, window, k);
minimizers.push_back(minimizer);
uint64_t kmer = get_kmer(seq, window - 1, k);
for(size_t i = 1; i + window + k - 1 <= seq.length(); i++) {
uint64_t next_kmer = get_next_kmer(kmer, seq[i+window+k-2], k);
if(minimizer.second < i) {
minimizer = get_minimizer(seq, i, window, k);
} else if(minimizer_leq(next_kmer, minimizer.first)) {
minimizer.first = next_kmer;
minimizer.second = i + window - 1;
}
minimizers.push_back(minimizer);
kmer = next_kmer;
}
#ifndef NDEBUG
assert_eq(minimizers.size() + window + k - 2, seq.length());
for(size_t i = 0; i + window + k - 1 <= seq.length(); i++) {
pair<uint64_t, size_t> minimizer = get_minimizer(seq, i, window, k);
assert(minimizer == minimizers[i]);
}
#endif
}
protected:
/**
 * Ordering used to choose minimizers: compares the two k-mers after
 * passing each through convert_minimizer(), not by their raw values.
 *
 * @return true if the converted `kmer` is less than or equal to the
 *         converted `kmer2`
 */
static bool
minimizer_leq(uint64_t kmer, uint64_t kmer2)
{
	const uint64_t lhs = convert_minimizer(kmer);
	const uint64_t rhs = convert_minimizer(kmer2);
	return !(rhs < lhs);
}
// 64-bit integer mixing function used to order minimizers; see Heng Li's
// "Minimap and miniasm" paper, 2016.  All arithmetic is modulo 2^64.
static uint64_t convert_minimizer(uint64_t x) {
	x = (x << 21) + ~x;
	x ^= x >> 24;
	x *= 265;      // same as x + (x << 3) + (x << 8)
	x ^= x >> 14;
	x *= 21;       // same as x + (x << 2) + (x << 4)
	x ^= x >> 28;
	x += x << 31;
	return x;
}
/**
 * Pack the k bases starting at seq[offset] into an integer, two bits per
 * base, with the first base in the most significant position.
 *
 * Base values greater than 3 are translated through asc2dna first.
 *
 * @param seq     source sequence
 * @param offset  offset of the first base of the k-mer
 * @param k       number of bases to pack
 * @return the packed k-mer
 */
static uint64_t
get_kmer(const TStr& seq,
         size_t offset,
         size_t k)
{
	assert_leq(offset + k, seq.length());
	uint64_t packed = 0;
	size_t idx = offset;
	for(size_t remaining = k; remaining > 0; remaining--, idx++) {
		size_t code = seq[idx];
		if(code > 3) code = asc2dna[code];
		packed = (packed << 2) | code;
	}
	return packed;
}
/**
 * Slide a packed k-mer one base to the right.
 *
 * Only the low 2*(k-1) bits of `kmer` (its last k-1 bases) are kept; they
 * are shifted up by one base and `base` is appended in the low two bits.
 * A `base` value greater than 3 is translated through asc2dna first.
 *
 * @param kmer  current packed k-mer
 * @param base  base entering on the right
 * @param k     k-mer length
 * @return the packed k-mer that starts one position later
 */
static uint64_t
get_next_kmer(uint64_t kmer,
              size_t base,
              size_t k)
{
	const uint64_t keep_mask = ((uint64_t)1 << ((k - 1) * 2)) - 1;
	const size_t code = (base > 3) ? asc2dna[base] : base;
	return ((kmer & keep_mask) << 2) | code;
}
/**
 * Decode a packed k-mer back into a string over "ACGT".
 *
 * Bases are read from the low two bits upwards (last base first) and the
 * collected characters are reversed at the end, so the returned string
 * starts with the base stored in the most significant position.
 *
 * @param kmer  packed k-mer, two bits per base
 * @param k     number of bases to decode
 */
static TStr get_string(uint64_t kmer, size_t k)
{
	const char* const alphabet = "ACGT";
	TStr decoded = "";
	size_t remaining = k;
	while(remaining-- > 0) {
		decoded.push_back(alphabet[kmer & 0x3]);
		kmer >>= 2;
	}
	reverse(decoded.begin(), decoded.end());
	return decoded;
}
};
/**
 * One candidate placement of a query, as produced by
 * RB_KmerTable::findAlignments().
 */
struct RB_Alignment {
	TIndexOffU pos; // position taken from the k-mer table's position list
	TIndexOffU off; // offset within the query the position belongs to
	TIndexOffU len; // accumulated length of the match

	/// Order by ascending pos; for equal pos the longer entry comes first.
	bool operator<(const RB_Alignment& o) const
	{
		return (pos == o.pos) ? (len > o.len) : (pos < o.pos);
	}
};
/**
 * Comparator that orders RB_Alignment entries by descending len, breaking
 * ties by ascending pos.
 *
 * operator() is const so the comparator can also be invoked through a
 * const object or const reference.
 */
struct RB_Alignment_CMPbyLen {
	bool operator()(const RB_Alignment& a, const RB_Alignment& b) const
	{
		if(a.len != b.len)
			return a.len > b.len;
		return a.pos < b.pos;
	}
};
class RB_KmerTable {
public:
RB_KmerTable() { w_ = k_ = 0; }
~RB_KmerTable() {}
public:
bool isIn(uint64_t kmer) const
{
#if 1
pair<uint64_t, TIndexOffU> key(kmer, 0);
size_t idx = kmer_table_.bsearchLoBound(key);
return idx < kmer_table_.size() && kmer_table_[idx].first == kmer;
#else
return kmers_.find(kmer) != kmers_.end();
#endif
}
/**
 * Check both strands: true if either `query` or `rc_query` is reported
 * as a repeat by the single-sequence overload.
 *
 * `rc_query` is only examined when `query` is not a repeat.
 * `minimizers` is scratch space and is overwritten by each check.
 */
template<typename TStr>
bool isRepeat(const TStr& query,
              const TStr& rc_query,
              EList<pair<uint64_t, size_t> >& minimizers) const
{
	if(isRepeat(query, minimizers))
		return true;
	return isRepeat(rc_query, minimizers);
}
/**
 * Tell whether `query` contains at least one minimizer that is present
 * in the k-mer table.
 *
 * Consecutive windows frequently share the same minimizer, so a window is
 * only looked up when its minimizer differs from the previous window's.
 * The comparison is made against the actual previous entry (j > 0) rather
 * than a 0-initialised sentinel, so a first minimizer whose packed value
 * is 0 is looked up like any other.
 *
 * @param query       sequence to test
 * @param minimizers  scratch/output; overwritten with query's minimizers
 * @return true as soon as one minimizer is found in the table,
 *         false if none is
 */
template<typename TStr>
bool isRepeat(const TStr& query,
              EList<pair<uint64_t, size_t> >& minimizers) const
{
	RB_Minimizer<TStr>::get_minimizer(query, w_, k_, minimizers);
	for(size_t j = 0; j < minimizers.size(); j++) {
		// Same k-mer as the previous window: already looked up (and not
		// found, otherwise we would have returned).
		if(j > 0 && minimizers[j].first == minimizers[j-1].first)
			continue;
		if(isIn(minimizers[j].first))
			return true;
	}
	return false;
}
template<typename TStr>
void findRepeats(const TStr& query,
EList<pair<uint64_t, size_t> >& minimizers,
EList<TIndexOffU>& repeats) const
{
repeats.clear();
RB_Minimizer<TStr>::get_minimizer(query, w_, k_, minimizers);
for(size_t i = 0; i < minimizers.size(); i++) {
if(i > 0 && minimizers[i].first == minimizers[i-1].first)
continue;
pair<uint64_t, size_t> minimizer(minimizers[i].first, 0);
size_t j = kmer_table_.bsearchLoBound(minimizer);
for(; j < kmer_table_.size() && minimizer.first == kmer_table_[j].first; j++) {
repeats.push_back(kmer_table_[j].second);
}
}
if(repeats.empty())
return;
size_t remove_count = 0;
repeats.sort();
for(size_t i = 0; i + 1 < repeats.size();) {
size_t j = i + 1;
for(; j < repeats.size(); j++) {
if(repeats[i] == repeats[j]) {
repeats[j] = std::numeric_limits<TIndexOffU>::max();
remove_count++;
} else break;
}
i = j;
}
repeats.sort();
assert_lt(remove_count, repeats.size());
repeats.resize(repeats.size() - remove_count);
}
/**
 * Find candidate placements of `query` from its minimizers.
 *
 * Steps, as implemented below:
 *  1. compute the minimizers of `query` using w_ and k_;
 *  2. for every run of identical minimizers, look the k-mer up in
 *     kmer_table_ and remember its [begin, end) slice of pos_list_;
 *  3. expand those slices one at a time, smallest slice first, merging the
 *     hits into `alignments` and extending entries whose position
 *     difference equals their query-offset difference;
 *  4. replace every pos by pos - off, sort, keep one entry per distinct
 *     pos, and finally order the result with RB_Alignment_CMPbyLen.
 *
 * @param query              sequence to look up
 * @param minimizers         scratch/output; overwritten with query's minimizers
 * @param position2D         scratch; one list per minimizer found in the table
 * @param alignments         output; cleared, then filled with the candidates
 * @param max_num_alignment  a slice holding more positions than this is not
 *                           expanded, and expansion stops once
 *                           alignments.size() reaches this value
 */
template<typename TStr>
void findAlignments(const TStr& query,
EList<pair<uint64_t, size_t> >& minimizers,
ELList<RB_Alignment>& position2D,
EList<RB_Alignment>& alignments,
TIndexOffU max_num_alignment = 1000) const
{
minimizers.clear();
RB_Minimizer<TStr>::get_minimizer(query, w_, k_, minimizers);
position2D.clear();
// Step 2: record, per distinct run of minimizers, the slice of pos_list_
// that belongs to the k-mer. Each list holds exactly two marker entries:
// [0].pos = slice begin, [1].pos = slice end; .off = index into minimizers.
for(size_t i = 0; i < minimizers.size(); i++) {
if(i > 0 && minimizers[i].first == minimizers[i-1].first)
continue;
pair<uint64_t, TIndexOffU> minimizer(minimizers[i].first, 0);
size_t idx = kmer_table_.bsearchLoBound(minimizer);
if(idx < kmer_table_.size() && kmer_table_[idx].first == minimizer.first) {
TIndexOffU begin = kmer_table_[idx].second;
// The slice ends where the next k-mer's slice starts, or at the end of
// pos_list_ for the last table entry.
TIndexOffU end = (idx + 1 < kmer_table_.size() ? kmer_table_[idx+1].second : pos_list_.size());
position2D.expand();
EList<RB_Alignment>& positions = position2D.back();
positions.clear();
positions.expand();
positions.back().pos = begin; // suffix begin
positions.back().off = i; // minimizer index
positions.expand();
positions.back().pos = end; // suffix end
positions.back().off = i; // minimizer index
}
}
alignments.clear();
if(position2D.empty())
return;
// Step 3: at most one pass per recorded slice.
for(size_t i = 0; i < position2D.size(); i++) {
// Pick the not-yet-expanded slice with the fewest positions. Slices that
// were expanded in an earlier pass have been cleared and are skipped.
// num_pos keeps its initial maximum when nothing is left to expand.
size_t num_i = 0, num_pos = numeric_limits<size_t>::max();
for(size_t j = 0; j < position2D.size(); j++) {
EList<RB_Alignment>& positions = position2D[j];
if(positions.empty())
continue;
assert_eq(positions.size(), 2);
TIndexOffU cur_num_pos = positions[1].pos - positions[0].pos;
if(cur_num_pos == 0)
continue;
if(cur_num_pos < num_pos) {
num_i = j;
num_pos = cur_num_pos;
}
}
// NOTE(review): the first test is subsumed by the second one; both stop
// the loop whenever the smallest remaining slice exceeds the limit.
if(num_pos > max_num_alignment && alignments.size() > 0)
break;
if(num_pos > max_num_alignment)
break;
EList<RB_Alignment>& positions = position2D[num_i];
TIndexOffU begin = positions[0].pos;
TIndexOffU end = positions[1].pos;
TIndexOffU min_i = positions[0].off;
assert_eq(num_pos, end - begin);
// Replace the two markers by one entry per position in the slice.
positions.clear();
for(TIndexOffU j = begin; j < end; j++) {
// Skip positions smaller than the minimizer's offset in the query, so
// that pos - off (computed in step 4) cannot go below zero.
if(pos_list_[j] < minimizers[min_i].second)
continue;
positions.expand();
positions.back().pos = pos_list_[j];
positions.back().off = minimizers[min_i].second;
positions.back().len = k_;
}
if(i == 0) {
// First slice: every hit seeds an alignment.
for(size_t j = 0; j < positions.size(); j++) {
alignments.expand();
alignments.back() = positions[j];
}
} else {
// Two-pointer merge of the new hits into the alignments collected so far
// (only the first num_alignment entries; entries appended during the
// merge are not revisited).
// NOTE(review): this assumes both lists are in increasing pos order - confirm.
size_t a = 0, p = 0;
size_t num_alignment = alignments.size();
while(a < num_alignment && p < positions.size()) {
RB_Alignment& alignment = alignments[a];
RB_Alignment& position = positions[p];
if(alignment.pos < position.pos) {
// Hit lies after the alignment: extend the alignment if the distance
// between the positions equals the distance between the query offsets.
TIndexOffU offDiff = position.off - alignment.off;
if(position.pos - alignment.pos == offDiff) {
alignment.len = min(offDiff, alignment.len) + position.len;
a++; p++;
} else {
a++;
}
} else if(alignment.pos > position.pos) {
// Hit lies before the alignment: on a matching distance the alignment is
// moved back to start at the hit; otherwise the hit becomes a new entry.
TIndexOffU offDiff = alignment.off - position.off;
if(alignment.pos - position.pos == offDiff) {
alignment.pos = position.pos;
alignment.off = position.off;
alignment.len = min(offDiff, position.len) + alignment.len;
assert_geq(alignment.pos, alignment.off);
a++; p++;
} else {
alignments.expand();
alignments.back() = position;
p++;
}
} else {
// Same position: nothing to add.
assert_eq(alignment.pos, position.pos);
a++; p++;
}
}
// Hits left over after the merge become new entries.
while(p < positions.size()) {
RB_Alignment& position = positions[p];
alignments.expand();
alignments.back() = position;
p++;
}
// Restore the order for the next pass; skipped on the last possible pass.
if(i + 1 < position2D.size()) {
alignments.sort();
}
}
// Mark this slice as consumed.
positions.clear();
if(alignments.size() >= max_num_alignment) {
break;
}
}
if(alignments.empty())
return;
// Step 4: turn each entry's pos into pos - off.
for(size_t i = 0; i < alignments.size(); i++) {
alignments[i].pos -= alignments[i].off;
}
alignments.sort();
// Remove duplicates: entries are sorted by pos (longer first for equal
// pos), so the first entry of every run of equal pos is kept and the
// survivors are compacted towards the front.
size_t remove_count = 0;
for(size_t i = 0; i + 1 + remove_count < alignments.size(); i++) {
size_t j = i + 1 + remove_count;
for(; j < alignments.size(); j++) {
if(alignments[i].pos != alignments[j].pos) {
assert_geq(j, i + 1);
if(j > i + 1) {
alignments[i+1] = alignments[j];
}
break;
} else {
remove_count++;
}
}
}
assert_lt(remove_count, alignments.size());
if(remove_count > 0) {
alignments.resize(alignments.size() - remove_count);
}
// Final order: longest first (RB_Alignment_CMPbyLen).
if(alignments.size() > 1) {
sort(alignments.begin(), alignments.end(), RB_Alignment_CMPbyLen());
}
}
/**
 * Serialize the table to `f_out`.
 *
 * Layout, in order: w_, k_ and the number of k-mer entries (each written
 * with writeIndex<size_t>); one (k-mer as uint64_t, TIndexOffU) pair per
 * entry; the number of positions (size_t); every position as TIndexOffU.
 * TIndexOffU values are written as 32 or 64 bits depending on
 * sizeof(TIndexOffU).
 *
 * @param f_out      destination stream
 * @param bigEndian  byte-order flag handed to the write helpers
 * @return true if the stream is not in a failed state after writing
 */
bool write(ofstream& f_out, bool bigEndian) const {
	writeIndex<size_t>(f_out, w_, bigEndian);
	writeIndex<size_t>(f_out, k_, bigEndian);

	const size_t num_kmers = kmer_table_.size();
	writeIndex<size_t>(f_out, num_kmers, bigEndian);
	for(size_t i = 0; i < num_kmers; i++) {
		writeIndex<uint64_t>(f_out, kmer_table_[i].first, bigEndian);
		if(sizeof(TIndexOffU) == 4) {
			writeU32(f_out, kmer_table_[i].second, bigEndian);
		} else {
			assert_eq(sizeof(TIndexOffU), 8);
			writeIndex<uint64_t>(f_out, kmer_table_[i].second, bigEndian);
		}
	}

	const size_t num_pos = pos_list_.size();
	writeIndex<size_t>(f_out, num_pos, bigEndian);
	for(size_t i = 0; i < num_pos; i++) {
		if(sizeof(TIndexOffU) == 4) {
			writeU32(f_out, pos_list_[i], bigEndian);
		} else {
			assert_eq(sizeof(TIndexOffU), 8);
			writeIndex<uint64_t>(f_out, pos_list_[i], bigEndian);
		}
	}
	// Report I/O problems instead of claiming success unconditionally.
	return !f_out.fail();
}
/**
 * Load a table previously stored by write() from `f_in`.
 *
 * Any content already held by the table is discarded first; the loops
 * below run until the containers reach the stored sizes, so leftovers
 * would otherwise be kept and fewer records than stored would be read.
 *
 * @param f_in       source stream
 * @param bigEndian  byte-order flag handed to the read helpers
 * @return true if the stream is not in a failed state after reading
 */
bool read(ifstream& f_in, bool bigEndian) {
	kmer_table_.clear();
	pos_list_.clear();

	w_ = readIndex<size_t>(f_in, bigEndian);
	k_ = readIndex<size_t>(f_in, bigEndian);

	size_t kmer_size = readIndex<size_t>(f_in, bigEndian);
	kmer_table_.reserveExact(kmer_size);
	while(kmer_table_.size() < kmer_size) {
		kmer_table_.expand();
		kmer_table_.back().first = readIndex<uint64_t>(f_in, bigEndian);
		if(sizeof(TIndexOffU) == 4) {
			kmer_table_.back().second = readU32(f_in, bigEndian);
		} else {
			assert_eq(sizeof(TIndexOffU), 8);
			kmer_table_.back().second = readIndex<uint64_t>(f_in, bigEndian);
		}
#if 0
		kmers_.insert(kmer_table_.back().first);
#endif
	}

	size_t pos_size = readIndex<size_t>(f_in, bigEndian);
	pos_list_.reserveExact(pos_size);
	while(pos_list_.size() < pos_size) {
		pos_list_.expand();
		if(sizeof(TIndexOffU) == 4) {
			pos_list_.back() = readU32(f_in, bigEndian);
		} else {
			assert_eq(sizeof(TIndexOffU), 8);
			pos_list_.back() = readIndex<uint64_t>(f_in, bigEndian);
		}
	}
	// Report I/O problems instead of claiming success unconditionally.
	return !f_in.fail();
}
public:
/**
 * Build the table from a set of sequences.
 *
 * The minimizers of every sequence are collected as (k-mer, position)
 * pairs, where position is the minimizer's offset plus the total length
 * of all preceding sequences.  The pairs are sorted and split into
 * kmer_table_ (one entry per distinct k-mer, pointing at its first
 * position) and pos_list_ (all positions, grouped by k-mer).
 *
 * The number of distinct k-mers is counted from the sorted pair list
 * rather than by keeping a separate set of every k-mer seen.
 *
 * @param seqs  sequences to index
 * @param w     window size (k-mers per window)
 * @param k     k-mer length
 */
template<typename TStr>
void build(const EList<TStr>& seqs,
           size_t w,
           size_t k)
{
	w_ = w;
	k_ = k;
	kmer_table_.clear();
	pos_list_.clear();

	EList<pair<uint64_t, TIndexOffU> > tmp_table;
	TIndexOffU baseoff = 0;
	EList<pair<uint64_t, size_t> > minimizers;
	for(size_t s = 0; s < seqs.size(); s++) {
		const TStr& seq = seqs[s];
		RB_Minimizer<TStr>::get_minimizer(seq,
		                                  w_,
		                                  k_,
		                                  minimizers);
		for(size_t i = 0; i < minimizers.size(); i++) {
			// Neighbouring windows usually share their minimizer; record
			// each (k-mer, position) occurrence only once.
			if(!tmp_table.empty() &&
			   tmp_table.back().first == minimizers[i].first &&
			   tmp_table.back().second == baseoff + minimizers[i].second)
				continue;
			tmp_table.expand();
			tmp_table.back().first = minimizers[i].first;
			tmp_table.back().second = baseoff + minimizers[i].second;
#if 0
			kmers_.insert(minimizers[i].first);
#endif
		}
		baseoff += seq.length();
	}
	tmp_table.sort();

	// After sorting, equal k-mers are adjacent: count the runs.
	size_t num_kmers = 0;
	for(size_t i = 0; i < tmp_table.size(); i++) {
		if(i == 0 || tmp_table[i].first != tmp_table[i-1].first)
			num_kmers++;
	}

	kmer_table_.reserveExact(num_kmers);
	pos_list_.reserveExact(tmp_table.size());
	for(size_t i = 0; i < tmp_table.size(); i++) {
#ifndef NDEBUG
		if(!pos_list_.empty()) {
			assert_neq(pos_list_.back(), tmp_table[i].second);
		}
#endif
		// First occurrence of a k-mer opens a new table entry that points
		// at the position about to be appended.
		if(kmer_table_.empty() || kmer_table_.back().first != tmp_table[i].first) {
			kmer_table_.expand();
			kmer_table_.back().first = tmp_table[i].first;
			kmer_table_.back().second = pos_list_.size();
		}
		pos_list_.push_back(tmp_table[i].second);
	}
	assert_eq(kmer_table_.size(), num_kmers);
	assert_eq(pos_list_.size(), tmp_table.size());
}
/**
 * Print a summary of the table to `o`: window size, k-mer length, number
 * of k-mers and positions, followed by a histogram of how many positions
 * each k-mer has (exact counts 1..10, then buckets of ten starting at 11,
 * the last bucket being open-ended).
 */
void dump(ostream& o) const
{
	o << "window : " << w_ << endl;
	o << "k length : " << k_ << endl;
	o << "number of kmer : " << kmer_table_.size() << endl;
	o << "number of pos : " << pos_list_.size() << endl;

	EList<size_t> exact_hist, decade_hist;
	exact_hist.resizeExact(10);
	decade_hist.resizeExact(10);
	exact_hist.fillZero();
	decade_hist.fillZero();

	const size_t num_kmers = kmer_table_.size();
	for(size_t e = 0; e < num_kmers; e++) {
		// Number of positions of entry e = distance to the next entry's
		// start (or to the end of pos_list_ for the last entry).
		size_t count = 0;
		if(e + 1 >= num_kmers) {
			count = pos_list_.size() - kmer_table_[e].second;
		} else {
			count = kmer_table_[e+1].second - kmer_table_[e].second;
		}
		assert_gt(count, 0);
		count -= 1; // zero-based bin index
		if(count < exact_hist.size()) {
			exact_hist[count]++;
		}
		const size_t decade = count / 10;
		if(decade >= decade_hist.size()) {
			decade_hist.back()++;
		} else {
			decade_hist[decade]++;
		}
	}

	for(size_t b = 0; b < exact_hist.size(); b++) {
		o << "\t" << b + 1 << ": " << exact_hist[b] << endl;
	}
	// Bucket 0 (1..10) is already covered by the exact counts above.
	for(size_t b = 1; b < decade_hist.size(); b++) {
		const bool last_bucket = (b + 1 >= decade_hist.size());
		o << "\t" << b * 10 + 1;
		if(last_bucket) {
			o << " or more: ";
		} else {
			o << " to " << (b+1) * 10 << ": ";
		}
		o << decade_hist[b] << endl;
	}
}
private:
// Window size: number of consecutive k-mers per minimizer window.
// 0 until build() or read() sets it.
size_t w_;
// K-mer length. 0 until build() or read() sets it.
size_t k_;
// One entry per distinct minimizer k-mer, in ascending k-mer order as
// produced by build(): (k-mer, index of its first position in pos_list_).
EList<pair<uint64_t, TIndexOffU> > kmer_table_;
// Positions of all indexed minimizers, grouped by k-mer in the same order
// as kmer_table_; a k-mer's positions run up to the next entry's start.
EList<TIndexOffU> pos_list_;
// Only referenced from preprocessor-disabled code paths in this class.
unordered_set<uint64_t> kmers_;
};
#endif /* __REPEAT_KMER_H__ */