// hisat-3n/utility_3n_table.h
// 2025-01-19 17:06:28 +08:00
//
// 333 lines
// 9.2 KiB
// C++

/*
* Copyright 2020, Yun (Leo) Zhang <imzhangyun@gmail.com>
*
* This file is part of HISAT-3N.
*
* HISAT-3N is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* HISAT-3N is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with HISAT-3N. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef UTILITY_3N_TABLE_H
#define UTILITY_3N_TABLE_H
#include <algorithm>
#include <atomic>
#include <cassert>
#include <cctype>
#include <condition_variable>
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <mutex>
#include <queue>
#include <string>
#include <utility>
#include <vector>
using namespace std;
/**
 * Complement lookup table, indexed by the character code of a nucleotide symbol.
 *
 * The table holds 256 entries (16 rows of 16). Upper-case and lower-case
 * letters A, B, C, D, G, H, K, M, N, R, S, T, V, W and Y both map to the
 * UPPER-CASE complement letter, '-' maps to itself, and every other index
 * (including 'U'/'u') maps to 0.
 *
 * NOTE(review): the index must lie in 0..255; a negative plain `char` would
 * index out of bounds -- confirm that callers cast to unsigned char first.
 */
char asc2dnacomp[] = {
/* 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 16 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 32 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,'-', 0, 0,
/* index 45 ('-') maps to itself */
/* 48 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 64 */ 0,'T','V','G','H', 0, 0,'C','D', 0, 0,'M', 0,'K','N', 0,
/* non-zero inputs in this row: A B C D G H K M N */
/* 80 */ 0, 0,'Y','S','A', 0,'B','W', 0,'R', 0, 0, 0, 0, 0, 0,
/* non-zero inputs in this row: R S T V W Y */
/* 96 */ 0,'T','V','G','H', 0, 0,'C','D', 0, 0,'M', 0,'K','N', 0,
/* non-zero inputs in this row: a b c d g h k m n (results are upper-case) */
/* 112 */ 0, 0,'Y','S','A', 0,'B','W', 0,'R', 0, 0, 0, 0, 0, 0,
/* non-zero inputs in this row: r s t v w y (results are upper-case) */
/* 128 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 144 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 160 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 176 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 192 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 208 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 224 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 240 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
/**
 * Simple record that binds a quality character to a position on the read
 * and on the reference.
 *
 * A freshly constructed entry is flagged with remove == true; calling
 * setQual() stores the quality and conversion flag and clears that flag.
 */
class PosQuality {
public:
    int readPos;    // 0-based
    int refPos;     // 0-based
    char qual;      // quality character; 0 until setQual() is called
    bool converted; // conversion flag stored by setQual(); false until then
    bool remove;    // true until setQual() is called

    /**
     * Create an entry whose read and reference positions both start at inputPos.
     * Every member is initialized so that no field is ever read while
     * indeterminate. The parameter is a const reference so that literals and
     * const values are accepted as well as int lvalues.
     */
    PosQuality(const int& inputPos)
        : readPos(inputPos),
          refPos(inputPos),
          qual(0),
          converted(false),
          remove(true) {
    }

    /**
     * Store the quality character and conversion flag, and mark the entry
     * as one to keep (remove = false).
     */
    void setQual(const char& inputQual, bool inputConverted) {
        qual = inputQual;
        converted = inputConverted;
        remove = false;
    }
};
/**
 * Base class for a string that is scanned segment by segment.
 *
 * Holds the text (s), its length (stringLen) and the index at which the
 * next scan begins (start). Members carry default initializers so that a
 * default-constructed object is in the same state as after initialize().
 */
class string_search {
public:
    int start = 0;      // index in s where the next scan begins
    std::string s;      // text being scanned
    int stringLen = 0;  // cached length of s

    /**
     * Reset to the empty state: no text, zero length, cursor at 0.
     */
    void initialize() {
        start = 0;
        stringLen = 0;
        s.clear();
    }

    /**
     * Replace the stored text and rewind the cursor to the beginning.
     * The argument is taken by value and moved into place to avoid a
     * second copy.
     */
    void loadString(std::string inputString) {
        s = std::move(inputString);
        stringLen = static_cast<int>(s.size());
        start = 0;
    }
};
/**
 * Stores a CIGAR string and yields its (length, operator) segments in order.
 */
class CIGAR : public string_search {
public:
    /**
     * Extract the next segment starting at the current cursor.
     *
     * @param len    receives the numeric length of the segment; set to 0
     *               when no segment is returned.
     * @param symbol receives the operator character that follows the length.
     * @return true if a segment was produced; false when the cursor has
     *         reached the end of the string, or when the remaining text is
     *         digits only with no trailing operator.
     * @throws std::invalid_argument (from std::stoi) if an operator is not
     *         preceded by any digit; std::out_of_range if the length does
     *         not fit in an int.
     *
     * The first non-digit character after the length is taken as the
     * operator, so the SAM operator '=' is recognised in addition to the
     * alphabetic ones. The scan is bounded by stringLen, so a string that
     * does not end with an operator can no longer run past the buffer.
     */
    bool getNextSegment(int& len, char& symbol) {
        if (start >= stringLen) {
            return false;
        }
        len = 0;

        // Advance over the run of digits that forms the segment length.
        int opIndex = start;
        while (opIndex < stringLen &&
               std::isdigit(static_cast<unsigned char>(s[opIndex]))) {
            ++opIndex;
        }

        if (opIndex >= stringLen) {
            // Digits without an operator: nothing complete is left to return.
            start = stringLen;
            return false;
        }

        len = std::stoi(s.substr(start, opIndex - start));
        symbol = s[opIndex];
        start = opIndex + 1;
        return true;
    }
};
/**
 * Stores an MD tag and yields its segments in order.
 */
class MD_tag : public string_search {
public:
    /**
     * Extract the next segment starting at the current cursor.
     *
     * A segment is one of:
     *   - a run of non-letter, non-'^' characters (the numeric part), where
     *     '0' characters at the very start of a segment are skipped;
     *   - a single letter;
     *   - '^' followed by the letters that come directly after it.
     *
     * @param seg receives the segment text (cleared first).
     * @return true if a segment was produced; false if the cursor was already
     *         at or past the end, or if only skipped '0' characters remained.
     */
    bool getNextSegment(std::string& seg) {
        if (start >= stringLen) {
            return false;
        }
        seg.clear();
        bool inDeletion = false;

        for (int cursor = start; ; ++cursor) {
            if (cursor >= stringLen) {
                // Ran off the end: park the cursor past the end so the next call stops.
                start = cursor + 1;
                return !seg.empty();
            }

            const char ch = s[cursor];
            const bool segIsEmpty = seg.empty();

            // Zeros in front of a segment carry no information; skip them.
            if (segIsEmpty && ch == '0') {
                continue;
            }

            if (isalpha(ch)) {
                if (segIsEmpty) {
                    // A lone letter is a complete segment on its own.
                    seg = ch;
                    start = cursor + 1;
                    return true;
                }
                if (!inDeletion) {
                    // A letter ends the numeric segment collected so far.
                    start = cursor;
                    return true;
                }
                // Letters after '^' extend the deletion segment.
                seg += ch;
                continue;
            }

            if (ch == '^') {
                if (!segIsEmpty) {
                    // '^' ends the segment collected so far.
                    start = cursor;
                    return true;
                }
                seg = ch;
                inDeletion = true;
                continue;
            }

            // Anything else is handled as part of a number.
            if (segIsEmpty) {
                seg = ch;
                continue;
            }
            if (inDeletion || isalpha(seg.back())) {
                // A number ends a deletion (or letter) segment.
                start = cursor;
                return true;
            }
            seg += ch;
        }
    }
};
/**
 * Unbounded FIFO channel: a std::queue guarded by a mutex, with a condition
 * variable that lets recv() block until an item arrives or the channel is
 * closed. The channel can be neither copied nor moved.
 */
template<typename T>
struct Channel {
    Channel() = default;
    Channel(const Channel &) = delete;
    Channel &operator=(const Channel &) = delete;
    Channel(Channel &&) = delete;
    Channel &operator=(Channel &&) = delete;
    ~Channel() = default;

    /**
     * Enqueue one item and wake one waiting receiver.
     * Calls abort() if the channel has already been closed. The closed check
     * is made while holding the mutex (close() sets the flag under the same
     * mutex), so a concurrent close() cannot slip in between the check and
     * the push.
     */
    void send(T in) {
        {
            std::lock_guard<std::mutex> lock{mtx_};
            if (closed()) {
                std::abort();
            }
            queue_.push(std::move(in));
            ++size_;
        }
        cnd_.notify_one();
    }

    /**
     * Dequeue the oldest item into `out`, blocking while the channel is
     * open and empty.
     * @return true if an item was stored in `out`; false if the channel is
     *         closed and holds no more items.
     */
    bool recv(T& out) {
        // Fast path: a closed, drained channel will never yield another item.
        if (closed() && empty()) {
            return false;
        }
        {
            std::unique_lock<std::mutex> lock{mtx_};
            cnd_.wait(lock, [this] { return !empty() || closed(); });
            if (empty()) {
                // Woken by close() with nothing left to hand out.
                return false;
            }
            out = std::move(queue_.front());
            queue_.pop();
            --size_;
        }
        // Wake other waiters so that they re-evaluate the channel state.
        if (closed()) {
            cnd_.notify_all();
        } else {
            cnd_.notify_one();
        }
        return true;
    }

    /**
     * Number of items currently queued.
     * Not constexpr: the value is read from an atomic counter at run time.
     */
    std::size_t size() const noexcept {
        return size_;
    }

    /**
     * True when no items are queued.
     */
    bool empty() const noexcept {
        return size_ == 0;
    }

    /**
     * Mark the channel closed and wake every waiting receiver.
     */
    void close() noexcept {
        {
            std::lock_guard<std::mutex> lock{mtx_};
            is_closed_.store(true);
        }
        cnd_.notify_all();
    }

    /**
     * True once close() has been called.
     */
    bool closed() const noexcept {
        return is_closed_.load();
    }

private:
    std::queue<T> queue_;               // pending items, guarded by mtx_
    std::atomic<std::size_t> size_{0};  // item count, readable without the lock
    std::mutex mtx_;
    std::condition_variable cnd_;
    std::atomic<bool> is_closed_{false};
};
/**
 * Pairs one chromosome name with its stream position.
 * Entries are ordered by chromosome name only.
 */
class ChromosomeFilePosition {
public:
    std::string chromosome;   // chromosome name
    std::streampos linePos;   // stream position recorded for this chromosome

    /**
     * Build an entry from a chromosome name and its stream position.
     */
    ChromosomeFilePosition(std::string inputChromosome, std::streampos inputPos)
        : chromosome(inputChromosome),
          linePos(inputPos) {
    }

    /**
     * Order entries by chromosome name; linePos does not take part.
     */
    bool operator < (const ChromosomeFilePosition& in) const {
        return in.chromosome > chromosome;
    }
};
/**
* store all chromosome and it's stream position
*/
class ChromosomeFilePositions {
public:
vector <ChromosomeFilePosition> pos;
/**
* input the chromosome name and it's streamPos, if it is not in pos, add it.
*/
void append (string &chromosome, streampos& linePos) {
pos.push_back(ChromosomeFilePosition(chromosome, linePos));
}
/**
* make binary search on pos for target chromosome name
*/
int findChromosome(string &targetChromosome, int start, int end) {
if (start <= end) {
int middle = (start + end) / 2;
if (pos[middle].chromosome == targetChromosome) {
return middle;
}
if (pos[middle].chromosome > targetChromosome) {
return findChromosome(targetChromosome, start, middle-1);
}
return findChromosome(targetChromosome, middle+1, end);
}
else
{
// cannot find the chromosome! throw!
cerr << "Cannot find the chromosome: " << targetChromosome << " in reference file." << endl;
throw 1;
}
}
/**
* given targetChromosome name, return its streampos
*/
streampos getChromosomePosInRefFile(string &targetChromosome)
{
int index = findChromosome(targetChromosome, 0, pos.size()-1);
assert(pos[index].chromosome == targetChromosome);
return pos[index].linePos;
}
/**
* sort the pos by chromosome name
*/
void sort()
{
std::sort(pos.begin(), pos.end());
}
};
#endif //UTILITY_3N_TABLE_H