333 lines
9.2 KiB
C++
333 lines
9.2 KiB
C++
/*
|
|
* Copyright 2020, Yun (Leo) Zhang <imzhangyun@gmail.com>
|
|
*
|
|
* This file is part of HISAT-3N.
|
|
*
|
|
* HISAT-3N is free software: you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation, either version 3 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* HISAT-3N is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with HISAT-3N. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
#ifndef UTILITY_3N_TABLE_H
|
|
#define UTILITY_3N_TABLE_H
|
|
|
|
#include <algorithm>
#include <atomic>
#include <cassert>
#include <cctype>
#include <condition_variable>
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <mutex>
#include <queue>
#include <string>
#include <utility>
#include <vector>
|
|
|
|
using namespace std;
|
|
|
|
/**
 * Complement lookup table indexed by the ASCII code of a nucleotide symbol.
 *
 * Upper- and lower-case IUPAC codes (A B C D G H K M N R S T V W Y) both map
 * to the UPPER-case complement, '-' maps to itself, and every other index
 * holds 0. Index with an unsigned value (0..255).
 */
char asc2dnacomp[] = {
    // 0x00 - 0x0F
    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
    // 0x10 - 0x1F
    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
    // 0x20 - 0x2F : only '-' (0x2D) is populated
    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   '-', 0,   0,
    // 0x30 - 0x3F
    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
    // 0x40 - 0x4F : @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O
    0,   'T', 'V', 'G', 'H', 0,   0,   'C', 'D', 0,   0,   'M', 0,   'K', 'N', 0,
    // 0x50 - 0x5F : P    Q    R    S    T    U    V    W    X    Y    Z
    0,   0,   'Y', 'S', 'A', 0,   'B', 'W', 0,   'R', 0,   0,   0,   0,   0,   0,
    // 0x60 - 0x6F : `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o
    0,   'T', 'V', 'G', 'H', 0,   0,   'C', 'D', 0,   0,   'M', 0,   'K', 'N', 0,
    // 0x70 - 0x7F : p    q    r    s    t    u    v    w    x    y    z
    0,   0,   'Y', 'S', 'A', 0,   'B', 'W', 0,   'R', 0,   0,   0,   0,   0,   0,
    // 0x80 - 0x8F
    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
    // 0x90 - 0x9F
    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
    // 0xA0 - 0xAF
    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
    // 0xB0 - 0xBF
    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
    // 0xC0 - 0xCF
    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
    // 0xD0 - 0xDF
    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
    // 0xE0 - 0xEF
    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
    // 0xF0 - 0xFF
    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0
};
|
|
|
|
/**
 * Simple record that binds a quality score to a position.
 *
 * A freshly constructed record is flagged for removal (`remove == true`)
 * and carries neutral values for `qual` / `converted`; calling setQual()
 * fills those in and clears the removal flag.
 */
class PosQuality {
public:
    int readPos;            // 0-based
    int refPos;             // 0-based
    char qual = '\0';       // quality character; '\0' until setQual() is called
    bool converted = false; // conversion flag supplied through setQual()
    bool remove = true;     // stays true until setQual() is called

    /**
     * Create a record whose read and reference positions both start at
     * `inputPos`.
     *
     * Taken by value so that temporaries and const values are accepted in
     * addition to the mutable lvalues the previous `int&` signature required.
     */
    PosQuality(int inputPos)
        : readPos(inputPos), refPos(inputPos) {
    }

    /**
     * Store the quality character and conversion flag, and mark the record
     * as one to keep (`remove = false`).
     */
    void setQual(char inputQual, bool inputConverted) {
        qual = inputQual;
        converted = inputConverted;
        remove = false;
    }
};
|
|
|
|
/**
 * Base class for a string that is consumed piece by piece.
 *
 * Holds the text (`s`), its cached length (`stringLen`) and the index at
 * which the next search should begin (`start`). All members start out in
 * the "empty" state, so an object is safe to query even before
 * initialize() or loadString() has been called.
 */
class string_search {
public:
    int start = 0;      // index in `s` where the next search begins
    std::string s;      // the text being searched
    int stringLen = 0;  // cached length of `s`

    /**
     * Reset to the empty state: no text, zero length, cursor at 0.
     */
    void initialize() {
        start = 0;
        stringLen = 0;
        s.clear();
    }

    /**
     * Replace the stored text with `intputString` and rewind the cursor.
     */
    void loadString(std::string intputString) {
        // The parameter is already a private copy; move it instead of copying again.
        s = std::move(intputString);
        stringLen = static_cast<int>(s.size());
        start = 0;
    }
};
|
|
|
|
|
|
/**
|
|
* to store CIGAR string and search segments in it.
|
|
*/
|
|
class CIGAR : public string_search{
|
|
public:
|
|
|
|
bool getNextSegment(int& len, char& symbol) {
|
|
if (start == stringLen) {
|
|
return false;
|
|
}
|
|
len = 0;
|
|
int currentIndex = start;
|
|
while (true) {
|
|
if (isalpha(s[currentIndex])) {
|
|
len = stoi(s.substr(start, currentIndex-start));
|
|
symbol = s[currentIndex];
|
|
start = currentIndex+1;
|
|
return true;
|
|
}
|
|
currentIndex++;
|
|
}
|
|
}
|
|
};
|
|
|
|
/**
|
|
* to store MD tag and search segments in it.
|
|
*/
|
|
class MD_tag : public string_search {
|
|
public:
|
|
|
|
bool getNextSegment(string& seg) {
|
|
if (start >= stringLen) {
|
|
return false;
|
|
}
|
|
seg.clear();
|
|
int currentIndex = start;
|
|
bool deletion = false;
|
|
|
|
while (true) {
|
|
if (currentIndex >= stringLen) {
|
|
start = currentIndex + 1;
|
|
return !seg.empty();
|
|
}
|
|
if (seg.empty() && s[currentIndex] == '0') {
|
|
currentIndex++;
|
|
continue;
|
|
}
|
|
if (isalpha(s[currentIndex])) {
|
|
if (seg.empty()) {
|
|
seg = s[currentIndex];
|
|
start = currentIndex+1;
|
|
return true;
|
|
} else {
|
|
if (deletion) {
|
|
seg += s[currentIndex];
|
|
//currentIndex++;
|
|
} else {
|
|
start = currentIndex;
|
|
return true;
|
|
}
|
|
}
|
|
} else if (s[currentIndex] == '^') {
|
|
if (seg.empty()) {
|
|
seg = s[currentIndex];
|
|
deletion = true;
|
|
} else {
|
|
start = currentIndex;
|
|
return true;
|
|
}
|
|
} else { // number
|
|
if (seg.empty()) {
|
|
seg = s[currentIndex];
|
|
} else {
|
|
if (deletion || isalpha(seg.back())) {
|
|
start = currentIndex;
|
|
return true;
|
|
} else {
|
|
seg += s[currentIndex];
|
|
}
|
|
}
|
|
}
|
|
currentIndex++;
|
|
}
|
|
}
|
|
};
|
|
|
|
/**
 * Mutex-guarded FIFO channel for passing values of type T between threads.
 *
 * The queue has no capacity limit, so send() never waits for space. recv()
 * blocks until a value is available or the channel has been closed. The
 * channel is neither copyable nor movable.
 */
template<typename T>
struct Channel {
    Channel() = default;
    Channel(const Channel &) = delete;
    Channel &operator=(const Channel &) = delete;
    Channel(Channel &&) = delete;
    Channel &operator=(Channel &&) = delete;
    ~Channel() = default;

    /**
     * Enqueue `in` and wake one waiting receiver.
     *
     * Sending on a closed channel aborts the process. The closed check is
     * made while holding the mutex so that a concurrent close() cannot slip
     * in between the check and the push.
     */
    void send(T in) {
        {
            std::lock_guard<std::mutex> lock{mtx_};
            if (closed()) {
                std::abort();
            }
            queue_.push(std::move(in));
            ++size_;
        }
        cnd_.notify_one();
    }

    /**
     * Dequeue the oldest value into `out`.
     *
     * Blocks while the channel is open and empty.
     *
     * @return true when a value was written to `out`; false when the channel
     *         is closed and empty (`out` is left untouched).
     */
    bool recv(T& out) {
        // Fast path: nothing left and nothing will ever arrive.
        if (closed() && empty()) {
            return false;
        }
        {
            std::unique_lock<std::mutex> lock{mtx_};
            cnd_.wait(lock, [this] { return !empty() || closed(); });
            if (empty()) {
                // Woken by close() with nothing left to deliver.
                return false;
            }
            out = std::move(queue_.front());
            queue_.pop();
            --size_;
        }
        if (closed()) {
            cnd_.notify_all();
        } else {
            cnd_.notify_one();
        }
        return true;
    }

    /**
     * Number of queued values. Not constexpr: it reads an atomic at run time.
     */
    std::size_t size() const noexcept {
        return size_.load();
    }

    /**
     * True when no values are queued.
     */
    bool empty() const noexcept {
        return size_.load() == 0;
    }

    /**
     * Mark the channel closed and wake every waiting receiver. Values that
     * are already queued can still be received.
     */
    void close() noexcept {
        {
            std::lock_guard<std::mutex> lock{mtx_};
            is_closed_.store(true);
        }
        cnd_.notify_all();
    }

    /**
     * True once close() has been called.
     */
    bool closed() const noexcept {
        return is_closed_.load();
    }

private:
    std::queue<T> queue_;
    std::atomic<std::size_t> size_{0};   // mirrors queue_.size(); readable without the mutex
    std::mutex mtx_;
    std::condition_variable cnd_;
    std::atomic<bool> is_closed_{false};
};
|
|
|
|
|
|
/**
 * Pairs one chromosome name with a stream position.
 */
class ChromosomeFilePosition {
public:
    std::string chromosome;
    std::streampos linePos;

    /**
     * Build a record from a chromosome name and its stream position.
     */
    ChromosomeFilePosition(std::string inputChromosome, std::streampos inputPos)
        : chromosome(inputChromosome), linePos(inputPos) {
    }

    /**
     * Records are ordered by chromosome name only; linePos is ignored.
     */
    bool operator < (const ChromosomeFilePosition& in) const {
        return chromosome.compare(in.chromosome) < 0;
    }
};
|
|
|
|
/**
|
|
* store all chromosome and it's stream position
|
|
*/
|
|
class ChromosomeFilePositions {
|
|
public:
|
|
vector <ChromosomeFilePosition> pos;
|
|
|
|
/**
|
|
* input the chromosome name and it's streamPos, if it is not in pos, add it.
|
|
*/
|
|
void append (string &chromosome, streampos& linePos) {
|
|
pos.push_back(ChromosomeFilePosition(chromosome, linePos));
|
|
}
|
|
|
|
/**
|
|
* make binary search on pos for target chromosome name
|
|
*/
|
|
int findChromosome(string &targetChromosome, int start, int end) {
|
|
if (start <= end) {
|
|
int middle = (start + end) / 2;
|
|
if (pos[middle].chromosome == targetChromosome) {
|
|
return middle;
|
|
}
|
|
if (pos[middle].chromosome > targetChromosome) {
|
|
return findChromosome(targetChromosome, start, middle-1);
|
|
}
|
|
return findChromosome(targetChromosome, middle+1, end);
|
|
}
|
|
else
|
|
{
|
|
// cannot find the chromosome! throw!
|
|
cerr << "Cannot find the chromosome: " << targetChromosome << " in reference file." << endl;
|
|
throw 1;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* given targetChromosome name, return its streampos
|
|
*/
|
|
streampos getChromosomePosInRefFile(string &targetChromosome)
|
|
{
|
|
int index = findChromosome(targetChromosome, 0, pos.size()-1);
|
|
assert(pos[index].chromosome == targetChromosome);
|
|
return pos[index].linePos;
|
|
}
|
|
|
|
/**
|
|
* sort the pos by chromosome name
|
|
*/
|
|
void sort()
|
|
{
|
|
std::sort(pos.begin(), pos.end());
|
|
}
|
|
};
|
|
#endif //UTILITY_3N_TABLE_H
|