328 lines
8.9 KiB
C++
328 lines
8.9 KiB
C++
/*
|
|
* Copyright 2020, Yun (Leo) Zhang <imzhangyun@gmail.com>
|
|
*
|
|
* This file is part of HISAT-3N.
|
|
*
|
|
* HISAT-3N is free software: you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation, either version 3 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* HISAT-3N is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with HISAT-3N. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
#ifndef UTILITY_3N_TABLE_H
|
|
#define UTILITY_3N_TABLE_H
|
|
|
|
#include <mutex>
|
|
#include <queue>
|
|
#include <algorithm>
|
|
|
|
using namespace std;
|
|
|
|
/**
|
|
* return complement of input base.
|
|
*/
|
|
char asc2dnacomp[] = {
|
|
/* 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
/* 16 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
/* 32 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,'-', 0, 0,
|
|
/* 48 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
/* 64 */ 0,'T','V','G','H', 0, 0,'C','D', 0, 0,'M', 0,'K','N', 0,
|
|
/* A B C D G H K M N */
|
|
/* 80 */ 0, 0,'Y','S','A', 0,'B','W', 0,'R', 0, 0, 0, 0, 0, 0,
|
|
/* R S T V W Y */
|
|
/* 96 */ 0,'T','V','G','H', 0, 0,'C','D', 0, 0,'M', 0,'K','N', 0,
|
|
/* a b c d g h k m n */
|
|
/* 112 */ 0, 0,'Y','S','A', 0,'B','W', 0,'R', 0, 0, 0, 0, 0, 0,
|
|
/* r s t v w y */
|
|
/* 128 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
/* 144 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
/* 160 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
/* 176 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
/* 192 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
/* 208 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
/* 224 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
/* 240 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
|
};
|
|
|
|
/**
|
|
* the simple data structure to bind quality score and position (on reference) together.
|
|
*/
|
|
class PosQuality {
|
|
public:
|
|
int readPos; // 0-based
|
|
int refPos; // 0-based
|
|
char qual;
|
|
bool converted;
|
|
bool remove;
|
|
|
|
PosQuality(int& inputPos) {
|
|
readPos = inputPos;
|
|
refPos = inputPos;
|
|
remove = true;
|
|
}
|
|
|
|
void setQual (char& inputQual, bool inputConverted) {
|
|
qual = inputQual;
|
|
converted = inputConverted;
|
|
remove = false;
|
|
}
|
|
};
|
|
|
|
/**
|
|
* the base class for string we need to search.
|
|
*/
|
|
class string_search {
|
|
public:
|
|
int start;
|
|
string s;
|
|
int stringLen;
|
|
|
|
void initialize() {
|
|
start = 0;
|
|
stringLen = 0;
|
|
s.clear();
|
|
}
|
|
|
|
void loadString(string intputString) {
|
|
s = intputString;
|
|
stringLen = s.size();
|
|
start = 0;
|
|
}
|
|
};
|
|
|
|
|
|
/**
|
|
* to store CIGAR string and search segments in it.
|
|
*/
|
|
class CIGAR : public string_search{
|
|
public:
|
|
|
|
bool getNextSegment(int& len, char& symbol) {
|
|
if (start == stringLen) {
|
|
return false;
|
|
}
|
|
len = 0;
|
|
int currentIndex = start;
|
|
while (true) {
|
|
if (isalpha(s[currentIndex])) {
|
|
len = stoi(s.substr(start, currentIndex-start));
|
|
symbol = s[currentIndex];
|
|
start = currentIndex+1;
|
|
return true;
|
|
}
|
|
currentIndex++;
|
|
}
|
|
}
|
|
};
|
|
|
|
/**
|
|
* to store MD tag and search segments in it.
|
|
*/
|
|
class MD_tag : public string_search {
|
|
public:
|
|
|
|
bool getNextSegment(string& seg) {
|
|
if (start >= stringLen) {
|
|
return false;
|
|
}
|
|
seg.clear();
|
|
int currentIndex = start;
|
|
bool deletion = false;
|
|
|
|
while (true) {
|
|
if (currentIndex >= stringLen) {
|
|
start = currentIndex + 1;
|
|
return !seg.empty();
|
|
}
|
|
if (seg.empty() && s[currentIndex] == '0') {
|
|
currentIndex++;
|
|
continue;
|
|
}
|
|
if (isalpha(s[currentIndex])) {
|
|
if (seg.empty()) {
|
|
seg = s[currentIndex];
|
|
start = currentIndex+1;
|
|
return true;
|
|
} else {
|
|
if (deletion) {
|
|
seg += s[currentIndex];
|
|
//currentIndex++;
|
|
} else {
|
|
start = currentIndex;
|
|
return true;
|
|
}
|
|
}
|
|
} else if (s[currentIndex] == '^') {
|
|
if (seg.empty()) {
|
|
seg = s[currentIndex];
|
|
deletion = true;
|
|
} else {
|
|
start = currentIndex;
|
|
return true;
|
|
}
|
|
} else { // number
|
|
if (seg.empty()) {
|
|
seg = s[currentIndex];
|
|
} else {
|
|
if (deletion || isalpha(seg.back())) {
|
|
start = currentIndex;
|
|
return true;
|
|
} else {
|
|
seg += s[currentIndex];
|
|
}
|
|
}
|
|
}
|
|
currentIndex++;
|
|
}
|
|
}
|
|
};
|
|
|
|
/**
|
|
* simple safe queue
|
|
*/
|
|
template <typename T>
|
|
class SafeQueue {
|
|
private:
|
|
mutex mutex_;
|
|
queue<T> queue_;
|
|
|
|
string getReadName(string* line){
|
|
int startPosition = 0;
|
|
int endPosition;
|
|
|
|
endPosition = line->find("\t", startPosition);
|
|
string readName = line->substr(startPosition, endPosition - startPosition);
|
|
return readName;
|
|
}
|
|
|
|
public:
|
|
void pop() {
|
|
mutex_.lock();
|
|
queue_.pop();
|
|
mutex_.unlock();
|
|
}
|
|
|
|
T front() {
|
|
mutex_.lock();
|
|
T value = queue_.front();
|
|
mutex_.unlock();
|
|
return value;
|
|
}
|
|
|
|
int size() {
|
|
mutex_.lock();
|
|
int s = queue_.size();
|
|
mutex_.unlock();
|
|
return s;
|
|
}
|
|
|
|
/**
|
|
* return true if the queue is not empty and pop front and get value.
|
|
* return false if the queue is empty.
|
|
*/
|
|
bool popFront(T& value) {
|
|
mutex_.lock();
|
|
bool isEmpty = queue_.empty();
|
|
if (!isEmpty) {
|
|
value = queue_.front();
|
|
queue_.pop();
|
|
}
|
|
mutex_.unlock();
|
|
return !isEmpty;
|
|
}
|
|
|
|
void push(T value) {
|
|
mutex_.lock();
|
|
queue_.push(value);
|
|
mutex_.unlock();
|
|
}
|
|
|
|
bool empty() {
|
|
mutex_.lock();
|
|
bool check = queue_.empty();
|
|
mutex_.unlock();
|
|
return check;
|
|
}
|
|
};
|
|
|
|
/**
|
|
* store one chromosome and it's stream position
|
|
*/
|
|
class ChromosomeFilePosition {
|
|
public:
|
|
string chromosome;
|
|
streampos linePos;
|
|
ChromosomeFilePosition(string inputChromosome, streampos inputPos) {
|
|
chromosome = inputChromosome;
|
|
linePos = inputPos;
|
|
}
|
|
|
|
bool operator < (const ChromosomeFilePosition& in) const{
|
|
return chromosome < in.chromosome;
|
|
}
|
|
};
|
|
|
|
/**
|
|
* store all chromosome and it's stream position
|
|
*/
|
|
class ChromosomeFilePositions {
|
|
public:
|
|
vector <ChromosomeFilePosition> pos;
|
|
|
|
/**
|
|
* input the chromosome name and it's streamPos, if it is not in pos, add it.
|
|
*/
|
|
void append (string &chromosome, streampos& linePos) {
|
|
pos.push_back(ChromosomeFilePosition(chromosome, linePos));
|
|
}
|
|
|
|
/**
|
|
* make binary search on pos for target chromosome name
|
|
*/
|
|
int findChromosome(string &targetChromosome, int start, int end) {
|
|
if (start <= end) {
|
|
int middle = (start + end) / 2;
|
|
if (pos[middle].chromosome == targetChromosome) {
|
|
return middle;
|
|
}
|
|
if (pos[middle].chromosome > targetChromosome) {
|
|
return findChromosome(targetChromosome, start, middle-1);
|
|
}
|
|
return findChromosome(targetChromosome, middle+1, end);
|
|
}
|
|
else
|
|
{
|
|
// cannot find the chromosome! throw!
|
|
cerr << "Cannot find the chromosome: " << targetChromosome << " in reference file." << endl;
|
|
throw 1;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* given targetChromosome name, return its streampos
|
|
*/
|
|
streampos getChromosomePosInRefFile(string &targetChromosome)
|
|
{
|
|
int index = findChromosome(targetChromosome, 0, pos.size()-1);
|
|
assert(pos[index].chromosome == targetChromosome);
|
|
return pos[index].linePos;
|
|
}
|
|
|
|
/**
|
|
* sort the pos by chromosome name
|
|
*/
|
|
void sort()
|
|
{
|
|
std::sort(pos.begin(), pos.end());
|
|
}
|
|
};
|
|
#endif //UTILITY_3N_TABLE_H
|