use channel

This commit is contained in:
Yaossg 2025-01-19 17:06:28 +08:00
parent 5e601d0401
commit a3817bf0b7
4 changed files with 124 additions and 177 deletions

View File

@ -300,7 +300,7 @@ repeat: hisat2-repeat
repeat-debug: hisat2-repeat-debug repeat-debug: hisat2-repeat-debug
DEFS :=-fno-strict-aliasing \ DEFS :=-fno-strict-aliasing \
-DHISAT2_VERSION="\"`cat HISAT2_VERSION`\"" \ -DHISAT2_VERSION="\"\"" \
-DBUILD_HOST="\"`hostname`\"" \ -DBUILD_HOST="\"`hostname`\"" \
-DBUILD_TIME="\"`date`\"" \ -DBUILD_TIME="\"`date`\"" \
-DCOMPILER_VERSION="\"`$(CXX) -v 2>&1 | tail -1`\"" \ -DCOMPILER_VERSION="\"`$(CXX) -v 2>&1 | tail -1`\"" \

View File

@ -246,9 +246,9 @@ int hisat_3n_table()
positions = new Positions(refFileName, nThreads, addedChrName, removedChrName); positions = new Positions(refFileName, nThreads, addedChrName, removedChrName);
// open #nThreads workers // open #nThreads workers
vector<thread*> workers; vector<thread> workers;
for (int i = 0; i < nThreads; i++) { for (int i = 0; i < nThreads; i++) {
workers.push_back(new thread(&Positions::append, positions, i)); workers.emplace_back(&Positions::append, positions, i);
} }
// open a output thread // open a output thread
@ -265,50 +265,37 @@ int hisat_3n_table()
alignmentFile = &inputFile; alignmentFile = &inputFile;
} }
string* line; // temporary string to get SAM line.
string samChromosome; // the chromosome name of current SAM line.
long long int samPos; // the position of current SAM line. long long int samPos; // the position of current SAM line.
long long int reloadPos; // the position in reference that we need to reload. long long int reloadPos; // the position in reference that we need to reload.
long long int lastPos = 0; // the position on last SAM line. compare lastPos with samPos to make sure the SAM is sorted. long long int lastPos = 0; // the position on last SAM line. compare lastPos with samPos to make sure the SAM is sorted.
while (alignmentFile->good()) { while (alignmentFile->good()) {
positions->getFreeStringPointer(line); string line;
if (!getline(*alignmentFile, *line)) { string samChromosome;
positions->returnLine(line);
if (!getline(*alignmentFile, line)) {
break; break;
} }
if (line->empty() || line->front() == '@') { if (line.empty() || line.front() == '@') {
positions->returnLine(line);
continue; continue;
} }
// limit the linePool size to save memory
while(positions->linePool.size() > 1000 * nThreads) {
this_thread::sleep_for (std::chrono::microseconds(1));
}
// if the SAM line is empty or unmapped, get the next SAM line. // if the SAM line is empty or unmapped, get the next SAM line.
if (!getSAMChromosomePos(line, samChromosome, samPos)) { if (!getSAMChromosomePos(&line, samChromosome, samPos)) {
positions->returnLine(line);
continue; continue;
} }
// if the samChromosome is different than current positions' chromosome, finish all SAM line. // if the samChromosome is different than current positions' chromosome, finish all SAM line.
// then load a new reference chromosome. // then load a new reference chromosome.
if (samChromosome != positions->chromosome) { if (samChromosome != *positions->chromosome) {
// wait all line is processed
while (!positions->linePool.empty() || positions->outputPositionPool.size() > 100000) {
this_thread::sleep_for (std::chrono::microseconds(1));
}
positions->appendingFinished(); positions->appendingFinished();
positions->moveAllToOutput(); positions->moveAllToOutput();
positions->loadNewChromosome(samChromosome); positions->loadNewChromosome(std::move(samChromosome));
reloadPos = loadingBlockSize; reloadPos = loadingBlockSize;
lastPos = 0; lastPos = 0;
} }
// if the samPos is larger than reloadPos, load 1 loadingBlockSize bp in from reference. // if the samPos is larger than reloadPos, load 1 loadingBlockSize bp in from reference.
while (samPos > reloadPos) { while (samPos > reloadPos) {
while (!positions->linePool.empty() || positions->outputPositionPool.size() > 100000) {
this_thread::sleep_for (std::chrono::microseconds(1));
}
positions->appendingFinished(); positions->appendingFinished();
positions->moveBlockToOutput(); positions->moveBlockToOutput();
positions->loadMore(); positions->loadMore();
@ -318,7 +305,7 @@ int hisat_3n_table()
cerr << "The input alignment file is not sorted. Please use sorted SAM file as alignment file." << endl; cerr << "The input alignment file is not sorted. Please use sorted SAM file as alignment file." << endl;
throw 1; throw 1;
} }
positions->linePool.push(line); positions->linePool.send(std::move(line));
lastPos = samPos; lastPos = samPos;
} }
//} //}
@ -326,30 +313,18 @@ int hisat_3n_table()
inputFile.close(); inputFile.close();
} }
positions->linePool.close();
// prepare to close everything. for (int i = 0; i < nThreads; i++){
workers[i].join();
// make sure linePool is empty
while (!positions->linePool.empty()) {
this_thread::sleep_for (std::chrono::microseconds(100));
} }
// make sure all workers finished their appending work. // make sure all workers finished their appending work.
positions->appendingFinished(); positions->appendingFinished();
// move all position to outputPool // move all position to outputPool
positions->moveAllToOutput(); positions->moveAllToOutput();
// wait until outputPool is empty // wait until outputPool is empty
while (!positions->outputPositionPool.empty()) {
this_thread::sleep_for (std::chrono::microseconds(100)); positions->outputPositionPool.close();
}
// stop all thread and clean
while(positions->freeLinePool.popFront(line)) {
delete line;
}
positions->working = false;
for (int i = 0; i < nThreads; i++){
workers[i]->join();
delete workers[i];
}
outputThread.join(); outputThread.join();
delete positions; delete positions;
return 0; return 0;

View File

@ -26,6 +26,9 @@
#include <mutex> #include <mutex>
#include <thread> #include <thread>
#include <cassert> #include <cassert>
#include <memory>
#include <deque>
#include "alignment_3n_table.h" #include "alignment_3n_table.h"
using namespace std; using namespace std;
@ -60,21 +63,21 @@ public:
class Position{ class Position{
mutex mutex_; mutex mutex_;
public: public:
string chromosome; // reference chromosome name shared_ptr<string> chromosome; // reference chromosome name
long long int location; // 1-based position long long int location; // 1-based position
char strand; // +(REF) or -(REF-RC) char strand; // +(REF) or -(REF-RC)
string convertedQualities; // each char is a mapping quality on this position for converted base. string convertedQualities; // each char is a mapping quality on this position for converted base.
string unconvertedQualities; // each char is a mapping quality on this position for unconverted base. string unconvertedQualities; // each char is a mapping quality on this position for unconverted base.
vector<uniqueID> uniqueIDs; // each value represent a readName which contributed the base information. deque<uniqueID> uniqueIDs; // each value represent a readName which contributed the base information.
// readNameIDs is to make sure no read contribute 2 times in same position. // readNameIDs is to make sure no read contribute 2 times in same position.
void initialize() { void initialize() {
chromosome.clear(); chromosome.reset();
location = -1; location = -1;
strand = '?'; strand = '?';
convertedQualities.clear(); convertedQualities.clear();
unconvertedQualities.clear(); unconvertedQualities.clear();
vector<uniqueID>().swap(uniqueIDs); uniqueIDs.clear();
} }
Position(){ Position(){
@ -92,8 +95,8 @@ public:
* set the chromosome, location (position), and strand information. * set the chromosome, location (position), and strand information.
*/ */
void set (string& inputChr, long long int inputLoc) { void set (shared_ptr<string> inputChr, long long int inputLoc) {
chromosome = inputChr; chromosome = std::move(inputChr);
location = inputLoc + 1; location = inputLoc + 1;
} }
@ -187,15 +190,13 @@ public:
*/ */
class Positions{ class Positions{
public: public:
vector<Position*> refPositions; // the pool of all current reference position. deque<Position*> refPositions; // the pool of all current reference position.
string chromosome; // current reference chromosome name. shared_ptr<string> chromosome; // current reference chromosome name.
long long int location; // current location (position) in reference chromosome. long long int location; // current location (position) in reference chromosome.
char lastBase = 'X'; // the last base of reference line. this is for CG_only mode. char lastBase = 'X'; // the last base of reference line. this is for CG_only mode.
SafeQueue<string*> linePool; // pool to store unprocessed SAM line. Channel<string> linePool; // pool to store unprocessed SAM line.
SafeQueue<string*> freeLinePool; // pool to store free string pointer for SAM line. Channel<Position*> freePositionPool; // pool to store free position pointer for reference position.
SafeQueue<Position*> freePositionPool; // pool to store free position pointer for reference position. Channel<Position*> outputPositionPool; // pool to store the reference position which is loaded and ready to output.
SafeQueue<Position*> outputPositionPool; // pool to store the reference position which is loaded and ready to output.
bool working;
mutex mutex_; mutex mutex_;
long long int refCoveredPosition; // this is the last position in reference chromosome we loaded in refPositions. long long int refCoveredPosition; // this is the last position in reference chromosome we loaded in refPositions.
ifstream refFile; ifstream refFile;
@ -206,7 +207,6 @@ public:
bool removedChrName = false; bool removedChrName = false;
Positions(string inputRefFileName, int inputNThreads, bool inputAddedChrName, bool inputRemovedChrName) { Positions(string inputRefFileName, int inputNThreads, bool inputAddedChrName, bool inputRemovedChrName) {
working = true;
nThreads = inputNThreads; nThreads = inputNThreads;
addedChrName = inputAddedChrName; addedChrName = inputAddedChrName;
removedChrName = inputRemovedChrName; removedChrName = inputRemovedChrName;
@ -222,7 +222,7 @@ public:
delete workerLock[i]; delete workerLock[i];
} }
Position* pos; Position* pos;
while(freePositionPool.popFront(pos)) { while(freePositionPool.recv(pos)) {
delete pos; delete pos;
} }
} }
@ -271,13 +271,13 @@ public:
while (refFile.good()) { while (refFile.good()) {
getline(refFile, line); getline(refFile, line);
if (line.front() == '>') { // this line is chromosome name if (line.front() == '>') { // this line is chromosome name
chromosome = getChrName(line); chromosome = make_shared<string>(getChrName(line));
streampos currentPos = refFile.tellg(); streampos currentPos = refFile.tellg();
chromosomePos.append(chromosome, currentPos); chromosomePos.append(*chromosome, currentPos);
} }
} }
chromosomePos.sort(); chromosomePos.sort();
chromosome.clear(); chromosome.reset();
} }
/** /**
@ -333,8 +333,7 @@ public:
*out_ << "ref\tpos\tstrand\tconvertedBaseQualities\tconvertedBaseCount\tunconvertedBaseQualities\tunconvertedBaseCount\n"; *out_ << "ref\tpos\tstrand\tconvertedBaseQualities\tconvertedBaseCount\tunconvertedBaseQualities\tunconvertedBaseCount\n";
Position* pos; Position* pos;
while (working) { while (outputPositionPool.recv(pos)) {
if (outputPositionPool.popFront(pos)) {
*out_ << pos->chromosome << '\t' *out_ << pos->chromosome << '\t'
<< to_string(pos->location) << '\t' << to_string(pos->location) << '\t'
<< pos->strand << '\t' << pos->strand << '\t'
@ -343,9 +342,6 @@ public:
<< pos->unconvertedQualities << '\t' << pos->unconvertedQualities << '\t'
<< to_string(pos->unconvertedQualities.size()) << '\n'; << to_string(pos->unconvertedQualities.size()) << '\n';
returnPosition(pos); returnPosition(pos);
} else {
this_thread::sleep_for (std::chrono::microseconds(1));
}
} }
tableFile.close(); tableFile.close();
} }
@ -363,16 +359,14 @@ public:
if (refPositions[index]->empty() || refPositions[index]->strand == '?') { if (refPositions[index]->empty() || refPositions[index]->strand == '?') {
returnPosition(refPositions[index]); returnPosition(refPositions[index]);
} else { } else {
outputPositionPool.push(refPositions[index]); outputPositionPool.send(refPositions[index]);
} }
} else { } else {
break; break;
} }
} }
if (index != 0) {
refPositions.erase(refPositions.begin(), refPositions.begin() + index); refPositions.erase(refPositions.begin(), refPositions.begin() + index);
} }
}
/** /**
* move all the refPosition into output pool. * move all the refPosition into output pool.
@ -385,8 +379,8 @@ public:
if (refPositions[index]->empty() || refPositions[index]->strand == '?') { if (refPositions[index]->empty() || refPositions[index]->strand == '?') {
returnPosition(refPositions[index]); returnPosition(refPositions[index]);
} else { } else {
vector<uniqueID>().swap(refPositions[index]->uniqueIDs); refPositions[index]->uniqueIDs.clear();
outputPositionPool.push(refPositions[index]); outputPositionPool.send(refPositions[index]);
} }
} }
refPositions.clear(); refPositions.clear();
@ -399,7 +393,7 @@ public:
refFile.clear(); refFile.clear();
// find the start position in file based on chromosome name. // find the start position in file based on chromosome name.
streampos startPos = chromosomePos.getChromosomePosInRefFile(targetChromosome); streampos startPos = chromosomePos.getChromosomePosInRefFile(targetChromosome);
chromosome = targetChromosome; chromosome = make_shared<string>(move(targetChromosome));
refFile.seekg(startPos, ios::beg); refFile.seekg(startPos, ios::beg);
refCoveredPosition = 2 * loadingBlockSize; refCoveredPosition = 2 * loadingBlockSize;
string line; string line;
@ -479,45 +473,23 @@ public:
} }
} }
/**
* get a string pointer from freeLinePool, if freeLinePool is empty, make a new string pointer.
*/
void getFreeStringPointer(string*& newLine) {
if (freeLinePool.popFront(newLine)) {
return;
} else {
newLine = new string();
}
}
/** /**
* get a Position pointer from freePositionPool, if freePositionPool is empty, make a new Position pointer. * get a Position pointer from freePositionPool, if freePositionPool is empty, make a new Position pointer.
*/ */
void getFreePosition(Position*& newPosition) { void getFreePosition(Position*& newPosition) {
while (outputPositionPool.size() >= 10000) { if (freePositionPool.recv(newPosition)) {
this_thread::sleep_for (std::chrono::microseconds(1));
}
if (freePositionPool.popFront(newPosition)) {
return; return;
} else { } else {
newPosition = new Position(); newPosition = new Position();
} }
} }
/**
* return the line to freeLinePool
*/
void returnLine(string* line) {
line->clear();
freeLinePool.push(line);
}
/** /**
* return the position to freePositionPool. * return the position to freePositionPool.
*/ */
void returnPosition(Position* pos) { void returnPosition(Position* pos) {
pos->initialize(); pos->initialize();
freePositionPool.push(pos); freePositionPool.send(pos);
} }
/** /**
@ -525,23 +497,18 @@ public:
* it take the SAM line from linePool, parse it. * it take the SAM line from linePool, parse it.
*/ */
void append(int threadID) { void append(int threadID) {
string* line; string line;
Alignment newAlignment; Alignment newAlignment;
while (working) {
workerLock[threadID]->lock(); while (true) {
if(!linePool.popFront(line)) { std::unique_lock<std::mutex> lk{*workerLock[threadID]};
workerLock[threadID]->unlock(); if (!linePool.recv(line)) break;
this_thread::sleep_for (std::chrono::nanoseconds(1));
continue;
}
while (refPositions.empty()) { while (refPositions.empty()) {
this_thread::sleep_for (std::chrono::microseconds(1)); this_thread::sleep_for (std::chrono::microseconds(1));
} }
newAlignment.parse(line); newAlignment.parse(&line);
returnLine(line);
appendPositions(newAlignment); appendPositions(newAlignment);
workerLock[threadID]->unlock();
} }
} }
}; };

View File

@ -20,9 +20,11 @@
#ifndef UTILITY_3N_TABLE_H #ifndef UTILITY_3N_TABLE_H
#define UTILITY_3N_TABLE_H #define UTILITY_3N_TABLE_H
#include <algorithm>
#include <condition_variable>
#include <mutex> #include <mutex>
#include <queue> #include <queue>
#include <algorithm> #include <atomic>
using namespace std; using namespace std;
@ -185,74 +187,77 @@ public:
} }
}; };
/**
* simple safe queue
*/
template<typename T> template<typename T>
class SafeQueue { struct Channel {
Channel() = default;
Channel(const Channel &) = delete;
Channel &operator=(const Channel &) = delete;
Channel(Channel &&) = delete;
Channel &operator=(Channel &&) = delete;
~Channel() = default;
void send(T in) {
if (closed()) {
abort();
}
{
std::unique_lock<std::mutex> lock{mtx_};
queue_.push(std::move(in));
++size_;
}
cnd_.notify_one();
}
bool recv(T& out) {
if (closed() && empty()) {
return false;
}
{
std::unique_lock<std::mutex> lock{mtx_};
cnd_.wait(lock, [this] { return !empty() || closed(); });
if (empty()) {
return false;
}
out = std::move(queue_.front());
queue_.pop();
--size_;
}
if (closed()) {
cnd_.notify_all();
} else {
cnd_.notify_one();
}
return true;
}
size_t constexpr size() const noexcept {
return size_;
}
bool constexpr empty() const noexcept {
return size_ == 0;
}
void close() noexcept {
{
std::unique_lock<std::mutex> lock{mtx_};
is_closed_.store(true);
}
cnd_.notify_all();
}
bool closed() const noexcept {
return is_closed_.load();
}
private: private:
mutex mutex_; std::queue<T> queue_;
queue<T> queue_; std::atomic<std::size_t> size_{0};
std::mutex mtx_;
string getReadName(string* line){ std::condition_variable cnd_;
int startPosition = 0; std::atomic<bool> is_closed_{false};
int endPosition;
endPosition = line->find("\t", startPosition);
string readName = line->substr(startPosition, endPosition - startPosition);
return readName;
}
public:
void pop() {
mutex_.lock();
queue_.pop();
mutex_.unlock();
}
T front() {
mutex_.lock();
T value = queue_.front();
mutex_.unlock();
return value;
}
int size() {
mutex_.lock();
int s = queue_.size();
mutex_.unlock();
return s;
}
/**
* return true if the queue is not empty and pop front and get value.
* return false if the queue is empty.
*/
bool popFront(T& value) {
mutex_.lock();
bool isEmpty = queue_.empty();
if (!isEmpty) {
value = queue_.front();
queue_.pop();
}
mutex_.unlock();
return !isEmpty;
}
void push(T value) {
mutex_.lock();
queue_.push(value);
mutex_.unlock();
}
bool empty() {
mutex_.lock();
bool check = queue_.empty();
mutex_.unlock();
return check;
}
}; };
/** /**
* store one chromosome and it's stream position * store one chromosome and it's stream position
*/ */