hisat-3n/hisat_3n_table.cpp

359 lines
12 KiB
C++
Raw Normal View History

2025-01-18 13:09:52 +00:00
/*
* Copyright 2020, Yun (Leo) Zhang <imzhangyun@gmail.com>
*
* This file is part of HISAT-3N.
*
* HISAT-3N is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* HISAT-3N is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with HISAT-3N. If not, see <http://www.gnu.org/licenses/>.
*/
#include <iostream>
#include <getopt.h>
#include "position_3n_table.h"
using namespace std;
// ---- Run configuration. Filled in by parseOption()/parseOptions() below and
// ---- read by hisat_3n_table(). ----

// Path given to --alignments; the literal "-" selects standard input.
string alignmentFileName;
// Set to true by parseOption() when alignmentFileName is "-".
bool standardInMode = false;
// Path given to --ref; parseOption() rejects it if it cannot be opened.
string refFileName;
// Value given to --output-name; handed to Positions::outputFunction.
string outputFileName;
// Set by -u/--unique-only (not read in this file).
bool uniqueOnly = false;
// Set by -m/--multiple-only (not read in this file).
bool multipleOnly = false;
// Set by -c/--CG-only; parseOptions() then forces the base change to C,T.
bool CG_only = false;
// Value given to -p/--threads; values below 1 are clamped to 1.
int nThreads = 1;
// Step by which hisat_3n_table() advances its reference reload position.
long long int loadingBlockSize = 1000000;
// Upper-cased characters from --base-change; '0' means "not provided yet".
char convertFrom = '0';
char convertTo = '0';
// asc2dnacomp[] lookups of convertFrom/convertTo, set at the end of parseOptions().
char convertFromComplement;
char convertToComplement;
// Set by --added-chrname / --removed-chrname; parseOptions() rejects both together.
bool addedChrName = false;
bool removedChrName = false;
// Created at the start of hisat_3n_table() and deleted at its end.
Positions* positions;
/**
 * Report whether `filename` can be opened for reading.
 *
 * The argument is only read, so it is taken by const reference; callers may
 * pass const strings and temporaries as well as ordinary lvalues.
 *
 * @param filename  path to probe
 * @return true if an input stream on the path is in a good state after opening
 */
bool fileExist (const std::string& filename) {
    std::ifstream file(filename);
    return file.good();
}
// Codes returned by getopt_long() for options that have no short form. They
// start at 256 so they cannot collide with any single-character option.
enum {
    ARG_ADDED_CHRNAME = 256,
    ARG_REMOVED_CHRNAME
};

// Short-option spec. Each letter matches the `val` of an entry in long_options
// and a `case` in parseOption(); a trailing ':' marks a required argument.
// (The previous spec listed 's:' and 't:', which have no handler, and omitted
// 'a:' and 'o:', which do.)
static const char *short_options = "a:r:o:b:umcp:h";

// Long-option table for getopt_long(); terminated by an all-zero entry.
static struct option long_options[] {
    {"alignments",      required_argument, 0, 'a'},
    {"ref",             required_argument, 0, 'r'},
    {"output-name",     required_argument, 0, 'o'},
    {"base-change",     required_argument, 0, 'b'},
    {"unique-only",     no_argument,       0, 'u'},
    {"multiple-only",   no_argument,       0, 'm'},
    {"CG-only",         no_argument,       0, 'c'},
    {"threads",         required_argument, 0, 'p'},
    {"added-chrname",   no_argument,       0, ARG_ADDED_CHRNAME },
    {"removed-chrname", no_argument,       0, ARG_REMOVED_CHRNAME },
    {"help",            no_argument,       0, 'h'},
    {0, 0, 0, 0}
};
/**
 * Write the usage message to `out`.
 *
 * The placeholder for the --base-change argument is spelled <char1,char2> in
 * both the usage line and its explanation (the explanation previously said
 * <chr1,chr2>). Lines end with '\n' and the stream is flushed once at the end.
 *
 * @param out  destination stream
 */
static void printHelp(std::ostream& out) {
    out << "hisat-3n-table developed by Yun (Leo) Zhang" << '\n';
    out << "Usage:" << '\n'
        << "hisat-3n-table [options]* --alignments <alignmentFile> --ref <refFile> --output-name <outputFile> --base-change <char1,char2>" << '\n'
        << " <alignmentFile> SORTED SAM filename. Please enter '-' for standard input." << '\n'
        << " <refFile> reference file (should be FASTA format)." << '\n'
        << " <outputFile> file name to save the 3n table (tsv format). By default, alignments are written to the “standard out” or “stdout” filehandle (i.e. the console)." << '\n'
        << " <char1,char2> the char1 is the nucleotide converted from, the char2 is the nucleotide converted to." << '\n';
    out << "Options (defaults in parentheses):" << '\n'
        << " Input:" << '\n'
        << " -u/--unique-only only count the base which is in unique mapped reads." << '\n'
        << " -m/--multiple-only only count the base which is in multiple mapped reads." << '\n'
        << " -c/--CG-only only count CG and ignore CH in reference." << '\n'
        << " --added-chrname please add this option if you use --add-chrname during HISAT-3N alignment." << '\n'
        << " --removed-chrname please add this option if you use --remove-chrname during HISAT-3N alignment." << '\n'
        << " -p/--threads <int> number of threads to launch (1)." << '\n'
        << " -h/--help print this usage message." << '\n';
    out.flush();
}
/**
 * Apply one command-line option to the global run configuration.
 *
 * @param next_option  option code as returned by getopt_long()
 * @param optarg       argument text; only read for options that take one
 *
 * @throws int  1 for an unknown option, a malformed --base-change value or an
 *              input file that cannot be opened; 0 after printing help for -h
 * @throws std::invalid_argument, std::out_of_range  from std::stoi when the
 *              --threads value is not a representable integer
 */
static void parseOption(int next_option, const char *optarg) {
    switch (next_option) {
        case 'a': {
            alignmentFileName = optarg;
            // "-" means "read SAM from standard input": nothing to check on disk.
            if (alignmentFileName == "-") {
                standardInMode = true;
                break;
            }
            if (!fileExist(alignmentFileName)) {
                std::cerr << "The alignment file is not exist." << std::endl;
                throw (1);
            }
            break;
        }
        case 'r': {
            refFileName = optarg;
            if (!fileExist(refFileName)) {
                std::cerr << "reference (FASTA) file is not exist." << std::endl;
                throw (1);
            }
            break;
        }
        case 'o':
            outputFileName = optarg;
            break;
        case 'b': {
            const std::string arg = optarg;
            if (arg.size() != 3 || arg[1] != ',') {
                std::cerr << "Error: expected 2 comma-separated "
                          << "arguments to --base-change option (e.g. C,T), got " << arg << std::endl;
                throw 1;
            }
            // toupper() is only defined for values representable as unsigned
            // char (or EOF), so convert before calling it.
            const int from = toupper(static_cast<unsigned char>(arg.front()));
            const int to = toupper(static_cast<unsigned char>(arg.back()));
            // Both characters are later used as lookup-table indices; anything
            // that is not a letter cannot be a nucleotide code.
            const bool fromIsLetter = from >= 'A' && from <= 'Z';
            const bool toIsLetter = to >= 'A' && to <= 'Z';
            if (!fromIsLetter || !toIsLetter) {
                std::cerr << "Error: expected 2 comma-separated "
                          << "arguments to --base-change option (e.g. C,T), got " << arg << std::endl;
                throw 1;
            }
            convertFrom = static_cast<char>(from);
            convertTo = static_cast<char>(to);
            break;
        }
        case 'u': {
            uniqueOnly = true;
            break;
        }
        case 'm': {
            multipleOnly = true;
            break;
        }
        case 'c': {
            CG_only = true;
            break;
        }
        case 'h': {
            printHelp(std::cerr);
            // 0 tells main() to exit quietly with a success status.
            throw 0;
        }
        case 'p': {
            nThreads = stoi(optarg);
            // Always run at least one worker.
            if (nThreads < 1) {
                nThreads = 1;
            }
            break;
        }
        case ARG_ADDED_CHRNAME: {
            addedChrName = true;
            break;
        }
        case ARG_REMOVED_CHRNAME: {
            removedChrName = true;
            break;
        }
        default:
            // Unknown or unhandled option code.
            printHelp(std::cerr);
            throw 1;
    }
}
/**
 * Parse the whole command line into the global run configuration and validate
 * the combination of options.
 *
 * @param argc  argument count as received by main()
 * @param argv  argument vector as received by main()
 *
 * @throws int  1 when a required option is missing or options conflict; also
 *              whatever parseOption() throws for an individual option
 */
static void parseOptions(int argc, const char **argv) {
    int option_index = 0;
    int next_option;
    // getopt_long() wants a mutable argv; it returns -1 once all options are consumed.
    while ((next_option = getopt_long(argc, const_cast<char **>(argv), short_options,
                                      long_options, &option_index)) != -1) {
        parseOption(next_option, optarg);
    }

    // check filenames
    if (refFileName.empty() || alignmentFileName.empty()) {
        std::cerr << "No reference or SAM file specified!" << std::endl;
        printHelp(std::cerr);
        throw 1;
    }

    // CG-only mode implies a C,T base change; override anything else and say so.
    if (CG_only) {
        if (convertFrom != 'C' || convertTo != 'T') {
            std::cerr << "Warning! You are using CG-only mode. The --base-change option is set to: C,T" << std::endl;
            convertFrom = 'C';
            convertTo = 'T';
        }
    }

    // '0' is the "not provided" sentinel the globals start with.
    if (convertFrom == '0' || convertTo == '0') {
        std::cerr << "the --base-change argument is required." << std::endl;
        throw 1;
    }

    if (removedChrName && addedChrName) {
        std::cerr << "Error: --removed-chrname and --added-chrname cannot be used at the same time" << std::endl;
        throw 1;
    }

    // set complements
    // Index through unsigned char so a char with the high bit set can never
    // become a negative subscript.
    // NOTE(review): assumes asc2dnacomp has an entry for every unsigned char value - confirm.
    convertFromComplement = asc2dnacomp[static_cast<unsigned char>(convertFrom)];
    convertToComplement = asc2dnacomp[static_cast<unsigned char>(convertTo)];
}
/**
 * Given a SAM line, extract the chromosome (3rd tab-separated field) and the
 * position (4th tab-separated field).
 *
 * @param line  the SAM line to scan; not modified
 * @param chr   receives the 3rd field once it has been seen
 * @param pos   receives the 4th field parsed as an integer
 * @return true if the SAM line is mapped; false if it is not mapped (the
 *         chromosome field is "*") or if the line has no tab-terminated
 *         4th field
 * @throws std::invalid_argument, std::out_of_range  from std::stoll when the
 *         4th field is not a representable integer
 */
bool getSAMChromosomePos(std::string* line, std::string& chr, long long int& pos) {
    // Offsets are kept in the string's own size type: find() returns size_type
    // and signals "not found" with npos, which does not fit in an int.
    std::string::size_type fieldStart = 0;
    std::string::size_type fieldEnd = 0;
    int fieldIndex = 0;  // 0-based index of the field starting at fieldStart

    while ((fieldEnd = line->find('\t', fieldStart)) != std::string::npos) {
        if (fieldIndex == 2) {
            chr = line->substr(fieldStart, fieldEnd - fieldStart);
        } else if (fieldIndex == 3) {
            pos = std::stoll(line->substr(fieldStart, fieldEnd - fieldStart));
            // "*" in the chromosome field marks an unmapped record.
            return chr != "*";
        }
        fieldStart = fieldEnd + 1;
        ++fieldIndex;
    }
    // Ran out of tabs before reaching the end of the position field.
    return false;
}
/*void opeInFile(ifstream& f) {
if (alignmentFileName == "-") {
f = cin;
} else {
ifstream alignmentFile;
alignmentFile.open(alignmentFileName, ios_base::in);
return alignmentFile;
}
}*/
int hisat_3n_table()
{
positions = new Positions(refFileName, nThreads, addedChrName, removedChrName);
// open #nThreads workers
2025-01-19 09:06:28 +00:00
vector<thread> workers;
2025-01-18 13:09:52 +00:00
for (int i = 0; i < nThreads; i++) {
2025-01-19 09:06:28 +00:00
workers.emplace_back(&Positions::append, positions, i);
2025-01-18 13:09:52 +00:00
}
// open a output thread
thread outputThread;
outputThread = thread(&Positions::outputFunction, positions, outputFileName);
// main function, initially 2 load loadingBlockSize (2,000,000) bp of reference, set reloadPos to 1 loadingBlockSize, then load SAM data.
// when the samPos larger than the reloadPos load 1 loadingBlockSize bp of reference.
// when the samChromosome is different to current chromosome, finish all sam position and output all.
ifstream inputFile;
istream *alignmentFile = &cin;
if (!standardInMode) {
inputFile.open(alignmentFileName, ios_base::in);
alignmentFile = &inputFile;
}
long long int samPos; // the position of current SAM line.
long long int reloadPos; // the position in reference that we need to reload.
long long int lastPos = 0; // the position on last SAM line. compare lastPos with samPos to make sure the SAM is sorted.
while (alignmentFile->good()) {
2025-01-19 09:06:28 +00:00
string line;
string samChromosome;
if (!getline(*alignmentFile, line)) {
2025-01-18 13:09:52 +00:00
break;
}
2025-01-19 09:06:28 +00:00
if (line.empty() || line.front() == '@') {
2025-01-18 13:09:52 +00:00
continue;
}
2025-01-19 09:06:28 +00:00
2025-01-18 13:09:52 +00:00
// if the SAM line is empty or unmapped, get the next SAM line.
2025-01-19 09:06:28 +00:00
if (!getSAMChromosomePos(&line, samChromosome, samPos)) {
2025-01-18 13:09:52 +00:00
continue;
}
// if the samChromosome is different than current positions' chromosome, finish all SAM line.
// then load a new reference chromosome.
2025-01-19 10:11:37 +00:00
auto old = positions->chromosome;
if (samChromosome != (old ? *old : "")) {
2025-01-18 13:09:52 +00:00
positions->appendingFinished();
positions->moveAllToOutput();
2025-01-19 09:06:28 +00:00
positions->loadNewChromosome(std::move(samChromosome));
2025-01-18 13:09:52 +00:00
reloadPos = loadingBlockSize;
lastPos = 0;
}
// if the samPos is larger than reloadPos, load 1 loadingBlockSize bp in from reference.
while (samPos > reloadPos) {
positions->appendingFinished();
positions->moveBlockToOutput();
positions->loadMore();
reloadPos += loadingBlockSize;
}
if (lastPos > samPos) {
cerr << "The input alignment file is not sorted. Please use sorted SAM file as alignment file." << endl;
throw 1;
}
2025-01-19 09:06:28 +00:00
positions->linePool.send(std::move(line));
2025-01-18 13:09:52 +00:00
lastPos = samPos;
}
//}
if (!standardInMode) {
inputFile.close();
}
2025-01-19 09:06:28 +00:00
positions->linePool.close();
for (int i = 0; i < nThreads; i++){
workers[i].join();
2025-01-18 13:09:52 +00:00
}
2025-01-19 09:06:28 +00:00
2025-01-18 13:09:52 +00:00
// make sure all workers finished their appending work.
positions->appendingFinished();
// move all position to outputPool
positions->moveAllToOutput();
// wait until outputPool is empty
2025-01-19 09:06:28 +00:00
positions->outputPositionPool.close();
2025-01-18 13:09:52 +00:00
outputThread.join();
delete positions;
return 0;
}
int main(int argc, const char** argv)
{
int ret = 0;
try {
parseOptions(argc, argv);
ret = hisat_3n_table();
} catch(std::exception& e) {
cerr << "Error: Encountered exception: '" << e.what() << "'" << endl;
cerr << "Command: ";
for(int i = 0; i < argc; i++) cerr << argv[i] << " ";
cerr << endl;
return 1;
} catch(int e) {
if (e != 0) {
cerr << "Error: Encountered internal HISAT-3N exception (#" << e << ")" << endl;
cerr << "Command: ";
for(int i = 0; i < argc; i++) cerr << argv[i] << " ";
cerr << endl;
}
return e;
}
return ret;
}