hisat-3n/hisat2.cpp
2025-01-18 21:09:52 +08:00

4979 lines
199 KiB
C++
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* Copyright 2015, Daehwan Kim <infphilo@gmail.com>
*
* This file is part of HISAT 2.
* This file is edited by Yun (Leo) Zhang for HISAT-3N.
*
* HISAT 2 is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* HISAT 2 is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with HISAT 2. If not, see <http://www.gnu.org/licenses/>.
*/
#include <stdlib.h>
#include <iostream>
#include <fstream>
#include <string>
#include <cassert>
#include <stdexcept>
#include <getopt.h>
#include <math.h>
#include <utility>
#include <limits>
#include "alphabet.h"
#include "assert_helpers.h"
#include "endian_swap.h"
#include "hgfm.h"
#include "rfm.h"
#include "formats.h"
#include "sequence_io.h"
#include "tokenize.h"
#include "aln_sink.h"
#include "pat.h"
#include "threading.h"
#include "ds.h"
#include "aligner_metrics.h"
#include "sam.h"
#include "aligner_seed.h"
#include "splice_site.h"
#include "spliced_aligner.h"
#include "aligner_seed_policy.h"
#include "aligner_sw.h"
#include "aligner_sw_driver.h"
#include "aligner_cache.h"
#include "util.h"
#include "pe.h"
#include "tp.h"
#include "gp.h"
#include "simple_func.h"
#include "presets.h"
#include "opts.h"
#include "outq.h"
#include "repeat_kmer.h"
#include "hisat2lib/ht2.h"
//#include "utility_3n.h"
using namespace std;
// ---------------------------------------------------------------------------
// Process-wide state: read input, verbosity, metrics, threading and
// paired-end options. See resetOptions() for the default values.
// Names with a leading 'g' are non-static; the rest are private to this file.
// ---------------------------------------------------------------------------
MemoryTally gMemTally;
static EList<string> mates1; // mated reads (first mate)
static EList<string> mates2; // mated reads (second mate)
static EList<string> mates12; // mated reads (1st/2nd interleaved in 1 file)
static string adjIdxBase; // NOTE(review): presumably the adjusted index basename -- confirm against its users
static string adjIdxBases_3N[2]; // NOTE(review): presumably one adjusted basename per 3N index -- confirm
bool gColor; // colorspace (not supported)
int gVerbose; // be talkative
static bool startVerbose; // be talkative at startup
int gQuiet; // print nothing but the alignments
static int sanityCheck; // enable expensive sanity checks
static int format; // read format; default is FASTQ
static string origString; // reference text, or filename(s)
static int seed; // srandom() seed
static int timing; // whether to report basic timing data
static int metricsIval; // interval between alignment metrics messages (0 = no messages)
static string metricsFile;// output file to put alignment metrics in
static bool metricsStderr;// print metrics to stderr (in addition to --metrics-file if it's specified)
static bool metricsPerRead; // report a metrics tuple for every read
static bool allHits; // for multihits, report just one
static bool showVersion; // just print version and quit?
static int ipause; // pause before matching? (int because getopt stores into it via long_options)
static uint32_t qUpto; // max # of queries to read
int gTrim5; // amount to trim from 5' end
int gTrim3; // amount to trim from 3' end
static int offRate; // -1 keeps the default offRate
static bool solexaQuals; // quality strings are solexa quals, not phred, and subtract 64 (not 33)
static bool phred64Quals; // quality chars are phred, but must subtract 64 (not 33)
static bool integerQuals; // quality strings are space-separated strings of integers, not ASCII
static int nthreads; // number of pthreads operating concurrently
static int outType; // style of output
static bool noRefNames; // true -> print reference indexes; not names
static uint32_t khits; // number of hits per read; >1 is much slower
static uint32_t mhits; // don't report any hits if there are > mhits
static int partitionSz; // output a partitioning key in first field
static bool useSpinlock; // false -> don't use spinlocks even if they're #defined
static bool fileParallel; // separate threads read separate input files in parallel
static bool useShmem; // use shared memory to hold the index
static bool useMm; // use memory-mapped files to hold the index
static bool mmSweep; // sweep through memory-mapped files immediately after mapping
int gMinInsert; // minimum insert size
int gMaxInsert; // maximum insert size
bool gMate1fw; // -1 mate aligns in fw orientation on fw strand
bool gMate2fw; // -2 mate aligns in rc orientation on fw strand
bool gFlippedMatesOK; // allow mates to be in wrong order
bool gDovetailMatesOK; // allow one mate to extend off the end of the other
bool gContainMatesOK; // allow one mate to contain the other in PE alignment
bool gOlapMatesOK; // allow mates to overlap in PE alignment
bool gExpandToFrag; // incr max frag length to =larger mate len if necessary
bool gReportDiscordant; // find and report discordant paired-end alignments
bool gReportMixed; // find and report unpaired alignments for paired reads
static uint32_t cacheLimit; // ranges w/ size > limit will be cached
static uint32_t cacheSize; // # words per range cache
static uint32_t skipReads; // # reads/read pairs to skip
bool gNofw; // don't align fw orientation of read
bool gNorc; // don't align rc orientation of read
static uint32_t fastaContLen; // NOTE(review): undocumented in this file; reset to 0
static uint32_t fastaContFreq; // NOTE(review): undocumented in this file; reset to 0
static bool hadoopOut; // print Hadoop status and summary messages
static bool fuzzy; // reads will have alternate basecalls w/ qualities
static bool fullRef; // print entire reference name instead of just up to 1st space
// ---------------------------------------------------------------------------
// SAM output controls. See resetOptions() for the default values.
// NOTE(review): each sam_print_<tag> flag presumably toggles emission of the
// optional SAM field named by its suffix -- confirm against the SAM writer.
// ---------------------------------------------------------------------------
static bool samTruncQname; // whether to truncate QNAME to 255 chars
static bool samOmitSecSeqQual; // omit SEQ/QUAL for 2ndary alignments?
static bool samNoUnal; // don't print records for unaligned reads
static bool samNoHead; // don't print any header lines in SAM output
static bool samNoSQ; // don't print @SQ header lines
static bool sam_print_as;
static bool sam_print_xs; // XS:i
static bool sam_print_xss; // Xs:i and Ys:i
static bool sam_print_yn; // YN:i and Yn:i
static bool sam_print_xn;
static bool sam_print_cs;
static bool sam_print_cq;
static bool sam_print_x0;
static bool sam_print_x1;
static bool sam_print_xm;
static bool sam_print_xo;
static bool sam_print_xg;
static bool sam_print_nm;
static bool sam_print_md;
static bool sam_print_yf;
static bool sam_print_yi;
static bool sam_print_ym;
static bool sam_print_yp;
static bool sam_print_yt;
static bool sam_print_ys;
static bool sam_print_zs;
static bool sam_print_xr;
static bool sam_print_xt;
static bool sam_print_xd;
static bool sam_print_xu;
static bool sam_print_yl;
static bool sam_print_ye;
static bool sam_print_yu;
static bool sam_print_xp;
static bool sam_print_yr;
static bool sam_print_zb;
static bool sam_print_zr;
static bool sam_print_zf;
static bool sam_print_zm;
static bool sam_print_zi;
static bool sam_print_zp;
static bool sam_print_zu;
static bool sam_print_xs_a;
static bool sam_print_nh;
static bool bwaSwLike; // set by --bwa-sw-like (see long_options)
static float bwaSwLikeC; // NOTE(review): presumably a BWA-SW-style constant; default 5.5 -- confirm meaning
static float bwaSwLikeT; // NOTE(review): presumably a BWA-SW-style threshold; default 20.0 -- confirm meaning
static bool qcFilter; // false -> don't believe upstream qc
static bool sortByScore; // prioritize alignments to report by score?
bool gReportOverhangs; // false -> filter out alignments that fall off the end of a reference sequence
static string rgid; // ID: setting for @RG header line
static string rgs; // SAM outputs for @RG header line
static string rgs_optflag; // SAM optional flag to add corresponding to @RG ID
static bool msample; // whether to report a random alignment when maxed-out via -m/-M
// ---------------------------------------------------------------------------
// Scoring scheme, seed search and dynamic-programming limits.
// See resetOptions() for the default values.
// ---------------------------------------------------------------------------
int gGapBarrier; // # diags on top/bot only to be entered diagonally
static EList<string> qualities;
static EList<string> qualities1;
static EList<string> qualities2;
static string polstr; // temporary holder for policy string
static bool msNoCache; // true -> disable local cache
static int bonusMatchType; // how to reward matches
static int bonusMatch; // constant reward if bonusMatchType=constant
static int penMmcType; // how to penalize mismatches
int penMmcMax; // max mm penalty
static int penMmcMin; // min mm penalty
static int penScMax; // max sc penalty
static int penScMin; // min sc penalty
static int penNType; // how to penalize Ns in the read
static int penN; // constant if N penalty is a constant
static bool penNCatPair; // concatenate mates before N filtering?
static bool localAlign; // do local alignment in DP steps
static bool noisyHpolymer; // set to true if gap penalties should be reduced to be consistent with a sequencer that under- and overcalls homopolymers
static int penRdGapConst; // constant cost of extending a gap in the read
static int penRfGapConst; // constant cost of extending a gap in the reference
static int penRdGapLinear; // coeff of linear term for cost of gap extension in read
static int penRfGapLinear; // coeff of linear term for cost of gap extension in ref
SimpleFunc scoreMin; // minimum valid score as function of read len
static SimpleFunc nCeil; // max # Ns allowed as function of read len
static SimpleFunc msIval; // interval between seeds as function of read len
static double descConsExp; // how to adjust score minimum as we descend further into index-assisted alignment
static size_t descentLanding; // don't place a search root if it's within this many positions of end
static SimpleFunc descentTotSz; // maximum space a DescentDriver can use in bytes
static SimpleFunc descentTotFmops; // maximum # FM ops a DescentDriver can perform
static int multiseedMms; // mismatches permitted in a multiseed seed
static int multiseedLen; // length of multiseed seeds
static size_t multiseedOff; // offset to begin extracting seeds
static uint32_t seedCacheLocalMB; // # MB to use for non-shared seed alignment caching
static uint32_t seedCacheCurrentMB; // # MB to use for current-read seed hit caching
static uint32_t exactCacheCurrentMB; // # MB to use for current-read seed hit caching (NOTE(review): comment duplicates seedCacheCurrentMB; presumably the exact-match cache -- confirm)
static size_t maxhalf; // max width on one side of DP table
static bool seedSumm; // print summary information about seed hits, not alignments
static bool doUngapped; // do ungapped alignment
static size_t maxIters; // stop after this many extend loop iterations
static size_t maxUg; // stop after this many ungap extends
static size_t maxDp; // stop after this many DPs
static size_t maxItersIncr; // amt to add to maxIters for each -k > 1
static size_t maxEeStreak; // stop after this many end-to-end fails in a row
static size_t maxUgStreak; // stop after this many ungap fails in a row
static size_t maxDpStreak; // stop after this many dp fails in a row
static size_t maxStreakIncr; // amt to add to streak for each -k > 1
static size_t maxMateStreak; // stop seed range after this many mate-find fails
static bool doExtend; // extend seed hits
static bool enable8; // use 8-bit SSE where possible?
static size_t cminlen; // longer reads use checkpointing
static size_t cpow2; // checkpoint interval log2
static bool doTri; // do triangular mini-fills?
static string defaultPreset; // default preset; applied immediately
static bool ignoreQuals; // all mms incur same penalty, regardless of qual
static string wrapper; // type of wrapper script, so we can print correct usage
static EList<string> queries; // list of query files
static string outfile; // write SAM output to this file
static int mapqv; // MAPQ calculation version
static int tighten; // -M tighten mode (0=none, 1=best, 2=secbest+1); NOTE(review): resetOptions() sets 3, which is not among the listed modes -- confirm
static bool doExactUpFront; // do exact search up front if seeds seem good enough
static bool do1mmUpFront; // do 1mm search up front if seeds seem good enough
static size_t do1mmMinLen; // length below which we disable 1mm e2e search
static int seedBoostThresh; // if average non-zero position has more than this many elements
static size_t maxSeeds; // maximum number of seeds allowed
static size_t nSeedRounds; // # seed rounds
static bool reorder; // true -> reorder SAM recs in -p mode
static float sampleFrac; // only align random fraction of input reads
static bool arbitraryRandom; // pseudo-randoms no longer a function of read properties
static bool bowtie2p5;
// ---------------------------------------------------------------------------
// Splice-aware alignment, transcriptome, index and repeat options.
// See resetOptions() for the default values.
// ---------------------------------------------------------------------------
static bool useTempSpliceSite;
static int penCanSplice; // NOTE(review): presumably penalty for a canonical splice site -- confirm
static int penNoncanSplice; // NOTE(review): presumably penalty for a non-canonical splice site -- confirm
static int penConflictSplice;
static SimpleFunc penCanIntronLen;
static SimpleFunc penNoncanIntronLen;
static size_t minIntronLen;
static size_t maxIntronLen;
static string knownSpliceSiteInfile; // NOTE(review): presumably input path for known splice sites -- confirm
static string novelSpliceSiteInfile; // NOTE(review): presumably input path for novel splice sites -- confirm
static string novelSpliceSiteOutfile; // NOTE(review): presumably output path for novel splice sites -- confirm
static bool secondary; // allow secondary alignments
static bool no_spliced_alignment;
static int rna_strandness; // defaults to RNA_STRANDNESS_UNKNOWN
static bool splicesite_db_only; //
static bool anchorStop;
static bool pseudogeneStop;
static bool tranMapOnly; // transcriptome mapping only
static bool tranAssm; // alignments selected for downstream transcript assembly such as StringTie and Cufflinks
static string tranAssm_program;
static bool avoid_pseudogene;
#ifdef USE_SRA
static EList<string> sra_accs; // only present when built with USE_SRA
#endif
static string bt2indexs[2]; // read Bowtie 2 index from files with this prefix
static EList<pair<int, string> > extra_opts;
static size_t extra_opts_cur;
static EList<uint64_t> thread_rids;
static MUTEX_T thread_rids_mutex; // NOTE(review): presumably guards thread_rids -- confirm at use sites
static uint64_t thread_rids_mindist;
static bool rmChrName; // remove "chr" from reference names (e.g., chr18 to 18)
static bool addChrName; // add "chr" to reference names (e.g., 18 to chr18)
static size_t max_alts_tried;
static bool use_haplotype;
static bool enable_codis;
static bool templateLenAdjustment;
static string alignSumFile; // write alignment summary stat. to this file
static bool newAlignSummary;
static int bowtie2_dp; // Bowtie2's dynamic programming alignment (0: no dynamic programming, 1: conditional dynamic programming, and 2: unconditional dynamic programming)
static bool fast; // --fast
static bool sensitive; // --sensitive
static bool very_sensitive; // --very-sensitive
static bool repeat; // true iff alignments to repeat sequences are directly reported
static bool use_repeat_index;
static EList<size_t> readLens;
// ---------------------------------------------------------------------------
// HISAT-3N variables. These are non-static, so they are visible to other
// translation units.
// ---------------------------------------------------------------------------
bool threeN = false; // indicator for 3N mode.
bool base_change_entered; // set true once the user has used --base-change
char usrInput_convertedFrom; // user-supplied "converted from" nucleotide: the base that the sample preparation protocol replaces. Used in the sequence comparison step of HISAT-3N.
char usrInput_convertedTo; // user-supplied "converted to" nucleotide: the base it is replaced with by the sample preparation protocol. Used in the sequence comparison step of HISAT-3N.
char usrInput_convertedFromComplement; // the complement of usrInput_convertedFrom. Used in the sequence comparison step of HISAT-3N.
char usrInput_convertedToComplement; // the complement of usrInput_convertedTo. Used in the sequence comparison step of HISAT-3N.
char hs3N_convertedFrom; // the actual "converted from" base used by HISAT-3N on the + strand.
char hs3N_convertedTo; // the actual "converted to" base used by HISAT-3N on the + strand.
char hs3N_convertedFromComplement; // the complement of hs3N_convertedFrom; used on the - strand.
char hs3N_convertedToComplement; // the complement of hs3N_convertedTo; used on the - strand.
string threeN_indexTags[2]; // both entries default to ".3n." in resetOptions()
vector<ht2_handle_t> repeatHandles; // the 2 repeat handles help expand the repeat alignment information. 0 for + strand. 1 for - strand.
struct ht2_index_getrefnames_result *refNameMap; // chromosome names and their indexes for repeat alignment.
int repeatLimit; // expand up to repeatLimit qualified positions in a repeat alignment.
bool uniqueOutputOnly; // only output the unique alignment result.
int nMappingCycle; // =1 for standard HISAT2, =4 for HISAT-3N
bool mappingCycles[4]; // this array indicates which mapping cycles will be run
int directional3NMapping; // =0 for non-directional mapping, =1 for directional mapping and read1/single-end map to fw reference, =2 for reverse directional mapping and read1/single-end map to rc reference.
// Largest finite double. Passed to several SimpleFunc::init() calls in
// resetOptions() (NOTE(review): presumably as an "unbounded" maximum -- confirm
// against SimpleFunc).
#define DMAX std::numeric_limits<double>::max()
static void resetOptions() {
mates1.clear();
mates2.clear();
mates12.clear();
adjIdxBase = "";
adjIdxBases_3N[0] = "";
adjIdxBases_3N[1] = "";
gColor = false;
gVerbose = 0;
startVerbose = 0;
gQuiet = false;
sanityCheck = 0; // enable expensive sanity checks
format = FASTQ; // default read format is FASTQ
origString = ""; // reference text, or filename(s)
seed = 0; // srandom() seed
timing = 0; // whether to report basic timing data
metricsIval = 1; // interval between alignment metrics messages (0 = no messages)
metricsFile = ""; // output file to put alignment metrics in
metricsStderr = false; // print metrics to stderr (in addition to --metrics-file if it's specified
metricsPerRead = false; // report a metrics tuple for every read?
allHits = false; // for multihits, report just one
showVersion = false; // just print version and quit?
ipause = 0; // pause before maching?
qUpto = 0xffffffff; // max # of queries to read
gTrim5 = 0; // amount to trim from 5' end
gTrim3 = 0; // amount to trim from 3' end
offRate = -1; // keep default offRate
solexaQuals = false; // quality strings are solexa quals, not phred, and subtract 64 (not 33)
phred64Quals = false; // quality chars are phred, but must subtract 64 (not 33)
integerQuals = false; // quality strings are space-separated strings of integers, not ASCII
nthreads = 1; // number of pthreads operating concurrently
outType = OUTPUT_SAM; // style of output
noRefNames = false; // true -> print reference indexes; not names
khits = 10; // number of hits per read; >1 is much slower
mhits = 0; // stop after finding this many alignments+1
partitionSz = 0; // output a partitioning key in first field
useSpinlock = true; // false -> don't use of spinlocks even if they're #defines
fileParallel = false; // separate threads read separate input files in parallel
useShmem = false; // use shared memory to hold the index
useMm = false; // use memory-mapped files to hold the index
mmSweep = false; // sweep through memory-mapped files immediately after mapping
gMinInsert = 0; // minimum insert size
gMaxInsert = 1000; // maximum insert size
gMate1fw = true; // -1 mate aligns in fw orientation on fw strand
gMate2fw = false; // -2 mate aligns in rc orientation on fw strand
gFlippedMatesOK = false; // allow mates to be in wrong order
gDovetailMatesOK = false; // allow one mate to extend off the end of the other
gContainMatesOK = true; // allow one mate to contain the other in PE alignment
gOlapMatesOK = true; // allow mates to overlap in PE alignment
gExpandToFrag = true; // incr max frag length to =larger mate len if necessary
gReportDiscordant = true; // find and report discordant paired-end alignments
gReportMixed = true; // find and report unpaired alignments for paired reads
cacheLimit = 5; // ranges w/ size > limit will be cached
cacheSize = 0; // # words per range cache
skipReads = 0; // # reads/read pairs to skip
gNofw = false; // don't align fw orientation of read
gNorc = false; // don't align rc orientation of read
fastaContLen = 0;
fastaContFreq = 0;
hadoopOut = false; // print Hadoop status and summary messages
fuzzy = false; // reads will have alternate basecalls w/ qualities
fullRef = false; // print entire reference name instead of just up to 1st space
samTruncQname = true; // whether to truncate QNAME to 255 chars
samOmitSecSeqQual = false; // omit SEQ/QUAL for 2ndary alignments?
samNoUnal = false; // omit SAM records for unaligned reads
samNoHead = false; // don't print any header lines in SAM output
samNoSQ = false; // don't print @SQ header lines
sam_print_as = true;
sam_print_xs = true;
sam_print_xss = false; // Xs:i and Ys:i
sam_print_yn = false; // YN:i and Yn:i
sam_print_xn = true;
sam_print_cs = false;
sam_print_cq = false;
sam_print_x0 = true;
sam_print_x1 = true;
sam_print_xm = true;
sam_print_xo = true;
sam_print_xg = true;
sam_print_nm = true;
sam_print_md = true;
sam_print_yf = true;
sam_print_yi = false;
sam_print_ym = false;
sam_print_yp = false;
sam_print_yt = true;
sam_print_ys = true;
sam_print_zs = false;
sam_print_xr = false;
sam_print_xt = false;
sam_print_xd = false;
sam_print_xu = false;
sam_print_yl = false;
sam_print_ye = false;
sam_print_yu = false;
sam_print_xp = false;
sam_print_yr = false;
sam_print_zb = false;
sam_print_zr = false;
sam_print_zf = false;
sam_print_zm = false;
sam_print_zi = false;
sam_print_zp = false;
sam_print_zu = false;
sam_print_xs_a = true;
sam_print_nh = true;
bwaSwLike = false;
bwaSwLikeC = 5.5f;
bwaSwLikeT = 20.0f;
qcFilter = false; // don't believe upstream qc by default
sortByScore = true; // prioritize alignments to report by score?
rgid = ""; // SAM outputs for @RG header line
rgs = ""; // SAM outputs for @RG header line
rgs_optflag = ""; // SAM optional flag to add corresponding to @RG ID
msample = true;
gGapBarrier = 4; // disallow gaps within this many chars of either end of alignment
qualities.clear();
qualities1.clear();
qualities2.clear();
polstr.clear();
msNoCache = true; // true -> disable local cache
bonusMatchType = DEFAULT_MATCH_BONUS_TYPE;
bonusMatch = DEFAULT_MATCH_BONUS;
penMmcType = DEFAULT_MM_PENALTY_TYPE;
penMmcMax = DEFAULT_MM_PENALTY_MAX;
penMmcMin = DEFAULT_MM_PENALTY_MIN;
penScMax = DEFAULT_SC_PENALTY_MAX;
penScMin = DEFAULT_SC_PENALTY_MIN;
penNType = DEFAULT_N_PENALTY_TYPE;
penN = DEFAULT_N_PENALTY;
penNCatPair = DEFAULT_N_CAT_PAIR; // concatenate mates before N filtering?
localAlign = false; // do local alignment in DP steps
noisyHpolymer = false;
penRdGapConst = DEFAULT_READ_GAP_CONST;
penRfGapConst = DEFAULT_REF_GAP_CONST;
penRdGapLinear = DEFAULT_READ_GAP_LINEAR;
penRfGapLinear = DEFAULT_REF_GAP_LINEAR;
scoreMin.init (SIMPLE_FUNC_LINEAR, 0.0f, -0.2f);
// scoreMin.init (SIMPLE_FUNC_CONST, -18, 0);
nCeil.init (SIMPLE_FUNC_LINEAR, 0.0f, DMAX, 2.0f, 0.1f);
msIval.init (SIMPLE_FUNC_LINEAR, 1.0f, DMAX, DEFAULT_IVAL_B, DEFAULT_IVAL_A);
descConsExp = 2.0;
descentLanding = 20;
descentTotSz.init(SIMPLE_FUNC_LINEAR, 1024.0, DMAX, 0.0, 1024.0);
descentTotFmops.init(SIMPLE_FUNC_LINEAR, 100.0, DMAX, 0.0, 10.0);
multiseedMms = DEFAULT_SEEDMMS;
multiseedLen = DEFAULT_SEEDLEN;
multiseedOff = 0;
seedCacheLocalMB = 32; // # MB to use for non-shared seed alignment cacheing
seedCacheCurrentMB = 20; // # MB to use for current-read seed hit cacheing
exactCacheCurrentMB = 20; // # MB to use for current-read seed hit cacheing
maxhalf = 15; // max width on one side of DP table
seedSumm = false; // print summary information about seed hits, not alignments
doUngapped = true; // do ungapped alignment
maxIters = 400; // max iterations of extend loop
maxUg = 300; // stop after this many ungap extends
maxDp = 300; // stop after this many dp extends
maxItersIncr = 20; // amt to add to maxIters for each -k > 1
maxEeStreak = 15; // stop after this many end-to-end fails in a row
maxUgStreak = 15; // stop after this many ungap fails in a row
maxDpStreak = 15; // stop after this many dp fails in a row
maxStreakIncr = 10; // amt to add to streak for each -k > 1
maxMateStreak = 10; // in PE: abort seed range after N mate-find fails
doExtend = true; // do seed extensions
enable8 = true; // use 8-bit SSE where possible?
cminlen = 2000; // longer reads use checkpointing
cpow2 = 4; // checkpoint interval log2
doTri = false; // do triangular mini-fills?
defaultPreset = "sensitive%LOCAL%"; // default preset; applied immediately
extra_opts.clear();
extra_opts_cur = 0;
bt2indexs[0].clear(); // read Bowtie 2 index from files with this prefix
bt2indexs[1].clear();
ignoreQuals = false; // all mms incur same penalty, regardless of qual
wrapper.clear(); // type of wrapper script, so we can print correct usage
queries.clear(); // list of query files
outfile.clear(); // write SAM output to this file
mapqv = 2; // MAPQ calculation version
tighten = 3; // -M tightening mode
doExactUpFront = true; // do exact search up front if seeds seem good enough
do1mmUpFront = true; // do 1mm search up front if seeds seem good enough
seedBoostThresh = 300; // if average non-zero position has more than this many elements
nSeedRounds = 2; // # rounds of seed searches to do for repetitive reads
maxSeeds = 0; // maximum number of seeds allowed
do1mmMinLen = 60; // length below which we disable 1mm search
reorder = false; // reorder SAM records with -p > 1
sampleFrac = 1.1f; // align all reads
arbitraryRandom = false; // let pseudo-random seeds be a function of read properties
bowtie2p5 = false;
useTempSpliceSite = true;
penCanSplice = 0;
penNoncanSplice = 12;
penConflictSplice = 1000000;
penCanIntronLen.init(SIMPLE_FUNC_LOG, -8, 1);
penNoncanIntronLen.init(SIMPLE_FUNC_LOG, -8, 1);
minIntronLen = 20;
maxIntronLen = 500000;
knownSpliceSiteInfile = "";
novelSpliceSiteInfile = "";
novelSpliceSiteOutfile = "";
secondary = false; // allow secondary alignments
no_spliced_alignment = false;
rna_strandness = RNA_STRANDNESS_UNKNOWN;
splicesite_db_only = false;
anchorStop = true;
pseudogeneStop = true;
tranMapOnly = false;
tranAssm = false;
tranAssm_program = "";
avoid_pseudogene = false;
#ifdef USE_SRA
sra_accs.clear();
#endif
rmChrName = false;
addChrName = false;
max_alts_tried = 16;
use_haplotype = false;
enable_codis = false;
templateLenAdjustment = true;
alignSumFile = "";
newAlignSummary = false;
bowtie2_dp = 0; // disable Bowtie2's dynamic programming alignment
fast = false;
sensitive = false;
very_sensitive = false;
repeat = false; // true iff alignments to repeat sequences are directly reported.
use_repeat_index = true;
readLens.clear();
refNameMap = NULL;
threeN = false;
repeatLimit = 1000;
uniqueOutputOnly = false;
base_change_entered = false;
threeN_indexTags[0] = ".3n.";
threeN_indexTags[1] = ".3n.";
nMappingCycle = 1;
directional3NMapping = 0;
for (int i = 0; i < 4; i++){
mappingCycles[i] = false;
}
}
// Short (single-letter) option specification in getopt(3) syntax: a letter
// followed by ':' takes a required argument, a bare letter is a flag.
// NOTE(review): presumably handed to getopt_long() together with long_options;
// the call site is outside this part of the file -- confirm.
static const char *short_options = "fF:qbzhcu:rv:s:aP:t3:5:w:p:k:M:1:2:I:X:CQ:N:i:L:U:x:S:g:O:D:R:";
static struct option long_options[] = {
{(char*)"verbose", no_argument, 0, ARG_VERBOSE},
{(char*)"startverbose", no_argument, 0, ARG_STARTVERBOSE},
{(char*)"quiet", no_argument, 0, ARG_QUIET},
{(char*)"sanity", no_argument, 0, ARG_SANITY},
{(char*)"pause", no_argument, &ipause, 1},
{(char*)"orig", required_argument, 0, ARG_ORIG},
{(char*)"all", no_argument, 0, 'a'},
{(char*)"solexa-quals", no_argument, 0, ARG_SOLEXA_QUALS},
{(char*)"integer-quals",no_argument, 0, ARG_INTEGER_QUALS},
{(char*)"int-quals", no_argument, 0, ARG_INTEGER_QUALS},
{(char*)"metrics", required_argument, 0, ARG_METRIC_IVAL},
{(char*)"metrics-file", required_argument, 0, ARG_METRIC_FILE},
{(char*)"metrics-stderr",no_argument, 0, ARG_METRIC_STDERR},
{(char*)"metrics-per-read", no_argument, 0, ARG_METRIC_PER_READ},
{(char*)"met-read", no_argument, 0, ARG_METRIC_PER_READ},
{(char*)"met", required_argument, 0, ARG_METRIC_IVAL},
{(char*)"met-file", required_argument, 0, ARG_METRIC_FILE},
{(char*)"met-stderr", no_argument, 0, ARG_METRIC_STDERR},
{(char*)"time", no_argument, 0, 't'},
{(char*)"trim3", required_argument, 0, '3'},
{(char*)"trim5", required_argument, 0, '5'},
{(char*)"seed", required_argument, 0, ARG_SEED},
{(char*)"qupto", required_argument, 0, 'u'},
{(char*)"upto", required_argument, 0, 'u'},
{(char*)"version", no_argument, 0, ARG_VERSION},
{(char*)"filepar", no_argument, 0, ARG_FILEPAR},
{(char*)"help", no_argument, 0, 'h'},
{(char*)"threads", required_argument, 0, 'p'},
{(char*)"khits", required_argument, 0, 'k'},
{(char*)"minins", required_argument, 0, 'I'},
{(char*)"maxins", required_argument, 0, 'X'},
{(char*)"quals", required_argument, 0, 'Q'},
{(char*)"Q1", required_argument, 0, ARG_QUALS1},
{(char*)"Q2", required_argument, 0, ARG_QUALS2},
{(char*)"refidx", no_argument, 0, ARG_REFIDX},
{(char*)"partition", required_argument, 0, ARG_PARTITION},
{(char*)"ff", no_argument, 0, ARG_FF},
{(char*)"fr", no_argument, 0, ARG_FR},
{(char*)"rf", no_argument, 0, ARG_RF},
{(char*)"cachelim", required_argument, 0, ARG_CACHE_LIM},
{(char*)"cachesz", required_argument, 0, ARG_CACHE_SZ},
{(char*)"nofw", no_argument, 0, ARG_NO_FW},
{(char*)"norc", no_argument, 0, ARG_NO_RC},
{(char*)"skip", required_argument, 0, 's'},
{(char*)"12", required_argument, 0, ARG_ONETWO},
{(char*)"tab5", required_argument, 0, ARG_TAB5},
{(char*)"tab6", required_argument, 0, ARG_TAB6},
{(char*)"phred33-quals", no_argument, 0, ARG_PHRED33},
{(char*)"phred64-quals", no_argument, 0, ARG_PHRED64},
{(char*)"phred33", no_argument, 0, ARG_PHRED33},
{(char*)"phred64", no_argument, 0, ARG_PHRED64},
{(char*)"solexa1.3-quals", no_argument, 0, ARG_PHRED64},
{(char*)"mm", no_argument, 0, ARG_MM},
{(char*)"shmem", no_argument, 0, ARG_SHMEM},
{(char*)"mmsweep", no_argument, 0, ARG_MMSWEEP},
{(char*)"hadoopout", no_argument, 0, ARG_HADOOPOUT},
{(char*)"fuzzy", no_argument, 0, ARG_FUZZY},
{(char*)"fullref", no_argument, 0, ARG_FULLREF},
{(char*)"usage", no_argument, 0, ARG_USAGE},
{(char*)"sam-no-qname-trunc", no_argument, 0, ARG_SAM_NO_QNAME_TRUNC},
{(char*)"sam-omit-sec-seq", no_argument, 0, ARG_SAM_OMIT_SEC_SEQ},
{(char*)"omit-sec-seq", no_argument, 0, ARG_SAM_OMIT_SEC_SEQ},
{(char*)"sam-no-head", no_argument, 0, ARG_SAM_NOHEAD},
{(char*)"sam-nohead", no_argument, 0, ARG_SAM_NOHEAD},
{(char*)"sam-noHD", no_argument, 0, ARG_SAM_NOHEAD},
{(char*)"sam-no-hd", no_argument, 0, ARG_SAM_NOHEAD},
{(char*)"sam-nosq", no_argument, 0, ARG_SAM_NOSQ},
{(char*)"sam-no-sq", no_argument, 0, ARG_SAM_NOSQ},
{(char*)"sam-noSQ", no_argument, 0, ARG_SAM_NOSQ},
{(char*)"no-head", no_argument, 0, ARG_SAM_NOHEAD},
{(char*)"no-hd", no_argument, 0, ARG_SAM_NOHEAD},
{(char*)"no-sq", no_argument, 0, ARG_SAM_NOSQ},
{(char*)"no-HD", no_argument, 0, ARG_SAM_NOHEAD},
{(char*)"no-SQ", no_argument, 0, ARG_SAM_NOSQ},
{(char*)"no-unal", no_argument, 0, ARG_SAM_NO_UNAL},
{(char*)"color", no_argument, 0, 'C'},
{(char*)"sam-RG", required_argument, 0, ARG_SAM_RG},
{(char*)"sam-rg", required_argument, 0, ARG_SAM_RG},
{(char*)"sam-rg-id", required_argument, 0, ARG_SAM_RGID},
{(char*)"RG", required_argument, 0, ARG_SAM_RG},
{(char*)"rg", required_argument, 0, ARG_SAM_RG},
{(char*)"rg-id", required_argument, 0, ARG_SAM_RGID},
{(char*)"snpphred", required_argument, 0, ARG_SNPPHRED},
{(char*)"snpfrac", required_argument, 0, ARG_SNPFRAC},
{(char*)"gbar", required_argument, 0, ARG_GAP_BAR},
{(char*)"qseq", no_argument, 0, ARG_QSEQ},
{(char*)"policy", required_argument, 0, ARG_ALIGN_POLICY},
{(char*)"preset", required_argument, 0, 'P'},
{(char*)"seed-summ", no_argument, 0, ARG_SEED_SUMM},
{(char*)"seed-summary", no_argument, 0, ARG_SEED_SUMM},
{(char*)"overhang", no_argument, 0, ARG_OVERHANG},
{(char*)"no-cache", no_argument, 0, ARG_NO_CACHE},
{(char*)"cache", no_argument, 0, ARG_USE_CACHE},
{(char*)"454", no_argument, 0, ARG_NOISY_HPOLY},
{(char*)"ion-torrent", no_argument, 0, ARG_NOISY_HPOLY},
{(char*)"no-mixed", no_argument, 0, ARG_NO_MIXED},
{(char*)"no-discordant",no_argument, 0, ARG_NO_DISCORDANT},
// {(char*)"local", no_argument, 0, ARG_LOCAL},
{(char*)"end-to-end", no_argument, 0, ARG_END_TO_END},
{(char*)"ungapped", no_argument, 0, ARG_UNGAPPED},
{(char*)"no-ungapped", no_argument, 0, ARG_UNGAPPED_NO},
{(char*)"sse8", no_argument, 0, ARG_SSE8},
{(char*)"no-sse8", no_argument, 0, ARG_SSE8_NO},
{(char*)"scan-narrowed",no_argument, 0, ARG_SCAN_NARROWED},
{(char*)"qc-filter", no_argument, 0, ARG_QC_FILTER},
{(char*)"bwa-sw-like", no_argument, 0, ARG_BWA_SW_LIKE},
{(char*)"multiseed", required_argument, 0, ARG_MULTISEED_IVAL},
{(char*)"ma", required_argument, 0, ARG_SCORE_MA},
{(char*)"mp", required_argument, 0, ARG_SCORE_MMP},
{(char*)"sp", required_argument, 0, ARG_SCORE_SCP},
{(char*)"no-softclip", no_argument, 0, ARG_NO_SOFTCLIP},
{(char*)"np", required_argument, 0, ARG_SCORE_NP},
{(char*)"rdg", required_argument, 0, ARG_SCORE_RDG},
{(char*)"rfg", required_argument, 0, ARG_SCORE_RFG},
{(char*)"score-min", required_argument, 0, ARG_SCORE_MIN},
{(char*)"min-score", required_argument, 0, ARG_SCORE_MIN},
{(char*)"n-ceil", required_argument, 0, ARG_N_CEIL},
{(char*)"dpad", required_argument, 0, ARG_DPAD},
{(char*)"mapq-print-inputs",no_argument, 0, ARG_SAM_PRINT_YI},
{(char*)"very-fast", no_argument, 0, ARG_PRESET_VERY_FAST},
{(char*)"fast", no_argument, 0, ARG_PRESET_FAST},
{(char*)"sensitive", no_argument, 0, ARG_PRESET_SENSITIVE},
{(char*)"very-sensitive", no_argument, 0, ARG_PRESET_VERY_SENSITIVE},
// {(char*)"very-fast-local", no_argument, 0, ARG_PRESET_VERY_FAST_LOCAL},
// {(char*)"fast-local", no_argument, 0, ARG_PRESET_FAST_LOCAL},
// {(char*)"sensitive-local", no_argument, 0, ARG_PRESET_SENSITIVE_LOCAL},
// {(char*)"very-sensitive-local", no_argument, 0, ARG_PRESET_VERY_SENSITIVE_LOCAL},
{(char*)"no-score-priority",no_argument, 0, ARG_NO_SCORE_PRIORITY},
{(char*)"seedlen", required_argument, 0, 'L'},
{(char*)"seedmms", required_argument, 0, 'N'},
{(char*)"seedival", required_argument, 0, 'i'},
{(char*)"ignore-quals", no_argument, 0, ARG_IGNORE_QUALS},
{(char*)"index", required_argument, 0, 'x'},
{(char*)"arg-desc", no_argument, 0, ARG_DESC},
{(char*)"wrapper", required_argument, 0, ARG_WRAPPER},
{(char*)"unpaired", required_argument, 0, 'U'},
{(char*)"output", required_argument, 0, 'S'},
{(char*)"mapq-v", required_argument, 0, ARG_MAPQ_V},
{(char*)"dovetail", no_argument, 0, ARG_DOVETAIL},
{(char*)"no-dovetail", no_argument, 0, ARG_NO_DOVETAIL},
{(char*)"contain", no_argument, 0, ARG_CONTAIN},
{(char*)"no-contain", no_argument, 0, ARG_NO_CONTAIN},
{(char*)"overlap", no_argument, 0, ARG_OVERLAP},
{(char*)"no-overlap", no_argument, 0, ARG_NO_OVERLAP},
{(char*)"tighten", required_argument, 0, ARG_TIGHTEN},
{(char*)"exact-upfront", no_argument, 0, ARG_EXACT_UPFRONT},
{(char*)"1mm-upfront", no_argument, 0, ARG_1MM_UPFRONT},
{(char*)"no-exact-upfront", no_argument, 0, ARG_EXACT_UPFRONT_NO},
{(char*)"no-1mm-upfront", no_argument, 0, ARG_1MM_UPFRONT_NO},
{(char*)"1mm-minlen", required_argument, 0, ARG_1MM_MINLEN},
{(char*)"seed-off", required_argument, 0, 'O'},
{(char*)"seed-boost", required_argument, 0, ARG_SEED_BOOST_THRESH},
{(char*)"max-seeds", required_argument, 0, ARG_MAX_SEEDS},
{(char*)"read-times", no_argument, 0, ARG_READ_TIMES},
{(char*)"show-rand-seed", no_argument, 0, ARG_SHOW_RAND_SEED},
{(char*)"dp-fail-streak", required_argument, 0, ARG_DP_FAIL_STREAK_THRESH},
{(char*)"ee-fail-streak", required_argument, 0, ARG_EE_FAIL_STREAK_THRESH},
{(char*)"ug-fail-streak", required_argument, 0, ARG_UG_FAIL_STREAK_THRESH},
{(char*)"fail-streak", required_argument, 0, 'D'},
{(char*)"dp-fails", required_argument, 0, ARG_DP_FAIL_THRESH},
{(char*)"ug-fails", required_argument, 0, ARG_UG_FAIL_THRESH},
{(char*)"extends", required_argument, 0, ARG_EXTEND_ITERS},
{(char*)"no-extend", no_argument, 0, ARG_NO_EXTEND},
{(char*)"mapq-extra", no_argument, 0, ARG_MAPQ_EX},
{(char*)"seed-rounds", required_argument, 0, 'R'},
{(char*)"reorder", no_argument, 0, ARG_REORDER},
{(char*)"passthrough", no_argument, 0, ARG_READ_PASSTHRU},
{(char*)"sample", required_argument, 0, ARG_SAMPLE},
{(char*)"cp-min", required_argument, 0, ARG_CP_MIN},
{(char*)"cp-ival", required_argument, 0, ARG_CP_IVAL},
{(char*)"tri", no_argument, 0, ARG_TRI},
{(char*)"nondeterministic", no_argument, 0, ARG_NON_DETERMINISTIC},
{(char*)"non-deterministic", no_argument, 0, ARG_NON_DETERMINISTIC},
// {(char*)"local-seed-cache-sz", required_argument, 0, ARG_LOCAL_SEED_CACHE_SZ},
{(char*)"seed-cache-sz", required_argument, 0, ARG_CURRENT_SEED_CACHE_SZ},
{(char*)"no-unal", no_argument, 0, ARG_SAM_NO_UNAL},
{(char*)"test-25", no_argument, 0, ARG_TEST_25},
// TODO: following should be a function of read length?
{(char*)"desc-kb", required_argument, 0, ARG_DESC_KB},
{(char*)"desc-landing", required_argument, 0, ARG_DESC_LANDING},
{(char*)"desc-exp", required_argument, 0, ARG_DESC_EXP},
{(char*)"desc-fmops", required_argument, 0, ARG_DESC_FMOPS},
{(char*)"no-temp-splicesite", no_argument, 0, ARG_NO_TEMPSPLICESITE},
{(char*)"pen-cansplice", required_argument, 0, ARG_PEN_CANSPLICE},
{(char*)"pen-noncansplice", required_argument, 0, ARG_PEN_NONCANSPLICE},
{(char*)"pen-conflictsplice", required_argument, 0, ARG_PEN_CONFLICTSPLICE},
{(char*)"pen-intronlen", required_argument, 0, ARG_PEN_CANINTRONLEN},
{(char*)"pen-canintronlen", required_argument, 0, ARG_PEN_CANINTRONLEN},
{(char*)"pen-noncanintronlen", required_argument, 0, ARG_PEN_NONCANINTRONLEN},
{(char*)"min-intronlen", required_argument, 0, ARG_MIN_INTRONLEN},
{(char*)"max-intronlen", required_argument, 0, ARG_MAX_INTRONLEN},
{(char*)"known-splicesite-infile", required_argument, 0, ARG_KNOWN_SPLICESITE_INFILE},
{(char*)"novel-splicesite-infile", required_argument, 0, ARG_NOVEL_SPLICESITE_INFILE},
{(char*)"novel-splicesite-outfile", required_argument, 0, ARG_NOVEL_SPLICESITE_OUTFILE},
{(char*)"secondary", no_argument, 0, ARG_SECONDARY},
{(char*)"no-spliced-alignment", no_argument, 0, ARG_NO_SPLICED_ALIGNMENT},
{(char*)"rna-strandness", required_argument, 0, ARG_RNA_STRANDNESS},
{(char*)"splicesite-db-only", no_argument, 0, ARG_SPLICESITE_DB_ONLY},
{(char*)"no-anchorstop", no_argument, 0, ARG_NO_ANCHORSTOP},
{(char*)"transcriptome-mapping-only", no_argument, 0, ARG_TRANSCRIPTOME_MAPPING_ONLY},
{(char*)"tmo", no_argument, 0, ARG_TRANSCRIPTOME_MAPPING_ONLY},
{(char*)"downstream-transcriptome-assembly", no_argument, 0, ARG_TRANSCRIPTOME_ASSEMBLY},
{(char*)"dta", no_argument, 0, ARG_TRANSCRIPTOME_ASSEMBLY},
{(char*)"dta-cufflinks", no_argument, 0, ARG_TRANSCRIPTOME_ASSEMBLY_CUFFLINKS},
{(char*)"avoid-pseudogene",no_argument, 0, ARG_AVOID_PSEUDOGENE},
{(char*)"no-templatelen-adjustment", no_argument, 0, ARG_NO_TEMPLATELEN_ADJUSTMENT},
#ifdef USE_SRA
{(char*)"sra-acc", required_argument, 0, ARG_SRA_ACC},
#endif
{(char*)"remove-chrname", no_argument, 0, ARG_REMOVE_CHRNAME},
{(char*)"add-chrname", no_argument, 0, ARG_ADD_CHRNAME},
{(char*)"max-altstried", required_argument, 0, ARG_MAX_ALTSTRIED},
{(char*)"haplotype", no_argument, 0, ARG_HAPLOTYPE},
{(char*)"enable-codis", no_argument, 0, ARG_CODIS},
{(char*)"summary-file", required_argument, 0, ARG_SUMMARY_FILE},
{(char*)"new-summary", no_argument, 0, ARG_NEW_SUMMARY},
{(char*)"enable-dp", no_argument, 0, ARG_DP},
{(char*)"bowtie2-dp", required_argument, 0, ARG_DP},
{(char*)"repeat", no_argument, 0, ARG_REPEAT},
{(char*)"no-repeat-index", no_argument, 0, ARG_NO_REPEAT_INDEX},
{(char*)"read-lengths", required_argument, 0, ARG_READ_LENGTHS},
{(char*)"base-change", required_argument, 0, ARG_BASE_CHANGE},
{(char*)"repeat-limit", required_argument, 0, ARG_REPEAT_LIMIT},
{(char*)"unique-only", no_argument, 0, ARG_UNIQUE_ONLY},
{(char*)"3N", no_argument, 0, ARG_3N},
{(char*)"directional-mapping", no_argument, 0, ARG_DIRECTIONAL},
{(char*)"directional-mapping-reverse", no_argument, 0, ARG_DIRECTIONAL_REVERSE},
{(char*)0, 0, 0, 0} // terminator
};
/**
 * Print out a concise description of what options are taken and whether they
 * take an argument.
 *
 * One line is written to 'out' per option, in the form "<name>\t<flag>",
 * where <flag> is 0 if the option takes no argument and 1 otherwise.  All
 * entries of 'long_options' are listed first, followed by every option
 * character in 'short_options'.
 *
 * @param out stream that receives the whole listing (long and short options)
 */
static void printArgDesc(ostream& out) {
	// Long options: walk the table up to (not including) the terminator
	// entry, whose name is the null pointer.
	for(size_t i = 0; long_options[i].name != 0; i++) {
		out << long_options[i].name << "\t"
		    << (long_options[i].has_arg == no_argument ? 0 : 1)
		    << endl;
	}
	// Short options: an option character takes an argument iff it is
	// immediately followed by ':'.
	const size_t solen = strlen(short_options);
	for(size_t i = 0; i < solen; i++) {
		if(i == solen-1) {
			// The last character has nothing after it, so it must be an
			// option character itself and not a stray ':'.
			assert_neq(':', short_options[i]);
		}
		const bool takesArg = (i+1 < solen) && (short_options[i+1] == ':');
		// Write to 'out' (not cout) so that the complete listing ends up on
		// the stream requested by the caller.
		out << (char)short_options[i] << "\t" << (takesArg ? 1 : 0) << endl;
		if(takesArg) {
			i++; // skip the ':'
		}
	}
}
/**
 * Print a summary usage message to the provided output stream.
 *
 * The message consists of a version banner, the command-line synopsis, and
 * the option reference grouped by section (Input, Presets, Alignment,
 * 3N-Alignment, Spliced Alignment, Scoring, Reporting, Paired-end, Output,
 * Performance, Other).
 *
 * The global 'wrapper' string selects between two variants: when it equals
 * "basic-0" the tool is named "hisat2" and a few extra lines are printed
 * (compressed-input hints and the --un/--al family of options); otherwise
 * the tool is named "hisat2-align".  Lines guarded by USE_SRA and BOWTIE_MM
 * are only compiled in when those macros are defined.
 *
 * Side effect: if 'wrapper' is empty, a warning recommending the 'hisat2'
 * wrapper script is written to cerr (not to 'out').
 *
 * @param out stream that receives the usage text
 */
static void printUsage(ostream& out) {
// Banner: version and author contact.
out << "HISAT2 version " << string(HISAT2_VERSION).c_str() << " by Daehwan Kim (infphilo@gmail.com, www.ccb.jhu.edu/people/infphilo)" << endl;
// Executable name shown in the synopsis; depends on the 'wrapper' setting.
string tool_name = "hisat2-align";
if(wrapper == "basic-0") {
tool_name = "hisat2";
}
// Synopsis and description of the positional/main arguments.
out << "Usage: " << endl
#ifdef USE_SRA
<< " " << tool_name.c_str() << " [options]* -x <ht2-idx> {-1 <m1> -2 <m2> | -U <r> | --sra-acc <SRA accession number>} [-S <sam>]" << endl
#else
<< " " << tool_name.c_str() << " [options]* -x <ht2-idx> {-1 <m1> -2 <m2> | -U <r>} [-S <sam>]" << endl
#endif
<< endl
<< " <ht2-idx> Index filename prefix (minus trailing .X." << gfm_ext << ")." << endl
<< " <m1> Files with #1 mates, paired with files in <m2>." << endl;
// The compressed-input hint is only printed in "basic-0" wrapper mode.
if(wrapper == "basic-0") {
out << " Could be gzip'ed (extension: .gz) or bzip2'ed (extension: .bz2)." << endl;
}
out << " <m2> Files with #2 mates, paired with files in <m1>." << endl;
if(wrapper == "basic-0") {
out << " Could be gzip'ed (extension: .gz) or bzip2'ed (extension: .bz2)." << endl;
}
out << " <r> Files with unpaired reads." << endl;
if(wrapper == "basic-0") {
out << " Could be gzip'ed (extension: .gz) or bzip2'ed (extension: .bz2)." << endl;
}
#ifdef USE_SRA
out << " <SRA accession number> Comma-separated list of SRA accession numbers, e.g. --sra-acc SRR353653,SRR353654." << endl;
#endif
// Option reference; commented-out entries below are intentionally not shown.
out << " <sam> File for SAM output (default: stdout)" << endl
<< endl
<< " <m1>, <m2>, <r> can be comma-separated lists (no whitespace) and can be" << endl
<< " specified many times. E.g. '-U file1.fq,file2.fq -U file3.fq'." << endl
// Wrapper script should write <bam> line next
<< endl
<< "Options (defaults in parentheses):" << endl
<< endl
<< " Input:" << endl
<< " -q query input files are FASTQ .fq/.fastq (default)" << endl
<< " --qseq query input files are in Illumina's qseq format" << endl
<< " -f query input files are (multi-)FASTA .fa/.mfa" << endl
<< " -r query input files are raw one-sequence-per-line" << endl
<< " -c <m1>, <m2>, <r> are sequences themselves, not files" << endl
<< " -s/--skip <int> skip the first <int> reads/pairs in the input (none)" << endl
<< " -u/--upto <int> stop after first <int> reads/pairs (no limit)" << endl
<< " -5/--trim5 <int> trim <int> bases from 5'/left end of reads (0)" << endl
<< " -3/--trim3 <int> trim <int> bases from 3'/right end of reads (0)" << endl
<< " --phred33 qualities are Phred+33 (default)" << endl
<< " --phred64 qualities are Phred+64" << endl
<< " --int-quals qualities encoded as space-delimited integers" << endl
#ifdef USE_SRA
<< " --sra-acc SRA accession ID" << endl
#endif
<< endl
<< " Presets: Same as:" << endl
// << " For --end-to-end:" << endl
// << " --very-fast -D 5 -R 1 -N 0 -L 22 -i S,0,2.50" << endl
// << " --fast -D 10 -R 2 -N 0 -L 22 -i S,0,2.50" << endl
// << " --sensitive -D 15 -R 2 -N 0 -L 22 -i S,1,1.15 (default)" << endl
// << " --very-sensitive -D 20 -R 3 -N 0 -L 20 -i S,1,0.50" << endl
<< " --fast --no-repeat-index" << endl
<< " --sensitive --bowtie2-dp 1 -k 30 --score-min L,0,-0.5" << endl
<< " --very-sensitive --bowtie2-dp 2 -k 50 --score-min L,0,-1" << endl
<< endl
<< " Alignment:" << endl
//<< " -N <int> max # mismatches in seed alignment; can be 0 or 1 (0)" << endl
//<< " -L <int> length of seed substrings; must be >3, <32 (22)" << endl
//<< " -i <func> interval between seed substrings w/r/t read len (S,1,1.15)" << endl
<< " --bowtie2-dp <int> use Bowtie2's dynamic programming alignment algorithm (0) - 0: no dynamic programming, 1: conditional dynamic programming, and 2: unconditional dynamic programming (slowest)" << endl
<< " --n-ceil <func> func for max # non-A/C/G/Ts permitted in aln (L,0,0.15)" << endl
//<< " --dpad <int> include <int> extra ref chars on sides of DP table (15)" << endl
//<< " --gbar <int> disallow gaps within <int> nucs of read extremes (4)" << endl
<< " --ignore-quals treat all quality values as 30 on Phred scale (off)" << endl
<< " --nofw do not align forward (original) version of read (off)" << endl
<< " --norc do not align reverse-complement version of read (off)" << endl
<< " --no-repeat-index do not use repeat index" << endl
<< endl
<< " 3N-Alignment:" << endl
<< " --base-change <chr,chr> the converted nucleotide and converted to nucleotide (C,T)" << endl
<< " --directional-mapping make directional mapping, please use this option only if your reads are prepared with a strand specific library (off)" << endl
<< " --repeat-limit <int> maximum number of repeat will be expanded for repeat alignment (1000)" << endl
<< " --unique-only only output the reads have unique alignment (off)" << endl
<< endl
<< " Spliced Alignment:" << endl
<< " --pen-cansplice <int> penalty for a canonical splice site (0)" << endl
<< " --pen-noncansplice <int> penalty for a non-canonical splice site (12)" << endl
// << " --pen-conflictsplice <int> penalty for conflicting splice sites (1000000)" << endl
<< " --pen-canintronlen <func> penalty for long introns (G,-8,1) with canonical splice sites" << endl
<< " --pen-noncanintronlen <func> penalty for long introns (G,-8,1) with noncanonical splice sites" << endl
<< " --min-intronlen <int> minimum intron length (20)" << endl
<< " --max-intronlen <int> maximum intron length (500000)" << endl
<< " --known-splicesite-infile <path> provide a list of known splice sites" << endl
<< " --novel-splicesite-outfile <path> report a list of splice sites" << endl
<< " --novel-splicesite-infile <path> provide a list of novel splice sites" << endl
<< " --no-temp-splicesite disable the use of splice sites found" << endl
<< " --no-spliced-alignment disable spliced alignment" << endl
<< " --rna-strandness <string> specify strand-specific information (unstranded)" << endl
<< " --tmo reports only those alignments within known transcriptome" << endl
<< " --dta reports alignments tailored for transcript assemblers" << endl
<< " --dta-cufflinks reports alignments tailored specifically for cufflinks" << endl
<< " --avoid-pseudogene tries to avoid aligning reads to pseudogenes (experimental option)" << endl
<< " --no-templatelen-adjustment disables template length adjustment for RNA-seq reads" << endl
<< endl
<< " Scoring:" << endl
//<< " --ma <int> match bonus (0 for --end-to-end, 2 for --local) " << endl
<< " --mp <int>,<int> max and min penalties for mismatch; lower qual = lower penalty <6,2>" << endl
<< " --sp <int>,<int> max and min penalties for soft-clipping; lower qual = lower penalty <2,1>" << endl
<< " --no-softclip no soft-clipping" << endl
<< " --np <int> penalty for non-A/C/G/Ts in read/ref (1)" << endl
<< " --rdg <int>,<int> read gap open, extend penalties (5,3)" << endl
<< " --rfg <int>,<int> reference gap open, extend penalties (5,3)" << endl
<< " --score-min <func> min acceptable alignment score w/r/t read length" << endl
<< " (L,0.0,-0.2)" << endl
<< endl
<< " Reporting:" << endl
<< " -k <int> It searches for at most <int> distinct, primary alignments for each read. Primary alignments mean " << endl
<< " alignments whose alignment score is equal to or higher than any other alignments. The search terminates " << endl
<< " when it cannot find more distinct valid alignments, or when it finds <int>, whichever happens first. " << endl
<< " The alignment score for a paired-end alignment equals the sum of the alignment scores of " << endl
<< " the individual mates. Each reported read or pair alignment beyond the first has the SAM secondary bit " << endl
<< " (which equals 256) set in its FLAGS field. For reads that have more than <int> distinct, " << endl
<< " valid alignments, hisat2 does not guarantee that the <int> alignments reported are the best possible " << endl
<< " in terms of alignment score. Default: 5 (linear index) or 10 (graph index)." << endl
<< " Note: HISAT2 is not designed with large values for -k in mind, and when aligning reads to long, " << endl
<< " repetitive genomes, large -k could make alignment much slower." << endl
<< " --max-seeds <int> HISAT2, like other aligners, uses seed-and-extend approaches. HISAT2 tries to extend seeds to " << endl
<< " full-length alignments. In HISAT2, --max-seeds is used to control the maximum number of seeds that " << endl
<< " will be extended. For DNA-read alignment (--no-spliced-alignment), HISAT2 extends up to these many seeds" << endl
<< " and skips the rest of the seeds. For RNA-read alignment, HISAT2 skips extending seeds and reports " << endl
<< " no alignments if the number of seeds is larger than the number specified with the option, " << endl
<< " to be compatible with previous versions of HISAT2. Large values for --max-seeds may improve alignment " << endl
<< " sensitivity, but HISAT2 is not designed with large values for --max-seeds in mind, and when aligning " << endl
<< " reads to long, repetitive genomes, large --max-seeds could make alignment much slower. " << endl
<< " The default value is the maximum of 5 and the value that comes with -k times 2." << endl
<< " -a/--all HISAT2 reports all alignments it can find. Using the option is equivalent to using both --max-seeds " << endl
<< " and -k with the maximum value that a 64-bit signed integer can represent (9,223,372,036,854,775,807)." << endl
<< " --repeat report alignments to repeat sequences directly" << endl
<< endl
//<< " Effort:" << endl
//<< " -D <int> give up extending after <int> failed extends in a row (15)" << endl
//<< " -R <int> for reads w/ repetitive seeds, try <int> sets of seeds (2)" << endl
//<< endl
<< " Paired-end:" << endl
<< " -I/--minins <int> minimum fragment length (0), only valid with --no-spliced-alignment" << endl
<< " -X/--maxins <int> maximum fragment length (500), only valid with --no-spliced-alignment" << endl
<< " --fr/--rf/--ff -1, -2 mates align fw/rev, rev/fw, fw/fw (--fr)" << endl
<< " --no-mixed suppress unpaired alignments for paired reads" << endl
<< " --no-discordant suppress discordant alignments for paired reads" << endl
<< endl
<< " Output:" << endl;
//if(wrapper == "basic-0") {
// out << " --bam output directly to BAM (by piping through 'samtools view')" << endl;
//}
out << " -t/--time print wall-clock time taken by search phases" << endl;
// The --un/--al family of options is only listed in "basic-0" wrapper mode.
if(wrapper == "basic-0") {
out << " --un <path> write unpaired reads that didn't align to <path>" << endl
<< " --al <path> write unpaired reads that aligned at least once to <path>" << endl
<< " --un-conc <path> write pairs that didn't align concordantly to <path>" << endl
<< " --al-conc <path> write pairs that aligned concordantly at least once to <path>" << endl
<< " (Note: for --un, --al, --un-conc, or --al-conc, add '-gz' to the option name, e.g." << endl
<< " --un-gz <path>, to gzip compress output, or add '-bz2' to bzip2 compress output.)" << endl;
}
out << " --summary-file <path> print alignment summary to this file." << endl
<< " --new-summary print alignment summary in a new style, which is more machine-friendly." << endl
<< " --quiet print nothing to stderr except serious errors" << endl
// << " --refidx refer to ref. seqs by 0-based index rather than name" << endl
<< " --met-file <path> send metrics to file at <path> (off)" << endl
<< " --met-stderr send metrics to stderr (off)" << endl
<< " --met <int> report internal counters & metrics every <int> secs (1)" << endl
// Following is supported in the wrapper instead
// << " --no-unal suppress SAM records for unaligned reads" << endl
<< " --no-head suppress header lines, i.e. lines starting with @" << endl
<< " --no-sq suppress @SQ header lines" << endl
<< " --rg-id <text> set read group id, reflected in @RG line and RG:Z: opt field" << endl
<< " --rg <text> add <text> (\"lab:value\") to @RG line of SAM header." << endl
<< " Note: @RG line only printed when --rg-id is set." << endl
<< " --omit-sec-seq put '*' in SEQ and QUAL fields for secondary alignments." << endl
<< endl
<< " Performance:" << endl
<< " -o/--offrate <int> override offrate of index; must be >= index's offrate" << endl
<< " -p/--threads <int> number of alignment threads to launch (1)" << endl
<< " --reorder force SAM output order to match order of input reads" << endl
#ifdef BOWTIE_MM
<< " --mm use memory-mapped I/O for index; many 'hisat2's can share" << endl
#endif
#ifdef BOWTIE_SHARED_MEM
//<< " --shmem use shared mem for index; many 'hisat2's can share" << endl
#endif
<< endl
<< " Other:" << endl
<< " --qc-filter filter out reads that are bad according to QSEQ filter" << endl
<< " --seed <int> seed for random number generator (0)" << endl
<< " --non-deterministic seed rand. gen. arbitrarily instead of using read attributes" << endl
<< " --remove-chrname remove 'chr' from reference names in alignment" << endl
<< " --add-chrname add 'chr' to reference names in alignment " << endl
// << " --verbose verbose output for debugging" << endl
<< " --version print version information and quit" << endl
<< " -h/--help print this usage message" << endl
;
// An empty 'wrapper' produces a warning; note it always goes to cerr,
// regardless of which stream 'out' refers to.
if(wrapper.empty()) {
cerr << endl
<< "*** Warning ***" << endl
<< "'hisat2-align' was run directly. It is recommended that you run the wrapper script 'hisat2' instead." << endl
<< endl;
}
}
/**
 * Parse a base-10 integer out of 'arg' and enforce that it lies in the
 * inclusive range ['lower', 'upper'].
 *
 * If 'arg' is NULL, does not begin with a number, or the parsed value is
 * outside the range, then output the given error message followed by the
 * usage message to stderr and throw 1.  Characters that follow the leading
 * number are ignored, as strtol does.
 *
 * @param lower  smallest accepted value
 * @param upper  largest accepted value
 * @param errmsg message printed when the argument is rejected
 * @param arg    option argument text to parse
 * @return the parsed value
 */
static int parseInt(int lower, int upper, const char *errmsg, const char *arg) {
	long l = 0;
	bool ok = false;
	if(arg != NULL) {
		char *endPtr = NULL;
		l = strtol(arg, &endPtr, 10);
		// strtol never leaves endPtr NULL; when no digits could be consumed
		// it stores 'arg' itself there.  Comparing against 'arg' is therefore
		// the way to detect a non-numeric argument (which would otherwise be
		// silently accepted as 0).
		ok = (endPtr != arg) && l >= lower && l <= upper;
	}
	if(!ok) {
		cerr << errmsg << endl;
		printUsage(cerr);
		throw 1;
	}
	return (int32_t)l;
}
/**
 * Convenience overload with no explicit upper bound: the largest value
 * representable by an int is used as the upper limit.
 */
static int parseInt(int lower, const char *errmsg, const char *arg) {
	const int noUpperBound = std::numeric_limits<int>::max();
	return parseInt(lower, noUpperBound, errmsg, arg);
}
/**
 * Parse a T out of the string 's' using stream extraction (operator>>).
 *
 * If nothing can be extracted (e.g. 's' is NULL, empty, or contains only
 * whitespace), a value-initialized T is returned (0 for arithmetic types,
 * an empty string for std::string).
 */
template<typename T>
T parse(const char *s) {
	// Value-initialize: when the input holds no token at all, operator>>
	// fails before touching its target, so a default-initialized arithmetic
	// 'tmp' would be returned with an indeterminate value.
	T tmp = T();
	if(s == NULL) {
		return tmp;
	}
	std::stringstream ss(s);
	ss >> tmp;
	return tmp;
}
/**
 * Parse a pair of Ts from a string, 'str', delimited with 'delim'.
 *
 * The string is split on 'delim'; the first token becomes 'first' and the
 * second token becomes 'second'.  Tokens after the second are ignored.  If
 * fewer than two tokens are present, an error is printed to stderr and 1 is
 * thrown, rather than indexing past the end of the token list.
 */
template<typename T>
pair<T, T> parsePair(const char *str, char delim) {
	string s(str);
	EList<string> ss;
	tokenize(s, delim, ss);
	if(ss.size() < 2) {
		cerr << "Error: expected two values separated by '" << delim
		     << "' but got '" << s.c_str() << "'" << endl;
		throw 1;
	}
	pair<T, T> ret;
	ret.first = parse<T>(ss[0].c_str());
	ret.second = parse<T>(ss[1].c_str());
	return ret;
}
/**
* Parse a pair of Ts from a string, 'str', delimited with 'delim'.
*/
template<typename T>
void parseTuple(const char *str, char delim, EList<T>& ret) {
string s(str);
EList<string> ss;
tokenize(s, delim, ss);
for(size_t i = 0; i < ss.size(); i++) {
ret.push_back(parse<T>(ss[i].c_str()));
}
}
/**
 * Apply the preset named 'sorig' using the given preset menu and return the
 * resulting policy string.
 *
 * The first occurrence of the placeholder "%LOCAL%" in the name is replaced
 * by "-local" when 'localAlign' is set and by the empty string otherwise.
 * The expanded name is then handed to presets.apply() together with the
 * global 'extra_opts'.  With 'gVerbose' set, the preset being applied is
 * reported on stderr.
 */
static string applyPreset(const string& sorig, Presets& presets) {
	const char *placeholder = "%LOCAL%";
	string presetName(sorig);
	const size_t at = presetName.find(placeholder);
	if(at != string::npos) {
		const char *suffix = localAlign ? "-local" : "";
		presetName.replace(at, strlen(placeholder), suffix);
	}
	if(gVerbose) {
		cerr << "Applying preset: '" << presetName.c_str()
		     << "' using preset menu '" << presets.name() << "'" << endl;
	}
	string policy;
	presets.apply(presetName, policy, extra_opts);
	return policy;
}
// Flags recording which of the mutually exclusive options -M, -a and -k have
// been handled by parseOption so far; each handler consults the other two to
// decide whether to print the "mutually exclusive" warning.
static bool saw_M; // set to true when -M is parsed
static bool saw_a; // set to true when -a is parsed
static bool saw_k; // set to true when -k is parsed
// Preset names collected by parseOption (from -P/--preset and the
// --very-fast/--fast/--sensitive/--very-sensitive options), in the order in
// which they were encountered.  Names may contain the "%LOCAL%" placeholder.
static EList<string> presetList;
/**
* TODO: Argument parsing is very, very flawed. The biggest problem is that
* there are two separate worlds of arguments, the ones set via polstr, and
* the ones set directly in variables. This makes for nasty interactions,
* e.g., with the -M option being resolved at an awkward time relative to
* the -k and -a options.
*/
static void parseOption(int next_option, const char *arg) {
switch (next_option) {
case ARG_TEST_25: bowtie2p5 = true; break;
case ARG_DESC_KB: descentTotSz = SimpleFunc::parse(arg, 0.0, 1024.0, 1024.0, DMAX); break;
case ARG_DESC_FMOPS: descentTotFmops = SimpleFunc::parse(arg, 0.0, 10.0, 100.0, DMAX); break;
case ARG_DESC_LANDING: descentLanding = parse<int>(arg); break;
case ARG_DESC_EXP: {
descConsExp = parse<double>(arg);
if(descConsExp < 0.0) {
cerr << "Error: --desc-exp must be greater than or equal to 0" << endl;
throw 1;
}
break;
}
case '1': tokenize(arg, ",", mates1); break;
case '2': tokenize(arg, ",", mates2); break;
case ARG_ONETWO: tokenize(arg, ",", mates12); format = TAB_MATE5; break;
case ARG_TAB5: tokenize(arg, ",", mates12); format = TAB_MATE5; break;
case ARG_TAB6: tokenize(arg, ",", mates12); format = TAB_MATE6; break;
case 'f': format = FASTA; break;
case 'F': {
format = FASTA_CONT;
pair<uint32_t, uint32_t> p = parsePair<uint32_t>(arg, ',');
fastaContLen = p.first;
fastaContFreq = p.second;
break;
}
case ARG_BWA_SW_LIKE: {
bwaSwLikeC = 5.5f;
bwaSwLikeT = 30;
bwaSwLike = true;
localAlign = true;
// -a INT Score of a match [1]
// -b INT Mismatch penalty [3]
// -q INT Gap open penalty [5]
// -r INT Gap extension penalty. The penalty for a contiguous
// gap of size k is q+k*r. [2]
polstr += ";MA=1;MMP=C3;RDG=5,2;RFG=5,2";
break;
}
case 'q': format = FASTQ; break;
case 'r': format = RAW; break;
case 'c': format = CMDLINE; break;
case ARG_QSEQ: format = QSEQ; break;
case 'C': {
cerr << "Error: -C specified but Bowtie 2 does not support colorspace input." << endl;
throw 1;
break;
}
case 'I':
gMinInsert = parseInt(0, "-I arg must be positive", arg);
break;
case 'X':
gMaxInsert = parseInt(1, "-X arg must be at least 1", arg);
break;
case ARG_NO_DISCORDANT: gReportDiscordant = false; break;
case ARG_NO_MIXED: gReportMixed = false; break;
case 's':
skipReads = (uint32_t)parseInt(0, "-s arg must be positive", arg);
break;
case ARG_FF: gMate1fw = true; gMate2fw = true; break;
case ARG_RF: gMate1fw = false; gMate2fw = true; break;
case ARG_FR: gMate1fw = true; gMate2fw = false; break;
case ARG_SHMEM: useShmem = true; break;
case ARG_SEED_SUMM: seedSumm = true; break;
case ARG_MM: {
#ifdef BOWTIE_MM
useMm = true;
break;
#else
cerr << "Memory-mapped I/O mode is disabled because bowtie was not compiled with" << endl
<< "BOWTIE_MM defined. Memory-mapped I/O is not supported under Windows. If you" << endl
<< "would like to use memory-mapped I/O on a platform that supports it, please" << endl
<< "refrain from specifying BOWTIE_MM=0 when compiling Bowtie." << endl;
throw 1;
#endif
}
case ARG_MMSWEEP: mmSweep = true; break;
case ARG_HADOOPOUT: hadoopOut = true; break;
case ARG_SOLEXA_QUALS: solexaQuals = true; break;
case ARG_INTEGER_QUALS: integerQuals = true; break;
case ARG_PHRED64: phred64Quals = true; break;
case ARG_PHRED33: solexaQuals = false; phred64Quals = false; break;
case ARG_OVERHANG: gReportOverhangs = true; break;
case ARG_NO_CACHE: msNoCache = true; break;
case ARG_USE_CACHE: msNoCache = false; break;
case ARG_LOCAL_SEED_CACHE_SZ:
seedCacheLocalMB = (uint32_t)parseInt(1, "--local-seed-cache-sz arg must be at least 1", arg);
break;
case ARG_CURRENT_SEED_CACHE_SZ:
seedCacheCurrentMB = (uint32_t)parseInt(1, "--seed-cache-sz arg must be at least 1", arg);
break;
case ARG_REFIDX: noRefNames = true; break;
case ARG_FUZZY: fuzzy = true; break;
case ARG_FULLREF: fullRef = true; break;
case ARG_GAP_BAR:
gGapBarrier = parseInt(1, "--gbar must be no less than 1", arg);
break;
case ARG_SEED:
seed = parseInt(0, "--seed arg must be at least 0", arg);
break;
case ARG_NON_DETERMINISTIC:
arbitraryRandom = true;
break;
case 'u':
qUpto = (uint32_t)parseInt(1, "-u/--qupto arg must be at least 1", arg);
break;
case 'Q':
tokenize(arg, ",", qualities);
integerQuals = true;
break;
case ARG_QUALS1:
tokenize(arg, ",", qualities1);
integerQuals = true;
break;
case ARG_QUALS2:
tokenize(arg, ",", qualities2);
integerQuals = true;
break;
case ARG_CACHE_LIM:
cacheLimit = (uint32_t)parseInt(1, "--cachelim arg must be at least 1", arg);
break;
case ARG_CACHE_SZ:
cacheSize = (uint32_t)parseInt(1, "--cachesz arg must be at least 1", arg);
cacheSize *= (1024 * 1024); // convert from MB to B
break;
case ARG_WRAPPER: wrapper = arg; break;
case 'p':
nthreads = parseInt(1, "-p/--threads arg must be at least 1", arg);
break;
case ARG_FILEPAR:
fileParallel = true;
break;
case '3': gTrim3 = parseInt(0, "-3/--trim3 arg must be at least 0", arg); break;
case '5': gTrim5 = parseInt(0, "-5/--trim5 arg must be at least 0", arg); break;
case 'h': printUsage(cout); throw 0; break;
case ARG_USAGE: printUsage(cout); throw 0; break;
//
// NOTE that unlike in Bowtie 1, -M, -a and -k are mutually
// exclusive here.
//
case 'M': {
msample = true;
mhits = parse<uint32_t>(arg);
if(saw_a || saw_k) {
cerr << "Warning: -M, -k and -a are mutually exclusive. "
<< "-M will override" << endl;
khits = 1;
}
assert_eq(1, khits);
saw_M = true;
cerr << "Warning: -M is deprecated. Use -D and -R to adjust " <<
"effort instead." << endl;
break;
}
case ARG_EXTEND_ITERS: {
maxIters = parse<size_t>(arg);
break;
}
case ARG_NO_EXTEND: {
doExtend = false;
break;
}
case 'R': { polstr += ";ROUNDS="; polstr += arg; break; }
case 'D': { polstr += ";DPS="; polstr += arg; break; }
case ARG_DP_MATE_STREAK_THRESH: {
maxMateStreak = parse<size_t>(arg);
break;
}
case ARG_DP_FAIL_STREAK_THRESH: {
maxDpStreak = parse<size_t>(arg);
break;
}
case ARG_EE_FAIL_STREAK_THRESH: {
maxEeStreak = parse<size_t>(arg);
break;
}
case ARG_UG_FAIL_STREAK_THRESH: {
maxUgStreak = parse<size_t>(arg);
break;
}
case ARG_DP_FAIL_THRESH: {
maxDp = parse<size_t>(arg);
break;
}
case ARG_UG_FAIL_THRESH: {
maxUg = parse<size_t>(arg);
break;
}
case ARG_MAX_SEEDS: {
maxSeeds = parse<size_t>(arg);
break;
}
case ARG_SEED_BOOST_THRESH: {
seedBoostThresh = parse<int>(arg);
break;
}
case 'a': {
msample = false;
allHits = true;
mhits = 0; // disable -M
if(saw_M || saw_k) {
cerr << "Warning: -M, -k and -a are mutually exclusive. "
<< "-a will override" << endl;
}
saw_a = true;
break;
}
case 'k': {
msample = false;
khits = (uint32_t)parseInt(1, "-k arg must be at least 1", arg);
mhits = 0; // disable -M
if(saw_M || saw_a) {
cerr << "Warning: -M, -k and -a are mutually exclusive. "
<< "-k will override" << endl;
}
saw_k = true;
break;
}
case ARG_VERBOSE: gVerbose = 1; break;
case ARG_STARTVERBOSE: startVerbose = true; break;
case ARG_QUIET: gQuiet = true; break;
case ARG_SANITY: sanityCheck = true; break;
case 't': timing = true; break;
case ARG_METRIC_IVAL: {
metricsIval = parseInt(1, "--metrics arg must be at least 1", arg);
break;
}
case ARG_METRIC_FILE: metricsFile = arg; break;
case ARG_METRIC_STDERR: metricsStderr = true; break;
case ARG_METRIC_PER_READ: metricsPerRead = true; break;
case ARG_NO_FW: gNofw = true; break;
case ARG_NO_RC: gNorc = true; break;
case ARG_SAM_NO_QNAME_TRUNC: samTruncQname = false; break;
case ARG_SAM_OMIT_SEC_SEQ: samOmitSecSeqQual = true; break;
case ARG_SAM_NO_UNAL: samNoUnal = true; break;
case ARG_SAM_NOHEAD: samNoHead = true; break;
case ARG_SAM_NOSQ: samNoSQ = true; break;
case ARG_SAM_PRINT_YI: sam_print_yi = true; break;
case ARG_REORDER: reorder = true; break;
case ARG_MAPQ_EX: {
sam_print_zp = true;
sam_print_zu = true;
sam_print_xp = true;
sam_print_xss = true;
sam_print_yn = true;
break;
}
case ARG_SHOW_RAND_SEED: {
sam_print_zs = true;
break;
}
case ARG_SAMPLE:
sampleFrac = parse<float>(arg);
break;
case ARG_CP_MIN:
cminlen = parse<size_t>(arg);
break;
case ARG_CP_IVAL:
cpow2 = parse<size_t>(arg);
break;
case ARG_TRI:
doTri = true;
break;
case ARG_READ_PASSTHRU: {
sam_print_xr = true;
break;
}
case ARG_READ_TIMES: {
sam_print_xt = true;
sam_print_xd = true;
sam_print_xu = true;
sam_print_yl = true;
sam_print_ye = true;
sam_print_yu = true;
sam_print_yr = true;
sam_print_zb = true;
sam_print_zr = true;
sam_print_zf = true;
sam_print_zm = true;
sam_print_zi = true;
break;
}
case ARG_SAM_RG: {
string argstr = arg;
if(argstr.substr(0, 3) == "ID:") {
rgid = "\t";
rgid += argstr;
rgs_optflag = "RG:Z:" + argstr.substr(3);
} else {
rgs += '\t';
rgs += argstr;
}
break;
}
case ARG_SAM_RGID: {
string argstr = arg;
rgid = "\t";
rgid = "\tID:" + argstr;
rgs_optflag = "RG:Z:" + argstr;
break;
}
case ARG_PARTITION: partitionSz = parse<int>(arg); break;
case ARG_DPAD:
maxhalf = parseInt(0, "--dpad must be no less than 0", arg);
break;
case ARG_ORIG:
if(arg == NULL || strlen(arg) == 0) {
cerr << "--orig arg must be followed by a string" << endl;
printUsage(cerr);
throw 1;
}
origString = arg;
break;
case ARG_LOCAL: localAlign = true; break;
case ARG_END_TO_END: localAlign = false; break;
case ARG_SSE8: enable8 = true; break;
case ARG_SSE8_NO: enable8 = false; break;
case ARG_UNGAPPED: doUngapped = true; break;
case ARG_UNGAPPED_NO: doUngapped = false; break;
// case ARG_NO_DOVETAIL: gDovetailMatesOK = false; break;
// case ARG_NO_CONTAIN: gContainMatesOK = false; break;
// case ARG_NO_OVERLAP: gOlapMatesOK = false; break;
// case ARG_DOVETAIL: gDovetailMatesOK = true; break;
// case ARG_CONTAIN: gContainMatesOK = true; break;
// case ARG_OVERLAP: gOlapMatesOK = true; break;
case ARG_QC_FILTER: qcFilter = true; break;
case ARG_NO_SCORE_PRIORITY: sortByScore = false; break;
case ARG_IGNORE_QUALS: ignoreQuals = true; break;
case ARG_MAPQ_V: mapqv = parse<int>(arg); break;
case ARG_TIGHTEN: tighten = parse<int>(arg); break;
case ARG_EXACT_UPFRONT: doExactUpFront = true; break;
case ARG_1MM_UPFRONT: do1mmUpFront = true; break;
case ARG_EXACT_UPFRONT_NO: doExactUpFront = false; break;
case ARG_1MM_UPFRONT_NO: do1mmUpFront = false; break;
case ARG_1MM_MINLEN: do1mmMinLen = parse<size_t>(arg); break;
case ARG_NOISY_HPOLY: noisyHpolymer = true; break;
case 'x' : bt2indexs[0] = arg; break;
case ARG_PRESET_VERY_FAST_LOCAL: localAlign = true;
case ARG_PRESET_VERY_FAST: {
presetList.push_back("very-fast%LOCAL%"); break;
}
case ARG_PRESET_FAST_LOCAL: localAlign = true;
case ARG_PRESET_FAST: {
fast = true;
presetList.push_back("fast%LOCAL%"); break;
}
case ARG_PRESET_SENSITIVE_LOCAL: localAlign = true;
case ARG_PRESET_SENSITIVE: {
sensitive = true;
presetList.push_back("sensitive%LOCAL%"); break;
}
case ARG_PRESET_VERY_SENSITIVE_LOCAL: localAlign = true;
case ARG_PRESET_VERY_SENSITIVE: {
very_sensitive = true;
presetList.push_back("very-sensitive%LOCAL%"); break;
}
case 'P': { presetList.push_back(arg); break; }
case ARG_ALIGN_POLICY: {
if(strlen(arg) > 0) {
polstr += ";"; polstr += arg;
}
break;
}
case 'N': { polstr += ";SEED="; polstr += arg; break; }
case 'L': {
int64_t len = parse<size_t>(arg);
if(len < 0) {
cerr << "Error: -L argument must be >= 0; was " << arg << endl;
throw 1;
}
if(len > 32) {
cerr << "Error: -L argument must be <= 32; was" << arg << endl;
throw 1;
}
polstr += ";SEEDLEN="; polstr += arg; break;
}
case 'O':
multiseedOff = parse<size_t>(arg);
break;
case 'i': {
EList<string> args;
tokenize(arg, ",", args);
if(args.size() > 3 || args.size() == 0) {
cerr << "Error: expected 3 or fewer comma-separated "
<< "arguments to -i option, got "
<< args.size() << endl;
throw 1;
}
// Interval-settings arguments
polstr += (";IVAL=" + args[0]); // Function type
if(args.size() > 1) {
polstr += ("," + args[1]); // Constant term
}
if(args.size() > 2) {
polstr += ("," + args[2]); // Coefficient
}
break;
}
case ARG_MULTISEED_IVAL: {
polstr += ";";
// Split argument by comma
EList<string> args;
tokenize(arg, ",", args);
if(args.size() > 5 || args.size() == 0) {
cerr << "Error: expected 5 or fewer comma-separated "
<< "arguments to --multiseed option, got "
<< args.size() << endl;
throw 1;
}
// Seed mm and length arguments
polstr += "SEED=";
polstr += (args[0]); // # mismatches
if(args.size() > 1) polstr += ("," + args[ 1]); // length
if(args.size() > 2) polstr += (";IVAL=" + args[2]); // Func type
if(args.size() > 3) polstr += ("," + args[ 3]); // Constant term
if(args.size() > 4) polstr += ("," + args[ 4]); // Coefficient
break;
}
case ARG_N_CEIL: {
// Split argument by comma
EList<string> args;
tokenize(arg, ",", args);
if(args.size() > 3) {
cerr << "Error: expected 3 or fewer comma-separated "
<< "arguments to --n-ceil option, got "
<< args.size() << endl;
throw 1;
}
if(args.size() == 0) {
cerr << "Error: expected at least one argument to --n-ceil option" << endl;
throw 1;
}
polstr += ";NCEIL=";
if(args.size() == 3) {
polstr += (args[0] + "," + args[1] + "," + args[2]);
} else {
if(args.size() == 1) {
polstr += ("C," + args[0]);
} else {
polstr += (args[0] + "," + args[1]);
}
}
break;
}
case ARG_SCORE_MA: polstr += ";MA="; polstr += arg; break;
case ARG_SCORE_MMP: {
EList<string> args;
tokenize(arg, ",", args);
if(args.size() > 2 || args.size() == 0) {
cerr << "Error: expected 1 or 2 comma-separated "
<< "arguments to --mp option, got " << args.size() << endl;
throw 1;
}
if(args.size() >= 1) {
polstr += ";MMP=Q,";
polstr += args[0];
if(args.size() >= 2) {
polstr += ",";
polstr += args[1];
}
}
break;
}
case ARG_SCORE_SCP: {
EList<string> args;
tokenize(arg, ",", args);
if(args.size() > 2 || args.size() == 0) {
cerr << "Error: expected 1 or 2 comma-separated "
<< "arguments to --sp option, got " << args.size() << endl;
throw 1;
}
if(args.size() >= 1) {
polstr += ";SCP=Q,";
polstr += args[0];
if(args.size() >= 2) {
polstr += ",";
polstr += args[1];
}
}
break;
}
case ARG_NO_SOFTCLIP: {
ostringstream convert;
convert << std::numeric_limits<int>::max();
polstr += ";SCP=Q,";
polstr += convert.str();
polstr += ",";
polstr += convert.str();
break;
}
case ARG_SCORE_NP: polstr += ";NP=C"; polstr += arg; break;
case ARG_SCORE_RDG: polstr += ";RDG="; polstr += arg; break;
case ARG_SCORE_RFG: polstr += ";RFG="; polstr += arg; break;
case ARG_SCORE_MIN: {
polstr += ";";
EList<string> args;
tokenize(arg, ",", args);
if(args.size() > 3 && args.size() == 0) {
cerr << "Error: expected 3 or fewer comma-separated "
<< "arguments to --n-ceil option, got "
<< args.size() << endl;
throw 1;
}
polstr += ("MIN=" + args[0]);
if(args.size() > 1) {
polstr += ("," + args[1]);
}
if(args.size() > 2) {
polstr += ("," + args[2]);
}
break;
}
case ARG_DESC: printArgDesc(cout); throw 0;
case 'S': outfile = arg; break;
case 'U': {
EList<string> args;
tokenize(arg, ",", args);
for(size_t i = 0; i < args.size(); i++) {
queries.push_back(args[i]);
}
break;
}
case ARG_VERSION: showVersion = 1; break;
case ARG_NO_TEMPSPLICESITE: useTempSpliceSite = false; break;
case ARG_PEN_CANSPLICE: {
penCanSplice = parseInt(0, "--pen-cansplice arg must be at least 0", arg);
break;
}
case ARG_PEN_NONCANSPLICE: {
penNoncanSplice = parseInt(0, "--pen-noncansplice arg must be at least 0", arg);
break;
}
case ARG_PEN_CONFLICTSPLICE: {
penConflictSplice = parseInt(0, "--pen-conflictsplice arg must be at least 0", arg);
break;
}
case ARG_PEN_CANINTRONLEN: {
polstr += ";";
EList<string> args;
tokenize(arg, ",", args);
if(args.size() > 3 && args.size() == 0) {
cerr << "Error: expected 3 or fewer comma-separated "
<< "arguments to --n-ceil option, got "
<< args.size() << endl;
throw 1;
}
polstr += ("CANINTRONLEN=" + args[0]);
if(args.size() > 1) {
polstr += ("," + args[1]);
}
if(args.size() > 2) {
polstr += ("," + args[2]);
}
break;
}
case ARG_PEN_NONCANINTRONLEN: {
polstr += ";";
EList<string> args;
tokenize(arg, ",", args);
if(args.size() > 3 && args.size() == 0) {
cerr << "Error: expected 3 or fewer comma-separated "
<< "arguments to --n-ceil option, got "
<< args.size() << endl;
throw 1;
}
polstr += ("NONCANINTRONLEN=" + args[0]);
if(args.size() > 1) {
polstr += ("," + args[1]);
}
if(args.size() > 2) {
polstr += ("," + args[2]);
}
break;
}
case ARG_MIN_INTRONLEN: {
minIntronLen = parseInt(20, "--min-intronlen arg must be at least 20", arg);
break;
}
case ARG_MAX_INTRONLEN: {
maxIntronLen = parseInt(20, "--max-intronlen arg must be at least 20", arg);
break;
}
case ARG_KNOWN_SPLICESITE_INFILE: knownSpliceSiteInfile = arg; break;
case ARG_NOVEL_SPLICESITE_INFILE: novelSpliceSiteInfile = arg; break;
case ARG_NOVEL_SPLICESITE_OUTFILE: novelSpliceSiteOutfile = arg; break;
case ARG_SECONDARY: secondary = true; break;
case ARG_NO_SPLICED_ALIGNMENT: no_spliced_alignment = true; break;
case ARG_RNA_STRANDNESS: {
string strandness = arg;
if(strandness == "F") rna_strandness = RNA_STRANDNESS_F;
else if(strandness == "R") rna_strandness = RNA_STRANDNESS_R;
else if(strandness == "FR") rna_strandness = RNA_STRANDNESS_FR;
else if(strandness == "RF") rna_strandness = RNA_STRANDNESS_RF;
else {
cerr << "Error: should be one of F, R, FR, or RF " << endl;
throw 1;
}
break;
}
case ARG_SPLICESITE_DB_ONLY: {
splicesite_db_only = true;
break;
}
case ARG_NO_ANCHORSTOP: {
anchorStop = false;
break;
}
case ARG_TRANSCRIPTOME_MAPPING_ONLY: {
tranMapOnly = true;
break;
}
case ARG_TRANSCRIPTOME_ASSEMBLY: {
tranAssm = true;
break;
}
case ARG_TRANSCRIPTOME_ASSEMBLY_CUFFLINKS: {
tranAssm = true;
tranAssm_program = "cufflinks";
break;
}
case ARG_AVOID_PSEUDOGENE: {
avoid_pseudogene = true;
break;
}
#ifdef USE_SRA
case ARG_SRA_ACC: {
tokenize(arg, ",", sra_accs); format = SRA_FASTA;
break;
}
#endif
case ARG_REMOVE_CHRNAME: {
rmChrName = true;
break;
}
case ARG_ADD_CHRNAME: {
addChrName = true;
break;
}
case ARG_MAX_ALTSTRIED: {
max_alts_tried = parseInt(8, "--max-altstried arg must be at least 8", arg);
break;
}
case ARG_HAPLOTYPE: {
use_haplotype = true;
break;
}
case ARG_CODIS: {
enable_codis = true;
break;
}
case ARG_NO_TEMPLATELEN_ADJUSTMENT: {
templateLenAdjustment = false;
break;
}
case ARG_SUMMARY_FILE: {
alignSumFile = arg;
break;
}
case ARG_NEW_SUMMARY: {
newAlignSummary = true;
break;
}
case ARG_DP: {
bowtie2_dp = parseInt(0, "--bowtie2-dp arg must be 0, 1, or 2", arg);
break;
}
case ARG_REPEAT: {
repeat = true;
break;
}
case ARG_NO_REPEAT_INDEX: {
use_repeat_index = false;
break;
}
case ARG_READ_LENGTHS: {
EList<string> str_readLens;
tokenize(arg, ",", str_readLens);
for(size_t i = 0; i < str_readLens.size(); i++) {
int readLen = parseInt(0, "--read-lengths arg must be at least 0", str_readLens[i].c_str());
readLens.push_back(readLen);
}
readLens.sort();
break;
}
case ARG_BASE_CHANGE: {
// Split argument by comma
EList<string> args;
tokenize(arg, ",", args);
if(args.size() != 2) {
cerr << "Error: expected 2 comma-separated "
<< "arguments to --base-change option, got " << args.size() << endl;
throw 1;
}
base_change_entered = true;
usrInput_convertedFrom = toupper(args[0][0]);
usrInput_convertedTo = toupper(args[1][0]);
string s = "ACGT";
if ((s.find(usrInput_convertedFrom) == std::string::npos) || (s.find(usrInput_convertedTo) == std::string::npos)) {
cerr << "Please enter the nucleotide in 'ACGT' for --base-change option." << endl;
throw 1;
}
if (usrInput_convertedFrom == usrInput_convertedTo) {
cerr << "Please enter two different base for --base-change option. If you wish to align normal reads without nucleotide conversion, please use hisat2." << endl;
throw 1;
}
break;
}
case ARG_3N: {
threeN = true;
break;
}
case ARG_REPEAT_LIMIT: {
repeatLimit = parseInt(1, "--repeat-limit arg must be at least 1", arg);;
break;
}
case ARG_UNIQUE_ONLY: {
uniqueOutputOnly = true;
break;
}
case ARG_DIRECTIONAL: {
directional3NMapping = 1;
break;
}
case ARG_DIRECTIONAL_REVERSE: {
directional3NMapping = 2;
break;
}
default:
printUsage(cerr);
throw 1;
}
}
/**
 * Read command-line arguments and finalize the derived global settings.
 *
 * Steps, in order:
 *   1. Reset the saw_M / saw_a / saw_k flags and the preset list.
 *   2. Feed every getopt_long() option to parseOption(); once getopt is
 *      exhausted, continue with any pending entries of extra_opts.
 *   3. Apply the default preset and all explicitly requested presets to the
 *      policy string, then process any options the presets queued.
 *   4. Return early if only the version was requested.
 *   5. Validate the HISAT-3N specific options and set up the conversion
 *      tables and mapping cycles.
 *   6. Parse the policy string and apply the fast / sensitive /
 *      very-sensitive overrides.
 *   7. Sanity-check input file lists and a handful of numeric settings.
 *
 * @param argc  number of entries in argv
 * @param argv  argument vector, passed to getopt_long()
 * @throws int  1 when an argument combination is invalid (parseOption() may
 *              throw as well)
 */
static void parseOptions(int argc, const char **argv) {
    int option_index = 0;
    int next_option;
    saw_M = false;
    saw_a = false;
    saw_k = false;
    presetList.clear();
    if(startVerbose) { cerr << "Parsing options: "; logTime(cerr, true); }
    while(true) {
        next_option = getopt_long(
            argc, const_cast<char**>(argv),
            short_options, long_options, &option_index);
        const char * arg = optarg;
        if(next_option == EOF) {
            // getopt is done; keep going with queued extra options, if any
            if(extra_opts_cur < extra_opts.size()) {
                next_option = extra_opts[extra_opts_cur].first;
                arg = extra_opts[extra_opts_cur].second.c_str();
                extra_opts_cur++;
            } else {
                break;
            }
        }
        parseOption(next_option, arg);
    }
    // Now parse all the presets. Might want to pick which presets version to
    // use according to other parameters.
    auto_ptr<Presets> presets(new PresetsV0());
    // Apply default preset
    if(!defaultPreset.empty()) {
        polstr = applyPreset(defaultPreset, *presets.get()) + polstr;
    }
    // Apply specified presets
    for(size_t i = 0; i < presetList.size(); i++) {
        polstr += applyPreset(presetList[i], *presets.get());
    }
    // Process options that are still pending in extra_opts. Everything before
    // extra_opts_cur was already consumed by the loop above, so resume at the
    // cursor and advance it; each pending option is handled exactly once and
    // the list is never indexed past its end.
    for(; extra_opts_cur < extra_opts.size(); extra_opts_cur++) {
        next_option = extra_opts[extra_opts_cur].first;
        const char *arg = extra_opts[extra_opts_cur].second.c_str();
        parseOption(next_option, arg);
    }
    if (showVersion) {
        return;
    }
    // Remove initial semicolons
    while(!polstr.empty() && polstr[0] == ';') {
        polstr = polstr.substr(1);
    }
    if(gVerbose) {
        cerr << "Final policy string: '" << polstr.c_str() << "'" << endl;
    }
    // --3N and --base-change must be given together
    if (threeN && !base_change_entered) {
        cerr << "--base-change must be set for HISAT-3N" << endl;
        printUsage(cerr);
        throw 1;
    }
    if (!threeN && base_change_entered) {
        cerr << "Please do not use --base-change for HISAT2. To align nucleotide conversion reads, please use HISAT-3N" << endl;
        printUsage(cerr);
        throw 1;
    }
    if (threeN) {
        // Complements of the bases the user entered
        usrInput_convertedFromComplement = asc2dnacomp[usrInput_convertedFrom];
        usrInput_convertedToComplement = asc2dnacomp[usrInput_convertedTo];
        // Derive the conversion pair used internally, plus its complements
        getConversion(usrInput_convertedFrom, usrInput_convertedTo, hs3N_convertedFrom, hs3N_convertedTo);
        hs3N_convertedFromComplement = asc2dnacomp[hs3N_convertedFrom];
        hs3N_convertedToComplement = asc2dnacomp[hs3N_convertedTo];
        // Table 0 maps the converted-from base (upper and lower case) to the
        // code of the converted-to base; table 1 does the same for the
        // complementary pair.
        asc2dna_3N[0][hs3N_convertedFrom] = asc2dna[hs3N_convertedTo];
        asc2dna_3N[0][tolower(hs3N_convertedFrom)] = asc2dna[hs3N_convertedTo];
        asc2dna_3N[1][hs3N_convertedFromComplement] = asc2dna[hs3N_convertedToComplement];
        asc2dna_3N[1][tolower(hs3N_convertedFromComplement)] = asc2dna[hs3N_convertedToComplement];
        threeN_indexTags[0] += hs3N_convertedFrom;
        threeN_indexTags[0] += hs3N_convertedTo;
        threeN_indexTags[1] += hs3N_convertedFromComplement;
        threeN_indexTags[1] += hs3N_convertedToComplement;
        nMappingCycle = 4;
        // Select which of the four mapping cycles are enabled
        if (hs3N_convertedFrom == hs3N_convertedToComplement || directional3NMapping == 1) {
            mappingCycles[0] = true;
            mappingCycles[1] = true;
        }
        else if (directional3NMapping == 2) {
            mappingCycles[2] = true;
            mappingCycles[3] = true;
        }
        else {
            for (int i = 0; i < 4; i++){
                mappingCycles[i] = true;
            }
        }
    }
    else
    {
        nMappingCycle = 1;
        mappingCycles[0] = true;
    }
    size_t failStreakTmp = 0;
    SeedAlignmentPolicy::parseString(
        polstr,
        localAlign,
        noisyHpolymer,
        ignoreQuals,
        bonusMatchType,
        bonusMatch,
        penMmcType,
        penMmcMax,
        penMmcMin,
        penScMax,
        penScMin,
        penNType,
        penN,
        penRdGapConst,
        penRfGapConst,
        penRdGapLinear,
        penRfGapLinear,
        scoreMin,
        nCeil,
        penNCatPair,
        multiseedMms,
        multiseedLen,
        msIval,
        failStreakTmp,
        nSeedRounds,
        &penCanIntronLen,
        &penNoncanIntronLen);
    // A non-zero fail streak from the policy string overrides all three limits
    if(failStreakTmp > 0) {
        maxEeStreak = failStreakTmp;
        maxUgStreak = failStreakTmp;
        maxDpStreak = failStreakTmp;
    }
    // NOTE(review): the trailing "|| true" makes this condition always true,
    // so the else branch is currently dead code. Left as-is on purpose.
    if(saw_a || saw_k || true) {
        msample = false;
        mhits = 0;
    } else {
        assert_gt(mhits, 0);
        msample = true;
    }
    // Preset-style sensitivity switches
    if(fast) {
        use_repeat_index = false;
    } else if(sensitive) {
        if(bowtie2_dp == 0) {
            bowtie2_dp = 1;
        }
        if(khits < 10) {
            khits = 10;
            saw_k = true;
        }
        scoreMin.init(SIMPLE_FUNC_LINEAR, 0.0f, -0.5f);
    } else if(very_sensitive) {
        bowtie2_dp = 2;
        if(khits < 30) {
            khits = 30;
            saw_k = true;
        }
        scoreMin.init(SIMPLE_FUNC_LINEAR, 0.0f, -1.0f);
    }
    if(mates1.size() != mates2.size()) {
        cerr << "Error: " << mates1.size() << " mate files/sequences were specified with -1, but " << mates2.size() << endl
             << "mate files/sequences were specified with -2. The same number of mate files/" << endl
             << "sequences must be specified with -1 and -2." << endl;
        throw 1;
    }
    if(qualities.size() && format != FASTA) {
        cerr << "Error: one or more quality files were specified with -Q but -f was not" << endl
             << "enabled. -Q works only in combination with -f and -C." << endl;
        throw 1;
    }
    if(qualities1.size() && format != FASTA) {
        cerr << "Error: one or more quality files were specified with --Q1 but -f was not" << endl
             << "enabled. --Q1 works only in combination with -f and -C." << endl;
        throw 1;
    }
    if(qualities2.size() && format != FASTA) {
        cerr << "Error: one or more quality files were specified with --Q2 but -f was not" << endl
             << "enabled. --Q2 works only in combination with -f and -C." << endl;
        throw 1;
    }
    if(qualities1.size() > 0 && mates1.size() != qualities1.size()) {
        cerr << "Error: " << mates1.size() << " mate files/sequences were specified with -1, but " << qualities1.size() << endl
             << "quality files were specified with --Q1. The same number of mate and quality" << endl
             << "files must sequences must be specified with -1 and --Q1." << endl;
        throw 1;
    }
    if(qualities2.size() > 0 && mates2.size() != qualities2.size()) {
        cerr << "Error: " << mates2.size() << " mate files/sequences were specified with -2, but " << qualities2.size() << endl
             << "quality files were specified with --Q2. The same number of mate and quality" << endl
             << "files must sequences must be specified with -2 and --Q2." << endl;
        throw 1;
    }
    if(!rgs.empty() && rgid.empty()) {
        cerr << "Warning: --rg was specified without --rg-id also "
             << "being specified. @RG line is not printed unless --rg-id "
             << "is specified." << endl;
    }
    // Check for duplicate mate input files
    if(format != CMDLINE) {
        for(size_t i = 0; i < mates1.size(); i++) {
            for(size_t j = 0; j < mates2.size(); j++) {
                if(mates1[i] == mates2[j] && !gQuiet) {
                    cerr << "Warning: Same mate file \"" << mates1[i].c_str() << "\" appears as argument to both -1 and -2" << endl;
                }
            }
        }
    }
    // If both -s and -u are used, we need to adjust qUpto accordingly
    // since it uses rdid to know if we've reached the -u limit (and
    // rdids are all shifted up by skipReads characters).
    // The comparison skips the adjustment when the addition would wrap.
    if(qUpto + skipReads > qUpto) {
        qUpto += skipReads;
    }
    // --shmem takes precedence over --mm. Only the warning depends on quiet
    // mode; the override itself must not, otherwise --quiet would change
    // which memory mode ends up selected.
    if(useShmem && useMm) {
        if(!gQuiet) {
            cerr << "Warning: --shmem overrides --mm..." << endl;
        }
        useMm = false;
    }
    if(gGapBarrier < 1) {
        cerr << "Warning: --gbar was set less than 1 (=" << gGapBarrier
             << "); setting to 1 instead" << endl;
        gGapBarrier = 1;
    }
    // The number of seed mismatches must stay below the seed length
    if(multiseedMms >= multiseedLen) {
        assert_gt(multiseedLen, 0);
        cerr << "Warning: seed mismatches (" << multiseedMms
             << ") is not less than seed length (" << multiseedLen
             << "); setting mismatches to " << (multiseedLen-1)
             << " instead" << endl;
        multiseedMms = multiseedLen-1;
    }
    sam_print_zm = sam_print_zm && bowtie2p5;
#ifndef NDEBUG
    if(!gQuiet) {
        cerr << "Warning: Running in debug mode. Please use debug mode only "
             << "for diagnosing errors, and not for typical use of HISAT2."
             << endl;
    }
#endif
}
// File-scope C-string pointer, NULL until assigned. It is not assigned in
// this part of the file; presumably it receives the program name (argv[0])
// at startup -- TODO confirm against the driver code.
static const char *argv0 = NULL;
/// Allocate the factory that hands out PatternSourcePerThread objects
/// reading from the given paired pattern source, and return a pointer to
/// it. The caller receives a heap-allocated object. The tid argument is
/// accepted but not used here.
static PatternSourcePerThreadFactory*
createPatsrcFactory(PairedPatternSource& _patsrc, int tid) {
    PatternSourcePerThreadFactory* const factory =
        new WrappedPatternSourcePerThreadFactory(_patsrc);
    assert(factory != NULL);
    return factory;
}
// NOTE(review): this ORs the joinable and detached constants together; in
// POSIX these are alternative detach states, not combinable flags. Confirm
// whether the macro is still used anywhere before relying on it.
#define PTHREAD_ATTRS (PTHREAD_CREATE_JOINABLE | PTHREAD_CREATE_DETACHED)
// Offset type used to instantiate the index / database templates below.
typedef TIndexOffU index_t;
// 16-bit unsigned offset type (presumably for local indexes -- TODO confirm).
typedef uint16_t local_index_t;
// File-scope pointers to the objects used by the multiseed search. They are
// only declared here; assignment happens outside this part of the file.
static PairedPatternSource* multiseed_patsrc;
static HGFM<index_t>* multiseed_gfm;
static RFM<index_t>* multiseed_rgfm;
//static HGFM<index_t>* multiseed_gfms[2];
//static RFM<index_t>* multiseed_rgfms[2];
static Scoring* multiseed_sc;
static BitPairReference* multiseed_refs;
static BitPairReference* multiseed_rrefs;
//static BitPairReference* multiseed_refss[2];
//static BitPairReference* multiseed_rrefss[2];
static AlnSink<index_t>* multiseed_msink;
static OutFileBuf* multiseed_metricsOfb;
static SpliceSiteDB* ssdb;
static ALTDB<index_t>* altdb;
static RepeatDB<index_t>* repeatdb;
static ALTDB<index_t>* raltdb;
// Two-slot variants of the databases above for HISAT-3N (presumably one slot
// per conversion direction -- TODO confirm).
static ALTDB<index_t> *altdbs_3N[2];
static RepeatDB<index_t> *repeatdbs_3N[2];
static ALTDB<index_t> *raltdbs_3N[2];
static TranscriptomePolicy* multiseed_tpol;
static GraphPolicy* gpol;
/**
 * Holder for two parallel sets of read-only index pointers used by
 * HISAT-3N: for each slot i in {0, 1} it keeps a pointer to an HGFM, an RFM
 * and a BitPairReference. The class only stores the pointers; it has no
 * destructor and never frees what they point to.
 */
class reference3N {
public:
    const HGFM<index_t>* multiseed_gfm[2];
    const RFM<index_t>* multiseed_rgfm[2];
    const BitPairReference* multiseed_rrefs[2];

    /**
     * Start with every slot set to NULL so that an instance that has not
     * been load()-ed yet never exposes indeterminate pointers, regardless of
     * its storage duration.
     */
    reference3N() {
        for (int i = 0; i < 2; i++) {
            multiseed_gfm[i] = NULL;
            multiseed_rgfm[i] = NULL;
            multiseed_rrefs[i] = NULL;
        }
    }

    /**
     * Copy the first two pointers of each argument into the matching slots.
     *
     * @param gfms_3N   list whose elements 0 and 1 are read; it must hold at
     *                  least two entries
     * @param rgfms_3N  two-element array of RFM pointers
     * @param rrefss    two-element array of BitPairReference pointers
     */
    void load(EList<HGFM<index_t>* >& gfms_3N,
              RFM<index_t>* rgfms_3N[2],
              BitPairReference* rrefss[2]) {
        for (int i = 0; i < 2; i++) {
            multiseed_gfm[i] = gfms_3N[i];
            multiseed_rgfm[i] = rgfms_3N[i];
            multiseed_rrefs[i] = rrefss[i];
        }
    }
};
// Single instance defined at namespace scope in this file.
reference3N ref3N;
/**
 * Counters describing the work performed by the outer read alignment
 * loop.
 */
struct OuterLoopMetrics {

    /// A freshly constructed object starts with all counters at zero.
    OuterLoopMetrics() {
        reset();
    }

    /**
     * Zero every counter.
     */
    void reset() {
        reads   = 0;
        bases   = 0;
        srreads = 0;
        srbases = 0;
        freads  = 0;
        fbases  = 0;
        ureads  = 0;
        ubases  = 0;
    }

    /**
     * Add each counter of m to the corresponding counter of this object.
     * The additions run under a ThreadSafe guard constructed from mutex_m
     * and getLock. Per the original note, merge() is the only safe way to
     * update an OuterLoopMetrics that is shared by multiple threads.
     */
    void merge(
        const OuterLoopMetrics& m,
        bool getLock = false)
    {
        ThreadSafe guard(&mutex_m, getLock);
        reads   = reads   + m.reads;
        bases   = bases   + m.bases;
        srreads = srreads + m.srreads;
        srbases = srbases + m.srbases;
        freads  = freads  + m.freads;
        fbases  = fbases  + m.fbases;
        ureads  = ureads  + m.ureads;
        ubases  = ubases  + m.ubases;
    }

    uint64_t reads;   // total reads
    uint64_t bases;   // total bases
    uint64_t srreads; // same-read reads
    uint64_t srbases; // same-read bases
    uint64_t freads;  // filtered reads
    uint64_t fbases;  // filtered bases
    uint64_t ureads;  // unfiltered reads
    uint64_t ubases;  // unfiltered bases
    MUTEX_T mutex_m;  // handed to the ThreadSafe guard in merge()
};
/**
* Collection of all relevant performance metrics when aligning in
* multiseed mode.
*/
struct PerfMetrics {
PerfMetrics() : first(true) { reset(); }
/**
* Set all counters to 0.
*/
void reset() {
olm.reset();
sdm.reset();
wlm.reset();
swmSeed.reset();
swmMate.reset();
rpm.reset();
dpSse8Seed.reset(); // 8-bit SSE seed extensions
dpSse8Mate.reset(); // 8-bit SSE mate finds
dpSse16Seed.reset(); // 16-bit SSE seed extensions
dpSse16Mate.reset(); // 16-bit SSE mate finds
nbtfiltst = 0;
nbtfiltsc = 0;
nbtfiltdo = 0;
olmu.reset();
sdmu.reset();
wlmu.reset();
swmuSeed.reset();
swmuMate.reset();
rpmu.reset();
dpSse8uSeed.reset(); // 8-bit SSE seed extensions
dpSse8uMate.reset(); // 8-bit SSE mate finds
dpSse16uSeed.reset(); // 16-bit SSE seed extensions
dpSse16uMate.reset(); // 16-bit SSE mate finds
nbtfiltst_u = 0;
nbtfiltsc_u = 0;
nbtfiltdo_u = 0;
him.reset();
}
/**
* Merge a set of specific metrics into this object.
*/
void merge(
const OuterLoopMetrics *ol,
const SeedSearchMetrics *sd,
const WalkMetrics *wl,
const SwMetrics *swSeed,
const SwMetrics *swMate,
const ReportingMetrics *rm,
const SSEMetrics *dpSse8Ex,
const SSEMetrics *dpSse8Ma,
const SSEMetrics *dpSse16Ex,
const SSEMetrics *dpSse16Ma,
uint64_t nbtfiltst_,
uint64_t nbtfiltsc_,
uint64_t nbtfiltdo_,
const HIMetrics *hi,
bool getLock)
{
ThreadSafe ts(&mutex_m, getLock);
if(ol != NULL) {
olmu.merge(*ol, false);
}
if(sd != NULL) {
sdmu.merge(*sd, false);
}
if(wl != NULL) {
wlmu.merge(*wl, false);
}
if(swSeed != NULL) {
swmuSeed.merge(*swSeed, false);
}
if(swMate != NULL) {
swmuMate.merge(*swMate, false);
}
if(rm != NULL) {
rpmu.merge(*rm, false);
}
if(dpSse8Ex != NULL) {
dpSse8uSeed.merge(*dpSse8Ex, false);
}
if(dpSse8Ma != NULL) {
dpSse8uMate.merge(*dpSse8Ma, false);
}
if(dpSse16Ex != NULL) {
dpSse16uSeed.merge(*dpSse16Ex, false);
}
if(dpSse16Ma != NULL) {
dpSse16uMate.merge(*dpSse16Ma, false);
}
nbtfiltst_u += nbtfiltst_;
nbtfiltsc_u += nbtfiltsc_;
nbtfiltdo_u += nbtfiltdo_;
if(hi != NULL) {
him.merge(*hi, false);
}
}
/**
* Reports a matrix of results, incl. column labels, to an OutFileBuf.
* Optionally also sends results to stderr (unbuffered). Can optionally
* print a per-read record with the read name at the beginning.
*/
void reportInterval(
OutFileBuf* o, // file to send output to
bool metricsStderr, // additionally output to stderr?
bool total, // true -> report total, otherwise incremental
bool sync, // synchronize output
const BTString *name) // non-NULL name pointer if is per-read record
{
ThreadSafe ts(&mutex_m, sync);
ostringstream stderrSs;
time_t curtime = time(0);
char buf[1024];
if(first) {
const char *str =
/* 1 */ "Time" "\t"
/* 2 */ "Read" "\t"
/* 3 */ "Base" "\t"
/* 4 */ "SameRead" "\t"
/* 5 */ "SameReadBase" "\t"
/* 6 */ "UnfilteredRead" "\t"
/* 7 */ "UnfilteredBase" "\t"
/* 8 */ "Paired" "\t"
/* 9 */ "Unpaired" "\t"
/* 10 */ "AlConUni" "\t"
/* 11 */ "AlConRep" "\t"
/* 12 */ "AlConFail" "\t"
/* 13 */ "AlDis" "\t"
/* 14 */ "AlConFailUni" "\t"
/* 15 */ "AlConFailRep" "\t"
/* 16 */ "AlConFailFail" "\t"
/* 17 */ "AlConRepUni" "\t"
/* 18 */ "AlConRepRep" "\t"
/* 19 */ "AlConRepFail" "\t"
/* 20 */ "AlUnpUni" "\t"
/* 21 */ "AlUnpRep" "\t"
/* 22 */ "AlUnpFail" "\t"
/* 23 */ "SeedSearch" "\t"
/* 24 */ "IntraSCacheHit" "\t"
/* 25 */ "InterSCacheHit" "\t"
/* 26 */ "OutOfMemory" "\t"
/* 27 */ "AlBWOp" "\t"
/* 28 */ "AlBWBranch" "\t"
/* 29 */ "ResBWOp" "\t"
/* 30 */ "ResBWBranch" "\t"
/* 31 */ "ResResolve" "\t"
/* 34 */ "ResReport" "\t"
/* 35 */ "RedundantSHit" "\t"
/* 36 */ "BestMinEdit0" "\t"
/* 37 */ "BestMinEdit1" "\t"
/* 38 */ "BestMinEdit2" "\t"
/* 39 */ "ExactAttempts" "\t"
/* 40 */ "ExactSucc" "\t"
/* 41 */ "ExactRanges" "\t"
/* 42 */ "ExactRows" "\t"
/* 43 */ "ExactOOMs" "\t"
/* 44 */ "1mmAttempts" "\t"
/* 45 */ "1mmSucc" "\t"
/* 46 */ "1mmRanges" "\t"
/* 47 */ "1mmRows" "\t"
/* 48 */ "1mmOOMs" "\t"
/* 49 */ "UngappedSucc" "\t"
/* 50 */ "UngappedFail" "\t"
/* 51 */ "UngappedNoDec" "\t"
/* 52 */ "DPExLt10Gaps" "\t"
/* 53 */ "DPExLt5Gaps" "\t"
/* 54 */ "DPExLt3Gaps" "\t"
/* 55 */ "DPMateLt10Gaps" "\t"
/* 56 */ "DPMateLt5Gaps" "\t"
/* 57 */ "DPMateLt3Gaps" "\t"
/* 58 */ "DP16ExDps" "\t"
/* 59 */ "DP16ExDpSat" "\t"
/* 60 */ "DP16ExDpFail" "\t"
/* 61 */ "DP16ExDpSucc" "\t"
/* 62 */ "DP16ExCol" "\t"
/* 63 */ "DP16ExCell" "\t"
/* 64 */ "DP16ExInner" "\t"
/* 65 */ "DP16ExFixup" "\t"
/* 66 */ "DP16ExGathSol" "\t"
/* 67 */ "DP16ExBt" "\t"
/* 68 */ "DP16ExBtFail" "\t"
/* 69 */ "DP16ExBtSucc" "\t"
/* 70 */ "DP16ExBtCell" "\t"
/* 71 */ "DP16ExCoreRej" "\t"
/* 72 */ "DP16ExNRej" "\t"
/* 73 */ "DP8ExDps" "\t"
/* 74 */ "DP8ExDpSat" "\t"
/* 75 */ "DP8ExDpFail" "\t"
/* 76 */ "DP8ExDpSucc" "\t"
/* 77 */ "DP8ExCol" "\t"
/* 78 */ "DP8ExCell" "\t"
/* 79 */ "DP8ExInner" "\t"
/* 80 */ "DP8ExFixup" "\t"
/* 81 */ "DP8ExGathSol" "\t"
/* 82 */ "DP8ExBt" "\t"
/* 83 */ "DP8ExBtFail" "\t"
/* 84 */ "DP8ExBtSucc" "\t"
/* 85 */ "DP8ExBtCell" "\t"
/* 86 */ "DP8ExCoreRej" "\t"
/* 87 */ "DP8ExNRej" "\t"
/* 88 */ "DP16MateDps" "\t"
/* 89 */ "DP16MateDpSat" "\t"
/* 90 */ "DP16MateDpFail" "\t"
/* 91 */ "DP16MateDpSucc" "\t"
/* 92 */ "DP16MateCol" "\t"
/* 93 */ "DP16MateCell" "\t"
/* 94 */ "DP16MateInner" "\t"
/* 95 */ "DP16MateFixup" "\t"
/* 96 */ "DP16MateGathSol" "\t"
/* 97 */ "DP16MateBt" "\t"
/* 98 */ "DP16MateBtFail" "\t"
/* 99 */ "DP16MateBtSucc" "\t"
/* 100 */ "DP16MateBtCell" "\t"
/* 101 */ "DP16MateCoreRej" "\t"
/* 102 */ "DP16MateNRej" "\t"
/* 103 */ "DP8MateDps" "\t"
/* 104 */ "DP8MateDpSat" "\t"
/* 105 */ "DP8MateDpFail" "\t"
/* 106 */ "DP8MateDpSucc" "\t"
/* 107 */ "DP8MateCol" "\t"
/* 108 */ "DP8MateCell" "\t"
/* 109 */ "DP8MateInner" "\t"
/* 110 */ "DP8MateFixup" "\t"
/* 111 */ "DP8MateGathSol" "\t"
/* 112 */ "DP8MateBt" "\t"
/* 113 */ "DP8MateBtFail" "\t"
/* 114 */ "DP8MateBtSucc" "\t"
/* 115 */ "DP8MateBtCell" "\t"
/* 116 */ "DP8MateCoreRej" "\t"
/* 117 */ "DP8MateNRej" "\t"
/* 118 */ "DPBtFiltStart" "\t"
/* 119 */ "DPBtFiltScore" "\t"
/* 120 */ "DpBtFiltDom" "\t"
/* 121 */ "MemPeak" "\t"
/* 122 */ "UncatMemPeak" "\t" // 0
/* 123 */ "EbwtMemPeak" "\t" // EBWT_CAT
/* 124 */ "CacheMemPeak" "\t" // CA_CAT
/* 125 */ "ResolveMemPeak" "\t" // GW_CAT
/* 126 */ "AlignMemPeak" "\t" // AL_CAT
/* 127 */ "DPMemPeak" "\t" // DP_CAT
/* 128 */ "MiscMemPeak" "\t" // MISC_CAT
/* 129 */ "DebugMemPeak" "\t" // DEBUG_CAT
/* 130 */ "LocalSearch" "\t"
/* 131 */ "AnchorSearch" "\t"
/* 132 */ "LocalIndexSearch" "\t"
/* 133 */ "LocalExtSearch" "\t"
/* 134 */ "LocalSearchRecur" "\t"
/* 135 */ "GlobalGenomeCoords" "\t"
/* 136 */ "LocalGenomeCoords" "\t"
"\n";
if(name != NULL) {
if(o != NULL) o->writeChars("Name\t");
if(metricsStderr) stderrSs << "Name\t";
}
if(o != NULL) o->writeChars(str);
if(metricsStderr) stderrSs << str;
first = false;
}
if(total) mergeIncrementals();
// 0. Read name, if needed
if(name != NULL) {
if(o != NULL) {
o->writeChars(name->toZBuf());
o->write('\t');
}
if(metricsStderr) {
stderrSs << (*name) << '\t';
}
}
// 1. Current time in secs
itoa10<time_t>(curtime, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
const OuterLoopMetrics& ol = total ? olm : olmu;
// 2. Reads
itoa10<uint64_t>(ol.reads, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 3. Bases
itoa10<uint64_t>(ol.bases, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 4. Same-read reads
itoa10<uint64_t>(ol.srreads, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 5. Same-read bases
itoa10<uint64_t>(ol.srbases, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 6. Unfiltered reads
itoa10<uint64_t>(ol.ureads, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 7. Unfiltered bases
itoa10<uint64_t>(ol.ubases, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
const ReportingMetrics& rp = total ? rpm : rpmu;
// 8. Paired reads
itoa10<uint64_t>(rp.npaired, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 9. Unpaired reads
itoa10<uint64_t>(rp.nunpaired, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 10. Pairs with unique concordant alignments
itoa10<uint64_t>(rp.nconcord_uni, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 11. Pairs with repetitive concordant alignments
itoa10<uint64_t>(rp.nconcord_rep, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 12. Pairs with 0 concordant alignments
itoa10<uint64_t>(rp.nconcord_0, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 13. Pairs with 1 discordant alignment
itoa10<uint64_t>(rp.ndiscord, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 14. Mates from unaligned pairs that align uniquely
itoa10<uint64_t>(rp.nunp_0_uni, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 15. Mates from unaligned pairs that align repetitively
itoa10<uint64_t>(rp.nunp_0_rep, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 16. Mates from unaligned pairs that fail to align
itoa10<uint64_t>(rp.nunp_0_0, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 17. Mates from repetitive pairs that align uniquely
itoa10<uint64_t>(rp.nunp_rep_uni, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 18. Mates from repetitive pairs that align repetitively
itoa10<uint64_t>(rp.nunp_rep_rep, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 19. Mates from repetitive pairs that fail to align
itoa10<uint64_t>(rp.nunp_rep_0, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 20. Unpaired reads that align uniquely
itoa10<uint64_t>(rp.nunp_uni, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 21. Unpaired reads that align repetitively
itoa10<uint64_t>(rp.nunp_rep, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 22. Unpaired reads that fail to align
itoa10<uint64_t>(rp.nunp_0, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
const SeedSearchMetrics& sd = total ? sdm : sdmu;
// 23. Seed searches
itoa10<uint64_t>(sd.seedsearch, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 24. Hits in 'current' cache
itoa10<uint64_t>(sd.intrahit, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 25. Hits in 'local' cache
itoa10<uint64_t>(sd.interhit, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 26. Out of memory
itoa10<uint64_t>(sd.ooms, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 27. Burrows-Wheeler ops in aligner
itoa10<uint64_t>(sd.bwops, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 28. Burrows-Wheeler branches (edits) in aligner
itoa10<uint64_t>(sd.bweds, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
const WalkMetrics& wl = total ? wlm : wlmu;
// 29. Burrows-Wheeler ops in resolver
itoa10<uint64_t>(wl.bwops, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 30. Burrows-Wheeler branches in resolver
itoa10<uint64_t>(wl.branches, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 31. Burrows-Wheeler offset resolutions
itoa10<uint64_t>(wl.resolves, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 34. Offset reports
itoa10<uint64_t>(wl.reports, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 35. Redundant seed hit
itoa10<uint64_t>(total ? swmSeed.rshit : swmuSeed.rshit, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 36. # times the best (out of fw/rc) minimum # edits was 0
itoa10<uint64_t>(total ? sdm.bestmin0 : sdmu.bestmin0, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 37. # times the best (out of fw/rc) minimum # edits was 1
itoa10<uint64_t>(total ? sdm.bestmin1 : sdmu.bestmin1, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 38. # times the best (out of fw/rc) minimum # edits was 2
itoa10<uint64_t>(total ? sdm.bestmin2 : sdmu.bestmin2, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 39. Exact aligner attempts
itoa10<uint64_t>(total ? swmSeed.exatts : swmuSeed.exatts, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 40. Exact aligner successes
itoa10<uint64_t>(total ? swmSeed.exsucc : swmuSeed.exsucc, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 41. Exact aligner ranges
itoa10<uint64_t>(total ? swmSeed.exranges : swmuSeed.exranges, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 42. Exact aligner rows
itoa10<uint64_t>(total ? swmSeed.exrows : swmuSeed.exrows, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 43. Exact aligner OOMs
itoa10<uint64_t>(total ? swmSeed.exooms : swmuSeed.exooms, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 44. 1mm aligner attempts
itoa10<uint64_t>(total ? swmSeed.mm1atts : swmuSeed.mm1atts, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 45. 1mm aligner successes
itoa10<uint64_t>(total ? swmSeed.mm1succ : swmuSeed.mm1succ, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 46. 1mm aligner ranges
itoa10<uint64_t>(total ? swmSeed.mm1ranges : swmuSeed.mm1ranges, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 47. 1mm aligner rows
itoa10<uint64_t>(total ? swmSeed.mm1rows : swmuSeed.mm1rows, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 48. 1mm aligner OOMs
itoa10<uint64_t>(total ? swmSeed.mm1ooms : swmuSeed.mm1ooms, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 49 Ungapped aligner success
itoa10<uint64_t>(total ? swmSeed.ungapsucc : swmuSeed.ungapsucc, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 50. Ungapped aligner fail
itoa10<uint64_t>(total ? swmSeed.ungapfail : swmuSeed.ungapfail, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 51. Ungapped aligner no decision
itoa10<uint64_t>(total ? swmSeed.ungapnodec : swmuSeed.ungapnodec, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 52. # seed-extend DPs with < 10 gaps
itoa10<uint64_t>(total ? swmSeed.sws10 : swmuSeed.sws10, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 53. # seed-extend DPs with < 5 gaps
itoa10<uint64_t>(total ? swmSeed.sws5 : swmuSeed.sws5, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 54. # seed-extend DPs with < 3 gaps
itoa10<uint64_t>(total ? swmSeed.sws3 : swmuSeed.sws3, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 55. # mate-finding DPs with < 10 gaps
itoa10<uint64_t>(total ? swmMate.sws10 : swmuMate.sws10, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 56. # mate-finding DPs with < 5 gaps
itoa10<uint64_t>(total ? swmMate.sws5 : swmuMate.sws5, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 57. # mate-finding DPs with < 3 gaps
itoa10<uint64_t>(total ? swmMate.sws3 : swmuMate.sws3, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
const SSEMetrics& dpSse16s = total ? dpSse16Seed : dpSse16uSeed;
// 58. 16-bit SSE seed-extend DPs tried
itoa10<uint64_t>(dpSse16s.dp, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 59. 16-bit SSE seed-extend DPs saturated
itoa10<uint64_t>(dpSse16s.dpsat, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 60. 16-bit SSE seed-extend DPs failed
itoa10<uint64_t>(dpSse16s.dpfail, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 61. 16-bit SSE seed-extend DPs succeeded
itoa10<uint64_t>(dpSse16s.dpsucc, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 62. 16-bit SSE seed-extend DP columns completed
itoa10<uint64_t>(dpSse16s.col, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 63. 16-bit SSE seed-extend DP cells completed
itoa10<uint64_t>(dpSse16s.cell, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 64. 16-bit SSE seed-extend DP inner loop iters completed
itoa10<uint64_t>(dpSse16s.inner, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 65. 16-bit SSE seed-extend DP fixup loop iters completed
itoa10<uint64_t>(dpSse16s.fixup, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 66. 16-bit SSE seed-extend DP gather, cells with potential solutions
itoa10<uint64_t>(dpSse16s.gathsol, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 67. 16-bit SSE seed-extend DP backtrace attempts
itoa10<uint64_t>(dpSse16s.bt, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 68. 16-bit SSE seed-extend DP failed backtrace attempts
itoa10<uint64_t>(dpSse16s.btfail, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 69. 16-bit SSE seed-extend DP successful backtrace attempts
itoa10<uint64_t>(dpSse16s.btsucc, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 70. 16-bit SSE seed-extend DP backtrace cells
itoa10<uint64_t>(dpSse16s.btcell, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 71. 16-bit SSE seed-extend DP core-diag rejections
itoa10<uint64_t>(dpSse16s.corerej, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 72. 16-bit SSE seed-extend DP N rejections
itoa10<uint64_t>(dpSse16s.nrej, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
const SSEMetrics& dpSse8s = total ? dpSse8Seed : dpSse8uSeed;
// 73. 8-bit SSE seed-extend DPs tried
itoa10<uint64_t>(dpSse8s.dp, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 74. 8-bit SSE seed-extend DPs saturated
itoa10<uint64_t>(dpSse8s.dpsat, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 75. 8-bit SSE seed-extend DPs failed
itoa10<uint64_t>(dpSse8s.dpfail, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 76. 8-bit SSE seed-extend DPs succeeded
itoa10<uint64_t>(dpSse8s.dpsucc, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 77. 8-bit SSE seed-extend DP columns completed
itoa10<uint64_t>(dpSse8s.col, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 78. 8-bit SSE seed-extend DP cells completed
itoa10<uint64_t>(dpSse8s.cell, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 79. 8-bit SSE seed-extend DP inner loop iters completed
itoa10<uint64_t>(dpSse8s.inner, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 80. 8-bit SSE seed-extend DP fixup loop iters completed
itoa10<uint64_t>(dpSse8s.fixup, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 81. 8-bit SSE seed-extend DP gather, cells with potential solutions
itoa10<uint64_t>(dpSse8s.gathsol, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 82. 8-bit SSE seed-extend DP backtrace attempts
itoa10<uint64_t>(dpSse8s.bt, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 83. 8-bit SSE seed-extend DP failed backtrace attempts
itoa10<uint64_t>(dpSse8s.btfail, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 84. 8-bit SSE seed-extend DP successful backtrace attempts
itoa10<uint64_t>(dpSse8s.btsucc, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 85. 8-bit SSE seed-extend DP backtrace cells
itoa10<uint64_t>(dpSse8s.btcell, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 86. 8-bit SSE seed-extend DP core-diag rejections
itoa10<uint64_t>(dpSse8s.corerej, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 87. 8-bit SSE seed-extend DP N rejections
itoa10<uint64_t>(dpSse8s.nrej, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
const SSEMetrics& dpSse16m = total ? dpSse16Mate : dpSse16uMate;
// 88. 16-bit SSE mate-finding DPs tried
itoa10<uint64_t>(dpSse16m.dp, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 89. 16-bit SSE mate-finding DPs saturated
itoa10<uint64_t>(dpSse16m.dpsat, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 90. 16-bit SSE mate-finding DPs failed
itoa10<uint64_t>(dpSse16m.dpfail, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 91. 16-bit SSE mate-finding DPs succeeded
itoa10<uint64_t>(dpSse16m.dpsucc, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 92. 16-bit SSE mate-finding DP columns completed
itoa10<uint64_t>(dpSse16m.col, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 93. 16-bit SSE mate-finding DP cells completed
itoa10<uint64_t>(dpSse16m.cell, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 94. 16-bit SSE mate-finding DP inner loop iters completed
itoa10<uint64_t>(dpSse16m.inner, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 95. 16-bit SSE mate-finding DP fixup loop iters completed
itoa10<uint64_t>(dpSse16m.fixup, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 96. 16-bit SSE mate-finding DP gather, cells with potential solutions
itoa10<uint64_t>(dpSse16m.gathsol, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 97. 16-bit SSE mate-finding DP backtrace attempts
itoa10<uint64_t>(dpSse16m.bt, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 98. 16-bit SSE mate-finding DP failed backtrace attempts
itoa10<uint64_t>(dpSse16m.btfail, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 99. 16-bit SSE mate-finding DP successful backtrace attempts
itoa10<uint64_t>(dpSse16m.btsucc, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 100. 16-bit SSE mate-finding DP backtrace cells
itoa10<uint64_t>(dpSse16m.btcell, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 101. 16-bit SSE mate-finding DP core-diag rejections
itoa10<uint64_t>(dpSse16m.corerej, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 102. 16-bit SSE mate-finding DP N rejections
itoa10<uint64_t>(dpSse16m.nrej, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
const SSEMetrics& dpSse8m = total ? dpSse8Mate : dpSse8uMate;
// 103. 8-bit SSE mate-finding DPs tried
itoa10<uint64_t>(dpSse8m.dp, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 104. 8-bit SSE mate-finding DPs saturated
itoa10<uint64_t>(dpSse8m.dpsat, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 105. 8-bit SSE mate-finding DPs failed
itoa10<uint64_t>(dpSse8m.dpfail, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 106. 8-bit SSE mate-finding DPs succeeded
itoa10<uint64_t>(dpSse8m.dpsucc, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 107. 8-bit SSE mate-finding DP columns completed
itoa10<uint64_t>(dpSse8m.col, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 108. 8-bit SSE mate-finding DP cells completed
itoa10<uint64_t>(dpSse8m.cell, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 109. 8-bit SSE mate-finding DP inner loop iters completed
itoa10<uint64_t>(dpSse8m.inner, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 110. 8-bit SSE mate-finding DP fixup loop iters completed
itoa10<uint64_t>(dpSse8m.fixup, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 111. 8-bit SSE mate-finding DP gather, cells with potential solutions
itoa10<uint64_t>(dpSse8m.gathsol, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 112. 8-bit SSE mate-finding DP backtrace attempts
itoa10<uint64_t>(dpSse8m.bt, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 113. 8-bit SSE mate-finding DP failed backtrace attempts
itoa10<uint64_t>(dpSse8m.btfail, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 114. 8-bit SSE mate-finding DP successful backtrace attempts
itoa10<uint64_t>(dpSse8m.btsucc, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 115. 8-bit SSE mate-finding DP backtrace cells
itoa10<uint64_t>(dpSse8m.btcell, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 116. 8-bit SSE mate-finding DP core rejections
itoa10<uint64_t>(dpSse8m.corerej, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 117. 8-bit SSE mate-finding N rejections
itoa10<uint64_t>(dpSse8m.nrej, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 118. Backtrace candidates filtered due to starting cell
itoa10<uint64_t>(total ? nbtfiltst : nbtfiltst_u, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 119. Backtrace candidates filtered due to low score
itoa10<uint64_t>(total ? nbtfiltsc : nbtfiltsc_u, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 120. Backtrace candidates filtered due to domination
itoa10<uint64_t>(total ? nbtfiltdo : nbtfiltdo_u, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 121. Overall memory peak
itoa10<size_t>(gMemTally.peak() >> 20, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 122. Uncategorized memory peak
itoa10<size_t>(gMemTally.peak(0) >> 20, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 123. Ebwt memory peak
itoa10<size_t>(gMemTally.peak(EBWT_CAT) >> 20, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 124. Cache memory peak
itoa10<size_t>(gMemTally.peak(CA_CAT) >> 20, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 125. Resolver memory peak
itoa10<size_t>(gMemTally.peak(GW_CAT) >> 20, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 126. Seed aligner memory peak
itoa10<size_t>(gMemTally.peak(AL_CAT) >> 20, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 127. Dynamic programming aligner memory peak
itoa10<size_t>(gMemTally.peak(DP_CAT) >> 20, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 128. Miscellaneous memory peak
itoa10<size_t>(gMemTally.peak(MISC_CAT) >> 20, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 129. Debug memory peak
itoa10<size_t>(gMemTally.peak(DEBUG_CAT) >> 20, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 130
itoa10<size_t>(him.localatts, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 131
itoa10<size_t>(him.anchoratts, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 132
itoa10<size_t>(him.localindexatts, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 133
itoa10<size_t>(him.localextatts, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 134
itoa10<size_t>(him.localsearchrecur, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 135
itoa10<size_t>(him.globalgenomecoords, buf);
if(metricsStderr) stderrSs << buf << '\t';
if(o != NULL) { o->writeChars(buf); o->write('\t'); }
// 136
itoa10<size_t>(him.localgenomecoords, buf);
if(metricsStderr) stderrSs << buf;
if(o != NULL) { o->writeChars(buf); }
if(o != NULL) { o->write('\n'); }
if(metricsStderr) cerr << stderrSs.str().c_str() << endl;
if(!total) mergeIncrementals();
}
/**
 * Fold the "since the last update" metrics (the members whose names carry a
 * 'u' / '_u' marker) into the job-wide totals, then clear the incremental
 * members so that the next reporting interval starts from zero.
 *
 * Takes no arguments and returns nothing; it only reads and writes members
 * of this object.
 */
void mergeIncrementals() {
	// Structured metrics: total.merge(incremental, false).
	olm.merge(olmu, false);
	sdm.merge(sdmu, false);
	wlm.merge(wlmu, false);
	swmSeed.merge(swmuSeed, false);
	swmMate.merge(swmuMate, false);
	dpSse8Seed.merge(dpSse8uSeed, false);
	dpSse8Mate.merge(dpSse8uMate, false);
	dpSse16Seed.merge(dpSse16uSeed, false);
	dpSse16Mate.merge(dpSse16uMate, false);
	// Plain counters: the totals must absorb the incrementals, not the other
	// way round, because the *_u counters are zeroed at the end of this
	// function. Adding in the opposite direction would discard the counts
	// and leave the totals permanently unchanged.
	nbtfiltst += nbtfiltst_u;
	nbtfiltsc += nbtfiltsc_u;
	nbtfiltdo += nbtfiltdo_u;
	// NOTE(review): rpmu is reset below but is never merged into rpm in this
	// function -- confirm whether rpm is accumulated somewhere else.

	// Clear every incremental member.
	olmu.reset();
	sdmu.reset();
	wlmu.reset();
	swmuSeed.reset();
	swmuMate.reset();
	rpmu.reset();
	dpSse8uSeed.reset();
	dpSse8uMate.reset();
	dpSse16uSeed.reset();
	dpSse16uMate.reset();
	nbtfiltst_u = 0;
	nbtfiltsc_u = 0;
	nbtfiltdo_u = 0;
}
// ---------------------------------------------------------------------------
// Data members.  Every "total" member below has an incremental counterpart
// (named with a 'u' / '_u' marker) further down; mergeIncrementals() folds the
// incrementals into the totals and then clears the incrementals.
// ---------------------------------------------------------------------------
// Total over the whole job
OuterLoopMetrics olm; // overall metrics
SeedSearchMetrics sdm; // metrics related to seed alignment
WalkMetrics wlm; // metrics related to walking left (i.e. resolving reference offsets)
SwMetrics swmSeed; // metrics related to DP seed-extend alignment
SwMetrics swmMate; // metrics related to DP mate-finding alignment
ReportingMetrics rpm; // metrics related to reporting
SSEMetrics dpSse8Seed; // 8-bit SSE seed extensions
SSEMetrics dpSse8Mate; // 8-bit SSE mate finds
SSEMetrics dpSse16Seed; // 16-bit SSE seed extensions
SSEMetrics dpSse16Mate; // 16-bit SSE mate finds
uint64_t nbtfiltst; // backtrace candidates filtered due to starting cell
uint64_t nbtfiltsc; // backtrace candidates filtered due to low score
uint64_t nbtfiltdo; // backtrace candidates filtered due to domination
// Just since the last update
OuterLoopMetrics olmu; // overall metrics
SeedSearchMetrics sdmu; // metrics related to seed alignment
WalkMetrics wlmu; // metrics related to walking left (i.e. resolving reference offsets)
SwMetrics swmuSeed; // metrics related to DP seed-extend alignment
SwMetrics swmuMate; // metrics related to DP mate-finding alignment
ReportingMetrics rpmu; // metrics related to reporting
SSEMetrics dpSse8uSeed; // 8-bit SSE seed extensions
SSEMetrics dpSse8uMate; // 8-bit SSE mate finds
SSEMetrics dpSse16uSeed; // 16-bit SSE seed extensions
SSEMetrics dpSse16uMate; // 16-bit SSE mate finds
uint64_t nbtfiltst_u; // incremental counterpart of nbtfiltst
uint64_t nbtfiltsc_u; // incremental counterpart of nbtfiltsc
uint64_t nbtfiltdo_u; // incremental counterpart of nbtfiltdo
// HISAT-specific counters (localatts, anchoratts, localindexatts, ...); these
// are printed as the final columns of each metrics line.  There is a single
// instance: no separate total/incremental pair is declared for it here.
HIMetrics him;
// The inherited comment on this member was cut off ("lock for when one ob").
// NOTE(review): presumably guards this object when several threads merge
// into it -- confirm against merge().
MUTEX_T mutex_m; // lock for when one ob
bool first; // yet to print first line?
time_t lastElapsed; // used in reportInterval to measure time since last call
};
// File-scope metrics object.  multiseedSearchWorker_hisat2() merges its
// per-thread counters into it through MERGE_METRICS and calls
// reportInterval() on it to emit periodic progress lines.
static PerfMetrics metrics;
// Cyclic rotations of a 32-bit value.
//
// Argument order is (n, x): rotate the value x by n bit positions.  x should
// have an unsigned 32-bit type for the result to be a true rotation.
//
// Every use of a macro argument is parenthesised so that expression arguments
// such as ROTL(a + b, x) expand correctly, and both shift counts are reduced
// modulo 32 so that n == 0 or n >= 32 never produces an out-of-range shift
// (which is undefined behaviour in C++).  For 1 <= n <= 31 the result is the
// same as the plain (x << n) | (x >> (32 - n)) formulation.
#define ROTL(n, x) (((x) << ((n) & 31)) | ((x) >> ((32 - ((n) & 31)) & 31)))
#define ROTR(n, x) (((x) >> ((n) & 31)) | ((x) << ((32 - ((n) & 31)) & 31)))
static inline void printMmsSkipMsg(
const PatternSourcePerThread& ps,
bool paired,
bool mate1,
int seedmms)
{
ostringstream os;
if(paired) {
os << "Warning: skipping mate #" << (mate1 ? '1' : '2')
<< " of read '" << (mate1 ? ps.bufa().name : ps.bufb().name)
<< "' because length (" << (mate1 ? ps.bufa().patFw.length() : ps.bufb().patFw.length())
<< ") <= # seed mismatches (" << seedmms << ")" << endl;
} else {
os << "Warning: skipping read '" << (mate1 ? ps.bufa().name : ps.bufb().name)
<< "' because length (" << (mate1 ? ps.bufa().patFw.length() : ps.bufb().patFw.length())
<< ") <= # seed mismatches (" << seedmms << ")" << endl;
}
cerr << os.str().c_str();
}
static inline void printLenSkipMsg(
const PatternSourcePerThread& ps,
bool paired,
bool mate1)
{
ostringstream os;
if(paired) {
os << "Warning: skipping mate #" << (mate1 ? '1' : '2')
<< " of read '" << (mate1 ? ps.bufa().name : ps.bufb().name)
<< "' because it was < 2 characters long" << endl;
} else {
os << "Warning: skipping read '" << (mate1 ? ps.bufa().name : ps.bufb().name)
<< "' because it was < 2 characters long" << endl;
}
cerr << os.str().c_str();
}
/**
 * Write a warning to stderr saying that the minimum-score function produced
 * a negative value in --local mode and that 0 is used instead.  This function
 * only prints; the caller performs the actual clamping.
 *
 * @param ps      per-thread pattern source holding the current read buffers
 * @param paired  true if the read is one mate of a pair (the mate number is
 *                then included in the message)
 * @param mate1   true to name the buffer returned by ps.bufa(), false for
 *                the one returned by ps.bufb()
 *
 * The read name is enclosed in a matching pair of single quotes, consistent
 * with the other warning printers in this file.
 */
static inline void printLocalScoreMsg(
	const PatternSourcePerThread& ps,
	bool paired,
	bool mate1)
{
	ostringstream os;
	if(paired) {
		os << "Warning: minimum score function gave negative number in "
		   << "--local mode for mate #" << (mate1 ? '1' : '2')
		   << " of read '" << (mate1 ? ps.bufa().name : ps.bufb().name)
		   << "'; setting to 0 instead" << endl;
	} else {
		os << "Warning: minimum score function gave negative number in "
		   << "--local mode for read '" << (mate1 ? ps.bufa().name : ps.bufb().name)
		   << "'; setting to 0 instead" << endl;
	}
	// Emit the fully assembled message with a single insertion.
	cerr << os.str().c_str();
}
/**
 * Write a warning to stderr saying that the minimum-score function produced
 * a positive value in --end-to-end mode and that 0 is used instead.  This
 * function only prints the message.
 *
 * @param ps      per-thread pattern source holding the current read buffers
 * @param paired  true if the read is one mate of a pair (the mate number is
 *                then included in the message)
 * @param mate1   true to name the buffer returned by ps.bufa(), false for
 *                the one returned by ps.bufb()
 *
 * The read name is enclosed in a matching pair of single quotes, consistent
 * with the other warning printers in this file.
 */
static inline void printEEScoreMsg(
	const PatternSourcePerThread& ps,
	bool paired,
	bool mate1)
{
	ostringstream os;
	if(paired) {
		os << "Warning: minimum score function gave positive number in "
		   << "--end-to-end mode for mate #" << (mate1 ? '1' : '2')
		   << " of read '" << (mate1 ? ps.bufa().name : ps.bufb().name)
		   << "'; setting to 0 instead" << endl;
	} else {
		os << "Warning: minimum score function gave positive number in "
		   << "--end-to-end mode for read '" << (mate1 ? ps.bufa().name : ps.bufb().name)
		   << "'; setting to 0 instead" << endl;
	}
	// Emit the fully assembled message with a single insertion.
	cerr << os.str().c_str();
}
/**
 * MERGE_METRICS(met, sync): push the expanding scope's local metrics into the
 * metrics object `met`, then reset those locals.
 *
 * The expansion refers by name to the following identifiers, which must be in
 * scope where the macro is used: msink, olm, sdm, wlm, swmSeed, swmMate, rpm,
 * sseU8ExtendMet, sseU8MateMet, sseI16ExtendMet, sseI16MateMet, nbtfiltst,
 * nbtfiltsc, nbtfiltdo and him.
 *
 * Steps, in order:
 *   1. msink.mergeMetrics(rpm)
 *   2. met.merge(...) with the addresses of the local metrics objects, the
 *      three nbtfilt* counters by value, &him, and `sync` as the last argument
 *   3. reset() on each local metrics object
 *
 * The nbtfilt* counters are passed to met.merge() but are not reset here.
 * NOTE(review): `sync` presumably asks met.merge() to take its lock; the
 * worker passes `nthreads > 1` -- confirm against PerfMetrics::merge().
 */
#define MERGE_METRICS(met, sync) { \
msink.mergeMetrics(rpm); \
met.merge( \
&olm, \
&sdm, \
&wlm, \
&swmSeed, \
&swmMate, \
&rpm, \
&sseU8ExtendMet, \
&sseU8MateMet, \
&sseI16ExtendMet, \
&sseI16MateMet, \
nbtfiltst, \
nbtfiltsc, \
nbtfiltdo, \
&him, \
sync); \
olm.reset(); \
sdm.reset(); \
wlm.reset(); \
swmSeed.reset(); \
swmMate.reset(); \
rpm.reset(); \
sseU8ExtendMet.reset(); \
sseU8MateMet.reset(); \
sseI16ExtendMet.reset(); \
sseI16MateMet.reset(); \
him.reset(); \
}
/**
 * MERGE_SW(x): call x.merge() with the expanding scope's four SSE metrics
 * objects (sseU8ExtendMet, sseU8MateMet, sseI16ExtendMet, sseI16MateMet) and
 * its three nbtfilt* counters, then call x.resetCounters().
 *
 * All seven identifiers must be in scope where the macro is used.
 * NOTE(review): presumably x is a DP aligner whose internal counters are
 * drained into those locals -- confirm against the merge() being called.
 */
#define MERGE_SW(x) { \
x.merge( \
sseU8ExtendMet, \
sseU8MateMet, \
sseI16ExtendMet, \
sseI16MateMet, \
nbtfiltst, \
nbtfiltsc, \
nbtfiltdo); \
x.resetCounters(); \
}
/**
* Called once per thread. Sets up per-thread pointers to the shared global
* data structures, creates per-thread structures, then enters the alignment
* loop. The general flow of the alignment loop is:
*
* - If it's been a while and we're the master thread, report some alignment
* metrics
* - Get the next read/pair
* - Check if this read/pair is identical to the previous
* + If identical, check whether we can skip any or all alignment stages. If
* we can skip all stages, report the result immediately and move to next
* read/pair
* + If not identical, continue
* -
*/
static void multiseedSearchWorker_hisat2(void *vp) {
int tid = *((int*)vp);
if (threeN) {
assert(ref3N.multiseed_gfm[0] != NULL);
assert(ref3N.multiseed_gfm[1] != NULL);
} else {
assert(multiseed_gfm != NULL);
}
assert(multiseedMms == 0);
// for regular Hisat2
PairedPatternSource& patsrc = *multiseed_patsrc;
const HGFM<index_t>& gfm = *multiseed_gfm;
const RFM<index_t>* rgfm = multiseed_rgfm;
const Scoring& sc = *multiseed_sc;
const BitPairReference& ref = *multiseed_refs;
const BitPairReference* rref = multiseed_rrefs;
AlnSink<index_t>& msink = *multiseed_msink;
OutFileBuf* metricsOfb = multiseed_metricsOfb;
// for Hisat-3N
const HGFM<index_t>* gfm_3N[2];
const RFM<index_t>* rgfm_3N[2];
const BitPairReference* rref_3N[2];
for (int i = 0; i < 2; i++) {
gfm_3N[i] = ref3N.multiseed_gfm[i];
rgfm_3N[i] = ref3N.multiseed_rgfm[i];
rref_3N[i] = ref3N.multiseed_rrefs[i];
}
// Sinks: these are so that we can print tables encoding counts for
// events of interest on a per-read, per-seed, per-join, or per-SW
// level. These in turn can be used to diagnose performance
// problems, or generally characterize performance.
//const BitPairReference& refs = *multiseed_refs;
auto_ptr<PatternSourcePerThreadFactory> patsrcFact(createPatsrcFactory(patsrc, tid));
auto_ptr<PatternSourcePerThread> ps(patsrcFact->create());
// Instantiate an object for holding reporting-related parameters.
if(maxSeeds == 0) {
maxSeeds = max<size_t>(5, khits * 2);
}
ReportingParams rp(
(allHits ? std::numeric_limits<THitInt>::max() : khits), // -k
(allHits ? std::numeric_limits<THitInt>::max() : maxSeeds), // --max-seeds
mhits, // -m/-M
0, // penalty gap (not used now)
msample, // true -> -M was specified, otherwise assume -m
gReportDiscordant, // report discordang paired-end alignments?
gReportMixed, // report unpaired alignments for paired reads?
secondary,
localAlign,
bowtie2_dp,
sensitive | very_sensitive,
repeat);
// Instantiate a mapping quality calculator
auto_ptr<Mapq> bmapq(new_mapq(mapqv, scoreMin, sc));
// Make a per-thread wrapper for the global MHitSink object.
AlnSinkWrap<index_t>* msinkwrap;
if (threeN) {
msinkwrap = new AlnSinkWrap3N<index_t>(
msink, // global sink
rp, // reporting parameters
*bmapq.get(), // MAPQ calculator
(size_t)tid, // thread id
mappingCycles,
secondary, // secondary alignments
no_spliced_alignment ? NULL : ssdb,
thread_rids_mindist);
} else {
msinkwrap = new AlnSinkWrap<index_t>(
msink, // global sink
rp, // reporting parameters
*bmapq.get(), // MAPQ calculator
(size_t)tid, // thread id
secondary, // secondary alignments
no_spliced_alignment ? NULL : ssdb,
thread_rids_mindist);
}
SplicedAligner<index_t, local_index_t> splicedAligner(threeN? *gfm_3N[0]: gfm,
anchorStop,
thread_rids_mindist);
SwAligner sw;
OuterLoopMetrics olm;
SeedSearchMetrics sdm;
WalkMetrics wlm;
SwMetrics swmSeed, swmMate;
ReportingMetrics rpm;
RandomSource rnd, rndArb;
SSEMetrics sseU8ExtendMet;
SSEMetrics sseU8MateMet;
SSEMetrics sseI16ExtendMet;
SSEMetrics sseI16MateMet;
DescentMetrics descm;
uint64_t nbtfiltst = 0; // TODO: find a new home for these
uint64_t nbtfiltsc = 0; // TODO: find a new home for these
uint64_t nbtfiltdo = 0; // TODO: find a new home for these
HIMetrics him;
ASSERT_ONLY(BTDnaString tmp);
int pepolFlag;
if(gMate1fw && gMate2fw) {
pepolFlag = PE_POLICY_FF;
} else if(gMate1fw && !gMate2fw) {
pepolFlag = PE_POLICY_FR;
} else if(!gMate1fw && gMate2fw) {
pepolFlag = PE_POLICY_RF;
} else {
pepolFlag = PE_POLICY_RR;
}
assert_geq(gMaxInsert, gMinInsert);
assert_geq(gMinInsert, 0);
PairedEndPolicy pepol(
pepolFlag,
gMaxInsert,
gMinInsert,
localAlign,
gFlippedMatesOK,
gDovetailMatesOK,
gContainMatesOK,
gOlapMatesOK,
gExpandToFrag);
PerfMetrics metricsPt; // per-thread metrics object; for read-level metrics
BTString nametmp;
PerReadMetrics prm;
// Used by thread with threadid == 1 to measure time elapsed
time_t iTime = time(0);
// Keep track of whether last search was exhaustive for mates 1 and 2
bool exhaustive[2] = { false, false };
// Keep track of whether mates 1/2 were filtered out last time through
bool filt[2] = { true, true };
// Keep track of whether mates 1/2 were filtered out due Ns last time
bool nfilt[2] = { true, true };
// Keep track of whether mates 1/2 were filtered out due to not having
// enough characters to rise about the score threshold.
bool scfilt[2] = { true, true };
// Keep track of whether mates 1/2 were filtered out due to not having
// more characters than the number of mismatches permitted in a seed.
bool lenfilt[2] = { true, true };
// Keep track of whether mates 1/2 were filtered out by upstream qc
bool qcfilt[2] = { true, true };
rndArb.init((uint32_t)time(0));
int mergei = 0;
int mergeival = 16;
while(true) {
bool success = false, done = false, paired = false;
ps->nextReadPair(success, done, paired, outType != OUTPUT_SAM);
if(!success && done) {
break;
} else if(!success) {
continue;
}
TReadId rdid = ps->rdid();
if(nthreads > 1 && useTempSpliceSite) {
assert_gt(tid, 0);
assert_leq(tid, thread_rids.size());
assert(thread_rids[tid - 1] == 0 || rdid > thread_rids[tid - 1]);
thread_rids[tid - 1] = (rdid > 0 ? rdid - 1 : 0);
while(true) {
uint64_t min_rdid = thread_rids[0];
{
for(size_t i = 1; i < thread_rids.size(); i++) {
if(thread_rids[i] < min_rdid) {
min_rdid = thread_rids[i];
}
}
}
if(min_rdid + thread_rids_mindist < rdid) {
#if defined(_TTHREAD_WIN32_)
Sleep(0);
#elif defined(_TTHREAD_POSIX_)
sched_yield();
#endif
} else break;
}
}
bool sample = true;
if(arbitraryRandom) {
ps->bufa().seed = rndArb.nextU32();
ps->bufb().seed = rndArb.nextU32();
}
if(sampleFrac < 1.0f) {
rnd.init(ROTL(ps->bufa().seed, 2));
sample = rnd.nextFloat() < sampleFrac;
}
if(rdid >= skipReads && rdid < qUpto && sample) {
// Align this read/pair
bool retry = true;
//
// Check if there is metrics reporting for us to do.
//
if(metricsIval > 0 &&
(metricsOfb != NULL || metricsStderr) &&
!metricsPerRead &&
++mergei == mergeival)
{
// Do a periodic merge. Update global metrics, in a
// synchronized manner if needed.
MERGE_METRICS(metrics, nthreads > 1);
mergei = 0;
// Check if a progress message should be printed
if(tid == 0) {
// Only thread 1 prints progress messages
time_t curTime = time(0);
if(curTime - iTime >= metricsIval) {
metrics.reportInterval(metricsOfb, metricsStderr, false, true, NULL);
iTime = curTime;
}
}
}
prm.reset(); // per-read metrics
prm.doFmString = false;
if(sam_print_xt) {
gettimeofday(&prm.tv_beg, &prm.tz_beg);
}
// Try to align this read
int mappingCycle = 0;
bool gNofw3N = false;
bool gNorc3N = false;
// for threeN (3N) mode, we need to map the read 4 times. for regular mode, only 1 time.
while(retry || mappingCycle < nMappingCycle) {
msinkwrap->resetInit_();
if (threeN) {
ps->changePlan3N(mappingCycle);
gNorc3N = (mappingCycle == threeN_type1conversion_FW || mappingCycle == threeN_type2conversion_FW);
gNofw3N = !gNorc3N;
}
retry = false;
assert_eq(ps->bufa().color, false);
if (!mappingCycles[mappingCycle])
{
mappingCycle++;
continue;
}
olm.reads++;
bool pair = paired;
const size_t rdlen1 = ps->bufa().length();
const size_t rdlen2 = pair ? ps->bufb().length() : 0;
olm.bases += (rdlen1 + rdlen2);
msinkwrap->nextRead(
&ps->bufa(),
pair ? &ps->bufb() : NULL,
rdid,
sc.qualitiesMatter());
assert(msinkwrap->inited());
size_t rdlens[2] = { rdlen1, rdlen2 };
// Calculate the minimum valid score threshold for the read
TAlScore minsc[2], maxpen[2];
maxpen[0] = maxpen[1] = 0;
minsc[0] = minsc[1] = std::numeric_limits<TAlScore>::max();
if(bwaSwLike) {
// From BWA-SW manual: "Given an l-long query, the
// threshold for a hit to be retained is
// a*max{T,c*log(l)}." We try to recreate that here.
float a = (float)sc.match(30);
float T = bwaSwLikeT, c = bwaSwLikeC;
minsc[0] = (TAlScore)max<float>(a*T, a*c*log(rdlens[0]));
if(paired) {
minsc[1] = (TAlScore)max<float>(a*T, a*c*log(rdlens[1]));
}
} else {
minsc[0] = scoreMin.f<TAlScore>(rdlens[0]);
if(paired) minsc[1] = scoreMin.f<TAlScore>(rdlens[1]);
if(localAlign) {
if(minsc[0] < 0) {
if(!gQuiet) printLocalScoreMsg(*ps, paired, true);
minsc[0] = 0;
}
if(paired && minsc[1] < 0) {
if(!gQuiet) printLocalScoreMsg(*ps, paired, false);
minsc[1] = 0;
}
} else {
if(minsc[0] > 0) {
if(!gQuiet) printEEScoreMsg(*ps, paired, true);
minsc[0] = 0;
}
if(paired && minsc[1] > 0) {
if(!gQuiet) printEEScoreMsg(*ps, paired, false);
minsc[1] = 0;
}
}
}
// N filter; does the read have too many Ns?
size_t readns[2] = {0, 0};
sc.nFilterPair(
&ps->bufa().patFw,
pair ? &ps->bufb().patFw : NULL,
readns[0],
readns[1],
nfilt[0],
nfilt[1]);
// Score filter; does the read enough character to rise above
// the score threshold?
scfilt[0] = sc.scoreFilter(minsc[0], rdlens[0]);
scfilt[1] = sc.scoreFilter(minsc[1], rdlens[1]);
lenfilt[0] = lenfilt[1] = true;
if(rdlens[0] <= (size_t)multiseedMms || rdlens[0] < 2) {
if(!gQuiet) printMmsSkipMsg(*ps, paired, true, multiseedMms);
lenfilt[0] = false;
}
if((rdlens[1] <= (size_t)multiseedMms || rdlens[1] < 2) && paired) {
if(!gQuiet) printMmsSkipMsg(*ps, paired, false, multiseedMms);
lenfilt[1] = false;
}
if(rdlens[0] < 2) {
if(!gQuiet) printLenSkipMsg(*ps, paired, true);
lenfilt[0] = false;
}
if(rdlens[1] < 2 && paired) {
if(!gQuiet) printLenSkipMsg(*ps, paired, false);
lenfilt[1] = false;
}
qcfilt[0] = qcfilt[1] = true;
if(qcFilter) {
qcfilt[0] = (ps->bufa().filter != '0');
qcfilt[1] = (ps->bufb().filter != '0');
}
filt[0] = (nfilt[0] && scfilt[0] && lenfilt[0] && qcfilt[0]);
filt[1] = (nfilt[1] && scfilt[1] && lenfilt[1] && qcfilt[1]);
prm.nFilt += (filt[0] ? 0 : 1) + (filt[1] ? 0 : 1);
Read* rds[2] = { &ps->bufa(), &ps->bufb() };
// For each mate...
assert(msinkwrap->empty());
//size_t minedfw[2] = { 0, 0 };
//size_t minedrc[2] = { 0, 0 };
// Calcualte nofw / no rc
bool nofw[2] = { false, false };
bool norc[2] = { false, false };
if (threeN) {
nofw[0] = paired ? (gMate1fw ? gNofw3N : gNorc3N) : gNofw3N;
norc[0] = paired ? (gMate1fw ? gNorc3N : gNofw3N) : gNorc3N;
nofw[1] = paired ? (gMate2fw ? gNofw3N : gNorc3N) : gNofw3N;
norc[1] = paired ? (gMate2fw ? gNorc3N : gNofw3N) : gNorc3N;
} else {
nofw[0] = paired ? (gMate1fw ? gNofw : gNorc) : gNofw;
norc[0] = paired ? (gMate1fw ? gNorc : gNofw) : gNorc;
nofw[1] = paired ? (gMate2fw ? gNofw : gNorc) : gNofw;
norc[1] = paired ? (gMate2fw ? gNorc : gNofw) : gNorc;
}
// Calculate nceil
int nceil[2] = { 0, 0 };
nceil[0] = nCeil.f<int>((double)rdlens[0]);
nceil[0] = min(nceil[0], (int)rdlens[0]);
if(paired) {
nceil[1] = nCeil.f<int>((double)rdlens[1]);
nceil[1] = min(nceil[1], (int)rdlens[1]);
}
exhaustive[0] = exhaustive[1] = false;
//size_t matemap[2] = { 0, 1 };
bool pairPostFilt = filt[0] && filt[1];
if(pairPostFilt) {
rnd.init(ps->bufa().seed ^ ps->bufb().seed);
} else {
rnd.init(ps->bufa().seed);
}
// Calculate interval length for both mates
int interval[2] = { 0, 0 };
for(size_t mate = 0; mate < (pair ? 2:1); mate++) {
interval[mate] = msIval.f<int>((double)rdlens[mate]);
if(filt[0] && filt[1]) {
// Boost interval length by 20% for paired-end reads
interval[mate] = (int)(interval[mate] * 1.2 + 0.5);
}
interval[mate] = max(interval[mate], 1);
}
// Calculate streak length
size_t streak[2] = { maxDpStreak, maxDpStreak };
size_t mtStreak[2] = { maxMateStreak, maxMateStreak };
size_t mxDp[2] = { maxDp, maxDp };
size_t mxUg[2] = { maxUg, maxUg };
size_t mxIter[2] = { maxIters, maxIters };
if(allHits) {
streak[0] = streak[1] = std::numeric_limits<size_t>::max();
mtStreak[0] = mtStreak[1] = std::numeric_limits<size_t>::max();
mxDp[0] = mxDp[1] = std::numeric_limits<size_t>::max();
mxUg[0] = mxUg[1] = std::numeric_limits<size_t>::max();
mxIter[0] = mxIter[1] = std::numeric_limits<size_t>::max();
} else if(khits > 1) {
for(size_t mate = 0; mate < 2; mate++) {
streak[mate] += (khits-1) * maxStreakIncr;
mtStreak[mate] += (khits-1) * maxStreakIncr;
mxDp[mate] += (khits-1) * maxItersIncr;
mxUg[mate] += (khits-1) * maxItersIncr;
mxIter[mate] += (khits-1) * maxItersIncr;
}
}
if(filt[0] && filt[1]) {
streak[0] = (size_t)ceil((double)streak[0] / 2.0);
streak[1] = (size_t)ceil((double)streak[1] / 2.0);
assert_gt(streak[1], 0);
}
assert_gt(streak[0], 0);
// Calculate # seed rounds for each mate
size_t nrounds[2] = { nSeedRounds, nSeedRounds };
if(filt[0] && filt[1]) {
nrounds[0] = (size_t)ceil((double)nrounds[0] / 2.0);
nrounds[1] = (size_t)ceil((double)nrounds[1] / 2.0);
assert_gt(nrounds[1], 0);
}
assert_gt(nrounds[0], 0);
// Increment counters according to what got filtered
for(size_t mate = 0; mate < (pair ? 2:1); mate++) {
if(!filt[mate]) {
// Mate was rejected by N filter
olm.freads++; // reads filtered out
olm.fbases += rdlens[mate]; // bases filtered out
} else {
//shs[mate].clear();
//shs[mate].nextRead(mate == 0 ? ps->bufa() : ps->bufb());
//assert(shs[mate].empty());
olm.ureads++; // reads passing filter
olm.ubases += rdlens[mate]; // bases passing filter
}
}
//size_t eePeEeltLimit = std::numeric_limits<size_t>::max();
// Whether we're done with mate1 / mate2
bool done[2] = { !filt[0], !filt[1] };
// size_t nelt[2] = {0, 0};
if(filt[0] && filt[1]) {
splicedAligner.initReads(rds, nofw, norc, minsc, maxpen);
} else if(filt[0]) {
splicedAligner.initRead(rds[0], nofw[0], norc[0], minsc[0], maxpen[0], false);
} else if(filt[1]) {
splicedAligner.initRead(rds[1], nofw[1], norc[1], minsc[1], maxpen[1], true);
}
if(filt[0] || filt[1]) {
int ret;
int threeN_index;
bool useRepeat;
if (threeN) {
threeN_index = (mappingCycle == threeN_type1conversion_FW || mappingCycle == threeN_type2conversion_RC) ? 0 : 1;
useRepeat = paired ? (ps->bufa().length() >= 100) && (ps->bufb().length() >= 100) :
ps->bufa().length() >= 80;
}
ret = splicedAligner.go(
sc,
pepol,
*multiseed_tpol,
*gpol,
threeN ? *gfm_3N[threeN_index] : gfm,
threeN ?(useRepeat ? rgfm_3N[threeN_index] : NULL) : rgfm,
threeN ? *altdbs_3N[threeN_index] : *altdb,
threeN ? *repeatdbs_3N[threeN_index] : *repeatdb,
threeN ? *raltdbs_3N[threeN_index] : *raltdb,
ref,
threeN ? rref_3N[threeN_index] : rref,
sw,
*ssdb,
wlm,
prm,
swmSeed,
him,
rnd,
*msinkwrap);
MERGE_SW(sw);
// daehwan
size_t mate = 0;
assert_gt(ret, 0);
// Clear out the exact hits so that we don't try to
// extend them again later!
if(ret == EXTEND_EXHAUSTED_CANDIDATES) {
// Not done yet
} else if(ret == EXTEND_POLICY_FULFILLED) {
// Policy is satisfied for this mate at least
if(msinkwrap->state().doneWithMate(mate == 0)) {
done[mate] = true;
}
if(msinkwrap->state().doneWithMate(mate == 1)) {
done[mate^1] = true;
}
} else if(ret == EXTEND_PERFECT_SCORE) {
// We exhausted this mode at least
done[mate] = true;
} else if(ret == EXTEND_EXCEEDED_HARD_LIMIT) {
// We exceeded a per-read limit
done[mate] = true;
} else if(ret == EXTEND_EXCEEDED_SOFT_LIMIT) {
// Not done yet
} else {
//
cerr << "Bad return value: " << ret << endl;
throw 1;
}
if(!done[mate]) {
TAlScore perfectScore = sc.perfectScore(rdlens[mate]);
if(!done[mate] && minsc[mate] == perfectScore) {
done[mate] = true;
}
}
}
for(size_t i = 0; i < 2; i++) {
assert_leq(prm.nExIters, mxIter[i]);
assert_leq(prm.nExDps, mxDp[i]);
assert_leq(prm.nMateDps, mxDp[i]);
assert_leq(prm.nExUgs, mxUg[i]);
assert_leq(prm.nMateUgs, mxUg[i]);
assert_leq(prm.nDpFail, streak[i]);
assert_leq(prm.nUgFail, streak[i]);
assert_leq(prm.nEeFail, streak[i]);
}
msinkwrap->finishRead(
NULL,
NULL,
exhaustive[0], // exhausted seed hits for mate 1?
exhaustive[1], // exhausted seed hits for mate 2?
nfilt[0],
nfilt[1],
scfilt[0],
scfilt[1],
lenfilt[0],
lenfilt[1],
qcfilt[0],
qcfilt[1],
sortByScore, // prioritize by alignment score
rnd, // pseudo-random generator
rpm, // reporting metrics
prm, // per-read metrics
sc, // scoring scheme
!seedSumm, // suppress seed summaries?
seedSumm, //rdid suppress alignments?
templateLenAdjustment);
mappingCycle++;
}
} // if(rdid >= skipReads && rdid < qUpto)
else if(rdid >= qUpto) {
break;
}
if(metricsPerRead) {
MERGE_METRICS(metricsPt, nthreads > 1);
nametmp = ps->bufa().name;
metricsPt.reportInterval(
metricsOfb, metricsStderr, true, true, &nametmp);
metricsPt.reset();
}
} // while(true)
// One last metrics merge
MERGE_METRICS(metrics, nthreads > 1);
delete msinkwrap;
return;
}
/**
 * Called once per alignment job.  Publishes the shared data structures
 * through file-level globals, starts one worker thread per requested
 * thread (each running multiseedSearchWorker_hisat2), waits for all of
 * them to finish, and finally emits a cumulative metrics report when
 * metrics reporting is enabled and not done per read.
 *
 * In 3N mode (threeN set) the 3N indexes are handed to ref3N.load();
 * otherwise the regular index, repeat index and repeat reference are
 * published through multiseed_gfm / multiseed_rgfm / multiseed_rrefs.
 *
 * The worker thread objects are owned by this function: each one is
 * deleted right after it has been joined.
 */
static void multiseedSearch(
    Scoring& sc,
    TranscriptomePolicy& tpol,
    GraphPolicy& gp,
    PairedPatternSource& patsrc,      // pattern source
    AlnSink<index_t>& msink,          // hit sink
    EList<HGFM<index_t>* > gfms_3N,   // 3N index of original text
    RFM<index_t>* rgfms_3N[2],        // 3N index of repeat sequences
    BitPairReference* rrefss[2],      // 3N repeat reference
    HGFM<index_t>* gfm,               // index of original text
    RFM<index_t>* rgfm,               // index of repeat sequences
    BitPairReference* refs,           // base reference
    BitPairReference* rrefs,          // repeat reference
    OutFileBuf *metricsOfb)
{
    // Publish the shared objects for the worker threads.
    multiseed_patsrc     = &patsrc;
    multiseed_msink      = &msink;
    multiseed_sc         = &sc;
    multiseed_tpol       = &tpol;
    gpol                 = &gp;
    multiseed_metricsOfb = metricsOfb;
    multiseed_refs       = refs;
    if (threeN) {
        ref3N.load(gfms_3N, rgfms_3N, rrefss);
    } else {
        multiseed_gfm   = gfm;
        multiseed_rgfm  = rgfm;
        multiseed_rrefs = rrefs;
    }

    AutoArray<tthread::thread*> threads(nthreads);
    AutoArray<int> tids(nthreads);
    // Launch the worker threads and wait for all of them; the Timer
    // covers the whole search.
    {
        Timer _t(cerr, "Multiseed full-index search: ", timing);

        thread_rids.resize(nthreads);
        thread_rids.fill(0);
        thread_rids_mindist = (nthreads == 1 || !useTempSpliceSite ? 0 : 1000 * nthreads);

        for(int i = 0; i < nthreads; i++) {
            // Thread IDs start at 1
            tids[i] = i+1;
            threads[i] = new tthread::thread(multiseedSearchWorker_hisat2, (void*)&tids[i]);
        }

        for(int i = 0; i < nthreads; i++) {
            threads[i]->join();
            // The thread object was allocated with new above; release it
            // once the thread has been joined so it does not leak.
            delete threads[i];
            threads[i] = NULL;
        }
    }

    // Final cumulative metrics report (skipped in per-read mode).
    if(!metricsPerRead && (metricsOfb != NULL || metricsStderr)) {
        metrics.reportInterval(metricsOfb, metricsStderr, true, false, NULL);
    }
}
// Command line joined into one space-separated string (built from argv in
// hisat2()); driver() hands it to SamConfig as the command-line argument.
static string argstr;
// Declared here; the definitions are not in this part of the file.
// driver() calls both once, before any index is loaded.
extern void initializeCntLut();
extern void initializeCntBit();
/**
 * Top-level alignment driver: sets up the read source and output file,
 * loads the index (or the two 3N indexes), the reference and any repeat
 * index, builds the scoring / SAM / splice-site objects, runs
 * multiseedSearch() over all input reads, prints the alignment summary
 * and finally releases everything that was allocated here.
 *
 * @param type           not referenced in this function body
 * @param bt2indexBases  index basenames; entry 0 is used in regular mode,
 *                       entries 0 and 1 are both used in 3N mode
 * @param outfile        path of the hit output file; when empty a
 *                       default-constructed OutFileBuf is used instead
 *
 * Throws 1 (int) on configuration or loading errors, matching the rest
 * of the file.
 */
template<typename TStr>
static void driver(
    const char * type,
    const string bt2indexBases[2],
    const string& outfile)
{
    if(gVerbose || startVerbose) {
        cerr << "Entered driver(): "; logTime(cerr, true);
    }
    if (gVerbose || startVerbose) {
        cerr << "Running in " << ((threeN) ? "3N" : "Regular") << " Mode" << endl;
    }

    initializeCntLut();
    initializeCntBit();

    // Vector of the reference sequences; used for sanity-checking
    EList<SString<char> > names, os;
    EList<size_t> nameLens, seqLens;

    // Read reference sequences from the command-line or from a FASTA file
    if(!origString.empty()) {
        // Read fasta file(s)
        EList<string> origFiles;
        tokenize(origString, ",", origFiles);
        parseFastas(origFiles, names, nameLens, os, seqLens);
    }

    PatternParams pp(
        format,        // file format
        fileParallel,  // true -> wrap files with separate PairedPatternSources
        seed,          // pseudo-random seed
        useSpinlock,   // use spin locks instead of pthreads
        solexaQuals,   // true -> qualities are on solexa64 scale
        phred64Quals,  // true -> qualities are on phred64 scale
        integerQuals,  // true -> qualities are space-separated numbers
        fuzzy,         // true -> try to parse fuzzy fastq
        fastaContLen,  // length of sampled reads for FastaContinuous...
        fastaContFreq, // frequency of sampled reads for FastaContinuous...
        skipReads      // skip the first 'skip' patterns
    );

    if(gVerbose || startVerbose) {
        cerr << "Creating PatternSource: "; logTime(cerr, true);
    }
    PairedPatternSource *patsrc = PairedPatternSource::setupPatternSources(
        queries,     // singles, from argv
        mates1,      // mate1's, from -1 arg
        mates2,      // mate2's, from -2 arg
        mates12,     // both mates on each line, from --12 arg
#ifdef USE_SRA
        sra_accs,    // SRA accessions
#endif
        qualities,   // qualities associated with singles
        qualities1,  // qualities associated with m1
        qualities2,  // qualities associated with m2
        pp,          // read read-in parameters
        nthreads,
        gVerbose || startVerbose); // be talkative

    // Open hit output file
    if(gVerbose || startVerbose) {
        cerr << "Opening hit output file: "; logTime(cerr, true);
    }
    OutFileBuf *fout;
    if(!outfile.empty()) {
        fout = new OutFileBuf(outfile.c_str(), false);
    } else {
        fout = new OutFileBuf();
    }

    // Initialize GFM object and read in header
    if(gVerbose || startVerbose) {
        cerr << "About to initialize fw GFM: "; logTime(cerr, true);
    }

    // for 3N: one ALT database, repeat database and repeat ALT database
    // per 3N index
    if (threeN) {
        for (int i = 0; i < 2; i++) {
            altdbs_3N[i] = new ALTDB<index_t>();
            repeatdbs_3N[i] = new RepeatDB<index_t>();
            raltdbs_3N[i] = new ALTDB<index_t>();
        }
    }

    EList<HGFM<index_t>* >gfms_3N;
    RFM<index_t>* rgfms_3N[2];
    for (int i = 0; i < 2; i++) {
        rgfms_3N[i] = NULL;
    }
    bool rep_index_exists_3N[2]{false};
    bool rep_index_exists = false;
    string rep_adjIdxBase_3N[2];
    string rep_adjIdxBase;
    // gfm stays NULL in 3N mode; it is still passed (by value) to
    // multiseedSearch(), so it must not be left uninitialized.
    HGFM<index_t>* gfm = NULL;
    RFM<index_t>* rgfm = NULL;

    if (threeN) {
        for (int j = 0; j < 2; j++) {
            adjIdxBases_3N[j] = adjustEbwtBase(argv0, bt2indexBases[j], gVerbose);
            HGFM<index_t, local_index_t> *tmp_gfm = new HGFM<index_t, local_index_t>(
                adjIdxBases_3N[j],
                altdbs_3N[j],
                NULL,
                NULL,
                -1,       // fw index
                true,     // index is for the forward direction
                /* overriding: */ offRate,
                0,        // amount to add to index offrate or <= 0 to do nothing
                useMm,    // whether to use memory-mapped files
                useShmem, // whether to use shared memory
                mmSweep,  // sweep memory-mapped files
                !noRefNames, // load names?
                true,        // load SA sample?
                true,        // load ftab?
                true,        // load rstarts?
                !no_spliced_alignment, // load splice sites?
                gVerbose, // whether to be talkative
                startVerbose, // talkative during initialization
                false /*passMemExc*/,
                sanityCheck,
                use_haplotype); //use haplotypes?
            gfms_3N.push_back(tmp_gfm);

            if(sanityCheck && !os.empty()) {
                // Sanity check number of patterns and pattern lengths in GFM
                // against original strings
                assert_eq(os.size(), gfms_3N[j]->nPat());
                for(size_t i = 0; i < os.size(); i++) {
                    assert_eq(os[i].length(), gfms_3N[j]->plen()[i]);
                }
            }
            // Sanity-check the restored version of the GFM
            if(sanityCheck && !os.empty()) {
                gfms_3N[j]->loadIntoMemory(
                    -1, // fw index
                    true, // load SA sample
                    true, // load ftab
                    true, // load rstarts
                    !noRefNames,
                    startVerbose);
                gfms_3N[j]->checkOrigs(os, false);
                gfms_3N[j]->evictFromMemory();
            }
            {
                // Load the other half of the index into memory
                assert(!gfms_3N[j]->isInMemory());
                Timer _t(cerr, "Time loading forward index: ", timing);
                gfms_3N[j]->loadIntoMemory(
                    -1, // not the reverse index
                    true, // load SA samp? (yes, need forward index's SA samp)
                    true, // load ftab (in forward index)
                    true, // load rstarts (in forward index)
                    !noRefNames,  // load names?
                    startVerbose);
            }

            // A repeat index is considered present when its first file
            // can be opened.
            rep_adjIdxBase_3N[j] = adjIdxBases_3N[j] + ".rep";
            {
                std::ifstream infile((rep_adjIdxBase_3N[j] + ".1." + gfm_ext.c_str()).c_str());
                rep_index_exists_3N[j] = infile.good();
            }
            if(rep_index_exists_3N[j] && use_repeat_index) {
                rgfms_3N[j] = new RFM<index_t>(
                    rep_adjIdxBase_3N[j],
                    raltdbs_3N[j],
                    repeatdbs_3N[j],
                    &readLens,
                    -1,       // fw index
                    true,     // index is for the forward direction
                    /* overriding: */ offRate,
                    0,        // amount to add to index offrate or <= 0 to do nothing
                    useMm,    // whether to use memory-mapped files
                    useShmem, // whether to use shared memory
                    mmSweep,  // sweep memory-mapped files
                    !noRefNames, // load names?
                    true,        // load SA sample?
                    true,        // load ftab?
                    true,        // load rstarts?
                    !no_spliced_alignment, // load splice sites?
                    gVerbose, // whether to be talkative
                    startVerbose, // talkative during initialization
                    false /*passMemExc*/,
                    sanityCheck,
                    false); //use haplotypes?

                // CP to do
#if 0
                if(sanityCheck && !os.empty()) {
                    // Sanity check number of patterns and pattern lengths in GFM
                    // against original strings
                    assert_eq(os.size(), gfm.nPat());
                    for(size_t i = 0; i < os.size(); i++) {
                        assert_eq(os[i].length(), rgfm->plen()[i]);
                    }
                }
                // Sanity-check the restored version of the GFM
                if(sanityCheck && !os.empty()) {
                    rgfm->loadIntoMemory(
                        -1, // fw index
                        true, // load SA sample
                        true, // load ftab
                        true, // load rstarts
                        !noRefNames,
                        startVerbose);
                    rgfm->checkOrigs(os, false);
                    rgfm->evictFromMemory();
                }
#endif
                {
                    // Load the other half of the index into memory
                    assert(!rgfms_3N[j]->isInMemory());
                    Timer _t(cerr, "Time loading forward index: ", timing);
                    rgfms_3N[j]->loadIntoMemory(
                        -1, // not the reverse index
                        true, // load SA samp? (yes, need forward index's SA samp)
                        true, // load ftab (in forward index)
                        true, // load rstarts (in forward index)
                        !noRefNames,  // load names?
                        startVerbose);
                    repeatdbs_3N[j]->construct(gfms_3N[j]->rstarts(), gfms_3N[j]->nFrag());
                }
                {
                    // One ht2 handle per loaded repeat index (we are already
                    // inside the 3N branch here).  The handles are closed at
                    // the end of this function.
                    ht2_option_t option;
                    ht2_init_options(&option);
                    option.altdb = altdbs_3N[j];
                    option.raltdb = raltdbs_3N[j];
                    option.repeatdb = repeatdbs_3N[j];
                    option.gfm = gfms_3N[j];
                    option.rgfm = rgfms_3N[j];
                    ht2_handle_t handle = ht2_init(adjIdxBases_3N[j].c_str(), &option);
                    repeatHandles.push_back(handle);
                    if (refNameMap == NULL) {
                        ht2_index_getrefnames(repeatHandles[0], &refNameMap);
                    }
                }
            }
            // Default -k depends on the index type unless the user set it
            if(!saw_k) {
                if(gfms_3N[j]->gh().linearFM()) khits = 5;
                else                            khits = 10;
            }
        }
    } else {
        altdb = new ALTDB<index_t>();
        repeatdb = new RepeatDB<index_t>();
        raltdb = new ALTDB<index_t>();
        adjIdxBase = adjustEbwtBase(argv0, bt2indexBases[0], gVerbose);
        gfm = new HGFM<index_t, local_index_t>(
            adjIdxBase,
            altdb,
            NULL,
            NULL,
            -1,       // fw index
            true,     // index is for the forward direction
            /* overriding: */ offRate,
            0,        // amount to add to index offrate or <= 0 to do nothing
            useMm,    // whether to use memory-mapped files
            useShmem, // whether to use shared memory
            mmSweep,  // sweep memory-mapped files
            !noRefNames, // load names?
            true,        // load SA sample?
            true,        // load ftab?
            true,        // load rstarts?
            !no_spliced_alignment, // load splice sites?
            gVerbose, // whether to be talkative
            startVerbose, // talkative during initialization
            false /*passMemExc*/,
            sanityCheck,
            use_haplotype); //use haplotypes?

        if(sanityCheck && !os.empty()) {
            // Sanity check number of patterns and pattern lengths in GFM
            // against original strings
            assert_eq(os.size(), gfm->nPat());
            for(size_t i = 0; i < os.size(); i++) {
                assert_eq(os[i].length(), gfm->plen()[i]);
            }
        }
        // Sanity-check the restored version of the GFM
        if(sanityCheck && !os.empty()) {
            gfm->loadIntoMemory(
                -1, // fw index
                true, // load SA sample
                true, // load ftab
                true, // load rstarts
                !noRefNames,
                startVerbose);
            gfm->checkOrigs(os, false);
            gfm->evictFromMemory();
        }
        {
            // Load the other half of the index into memory
            assert(!gfm->isInMemory());
            Timer _t(cerr, "Time loading forward index: ", timing);
            gfm->loadIntoMemory(
                -1, // not the reverse index
                true, // load SA samp? (yes, need forward index's SA samp)
                true, // load ftab (in forward index)
                true, // load rstarts (in forward index)
                !noRefNames,  // load names?
                startVerbose);
        }

        // A repeat index is considered present when its first file can be
        // opened.
        rep_adjIdxBase = adjIdxBase + ".rep";
        {
            std::ifstream infile((rep_adjIdxBase + ".1." + gfm_ext.c_str()).c_str());
            rep_index_exists = infile.good();
        }
        if(rep_index_exists && use_repeat_index) {
            rgfm = new RFM<index_t>(
                rep_adjIdxBase,
                raltdb,
                repeatdb,
                &readLens,
                -1,       // fw index
                true,     // index is for the forward direction
                /* overriding: */ offRate,
                0,        // amount to add to index offrate or <= 0 to do nothing
                useMm,    // whether to use memory-mapped files
                useShmem, // whether to use shared memory
                mmSweep,  // sweep memory-mapped files
                !noRefNames, // load names?
                true,        // load SA sample?
                true,        // load ftab?
                true,        // load rstarts?
                !no_spliced_alignment, // load splice sites?
                gVerbose, // whether to be talkative
                startVerbose, // talkative during initialization
                false /*passMemExc*/,
                sanityCheck,
                false); //use haplotypes?

            // CP to do
#if 0
            if(sanityCheck && !os.empty()) {
                // Sanity check number of patterns and pattern lengths in GFM
                // against original strings
                assert_eq(os.size(), gfm.nPat());
                for(size_t i = 0; i < os.size(); i++) {
                    assert_eq(os[i].length(), rgfm->plen()[i]);
                }
            }
            // Sanity-check the restored version of the GFM
            if(sanityCheck && !os.empty()) {
                rgfm->loadIntoMemory(
                    -1, // fw index
                    true, // load SA sample
                    true, // load ftab
                    true, // load rstarts
                    !noRefNames,
                    startVerbose);
                rgfm->checkOrigs(os, false);
                rgfm->evictFromMemory();
            }
#endif
            {
                // Load the other half of the index into memory
                assert(!rgfm->isInMemory());
                Timer _t(cerr, "Time loading forward index: ", timing);
                rgfm->loadIntoMemory(
                    -1, // not the reverse index
                    true, // load SA samp? (yes, need forward index's SA samp)
                    true, // load ftab (in forward index)
                    true, // load rstarts (in forward index)
                    !noRefNames,  // load names?
                    startVerbose);
                repeatdb->construct(gfm->rstarts(), gfm->nFrag());
            }
        }
        // Default -k depends on the index type unless the user set it
        if(!saw_k) {
            if(gfm->gh().linearFM()) khits = 5;
            else                     khits = 10;
        }
    } // else threeN

    OutputQueue oq(
        *fout,                   // out file buffer
        reorder && nthreads > 1, // whether to reorder when there's >1 thread
        nthreads,                // # threads
        nthreads > 1,            // whether to be thread-safe
        skipReads);              // first read will have this rdid
    {
        Timer _t(cerr, "Time searching: ", timing);
        // Set up penalities
        if(bonusMatch > 0 && !localAlign) {
            cerr << "Warning: Match bonus always = 0 in --end-to-end mode; ignoring user setting" << endl;
            bonusMatch = 0;
        }
        if(tranAssm) {
            penNoncanIntronLen.init(SIMPLE_FUNC_LOG, -8, 2);
        }
        Scoring sc(
            bonusMatch,     // constant reward for match
            penMmcType,     // how to penalize mismatches
            penMmcMax,      // max mm penalty
            penMmcMin,      // min mm penalty
            penScMax,       // max sc penalty
            penScMin,       // min sc penalty
            scoreMin,       // min score as function of read len
            nCeil,          // max # Ns as function of read len
            penNType,       // how to penalize Ns in the read
            penN,           // constant if N pelanty is a constant
            penNCatPair,    // whether to concat mates before N filtering
            penRdGapConst,  // constant coeff for read gap cost
            penRfGapConst,  // constant coeff for ref gap cost
            penRdGapLinear, // linear coeff for read gap cost
            penRfGapLinear, // linear coeff for ref gap cost
            gGapBarrier,    // # rows at top/bot only entered diagonally
            penCanSplice,   // canonical splicing penalty
            penNoncanSplice,// non-canonical splicing penalty
            penConflictSplice, // conflicting splice site penalty
            &penCanIntronLen,     // penalty as to intron length
            &penNoncanIntronLen); // penalty as to intron length

        EList<size_t> reflens;
        // for HISAT-3N
        EList<string> refnames_3N[2];
        EList<size_t> replens_3N[2];
        EList<string> repnames_3N[2];
        EList<size_t> empty_replens_3N[2];
        EList<string> empty_repnames_3N[2];
        //for regular hisat2
        EList<string> refnames;
        //readEbwtRefnames<index_t>(adjIdxBase, refnames);
        EList<size_t> replens;
        EList<string> repnames;
        EList<size_t> empty_replens;
        EList<string> empty_repnames;

        if (threeN) {
            // Reference lengths are taken from the first 3N index only
            for(size_t i = 0; i < gfms_3N[0]->nPat(); i++) {
                reflens.push_back(gfms_3N[0]->plen()[i]);
            }
            for (int j = 0; j < 2; j++) {
                readEbwtRefnames<index_t>(adjIdxBases_3N[j], refnames_3N[j]);
                if (rep_index_exists_3N[j] && use_repeat_index) {
                    rgfms_3N[j]->getReferenceNames(repnames_3N[j]);
                    rgfms_3N[j]->getReferenceLens(replens_3N[j]);
                }
                if(rmChrName && addChrName) {
                    cerr << "Error: --remove-chrname and --add-chrname cannot be used at the same time" << endl;
                    throw 1;
                }
                if(rmChrName) {
                    // Strip a leading "chr" from every reference name
                    for(size_t i = 0; i < refnames_3N[j].size(); i++) {
                        string& refname = refnames_3N[j][i];
                        if(refname.find("chr") == 0) {
                            refname = refname.substr(3);
                        }
                    }
                } else if(addChrName) {
                    // Prepend "chr" to every reference name lacking it
                    for(size_t i = 0; i < refnames_3N[j].size(); i++) {
                        string& refname = refnames_3N[j][i];
                        if(refname.find("chr") != 0) {
                            refname = string("chr") + refname;
                        }
                    }
                }
            }
        } else {
            readEbwtRefnames<index_t>(adjIdxBase, refnames);
            for(size_t i = 0; i < gfm->nPat(); i++) {
                reflens.push_back(gfm->plen()[i]);
            }
            if(rep_index_exists && use_repeat_index) {
                rgfm->getReferenceNames(repnames);
                rgfm->getReferenceLens(replens);
            }
            if(rmChrName && addChrName) {
                cerr << "Error: --remove-chrname and --add-chrname cannot be used at the same time" << endl;
                throw 1;
            }
            if(rmChrName) {
                // Strip a leading "chr" from every reference name
                for(size_t i = 0; i < refnames.size(); i++) {
                    string& refname = refnames[i];
                    if(refname.find("chr") == 0) {
                        refname = refname.substr(3);
                    }
                }
            } else if(addChrName) {
                // Prepend "chr" to every reference name lacking it
                for(size_t i = 0; i < refnames.size(); i++) {
                    string& refname = refnames[i];
                    if(refname.find("chr") != 0) {
                        refname = string("chr") + refname;
                    }
                }
            }
        }

        SamConfig<index_t> samc(
            threeN ? refnames_3N[0]: refnames, // reference sequence names
            reflens,                // reference sequence lengths
            threeN?(repeat ? repnames_3N[0] : empty_repnames_3N[0]): (repeat ? repnames : empty_repnames), // repeat sequence names
            threeN? (repeat ? replens_3N[0] : empty_replens_3N[0]): (repeat ? replens : empty_replens),    // repeat sequence lengths
            samTruncQname,          // whether to truncate QNAME to 255 chars
            samOmitSecSeqQual,      // omit SEQ/QUAL for 2ndary alignments?
            samNoUnal,              // omit unaligned-read records?
            string("hisat2"),       // program id
            string("hisat2"),       // program name
            string(HISAT2_VERSION), // program version
            argstr,                 // command-line
            rgs_optflag,            // read-group string
            rna_strandness,
            sam_print_as,
            sam_print_xs,
            sam_print_xss,
            sam_print_yn,
            sam_print_xn,
            sam_print_cs,
            sam_print_cq,
            sam_print_x0,
            sam_print_x1,
            sam_print_xm,
            sam_print_xo,
            sam_print_xg,
            sam_print_nm,
            sam_print_md,
            sam_print_yf,
            sam_print_yi,
            sam_print_ym,
            sam_print_yp,
            sam_print_yt,
            sam_print_ys,
            sam_print_zs,
            sam_print_xr,
            sam_print_xt,
            sam_print_xd,
            sam_print_xu,
            sam_print_yl,
            sam_print_ye,
            sam_print_yu,
            sam_print_xp,
            sam_print_yr,
            sam_print_zb,
            sam_print_zr,
            sam_print_zf,
            sam_print_zm,
            sam_print_zi,
            sam_print_zp,
            sam_print_zu,
            sam_print_xs_a,
            sam_print_nh);

        // Set up hit sink; if sanityCheck && !os.empty() is true,
        // then instruct the sink to "retain" hits in a vector in
        // memory so that we can easily sanity check them later on
        AlnSink<index_t> *mssink = NULL;
        //auto_ptr<BitPairReference> refss[2];
        auto_ptr<BitPairReference> refs;
        {
            // Scoped timer: reports when the block is left, also if the
            // reference constructor throws.
            Timer _tRef(cerr, "Time loading reference: ", timing);
            refs = auto_ptr<BitPairReference>(
                new BitPairReference(
                    threeN ? adjIdxBases_3N[0] : adjIdxBase,
                    NULL,
                    false,
                    sanityCheck,
                    NULL,
                    NULL,
                    false,
                    useMm,
                    useShmem,
                    mmSweep,
                    gVerbose,
                    startVerbose)
            );
        }
        if(!refs->loaded()) throw 1;

        // Repeat references; entries stay NULL when no repeat index is used
        BitPairReference* rrefss[2] = { NULL, NULL };
        BitPairReference* rrefs = NULL;
        if (threeN) {
            for (int j = 0; j < 2; j++) {
                if (rep_index_exists_3N[j] && use_repeat_index) {
                    const EList<uint8_t> &included = rgfms_3N[j]->getRepeatIncluded();
                    rrefss[j] = new BitPairReference(
                        rep_adjIdxBase_3N[j],
                        &included,
                        false,
                        sanityCheck,
                        NULL,
                        NULL,
                        false,
                        useMm,
                        useShmem,
                        mmSweep,
                        gVerbose,
                        startVerbose);
                    if (!rrefss[j]->loaded()) throw 1;
                }
            }
        } else {
            if(rep_index_exists && use_repeat_index) {
                const EList<uint8_t>& included = rgfm->getRepeatIncluded();
                rrefs = new BitPairReference(
                    rep_adjIdxBase,
                    &included,
                    false,
                    sanityCheck,
                    NULL,
                    NULL,
                    false,
                    useMm,
                    useShmem,
                    mmSweep,
                    gVerbose,
                    startVerbose);
                if(!rrefs->loaded()) throw 1;
            }
        }

        bool xsOnly = (tranAssm_program == "cufflinks");
        TranscriptomePolicy tpol(minIntronLen,
                                 maxIntronLen,
                                 tranAssm ? 15 : 7,
                                 tranAssm ? 20 : 14,
                                 no_spliced_alignment,
                                 tranMapOnly,
                                 tranAssm,
                                 xsOnly,
                                 avoid_pseudogene);
        GraphPolicy gpol(max_alts_tried,
                         use_haplotype,
                         (threeN ? altdbs_3N[0]->haplotypes().size() : altdb->haplotypes().size()) > 0 && use_haplotype,
                         enable_codis);

        init_junction_prob();
        bool write = novelSpliceSiteOutfile != "" || useTempSpliceSite;
        // altdbs_3N[] is only allocated in 3N mode, so pick the ALT
        // database that belongs to the active mode before dereferencing.
        bool read = knownSpliceSiteInfile != "" ||
                    novelSpliceSiteInfile != "" ||
                    useTempSpliceSite ||
                    (threeN ? altdbs_3N[0] : altdb)->hasSpliceSites();
        ssdb = new SpliceSiteDB(
            *(refs.get()),
            threeN ? refnames_3N[0] : refnames,
            nthreads > 1, // thread-safe
            write, // write?
            read); // read?
        ssdb->read(threeN ? *gfms_3N[0] : *gfm, threeN ? altdbs_3N[0]->alts() : altdb->alts());
        if(knownSpliceSiteInfile != "") {
            ifstream ssdb_file(knownSpliceSiteInfile.c_str(), ios::in);
            if(ssdb_file.is_open()) {
                ssdb->read(ssdb_file,
                           true); // known splice sites
                ssdb_file.close();
            }
        }
        if(novelSpliceSiteInfile != "") {
            ifstream ssdb_file(novelSpliceSiteInfile.c_str(), ios::in);
            if(ssdb_file.is_open()) {
                ssdb->read(ssdb_file,
                           false); // novel splice sites
                ssdb_file.close();
            }
        }

        switch(outType) {
            case OUTPUT_SAM: {
                if (threeN) {
                    mssink = new AlnSink3NSam<index_t>(
                        oq,              // output queue
                        samc,            // settings & routines for SAM output
                        refnames_3N[0],  // reference names
                        repnames_3N[0],  // repeat names
                        gQuiet,          // don't print alignment summary at end
                        nthreads,
                        refs.get(),
                        no_spliced_alignment,
                        altdbs_3N[0],
                        ssdb);
                } else {
                    mssink = new AlnSinkSam<index_t>(
                        oq,        // output queue
                        samc,      // settings & routines for SAM output
                        refnames,  // reference names
                        repnames,  // repeat names
                        gQuiet,    // don't print alignment summary at end
                        altdb,
                        ssdb);
                }
                if(!samNoHead) {
                    bool printHd = true, printSq = true;
                    BTString buf;
                    samc.printHeader(buf, rgid, rgs, printHd, !samNoSQ, printSq);
                    fout->writeString(buf);
                }
                break;
            }
            default:
                cerr << "Invalid output type: " << outType << endl;
                throw 1;
        }

        if(gVerbose || startVerbose) {
            cerr << "Dispatching to search driver: "; logTime(cerr, true);
        }
        // Set up global constraint
        OutFileBuf *metricsOfb = NULL;
        if(!metricsFile.empty() && metricsIval > 0) {
            metricsOfb = new OutFileBuf(metricsFile);
        }

        // Do the search for all input reads
        assert(patsrc != NULL);
        assert(mssink != NULL);
        multiseedSearch(
            sc,          // scoring scheme
            tpol,
            gpol,
            *patsrc,     // pattern source
            *mssink,     // hit sink
            gfms_3N,     // 3N BWT
            rgfms_3N,    // 3N
            rrefss,      // 3N
            gfm,         // BWT
            rgfm,
            refs.get(),
            rrefs,
            metricsOfb);

        // Evict any loaded indexes from memory
        if (threeN) {
            for (int j = 0; j < 2; j++) {
                if(gfms_3N[j]->isInMemory()) {
                    gfms_3N[j]->evictFromMemory();
                }
            }
        } else {
            if(gfm->isInMemory()) {
                gfm->evictFromMemory();
            }
        }

        // Alignment summary, to stderr and optionally to a summary file
        if(!gQuiet && !seedSumm) {
            size_t repThresh = mhits;
            if(repThresh == 0) {
                repThresh = std::numeric_limits<size_t>::max();
            }
            mssink->finish(cerr,
                           repThresh,
                           gReportDiscordant,
                           gReportMixed,
                           newAlignSummary,
                           hadoopOut);
            if(alignSumFile != "") {
                ofstream sumfile(alignSumFile.c_str(), ios::out);
                if(sumfile.is_open()) {
                    mssink->finish(sumfile,
                                   repThresh,
                                   gReportDiscordant,
                                   gReportMixed,
                                   newAlignSummary,
                                   false); // hadoopOut
                    sumfile.close();
                }
            }
        }

        // Write out the splice sites if an output file was requested
        if(ssdb != NULL) {
            if(novelSpliceSiteOutfile != "") {
                ofstream ssdb_file(novelSpliceSiteOutfile.c_str(), ios::out);
                if(ssdb_file.is_open()) {
                    ssdb->print(ssdb_file);
                    ssdb_file.close();
                }
            }
        }

        oq.flush(true);
        assert_eq(oq.numStarted(), oq.numFinished());
        assert_eq(oq.numStarted(), oq.numFlushed());

        delete patsrc;
        delete mssink;
        delete ssdb;
        delete metricsOfb;
        if (threeN) {
            for (int i = 0; i < 2; i++) {
                // rgfms_3N[i] / rrefss[i] are NULL when no repeat index was
                // loaded (deleting NULL is a no-op).  repeatdbs_3N[i] and
                // raltdbs_3N[i] are always allocated in 3N mode, so they
                // must be released whether or not a repeat index exists.
                delete rgfms_3N[i];
                delete rrefss[i];
                delete repeatdbs_3N[i];
                delete raltdbs_3N[i];
                delete gfms_3N[i];
                delete altdbs_3N[i];
            }
            // Close exactly the handles that were opened: one was pushed
            // for each repeat index that was actually loaded.
            for (size_t k = 0; k < repeatHandles.size(); k++) {
                ht2_close(repeatHandles[k]);
            }
        } else {
            delete altdb;
            delete repeatdb;
            delete raltdb;
            delete rgfm;
            delete rrefs;
            delete gfm;
        }
        if (refNameMap != NULL) {
            free(refNameMap);
            // Avoid leaving a dangling pointer in the global
            refNameMap = NULL;
        }
        if(fout != NULL) {
            delete fout;
        }
    }
}
// C++ name mangling is disabled for the hisat2() function to make it
// easier to use HISAT2 as a library.
extern "C" {
/**
 * Main HISAT2 / HISAT-3N entry function.  Parses argc/argv style
 * command-line options, sets global configuration variables, resolves the
 * index basename(s), read inputs and output file from the remaining
 * positional arguments, and calls the driver() function.
 *
 * Positional arguments (consumed in this order, each only if the
 * corresponding option was not already given):
 *   1. index basename        (skipped if bt2indexs[0] is already set)
 *   2. comma-separated reads (skipped if reads were given via options)
 *   3. output file           (legacy form; a warning recommends -S)
 * Anything left over is reported as an error.
 *
 * @param argc number of entries in argv
 * @param argv argument vector; argv[0] is stored in the global argv0
 * @return 0 on success or after printing version information; 1 on a usage
 *         error or when a std::exception is caught; otherwise the value of
 *         the int that was thrown (nothing is printed if that value is 0)
 */
int hisat2(int argc, const char **argv) {
    try {
        // Reset all global state, including getopt state
        opterr = optind = 1;
        resetOptions();
        // Record the full command line as a single space-separated string
        for(int i = 0; i < argc; i++) {
            argstr += argv[i];
            if(i < argc-1) argstr += " ";
        }
        if(startVerbose) { cerr << "Entered main(): "; logTime(cerr, true); }
        parseOptions(argc, argv);
        argv0 = argv[0];
        if(showVersion) {
            cout << argv0 << " version " << HISAT2_VERSION << endl;
            if(sizeof(void*) == 4) {
                cout << "32-bit" << endl;
            } else if(sizeof(void*) == 8) {
                cout << "64-bit" << endl;
            } else {
                cout << "Neither 32- nor 64-bit: sizeof(void*) = " << sizeof(void*) << endl;
            }
            cout << "Built on " << BUILD_HOST << endl;
            cout << BUILD_TIME << endl;
            cout << "Compiler: " << COMPILER_VERSION << endl;
            cout << "Options: " << COMPILER_OPTIONS << endl;
            cout << "Sizeof {int, long, long long, void*, size_t, off_t}: {"
                 << sizeof(int)
                 << ", " << sizeof(long) << ", " << sizeof(long long)
                 << ", " << sizeof(void *) << ", " << sizeof(size_t)
                 << ", " << sizeof(off_t) << "}" << endl;
            return 0;
        }
        {
            // Scoped so that the timer covers argument handling and the
            // driver() call (presumably it reports when it is destroyed).
            Timer _t(cerr, "Overall time: ", timing);
            if(startVerbose) {
                cerr << "Parsing index and read arguments: "; logTime(cerr, true);
            }

            // Get index basename (but only if it wasn't specified via --index)
            if(bt2indexs[0].empty()) {
                if(optind >= argc) {
                    cerr << "No index, query, or output file specified!" << endl;
                    printUsage(cerr);
                    return 1;
                }
                bt2indexs[0] = argv[optind++];
            }

            if (threeN) {
                // 3N mode uses a pair of indexes derived from one basename.
                bt2indexs[1] = bt2indexs[0];
                if (fileExist(bt2indexs[0] + threeN_indexTags[0] + ".1." + gfm_ext)) {
                    // Index files named with the threeN_indexTags suffixes.
                    bt2indexs[0] += threeN_indexTags[0];
                    bt2indexs[1] += threeN_indexTags[1];
                } else if (fileExist(bt2indexs[0] + ".3n.1.1." + gfm_ext)) {
                    // Index files named <base>.3n.1 / <base>.3n.2; as the
                    // message below states, these only cover C<->T changes.
                    bt2indexs[0] += ".3n.1";
                    bt2indexs[1] += ".3n.2";
                    if (!((usrInput_convertedFrom == 'C' && usrInput_convertedTo == 'T') ||
                          (usrInput_convertedFrom == 'T' && usrInput_convertedTo == 'C'))) {
                        // Note the leading space in " change." so the target
                        // base is not glued to the following word.
                        cerr << "Your current hisat-3n index only support C-to-T or T-to-C base change. Please build new hisat-3n index to support "
                             << usrInput_convertedFrom << " to " << usrInput_convertedTo << " change." << endl;
                        printUsage(cerr);
                        return 1;
                    }
                } else {
                    cerr << "Index is not exist, please use hisat-3n-build to build index first. Please use the same --base-change argument for both hisat-3n-build and hisat-3n." << endl;
                    printUsage(cerr);
                    return 1;
                }
            }

            // Get query filename
            bool got_reads = !queries.empty() || !mates1.empty() || !mates12.empty();
#ifdef USE_SRA
            got_reads = got_reads || !sra_accs.empty();
#endif

            if(minIntronLen > maxIntronLen) {
                cerr << "--min-intronlen(" << minIntronLen << ") should not be greater than --max-intronlen("
                     << maxIntronLen << ")" << endl;
                printUsage(cerr);
                return 1;
            }

            if(optind >= argc) {
                // No positional arguments left: reads must have come from options
                if(!got_reads) {
                    printUsage(cerr);
                    cerr << "***" << endl
#ifdef USE_SRA
                         << "Error: Must specify at least one read input with -U/-1/-2/--sra-acc" << endl;
#else
                         << "Error: Must specify at least one read input with -U/-1/-2" << endl;
#endif
                    return 1;
                }
            } else if(!got_reads) {
                // Tokenize the list of query files
                tokenize(argv[optind++], ",", queries);
                if(queries.empty()) {
                    cerr << "Tokenized query file list was empty!" << endl;
                    printUsage(cerr);
                    return 1;
                }
            }

            // Get output filename
            if(optind < argc && outfile.empty()) {
                outfile = argv[optind++];
                cerr << "Warning: Output file '" << outfile.c_str()
                     << "' was specified without -S.  This will not work in "
                     << "future HISAT 2 versions.  Please use -S instead."
                     << endl;
            }

            // Extra parameters?
            if(optind < argc) {
                cerr << "Extra parameter(s) specified: ";
                for(int i = optind; i < argc; i++) {
                    cerr << "\"" << argv[i] << "\"";
                    if(i < argc-1) cerr << ", ";
                }
                cerr << endl;
                if(mates1.size() > 0) {
                    cerr << "Note that if <mates> files are specified using -1/-2, a <singles> file cannot" << endl
                         << "also be specified.  Please run HISAT2 separately for mates and singles." << endl;
                }
                // Handled by the catch(int) clause below, which returns 1
                throw 1;
            }

            // Optionally summarize
            if(gVerbose) {
                cout << "Input bt2 file: \"" << bt2indexs[0].c_str() << "\"" << endl;
                cout << "Input bt2 file: \"" << bt2indexs[1].c_str() << "\"" << endl;
                cout << "Query inputs (DNA, " << file_format_names[format].c_str() << "):" << endl;
                for(size_t i = 0; i < queries.size(); i++) {
                    cout << "  " << queries[i].c_str() << endl;
                }
                cout << "Quality inputs:" << endl;
                for(size_t i = 0; i < qualities.size(); i++) {
                    cout << "  " << qualities[i].c_str() << endl;
                }
                cout << "Output file: \"" << outfile.c_str() << "\"" << endl;
                cout << "Local endianness: " << (currentlyBigEndian()? "big":"little") << endl;
                cout << "Sanity checking: " << (sanityCheck? "enabled":"disabled") << endl;
#ifdef NDEBUG
                cout << "Assertions: disabled" << endl;
#else
                cout << "Assertions: enabled" << endl;
#endif
            }
            if(ipause) {
                cout << "Press key to continue..." << endl;
                getchar();
            }
            driver<SString<char> >("DNA", bt2indexs, outfile);
        }
        return 0;
    } catch(const std::exception& e) {
        // Only e.what() is read, so catch by const reference
        cerr << "Error: Encountered exception: '" << e.what() << "'" << endl;
        cerr << "Command: ";
        for(int i = 0; i < argc; i++) cerr << argv[i] << " ";
        cerr << endl;
        return 1;
    } catch(int e) {
        // An int is thrown as an exit code; 0 exits quietly
        if(e != 0) {
            cerr << "Error: Encountered internal HISAT2 exception (#" << e << ")" << endl;
            cerr << "Command: ";
            for(int i = 0; i < argc; i++) cerr << argv[i] << " ";
            cerr << endl;
        }
        return e;
    }
} // hisat2()
} // extern "C"