/*
 * Copyright 2015, Daehwan Kim <infphilo@gmail.com>
 *
 * This file is part of HISAT 2.
 * This file is edited by Yun (Leo) Zhang for HISAT-3N.
 *
 * HISAT 2 is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HISAT 2 is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HISAT 2.  If not, see <http://www.gnu.org/licenses/>.
 */

// NOTE(review): the header names of the next ten directives are missing in
// this copy of the file (it looks like every `<...>` span was stripped).
// Restore them from version control before building.
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "alphabet.h"
#include "assert_helpers.h"
#include "endian_swap.h"
#include "hgfm.h"
#include "rfm.h"
#include "formats.h"
#include "sequence_io.h"
#include "tokenize.h"
#include "aln_sink.h"
#include "pat.h"
#include "threading.h"
#include "ds.h"
#include "aligner_metrics.h"
#include "sam.h"
#include "aligner_seed.h"
#include "splice_site.h"
#include "spliced_aligner.h"
#include "aligner_seed_policy.h"
#include "aligner_sw.h"
#include "aligner_sw_driver.h"
#include "aligner_cache.h"
#include "util.h"
#include "pe.h"
#include "tp.h"
#include "gp.h"
#include "simple_func.h"
#include "presets.h"
#include "opts.h"
#include "outq.h"
#include "repeat_kmer.h"
#include "hisat2lib/ht2.h"
//#include "utility_3n.h"

using namespace std;

// ---------------------------------------------------------------------------
// Option state.
//
// Everything below is file-level configuration for the aligner.  Variables
// declared `static` are private to this translation unit; the others have
// external linkage.  Most of them are given their defaults by resetOptions().
//
// NOTE(review): several declarations below name a template (`EList`,
// `vector`) without template arguments, and one reads `EList > extra_opts`.
// The element types appear to have been stripped together with the header
// names above -- restore them from version control.
// ---------------------------------------------------------------------------

MemoryTally gMemTally;

// --- input files and index location ----------------------------------------
static EList mates1;  // mated reads (first mate)
static EList mates2;  // mated reads (second mate)
static EList mates12; // mated reads (1st/2nd interleaved in 1 file)
static string adjIdxBase;
static string adjIdxBases_3N[2];

// --- general behaviour ------------------------------------------------------
bool gColor;              // colorspace (not supported)
int gVerbose;             // be talkative
static bool startVerbose; // be talkative at startup
int gQuiet;               // print nothing but the alignments
static int sanityCheck;   // enable expensive sanity checks
static int format;        // read format; resetOptions() selects FASTQ
static string origString; // reference text, or filename(s)
static int seed;          // srandom() seed
static int timing;        // whether to report basic timing data
static int metricsIval;   // interval between alignment metrics messages (0 = no messages)
static string metricsFile;// output file to put alignment metrics in
static bool metricsStderr;// print metrics to stderr (in addition to --metrics-file if it's specified)
static bool metricsPerRead; // report a metrics tuple for every read
static bool allHits;      // for multihits, report just one
static bool showVersion;  // just print version and quit?
static int ipause;        // pause before matching? (target of the "pause" long option)
static uint32_t qUpto;    // max # of queries to read
int gTrim5;               // amount to trim from 5' end
int gTrim3;               // amount to trim from 3' end
static int offRate;       // keep default offRate
static bool solexaQuals;  // quality strings are solexa quals, not phred, and subtract 64 (not 33)
static bool phred64Quals; // quality chars are phred, but must subtract 64 (not 33)
static bool integerQuals; // quality strings are space-separated strings of integers, not ASCII
static int nthreads;      // number of pthreads operating concurrently
static int outType;       // style of output
static bool noRefNames;   // true -> print reference indexes; not names
static uint32_t khits;    // number of hits per read; >1 is much slower
static uint32_t mhits;    // don't report any hits if there are > mhits
static int partitionSz;   // output a partitioning key in first field
static bool useSpinlock;  // false -> don't use spinlocks even if they're #defined
static bool fileParallel; // separate threads read separate input files in parallel
static bool useShmem;     // use shared memory to hold the index
static bool useMm;        // use memory-mapped files to hold the index
static bool mmSweep;      // sweep through memory-mapped files immediately after mapping

// --- paired-end constraints -------------------------------------------------
int gMinInsert;         // minimum insert size
int gMaxInsert;         // maximum insert size
bool gMate1fw;          // -1 mate aligns in fw orientation on fw strand
bool gMate2fw;          // -2 mate aligns in rc orientation on fw strand
bool gFlippedMatesOK;   // allow mates to be in wrong order
bool gDovetailMatesOK;  // allow one mate to extend off the end of the other
bool gContainMatesOK;   // allow one mate to contain the other in PE alignment
bool gOlapMatesOK;      // allow mates to overlap in PE alignment
bool gExpandToFrag;     // incr max frag length to =larger mate len if necessary
bool gReportDiscordant; // find and report discordant paired-end alignments
bool gReportMixed;      // find and report unpaired alignments for paired reads

static uint32_t cacheLimit; // ranges w/ size > limit will be cached
static uint32_t cacheSize;  // # words per range cache
static uint32_t skipReads;  // # reads/read pairs to skip
bool gNofw;                 // don't align fw orientation of read
bool gNorc;                 // don't align rc orientation of read
static uint32_t fastaContLen;
static uint32_t fastaContFreq;
static bool hadoopOut;      // print Hadoop status and summary messages
static bool fuzzy;          // reads will have alternate basecalls w/ qualities
static bool fullRef;        // print entire reference name instead of just up to 1st space

// --- SAM output --------------------------------------------------------------
static bool samTruncQname;     // whether to truncate QNAME to 255 chars
static bool samOmitSecSeqQual; // omit SEQ/QUAL for 2ndary alignments?
static bool samNoUnal;         // don't print records for unaligned reads
static bool samNoHead;         // don't print any header lines in SAM output
static bool samNoSQ;           // don't print @SQ header lines
// The sam_print_* switches below presumably enable one optional SAM field
// each (the suffix looks like the field tag) -- confirm against the SAM
// writer that consumes them.
static bool sam_print_as;
static bool sam_print_xs;  // XS:i
static bool sam_print_xss; // Xs:i and Ys:i
static bool sam_print_yn;  // YN:i and Yn:i
static bool sam_print_xn;
static bool sam_print_cs;
static bool sam_print_cq;
static bool sam_print_x0;
static bool sam_print_x1;
static bool sam_print_xm;
static bool sam_print_xo;
static bool sam_print_xg;
static bool sam_print_nm;
static bool sam_print_md;
static bool sam_print_yf;
static bool sam_print_yi;
static bool sam_print_ym;
static bool sam_print_yp;
static bool sam_print_yt;
static bool sam_print_ys;
static bool sam_print_zs;
static bool sam_print_xr;
static bool sam_print_xt;
static bool sam_print_xd;
static bool sam_print_xu;
static bool sam_print_yl;
static bool sam_print_ye;
static bool sam_print_yu;
static bool sam_print_xp;
static bool sam_print_yr;
static bool sam_print_zb;
static bool sam_print_zr;
static bool sam_print_zf;
static bool sam_print_zm;
static bool sam_print_zi;
static bool sam_print_zp;
static bool sam_print_zu;
static bool sam_print_xs_a;
static bool sam_print_nh;
static bool bwaSwLike;
static float bwaSwLikeC;
static float bwaSwLikeT;
static bool qcFilter;
static bool sortByScore;   // prioritize alignments to report by score?
bool gReportOverhangs;     // false -> filter out alignments that fall off the end of a reference sequence
static string rgid;        // ID: setting for @RG header line
static string rgs;         // SAM outputs for @RG header line
static string rgs_optflag; // SAM optional flag to add corresponding to @RG ID
static bool msample;       // whether to report a random alignment when maxed-out via -m/-M
int gGapBarrier;           // # diags on top/bot only to be entered diagonally
static EList qualities;
static EList qualities1;
static EList qualities2;
static string polstr;      // temporary holder for policy string
static bool msNoCache;     // true -> disable local cache

// --- scoring -----------------------------------------------------------------
static int bonusMatchType; // how to reward matches
static int bonusMatch;     // constant reward if bonusMatchType=constant
static int penMmcType;     // how to penalize mismatches
int penMmcMax;             // max mm penalty
static int penMmcMin;      // min mm penalty
static int penScMax;       // max sc penalty
static int penScMin;       // min sc penalty
static int penNType;       // how to penalize Ns in the read
static int penN;           // constant if N penalty is a constant
static bool penNCatPair;   // concatenate mates before N filtering?
static bool localAlign;    // do local alignment in DP steps
static bool noisyHpolymer; // set to true if gap penalties should be reduced to be consistent with a sequencer that under- and overcalls homopolymers
static int penRdGapConst;  // constant cost of extending a gap in the read
static int penRfGapConst;  // constant cost of extending a gap in the reference
static int penRdGapLinear; // coeff of linear term for cost of gap extension in read
static int penRfGapLinear; // coeff of linear term for cost of gap extension in ref
SimpleFunc scoreMin;       // minimum valid score as function of read len
static SimpleFunc nCeil;   // max # Ns allowed as function of read len
static SimpleFunc msIval;  // interval between seeds as function of read len

// --- seed search and extension budget ----------------------------------------
static double descConsExp;          // how to adjust score minimum as we descend further into index-assisted alignment
static size_t descentLanding;       // don't place a search root if it's within this many positions of end
static SimpleFunc descentTotSz;     // maximum space a DescentDriver can use in bytes
static SimpleFunc descentTotFmops;  // maximum # FM ops a DescentDriver can perform
static int multiseedMms;            // mismatches permitted in a multiseed seed
static int multiseedLen;            // length of multiseed seeds
static size_t multiseedOff;         // offset to begin extracting seeds
static uint32_t seedCacheLocalMB;   // # MB to use for non-shared seed alignment cacheing
static uint32_t seedCacheCurrentMB; // # MB to use for current-read seed hit cacheing
static uint32_t exactCacheCurrentMB; // # MB to use for current-read seed hit cacheing
static size_t maxhalf;       // max width on one side of DP table
static bool seedSumm;        // print summary information about seed hits, not alignments
static bool doUngapped;      // do ungapped alignment
static size_t maxIters;      // stop after this many extend loop iterations
static size_t maxUg;         // stop after this many ungap extends
static size_t maxDp;         // stop after this many DPs
static size_t maxItersIncr;  // amt to add to maxIters for each -k > 1
static size_t maxEeStreak;   // stop after this many end-to-end fails in a row
static size_t maxUgStreak;   // stop after this many ungap fails in a row
static size_t maxDpStreak;   // stop after this many dp fails in a row
static size_t maxStreakIncr; // amt to add to streak for each -k > 1
static size_t maxMateStreak; // stop seed range after this many mate-find fails
static bool doExtend;        // extend seed hits
static bool enable8;         // use 8-bit SSE where possible?
static size_t cminlen;       // longer reads use checkpointing
static size_t cpow2;         // checkpoint interval log2
static bool doTri;           // do triangular mini-fills?
static string defaultPreset; // default preset; applied immediately
static bool ignoreQuals;     // all mms incur same penalty, regardless of qual
static string wrapper;       // type of wrapper script, so we can print correct usage
static EList queries;        // list of query files
static string outfile;       // write SAM output to this file
static int mapqv;            // MAPQ calculation version
static int tighten;          // -M tighten mode (0=none, 1=best, 2=secbest+1)
static bool doExactUpFront;  // do exact search up front if seeds seem good enough
static bool do1mmUpFront;    // do 1mm search up front if seeds seem good enough
static size_t do1mmMinLen;   // length below which we disable 1mm e2e search
static int seedBoostThresh;  // if average non-zero position has more than this many elements
static size_t maxSeeds;      // maximum number of seeds allowed
static size_t nSeedRounds;   // # seed rounds
static bool reorder;         // true -> reorder SAM recs in -p mode
static float sampleFrac;     // only align random fraction of input reads
static bool arbitraryRandom; // pseudo-randoms no longer a function of read properties
static bool bowtie2p5;

// --- spliced alignment ---------------------------------------------------------
static bool useTempSpliceSite;
static int penCanSplice;
static int penNoncanSplice;
static int penConflictSplice;
static SimpleFunc penCanIntronLen;
static SimpleFunc penNoncanIntronLen;
static size_t minIntronLen;
static size_t maxIntronLen;
static string knownSpliceSiteInfile;  //
static string novelSpliceSiteInfile;  //
static string novelSpliceSiteOutfile; //
static bool secondary;                // allow secondary alignments
static bool no_spliced_alignment;
static int rna_strandness;            //
static bool splicesite_db_only;       //
static bool anchorStop;
static bool pseudogeneStop;
static bool tranMapOnly;              // transcriptome mapping only
static bool tranAssm;                 // alignments selected for downstream transcript assembly such as StringTie and Cufflinks
static string tranAssm_program;
static bool avoid_pseudogene;

#ifdef USE_SRA
static EList sra_accs;
#endif

static string bt2indexs[2]; // read Bowtie 2 index from files with this prefix
static EList > extra_opts;
static size_t extra_opts_cur;

static EList thread_rids;
static MUTEX_T thread_rids_mutex;
static uint64_t thread_rids_mindist;

static bool rmChrName;  // remove "chr" from reference names (e.g., chr18 to 18)
static bool addChrName; // add "chr" to reference names (e.g., 18 to chr18)

static size_t max_alts_tried;
static bool use_haplotype;
static bool enable_codis;
static bool templateLenAdjustment;
static string alignSumFile; // write alignment summary stat. to this file
static bool newAlignSummary;
static int bowtie2_dp;      // Bowtie2's dynamic programming alignment (0: no dynamic programming, 1: conditional dynamic programming, and 2: unconditional dynamic programming)
static bool fast;           // --fast
static bool sensitive;      // --sensitive
static bool very_sensitive; // --very-sensitive
static bool repeat;         // true iff alignments to repeat sequences are directly reported
static bool use_repeat_index;
static EList readLens;

// --- HISAT-3N variables --------------------------------------------------------
bool threeN = false;                   // indicator for 3N mode.
bool base_change_entered;              // set true once user used --base-change
char usrInput_convertedFrom;           // user-specified "converted from" nucleotide, i.e. the base that is replaced by another during the sample preparation protocol. Used in the sequence comparison step of HISAT-3N.
char usrInput_convertedTo;             // user-specified "converted to" nucleotide, i.e. the base the converted-from nucleotide is replaced with during sample preparation. Used in the sequence comparison step of HISAT-3N.
char usrInput_convertedFromComplement; // the complement of usrInput_convertedFrom. for sequence comparison step in HISAT-3N.
char usrInput_convertedToComplement;   // the complement of usrInput_convertedTo. for sequence comparison step in HISAT-3N.
char hs3N_convertedFrom;               // the actual "converted from" base used by HISAT-3N. used on the + strand.
char hs3N_convertedTo;                 // the actual "converted to" base used by HISAT-3N. used on the + strand.
char hs3N_convertedFromComplement;     // the complement of hs3N_convertedFrom. used on the - strand.
char hs3N_convertedToComplement;       // the complement of hs3N_convertedTo. used on the - strand.
string threeN_indexTags[2];
vector repeatHandles;                  // the 2 repeat handles help expand the repeat alignment information. 0 for + strand. 1 for - strand.
struct ht2_index_getrefnames_result *refNameMap; // chromosome names and their indexes, for repeat alignment.
int repeatLimit;                       // expand #repeatLimit of qualified position in repeat alignment.
bool uniqueOutputOnly;                 // only output the unique alignment result.
int nMappingCycle;                     // =1 for standard HISAT2, =4 for HISAT-3N
bool mappingCycles[4];                 // this array will indicate which mapping cycle will be run
int directional3NMapping;              // =0 for non-directional mapping, =1 for directional mapping and read1/single-end map to fw reference, =2 for reverse directional mapping and read1/single-end map to rc reference.
#define DMAX std::numeric_limits::max() static void resetOptions() { mates1.clear(); mates2.clear(); mates12.clear(); adjIdxBase = ""; adjIdxBases_3N[0] = ""; adjIdxBases_3N[1] = ""; gColor = false; gVerbose = 0; startVerbose = 0; gQuiet = false; sanityCheck = 0; // enable expensive sanity checks format = FASTQ; // default read format is FASTQ origString = ""; // reference text, or filename(s) seed = 0; // srandom() seed timing = 0; // whether to report basic timing data metricsIval = 1; // interval between alignment metrics messages (0 = no messages) metricsFile = ""; // output file to put alignment metrics in metricsStderr = false; // print metrics to stderr (in addition to --metrics-file if it's specified metricsPerRead = false; // report a metrics tuple for every read? allHits = false; // for multihits, report just one showVersion = false; // just print version and quit? ipause = 0; // pause before maching? qUpto = 0xffffffff; // max # of queries to read gTrim5 = 0; // amount to trim from 5' end gTrim3 = 0; // amount to trim from 3' end offRate = -1; // keep default offRate solexaQuals = false; // quality strings are solexa quals, not phred, and subtract 64 (not 33) phred64Quals = false; // quality chars are phred, but must subtract 64 (not 33) integerQuals = false; // quality strings are space-separated strings of integers, not ASCII nthreads = 1; // number of pthreads operating concurrently outType = OUTPUT_SAM; // style of output noRefNames = false; // true -> print reference indexes; not names khits = 10; // number of hits per read; >1 is much slower mhits = 0; // stop after finding this many alignments+1 partitionSz = 0; // output a partitioning key in first field useSpinlock = true; // false -> don't use of spinlocks even if they're #defines fileParallel = false; // separate threads read separate input files in parallel useShmem = false; // use shared memory to hold the index useMm = false; // use memory-mapped files to hold the index mmSweep = false; // 
sweep through memory-mapped files immediately after mapping gMinInsert = 0; // minimum insert size gMaxInsert = 1000; // maximum insert size gMate1fw = true; // -1 mate aligns in fw orientation on fw strand gMate2fw = false; // -2 mate aligns in rc orientation on fw strand gFlippedMatesOK = false; // allow mates to be in wrong order gDovetailMatesOK = false; // allow one mate to extend off the end of the other gContainMatesOK = true; // allow one mate to contain the other in PE alignment gOlapMatesOK = true; // allow mates to overlap in PE alignment gExpandToFrag = true; // incr max frag length to =larger mate len if necessary gReportDiscordant = true; // find and report discordant paired-end alignments gReportMixed = true; // find and report unpaired alignments for paired reads cacheLimit = 5; // ranges w/ size > limit will be cached cacheSize = 0; // # words per range cache skipReads = 0; // # reads/read pairs to skip gNofw = false; // don't align fw orientation of read gNorc = false; // don't align rc orientation of read fastaContLen = 0; fastaContFreq = 0; hadoopOut = false; // print Hadoop status and summary messages fuzzy = false; // reads will have alternate basecalls w/ qualities fullRef = false; // print entire reference name instead of just up to 1st space samTruncQname = true; // whether to truncate QNAME to 255 chars samOmitSecSeqQual = false; // omit SEQ/QUAL for 2ndary alignments? 
samNoUnal = false; // omit SAM records for unaligned reads samNoHead = false; // don't print any header lines in SAM output samNoSQ = false; // don't print @SQ header lines sam_print_as = true; sam_print_xs = true; sam_print_xss = false; // Xs:i and Ys:i sam_print_yn = false; // YN:i and Yn:i sam_print_xn = true; sam_print_cs = false; sam_print_cq = false; sam_print_x0 = true; sam_print_x1 = true; sam_print_xm = true; sam_print_xo = true; sam_print_xg = true; sam_print_nm = true; sam_print_md = true; sam_print_yf = true; sam_print_yi = false; sam_print_ym = false; sam_print_yp = false; sam_print_yt = true; sam_print_ys = true; sam_print_zs = false; sam_print_xr = false; sam_print_xt = false; sam_print_xd = false; sam_print_xu = false; sam_print_yl = false; sam_print_ye = false; sam_print_yu = false; sam_print_xp = false; sam_print_yr = false; sam_print_zb = false; sam_print_zr = false; sam_print_zf = false; sam_print_zm = false; sam_print_zi = false; sam_print_zp = false; sam_print_zu = false; sam_print_xs_a = true; sam_print_nh = true; bwaSwLike = false; bwaSwLikeC = 5.5f; bwaSwLikeT = 20.0f; qcFilter = false; // don't believe upstream qc by default sortByScore = true; // prioritize alignments to report by score? 
rgid = ""; // SAM outputs for @RG header line rgs = ""; // SAM outputs for @RG header line rgs_optflag = ""; // SAM optional flag to add corresponding to @RG ID msample = true; gGapBarrier = 4; // disallow gaps within this many chars of either end of alignment qualities.clear(); qualities1.clear(); qualities2.clear(); polstr.clear(); msNoCache = true; // true -> disable local cache bonusMatchType = DEFAULT_MATCH_BONUS_TYPE; bonusMatch = DEFAULT_MATCH_BONUS; penMmcType = DEFAULT_MM_PENALTY_TYPE; penMmcMax = DEFAULT_MM_PENALTY_MAX; penMmcMin = DEFAULT_MM_PENALTY_MIN; penScMax = DEFAULT_SC_PENALTY_MAX; penScMin = DEFAULT_SC_PENALTY_MIN; penNType = DEFAULT_N_PENALTY_TYPE; penN = DEFAULT_N_PENALTY; penNCatPair = DEFAULT_N_CAT_PAIR; // concatenate mates before N filtering? localAlign = false; // do local alignment in DP steps noisyHpolymer = false; penRdGapConst = DEFAULT_READ_GAP_CONST; penRfGapConst = DEFAULT_REF_GAP_CONST; penRdGapLinear = DEFAULT_READ_GAP_LINEAR; penRfGapLinear = DEFAULT_REF_GAP_LINEAR; scoreMin.init (SIMPLE_FUNC_LINEAR, 0.0f, -0.2f); // scoreMin.init (SIMPLE_FUNC_CONST, -18, 0); nCeil.init (SIMPLE_FUNC_LINEAR, 0.0f, DMAX, 2.0f, 0.1f); msIval.init (SIMPLE_FUNC_LINEAR, 1.0f, DMAX, DEFAULT_IVAL_B, DEFAULT_IVAL_A); descConsExp = 2.0; descentLanding = 20; descentTotSz.init(SIMPLE_FUNC_LINEAR, 1024.0, DMAX, 0.0, 1024.0); descentTotFmops.init(SIMPLE_FUNC_LINEAR, 100.0, DMAX, 0.0, 10.0); multiseedMms = DEFAULT_SEEDMMS; multiseedLen = DEFAULT_SEEDLEN; multiseedOff = 0; seedCacheLocalMB = 32; // # MB to use for non-shared seed alignment cacheing seedCacheCurrentMB = 20; // # MB to use for current-read seed hit cacheing exactCacheCurrentMB = 20; // # MB to use for current-read seed hit cacheing maxhalf = 15; // max width on one side of DP table seedSumm = false; // print summary information about seed hits, not alignments doUngapped = true; // do ungapped alignment maxIters = 400; // max iterations of extend loop maxUg = 300; // stop after this many ungap 
extends maxDp = 300; // stop after this many dp extends maxItersIncr = 20; // amt to add to maxIters for each -k > 1 maxEeStreak = 15; // stop after this many end-to-end fails in a row maxUgStreak = 15; // stop after this many ungap fails in a row maxDpStreak = 15; // stop after this many dp fails in a row maxStreakIncr = 10; // amt to add to streak for each -k > 1 maxMateStreak = 10; // in PE: abort seed range after N mate-find fails doExtend = true; // do seed extensions enable8 = true; // use 8-bit SSE where possible? cminlen = 2000; // longer reads use checkpointing cpow2 = 4; // checkpoint interval log2 doTri = false; // do triangular mini-fills? defaultPreset = "sensitive%LOCAL%"; // default preset; applied immediately extra_opts.clear(); extra_opts_cur = 0; bt2indexs[0].clear(); // read Bowtie 2 index from files with this prefix bt2indexs[1].clear(); ignoreQuals = false; // all mms incur same penalty, regardless of qual wrapper.clear(); // type of wrapper script, so we can print correct usage queries.clear(); // list of query files outfile.clear(); // write SAM output to this file mapqv = 2; // MAPQ calculation version tighten = 3; // -M tightening mode doExactUpFront = true; // do exact search up front if seeds seem good enough do1mmUpFront = true; // do 1mm search up front if seeds seem good enough seedBoostThresh = 300; // if average non-zero position has more than this many elements nSeedRounds = 2; // # rounds of seed searches to do for repetitive reads maxSeeds = 0; // maximum number of seeds allowed do1mmMinLen = 60; // length below which we disable 1mm search reorder = false; // reorder SAM records with -p > 1 sampleFrac = 1.1f; // align all reads arbitraryRandom = false; // let pseudo-random seeds be a function of read properties bowtie2p5 = false; useTempSpliceSite = true; penCanSplice = 0; penNoncanSplice = 12; penConflictSplice = 1000000; penCanIntronLen.init(SIMPLE_FUNC_LOG, -8, 1); penNoncanIntronLen.init(SIMPLE_FUNC_LOG, -8, 1); minIntronLen = 
20; maxIntronLen = 500000; knownSpliceSiteInfile = ""; novelSpliceSiteInfile = ""; novelSpliceSiteOutfile = ""; secondary = false; // allow secondary alignments no_spliced_alignment = false; rna_strandness = RNA_STRANDNESS_UNKNOWN; splicesite_db_only = false; anchorStop = true; pseudogeneStop = true; tranMapOnly = false; tranAssm = false; tranAssm_program = ""; avoid_pseudogene = false; #ifdef USE_SRA sra_accs.clear(); #endif rmChrName = false; addChrName = false; max_alts_tried = 16; use_haplotype = false; enable_codis = false; templateLenAdjustment = true; alignSumFile = ""; newAlignSummary = false; bowtie2_dp = 0; // disable Bowtie2's dynamic programming alignment fast = false; sensitive = false; very_sensitive = false; repeat = false; // true iff alignments to repeat sequences are directly reported. use_repeat_index = true; readLens.clear(); refNameMap = NULL; threeN = false; repeatLimit = 1000; uniqueOutputOnly = false; base_change_entered = false; threeN_indexTags[0] = ".3n."; threeN_indexTags[1] = ".3n."; nMappingCycle = 1; directional3NMapping = 0; for (int i = 0; i < 4; i++){ mappingCycles[i] = false; } } static const char *short_options = "fF:qbzhcu:rv:s:aP:t3:5:w:p:k:M:1:2:I:X:CQ:N:i:L:U:x:S:g:O:D:R:"; static struct option long_options[] = { {(char*)"verbose", no_argument, 0, ARG_VERBOSE}, {(char*)"startverbose", no_argument, 0, ARG_STARTVERBOSE}, {(char*)"quiet", no_argument, 0, ARG_QUIET}, {(char*)"sanity", no_argument, 0, ARG_SANITY}, {(char*)"pause", no_argument, &ipause, 1}, {(char*)"orig", required_argument, 0, ARG_ORIG}, {(char*)"all", no_argument, 0, 'a'}, {(char*)"solexa-quals", no_argument, 0, ARG_SOLEXA_QUALS}, {(char*)"integer-quals",no_argument, 0, ARG_INTEGER_QUALS}, {(char*)"int-quals", no_argument, 0, ARG_INTEGER_QUALS}, {(char*)"metrics", required_argument, 0, ARG_METRIC_IVAL}, {(char*)"metrics-file", required_argument, 0, ARG_METRIC_FILE}, {(char*)"metrics-stderr",no_argument, 0, ARG_METRIC_STDERR}, {(char*)"metrics-per-read", 
no_argument, 0, ARG_METRIC_PER_READ}, {(char*)"met-read", no_argument, 0, ARG_METRIC_PER_READ}, {(char*)"met", required_argument, 0, ARG_METRIC_IVAL}, {(char*)"met-file", required_argument, 0, ARG_METRIC_FILE}, {(char*)"met-stderr", no_argument, 0, ARG_METRIC_STDERR}, {(char*)"time", no_argument, 0, 't'}, {(char*)"trim3", required_argument, 0, '3'}, {(char*)"trim5", required_argument, 0, '5'}, {(char*)"seed", required_argument, 0, ARG_SEED}, {(char*)"qupto", required_argument, 0, 'u'}, {(char*)"upto", required_argument, 0, 'u'}, {(char*)"version", no_argument, 0, ARG_VERSION}, {(char*)"filepar", no_argument, 0, ARG_FILEPAR}, {(char*)"help", no_argument, 0, 'h'}, {(char*)"threads", required_argument, 0, 'p'}, {(char*)"khits", required_argument, 0, 'k'}, {(char*)"minins", required_argument, 0, 'I'}, {(char*)"maxins", required_argument, 0, 'X'}, {(char*)"quals", required_argument, 0, 'Q'}, {(char*)"Q1", required_argument, 0, ARG_QUALS1}, {(char*)"Q2", required_argument, 0, ARG_QUALS2}, {(char*)"refidx", no_argument, 0, ARG_REFIDX}, {(char*)"partition", required_argument, 0, ARG_PARTITION}, {(char*)"ff", no_argument, 0, ARG_FF}, {(char*)"fr", no_argument, 0, ARG_FR}, {(char*)"rf", no_argument, 0, ARG_RF}, {(char*)"cachelim", required_argument, 0, ARG_CACHE_LIM}, {(char*)"cachesz", required_argument, 0, ARG_CACHE_SZ}, {(char*)"nofw", no_argument, 0, ARG_NO_FW}, {(char*)"norc", no_argument, 0, ARG_NO_RC}, {(char*)"skip", required_argument, 0, 's'}, {(char*)"12", required_argument, 0, ARG_ONETWO}, {(char*)"tab5", required_argument, 0, ARG_TAB5}, {(char*)"tab6", required_argument, 0, ARG_TAB6}, {(char*)"phred33-quals", no_argument, 0, ARG_PHRED33}, {(char*)"phred64-quals", no_argument, 0, ARG_PHRED64}, {(char*)"phred33", no_argument, 0, ARG_PHRED33}, {(char*)"phred64", no_argument, 0, ARG_PHRED64}, {(char*)"solexa1.3-quals", no_argument, 0, ARG_PHRED64}, {(char*)"mm", no_argument, 0, ARG_MM}, {(char*)"shmem", no_argument, 0, ARG_SHMEM}, {(char*)"mmsweep", no_argument, 0, 
ARG_MMSWEEP}, {(char*)"hadoopout", no_argument, 0, ARG_HADOOPOUT}, {(char*)"fuzzy", no_argument, 0, ARG_FUZZY}, {(char*)"fullref", no_argument, 0, ARG_FULLREF}, {(char*)"usage", no_argument, 0, ARG_USAGE}, {(char*)"sam-no-qname-trunc", no_argument, 0, ARG_SAM_NO_QNAME_TRUNC}, {(char*)"sam-omit-sec-seq", no_argument, 0, ARG_SAM_OMIT_SEC_SEQ}, {(char*)"omit-sec-seq", no_argument, 0, ARG_SAM_OMIT_SEC_SEQ}, {(char*)"sam-no-head", no_argument, 0, ARG_SAM_NOHEAD}, {(char*)"sam-nohead", no_argument, 0, ARG_SAM_NOHEAD}, {(char*)"sam-noHD", no_argument, 0, ARG_SAM_NOHEAD}, {(char*)"sam-no-hd", no_argument, 0, ARG_SAM_NOHEAD}, {(char*)"sam-nosq", no_argument, 0, ARG_SAM_NOSQ}, {(char*)"sam-no-sq", no_argument, 0, ARG_SAM_NOSQ}, {(char*)"sam-noSQ", no_argument, 0, ARG_SAM_NOSQ}, {(char*)"no-head", no_argument, 0, ARG_SAM_NOHEAD}, {(char*)"no-hd", no_argument, 0, ARG_SAM_NOHEAD}, {(char*)"no-sq", no_argument, 0, ARG_SAM_NOSQ}, {(char*)"no-HD", no_argument, 0, ARG_SAM_NOHEAD}, {(char*)"no-SQ", no_argument, 0, ARG_SAM_NOSQ}, {(char*)"no-unal", no_argument, 0, ARG_SAM_NO_UNAL}, {(char*)"color", no_argument, 0, 'C'}, {(char*)"sam-RG", required_argument, 0, ARG_SAM_RG}, {(char*)"sam-rg", required_argument, 0, ARG_SAM_RG}, {(char*)"sam-rg-id", required_argument, 0, ARG_SAM_RGID}, {(char*)"RG", required_argument, 0, ARG_SAM_RG}, {(char*)"rg", required_argument, 0, ARG_SAM_RG}, {(char*)"rg-id", required_argument, 0, ARG_SAM_RGID}, {(char*)"snpphred", required_argument, 0, ARG_SNPPHRED}, {(char*)"snpfrac", required_argument, 0, ARG_SNPFRAC}, {(char*)"gbar", required_argument, 0, ARG_GAP_BAR}, {(char*)"qseq", no_argument, 0, ARG_QSEQ}, {(char*)"policy", required_argument, 0, ARG_ALIGN_POLICY}, {(char*)"preset", required_argument, 0, 'P'}, {(char*)"seed-summ", no_argument, 0, ARG_SEED_SUMM}, {(char*)"seed-summary", no_argument, 0, ARG_SEED_SUMM}, {(char*)"overhang", no_argument, 0, ARG_OVERHANG}, {(char*)"no-cache", no_argument, 0, ARG_NO_CACHE}, {(char*)"cache", no_argument, 0, 
ARG_USE_CACHE}, {(char*)"454", no_argument, 0, ARG_NOISY_HPOLY}, {(char*)"ion-torrent", no_argument, 0, ARG_NOISY_HPOLY}, {(char*)"no-mixed", no_argument, 0, ARG_NO_MIXED}, {(char*)"no-discordant",no_argument, 0, ARG_NO_DISCORDANT}, // {(char*)"local", no_argument, 0, ARG_LOCAL}, {(char*)"end-to-end", no_argument, 0, ARG_END_TO_END}, {(char*)"ungapped", no_argument, 0, ARG_UNGAPPED}, {(char*)"no-ungapped", no_argument, 0, ARG_UNGAPPED_NO}, {(char*)"sse8", no_argument, 0, ARG_SSE8}, {(char*)"no-sse8", no_argument, 0, ARG_SSE8_NO}, {(char*)"scan-narrowed",no_argument, 0, ARG_SCAN_NARROWED}, {(char*)"qc-filter", no_argument, 0, ARG_QC_FILTER}, {(char*)"bwa-sw-like", no_argument, 0, ARG_BWA_SW_LIKE}, {(char*)"multiseed", required_argument, 0, ARG_MULTISEED_IVAL}, {(char*)"ma", required_argument, 0, ARG_SCORE_MA}, {(char*)"mp", required_argument, 0, ARG_SCORE_MMP}, {(char*)"sp", required_argument, 0, ARG_SCORE_SCP}, {(char*)"no-softclip", no_argument, 0, ARG_NO_SOFTCLIP}, {(char*)"np", required_argument, 0, ARG_SCORE_NP}, {(char*)"rdg", required_argument, 0, ARG_SCORE_RDG}, {(char*)"rfg", required_argument, 0, ARG_SCORE_RFG}, {(char*)"score-min", required_argument, 0, ARG_SCORE_MIN}, {(char*)"min-score", required_argument, 0, ARG_SCORE_MIN}, {(char*)"n-ceil", required_argument, 0, ARG_N_CEIL}, {(char*)"dpad", required_argument, 0, ARG_DPAD}, {(char*)"mapq-print-inputs",no_argument, 0, ARG_SAM_PRINT_YI}, {(char*)"very-fast", no_argument, 0, ARG_PRESET_VERY_FAST}, {(char*)"fast", no_argument, 0, ARG_PRESET_FAST}, {(char*)"sensitive", no_argument, 0, ARG_PRESET_SENSITIVE}, {(char*)"very-sensitive", no_argument, 0, ARG_PRESET_VERY_SENSITIVE}, // {(char*)"very-fast-local", no_argument, 0, ARG_PRESET_VERY_FAST_LOCAL}, // {(char*)"fast-local", no_argument, 0, ARG_PRESET_FAST_LOCAL}, // {(char*)"sensitive-local", no_argument, 0, ARG_PRESET_SENSITIVE_LOCAL}, // {(char*)"very-sensitive-local", no_argument, 0, ARG_PRESET_VERY_SENSITIVE_LOCAL}, 
{(char*)"no-score-priority",no_argument, 0, ARG_NO_SCORE_PRIORITY}, {(char*)"seedlen", required_argument, 0, 'L'}, {(char*)"seedmms", required_argument, 0, 'N'}, {(char*)"seedival", required_argument, 0, 'i'}, {(char*)"ignore-quals", no_argument, 0, ARG_IGNORE_QUALS}, {(char*)"index", required_argument, 0, 'x'}, {(char*)"arg-desc", no_argument, 0, ARG_DESC}, {(char*)"wrapper", required_argument, 0, ARG_WRAPPER}, {(char*)"unpaired", required_argument, 0, 'U'}, {(char*)"output", required_argument, 0, 'S'}, {(char*)"mapq-v", required_argument, 0, ARG_MAPQ_V}, {(char*)"dovetail", no_argument, 0, ARG_DOVETAIL}, {(char*)"no-dovetail", no_argument, 0, ARG_NO_DOVETAIL}, {(char*)"contain", no_argument, 0, ARG_CONTAIN}, {(char*)"no-contain", no_argument, 0, ARG_NO_CONTAIN}, {(char*)"overlap", no_argument, 0, ARG_OVERLAP}, {(char*)"no-overlap", no_argument, 0, ARG_NO_OVERLAP}, {(char*)"tighten", required_argument, 0, ARG_TIGHTEN}, {(char*)"exact-upfront", no_argument, 0, ARG_EXACT_UPFRONT}, {(char*)"1mm-upfront", no_argument, 0, ARG_1MM_UPFRONT}, {(char*)"no-exact-upfront", no_argument, 0, ARG_EXACT_UPFRONT_NO}, {(char*)"no-1mm-upfront", no_argument, 0, ARG_1MM_UPFRONT_NO}, {(char*)"1mm-minlen", required_argument, 0, ARG_1MM_MINLEN}, {(char*)"seed-off", required_argument, 0, 'O'}, {(char*)"seed-boost", required_argument, 0, ARG_SEED_BOOST_THRESH}, {(char*)"max-seeds", required_argument, 0, ARG_MAX_SEEDS}, {(char*)"read-times", no_argument, 0, ARG_READ_TIMES}, {(char*)"show-rand-seed", no_argument, 0, ARG_SHOW_RAND_SEED}, {(char*)"dp-fail-streak", required_argument, 0, ARG_DP_FAIL_STREAK_THRESH}, {(char*)"ee-fail-streak", required_argument, 0, ARG_EE_FAIL_STREAK_THRESH}, {(char*)"ug-fail-streak", required_argument, 0, ARG_UG_FAIL_STREAK_THRESH}, {(char*)"fail-streak", required_argument, 0, 'D'}, {(char*)"dp-fails", required_argument, 0, ARG_DP_FAIL_THRESH}, {(char*)"ug-fails", required_argument, 0, ARG_UG_FAIL_THRESH}, {(char*)"extends", required_argument, 0, 
ARG_EXTEND_ITERS}, {(char*)"no-extend", no_argument, 0, ARG_NO_EXTEND}, {(char*)"mapq-extra", no_argument, 0, ARG_MAPQ_EX}, {(char*)"seed-rounds", required_argument, 0, 'R'}, {(char*)"reorder", no_argument, 0, ARG_REORDER}, {(char*)"passthrough", no_argument, 0, ARG_READ_PASSTHRU}, {(char*)"sample", required_argument, 0, ARG_SAMPLE}, {(char*)"cp-min", required_argument, 0, ARG_CP_MIN}, {(char*)"cp-ival", required_argument, 0, ARG_CP_IVAL}, {(char*)"tri", no_argument, 0, ARG_TRI}, {(char*)"nondeterministic", no_argument, 0, ARG_NON_DETERMINISTIC}, {(char*)"non-deterministic", no_argument, 0, ARG_NON_DETERMINISTIC}, // {(char*)"local-seed-cache-sz", required_argument, 0, ARG_LOCAL_SEED_CACHE_SZ}, {(char*)"seed-cache-sz", required_argument, 0, ARG_CURRENT_SEED_CACHE_SZ}, {(char*)"no-unal", no_argument, 0, ARG_SAM_NO_UNAL}, {(char*)"test-25", no_argument, 0, ARG_TEST_25}, // TODO: following should be a function of read length? {(char*)"desc-kb", required_argument, 0, ARG_DESC_KB}, {(char*)"desc-landing", required_argument, 0, ARG_DESC_LANDING}, {(char*)"desc-exp", required_argument, 0, ARG_DESC_EXP}, {(char*)"desc-fmops", required_argument, 0, ARG_DESC_FMOPS}, {(char*)"no-temp-splicesite", no_argument, 0, ARG_NO_TEMPSPLICESITE}, {(char*)"pen-cansplice", required_argument, 0, ARG_PEN_CANSPLICE}, {(char*)"pen-noncansplice", required_argument, 0, ARG_PEN_NONCANSPLICE}, {(char*)"pen-conflictsplice", required_argument, 0, ARG_PEN_CONFLICTSPLICE}, {(char*)"pen-intronlen", required_argument, 0, ARG_PEN_CANINTRONLEN}, {(char*)"pen-canintronlen", required_argument, 0, ARG_PEN_CANINTRONLEN}, {(char*)"pen-noncanintronlen", required_argument, 0, ARG_PEN_NONCANINTRONLEN}, {(char*)"min-intronlen", required_argument, 0, ARG_MIN_INTRONLEN}, {(char*)"max-intronlen", required_argument, 0, ARG_MAX_INTRONLEN}, {(char*)"known-splicesite-infile", required_argument, 0, ARG_KNOWN_SPLICESITE_INFILE}, {(char*)"novel-splicesite-infile", required_argument, 0, ARG_NOVEL_SPLICESITE_INFILE}, 
{(char*)"novel-splicesite-outfile", required_argument, 0, ARG_NOVEL_SPLICESITE_OUTFILE}, {(char*)"secondary", no_argument, 0, ARG_SECONDARY}, {(char*)"no-spliced-alignment", no_argument, 0, ARG_NO_SPLICED_ALIGNMENT}, {(char*)"rna-strandness", required_argument, 0, ARG_RNA_STRANDNESS}, {(char*)"splicesite-db-only", no_argument, 0, ARG_SPLICESITE_DB_ONLY}, {(char*)"no-anchorstop", no_argument, 0, ARG_NO_ANCHORSTOP}, {(char*)"transcriptome-mapping-only", no_argument, 0, ARG_TRANSCRIPTOME_MAPPING_ONLY}, {(char*)"tmo", no_argument, 0, ARG_TRANSCRIPTOME_MAPPING_ONLY}, {(char*)"downstream-transcriptome-assembly", no_argument, 0, ARG_TRANSCRIPTOME_ASSEMBLY}, {(char*)"dta", no_argument, 0, ARG_TRANSCRIPTOME_ASSEMBLY}, {(char*)"dta-cufflinks", no_argument, 0, ARG_TRANSCRIPTOME_ASSEMBLY_CUFFLINKS}, {(char*)"avoid-pseudogene",no_argument, 0, ARG_AVOID_PSEUDOGENE}, {(char*)"no-templatelen-adjustment", no_argument, 0, ARG_NO_TEMPLATELEN_ADJUSTMENT}, #ifdef USE_SRA {(char*)"sra-acc", required_argument, 0, ARG_SRA_ACC}, #endif {(char*)"remove-chrname", no_argument, 0, ARG_REMOVE_CHRNAME}, {(char*)"add-chrname", no_argument, 0, ARG_ADD_CHRNAME}, {(char*)"max-altstried", required_argument, 0, ARG_MAX_ALTSTRIED}, {(char*)"haplotype", no_argument, 0, ARG_HAPLOTYPE}, {(char*)"enable-codis", no_argument, 0, ARG_CODIS}, {(char*)"summary-file", required_argument, 0, ARG_SUMMARY_FILE}, {(char*)"new-summary", no_argument, 0, ARG_NEW_SUMMARY}, {(char*)"enable-dp", no_argument, 0, ARG_DP}, {(char*)"bowtie2-dp", required_argument, 0, ARG_DP}, {(char*)"repeat", no_argument, 0, ARG_REPEAT}, {(char*)"no-repeat-index", no_argument, 0, ARG_NO_REPEAT_INDEX}, {(char*)"read-lengths", required_argument, 0, ARG_READ_LENGTHS}, {(char*)"base-change", required_argument, 0, ARG_BASE_CHANGE}, {(char*)"repeat-limit", required_argument, 0, ARG_REPEAT_LIMIT}, {(char*)"unique-only", no_argument, 0, ARG_UNIQUE_ONLY}, {(char*)"3N", no_argument, 0, ARG_3N}, {(char*)"directional-mapping", no_argument, 0, 
ARG_DIRECTIONAL}, {(char*)"directional-mapping-reverse", no_argument, 0, ARG_DIRECTIONAL_REVERSE}, {(char*)0, 0, 0, 0} // terminator }; /** * Print out a concise description of what options are taken and whether they * take an argument. */ static void printArgDesc(ostream& out) { // struct option { // const char *name; // int has_arg; // int *flag; // int val; // }; size_t i = 0; while(long_options[i].name != 0) { out << long_options[i].name << "\t" << (long_options[i].has_arg == no_argument ? 0 : 1) << endl; i++; } size_t solen = strlen(short_options); for(i = 0; i < solen; i++) { // Has an option? Does if next char is : if(i == solen-1) { assert_neq(':', short_options[i]); cout << (char)short_options[i] << "\t" << 0 << endl; } else { if(short_options[i+1] == ':') { // Option with argument cout << (char)short_options[i] << "\t" << 1 << endl; i++; // skip the ':' } else { // Option with no argument cout << (char)short_options[i] << "\t" << 0 << endl; } } } } /** * Print a summary usage message to the provided output stream. */ static void printUsage(ostream& out) { out << "HISAT2 version " << string(HISAT2_VERSION).c_str() << " by Daehwan Kim (infphilo@gmail.com, www.ccb.jhu.edu/people/infphilo)" << endl; string tool_name = "hisat2-align"; if(wrapper == "basic-0") { tool_name = "hisat2"; } out << "Usage: " << endl #ifdef USE_SRA << " " << tool_name.c_str() << " [options]* -x {-1 -2 | -U | --sra-acc } [-S ]" << endl #else << " " << tool_name.c_str() << " [options]* -x {-1 -2 | -U } [-S ]" << endl #endif << endl << " Index filename prefix (minus trailing .X." << gfm_ext << ")." << endl << " Files with #1 mates, paired with files in ." << endl; if(wrapper == "basic-0") { out << " Could be gzip'ed (extension: .gz) or bzip2'ed (extension: .bz2)." << endl; } out << " Files with #2 mates, paired with files in ." << endl; if(wrapper == "basic-0") { out << " Could be gzip'ed (extension: .gz) or bzip2'ed (extension: .bz2)." << endl; } out << " Files with unpaired reads." 
<< endl; if(wrapper == "basic-0") { out << " Could be gzip'ed (extension: .gz) or bzip2'ed (extension: .bz2)." << endl; } #ifdef USE_SRA out << " Comma-separated list of SRA accession numbers, e.g. --sra-acc SRR353653,SRR353654." << endl; #endif out << " File for SAM output (default: stdout)" << endl << endl << " , , can be comma-separated lists (no whitespace) and can be" << endl << " specified many times. E.g. '-U file1.fq,file2.fq -U file3.fq'." << endl // Wrapper script should write line next << endl << "Options (defaults in parentheses):" << endl << endl << " Input:" << endl << " -q query input files are FASTQ .fq/.fastq (default)" << endl << " --qseq query input files are in Illumina's qseq format" << endl << " -f query input files are (multi-)FASTA .fa/.mfa" << endl << " -r query input files are raw one-sequence-per-line" << endl << " -c , , are sequences themselves, not files" << endl << " -s/--skip skip the first reads/pairs in the input (none)" << endl << " -u/--upto stop after first reads/pairs (no limit)" << endl << " -5/--trim5 trim bases from 5'/left end of reads (0)" << endl << " -3/--trim3 trim bases from 3'/right end of reads (0)" << endl << " --phred33 qualities are Phred+33 (default)" << endl << " --phred64 qualities are Phred+64" << endl << " --int-quals qualities encoded as space-delimited integers" << endl #ifdef USE_SRA << " --sra-acc SRA accession ID" << endl #endif << endl << " Presets: Same as:" << endl // << " For --end-to-end:" << endl // << " --very-fast -D 5 -R 1 -N 0 -L 22 -i S,0,2.50" << endl // << " --fast -D 10 -R 2 -N 0 -L 22 -i S,0,2.50" << endl // << " --sensitive -D 15 -R 2 -N 0 -L 22 -i S,1,1.15 (default)" << endl // << " --very-sensitive -D 20 -R 3 -N 0 -L 20 -i S,1,0.50" << endl << " --fast --no-repeat-index" << endl << " --sensitive --bowtie2-dp 1 -k 30 --score-min L,0,-0.5" << endl << " --very-sensitive --bowtie2-dp 2 -k 50 --score-min L,0,-1" << endl << endl << " Alignment:" << endl //<< " -N max # mismatches in seed 
alignment; can be 0 or 1 (0)" << endl //<< " -L length of seed substrings; must be >3, <32 (22)" << endl //<< " -i interval between seed substrings w/r/t read len (S,1,1.15)" << endl << " --bowtie2-dp use Bowtie2's dynamic programming alignment algorithm (0) - 0: no dynamic programming, 1: conditional dynamic programming, and 2: unconditional dynamic programming (slowest)" << endl << " --n-ceil func for max # non-A/C/G/Ts permitted in aln (L,0,0.15)" << endl //<< " --dpad include extra ref chars on sides of DP table (15)" << endl //<< " --gbar disallow gaps within nucs of read extremes (4)" << endl << " --ignore-quals treat all quality values as 30 on Phred scale (off)" << endl << " --nofw do not align forward (original) version of read (off)" << endl << " --norc do not align reverse-complement version of read (off)" << endl << " --no-repeat-index do not use repeat index" << endl << endl << " 3N-Alignment:" << endl << " --base-change the converted nucleotide and converted to nucleotide (C,T)" << endl << " --directional-mapping make directional mapping, please use this option only if your reads are prepared with a strand specific library (off)" << endl << " --repeat-limit maximum number of repeat will be expanded for repeat alignment (1000)" << endl << " --unique-only only output the reads have unique alignment (off)" << endl << endl << " Spliced Alignment:" << endl << " --pen-cansplice penalty for a canonical splice site (0)" << endl << " --pen-noncansplice penalty for a non-canonical splice site (12)" << endl // << " --pen-conflictsplice penalty for conflicting splice sites (1000000)" << endl << " --pen-canintronlen penalty for long introns (G,-8,1) with canonical splice sites" << endl << " --pen-noncanintronlen penalty for long introns (G,-8,1) with noncanonical splice sites" << endl << " --min-intronlen minimum intron length (20)" << endl << " --max-intronlen maximum intron length (500000)" << endl << " --known-splicesite-infile provide a list of known splice 
sites" << endl << " --novel-splicesite-outfile report a list of splice sites" << endl << " --novel-splicesite-infile provide a list of novel splice sites" << endl << " --no-temp-splicesite disable the use of splice sites found" << endl << " --no-spliced-alignment disable spliced alignment" << endl << " --rna-strandness specify strand-specific information (unstranded)" << endl << " --tmo reports only those alignments within known transcriptome" << endl << " --dta reports alignments tailored for transcript assemblers" << endl << " --dta-cufflinks reports alignments tailored specifically for cufflinks" << endl << " --avoid-pseudogene tries to avoid aligning reads to pseudogenes (experimental option)" << endl << " --no-templatelen-adjustment disables template length adjustment for RNA-seq reads" << endl << endl << " Scoring:" << endl //<< " --ma match bonus (0 for --end-to-end, 2 for --local) " << endl << " --mp , max and min penalties for mismatch; lower qual = lower penalty <6,2>" << endl << " --sp , max and min penalties for soft-clipping; lower qual = lower penalty <2,1>" << endl << " --no-softclip no soft-clipping" << endl << " --np penalty for non-A/C/G/Ts in read/ref (1)" << endl << " --rdg , read gap open, extend penalties (5,3)" << endl << " --rfg , reference gap open, extend penalties (5,3)" << endl << " --score-min min acceptable alignment score w/r/t read length" << endl << " (L,0.0,-0.2)" << endl << endl << " Reporting:" << endl << " -k It searches for at most distinct, primary alignments for each read. Primary alignments mean " << endl << " alignments whose alignment score is equal to or higher than any other alignments. The search terminates " << endl << " when it cannot find more distinct valid alignments, or when it finds , whichever happens first. " << endl << " The alignment score for a paired-end alignment equals the sum of the alignment scores of " << endl << " the individual mates. 
Each reported read or pair alignment beyond the first has the SAM ‘secondary’ bit " << endl << " (which equals 256) set in its FLAGS field. For reads that have more than distinct, " << endl << " valid alignments, hisat2 does not guarantee that the alignments reported are the best possible " << endl << " in terms of alignment score. Default: 5 (linear index) or 10 (graph index)." << endl << " Note: HISAT2 is not designed with large values for -k in mind, and when aligning reads to long, " << endl << " repetitive genomes, large -k could make alignment much slower." << endl << " --max-seeds HISAT2, like other aligners, uses seed-and-extend approaches. HISAT2 tries to extend seeds to " << endl << " full-length alignments. In HISAT2, --max-seeds is used to control the maximum number of seeds that " << endl << " will be extended. For DNA-read alignment (--no-spliced-alignment), HISAT2 extends up to these many seeds" << endl << " and skips the rest of the seeds. For RNA-read alignment, HISAT2 skips extending seeds and reports " << endl << " no alignments if the number of seeds is larger than the number specified with the option, " << endl << " to be compatible with previous versions of HISAT2. Large values for --max-seeds may improve alignment " << endl << " sensitivity, but HISAT2 is not designed with large values for --max-seeds in mind, and when aligning " << endl << " reads to long, repetitive genomes, large --max-seeds could make alignment much slower. " << endl << " The default value is the maximum of 5 and the value that comes with -k times 2." << endl << " -a/--all HISAT2 reports all alignments it can find. Using the option is equivalent to using both --max-seeds " << endl << " and -k with the maximum value that a 64-bit signed integer can represent (9,223,372,036,854,775,807)." 
<< endl << " --repeat report alignments to repeat sequences directly" << endl << endl //<< " Effort:" << endl //<< " -D give up extending after failed extends in a row (15)" << endl //<< " -R for reads w/ repetitive seeds, try sets of seeds (2)" << endl //<< endl << " Paired-end:" << endl << " -I/--minins minimum fragment length (0), only valid with --no-spliced-alignment" << endl << " -X/--maxins maximum fragment length (500), only valid with --no-spliced-alignment" << endl << " --fr/--rf/--ff -1, -2 mates align fw/rev, rev/fw, fw/fw (--fr)" << endl << " --no-mixed suppress unpaired alignments for paired reads" << endl << " --no-discordant suppress discordant alignments for paired reads" << endl << endl << " Output:" << endl; //if(wrapper == "basic-0") { // out << " --bam output directly to BAM (by piping through 'samtools view')" << endl; //} out << " -t/--time print wall-clock time taken by search phases" << endl; if(wrapper == "basic-0") { out << " --un write unpaired reads that didn't align to " << endl << " --al write unpaired reads that aligned at least once to " << endl << " --un-conc write pairs that didn't align concordantly to " << endl << " --al-conc write pairs that aligned concordantly at least once to " << endl << " (Note: for --un, --al, --un-conc, or --al-conc, add '-gz' to the option name, e.g." << endl << " --un-gz , to gzip compress output, or add '-bz2' to bzip2 compress output.)" << endl; } out << " --summary-file print alignment summary to this file." << endl << " --new-summary print alignment summary in a new style, which is more machine-friendly." << endl << " --quiet print nothing to stderr except serious errors" << endl // << " --refidx refer to ref. 
seqs by 0-based index rather than name" << endl << " --met-file send metrics to file at (off)" << endl << " --met-stderr send metrics to stderr (off)" << endl << " --met report internal counters & metrics every secs (1)" << endl // Following is supported in the wrapper instead // << " --no-unal suppress SAM records for unaligned reads" << endl << " --no-head suppress header lines, i.e. lines starting with @" << endl << " --no-sq suppress @SQ header lines" << endl << " --rg-id set read group id, reflected in @RG line and RG:Z: opt field" << endl << " --rg add (\"lab:value\") to @RG line of SAM header." << endl << " Note: @RG line only printed when --rg-id is set." << endl << " --omit-sec-seq put '*' in SEQ and QUAL fields for secondary alignments." << endl << endl << " Performance:" << endl << " -o/--offrate override offrate of index; must be >= index's offrate" << endl << " -p/--threads number of alignment threads to launch (1)" << endl << " --reorder force SAM output order to match order of input reads" << endl #ifdef BOWTIE_MM << " --mm use memory-mapped I/O for index; many 'hisat2's can share" << endl #endif #ifdef BOWTIE_SHARED_MEM //<< " --shmem use shared mem for index; many 'hisat2's can share" << endl #endif << endl << " Other:" << endl << " --qc-filter filter out reads that are bad according to QSEQ filter" << endl << " --seed seed for random number generator (0)" << endl << " --non-deterministic seed rand. gen. arbitrarily instead of using read attributes" << endl << " --remove-chrname remove 'chr' from reference names in alignment" << endl << " --add-chrname add 'chr' to reference names in alignment " << endl // << " --verbose verbose output for debugging" << endl << " --version print version information and quit" << endl << " -h/--help print this usage message" << endl ; if(wrapper.empty()) { cerr << endl << "*** Warning ***" << endl << "'hisat2-align' was run directly. It is recommended that you run the wrapper script 'hisat2' instead." 
<< endl << endl; } } /** * Parse an int out of optarg and enforce that it be at least 'lower'; * if it is less than 'lower', than output the given error message and * exit with an error and a usage message. */ static int parseInt(int lower, int upper, const char *errmsg, const char *arg) { long l; char *endPtr= NULL; l = strtol(arg, &endPtr, 10); if (endPtr != NULL) { if (l < lower || l > upper) { cerr << errmsg << endl; printUsage(cerr); throw 1; } return (int32_t)l; } cerr << errmsg << endl; printUsage(cerr); throw 1; return -1; } /** * Upper is maximum int by default. */ static int parseInt(int lower, const char *errmsg, const char *arg) { return parseInt(lower, std::numeric_limits::max(), errmsg, arg); } /** * Parse a T string 'str'. */ template T parse(const char *s) { T tmp; stringstream ss(s); ss >> tmp; return tmp; } /** * Parse a pair of Ts from a string, 'str', delimited with 'delim'. */ template pair parsePair(const char *str, char delim) { string s(str); EList ss; tokenize(s, delim, ss); pair ret; ret.first = parse(ss[0].c_str()); ret.second = parse(ss[1].c_str()); return ret; } /** * Parse a pair of Ts from a string, 'str', delimited with 'delim'. */ template void parseTuple(const char *str, char delim, EList& ret) { string s(str); EList ss; tokenize(s, delim, ss); for(size_t i = 0; i < ss.size(); i++) { ret.push_back(parse(ss[i].c_str())); } } static string applyPreset(const string& sorig, Presets& presets) { string s = sorig; size_t found = s.find("%LOCAL%"); if(found != string::npos) { s.replace(found, strlen("%LOCAL%"), localAlign ? "-local" : ""); } if(gVerbose) { cerr << "Applying preset: '" << s.c_str() << "' using preset menu '" << presets.name() << "'" << endl; } string pol; presets.apply(s, pol, extra_opts); return pol; } static bool saw_M; static bool saw_a; static bool saw_k; static EList presetList; /** * TODO: Argument parsing is very, very flawed. 
The biggest problem is that * there are two separate worlds of arguments, the ones set via polstr, and * the ones set directly in variables. This makes for nasty interactions, * e.g., with the -M option being resolved at an awkward time relative to * the -k and -a options. */ static void parseOption(int next_option, const char *arg) { switch (next_option) { case ARG_TEST_25: bowtie2p5 = true; break; case ARG_DESC_KB: descentTotSz = SimpleFunc::parse(arg, 0.0, 1024.0, 1024.0, DMAX); break; case ARG_DESC_FMOPS: descentTotFmops = SimpleFunc::parse(arg, 0.0, 10.0, 100.0, DMAX); break; case ARG_DESC_LANDING: descentLanding = parse(arg); break; case ARG_DESC_EXP: { descConsExp = parse(arg); if(descConsExp < 0.0) { cerr << "Error: --desc-exp must be greater than or equal to 0" << endl; throw 1; } break; } case '1': tokenize(arg, ",", mates1); break; case '2': tokenize(arg, ",", mates2); break; case ARG_ONETWO: tokenize(arg, ",", mates12); format = TAB_MATE5; break; case ARG_TAB5: tokenize(arg, ",", mates12); format = TAB_MATE5; break; case ARG_TAB6: tokenize(arg, ",", mates12); format = TAB_MATE6; break; case 'f': format = FASTA; break; case 'F': { format = FASTA_CONT; pair p = parsePair(arg, ','); fastaContLen = p.first; fastaContFreq = p.second; break; } case ARG_BWA_SW_LIKE: { bwaSwLikeC = 5.5f; bwaSwLikeT = 30; bwaSwLike = true; localAlign = true; // -a INT Score of a match [1] // -b INT Mismatch penalty [3] // -q INT Gap open penalty [5] // -r INT Gap extension penalty. The penalty for a contiguous // gap of size k is q+k*r. [2] polstr += ";MA=1;MMP=C3;RDG=5,2;RFG=5,2"; break; } case 'q': format = FASTQ; break; case 'r': format = RAW; break; case 'c': format = CMDLINE; break; case ARG_QSEQ: format = QSEQ; break; case 'C': { cerr << "Error: -C specified but Bowtie 2 does not support colorspace input." 
<< endl; throw 1; break; } case 'I': gMinInsert = parseInt(0, "-I arg must be positive", arg); break; case 'X': gMaxInsert = parseInt(1, "-X arg must be at least 1", arg); break; case ARG_NO_DISCORDANT: gReportDiscordant = false; break; case ARG_NO_MIXED: gReportMixed = false; break; case 's': skipReads = (uint32_t)parseInt(0, "-s arg must be positive", arg); break; case ARG_FF: gMate1fw = true; gMate2fw = true; break; case ARG_RF: gMate1fw = false; gMate2fw = true; break; case ARG_FR: gMate1fw = true; gMate2fw = false; break; case ARG_SHMEM: useShmem = true; break; case ARG_SEED_SUMM: seedSumm = true; break; case ARG_MM: { #ifdef BOWTIE_MM useMm = true; break; #else cerr << "Memory-mapped I/O mode is disabled because bowtie was not compiled with" << endl << "BOWTIE_MM defined. Memory-mapped I/O is not supported under Windows. If you" << endl << "would like to use memory-mapped I/O on a platform that supports it, please" << endl << "refrain from specifying BOWTIE_MM=0 when compiling Bowtie." 
<< endl; throw 1; #endif } case ARG_MMSWEEP: mmSweep = true; break; case ARG_HADOOPOUT: hadoopOut = true; break; case ARG_SOLEXA_QUALS: solexaQuals = true; break; case ARG_INTEGER_QUALS: integerQuals = true; break; case ARG_PHRED64: phred64Quals = true; break; case ARG_PHRED33: solexaQuals = false; phred64Quals = false; break; case ARG_OVERHANG: gReportOverhangs = true; break; case ARG_NO_CACHE: msNoCache = true; break; case ARG_USE_CACHE: msNoCache = false; break; case ARG_LOCAL_SEED_CACHE_SZ: seedCacheLocalMB = (uint32_t)parseInt(1, "--local-seed-cache-sz arg must be at least 1", arg); break; case ARG_CURRENT_SEED_CACHE_SZ: seedCacheCurrentMB = (uint32_t)parseInt(1, "--seed-cache-sz arg must be at least 1", arg); break; case ARG_REFIDX: noRefNames = true; break; case ARG_FUZZY: fuzzy = true; break; case ARG_FULLREF: fullRef = true; break; case ARG_GAP_BAR: gGapBarrier = parseInt(1, "--gbar must be no less than 1", arg); break; case ARG_SEED: seed = parseInt(0, "--seed arg must be at least 0", arg); break; case ARG_NON_DETERMINISTIC: arbitraryRandom = true; break; case 'u': qUpto = (uint32_t)parseInt(1, "-u/--qupto arg must be at least 1", arg); break; case 'Q': tokenize(arg, ",", qualities); integerQuals = true; break; case ARG_QUALS1: tokenize(arg, ",", qualities1); integerQuals = true; break; case ARG_QUALS2: tokenize(arg, ",", qualities2); integerQuals = true; break; case ARG_CACHE_LIM: cacheLimit = (uint32_t)parseInt(1, "--cachelim arg must be at least 1", arg); break; case ARG_CACHE_SZ: cacheSize = (uint32_t)parseInt(1, "--cachesz arg must be at least 1", arg); cacheSize *= (1024 * 1024); // convert from MB to B break; case ARG_WRAPPER: wrapper = arg; break; case 'p': nthreads = parseInt(1, "-p/--threads arg must be at least 1", arg); break; case ARG_FILEPAR: fileParallel = true; break; case '3': gTrim3 = parseInt(0, "-3/--trim3 arg must be at least 0", arg); break; case '5': gTrim5 = parseInt(0, "-5/--trim5 arg must be at least 0", arg); break; case 'h': 
printUsage(cout); throw 0; break; case ARG_USAGE: printUsage(cout); throw 0; break; // // NOTE that unlike in Bowtie 1, -M, -a and -k are mutually // exclusive here. // case 'M': { msample = true; mhits = parse(arg); if(saw_a || saw_k) { cerr << "Warning: -M, -k and -a are mutually exclusive. " << "-M will override" << endl; khits = 1; } assert_eq(1, khits); saw_M = true; cerr << "Warning: -M is deprecated. Use -D and -R to adjust " << "effort instead." << endl; break; } case ARG_EXTEND_ITERS: { maxIters = parse(arg); break; } case ARG_NO_EXTEND: { doExtend = false; break; } case 'R': { polstr += ";ROUNDS="; polstr += arg; break; } case 'D': { polstr += ";DPS="; polstr += arg; break; } case ARG_DP_MATE_STREAK_THRESH: { maxMateStreak = parse(arg); break; } case ARG_DP_FAIL_STREAK_THRESH: { maxDpStreak = parse(arg); break; } case ARG_EE_FAIL_STREAK_THRESH: { maxEeStreak = parse(arg); break; } case ARG_UG_FAIL_STREAK_THRESH: { maxUgStreak = parse(arg); break; } case ARG_DP_FAIL_THRESH: { maxDp = parse(arg); break; } case ARG_UG_FAIL_THRESH: { maxUg = parse(arg); break; } case ARG_MAX_SEEDS: { maxSeeds = parse(arg); break; } case ARG_SEED_BOOST_THRESH: { seedBoostThresh = parse(arg); break; } case 'a': { msample = false; allHits = true; mhits = 0; // disable -M if(saw_M || saw_k) { cerr << "Warning: -M, -k and -a are mutually exclusive. " << "-a will override" << endl; } saw_a = true; break; } case 'k': { msample = false; khits = (uint32_t)parseInt(1, "-k arg must be at least 1", arg); mhits = 0; // disable -M if(saw_M || saw_a) { cerr << "Warning: -M, -k and -a are mutually exclusive. 
" << "-k will override" << endl; } saw_k = true; break; } case ARG_VERBOSE: gVerbose = 1; break; case ARG_STARTVERBOSE: startVerbose = true; break; case ARG_QUIET: gQuiet = true; break; case ARG_SANITY: sanityCheck = true; break; case 't': timing = true; break; case ARG_METRIC_IVAL: { metricsIval = parseInt(1, "--metrics arg must be at least 1", arg); break; } case ARG_METRIC_FILE: metricsFile = arg; break; case ARG_METRIC_STDERR: metricsStderr = true; break; case ARG_METRIC_PER_READ: metricsPerRead = true; break; case ARG_NO_FW: gNofw = true; break; case ARG_NO_RC: gNorc = true; break; case ARG_SAM_NO_QNAME_TRUNC: samTruncQname = false; break; case ARG_SAM_OMIT_SEC_SEQ: samOmitSecSeqQual = true; break; case ARG_SAM_NO_UNAL: samNoUnal = true; break; case ARG_SAM_NOHEAD: samNoHead = true; break; case ARG_SAM_NOSQ: samNoSQ = true; break; case ARG_SAM_PRINT_YI: sam_print_yi = true; break; case ARG_REORDER: reorder = true; break; case ARG_MAPQ_EX: { sam_print_zp = true; sam_print_zu = true; sam_print_xp = true; sam_print_xss = true; sam_print_yn = true; break; } case ARG_SHOW_RAND_SEED: { sam_print_zs = true; break; } case ARG_SAMPLE: sampleFrac = parse(arg); break; case ARG_CP_MIN: cminlen = parse(arg); break; case ARG_CP_IVAL: cpow2 = parse(arg); break; case ARG_TRI: doTri = true; break; case ARG_READ_PASSTHRU: { sam_print_xr = true; break; } case ARG_READ_TIMES: { sam_print_xt = true; sam_print_xd = true; sam_print_xu = true; sam_print_yl = true; sam_print_ye = true; sam_print_yu = true; sam_print_yr = true; sam_print_zb = true; sam_print_zr = true; sam_print_zf = true; sam_print_zm = true; sam_print_zi = true; break; } case ARG_SAM_RG: { string argstr = arg; if(argstr.substr(0, 3) == "ID:") { rgid = "\t"; rgid += argstr; rgs_optflag = "RG:Z:" + argstr.substr(3); } else { rgs += '\t'; rgs += argstr; } break; } case ARG_SAM_RGID: { string argstr = arg; rgid = "\t"; rgid = "\tID:" + argstr; rgs_optflag = "RG:Z:" + argstr; break; } case ARG_PARTITION: partitionSz = 
parse(arg); break; case ARG_DPAD: maxhalf = parseInt(0, "--dpad must be no less than 0", arg); break; case ARG_ORIG: if(arg == NULL || strlen(arg) == 0) { cerr << "--orig arg must be followed by a string" << endl; printUsage(cerr); throw 1; } origString = arg; break; case ARG_LOCAL: localAlign = true; break; case ARG_END_TO_END: localAlign = false; break; case ARG_SSE8: enable8 = true; break; case ARG_SSE8_NO: enable8 = false; break; case ARG_UNGAPPED: doUngapped = true; break; case ARG_UNGAPPED_NO: doUngapped = false; break; // case ARG_NO_DOVETAIL: gDovetailMatesOK = false; break; // case ARG_NO_CONTAIN: gContainMatesOK = false; break; // case ARG_NO_OVERLAP: gOlapMatesOK = false; break; // case ARG_DOVETAIL: gDovetailMatesOK = true; break; // case ARG_CONTAIN: gContainMatesOK = true; break; // case ARG_OVERLAP: gOlapMatesOK = true; break; case ARG_QC_FILTER: qcFilter = true; break; case ARG_NO_SCORE_PRIORITY: sortByScore = false; break; case ARG_IGNORE_QUALS: ignoreQuals = true; break; case ARG_MAPQ_V: mapqv = parse(arg); break; case ARG_TIGHTEN: tighten = parse(arg); break; case ARG_EXACT_UPFRONT: doExactUpFront = true; break; case ARG_1MM_UPFRONT: do1mmUpFront = true; break; case ARG_EXACT_UPFRONT_NO: doExactUpFront = false; break; case ARG_1MM_UPFRONT_NO: do1mmUpFront = false; break; case ARG_1MM_MINLEN: do1mmMinLen = parse(arg); break; case ARG_NOISY_HPOLY: noisyHpolymer = true; break; case 'x' : bt2indexs[0] = arg; break; case ARG_PRESET_VERY_FAST_LOCAL: localAlign = true; case ARG_PRESET_VERY_FAST: { presetList.push_back("very-fast%LOCAL%"); break; } case ARG_PRESET_FAST_LOCAL: localAlign = true; case ARG_PRESET_FAST: { fast = true; presetList.push_back("fast%LOCAL%"); break; } case ARG_PRESET_SENSITIVE_LOCAL: localAlign = true; case ARG_PRESET_SENSITIVE: { sensitive = true; presetList.push_back("sensitive%LOCAL%"); break; } case ARG_PRESET_VERY_SENSITIVE_LOCAL: localAlign = true; case ARG_PRESET_VERY_SENSITIVE: { very_sensitive = true; 
presetList.push_back("very-sensitive%LOCAL%"); break; } case 'P': { presetList.push_back(arg); break; } case ARG_ALIGN_POLICY: { if(strlen(arg) > 0) { polstr += ";"; polstr += arg; } break; } case 'N': { polstr += ";SEED="; polstr += arg; break; } case 'L': { int64_t len = parse(arg); if(len < 0) { cerr << "Error: -L argument must be >= 0; was " << arg << endl; throw 1; } if(len > 32) { cerr << "Error: -L argument must be <= 32; was" << arg << endl; throw 1; } polstr += ";SEEDLEN="; polstr += arg; break; } case 'O': multiseedOff = parse(arg); break; case 'i': { EList args; tokenize(arg, ",", args); if(args.size() > 3 || args.size() == 0) { cerr << "Error: expected 3 or fewer comma-separated " << "arguments to -i option, got " << args.size() << endl; throw 1; } // Interval-settings arguments polstr += (";IVAL=" + args[0]); // Function type if(args.size() > 1) { polstr += ("," + args[1]); // Constant term } if(args.size() > 2) { polstr += ("," + args[2]); // Coefficient } break; } case ARG_MULTISEED_IVAL: { polstr += ";"; // Split argument by comma EList args; tokenize(arg, ",", args); if(args.size() > 5 || args.size() == 0) { cerr << "Error: expected 5 or fewer comma-separated " << "arguments to --multiseed option, got " << args.size() << endl; throw 1; } // Seed mm and length arguments polstr += "SEED="; polstr += (args[0]); // # mismatches if(args.size() > 1) polstr += ("," + args[ 1]); // length if(args.size() > 2) polstr += (";IVAL=" + args[2]); // Func type if(args.size() > 3) polstr += ("," + args[ 3]); // Constant term if(args.size() > 4) polstr += ("," + args[ 4]); // Coefficient break; } case ARG_N_CEIL: { // Split argument by comma EList args; tokenize(arg, ",", args); if(args.size() > 3) { cerr << "Error: expected 3 or fewer comma-separated " << "arguments to --n-ceil option, got " << args.size() << endl; throw 1; } if(args.size() == 0) { cerr << "Error: expected at least one argument to --n-ceil option" << endl; throw 1; } polstr += ";NCEIL="; 
if(args.size() == 3) { polstr += (args[0] + "," + args[1] + "," + args[2]); } else { if(args.size() == 1) { polstr += ("C," + args[0]); } else { polstr += (args[0] + "," + args[1]); } } break; } case ARG_SCORE_MA: polstr += ";MA="; polstr += arg; break; case ARG_SCORE_MMP: { EList args; tokenize(arg, ",", args); if(args.size() > 2 || args.size() == 0) { cerr << "Error: expected 1 or 2 comma-separated " << "arguments to --mp option, got " << args.size() << endl; throw 1; } if(args.size() >= 1) { polstr += ";MMP=Q,"; polstr += args[0]; if(args.size() >= 2) { polstr += ","; polstr += args[1]; } } break; } case ARG_SCORE_SCP: { EList args; tokenize(arg, ",", args); if(args.size() > 2 || args.size() == 0) { cerr << "Error: expected 1 or 2 comma-separated " << "arguments to --sp option, got " << args.size() << endl; throw 1; } if(args.size() >= 1) { polstr += ";SCP=Q,"; polstr += args[0]; if(args.size() >= 2) { polstr += ","; polstr += args[1]; } } break; } case ARG_NO_SOFTCLIP: { ostringstream convert; convert << std::numeric_limits::max(); polstr += ";SCP=Q,"; polstr += convert.str(); polstr += ","; polstr += convert.str(); break; } case ARG_SCORE_NP: polstr += ";NP=C"; polstr += arg; break; case ARG_SCORE_RDG: polstr += ";RDG="; polstr += arg; break; case ARG_SCORE_RFG: polstr += ";RFG="; polstr += arg; break; case ARG_SCORE_MIN: { polstr += ";"; EList args; tokenize(arg, ",", args); if(args.size() > 3 && args.size() == 0) { cerr << "Error: expected 3 or fewer comma-separated " << "arguments to --n-ceil option, got " << args.size() << endl; throw 1; } polstr += ("MIN=" + args[0]); if(args.size() > 1) { polstr += ("," + args[1]); } if(args.size() > 2) { polstr += ("," + args[2]); } break; } case ARG_DESC: printArgDesc(cout); throw 0; case 'S': outfile = arg; break; case 'U': { EList args; tokenize(arg, ",", args); for(size_t i = 0; i < args.size(); i++) { queries.push_back(args[i]); } break; } case ARG_VERSION: showVersion = 1; break; case ARG_NO_TEMPSPLICESITE: 
useTempSpliceSite = false; break; case ARG_PEN_CANSPLICE: { penCanSplice = parseInt(0, "--pen-cansplice arg must be at least 0", arg); break; } case ARG_PEN_NONCANSPLICE: { penNoncanSplice = parseInt(0, "--pen-noncansplice arg must be at least 0", arg); break; } case ARG_PEN_CONFLICTSPLICE: { penConflictSplice = parseInt(0, "--pen-conflictsplice arg must be at least 0", arg); break; } case ARG_PEN_CANINTRONLEN: { polstr += ";"; EList args; tokenize(arg, ",", args); if(args.size() > 3 && args.size() == 0) { cerr << "Error: expected 3 or fewer comma-separated " << "arguments to --n-ceil option, got " << args.size() << endl; throw 1; } polstr += ("CANINTRONLEN=" + args[0]); if(args.size() > 1) { polstr += ("," + args[1]); } if(args.size() > 2) { polstr += ("," + args[2]); } break; } case ARG_PEN_NONCANINTRONLEN: { polstr += ";"; EList args; tokenize(arg, ",", args); if(args.size() > 3 && args.size() == 0) { cerr << "Error: expected 3 or fewer comma-separated " << "arguments to --n-ceil option, got " << args.size() << endl; throw 1; } polstr += ("NONCANINTRONLEN=" + args[0]); if(args.size() > 1) { polstr += ("," + args[1]); } if(args.size() > 2) { polstr += ("," + args[2]); } break; } case ARG_MIN_INTRONLEN: { minIntronLen = parseInt(20, "--min-intronlen arg must be at least 20", arg); break; } case ARG_MAX_INTRONLEN: { maxIntronLen = parseInt(20, "--max-intronlen arg must be at least 20", arg); break; } case ARG_KNOWN_SPLICESITE_INFILE: knownSpliceSiteInfile = arg; break; case ARG_NOVEL_SPLICESITE_INFILE: novelSpliceSiteInfile = arg; break; case ARG_NOVEL_SPLICESITE_OUTFILE: novelSpliceSiteOutfile = arg; break; case ARG_SECONDARY: secondary = true; break; case ARG_NO_SPLICED_ALIGNMENT: no_spliced_alignment = true; break; case ARG_RNA_STRANDNESS: { string strandness = arg; if(strandness == "F") rna_strandness = RNA_STRANDNESS_F; else if(strandness == "R") rna_strandness = RNA_STRANDNESS_R; else if(strandness == "FR") rna_strandness = RNA_STRANDNESS_FR; else 
if(strandness == "RF") rna_strandness = RNA_STRANDNESS_RF; else { cerr << "Error: should be one of F, R, FR, or RF " << endl; throw 1; } break; } case ARG_SPLICESITE_DB_ONLY: { splicesite_db_only = true; break; } case ARG_NO_ANCHORSTOP: { anchorStop = false; break; } case ARG_TRANSCRIPTOME_MAPPING_ONLY: { tranMapOnly = true; break; } case ARG_TRANSCRIPTOME_ASSEMBLY: { tranAssm = true; break; } case ARG_TRANSCRIPTOME_ASSEMBLY_CUFFLINKS: { tranAssm = true; tranAssm_program = "cufflinks"; break; } case ARG_AVOID_PSEUDOGENE: { avoid_pseudogene = true; break; } #ifdef USE_SRA case ARG_SRA_ACC: { tokenize(arg, ",", sra_accs); format = SRA_FASTA; break; } #endif case ARG_REMOVE_CHRNAME: { rmChrName = true; break; } case ARG_ADD_CHRNAME: { addChrName = true; break; } case ARG_MAX_ALTSTRIED: { max_alts_tried = parseInt(8, "--max-altstried arg must be at least 8", arg); break; } case ARG_HAPLOTYPE: { use_haplotype = true; break; } case ARG_CODIS: { enable_codis = true; break; } case ARG_NO_TEMPLATELEN_ADJUSTMENT: { templateLenAdjustment = false; break; } case ARG_SUMMARY_FILE: { alignSumFile = arg; break; } case ARG_NEW_SUMMARY: { newAlignSummary = true; break; } case ARG_DP: { bowtie2_dp = parseInt(0, "--bowtie2-dp arg must be 0, 1, or 2", arg); break; } case ARG_REPEAT: { repeat = true; break; } case ARG_NO_REPEAT_INDEX: { use_repeat_index = false; break; } case ARG_READ_LENGTHS: { EList str_readLens; tokenize(arg, ",", str_readLens); for(size_t i = 0; i < str_readLens.size(); i++) { int readLen = parseInt(0, "--read-lengths arg must be at least 0", str_readLens[i].c_str()); readLens.push_back(readLen); } readLens.sort(); break; } case ARG_BASE_CHANGE: { // Split argument by comma EList args; tokenize(arg, ",", args); if(args.size() != 2) { cerr << "Error: expected 2 comma-separated " << "arguments to --base-change option, got " << args.size() << endl; throw 1; } base_change_entered = true; usrInput_convertedFrom = toupper(args[0][0]); usrInput_convertedTo = 
toupper(args[1][0]); string s = "ACGT"; if ((s.find(usrInput_convertedFrom) == std::string::npos) || (s.find(usrInput_convertedTo) == std::string::npos)) { cerr << "Please enter the nucleotide in 'ACGT' for --base-change option." << endl; throw 1; } if (usrInput_convertedFrom == usrInput_convertedTo) { cerr << "Please enter two different base for --base-change option. If you wish to align normal reads without nucleotide conversion, please use hisat2." << endl; throw 1; } break; } case ARG_3N: { threeN = true; break; } case ARG_REPEAT_LIMIT: { repeatLimit = parseInt(1, "--repeat-limit arg must be at least 1", arg);; break; } case ARG_UNIQUE_ONLY: { uniqueOutputOnly = true; break; } case ARG_DIRECTIONAL: { directional3NMapping = 1; break; } case ARG_DIRECTIONAL_REVERSE: { directional3NMapping = 2; break; } default: printUsage(cerr); throw 1; } } /** * Read command-line arguments */ static void parseOptions(int argc, const char **argv) { int option_index = 0; int next_option; saw_M = false; saw_a = false; saw_k = false; presetList.clear(); if(startVerbose) { cerr << "Parsing options: "; logTime(cerr, true); } while(true) { next_option = getopt_long( argc, const_cast(argv), short_options, long_options, &option_index); const char * arg = optarg; if(next_option == EOF) { if(extra_opts_cur < extra_opts.size()) { next_option = extra_opts[extra_opts_cur].first; arg = extra_opts[extra_opts_cur].second.c_str(); extra_opts_cur++; } else { break; } } parseOption(next_option, arg); } // Now parse all the presets. Might want to pick which presets version to // use according to other parameters. 
auto_ptr presets(new PresetsV0()); // Apply default preset if(!defaultPreset.empty()) { polstr = applyPreset(defaultPreset, *presets.get()) + polstr; } // Apply specified presets for(size_t i = 0; i < presetList.size(); i++) { polstr += applyPreset(presetList[i], *presets.get()); } for(size_t i = 0; i < extra_opts.size(); i++) { next_option = extra_opts[extra_opts_cur].first; const char *arg = extra_opts[extra_opts_cur].second.c_str(); parseOption(next_option, arg); } if (showVersion) { return; } // Remove initial semicolons while(!polstr.empty() && polstr[0] == ';') { polstr = polstr.substr(1); } if(gVerbose) { cerr << "Final policy string: '" << polstr.c_str() << "'" << endl; } if (threeN && !base_change_entered) { cerr << "--base-change must be set for HISAT-3N" << endl; printUsage(cerr); throw 1; } if (!threeN && base_change_entered) { cerr << "Please do not use --base-change for HISAT2. To align nucleotide conversion reads, please use HISAT-3N" << endl; printUsage(cerr); throw 1; } if (threeN) { usrInput_convertedFromComplement = asc2dnacomp[usrInput_convertedFrom]; usrInput_convertedToComplement = asc2dnacomp[usrInput_convertedTo]; getConversion(usrInput_convertedFrom, usrInput_convertedTo, hs3N_convertedFrom, hs3N_convertedTo); hs3N_convertedFromComplement = asc2dnacomp[hs3N_convertedFrom]; hs3N_convertedToComplement = asc2dnacomp[hs3N_convertedTo]; asc2dna_3N[0][hs3N_convertedFrom] = asc2dna[hs3N_convertedTo]; asc2dna_3N[0][tolower(hs3N_convertedFrom)] = asc2dna[hs3N_convertedTo]; asc2dna_3N[1][hs3N_convertedFromComplement] = asc2dna[hs3N_convertedToComplement]; asc2dna_3N[1][tolower(hs3N_convertedFromComplement)] = asc2dna[hs3N_convertedToComplement]; threeN_indexTags[0] += hs3N_convertedFrom; threeN_indexTags[0] += hs3N_convertedTo; threeN_indexTags[1] += hs3N_convertedFromComplement; threeN_indexTags[1] += hs3N_convertedToComplement; nMappingCycle = 4; if (hs3N_convertedFrom == hs3N_convertedToComplement || directional3NMapping == 1) { mappingCycles[0] = 
true; mappingCycles[1] = true; } else if (directional3NMapping == 2) { mappingCycles[2] = true; mappingCycles[3] = true; } else { for (int i = 0; i < 4; i++){ mappingCycles[i] = true; } } } else { nMappingCycle = 1; mappingCycles[0] = true; } size_t failStreakTmp = 0; SeedAlignmentPolicy::parseString( polstr, localAlign, noisyHpolymer, ignoreQuals, bonusMatchType, bonusMatch, penMmcType, penMmcMax, penMmcMin, penScMax, penScMin, penNType, penN, penRdGapConst, penRfGapConst, penRdGapLinear, penRfGapLinear, scoreMin, nCeil, penNCatPair, multiseedMms, multiseedLen, msIval, failStreakTmp, nSeedRounds, &penCanIntronLen, &penNoncanIntronLen); if(failStreakTmp > 0) { maxEeStreak = failStreakTmp; maxUgStreak = failStreakTmp; maxDpStreak = failStreakTmp; } if(saw_a || saw_k || true) { msample = false; mhits = 0; } else { assert_gt(mhits, 0); msample = true; } if(fast) { use_repeat_index = false; } else if(sensitive) { if(bowtie2_dp == 0) { bowtie2_dp = 1; } if(khits < 10) { khits = 10; saw_k = true; } scoreMin.init(SIMPLE_FUNC_LINEAR, 0.0f, -0.5f); } else if(very_sensitive) { bowtie2_dp = 2; if(khits < 30) { khits = 30; saw_k = true; } scoreMin.init(SIMPLE_FUNC_LINEAR, 0.0f, -1.0f); } if(mates1.size() != mates2.size()) { cerr << "Error: " << mates1.size() << " mate files/sequences were specified with -1, but " << mates2.size() << endl << "mate files/sequences were specified with -2. The same number of mate files/" << endl << "sequences must be specified with -1 and -2." << endl; throw 1; } if(qualities.size() && format != FASTA) { cerr << "Error: one or more quality files were specified with -Q but -f was not" << endl << "enabled. -Q works only in combination with -f and -C." << endl; throw 1; } if(qualities1.size() && format != FASTA) { cerr << "Error: one or more quality files were specified with --Q1 but -f was not" << endl << "enabled. --Q1 works only in combination with -f and -C." 
<< endl; throw 1; } if(qualities2.size() && format != FASTA) { cerr << "Error: one or more quality files were specified with --Q2 but -f was not" << endl << "enabled. --Q2 works only in combination with -f and -C." << endl; throw 1; } if(qualities1.size() > 0 && mates1.size() != qualities1.size()) { cerr << "Error: " << mates1.size() << " mate files/sequences were specified with -1, but " << qualities1.size() << endl << "quality files were specified with --Q1. The same number of mate and quality" << endl << "files must sequences must be specified with -1 and --Q1." << endl; throw 1; } if(qualities2.size() > 0 && mates2.size() != qualities2.size()) { cerr << "Error: " << mates2.size() << " mate files/sequences were specified with -2, but " << qualities2.size() << endl << "quality files were specified with --Q2. The same number of mate and quality" << endl << "files must sequences must be specified with -2 and --Q2." << endl; throw 1; } if(!rgs.empty() && rgid.empty()) { cerr << "Warning: --rg was specified without --rg-id also " << "being specified. @RG line is not printed unless --rg-id " << "is specified." << endl; } // Check for duplicate mate input files if(format != CMDLINE) { for(size_t i = 0; i < mates1.size(); i++) { for(size_t j = 0; j < mates2.size(); j++) { if(mates1[i] == mates2[j] && !gQuiet) { cerr << "Warning: Same mate file \"" << mates1[i].c_str() << "\" appears as argument to both -1 and -2" << endl; } } } } // If both -s and -u are used, we need to adjust qUpto accordingly // since it uses rdid to know if we've reached the -u limit (and // rdids are all shifted up by skipReads characters) if(qUpto + skipReads > qUpto) { qUpto += skipReads; } if(useShmem && useMm && !gQuiet) { cerr << "Warning: --shmem overrides --mm..." 
<< endl; useMm = false; } if(gGapBarrier < 1) { cerr << "Warning: --gbar was set less than 1 (=" << gGapBarrier << "); setting to 1 instead" << endl; gGapBarrier = 1; } if(multiseedMms >= multiseedLen) { assert_gt(multiseedLen, 0); cerr << "Warning: seed mismatches (" << multiseedMms << ") is less than seed length (" << multiseedLen << "); setting mismatches to " << (multiseedMms-1) << " instead" << endl; multiseedMms = multiseedLen-1; } sam_print_zm = sam_print_zm && bowtie2p5; #ifndef NDEBUG if(!gQuiet) { cerr << "Warning: Running in debug mode. Please use debug mode only " << "for diagnosing errors, and not for typical use of HISAT2." << endl; } #endif } static const char *argv0 = NULL; /// Create a PatternSourcePerThread for the current thread according /// to the global params and return a pointer to it static PatternSourcePerThreadFactory* createPatsrcFactory(PairedPatternSource& _patsrc, int tid) { PatternSourcePerThreadFactory *patsrcFact; patsrcFact = new WrappedPatternSourcePerThreadFactory(_patsrc); assert(patsrcFact != NULL); return patsrcFact; } #define PTHREAD_ATTRS (PTHREAD_CREATE_JOINABLE | PTHREAD_CREATE_DETACHED) typedef TIndexOffU index_t; typedef uint16_t local_index_t; static PairedPatternSource* multiseed_patsrc; static HGFM* multiseed_gfm; static RFM* multiseed_rgfm; //static HGFM* multiseed_gfms[2]; //static RFM* multiseed_rgfms[2]; static Scoring* multiseed_sc; static BitPairReference* multiseed_refs; static BitPairReference* multiseed_rrefs; //static BitPairReference* multiseed_refss[2]; //static BitPairReference* multiseed_rrefss[2]; static AlnSink* multiseed_msink; static OutFileBuf* multiseed_metricsOfb; static SpliceSiteDB* ssdb; static ALTDB* altdb; static RepeatDB* repeatdb; static ALTDB* raltdb; static ALTDB *altdbs_3N[2]; static RepeatDB *repeatdbs_3N[2]; static ALTDB *raltdbs_3N[2]; static TranscriptomePolicy* multiseed_tpol; static GraphPolicy* gpol; class reference3N { public: const HGFM* multiseed_gfm[2]; const RFM* 
multiseed_rgfm[2]; const BitPairReference* multiseed_rrefs[2]; reference3N() { } void load(EList* >& gfms_3N, RFM* rgfms_3N[2], BitPairReference* rrefss[2]) { for (int i = 0; i < 2; i++) { multiseed_gfm[i] = gfms_3N[i]; multiseed_rgfm[i] = rgfms_3N[i]; multiseed_rrefs[i] = rrefss[i]; } } }; reference3N ref3N; /** * Metrics for measuring the work done by the outer read alignment * loop. */ struct OuterLoopMetrics { OuterLoopMetrics() { reset(); } /** * Set all counters to 0. */ void reset() { reads = bases = srreads = srbases = freads = fbases = ureads = ubases = 0; } /** * Sum the counters in m in with the conters in this object. This * is the only safe way to update an OuterLoopMetrics that's shared * by multiple threads. */ void merge( const OuterLoopMetrics& m, bool getLock = false) { ThreadSafe ts(&mutex_m, getLock); reads += m.reads; bases += m.bases; srreads += m.srreads; srbases += m.srbases; freads += m.freads; fbases += m.fbases; ureads += m.ureads; ubases += m.ubases; } uint64_t reads; // total reads uint64_t bases; // total bases uint64_t srreads; // same-read reads uint64_t srbases; // same-read bases uint64_t freads; // filtered reads uint64_t fbases; // filtered bases uint64_t ureads; // unfiltered reads uint64_t ubases; // unfiltered bases MUTEX_T mutex_m; }; /** * Collection of all relevant performance metrics when aligning in * multiseed mode. */ struct PerfMetrics { PerfMetrics() : first(true) { reset(); } /** * Set all counters to 0. 
*/ void reset() { olm.reset(); sdm.reset(); wlm.reset(); swmSeed.reset(); swmMate.reset(); rpm.reset(); dpSse8Seed.reset(); // 8-bit SSE seed extensions dpSse8Mate.reset(); // 8-bit SSE mate finds dpSse16Seed.reset(); // 16-bit SSE seed extensions dpSse16Mate.reset(); // 16-bit SSE mate finds nbtfiltst = 0; nbtfiltsc = 0; nbtfiltdo = 0; olmu.reset(); sdmu.reset(); wlmu.reset(); swmuSeed.reset(); swmuMate.reset(); rpmu.reset(); dpSse8uSeed.reset(); // 8-bit SSE seed extensions dpSse8uMate.reset(); // 8-bit SSE mate finds dpSse16uSeed.reset(); // 16-bit SSE seed extensions dpSse16uMate.reset(); // 16-bit SSE mate finds nbtfiltst_u = 0; nbtfiltsc_u = 0; nbtfiltdo_u = 0; him.reset(); } /** * Merge a set of specific metrics into this object. */ void merge( const OuterLoopMetrics *ol, const SeedSearchMetrics *sd, const WalkMetrics *wl, const SwMetrics *swSeed, const SwMetrics *swMate, const ReportingMetrics *rm, const SSEMetrics *dpSse8Ex, const SSEMetrics *dpSse8Ma, const SSEMetrics *dpSse16Ex, const SSEMetrics *dpSse16Ma, uint64_t nbtfiltst_, uint64_t nbtfiltsc_, uint64_t nbtfiltdo_, const HIMetrics *hi, bool getLock) { ThreadSafe ts(&mutex_m, getLock); if(ol != NULL) { olmu.merge(*ol, false); } if(sd != NULL) { sdmu.merge(*sd, false); } if(wl != NULL) { wlmu.merge(*wl, false); } if(swSeed != NULL) { swmuSeed.merge(*swSeed, false); } if(swMate != NULL) { swmuMate.merge(*swMate, false); } if(rm != NULL) { rpmu.merge(*rm, false); } if(dpSse8Ex != NULL) { dpSse8uSeed.merge(*dpSse8Ex, false); } if(dpSse8Ma != NULL) { dpSse8uMate.merge(*dpSse8Ma, false); } if(dpSse16Ex != NULL) { dpSse16uSeed.merge(*dpSse16Ex, false); } if(dpSse16Ma != NULL) { dpSse16uMate.merge(*dpSse16Ma, false); } nbtfiltst_u += nbtfiltst_; nbtfiltsc_u += nbtfiltsc_; nbtfiltdo_u += nbtfiltdo_; if(hi != NULL) { him.merge(*hi, false); } } /** * Reports a matrix of results, incl. column labels, to an OutFileBuf. * Optionally also sends results to stderr (unbuffered). 
Can optionally * print a per-read record with the read name at the beginning. */ void reportInterval( OutFileBuf* o, // file to send output to bool metricsStderr, // additionally output to stderr? bool total, // true -> report total, otherwise incremental bool sync, // synchronize output const BTString *name) // non-NULL name pointer if is per-read record { ThreadSafe ts(&mutex_m, sync); ostringstream stderrSs; time_t curtime = time(0); char buf[1024]; if(first) { const char *str = /* 1 */ "Time" "\t" /* 2 */ "Read" "\t" /* 3 */ "Base" "\t" /* 4 */ "SameRead" "\t" /* 5 */ "SameReadBase" "\t" /* 6 */ "UnfilteredRead" "\t" /* 7 */ "UnfilteredBase" "\t" /* 8 */ "Paired" "\t" /* 9 */ "Unpaired" "\t" /* 10 */ "AlConUni" "\t" /* 11 */ "AlConRep" "\t" /* 12 */ "AlConFail" "\t" /* 13 */ "AlDis" "\t" /* 14 */ "AlConFailUni" "\t" /* 15 */ "AlConFailRep" "\t" /* 16 */ "AlConFailFail" "\t" /* 17 */ "AlConRepUni" "\t" /* 18 */ "AlConRepRep" "\t" /* 19 */ "AlConRepFail" "\t" /* 20 */ "AlUnpUni" "\t" /* 21 */ "AlUnpRep" "\t" /* 22 */ "AlUnpFail" "\t" /* 23 */ "SeedSearch" "\t" /* 24 */ "IntraSCacheHit" "\t" /* 25 */ "InterSCacheHit" "\t" /* 26 */ "OutOfMemory" "\t" /* 27 */ "AlBWOp" "\t" /* 28 */ "AlBWBranch" "\t" /* 29 */ "ResBWOp" "\t" /* 30 */ "ResBWBranch" "\t" /* 31 */ "ResResolve" "\t" /* 34 */ "ResReport" "\t" /* 35 */ "RedundantSHit" "\t" /* 36 */ "BestMinEdit0" "\t" /* 37 */ "BestMinEdit1" "\t" /* 38 */ "BestMinEdit2" "\t" /* 39 */ "ExactAttempts" "\t" /* 40 */ "ExactSucc" "\t" /* 41 */ "ExactRanges" "\t" /* 42 */ "ExactRows" "\t" /* 43 */ "ExactOOMs" "\t" /* 44 */ "1mmAttempts" "\t" /* 45 */ "1mmSucc" "\t" /* 46 */ "1mmRanges" "\t" /* 47 */ "1mmRows" "\t" /* 48 */ "1mmOOMs" "\t" /* 49 */ "UngappedSucc" "\t" /* 50 */ "UngappedFail" "\t" /* 51 */ "UngappedNoDec" "\t" /* 52 */ "DPExLt10Gaps" "\t" /* 53 */ "DPExLt5Gaps" "\t" /* 54 */ "DPExLt3Gaps" "\t" /* 55 */ "DPMateLt10Gaps" "\t" /* 56 */ "DPMateLt5Gaps" "\t" /* 57 */ "DPMateLt3Gaps" "\t" /* 58 */ "DP16ExDps" "\t" /* 59 
*/ "DP16ExDpSat" "\t" /* 60 */ "DP16ExDpFail" "\t" /* 61 */ "DP16ExDpSucc" "\t" /* 62 */ "DP16ExCol" "\t" /* 63 */ "DP16ExCell" "\t" /* 64 */ "DP16ExInner" "\t" /* 65 */ "DP16ExFixup" "\t" /* 66 */ "DP16ExGathSol" "\t" /* 67 */ "DP16ExBt" "\t" /* 68 */ "DP16ExBtFail" "\t" /* 69 */ "DP16ExBtSucc" "\t" /* 70 */ "DP16ExBtCell" "\t" /* 71 */ "DP16ExCoreRej" "\t" /* 72 */ "DP16ExNRej" "\t" /* 73 */ "DP8ExDps" "\t" /* 74 */ "DP8ExDpSat" "\t" /* 75 */ "DP8ExDpFail" "\t" /* 76 */ "DP8ExDpSucc" "\t" /* 77 */ "DP8ExCol" "\t" /* 78 */ "DP8ExCell" "\t" /* 79 */ "DP8ExInner" "\t" /* 80 */ "DP8ExFixup" "\t" /* 81 */ "DP8ExGathSol" "\t" /* 82 */ "DP8ExBt" "\t" /* 83 */ "DP8ExBtFail" "\t" /* 84 */ "DP8ExBtSucc" "\t" /* 85 */ "DP8ExBtCell" "\t" /* 86 */ "DP8ExCoreRej" "\t" /* 87 */ "DP8ExNRej" "\t" /* 88 */ "DP16MateDps" "\t" /* 89 */ "DP16MateDpSat" "\t" /* 90 */ "DP16MateDpFail" "\t" /* 91 */ "DP16MateDpSucc" "\t" /* 92 */ "DP16MateCol" "\t" /* 93 */ "DP16MateCell" "\t" /* 94 */ "DP16MateInner" "\t" /* 95 */ "DP16MateFixup" "\t" /* 96 */ "DP16MateGathSol" "\t" /* 97 */ "DP16MateBt" "\t" /* 98 */ "DP16MateBtFail" "\t" /* 99 */ "DP16MateBtSucc" "\t" /* 100 */ "DP16MateBtCell" "\t" /* 101 */ "DP16MateCoreRej" "\t" /* 102 */ "DP16MateNRej" "\t" /* 103 */ "DP8MateDps" "\t" /* 104 */ "DP8MateDpSat" "\t" /* 105 */ "DP8MateDpFail" "\t" /* 106 */ "DP8MateDpSucc" "\t" /* 107 */ "DP8MateCol" "\t" /* 108 */ "DP8MateCell" "\t" /* 109 */ "DP8MateInner" "\t" /* 110 */ "DP8MateFixup" "\t" /* 111 */ "DP8MateGathSol" "\t" /* 112 */ "DP8MateBt" "\t" /* 113 */ "DP8MateBtFail" "\t" /* 114 */ "DP8MateBtSucc" "\t" /* 115 */ "DP8MateBtCell" "\t" /* 116 */ "DP8MateCoreRej" "\t" /* 117 */ "DP8MateNRej" "\t" /* 118 */ "DPBtFiltStart" "\t" /* 119 */ "DPBtFiltScore" "\t" /* 120 */ "DpBtFiltDom" "\t" /* 121 */ "MemPeak" "\t" /* 122 */ "UncatMemPeak" "\t" // 0 /* 123 */ "EbwtMemPeak" "\t" // EBWT_CAT /* 124 */ "CacheMemPeak" "\t" // CA_CAT /* 125 */ "ResolveMemPeak" "\t" // GW_CAT /* 126 */ "AlignMemPeak" "\t" 
// AL_CAT /* 127 */ "DPMemPeak" "\t" // DP_CAT /* 128 */ "MiscMemPeak" "\t" // MISC_CAT /* 129 */ "DebugMemPeak" "\t" // DEBUG_CAT /* 130 */ "LocalSearch" "\t" /* 131 */ "AnchorSearch" "\t" /* 132 */ "LocalIndexSearch" "\t" /* 133 */ "LocalExtSearch" "\t" /* 134 */ "LocalSearchRecur" "\t" /* 135 */ "GlobalGenomeCoords" "\t" /* 136 */ "LocalGenomeCoords" "\t" "\n"; if(name != NULL) { if(o != NULL) o->writeChars("Name\t"); if(metricsStderr) stderrSs << "Name\t"; } if(o != NULL) o->writeChars(str); if(metricsStderr) stderrSs << str; first = false; } if(total) mergeIncrementals(); // 0. Read name, if needed if(name != NULL) { if(o != NULL) { o->writeChars(name->toZBuf()); o->write('\t'); } if(metricsStderr) { stderrSs << (*name) << '\t'; } } // 1. Current time in secs itoa10(curtime, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } const OuterLoopMetrics& ol = total ? olm : olmu; // 2. Reads itoa10(ol.reads, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 3. Bases itoa10(ol.bases, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 4. Same-read reads itoa10(ol.srreads, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 5. Same-read bases itoa10(ol.srbases, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 6. Unfiltered reads itoa10(ol.ureads, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 7. Unfiltered bases itoa10(ol.ubases, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } const ReportingMetrics& rp = total ? rpm : rpmu; // 8. Paired reads itoa10(rp.npaired, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 9. 
Unpaired reads itoa10(rp.nunpaired, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 10. Pairs with unique concordant alignments itoa10(rp.nconcord_uni, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 11. Pairs with repetitive concordant alignments itoa10(rp.nconcord_rep, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 12. Pairs with 0 concordant alignments itoa10(rp.nconcord_0, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 13. Pairs with 1 discordant alignment itoa10(rp.ndiscord, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 14. Mates from unaligned pairs that align uniquely itoa10(rp.nunp_0_uni, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 15. Mates from unaligned pairs that align repetitively itoa10(rp.nunp_0_rep, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 16. Mates from unaligned pairs that fail to align itoa10(rp.nunp_0_0, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 17. Mates from repetitive pairs that align uniquely itoa10(rp.nunp_rep_uni, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 18. Mates from repetitive pairs that align repetitively itoa10(rp.nunp_rep_rep, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 19. Mates from repetitive pairs that fail to align itoa10(rp.nunp_rep_0, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 20. 
Unpaired reads that align uniquely itoa10(rp.nunp_uni, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 21. Unpaired reads that align repetitively itoa10(rp.nunp_rep, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 22. Unpaired reads that fail to align itoa10(rp.nunp_0, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } const SeedSearchMetrics& sd = total ? sdm : sdmu; // 23. Seed searches itoa10(sd.seedsearch, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 24. Hits in 'current' cache itoa10(sd.intrahit, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 25. Hits in 'local' cache itoa10(sd.interhit, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 26. Out of memory itoa10(sd.ooms, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 27. Burrows-Wheeler ops in aligner itoa10(sd.bwops, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 28. Burrows-Wheeler branches (edits) in aligner itoa10(sd.bweds, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } const WalkMetrics& wl = total ? wlm : wlmu; // 29. Burrows-Wheeler ops in resolver itoa10(wl.bwops, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 30. Burrows-Wheeler branches in resolver itoa10(wl.branches, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 31. 
Burrows-Wheeler offset resolutions itoa10(wl.resolves, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 34. Offset reports itoa10(wl.reports, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 35. Redundant seed hit itoa10(total ? swmSeed.rshit : swmuSeed.rshit, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 36. # times the best (out of fw/rc) minimum # edits was 0 itoa10(total ? sdm.bestmin0 : sdmu.bestmin0, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 37. # times the best (out of fw/rc) minimum # edits was 1 itoa10(total ? sdm.bestmin1 : sdmu.bestmin1, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 38. # times the best (out of fw/rc) minimum # edits was 2 itoa10(total ? sdm.bestmin2 : sdmu.bestmin2, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 39. Exact aligner attempts itoa10(total ? swmSeed.exatts : swmuSeed.exatts, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 40. Exact aligner successes itoa10(total ? swmSeed.exsucc : swmuSeed.exsucc, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 41. Exact aligner ranges itoa10(total ? swmSeed.exranges : swmuSeed.exranges, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 42. Exact aligner rows itoa10(total ? swmSeed.exrows : swmuSeed.exrows, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 43. Exact aligner OOMs itoa10(total ? 
swmSeed.exooms : swmuSeed.exooms, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 44. 1mm aligner attempts itoa10(total ? swmSeed.mm1atts : swmuSeed.mm1atts, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 45. 1mm aligner successes itoa10(total ? swmSeed.mm1succ : swmuSeed.mm1succ, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 46. 1mm aligner ranges itoa10(total ? swmSeed.mm1ranges : swmuSeed.mm1ranges, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 47. 1mm aligner rows itoa10(total ? swmSeed.mm1rows : swmuSeed.mm1rows, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 48. 1mm aligner OOMs itoa10(total ? swmSeed.mm1ooms : swmuSeed.mm1ooms, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 49 Ungapped aligner success itoa10(total ? swmSeed.ungapsucc : swmuSeed.ungapsucc, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 50. Ungapped aligner fail itoa10(total ? swmSeed.ungapfail : swmuSeed.ungapfail, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 51. Ungapped aligner no decision itoa10(total ? swmSeed.ungapnodec : swmuSeed.ungapnodec, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 52. # seed-extend DPs with < 10 gaps itoa10(total ? swmSeed.sws10 : swmuSeed.sws10, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 53. # seed-extend DPs with < 5 gaps itoa10(total ? swmSeed.sws5 : swmuSeed.sws5, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 54. 
# seed-extend DPs with < 3 gaps itoa10(total ? swmSeed.sws3 : swmuSeed.sws3, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 55. # seed-extend DPs with < 10 gaps itoa10(total ? swmMate.sws10 : swmuMate.sws10, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 56. # seed-extend DPs with < 5 gaps itoa10(total ? swmMate.sws5 : swmuMate.sws5, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 57. # seed-extend DPs with < 3 gaps itoa10(total ? swmMate.sws3 : swmuMate.sws3, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } const SSEMetrics& dpSse16s = total ? dpSse16Seed : dpSse16uSeed; // 58. 16-bit SSE seed-extend DPs tried itoa10(dpSse16s.dp, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 59. 16-bit SSE seed-extend DPs saturated itoa10(dpSse16s.dpsat, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 60. 16-bit SSE seed-extend DPs failed itoa10(dpSse16s.dpfail, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 61. 16-bit SSE seed-extend DPs succeeded itoa10(dpSse16s.dpsucc, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 62. 16-bit SSE seed-extend DP columns completed itoa10(dpSse16s.col, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 63. 16-bit SSE seed-extend DP cells completed itoa10(dpSse16s.cell, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 64. 
16-bit SSE seed-extend DP inner loop iters completed itoa10(dpSse16s.inner, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 65. 16-bit SSE seed-extend DP fixup loop iters completed itoa10(dpSse16s.fixup, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 66. 16-bit SSE seed-extend DP gather, cells with potential solutions itoa10(dpSse16s.gathsol, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 67. 16-bit SSE seed-extend DP backtrace attempts itoa10(dpSse16s.bt, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 68. 16-bit SSE seed-extend DP failed backtrace attempts itoa10(dpSse16s.btfail, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 69. 16-bit SSE seed-extend DP succesful backtrace attempts itoa10(dpSse16s.btsucc, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 70. 16-bit SSE seed-extend DP backtrace cells itoa10(dpSse16s.btcell, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 71. 16-bit SSE seed-extend DP core-diag rejections itoa10(dpSse16s.corerej, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 72. 16-bit SSE seed-extend DP N rejections itoa10(dpSse16s.nrej, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } const SSEMetrics& dpSse8s = total ? dpSse8Seed : dpSse8uSeed; // 73. 8-bit SSE seed-extend DPs tried itoa10(dpSse8s.dp, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 74. 
8-bit SSE seed-extend DPs saturated itoa10(dpSse8s.dpsat, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 75. 8-bit SSE seed-extend DPs failed itoa10(dpSse8s.dpfail, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 76. 8-bit SSE seed-extend DPs succeeded itoa10(dpSse8s.dpsucc, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 77. 8-bit SSE seed-extend DP columns completed itoa10(dpSse8s.col, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 78. 8-bit SSE seed-extend DP cells completed itoa10(dpSse8s.cell, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 79. 8-bit SSE seed-extend DP inner loop iters completed itoa10(dpSse8s.inner, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 80. 8-bit SSE seed-extend DP fixup loop iters completed itoa10(dpSse8s.fixup, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 81. 16-bit SSE seed-extend DP gather, cells with potential solutions itoa10(dpSse8s.gathsol, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 82. 16-bit SSE seed-extend DP backtrace attempts itoa10(dpSse8s.bt, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 83. 16-bit SSE seed-extend DP failed backtrace attempts itoa10(dpSse8s.btfail, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 84. 16-bit SSE seed-extend DP succesful backtrace attempts itoa10(dpSse8s.btsucc, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 85. 
16-bit SSE seed-extend DP backtrace cells itoa10(dpSse8s.btcell, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 86. 16-bit SSE seed-extend DP core-diag rejections itoa10(dpSse8s.corerej, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 87. 16-bit SSE seed-extend DP N rejections itoa10(dpSse8s.nrej, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } const SSEMetrics& dpSse16m = total ? dpSse16Mate : dpSse16uMate; // 88. 16-bit SSE mate-finding DPs tried itoa10(dpSse16m.dp, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 89. 16-bit SSE mate-finding DPs saturated itoa10(dpSse16m.dpsat, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 90. 16-bit SSE mate-finding DPs failed itoa10(dpSse16m.dpfail, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 91. 16-bit SSE mate-finding DPs succeeded itoa10(dpSse16m.dpsucc, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 92. 16-bit SSE mate-finding DP columns completed itoa10(dpSse16m.col, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 93. 16-bit SSE mate-finding DP cells completed itoa10(dpSse16m.cell, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 94. 16-bit SSE mate-finding DP inner loop iters completed itoa10(dpSse16m.inner, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 95. 16-bit SSE mate-finding DP fixup loop iters completed itoa10(dpSse16m.fixup, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 96. 
16-bit SSE mate-finding DP gather, cells with potential solutions itoa10(dpSse16m.gathsol, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 97. 16-bit SSE mate-finding DP backtrace attempts itoa10(dpSse16m.bt, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 98. 16-bit SSE mate-finding DP failed backtrace attempts itoa10(dpSse16m.btfail, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 99. 16-bit SSE mate-finding DP succesful backtrace attempts itoa10(dpSse16m.btsucc, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 100. 16-bit SSE mate-finding DP backtrace cells itoa10(dpSse16m.btcell, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 101. 16-bit SSE mate-finding DP core-diag rejections itoa10(dpSse16m.corerej, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 102. 16-bit SSE mate-finding DP N rejections itoa10(dpSse16m.nrej, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } const SSEMetrics& dpSse8m = total ? dpSse8Mate : dpSse8uMate; // 103. 8-bit SSE mate-finding DPs tried itoa10(dpSse8m.dp, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 104. 8-bit SSE mate-finding DPs saturated itoa10(dpSse8m.dpsat, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 105. 8-bit SSE mate-finding DPs failed itoa10(dpSse8m.dpfail, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 106. 
8-bit SSE mate-finding DPs succeeded itoa10(dpSse8m.dpsucc, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 107. 8-bit SSE mate-finding DP columns completed itoa10(dpSse8m.col, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 108. 8-bit SSE mate-finding DP cells completed itoa10(dpSse8m.cell, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 109. 8-bit SSE mate-finding DP inner loop iters completed itoa10(dpSse8m.inner, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 110. 8-bit SSE mate-finding DP fixup loop iters completed itoa10(dpSse8m.fixup, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 111. 16-bit SSE mate-finding DP gather, cells with potential solutions itoa10(dpSse8m.gathsol, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 112. 16-bit SSE mate-finding DP backtrace attempts itoa10(dpSse8m.bt, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 113. 16-bit SSE mate-finding DP failed backtrace attempts itoa10(dpSse8m.btfail, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 114. 16-bit SSE mate-finding DP succesful backtrace attempts itoa10(dpSse8m.btsucc, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 115. 16-bit SSE mate-finding DP backtrace cells itoa10(dpSse8m.btcell, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 116. 
16-bit SSE mate-finding DP core rejections itoa10(dpSse8m.corerej, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 117. 16-bit SSE mate-finding N rejections itoa10(dpSse8m.nrej, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 118. Backtrace candidates filtered due to starting cell itoa10(total ? nbtfiltst : nbtfiltst_u, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 119. Backtrace candidates filtered due to low score itoa10(total ? nbtfiltsc : nbtfiltsc_u, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 120. Backtrace candidates filtered due to domination itoa10(total ? nbtfiltdo : nbtfiltdo_u, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 121. Overall memory peak itoa10(gMemTally.peak() >> 20, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 122. Uncategorized memory peak itoa10(gMemTally.peak(0) >> 20, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 123. Ebwt memory peak itoa10(gMemTally.peak(EBWT_CAT) >> 20, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 124. Cache memory peak itoa10(gMemTally.peak(CA_CAT) >> 20, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 125. Resolver memory peak itoa10(gMemTally.peak(GW_CAT) >> 20, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 126. Seed aligner memory peak itoa10(gMemTally.peak(AL_CAT) >> 20, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 127. 
Dynamic programming aligner memory peak itoa10(gMemTally.peak(DP_CAT) >> 20, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 128. Miscellaneous memory peak itoa10(gMemTally.peak(MISC_CAT) >> 20, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 129. Debug memory peak itoa10(gMemTally.peak(DEBUG_CAT) >> 20, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 130 itoa10(him.localatts, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 131 itoa10(him.anchoratts, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 132 itoa10(him.localindexatts, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 133 itoa10(him.localextatts, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 134 itoa10(him.localsearchrecur, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 135 itoa10(him.globalgenomecoords, buf); if(metricsStderr) stderrSs << buf << '\t'; if(o != NULL) { o->writeChars(buf); o->write('\t'); } // 136 itoa10(him.localgenomecoords, buf); if(metricsStderr) stderrSs << buf; if(o != NULL) { o->writeChars(buf); } if(o != NULL) { o->write('\n'); } if(metricsStderr) cerr << stderrSs.str().c_str() << endl; if(!total) mergeIncrementals(); } void mergeIncrementals() { olm.merge(olmu, false); sdm.merge(sdmu, false); wlm.merge(wlmu, false); swmSeed.merge(swmuSeed, false); swmMate.merge(swmuMate, false); dpSse8Seed.merge(dpSse8uSeed, false); dpSse8Mate.merge(dpSse8uMate, false); dpSse16Seed.merge(dpSse16uSeed, false); dpSse16Mate.merge(dpSse16uMate, false); nbtfiltst_u += nbtfiltst; nbtfiltsc_u += nbtfiltsc; nbtfiltdo_u += nbtfiltdo; olmu.reset(); 
sdmu.reset(); wlmu.reset(); swmuSeed.reset(); swmuMate.reset(); rpmu.reset(); dpSse8uSeed.reset(); dpSse8uMate.reset(); dpSse16uSeed.reset(); dpSse16uMate.reset(); nbtfiltst_u = 0; nbtfiltsc_u = 0; nbtfiltdo_u = 0; } // Total over the whole job OuterLoopMetrics olm; // overall metrics SeedSearchMetrics sdm; // metrics related to seed alignment WalkMetrics wlm; // metrics related to walking left (i.e. resolving reference offsets) SwMetrics swmSeed; // metrics related to DP seed-extend alignment SwMetrics swmMate; // metrics related to DP mate-finding alignment ReportingMetrics rpm; // metrics related to reporting SSEMetrics dpSse8Seed; // 8-bit SSE seed extensions SSEMetrics dpSse8Mate; // 8-bit SSE mate finds SSEMetrics dpSse16Seed; // 16-bit SSE seed extensions SSEMetrics dpSse16Mate; // 16-bit SSE mate finds uint64_t nbtfiltst; uint64_t nbtfiltsc; uint64_t nbtfiltdo; // Just since the last update OuterLoopMetrics olmu; // overall metrics SeedSearchMetrics sdmu; // metrics related to seed alignment WalkMetrics wlmu; // metrics related to walking left (i.e. resolving reference offsets) SwMetrics swmuSeed; // metrics related to DP seed-extend alignment SwMetrics swmuMate; // metrics related to DP mate-finding alignment ReportingMetrics rpmu; // metrics related to reporting SSEMetrics dpSse8uSeed; // 8-bit SSE seed extensions SSEMetrics dpSse8uMate; // 8-bit SSE mate finds SSEMetrics dpSse16uSeed; // 16-bit SSE seed extensions SSEMetrics dpSse16uMate; // 16-bit SSE mate finds uint64_t nbtfiltst_u; uint64_t nbtfiltsc_u; uint64_t nbtfiltdo_u; // HIMetrics him; MUTEX_T mutex_m; // lock for when one ob bool first; // yet to print first line? 
time_t lastElapsed; // used in reportInterval to measure time since last call }; static PerfMetrics metrics; // Cyclic rotations #define ROTL(n, x) (((x) << (n)) | ((x) >> (32-n))) #define ROTR(n, x) (((x) >> (n)) | ((x) << (32-n))) static inline void printMmsSkipMsg( const PatternSourcePerThread& ps, bool paired, bool mate1, int seedmms) { ostringstream os; if(paired) { os << "Warning: skipping mate #" << (mate1 ? '1' : '2') << " of read '" << (mate1 ? ps.bufa().name : ps.bufb().name) << "' because length (" << (mate1 ? ps.bufa().patFw.length() : ps.bufb().patFw.length()) << ") <= # seed mismatches (" << seedmms << ")" << endl; } else { os << "Warning: skipping read '" << (mate1 ? ps.bufa().name : ps.bufb().name) << "' because length (" << (mate1 ? ps.bufa().patFw.length() : ps.bufb().patFw.length()) << ") <= # seed mismatches (" << seedmms << ")" << endl; } cerr << os.str().c_str(); } static inline void printLenSkipMsg( const PatternSourcePerThread& ps, bool paired, bool mate1) { ostringstream os; if(paired) { os << "Warning: skipping mate #" << (mate1 ? '1' : '2') << " of read '" << (mate1 ? ps.bufa().name : ps.bufb().name) << "' because it was < 2 characters long" << endl; } else { os << "Warning: skipping read '" << (mate1 ? ps.bufa().name : ps.bufb().name) << "' because it was < 2 characters long" << endl; } cerr << os.str().c_str(); } static inline void printLocalScoreMsg( const PatternSourcePerThread& ps, bool paired, bool mate1) { ostringstream os; if(paired) { os << "Warning: minimum score function gave negative number in " << "--local mode for mate #" << (mate1 ? '1' : '2') << " of read '" << (mate1 ? ps.bufa().name : ps.bufb().name) << "; setting to 0 instead" << endl; } else { os << "Warning: minimum score function gave negative number in " << "--local mode for read '" << (mate1 ? 
ps.bufa().name : ps.bufb().name) << "; setting to 0 instead" << endl; } cerr << os.str().c_str(); } static inline void printEEScoreMsg( const PatternSourcePerThread& ps, bool paired, bool mate1) { ostringstream os; if(paired) { os << "Warning: minimum score function gave positive number in " << "--end-to-end mode for mate #" << (mate1 ? '1' : '2') << " of read '" << (mate1 ? ps.bufa().name : ps.bufb().name) << "; setting to 0 instead" << endl; } else { os << "Warning: minimum score function gave positive number in " << "--end-to-end mode for read '" << (mate1 ? ps.bufa().name : ps.bufb().name) << "; setting to 0 instead" << endl; } cerr << os.str().c_str(); } #define MERGE_METRICS(met, sync) { \ msink.mergeMetrics(rpm); \ met.merge( \ &olm, \ &sdm, \ &wlm, \ &swmSeed, \ &swmMate, \ &rpm, \ &sseU8ExtendMet, \ &sseU8MateMet, \ &sseI16ExtendMet, \ &sseI16MateMet, \ nbtfiltst, \ nbtfiltsc, \ nbtfiltdo, \ &him, \ sync); \ olm.reset(); \ sdm.reset(); \ wlm.reset(); \ swmSeed.reset(); \ swmMate.reset(); \ rpm.reset(); \ sseU8ExtendMet.reset(); \ sseU8MateMet.reset(); \ sseI16ExtendMet.reset(); \ sseI16MateMet.reset(); \ him.reset(); \ } #define MERGE_SW(x) { \ x.merge( \ sseU8ExtendMet, \ sseU8MateMet, \ sseI16ExtendMet, \ sseI16MateMet, \ nbtfiltst, \ nbtfiltsc, \ nbtfiltdo); \ x.resetCounters(); \ } /** * Called once per thread. Sets up per-thread pointers to the shared global * data structures, creates per-thread structures, then enters the alignment * loop. The general flow of the alignment loop is: * * - If it's been a while and we're the master thread, report some alignment * metrics * - Get the next read/pair * - Check if this read/pair is identical to the previous * + If identical, check whether we can skip any or all alignment stages. 
If * we can skip all stages, report the result immediately and move to next * read/pair * + If not identical, continue * - */ static void multiseedSearchWorker_hisat2(void *vp) { int tid = *((int*)vp); if (threeN) { assert(ref3N.multiseed_gfm[0] != NULL); assert(ref3N.multiseed_gfm[1] != NULL); } else { assert(multiseed_gfm != NULL); } assert(multiseedMms == 0); // for regular Hisat2 PairedPatternSource& patsrc = *multiseed_patsrc; const HGFM& gfm = *multiseed_gfm; const RFM* rgfm = multiseed_rgfm; const Scoring& sc = *multiseed_sc; const BitPairReference& ref = *multiseed_refs; const BitPairReference* rref = multiseed_rrefs; AlnSink& msink = *multiseed_msink; OutFileBuf* metricsOfb = multiseed_metricsOfb; // for Hisat-3N const HGFM* gfm_3N[2]; const RFM* rgfm_3N[2]; const BitPairReference* rref_3N[2]; for (int i = 0; i < 2; i++) { gfm_3N[i] = ref3N.multiseed_gfm[i]; rgfm_3N[i] = ref3N.multiseed_rgfm[i]; rref_3N[i] = ref3N.multiseed_rrefs[i]; } // Sinks: these are so that we can print tables encoding counts for // events of interest on a per-read, per-seed, per-join, or per-SW // level. These in turn can be used to diagnose performance // problems, or generally characterize performance. //const BitPairReference& refs = *multiseed_refs; auto_ptr patsrcFact(createPatsrcFactory(patsrc, tid)); auto_ptr ps(patsrcFact->create()); // Instantiate an object for holding reporting-related parameters. if(maxSeeds == 0) { maxSeeds = max(5, khits * 2); } ReportingParams rp( (allHits ? std::numeric_limits::max() : khits), // -k (allHits ? std::numeric_limits::max() : maxSeeds), // --max-seeds mhits, // -m/-M 0, // penalty gap (not used now) msample, // true -> -M was specified, otherwise assume -m gReportDiscordant, // report discordang paired-end alignments? gReportMixed, // report unpaired alignments for paired reads? 
secondary, localAlign, bowtie2_dp, sensitive | very_sensitive, repeat); // Instantiate a mapping quality calculator auto_ptr bmapq(new_mapq(mapqv, scoreMin, sc)); // Make a per-thread wrapper for the global MHitSink object. AlnSinkWrap* msinkwrap; if (threeN) { msinkwrap = new AlnSinkWrap3N( msink, // global sink rp, // reporting parameters *bmapq.get(), // MAPQ calculator (size_t)tid, // thread id mappingCycles, secondary, // secondary alignments no_spliced_alignment ? NULL : ssdb, thread_rids_mindist); } else { msinkwrap = new AlnSinkWrap( msink, // global sink rp, // reporting parameters *bmapq.get(), // MAPQ calculator (size_t)tid, // thread id secondary, // secondary alignments no_spliced_alignment ? NULL : ssdb, thread_rids_mindist); } SplicedAligner splicedAligner(threeN? *gfm_3N[0]: gfm, anchorStop, thread_rids_mindist); SwAligner sw; OuterLoopMetrics olm; SeedSearchMetrics sdm; WalkMetrics wlm; SwMetrics swmSeed, swmMate; ReportingMetrics rpm; RandomSource rnd, rndArb; SSEMetrics sseU8ExtendMet; SSEMetrics sseU8MateMet; SSEMetrics sseI16ExtendMet; SSEMetrics sseI16MateMet; DescentMetrics descm; uint64_t nbtfiltst = 0; // TODO: find a new home for these uint64_t nbtfiltsc = 0; // TODO: find a new home for these uint64_t nbtfiltdo = 0; // TODO: find a new home for these HIMetrics him; ASSERT_ONLY(BTDnaString tmp); int pepolFlag; if(gMate1fw && gMate2fw) { pepolFlag = PE_POLICY_FF; } else if(gMate1fw && !gMate2fw) { pepolFlag = PE_POLICY_FR; } else if(!gMate1fw && gMate2fw) { pepolFlag = PE_POLICY_RF; } else { pepolFlag = PE_POLICY_RR; } assert_geq(gMaxInsert, gMinInsert); assert_geq(gMinInsert, 0); PairedEndPolicy pepol( pepolFlag, gMaxInsert, gMinInsert, localAlign, gFlippedMatesOK, gDovetailMatesOK, gContainMatesOK, gOlapMatesOK, gExpandToFrag); PerfMetrics metricsPt; // per-thread metrics object; for read-level metrics BTString nametmp; PerReadMetrics prm; // Used by thread with threadid == 1 to measure time elapsed time_t iTime = time(0); // Keep track 
of whether last search was exhaustive for mates 1 and 2 bool exhaustive[2] = { false, false }; // Keep track of whether mates 1/2 were filtered out last time through bool filt[2] = { true, true }; // Keep track of whether mates 1/2 were filtered out due Ns last time bool nfilt[2] = { true, true }; // Keep track of whether mates 1/2 were filtered out due to not having // enough characters to rise about the score threshold. bool scfilt[2] = { true, true }; // Keep track of whether mates 1/2 were filtered out due to not having // more characters than the number of mismatches permitted in a seed. bool lenfilt[2] = { true, true }; // Keep track of whether mates 1/2 were filtered out by upstream qc bool qcfilt[2] = { true, true }; rndArb.init((uint32_t)time(0)); int mergei = 0; int mergeival = 16; while(true) { bool success = false, done = false, paired = false; ps->nextReadPair(success, done, paired, outType != OUTPUT_SAM); if(!success && done) { break; } else if(!success) { continue; } TReadId rdid = ps->rdid(); if(nthreads > 1 && useTempSpliceSite) { assert_gt(tid, 0); assert_leq(tid, thread_rids.size()); assert(thread_rids[tid - 1] == 0 || rdid > thread_rids[tid - 1]); thread_rids[tid - 1] = (rdid > 0 ? rdid - 1 : 0); while(true) { uint64_t min_rdid = thread_rids[0]; { for(size_t i = 1; i < thread_rids.size(); i++) { if(thread_rids[i] < min_rdid) { min_rdid = thread_rids[i]; } } } if(min_rdid + thread_rids_mindist < rdid) { #if defined(_TTHREAD_WIN32_) Sleep(0); #elif defined(_TTHREAD_POSIX_) sched_yield(); #endif } else break; } } bool sample = true; if(arbitraryRandom) { ps->bufa().seed = rndArb.nextU32(); ps->bufb().seed = rndArb.nextU32(); } if(sampleFrac < 1.0f) { rnd.init(ROTL(ps->bufa().seed, 2)); sample = rnd.nextFloat() < sampleFrac; } if(rdid >= skipReads && rdid < qUpto && sample) { // Align this read/pair bool retry = true; // // Check if there is metrics reporting for us to do. 
// if(metricsIval > 0 && (metricsOfb != NULL || metricsStderr) && !metricsPerRead && ++mergei == mergeival) { // Do a periodic merge. Update global metrics, in a // synchronized manner if needed. MERGE_METRICS(metrics, nthreads > 1); mergei = 0; // Check if a progress message should be printed if(tid == 0) { // Only thread 1 prints progress messages time_t curTime = time(0); if(curTime - iTime >= metricsIval) { metrics.reportInterval(metricsOfb, metricsStderr, false, true, NULL); iTime = curTime; } } } prm.reset(); // per-read metrics prm.doFmString = false; if(sam_print_xt) { gettimeofday(&prm.tv_beg, &prm.tz_beg); } // Try to align this read int mappingCycle = 0; bool gNofw3N = false; bool gNorc3N = false; // for threeN (3N) mode, we need to map the read 4 times. for regular mode, only 1 time. while(retry || mappingCycle < nMappingCycle) { msinkwrap->resetInit_(); if (threeN) { ps->changePlan3N(mappingCycle); gNorc3N = (mappingCycle == threeN_type1conversion_FW || mappingCycle == threeN_type2conversion_FW); gNofw3N = !gNorc3N; } retry = false; assert_eq(ps->bufa().color, false); if (!mappingCycles[mappingCycle]) { mappingCycle++; continue; } olm.reads++; bool pair = paired; const size_t rdlen1 = ps->bufa().length(); const size_t rdlen2 = pair ? ps->bufb().length() : 0; olm.bases += (rdlen1 + rdlen2); msinkwrap->nextRead( &ps->bufa(), pair ? &ps->bufb() : NULL, rdid, sc.qualitiesMatter()); assert(msinkwrap->inited()); size_t rdlens[2] = { rdlen1, rdlen2 }; // Calculate the minimum valid score threshold for the read TAlScore minsc[2], maxpen[2]; maxpen[0] = maxpen[1] = 0; minsc[0] = minsc[1] = std::numeric_limits::max(); if(bwaSwLike) { // From BWA-SW manual: "Given an l-long query, the // threshold for a hit to be retained is // a*max{T,c*log(l)}." We try to recreate that here. 
float a = (float)sc.match(30); float T = bwaSwLikeT, c = bwaSwLikeC; minsc[0] = (TAlScore)max(a*T, a*c*log(rdlens[0])); if(paired) { minsc[1] = (TAlScore)max(a*T, a*c*log(rdlens[1])); } } else { minsc[0] = scoreMin.f(rdlens[0]); if(paired) minsc[1] = scoreMin.f(rdlens[1]); if(localAlign) { if(minsc[0] < 0) { if(!gQuiet) printLocalScoreMsg(*ps, paired, true); minsc[0] = 0; } if(paired && minsc[1] < 0) { if(!gQuiet) printLocalScoreMsg(*ps, paired, false); minsc[1] = 0; } } else { if(minsc[0] > 0) { if(!gQuiet) printEEScoreMsg(*ps, paired, true); minsc[0] = 0; } if(paired && minsc[1] > 0) { if(!gQuiet) printEEScoreMsg(*ps, paired, false); minsc[1] = 0; } } } // N filter; does the read have too many Ns? size_t readns[2] = {0, 0}; sc.nFilterPair( &ps->bufa().patFw, pair ? &ps->bufb().patFw : NULL, readns[0], readns[1], nfilt[0], nfilt[1]); // Score filter; does the read enough character to rise above // the score threshold? scfilt[0] = sc.scoreFilter(minsc[0], rdlens[0]); scfilt[1] = sc.scoreFilter(minsc[1], rdlens[1]); lenfilt[0] = lenfilt[1] = true; if(rdlens[0] <= (size_t)multiseedMms || rdlens[0] < 2) { if(!gQuiet) printMmsSkipMsg(*ps, paired, true, multiseedMms); lenfilt[0] = false; } if((rdlens[1] <= (size_t)multiseedMms || rdlens[1] < 2) && paired) { if(!gQuiet) printMmsSkipMsg(*ps, paired, false, multiseedMms); lenfilt[1] = false; } if(rdlens[0] < 2) { if(!gQuiet) printLenSkipMsg(*ps, paired, true); lenfilt[0] = false; } if(rdlens[1] < 2 && paired) { if(!gQuiet) printLenSkipMsg(*ps, paired, false); lenfilt[1] = false; } qcfilt[0] = qcfilt[1] = true; if(qcFilter) { qcfilt[0] = (ps->bufa().filter != '0'); qcfilt[1] = (ps->bufb().filter != '0'); } filt[0] = (nfilt[0] && scfilt[0] && lenfilt[0] && qcfilt[0]); filt[1] = (nfilt[1] && scfilt[1] && lenfilt[1] && qcfilt[1]); prm.nFilt += (filt[0] ? 0 : 1) + (filt[1] ? 0 : 1); Read* rds[2] = { &ps->bufa(), &ps->bufb() }; // For each mate... 
assert(msinkwrap->empty()); //size_t minedfw[2] = { 0, 0 }; //size_t minedrc[2] = { 0, 0 }; // Calcualte nofw / no rc bool nofw[2] = { false, false }; bool norc[2] = { false, false }; if (threeN) { nofw[0] = paired ? (gMate1fw ? gNofw3N : gNorc3N) : gNofw3N; norc[0] = paired ? (gMate1fw ? gNorc3N : gNofw3N) : gNorc3N; nofw[1] = paired ? (gMate2fw ? gNofw3N : gNorc3N) : gNofw3N; norc[1] = paired ? (gMate2fw ? gNorc3N : gNofw3N) : gNorc3N; } else { nofw[0] = paired ? (gMate1fw ? gNofw : gNorc) : gNofw; norc[0] = paired ? (gMate1fw ? gNorc : gNofw) : gNorc; nofw[1] = paired ? (gMate2fw ? gNofw : gNorc) : gNofw; norc[1] = paired ? (gMate2fw ? gNorc : gNofw) : gNorc; } // Calculate nceil int nceil[2] = { 0, 0 }; nceil[0] = nCeil.f((double)rdlens[0]); nceil[0] = min(nceil[0], (int)rdlens[0]); if(paired) { nceil[1] = nCeil.f((double)rdlens[1]); nceil[1] = min(nceil[1], (int)rdlens[1]); } exhaustive[0] = exhaustive[1] = false; //size_t matemap[2] = { 0, 1 }; bool pairPostFilt = filt[0] && filt[1]; if(pairPostFilt) { rnd.init(ps->bufa().seed ^ ps->bufb().seed); } else { rnd.init(ps->bufa().seed); } // Calculate interval length for both mates int interval[2] = { 0, 0 }; for(size_t mate = 0; mate < (pair ? 
2:1); mate++) { interval[mate] = msIval.f((double)rdlens[mate]); if(filt[0] && filt[1]) { // Boost interval length by 20% for paired-end reads interval[mate] = (int)(interval[mate] * 1.2 + 0.5); } interval[mate] = max(interval[mate], 1); } // Calculate streak length size_t streak[2] = { maxDpStreak, maxDpStreak }; size_t mtStreak[2] = { maxMateStreak, maxMateStreak }; size_t mxDp[2] = { maxDp, maxDp }; size_t mxUg[2] = { maxUg, maxUg }; size_t mxIter[2] = { maxIters, maxIters }; if(allHits) { streak[0] = streak[1] = std::numeric_limits::max(); mtStreak[0] = mtStreak[1] = std::numeric_limits::max(); mxDp[0] = mxDp[1] = std::numeric_limits::max(); mxUg[0] = mxUg[1] = std::numeric_limits::max(); mxIter[0] = mxIter[1] = std::numeric_limits::max(); } else if(khits > 1) { for(size_t mate = 0; mate < 2; mate++) { streak[mate] += (khits-1) * maxStreakIncr; mtStreak[mate] += (khits-1) * maxStreakIncr; mxDp[mate] += (khits-1) * maxItersIncr; mxUg[mate] += (khits-1) * maxItersIncr; mxIter[mate] += (khits-1) * maxItersIncr; } } if(filt[0] && filt[1]) { streak[0] = (size_t)ceil((double)streak[0] / 2.0); streak[1] = (size_t)ceil((double)streak[1] / 2.0); assert_gt(streak[1], 0); } assert_gt(streak[0], 0); // Calculate # seed rounds for each mate size_t nrounds[2] = { nSeedRounds, nSeedRounds }; if(filt[0] && filt[1]) { nrounds[0] = (size_t)ceil((double)nrounds[0] / 2.0); nrounds[1] = (size_t)ceil((double)nrounds[1] / 2.0); assert_gt(nrounds[1], 0); } assert_gt(nrounds[0], 0); // Increment counters according to what got filtered for(size_t mate = 0; mate < (pair ? 2:1); mate++) { if(!filt[mate]) { // Mate was rejected by N filter olm.freads++; // reads filtered out olm.fbases += rdlens[mate]; // bases filtered out } else { //shs[mate].clear(); //shs[mate].nextRead(mate == 0 ? 
ps->bufa() : ps->bufb()); //assert(shs[mate].empty()); olm.ureads++; // reads passing filter olm.ubases += rdlens[mate]; // bases passing filter } } //size_t eePeEeltLimit = std::numeric_limits::max(); // Whether we're done with mate1 / mate2 bool done[2] = { !filt[0], !filt[1] }; // size_t nelt[2] = {0, 0}; if(filt[0] && filt[1]) { splicedAligner.initReads(rds, nofw, norc, minsc, maxpen); } else if(filt[0]) { splicedAligner.initRead(rds[0], nofw[0], norc[0], minsc[0], maxpen[0], false); } else if(filt[1]) { splicedAligner.initRead(rds[1], nofw[1], norc[1], minsc[1], maxpen[1], true); } if(filt[0] || filt[1]) { int ret; int threeN_index; bool useRepeat; if (threeN) { threeN_index = (mappingCycle == threeN_type1conversion_FW || mappingCycle == threeN_type2conversion_RC) ? 0 : 1; useRepeat = paired ? (ps->bufa().length() >= 100) && (ps->bufb().length() >= 100) : ps->bufa().length() >= 80; } ret = splicedAligner.go( sc, pepol, *multiseed_tpol, *gpol, threeN ? *gfm_3N[threeN_index] : gfm, threeN ?(useRepeat ? rgfm_3N[threeN_index] : NULL) : rgfm, threeN ? *altdbs_3N[threeN_index] : *altdb, threeN ? *repeatdbs_3N[threeN_index] : *repeatdb, threeN ? *raltdbs_3N[threeN_index] : *raltdb, ref, threeN ? rref_3N[threeN_index] : rref, sw, *ssdb, wlm, prm, swmSeed, him, rnd, *msinkwrap); MERGE_SW(sw); // daehwan size_t mate = 0; assert_gt(ret, 0); // Clear out the exact hits so that we don't try to // extend them again later! 
if(ret == EXTEND_EXHAUSTED_CANDIDATES) { // Not done yet } else if(ret == EXTEND_POLICY_FULFILLED) { // Policy is satisfied for this mate at least if(msinkwrap->state().doneWithMate(mate == 0)) { done[mate] = true; } if(msinkwrap->state().doneWithMate(mate == 1)) { done[mate^1] = true; } } else if(ret == EXTEND_PERFECT_SCORE) { // We exhausted this mode at least done[mate] = true; } else if(ret == EXTEND_EXCEEDED_HARD_LIMIT) { // We exceeded a per-read limit done[mate] = true; } else if(ret == EXTEND_EXCEEDED_SOFT_LIMIT) { // Not done yet } else { // cerr << "Bad return value: " << ret << endl; throw 1; } if(!done[mate]) { TAlScore perfectScore = sc.perfectScore(rdlens[mate]); if(!done[mate] && minsc[mate] == perfectScore) { done[mate] = true; } } } for(size_t i = 0; i < 2; i++) { assert_leq(prm.nExIters, mxIter[i]); assert_leq(prm.nExDps, mxDp[i]); assert_leq(prm.nMateDps, mxDp[i]); assert_leq(prm.nExUgs, mxUg[i]); assert_leq(prm.nMateUgs, mxUg[i]); assert_leq(prm.nDpFail, streak[i]); assert_leq(prm.nUgFail, streak[i]); assert_leq(prm.nEeFail, streak[i]); } msinkwrap->finishRead( NULL, NULL, exhaustive[0], // exhausted seed hits for mate 1? exhaustive[1], // exhausted seed hits for mate 2? nfilt[0], nfilt[1], scfilt[0], scfilt[1], lenfilt[0], lenfilt[1], qcfilt[0], qcfilt[1], sortByScore, // prioritize by alignment score rnd, // pseudo-random generator rpm, // reporting metrics prm, // per-read metrics sc, // scoring scheme !seedSumm, // suppress seed summaries? seedSumm, //rdid suppress alignments? templateLenAdjustment); mappingCycle++; } } // if(rdid >= skipReads && rdid < qUpto) else if(rdid >= qUpto) { break; } if(metricsPerRead) { MERGE_METRICS(metricsPt, nthreads > 1); nametmp = ps->bufa().name; metricsPt.reportInterval( metricsOfb, metricsStderr, true, true, &nametmp); metricsPt.reset(); } } // while(true) // One last metrics merge MERGE_METRICS(metrics, nthreads > 1); delete msinkwrap; return; } /** * Called once per alignment job. 
Sets up global pointers to the * shared global data structures, creates per-thread structures, then * enters the search loop. */ static void multiseedSearch( Scoring& sc, TranscriptomePolicy& tpol, GraphPolicy& gp, PairedPatternSource& patsrc, // pattern source AlnSink& msink, // hit sink EList* > gfms_3N, // 3N index of original text RFM* rgfms_3N[2], // 3N index of repeat sequences BitPairReference* rrefss[2], // 3N repeat reference HGFM* gfm, // index of original text RFM* rgfm, // index of repeat sequences BitPairReference* refs, // base reference BitPairReference* rrefs, // repeat reference OutFileBuf *metricsOfb) { multiseed_patsrc = &patsrc; multiseed_msink = &msink; multiseed_sc = ≻ multiseed_tpol = &tpol; gpol = &gp; multiseed_metricsOfb = metricsOfb; multiseed_refs = refs; if (threeN) { ref3N.load(gfms_3N, rgfms_3N, rrefss); } else { multiseed_gfm = gfm; multiseed_rgfm = rgfm; multiseed_rrefs = rrefs; } AutoArray threads(nthreads); AutoArray tids(nthreads); // Start the metrics thread { Timer _t(cerr, "Multiseed full-index search: ", timing); thread_rids.resize(nthreads); thread_rids.fill(0); thread_rids_mindist = (nthreads == 1 || !useTempSpliceSite ? 0 : 1000 * nthreads); for(int i = 0; i < nthreads; i++) { // Thread IDs start at 1 tids[i] = i+1; threads[i] = new tthread::thread(multiseedSearchWorker_hisat2, (void*)&tids[i]); } for (int i = 0; i < nthreads; i++) threads[i]->join(); } if(!metricsPerRead && (metricsOfb != NULL || metricsStderr)) { metrics.reportInterval(metricsOfb, metricsStderr, true, false, NULL); } } static string argstr; extern void initializeCntLut(); extern void initializeCntBit(); template static void driver( const char * type, const string bt2indexBases[2], const string& outfile) { if(gVerbose || startVerbose) { cerr << "Entered driver(): "; logTime(cerr, true); } if (gVerbose || startVerbose) { cerr << "Running in " << ((threeN) ? 
"3N" : "Regular") << " Mode" << endl; } initializeCntLut(); initializeCntBit(); // Vector of the reference sequences; used for sanity-checking EList > names, os; EList nameLens, seqLens; // Read reference sequences from the command-line or from a FASTA file if(!origString.empty()) { // Read fasta file(s) EList origFiles; tokenize(origString, ",", origFiles); parseFastas(origFiles, names, nameLens, os, seqLens); } PatternParams pp( format, // file format fileParallel, // true -> wrap files with separate PairedPatternSources seed, // pseudo-random seed useSpinlock, // use spin locks instead of pthreads solexaQuals, // true -> qualities are on solexa64 scale phred64Quals, // true -> qualities are on phred64 scale integerQuals, // true -> qualities are space-separated numbers fuzzy, // true -> try to parse fuzzy fastq fastaContLen, // length of sampled reads for FastaContinuous... fastaContFreq, // frequency of sampled reads for FastaContinuous... skipReads // skip the first 'skip' patterns ); if(gVerbose || startVerbose) { cerr << "Creating PatternSource: "; logTime(cerr, true); } PairedPatternSource *patsrc = PairedPatternSource::setupPatternSources( queries, // singles, from argv mates1, // mate1's, from -1 arg mates2, // mate2's, from -2 arg mates12, // both mates on each line, from --12 arg #ifdef USE_SRA sra_accs, // SRA accessions #endif qualities, // qualities associated with singles qualities1, // qualities associated with m1 qualities2, // qualities associated with m2 pp, // read read-in parameters nthreads, gVerbose || startVerbose); // be talkative // Open hit output file if(gVerbose || startVerbose) { cerr << "Opening hit output file: "; logTime(cerr, true); } OutFileBuf *fout; if(!outfile.empty()) { fout = new OutFileBuf(outfile.c_str(), false); } else { fout = new OutFileBuf(); } // Initialize GFM object and read in header if(gVerbose || startVerbose) { cerr << "About to initialize fw GFM: "; logTime(cerr, true); } // for 3N if (threeN) { for (int i = 0; 
i < 2; i++) { altdbs_3N[i] = new ALTDB(); repeatdbs_3N[i] = new RepeatDB(); raltdbs_3N[i] = new ALTDB(); } } EList* >gfms_3N; RFM* rgfms_3N[2]; for (int i = 0; i < 2; i++) { rgfms_3N[i] = NULL; } bool rep_index_exists_3N[2]{false}; bool rep_index_exists = false; string rep_adjIdxBase_3N[2]; string rep_adjIdxBase; HGFM* gfm; RFM* rgfm = NULL; if (threeN) { for (int j = 0; j < 2; j++) { adjIdxBases_3N[j] = adjustEbwtBase(argv0, bt2indexBases[j], gVerbose); HGFM *tmp_gfm = new HGFM( adjIdxBases_3N[j], altdbs_3N[j], NULL, NULL, -1, // fw index true, // index is for the forward direction /* overriding: */ offRate, 0, // amount to add to index offrate or <= 0 to do nothing useMm, // whether to use memory-mapped files useShmem, // whether to use shared memory mmSweep, // sweep memory-mapped files !noRefNames, // load names? true, // load SA sample? true, // load ftab? true, // load rstarts? !no_spliced_alignment, // load splice sites? gVerbose, // whether to be talkative startVerbose, // talkative during initialization false /*passMemExc*/, sanityCheck, use_haplotype); //use haplotypes? gfms_3N.push_back(tmp_gfm); if(sanityCheck && !os.empty()) { // Sanity check number of patterns and pattern lengths in GFM // against original strings assert_eq(os.size(), gfms_3N[j]->nPat()); for(size_t i = 0; i < os.size(); i++) { assert_eq(os[i].length(), gfms_3N[j]->plen()[i]); } } if(sanityCheck && !os.empty()) { gfms_3N[j]->loadIntoMemory( -1, // fw index true, // load SA sample true, // load ftab true, // load rstarts !noRefNames, startVerbose); gfms_3N[j]->checkOrigs(os, false); gfms_3N[j]->evictFromMemory(); } { // Load the other half of the index into memory assert(!gfms_3N[j]->isInMemory()); Timer _t(cerr, "Time loading forward index: ", timing); gfms_3N[j]->loadIntoMemory( -1, // not the reverse index true, // load SA samp? (yes, need forward index's SA samp) true, // load ftab (in forward index) true, // load rstarts (in forward index) !noRefNames, // load names? 
startVerbose); } rep_adjIdxBase_3N[j] = adjIdxBases_3N[j] + ".rep"; { std::ifstream infile((rep_adjIdxBase_3N[j] + ".1." + gfm_ext.c_str()).c_str()); rep_index_exists_3N[j] = infile.good(); } if(rep_index_exists_3N[j] && use_repeat_index) { rgfms_3N[j] = new RFM( rep_adjIdxBase_3N[j], raltdbs_3N[j], repeatdbs_3N[j], &readLens, -1, // fw index true, // index is for the forward direction /* overriding: */ offRate, 0, // amount to add to index offrate or <= 0 to do nothing useMm, // whether to use memory-mapped files useShmem, // whether to use shared memory mmSweep, // sweep memory-mapped files !noRefNames, // load names? true, // load SA sample? true, // load ftab? true, // load rstarts? !no_spliced_alignment, // load splice sites? gVerbose, // whether to be talkative startVerbose, // talkative during initialization false /*passMemExc*/, sanityCheck, false); //use haplotypes? // CP to do #if 0 if(sanityCheck && !os.empty()) { // Sanity check number of patterns and pattern lengths in GFM // against original strings assert_eq(os.size(), gfm.nPat()); for(size_t i = 0; i < os.size(); i++) { assert_eq(os[i].length(), rgfm->plen()[i]); } } // Sanity-check the restored version of the GFM if(sanityCheck && !os.empty()) { rgfm->loadIntoMemory( -1, // fw index true, // load SA sample true, // load ftab true, // load rstarts !noRefNames, startVerbose); rgfm->checkOrigs(os, false); rgfm->evictFromMemory(); } #endif { // Load the other half of the index into memory assert(!rgfms_3N[j]->isInMemory()); Timer _t(cerr, "Time loading forward index: ", timing); rgfms_3N[j]->loadIntoMemory( -1, // not the reverse index true, // load SA samp? (yes, need forward index's SA samp) true, // load ftab (in forward index) true, // load rstarts (in forward index) !noRefNames, // load names? 
startVerbose); repeatdbs_3N[j]->construct(gfms_3N[j]->rstarts(), gfms_3N[j]->nFrag()); } if (threeN) { ht2_option_t option; ht2_init_options(&option); option.altdb = altdbs_3N[j]; option.raltdb = raltdbs_3N[j]; option.repeatdb = repeatdbs_3N[j]; option.gfm = gfms_3N[j]; option.rgfm = rgfms_3N[j]; ht2_handle_t handle = ht2_init(adjIdxBases_3N[j].c_str(), &option); repeatHandles.push_back(handle); if (refNameMap == NULL) { ht2_index_getrefnames(repeatHandles[0], &refNameMap); } } } if(!saw_k) { if(gfms_3N[j]->gh().linearFM()) khits = 5; else khits = 10; } } } else { altdb = new ALTDB(); repeatdb = new RepeatDB(); raltdb = new ALTDB(); adjIdxBase = adjustEbwtBase(argv0, bt2indexBases[0], gVerbose); gfm = new HGFM( adjIdxBase, altdb, NULL, NULL, -1, // fw index true, // index is for the forward direction /* overriding: */ offRate, 0, // amount to add to index offrate or <= 0 to do nothing useMm, // whether to use memory-mapped files useShmem, // whether to use shared memory mmSweep, // sweep memory-mapped files !noRefNames, // load names? true, // load SA sample? true, // load ftab? true, // load rstarts? !no_spliced_alignment, // load splice sites? gVerbose, // whether to be talkative startVerbose, // talkative during initialization false /*passMemExc*/, sanityCheck, use_haplotype); //use haplotypes? 
if(sanityCheck && !os.empty()) { // Sanity check number of patterns and pattern lengths in GFM // against original strings assert_eq(os.size(), gfm->nPat()); for(size_t i = 0; i < os.size(); i++) { assert_eq(os[i].length(), gfm->plen()[i]); } } // Sanity-check the restored version of the GFM if(sanityCheck && !os.empty()) { gfm->loadIntoMemory( -1, // fw index true, // load SA sample true, // load ftab true, // load rstarts !noRefNames, startVerbose); gfm->checkOrigs(os, false); gfm->evictFromMemory(); } { // Load the other half of the index into memory assert(!gfm->isInMemory()); Timer _t(cerr, "Time loading forward index: ", timing); gfm->loadIntoMemory( -1, // not the reverse index true, // load SA samp? (yes, need forward index's SA samp) true, // load ftab (in forward index) true, // load rstarts (in forward index) !noRefNames, // load names? startVerbose); } rep_adjIdxBase = adjIdxBase + ".rep"; { std::ifstream infile((rep_adjIdxBase + ".1." + gfm_ext.c_str()).c_str()); rep_index_exists = infile.good(); } if(rep_index_exists && use_repeat_index) { rgfm = new RFM( rep_adjIdxBase, raltdb, repeatdb, &readLens, -1, // fw index true, // index is for the forward direction /* overriding: */ offRate, 0, // amount to add to index offrate or <= 0 to do nothing useMm, // whether to use memory-mapped files useShmem, // whether to use shared memory mmSweep, // sweep memory-mapped files !noRefNames, // load names? true, // load SA sample? true, // load ftab? true, // load rstarts? !no_spliced_alignment, // load splice sites? gVerbose, // whether to be talkative startVerbose, // talkative during initialization false /*passMemExc*/, sanityCheck, false); //use haplotypes? 
// CP to do #if 0 if(sanityCheck && !os.empty()) { // Sanity check number of patterns and pattern lengths in GFM // against original strings assert_eq(os.size(), gfm.nPat()); for(size_t i = 0; i < os.size(); i++) { assert_eq(os[i].length(), rgfm->plen()[i]); } } // Sanity-check the restored version of the GFM if(sanityCheck && !os.empty()) { rgfm->loadIntoMemory( -1, // fw index true, // load SA sample true, // load ftab true, // load rstarts !noRefNames, startVerbose); rgfm->checkOrigs(os, false); rgfm->evictFromMemory(); } #endif { // Load the other half of the index into memory assert(!rgfm->isInMemory()); Timer _t(cerr, "Time loading forward index: ", timing); rgfm->loadIntoMemory( -1, // not the reverse index true, // load SA samp? (yes, need forward index's SA samp) true, // load ftab (in forward index) true, // load rstarts (in forward index) !noRefNames, // load names? startVerbose); repeatdb->construct(gfm->rstarts(), gfm->nFrag()); } } if(!saw_k) { if(gfm->gh().linearFM()) khits = 5; else khits = 10; } } // else threeN OutputQueue oq( *fout, // out file buffer reorder && nthreads > 1, // whether to reorder when there's >1 thread nthreads, // # threads nthreads > 1, // whether to be thread-safe skipReads); // first read will have this rdid { Timer _t(cerr, "Time searching: ", timing); // Set up penalities if(bonusMatch > 0 && !localAlign) { cerr << "Warning: Match bonus always = 0 in --end-to-end mode; ignoring user setting" << endl; bonusMatch = 0; } if(tranAssm) { penNoncanIntronLen.init(SIMPLE_FUNC_LOG, -8, 2); } Scoring sc( bonusMatch, // constant reward for match penMmcType, // how to penalize mismatches penMmcMax, // max mm penalty penMmcMin, // min mm penalty penScMax, // max sc penalty penScMin, // min sc penalty scoreMin, // min score as function of read len nCeil, // max # Ns as function of read len penNType, // how to penalize Ns in the read penN, // constant if N pelanty is a constant penNCatPair, // whether to concat mates before N filtering 
penRdGapConst, // constant coeff for read gap cost penRfGapConst, // constant coeff for ref gap cost penRdGapLinear, // linear coeff for read gap cost penRfGapLinear, // linear coeff for ref gap cost gGapBarrier, // # rows at top/bot only entered diagonally penCanSplice, // canonical splicing penalty penNoncanSplice,// non-canonical splicing penalty penConflictSplice, // conflicting splice site penalty &penCanIntronLen, // penalty as to intron length &penNoncanIntronLen); // penalty as to intron length EList reflens; // for HISAT-3N EList refnames_3N[2]; EList replens_3N[2]; EList repnames_3N[2]; EList empty_replens_3N[2]; EList empty_repnames_3N[2]; //for regular hisat2 EList refnames; //readEbwtRefnames(adjIdxBase, refnames); EList replens; EList repnames; EList empty_replens; EList empty_repnames; if (threeN) { for(size_t i = 0; i < gfms_3N[0]->nPat(); i++) { reflens.push_back(gfms_3N[0]->plen()[i]); } for (int j = 0; j < 2; j++) { readEbwtRefnames(adjIdxBases_3N[j], refnames_3N[j]); if (rep_index_exists_3N[j] && use_repeat_index) { rgfms_3N[j]->getReferenceNames(repnames_3N[j]); rgfms_3N[j]->getReferenceLens(replens_3N[j]); } if(rmChrName && addChrName) { cerr << "Error: --remove-chrname and --add-chrname cannot be used at the same time" << endl; throw 1; } if(rmChrName) { for(size_t i = 0; i < refnames_3N[j].size(); i++) { string& refname = refnames_3N[j][i]; if(refname.find("chr") == 0) { refname = refname.substr(3); } } } else if(addChrName) { for(size_t i = 0; i < refnames_3N[j].size(); i++) { string& refname = refnames_3N[j][i]; if(refname.find("chr") != 0) { refname = string("chr") + refname; } } } } } else { readEbwtRefnames(adjIdxBase, refnames); for(size_t i = 0; i < gfm->nPat(); i++) { reflens.push_back(gfm->plen()[i]); } if(rep_index_exists && use_repeat_index) { rgfm->getReferenceNames(repnames); rgfm->getReferenceLens(replens); } if(rmChrName && addChrName) { cerr << "Error: --remove-chrname and --add-chrname cannot be used at the same time" << 
endl; throw 1; } if(rmChrName) { for(size_t i = 0; i < refnames.size(); i++) { string& refname = refnames[i]; if(refname.find("chr") == 0) { refname = refname.substr(3); } } } else if(addChrName) { for(size_t i = 0; i < refnames.size(); i++) { string& refname = refnames[i]; if(refname.find("chr") != 0) { refname = string("chr") + refname; } } } } SamConfig samc( threeN ? refnames_3N[0]: refnames, // reference sequence names reflens, // reference sequence lengths threeN?(repeat ? repnames_3N[0] : empty_repnames_3N[0]): (repeat ? repnames : empty_repnames), // repeat sequence names threeN? (repeat ? replens_3N[0] : empty_replens_3N[0]): (repeat ? replens : empty_replens), // repeat sequence lengths samTruncQname, // whether to truncate QNAME to 255 chars samOmitSecSeqQual, // omit SEQ/QUAL for 2ndary alignments? samNoUnal, // omit unaligned-read records? string("hisat2"), // program id string("hisat2"), // program name string(HISAT2_VERSION), // program version argstr, // command-line rgs_optflag, // read-group string rna_strandness, sam_print_as, sam_print_xs, sam_print_xss, sam_print_yn, sam_print_xn, sam_print_cs, sam_print_cq, sam_print_x0, sam_print_x1, sam_print_xm, sam_print_xo, sam_print_xg, sam_print_nm, sam_print_md, sam_print_yf, sam_print_yi, sam_print_ym, sam_print_yp, sam_print_yt, sam_print_ys, sam_print_zs, sam_print_xr, sam_print_xt, sam_print_xd, sam_print_xu, sam_print_yl, sam_print_ye, sam_print_yu, sam_print_xp, sam_print_yr, sam_print_zb, sam_print_zr, sam_print_zf, sam_print_zm, sam_print_zi, sam_print_zp, sam_print_zu, sam_print_xs_a, sam_print_nh); // Set up hit sink; if sanityCheck && !os.empty() is true, // then instruct the sink to "retain" hits in a vector in // memory so that we can easily sanity check them later on AlnSink *mssink = NULL; //auto_ptr refss[2]; auto_ptr refs; Timer *_tRef = new Timer(cerr, "Time loading reference: ", timing); refs = auto_ptr( new BitPairReference( threeN ? 
adjIdxBases_3N[0] : adjIdxBase, NULL, false, sanityCheck, NULL, NULL, false, useMm, useShmem, mmSweep, gVerbose, startVerbose) ); delete _tRef; if(!refs->loaded()) throw 1; BitPairReference* rrefss[2] = {NULL, }; BitPairReference* rrefs = NULL; if (threeN) { for (int j = 0; j < 2; j++) { if (rep_index_exists_3N[j] && use_repeat_index) { const EList &included = rgfms_3N[j]->getRepeatIncluded(); rrefss[j] = new BitPairReference( rep_adjIdxBase_3N[j], &included, false, sanityCheck, NULL, NULL, false, useMm, useShmem, mmSweep, gVerbose, startVerbose); if (!rrefss[j]->loaded()) throw 1; } } } else { if(rep_index_exists && use_repeat_index) { const EList& included = rgfm->getRepeatIncluded(); rrefs = new BitPairReference( rep_adjIdxBase, &included, false, sanityCheck, NULL, NULL, false, useMm, useShmem, mmSweep, gVerbose, startVerbose); if(!rrefs->loaded()) throw 1; } } bool xsOnly = (tranAssm_program == "cufflinks"); TranscriptomePolicy tpol(minIntronLen, maxIntronLen, tranAssm ? 15 : 7, tranAssm ? 20 : 14, no_spliced_alignment, tranMapOnly, tranAssm, xsOnly, avoid_pseudogene); GraphPolicy gpol(max_alts_tried, use_haplotype, (threeN ? altdbs_3N[0]->haplotypes().size() : altdb->haplotypes().size()) > 0 && use_haplotype, enable_codis); init_junction_prob(); bool write = novelSpliceSiteOutfile != "" || useTempSpliceSite; bool read = knownSpliceSiteInfile != "" || novelSpliceSiteInfile != "" || useTempSpliceSite || altdbs_3N[0]->hasSpliceSites(); ssdb = new SpliceSiteDB( *(refs.get()), threeN ? refnames_3N[0] : refnames, nthreads > 1, // thread-safe write, // write? read); // read? ssdb->read(threeN ? *gfms_3N[0] : *gfm, threeN ? 
altdbs_3N[0]->alts() : altdb->alts()); if(knownSpliceSiteInfile != "") { ifstream ssdb_file(knownSpliceSiteInfile.c_str(), ios::in); if(ssdb_file.is_open()) { ssdb->read(ssdb_file, true); // known splice sites ssdb_file.close(); } } if(novelSpliceSiteInfile != "") { ifstream ssdb_file(novelSpliceSiteInfile.c_str(), ios::in); if(ssdb_file.is_open()) { ssdb->read(ssdb_file, false); // novel splice sites ssdb_file.close(); } } switch(outType) { case OUTPUT_SAM: { if (threeN) { mssink = new AlnSink3NSam( oq, // output queue samc, // settings & routines for SAM output refnames_3N[0], // reference names repnames_3N[0], // repeat names gQuiet, // don't print alignment summary at end nthreads, refs.get(), no_spliced_alignment, altdbs_3N[0], ssdb); } else { mssink = new AlnSinkSam( oq, // output queue samc, // settings & routines for SAM output refnames, // reference names repnames, // repeat names gQuiet, // don't print alignment summary at end altdb, ssdb); }; if(!samNoHead) { bool printHd = true, printSq = true; BTString buf; samc.printHeader(buf, rgid, rgs, printHd, !samNoSQ, printSq); fout->writeString(buf); } break; } default: cerr << "Invalid output type: " << outType << endl; throw 1; } if(gVerbose || startVerbose) { cerr << "Dispatching to search driver: "; logTime(cerr, true); } // Set up global constraint OutFileBuf *metricsOfb = NULL; if(!metricsFile.empty() && metricsIval > 0) { metricsOfb = new OutFileBuf(metricsFile); } // Do the search for all input reads assert(patsrc != NULL); assert(mssink != NULL); multiseedSearch( sc, // scoring scheme tpol, gpol, *patsrc, // pattern source *mssink, // hit sink gfms_3N, // 3N BWT rgfms_3N, // 3N rrefss, // 3N gfm, // BWT rgfm, refs.get(), rrefs, metricsOfb); // Evict any loaded indexes from memory if (threeN) { for (int j = 0; j < 2; j++) { if(gfms_3N[j]->isInMemory()) { gfms_3N[j]->evictFromMemory(); } } } else { if(gfm->isInMemory()) { gfm->evictFromMemory(); } } if(!gQuiet && !seedSumm) { size_t repThresh = mhits; 
if(repThresh == 0) { repThresh = std::numeric_limits::max(); } mssink->finish(cerr, repThresh, gReportDiscordant, gReportMixed, newAlignSummary, hadoopOut); if(alignSumFile != "") { ofstream sumfile(alignSumFile.c_str(), ios::out); if(sumfile.is_open()) { mssink->finish(sumfile, repThresh, gReportDiscordant, gReportMixed, newAlignSummary, false); // hadoopOut sumfile.close(); } } } if(ssdb != NULL) { if(novelSpliceSiteOutfile != "") { ofstream ssdb_file(novelSpliceSiteOutfile.c_str(), ios::out); if(ssdb_file.is_open()) { ssdb->print(ssdb_file); ssdb_file.close(); } } } oq.flush(true); assert_eq(oq.numStarted(), oq.numFinished()); assert_eq(oq.numStarted(), oq.numFlushed()); delete patsrc; delete mssink; delete ssdb; delete metricsOfb; if (threeN) { for (int i = 0; i < 2; i++) { if(rep_index_exists_3N[i] && use_repeat_index) { delete rgfms_3N[i]; delete rrefss[i]; delete repeatdbs_3N[i]; delete raltdbs_3N[i]; } delete gfms_3N[i]; delete altdbs_3N[i]; } if(rep_index_exists_3N[0] && use_repeat_index){ for (int k = 0; k < 2; k++) { ht2_close(repeatHandles[k]); } } } else { delete altdb; delete repeatdb; delete raltdb; delete rgfm; delete rrefs; delete gfm; } if (refNameMap != NULL) { free(refNameMap); } if(fout != NULL) { delete fout; } } } // C++ name mangling is disabled for the bowtie() function to make it // easier to use Bowtie as a library. extern "C" { /** * Main bowtie entry function. Parses argc/argv style command-line * options, sets global configuration variables, and calls the driver() * function. 
*/ int hisat2(int argc, const char **argv) { try { // Reset all global state, including getopt state opterr = optind = 1; resetOptions(); for(int i = 0; i < argc; i++) { argstr += argv[i]; if(i < argc-1) argstr += " "; } if(startVerbose) { cerr << "Entered main(): "; logTime(cerr, true); } parseOptions(argc, argv); argv0 = argv[0]; if(showVersion) { cout << argv0 << " version " << HISAT2_VERSION << endl; if(sizeof(void*) == 4) { cout << "32-bit" << endl; } else if(sizeof(void*) == 8) { cout << "64-bit" << endl; } else { cout << "Neither 32- nor 64-bit: sizeof(void*) = " << sizeof(void*) << endl; } cout << "Built on " << BUILD_HOST << endl; cout << BUILD_TIME << endl; cout << "Compiler: " << COMPILER_VERSION << endl; cout << "Options: " << COMPILER_OPTIONS << endl; cout << "Sizeof {int, long, long long, void*, size_t, off_t}: {" << sizeof(int) << ", " << sizeof(long) << ", " << sizeof(long long) << ", " << sizeof(void *) << ", " << sizeof(size_t) << ", " << sizeof(off_t) << "}" << endl; return 0; } { Timer _t(cerr, "Overall time: ", timing); if(startVerbose) { cerr << "Parsing index and read arguments: "; logTime(cerr, true); } // Get index basename (but only if it wasn't specified via --index) if(bt2indexs[0].empty()) { if(optind >= argc) { cerr << "No index, query, or output file specified!" << endl; printUsage(cerr); return 1; } bt2indexs[0] = argv[optind++]; } if (threeN) { bt2indexs[1] = bt2indexs[0]; if (fileExist(bt2indexs[0] + threeN_indexTags[0] + ".1." + gfm_ext)) { bt2indexs[0] += threeN_indexTags[0]; bt2indexs[1] += threeN_indexTags[1]; } else if (fileExist(bt2indexs[0] + ".3n.1.1." + gfm_ext)) { bt2indexs[0] += ".3n.1"; bt2indexs[1] += ".3n.2"; if (!((usrInput_convertedFrom == 'C' && usrInput_convertedTo == 'T') || (usrInput_convertedFrom == 'T' && usrInput_convertedTo == 'C'))) { cerr << "Your current hisat-3n index only support C-to-T or T-to-C base change. 
Please build new hisat-3n index to support " << usrInput_convertedFrom << " to " << usrInput_convertedTo << "change." << endl; printUsage(cerr); return 1; } } else { cerr << "Index is not exist, please use hisat-3n-build to build index first. Please use the same --base-change argument for both hisat-3n-build and hisat-3n." << endl; printUsage(cerr); return 1; } } // Get query filename bool got_reads = !queries.empty() || !mates1.empty() || !mates12.empty(); #ifdef USE_SRA got_reads = got_reads || !sra_accs.empty(); #endif if(minIntronLen > maxIntronLen) { cerr << "--min-intronlen(" << minIntronLen << ") should not be greater than --max-intronlen(" << maxIntronLen << ")" << endl; printUsage(cerr); return 1; } if(optind >= argc) { if(!got_reads) { printUsage(cerr); cerr << "***" << endl #ifdef USE_SRA << "Error: Must specify at least one read input with -U/-1/-2/--sra-acc" << endl; #else << "Error: Must specify at least one read input with -U/-1/-2" << endl; #endif return 1; } } else if(!got_reads) { // Tokenize the list of query files tokenize(argv[optind++], ",", queries); if(queries.empty()) { cerr << "Tokenized query file list was empty!" << endl; printUsage(cerr); return 1; } } // Get output filename if(optind < argc && outfile.empty()) { outfile = argv[optind++]; cerr << "Warning: Output file '" << outfile.c_str() << "' was specified without -S. This will not work in " << "future HISAT 2 versions. Please use -S instead." << endl; } // Extra parametesr? if(optind < argc) { cerr << "Extra parameter(s) specified: "; for(int i = optind; i < argc; i++) { cerr << "\"" << argv[i] << "\""; if(i < argc-1) cerr << ", "; } cerr << endl; if(mates1.size() > 0) { cerr << "Note that if files are specified using -1/-2, a file cannot" << endl << "also be specified. Please run HISAT2 separately for mates and singles." 
<< endl; } throw 1; } // Optionally summarize if(gVerbose) { cout << "Input bt2 file: \"" << bt2indexs[0].c_str() << "\"" << endl; cout << "Input bt2 file: \"" << bt2indexs[1].c_str() << "\"" << endl; cout << "Query inputs (DNA, " << file_format_names[format].c_str() << "):" << endl; for(size_t i = 0; i < queries.size(); i++) { cout << " " << queries[i].c_str() << endl; } cout << "Quality inputs:" << endl; for(size_t i = 0; i < qualities.size(); i++) { cout << " " << qualities[i].c_str() << endl; } cout << "Output file: \"" << outfile.c_str() << "\"" << endl; cout << "Local endianness: " << (currentlyBigEndian()? "big":"little") << endl; cout << "Sanity checking: " << (sanityCheck? "enabled":"disabled") << endl; #ifdef NDEBUG cout << "Assertions: disabled" << endl; #else cout << "Assertions: enabled" << endl; #endif } if(ipause) { cout << "Press key to continue..." << endl; getchar(); } driver >("DNA", bt2indexs, outfile); } return 0; } catch(std::exception& e) { cerr << "Error: Encountered exception: '" << e.what() << "'" << endl; cerr << "Command: "; for(int i = 0; i < argc; i++) cerr << argv[i] << " "; cerr << endl; return 1; } catch(int e) { if(e != 0) { cerr << "Error: Encountered internal HISAT2 exception (#" << e << ")" << endl; cerr << "Command: "; for(int i = 0; i < argc; i++) cerr << argv[i] << " "; cerr << endl; } return e; } } // bowtie() } // extern "C"