971 lines
37 KiB
C++
971 lines
37 KiB
C++
/*
|
||
* Copyright 2015, Daehwan Kim <infphilo@gmail.com>
|
||
*
|
||
* This file is part of HISAT 2.
|
||
*
|
||
* HISAT 2 is free software: you can redistribute it and/or modify
|
||
* it under the terms of the GNU General Public License as published by
|
||
* the Free Software Foundation, either version 3 of the License, or
|
||
* (at your option) any later version.
|
||
*
|
||
* HISAT 2 is distributed in the hope that it will be useful,
|
||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
* GNU General Public License for more details.
|
||
*
|
||
* You should have received a copy of the GNU General Public License
|
||
* along with HISAT 2. If not, see <http://www.gnu.org/licenses/>.
|
||
*/
|
||
|
||
#include <iostream>
|
||
#include <fstream>
|
||
#include <string>
|
||
#include <cassert>
|
||
#include <getopt.h>
|
||
#include "assert_helpers.h"
|
||
#include "endian_swap.h"
|
||
#include "formats.h"
|
||
#include "sequence_io.h"
|
||
#include "tokenize.h"
|
||
#include "timer.h"
|
||
#include "ref_read.h"
|
||
#include "filebuf.h"
|
||
#include "reference.h"
|
||
#include "ds.h"
|
||
#include "gfm.h"
|
||
#include "hgfm.h"
|
||
#include "rfm.h"
|
||
#include "utility_3n.h"
|
||
|
||
|
||
/**
|
||
* \file Driver for the bowtie-build indexing tool.
|
||
*/
|
||
|
||
#include <algorithm>
|
||
#include <cstdlib>
|
||
#include <fstream>
|
||
#include <iostream>
|
||
#include <vector>
|
||
|
||
MemoryTally gMemTally;
|
||
// Build parameters
|
||
int verbose;
|
||
static int sanityCheck;
|
||
static int format;
|
||
static TIndexOffU bmax;
|
||
static TIndexOffU bmaxMultSqrt;
|
||
static uint32_t bmaxDivN;
|
||
static int dcv;
|
||
static int noDc;
|
||
static int entireSA;
|
||
static int seed;
|
||
static int showVersion;
|
||
// GFM parameters
|
||
static int32_t lineRate;
|
||
static bool lineRate_provided;
|
||
static int32_t linesPerSide;
|
||
static int32_t offRate;
|
||
static int32_t ftabChars;
|
||
static int32_t localOffRate;
|
||
static int32_t localFtabChars;
|
||
static int bigEndian;
|
||
static bool nsToAs;
|
||
static bool autoMem;
|
||
static bool packed;
|
||
static bool writeRef;
|
||
static bool justRef;
|
||
static bool reverseEach;
|
||
static int nthreads; // number of pthreads operating concurrently
|
||
static string wrapper;
|
||
static string snp_fname;
|
||
static string ht_fname;
|
||
static string ss_fname;
|
||
static string exon_fname;
|
||
static string sv_fname;
|
||
static string repeat_ref_fname;
|
||
static string repeat_info_fname;
|
||
static string repeat_snp_fname;
|
||
static string repeat_haplotype_fname;
|
||
|
||
bool threeN = false;
|
||
bool repeatIndex = false;
|
||
bool base_change_entered;
|
||
char convertedFrom;
|
||
char convertedTo;
|
||
char convertedFromComplement;
|
||
char convertedToComplement;
|
||
|
||
ConvertMatrix3N baseChange;
|
||
|
||
static void resetOptions() {
|
||
verbose = true; // be talkative (default)
|
||
sanityCheck = 0; // do slow sanity checks
|
||
format = FASTA; // input sequence format
|
||
bmax = OFF_MASK; // max blockwise SA bucket size
|
||
bmaxMultSqrt = OFF_MASK; // same, as multplier of sqrt(n)
|
||
bmaxDivN = 4; // same, as divisor of n
|
||
dcv = 1024; // bwise SA difference-cover sample sz
|
||
noDc = 0; // disable difference-cover sample
|
||
entireSA = 0; // 1 = disable blockwise SA
|
||
seed = 0; // srandom seed
|
||
showVersion = 0; // just print version and quit?
|
||
// GFM parameters
|
||
lineRate = GFM<TIndexOffU>::default_lineRate_gfm;
|
||
lineRate_provided = false;
|
||
linesPerSide = 1; // 1 64-byte line on a side
|
||
offRate = 4; // sample 1 out of 16 SA elts
|
||
ftabChars = 10; // 10 chars in initial lookup table
|
||
localOffRate = 3;
|
||
localFtabChars = 6;
|
||
bigEndian = 0; // little endian
|
||
nsToAs = false; // convert reference Ns to As prior to indexing
|
||
autoMem = true; // automatically adjust memory usage parameters
|
||
packed = false; //
|
||
writeRef = true; // write compact reference to .3.ht2/.4.ht2
|
||
justRef = false; // *just* write compact reference, don't index
|
||
reverseEach = false;
|
||
nthreads = 1;
|
||
wrapper.clear();
|
||
snp_fname = "";
|
||
ht_fname = "";
|
||
ss_fname = "";
|
||
exon_fname = "";
|
||
sv_fname = "";
|
||
repeat_ref_fname = "";
|
||
repeat_info_fname = "";
|
||
repeat_snp_fname = "";
|
||
repeat_haplotype_fname = "";
|
||
threeN = false;
|
||
repeatIndex = false;
|
||
base_change_entered = false;
|
||
convertedFrom = 'C';
|
||
convertedTo = 'T';
|
||
convertedFromComplement = asc2dnacomp[convertedFrom];
|
||
convertedToComplement = asc2dnacomp[convertedTo];
|
||
}
|
||
|
||
// Argument constants for getopts
|
||
enum {
|
||
ARG_BMAX = 256,
|
||
ARG_BMAX_MULT,
|
||
ARG_BMAX_DIV,
|
||
ARG_DCV,
|
||
ARG_SEED,
|
||
ARG_CUTOFF,
|
||
ARG_PMAP,
|
||
ARG_NTOA,
|
||
ARG_USAGE,
|
||
ARG_REVERSE_EACH,
|
||
ARG_SA,
|
||
ARG_WRAPPER,
|
||
ARG_LOCAL_OFFRATE,
|
||
ARG_LOCAL_FTABCHARS,
|
||
ARG_SNP,
|
||
ARG_HAPLOTYPE,
|
||
ARG_SPLICESITE,
|
||
ARG_EXON,
|
||
ARG_SV,
|
||
ARG_REPEAT_REF,
|
||
ARG_REPEAT_INFO,
|
||
ARG_REPEAT_SNP,
|
||
ARG_REPEAT_HAPLOTYPE,
|
||
ARG_3N,
|
||
ARG_REPEAT_INDEX,
|
||
ARG_BASE_CHANGE
|
||
};
|
||
|
||
/**
|
||
* Print a detailed usage message to the provided output stream.
|
||
*/
|
||
static void printUsage(ostream& out) {
|
||
out << "HISAT2 version " << string(HISAT2_VERSION).c_str() << " by Daehwan Kim (infphilo@gmail.com, http://www.ccb.jhu.edu/people/infphilo)" << endl;
|
||
|
||
#ifdef BOWTIE_64BIT_INDEX
|
||
string tool_name = "hisat2-build-l";
|
||
#else
|
||
string tool_name = "hisat2-build-s";
|
||
#endif
|
||
if(wrapper == "basic-0") {
|
||
tool_name = "hisat2-build";
|
||
}
|
||
|
||
out << "Usage: hisat2-build [options]* <reference_in> <ht2_index_base>" << endl
|
||
<< " reference_in comma-separated list of files with ref sequences" << endl
|
||
<< " hisat2_index_base write " << gfm_ext << " data to files with this dir/basename" << endl
|
||
<< "Options:" << endl
|
||
<< " -c reference sequences given on cmd line (as" << endl
|
||
<< " <reference_in>)" << endl;
|
||
if(wrapper == "basic-0") {
|
||
out << " --large-index force generated index to be 'large', even if ref" << endl
|
||
<< " has fewer than 4 billion nucleotides" << endl;
|
||
}
|
||
out << " -a/--noauto disable automatic -p/--bmax/--dcv memory-fitting" << endl
|
||
<< " -p <int> number of threads" << endl
|
||
<< " --bmax <int> max bucket sz for blockwise suffix-array builder" << endl
|
||
<< " --bmaxdivn <int> max bucket sz as divisor of ref len (default: 4)" << endl
|
||
<< " --dcv <int> diff-cover period for blockwise (default: 1024)" << endl
|
||
<< " --nodc disable diff-cover (algorithm becomes quadratic)" << endl
|
||
<< " -r/--noref don't build .3/.4.ht2 (packed reference) portion" << endl
|
||
<< " -3/--justref just build .3/.4.ht2 (packed reference) portion" << endl
|
||
<< " -o/--offrate <int> SA is sampled every 2^offRate BWT chars (default: 5)" << endl
|
||
<< " -t/--ftabchars <int> # of chars consumed in initial lookup (default: 10)" << endl
|
||
<< " --localoffrate <int> SA (local) is sampled every 2^offRate BWT chars (default: 3)" << endl
|
||
<< " --localftabchars <int> # of chars consumed in initial lookup in a local index (default: 6)" << endl
|
||
<< " --snp <path> SNP file name" << endl
|
||
<< " --haplotype <path> haplotype file name" << endl
|
||
<< " --ss <path> Splice site file name" << endl
|
||
<< " --exon <path> Exon file name" << endl
|
||
<< " --repeat-ref <path> Repeat reference file name" << endl
|
||
<< " --repeat-info <path> Repeat information file name" << endl
|
||
<< " --repeat-snp <path> Repeat snp file name" << endl
|
||
<< " --repeat-haplotype <path> Repeat haplotype file name" << endl
|
||
<< " --seed <int> seed for random number generator" << endl
|
||
<< " --base-change <chr,chr> the converted nucleotide and converted to nucleotide (default:C,T)" << endl
|
||
<< " --repeat-index<int>-<int>[,<int>-<int>] automatically build repeat database and repeat index, enter the minimum-maximum repeat length pairs (default: 100-300)" << endl
|
||
<< " -q/--quiet disable verbose output (for debugging)" << endl
|
||
<< " -h/--help print detailed description of tool and its options" << endl
|
||
<< " --usage print this usage message" << endl
|
||
<< " --version print version information and quit" << endl
|
||
;
|
||
|
||
if(wrapper.empty()) {
|
||
cerr << endl
|
||
<< "*** Warning ***" << endl
|
||
<< "'" << tool_name << "' was run directly. It is recommended "
|
||
<< "that you run the wrapper script 'hisat2-build' instead."
|
||
<< endl << endl;
|
||
}
|
||
}
|
||
|
||
static const char *short_options = "qrap:h?nscfl:i:o:t:h:3C";
|
||
|
||
static struct option long_options[] = {
|
||
{(char*)"quiet", no_argument, 0, 'q'},
|
||
{(char*)"sanity", no_argument, 0, 's'},
|
||
{(char*)"threads", required_argument, 0, 'p'},
|
||
{(char*)"little", no_argument, &bigEndian, 0},
|
||
{(char*)"big", no_argument, &bigEndian, 1},
|
||
{(char*)"bmax", required_argument, 0, ARG_BMAX},
|
||
{(char*)"bmaxmultsqrt", required_argument, 0, ARG_BMAX_MULT},
|
||
{(char*)"bmaxdivn", required_argument, 0, ARG_BMAX_DIV},
|
||
{(char*)"dcv", required_argument, 0, ARG_DCV},
|
||
{(char*)"nodc", no_argument, &noDc, 1},
|
||
{(char*)"seed", required_argument, 0, ARG_SEED},
|
||
{(char*)"entiresa", no_argument, &entireSA, 1},
|
||
{(char*)"version", no_argument, &showVersion, 1},
|
||
{(char*)"noauto", no_argument, 0, 'a'},
|
||
{(char*)"noblocks", required_argument, 0, 'n'},
|
||
{(char*)"linerate", required_argument, 0, 'l'},
|
||
{(char*)"linesperside", required_argument, 0, 'i'},
|
||
{(char*)"offrate", required_argument, 0, 'o'},
|
||
{(char*)"ftabchars", required_argument, 0, 't'},
|
||
{(char*)"localoffrate", required_argument, 0, ARG_LOCAL_OFFRATE},
|
||
{(char*)"localftabchars", required_argument, 0, ARG_LOCAL_FTABCHARS},
|
||
{(char*)"snp", required_argument, 0, ARG_SNP},
|
||
{(char*)"haplotype", required_argument, 0, ARG_HAPLOTYPE},
|
||
{(char*)"ss", required_argument, 0, ARG_SPLICESITE},
|
||
{(char*)"exon", required_argument, 0, ARG_EXON},
|
||
{(char*)"sv", required_argument, 0, ARG_SV},
|
||
{(char*)"repeat-ref", required_argument, 0, ARG_REPEAT_REF},
|
||
{(char*)"repeat-info", required_argument, 0, ARG_REPEAT_INFO},
|
||
{(char*)"repeat-snp", required_argument, 0, ARG_REPEAT_SNP},
|
||
{(char*)"repeat-haplotype", required_argument, 0, ARG_REPEAT_HAPLOTYPE},
|
||
{(char*)"help", no_argument, 0, 'h'},
|
||
{(char*)"ntoa", no_argument, 0, ARG_NTOA},
|
||
{(char*)"justref", no_argument, 0, '3'},
|
||
{(char*)"noref", no_argument, 0, 'r'},
|
||
{(char*)"sa", no_argument, 0, ARG_SA},
|
||
{(char*)"reverse-each", no_argument, 0, ARG_REVERSE_EACH},
|
||
{(char*)"usage", no_argument, 0, ARG_USAGE},
|
||
{(char*)"wrapper", required_argument, 0, ARG_WRAPPER},
|
||
{(char*)"3N", no_argument, 0, ARG_3N},
|
||
{(char*)"repeat-index", no_argument, 0, ARG_REPEAT_INDEX},
|
||
{(char*)"base-change", required_argument, 0, ARG_BASE_CHANGE},
|
||
{(char*)0, 0, 0, 0} // terminator
|
||
};
|
||
|
||
/**
|
||
* Parse an int out of optarg and enforce that it be at least 'lower';
|
||
* if it is less than 'lower', then output the given error message and
|
||
* exit with an error and a usage message.
|
||
*/
|
||
template<typename T>
|
||
static T parseNumber(T lower, const char *errmsg) {
|
||
char *endPtr= NULL;
|
||
T t = (T)strtoll(optarg, &endPtr, 10);
|
||
if (endPtr != NULL) {
|
||
if (t < lower) {
|
||
cerr << errmsg << endl;
|
||
printUsage(cerr);
|
||
throw 1;
|
||
}
|
||
return t;
|
||
}
|
||
cerr << errmsg << endl;
|
||
printUsage(cerr);
|
||
throw 1;
|
||
return -1;
|
||
}
|
||
|
||
/**
|
||
* Read command-line arguments
|
||
*/
|
||
static void parseOptions(int argc, const char **argv) {
|
||
int option_index = 0;
|
||
int next_option;
|
||
do {
|
||
next_option = getopt_long(
|
||
argc, const_cast<char**>(argv),
|
||
short_options, long_options, &option_index);
|
||
switch (next_option) {
|
||
case ARG_WRAPPER:
|
||
wrapper = optarg;
|
||
break;
|
||
case 'f': format = FASTA; break;
|
||
case 'c': format = CMDLINE; break;
|
||
//case 'p': packed = true; break;
|
||
case 'C':
|
||
cerr << "Error: -C specified but Bowtie 2 does not support colorspace input." << endl;
|
||
throw 1;
|
||
break;
|
||
case 'l':
|
||
lineRate = parseNumber<int>(3, "-l/--lineRate arg must be at least 3");
|
||
lineRate_provided = true;
|
||
break;
|
||
case 'i':
|
||
linesPerSide = parseNumber<int>(1, "-i/--linesPerSide arg must be at least 1");
|
||
break;
|
||
case 'o':
|
||
offRate = parseNumber<int>(0, "-o/--offRate arg must be at least 0");
|
||
break;
|
||
case ARG_LOCAL_OFFRATE:
|
||
localOffRate = parseNumber<int>(0, "-o/--localoffrate arg must be at least 0");
|
||
break;
|
||
case '3':
|
||
justRef = true;
|
||
break;
|
||
case 't':
|
||
ftabChars = parseNumber<int>(1, "-t/--ftabChars arg must be at least 1");
|
||
break;
|
||
case ARG_LOCAL_FTABCHARS:
|
||
localFtabChars = parseNumber<int>(1, "-t/--localftabchars arg must be at least 1");
|
||
break;
|
||
case 'n':
|
||
// all f-s is used to mean "not set", so put 'e' on end
|
||
bmax = 0xfffffffe;
|
||
break;
|
||
case 'h':
|
||
case ARG_USAGE:
|
||
printUsage(cout);
|
||
throw 0;
|
||
break;
|
||
case ARG_SNP:
|
||
snp_fname = optarg;
|
||
break;
|
||
case ARG_HAPLOTYPE:
|
||
ht_fname = optarg;
|
||
break;
|
||
case ARG_SPLICESITE:
|
||
ss_fname = optarg;
|
||
break;
|
||
case ARG_EXON:
|
||
exon_fname = optarg;
|
||
break;
|
||
case ARG_SV:
|
||
sv_fname = optarg;
|
||
break;
|
||
case ARG_REPEAT_REF:
|
||
repeat_ref_fname = optarg;
|
||
break;
|
||
case ARG_REPEAT_INFO:
|
||
repeat_info_fname = optarg;
|
||
break;
|
||
case ARG_REPEAT_SNP:
|
||
repeat_snp_fname = optarg;
|
||
break;
|
||
case ARG_REPEAT_HAPLOTYPE:
|
||
repeat_haplotype_fname = optarg;
|
||
break;
|
||
case ARG_BMAX:
|
||
bmax = parseNumber<TIndexOffU>(1, "--bmax arg must be at least 1");
|
||
bmaxMultSqrt = OFF_MASK; // don't use multSqrt
|
||
bmaxDivN = 0xffffffff; // don't use multSqrt
|
||
break;
|
||
case ARG_BMAX_MULT:
|
||
bmaxMultSqrt = parseNumber<TIndexOffU>(1, "--bmaxmultsqrt arg must be at least 1");
|
||
bmax = OFF_MASK; // don't use bmax
|
||
bmaxDivN = 0xffffffff; // don't use multSqrt
|
||
break;
|
||
case ARG_BMAX_DIV:
|
||
bmaxDivN = parseNumber<uint32_t>(1, "--bmaxdivn arg must be at least 1");
|
||
bmax = OFF_MASK; // don't use bmax
|
||
bmaxMultSqrt = OFF_MASK; // don't use multSqrt
|
||
break;
|
||
case ARG_DCV:
|
||
dcv = parseNumber<int>(3, "--dcv arg must be at least 3");
|
||
break;
|
||
case ARG_SEED:
|
||
seed = parseNumber<int>(0, "--seed arg must be at least 0");
|
||
break;
|
||
case ARG_REVERSE_EACH:
|
||
reverseEach = true;
|
||
break;
|
||
case ARG_NTOA: nsToAs = true; break;
|
||
case ARG_3N: threeN = true; break;
|
||
case ARG_REPEAT_INDEX: repeatIndex = true; break;
|
||
case ARG_BASE_CHANGE: {
|
||
EList<string> args;
|
||
tokenize(optarg, ",", args);
|
||
if(args.size() != 2) {
|
||
cerr << "Error: expected 2 comma-separated "
|
||
<< "arguments to --base-change option, got " << args.size() << endl;
|
||
throw 1;
|
||
}
|
||
|
||
getConversion(args[0][0], args[1][0], convertedFrom, convertedTo);
|
||
|
||
string s = "ACGT";
|
||
if ((s.find(convertedFrom) == std::string::npos) || (s.find(convertedTo) == std::string::npos)) {
|
||
cerr << "Please enter the nucleotide in 'ACGT' for --base-change option." << endl;
|
||
throw 1;
|
||
}
|
||
|
||
if (convertedFrom == convertedTo) {
|
||
cerr << "Please enter two different base for --base-change option. If you wish to build index without nucleotide conversion, please use hisat2-build." << endl;
|
||
throw 1;
|
||
}
|
||
|
||
base_change_entered = true;
|
||
}
|
||
case 'a': autoMem = false; break;
|
||
case 'q': verbose = false; break;
|
||
case 's': sanityCheck = true; break;
|
||
case 'r': writeRef = false; break;
|
||
case 'p':
|
||
nthreads = parseNumber<int>(1, "-p arg must be at least 1");
|
||
break;
|
||
|
||
case -1: /* Done with options. */
|
||
break;
|
||
case 0:
|
||
if (long_options[option_index].flag != 0)
|
||
break;
|
||
default:
|
||
printUsage(cerr);
|
||
throw 1;
|
||
}
|
||
} while(next_option != -1);
|
||
if(bmax < 40) {
|
||
cerr << "Warning: specified bmax is very small (" << bmax << "). This can lead to" << endl
|
||
<< "extremely slow performance and memory exhaustion. Perhaps you meant to specify" << endl
|
||
<< "a small --bmaxdivn?" << endl;
|
||
}
|
||
}
|
||
|
||
EList<string> filesWritten;
|
||
|
||
/**
|
||
* Delete all the index files that we tried to create. For when we had to
|
||
* abort the index-building process due to an error.
|
||
*/
|
||
static void deleteIdxFiles(
|
||
const string& outfile,
|
||
bool doRef,
|
||
bool justRef)
|
||
{
|
||
|
||
for(size_t i = 0; i < filesWritten.size(); i++) {
|
||
cerr << "Deleting \"" << filesWritten[i].c_str()
|
||
<< "\" file written during aborted indexing attempt." << endl;
|
||
remove(filesWritten[i].c_str());
|
||
}
|
||
}
|
||
|
||
extern void initializeCntLut();
|
||
extern void initializeCntBit();
|
||
|
||
|
||
/**
|
||
* Drive the index construction process and optionally sanity-check the
|
||
* result.
|
||
*/
|
||
template<typename TStr>
|
||
static void driver(
|
||
const string& infile,
|
||
EList<string>& infiles,
|
||
const string& snpfile,
|
||
const string& htfile,
|
||
const string& ssfile,
|
||
const string& exonfile,
|
||
const string& svfile,
|
||
const string& repeatfile,
|
||
const string& outfile,
|
||
bool packed,
|
||
int reverse,
|
||
bool localindex = true,
|
||
EList<RefRecord>* parent_szs = NULL,
|
||
EList<string>* parent_refnames = NULL,
|
||
EList<RefRecord>* output_szs = NULL,
|
||
EList<string>* output_refnames = NULL)
|
||
{
|
||
initializeCntLut();
|
||
initializeCntBit();
|
||
EList<FileBuf*> is(MISC_CAT);
|
||
bool bisulfite = false;
|
||
bool repeat = parent_szs != NULL;
|
||
RefReadInParams refparams(false, reverse, nsToAs, bisulfite);
|
||
assert_gt(infiles.size(), 0);
|
||
if(format == CMDLINE) {
|
||
// Adapt sequence strings to stringstreams open for input
|
||
stringstream *ss = new stringstream();
|
||
for(size_t i = 0; i < infiles.size(); i++) {
|
||
(*ss) << ">" << i << endl << infiles[i].c_str() << endl;
|
||
}
|
||
FileBuf *fb = new FileBuf(ss);
|
||
assert(fb != NULL);
|
||
assert(!fb->eof());
|
||
assert(fb->get() == '>');
|
||
ASSERT_ONLY(fb->reset());
|
||
assert(!fb->eof());
|
||
is.push_back(fb);
|
||
} else {
|
||
// Adapt sequence files to ifstreams
|
||
for(size_t i = 0; i < infiles.size(); i++) {
|
||
FILE *f = fopen(infiles[i].c_str(), "r");
|
||
if (f == NULL) {
|
||
cerr << "Error: could not open "<< infiles[i].c_str() << endl;
|
||
throw 1;
|
||
}
|
||
FileBuf *fb = new FileBuf(f);
|
||
assert(fb != NULL);
|
||
if(fb->peek() == -1 || fb->eof()) {
|
||
cerr << "Warning: Empty fasta file: '" << infile.c_str() << "'" << endl;
|
||
continue;
|
||
}
|
||
assert(!fb->eof());
|
||
assert(fb->get() == '>');
|
||
ASSERT_ONLY(fb->reset());
|
||
assert(!fb->eof());
|
||
is.push_back(fb);
|
||
}
|
||
}
|
||
if(is.empty()) {
|
||
cerr << "Warning: All fasta inputs were empty" << endl;
|
||
throw 1;
|
||
}
|
||
filesWritten.push_back(outfile + ".1." + gfm_ext);
|
||
filesWritten.push_back(outfile + ".2." + gfm_ext);
|
||
// Vector for the ordered list of "records" comprising the input
|
||
// sequences. A record represents a stretch of unambiguous
|
||
// characters in one of the input sequences.
|
||
EList<RefRecord> szs(MISC_CAT);
|
||
std::pair<size_t, size_t> sztot;
|
||
{
|
||
if(verbose) cerr << "Reading reference sizes" << endl;
|
||
Timer _t(cerr, " Time reading reference sizes: ", verbose);
|
||
if(!reverse && (writeRef || justRef)) {
|
||
filesWritten.push_back(outfile + ".3." + gfm_ext);
|
||
filesWritten.push_back(outfile + ".4." + gfm_ext);
|
||
sztot = BitPairReference::szsFromFasta(is, outfile, bigEndian, refparams, szs, sanityCheck);
|
||
if (threeN) {
|
||
// save the unchanged reference in .3.ht2 and .4.ht2
|
||
baseChange.restoreNormal();
|
||
EList<RefRecord> tmp_szs(MISC_CAT);
|
||
BitPairReference::szsFromFasta(is, outfile, bigEndian, refparams, tmp_szs, sanityCheck);
|
||
baseChange.restoreConversion();
|
||
}
|
||
} else {
|
||
assert(false);
|
||
sztot = BitPairReference::szsFromFasta(is, string(), bigEndian, refparams, szs, sanityCheck);
|
||
}
|
||
}
|
||
if(justRef) return;
|
||
assert_gt(sztot.first, 0);
|
||
assert_gt(sztot.second, 0);
|
||
assert_gt(szs.size(), 0);
|
||
|
||
// Construct index from input strings and parameters
|
||
filesWritten.push_back(outfile + ".5." + gfm_ext);
|
||
filesWritten.push_back(outfile + ".6." + gfm_ext);
|
||
filesWritten.push_back(outfile + ".7." + gfm_ext);
|
||
filesWritten.push_back(outfile + ".8." + gfm_ext);
|
||
TStr s;
|
||
GFM<TIndexOffU>* gfm = NULL;
|
||
if(!repeat) { // base index
|
||
gfm = new HGFM<TIndexOffU>(
|
||
s,
|
||
packed,
|
||
1, // TODO: maybe not?
|
||
lineRate,
|
||
offRate, // suffix-array sampling rate
|
||
ftabChars, // number of chars in initial arrow-pair calc
|
||
localOffRate,
|
||
localFtabChars,
|
||
nthreads,
|
||
snpfile,
|
||
htfile,
|
||
ssfile,
|
||
exonfile,
|
||
svfile,
|
||
repeatfile,
|
||
outfile, // basename for .?.ht2 files
|
||
reverse == 0, // fw
|
||
!entireSA, // useBlockwise
|
||
bmax, // block size for blockwise SA builder
|
||
bmaxMultSqrt, // block size as multiplier of sqrt(len)
|
||
bmaxDivN, // block size as divisor of len
|
||
noDc? 0 : dcv,// difference-cover period
|
||
is, // list of input streams
|
||
szs, // list of reference sizes
|
||
(TIndexOffU)sztot.first, // total size of all unambiguous ref chars
|
||
refparams, // reference read-in parameters
|
||
localindex, // create local indexes?
|
||
parent_szs, // parent szs
|
||
parent_refnames, // parent refence names
|
||
seed, // pseudo-random number generator seed
|
||
-1, // override offRate
|
||
verbose, // be talkative
|
||
autoMem, // pass exceptions up to the toplevel so that we can adjust memory settings automatically
|
||
sanityCheck); // verify results and internal consistency
|
||
} else { // repeat index
|
||
gfm = new RFM<TIndexOffU>(
|
||
s,
|
||
packed,
|
||
1, // TODO: maybe not?
|
||
lineRate,
|
||
offRate, // suffix-array sampling rate
|
||
ftabChars, // number of chars in initial arrow-pair calc
|
||
localOffRate,
|
||
localFtabChars,
|
||
nthreads,
|
||
snpfile,
|
||
htfile,
|
||
ssfile,
|
||
exonfile,
|
||
svfile,
|
||
repeatfile,
|
||
outfile, // basename for .?.ht2 files
|
||
reverse == 0, // fw
|
||
!entireSA, // useBlockwise
|
||
bmax, // block size for blockwise SA builder
|
||
bmaxMultSqrt, // block size as multiplier of sqrt(len)
|
||
bmaxDivN, // block size as divisor of len
|
||
noDc? 0 : dcv,// difference-cover period
|
||
is, // list of input streams
|
||
szs, // list of reference sizes
|
||
(TIndexOffU)sztot.first, // total size of all unambiguous ref chars
|
||
refparams, // reference read-in parameters
|
||
localindex, // create local indexes?
|
||
parent_szs, // parent szs
|
||
parent_refnames, // parent refence names
|
||
seed, // pseudo-random number generator seed
|
||
-1, // override offRate
|
||
verbose, // be talkative
|
||
autoMem, // pass exceptions up to the toplevel so that we can adjust memory settings automatically
|
||
sanityCheck); // verify results and internal consistency
|
||
}
|
||
|
||
if(output_szs != NULL) {
|
||
*output_szs = szs;
|
||
}
|
||
if(output_refnames != NULL) {
|
||
*output_refnames = gfm->_refnames_nospace;
|
||
}
|
||
|
||
// Note that the Ebwt is *not* resident in memory at this time. To
|
||
// load it into memory, call ebwt.loadIntoMemory()
|
||
if(verbose) {
|
||
// Print Ebwt's vital stats
|
||
gfm->gh().print(cerr);
|
||
}
|
||
if(sanityCheck) {
|
||
// Try restoring the original string (if there were
|
||
// multiple texts, what we'll get back is the joined,
|
||
// padded string, not a list)
|
||
gfm->loadIntoMemory(
|
||
reverse ? (refparams.reverse == REF_READ_REVERSE) : 0,
|
||
true, // load SA sample?
|
||
true, // load ftab?
|
||
true, // load rstarts?
|
||
false,
|
||
false);
|
||
SString<char> s2;
|
||
gfm->restore(s2);
|
||
gfm->evictFromMemory();
|
||
{
|
||
SString<char> joinedss;
|
||
GFM<>::join<SString<char> >(
|
||
is, // list of input streams
|
||
szs, // list of reference sizes
|
||
(TIndexOffU)sztot.first, // total size of all unambiguous ref chars
|
||
refparams, // reference read-in parameters
|
||
seed, // pseudo-random number generator seed
|
||
joinedss);
|
||
if(refparams.reverse == REF_READ_REVERSE) {
|
||
joinedss.reverse();
|
||
}
|
||
assert_eq(joinedss.length(), s2.length());
|
||
assert(sstr_eq(joinedss, s2));
|
||
}
|
||
if(verbose) {
|
||
if(s2.length() < 1000) {
|
||
cout << "Passed restore check: " << s2.toZBuf() << endl;
|
||
} else {
|
||
cout << "Passed restore check: (" << s2.length() << " chars)" << endl;
|
||
}
|
||
}
|
||
}
|
||
|
||
delete gfm;
|
||
}
|
||
|
||
static const char *argv0 = NULL;
|
||
|
||
extern "C" {
|
||
/**
|
||
* main function. Parses command-line arguments.
|
||
*/
|
||
int hisat2_build(int argc, const char **argv) {
|
||
string outfile;
|
||
try {
|
||
// Reset all global state, including getopt state
|
||
opterr = optind = 1;
|
||
resetOptions();
|
||
|
||
string infile;
|
||
EList<string> infiles(MISC_CAT);
|
||
|
||
parseOptions(argc, argv);
|
||
argv0 = argv[0];
|
||
if(showVersion) {
|
||
cout << argv0 << " version " << string(HISAT2_VERSION).c_str() << endl;
|
||
if(sizeof(void*) == 4) {
|
||
cout << "32-bit" << endl;
|
||
} else if(sizeof(void*) == 8) {
|
||
cout << "64-bit" << endl;
|
||
} else {
|
||
cout << "Neither 32- nor 64-bit: sizeof(void*) = " << sizeof(void*) << endl;
|
||
}
|
||
cout << "Built on " << BUILD_HOST << endl;
|
||
cout << BUILD_TIME << endl;
|
||
cout << "Compiler: " << COMPILER_VERSION << endl;
|
||
cout << "Options: " << COMPILER_OPTIONS << endl;
|
||
cout << "Sizeof {int, long, long long, void*, size_t, off_t}: {"
|
||
<< sizeof(int)
|
||
<< ", " << sizeof(long) << ", " << sizeof(long long)
|
||
<< ", " << sizeof(void *) << ", " << sizeof(size_t)
|
||
<< ", " << sizeof(off_t) << "}" << endl;
|
||
return 0;
|
||
}
|
||
|
||
if (!threeN && base_change_entered) {
|
||
cerr << "Please do not use --base-change for hisat2-build. To build hisat-3n index, please use hisat-3n-build." << endl;
|
||
printUsage(cerr);
|
||
throw 1;
|
||
}
|
||
if (threeN) {
|
||
convertedFromComplement = asc2dnacomp[convertedFrom];
|
||
convertedToComplement = asc2dnacomp[convertedTo];
|
||
}
|
||
// Get input filename
|
||
if(optind >= argc) {
|
||
cerr << "No input sequence or sequence file specified!" << endl;
|
||
printUsage(cerr);
|
||
return 1;
|
||
}
|
||
infile = argv[optind++];
|
||
|
||
// Get output filename
|
||
if(optind >= argc) {
|
||
cerr << "No output file specified!" << endl;
|
||
printUsage(cerr);
|
||
return 1;
|
||
}
|
||
outfile = argv[optind++];
|
||
|
||
tokenize(infile, ",", infiles);
|
||
if(infiles.size() < 1) {
|
||
cerr << "Tokenized input file list was empty!" << endl;
|
||
printUsage(cerr);
|
||
return 1;
|
||
}
|
||
|
||
if(!lineRate_provided) {
|
||
if(snp_fname == "" && ss_fname == "" && exon_fname == "") {
|
||
lineRate = GFM<TIndexOffU>::default_lineRate_fm;
|
||
} else {
|
||
lineRate = GFM<TIndexOffU>::default_lineRate_gfm;
|
||
}
|
||
}
|
||
|
||
// Optionally summarize
|
||
if(verbose) {
|
||
cerr << "Settings:" << endl
|
||
<< " Output files: \"" << outfile.c_str() << (threeN?".3n":"") << ".*." << gfm_ext << "\"" << endl
|
||
<< " Line rate: " << lineRate << " (line is " << (1<<lineRate) << " bytes)" << endl
|
||
<< " Lines per side: " << linesPerSide << " (side is " << ((1<<lineRate)*linesPerSide) << " bytes)" << endl
|
||
<< " Offset rate: " << offRate << " (one in " << (1<<offRate) << ")" << endl
|
||
<< " FTable chars: " << ftabChars << endl
|
||
<< " Strings: " << (packed? "packed" : "unpacked") << endl
|
||
<< " Local offset rate: " << localOffRate << " (one in " << (1<<localOffRate) << ")" << endl
|
||
<< " Local fTable chars: " << localFtabChars << endl
|
||
<< " Local sequence length: " << local_index_size << endl
|
||
<< " Local sequence overlap between two consecutive indexes: " << local_index_overlap << endl;
|
||
#if 0
|
||
if(bmax == OFF_MASK) {
|
||
cerr << " Max bucket size: default" << endl;
|
||
} else {
|
||
cerr << " Max bucket size: " << bmax << endl;
|
||
}
|
||
if(bmaxMultSqrt == OFF_MASK) {
|
||
cerr << " Max bucket size, sqrt multiplier: default" << endl;
|
||
} else {
|
||
cerr << " Max bucket size, sqrt multiplier: " << bmaxMultSqrt << endl;
|
||
}
|
||
if(bmaxDivN == 0xffffffff) {
|
||
cerr << " Max bucket size, len divisor: default" << endl;
|
||
} else {
|
||
cerr << " Max bucket size, len divisor: " << bmaxDivN << endl;
|
||
}
|
||
cerr << " Difference-cover sample period: " << dcv << endl;
|
||
#endif
|
||
cerr << " Endianness: " << (bigEndian? "big":"little") << endl
|
||
<< " Actual local endianness: " << (currentlyBigEndian()? "big":"little") << endl
|
||
<< " Sanity checking: " << (sanityCheck? "enabled":"disabled") << endl;
|
||
#ifdef NDEBUG
|
||
cerr << " Assertions: disabled" << endl;
|
||
#else
|
||
cerr << " Assertions: enabled" << endl;
|
||
#endif
|
||
cerr << " Random seed: " << seed << endl;
|
||
cerr << " Sizeofs: void*:" << sizeof(void*) << ", int:" << sizeof(int) << ", long:" << sizeof(long) << ", size_t:" << sizeof(size_t) << endl;
|
||
cerr << "Input files DNA, " << file_format_names[format].c_str() << ":" << endl;
|
||
for(size_t i = 0; i < infiles.size(); i++) {
|
||
cerr << " " << infiles[i].c_str() << endl;
|
||
}
|
||
}
|
||
// Seed random number generator
|
||
srand(seed);
|
||
|
||
{
|
||
Timer timer(cerr, "Total time for call to driver() for forward index: ", verbose);
|
||
try {
|
||
EList<RefRecord> parent_szs(MISC_CAT);
|
||
EList<string> parent_refnames;
|
||
string dummy_fname = "";
|
||
|
||
int nloop = threeN ? 2 : 1; // if threeN == true, nloop = 2. else one loop
|
||
for (int i = 0; i < nloop; i++) {
|
||
string tag = "";
|
||
if (threeN) {
|
||
tag += ".3n.";
|
||
if (i == 0) {
|
||
tag += convertedFrom;
|
||
tag += convertedTo;
|
||
baseChange.convert(convertedFrom, convertedTo);
|
||
} else {
|
||
tag += convertedFromComplement;
|
||
tag += convertedToComplement;
|
||
baseChange.convert(convertedFromComplement, convertedToComplement);
|
||
}
|
||
|
||
string indexFilename = outfile + tag + ".6.ht2";
|
||
if (fileExist(indexFilename)) {
|
||
cerr << "*** Find index for " << outfile + tag << ",skip this index building process." << endl;
|
||
cerr << " To re-build your hisat-3n index, please delete the old index manually before running hisat-3n-build." << endl;
|
||
continue;
|
||
}
|
||
}
|
||
|
||
driver<SString<char> >(infile,
|
||
infiles,
|
||
snp_fname,
|
||
ht_fname,
|
||
ss_fname,
|
||
exon_fname,
|
||
sv_fname,
|
||
dummy_fname,
|
||
outfile + tag,
|
||
false,
|
||
REF_READ_FORWARD,
|
||
true, // create local indexes
|
||
NULL, // no parent szs
|
||
NULL, // no parent refnames
|
||
&parent_szs, // get parent szs
|
||
&parent_refnames); // get parent refnames
|
||
|
||
if(repeat_ref_fname.length() > 0) {
|
||
string repeat_ref_fname_3N;
|
||
string repeat_info_fname_3N;
|
||
if (threeN) {
|
||
repeat_ref_fname_3N = repeat_ref_fname + tag + ".rep.fa";
|
||
repeat_info_fname_3N = repeat_info_fname + tag + ".rep.info";
|
||
}
|
||
EList<string> repeat_infiles(MISC_CAT);
|
||
tokenize(repeat_ref_fname_3N, ",", repeat_infiles);
|
||
driver<SString<char> >(repeat_ref_fname_3N,
|
||
repeat_infiles,
|
||
repeat_snp_fname,
|
||
repeat_haplotype_fname,
|
||
dummy_fname,
|
||
dummy_fname,
|
||
dummy_fname,
|
||
repeat_info_fname_3N,
|
||
outfile + tag + ".rep",
|
||
false,
|
||
REF_READ_FORWARD,
|
||
true, // create local index?
|
||
&parent_szs,
|
||
&parent_refnames);
|
||
} else if (repeatIndex) {
|
||
string repeat_ref_fname_3N = outfile + tag + ".rep.fa";
|
||
string repeat_info_fname_3N = outfile + tag + ".rep.info";
|
||
EList<string> repeat_infiles(MISC_CAT);
|
||
tokenize(repeat_ref_fname_3N, ",", repeat_infiles);
|
||
driver<SString<char> >(repeat_ref_fname_3N,
|
||
repeat_infiles,
|
||
repeat_snp_fname,
|
||
repeat_haplotype_fname,
|
||
dummy_fname,
|
||
dummy_fname,
|
||
dummy_fname,
|
||
repeat_info_fname_3N,
|
||
outfile + tag + ".rep",
|
||
false,
|
||
REF_READ_FORWARD,
|
||
true, // create local index?
|
||
&parent_szs,
|
||
&parent_refnames);
|
||
}
|
||
}
|
||
} catch(bad_alloc& e) {
|
||
if(autoMem) {
|
||
cerr << "Switching to a packed string representation." << endl;
|
||
packed = true;
|
||
} else {
|
||
throw e;
|
||
}
|
||
}
|
||
}
|
||
return 0;
|
||
} catch(std::exception& e) {
|
||
cerr << "Error: Encountered exception: '" << e.what() << "'" << endl;
|
||
cerr << "Command: ";
|
||
for(int i = 0; i < argc; i++) cerr << argv[i] << " ";
|
||
cerr << endl;
|
||
deleteIdxFiles(outfile, writeRef || justRef, justRef);
|
||
return 1;
|
||
} catch(int e) {
|
||
if(e != 0) {
|
||
cerr << "Error: Encountered internal HISAT2 exception (#" << e << ")" << endl;
|
||
cerr << "Command: ";
|
||
for(int i = 0; i < argc; i++) cerr << argv[i] << " ";
|
||
cerr << endl;
|
||
}
|
||
deleteIdxFiles(outfile, writeRef || justRef, justRef);
|
||
return e;
|
||
}
|
||
}
|
||
}
|