/* * Copyright 2018, Chanhee Park and Daehwan Kim * * This file is part of HISAT 2. * * HISAT 2 is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * HISAT 2 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with HISAT 2. If not, see . */ #include #include #include #include #include #include #include "assert_helpers.h" #include "endian_swap.h" #include "formats.h" #include "sequence_io.h" #include "tokenize.h" #include "timer.h" #include "ref_read.h" #include "filebuf.h" #include "reference.h" #include "ds.h" #include "gfm.h" #include "aligner_sw.h" #include "aligner_result.h" #include "search_globals.h" #include "scoring.h" #include "mask.h" #include "repeat_builder.h" #include "utility_3n.h" /** * \file Driver for the bowtie-build indexing tool. */ #include #include #include #include #include MemoryTally gMemTally; // Build parameters int verbose; static int sanityCheck; static int format; static TIndexOffU bmax; static TIndexOffU bmaxMultSqrt; static uint32_t bmaxDivN; static int dcv; static int noDc; static int entireSA; static int seed; static int showVersion; // GFM parameters static int32_t lineRate; static bool lineRate_provided; static int32_t linesPerSide; static int32_t offRate; static int32_t ftabChars; static int32_t localOffRate; static int32_t localFtabChars; static int bigEndian; static bool autoMem; static int nthreads; // number of pthreads operating concurrently static string wrapper; static TIndexOffU seed_length; static TIndexOffU seed_count; static TIndexOffU repeat_count; static TIndexOffU min_repeat_length; static TIndexOffU max_repeat_length; static EList > repeat_length_pair; static TIndexOffU max_repeat_edit; static TIndexOffU max_repeat_matchlen; static bool symmetric_extend; static bool repeat_indel; static bool forward_only; static bool CGtoTG; static string repeat_str1; static string repeat_str2; TIndexOffU max_seed_mm; TIndexOffU max_seed_repeat; TIndexOffU max_seed_extlen; static bool save_sa; static bool load_sa; bool threeN = false; char convertedFrom; char convertedTo; char convertedFromComplement; char convertedToComplement; bool base_change_entered; static void resetOptions() { verbose = true; // be talkative (default) sanityCheck = 0; // do slow sanity checks format = FASTA; // input sequence format bmax = OFF_MASK; // max blockwise SA bucket size bmaxMultSqrt = OFF_MASK; // same, as multplier of sqrt(n) bmaxDivN = 4; // same, as divisor of n dcv = 1024; // bwise SA difference-cover sample sz noDc = 0; // disable difference-cover sample entireSA = 0; // 1 = disable blockwise SA seed = 0; // srandom seed showVersion = 0; // just print version and quit? // GFM parameters lineRate = GFM::default_lineRate_gfm; lineRate_provided = false; linesPerSide = 1; // 1 64-byte line on a side offRate = 4; // sample 1 out of 16 SA elts ftabChars = 10; // 10 chars in initial lookup table localOffRate = 3; localFtabChars = 6; bigEndian = 0; // little endian autoMem = true; // automatically adjust memory usage parameters nthreads = 1; seed_length = 50; seed_count = 5; min_repeat_length = 100; max_repeat_length = numeric_limits::max(); repeat_length_pair.clear(); repeat_count = 5; max_repeat_edit = 10; max_repeat_matchlen = min_repeat_length / 2; // half of repeat_length repeat_indel = false; symmetric_extend = true; forward_only = false; CGtoTG = false; max_seed_mm = 5; max_seed_repeat = 5; max_seed_extlen = 25; save_sa = false; load_sa = false; wrapper.clear(); threeN = false; convertedFrom = 'C'; convertedTo = 'T'; convertedFromComplement = asc2dnacomp[convertedFrom]; convertedToComplement = asc2dnacomp[convertedTo]; base_change_entered = false; } // Argument constants for getopts enum { ARG_BMAX = 256, ARG_BMAX_MULT, ARG_BMAX_DIV, ARG_DCV, ARG_SEED, ARG_CUTOFF, ARG_PMAP, ARG_NTOA, ARG_USAGE, ARG_REVERSE_EACH, ARG_SA, ARG_SEED_LENGTH, ARG_SEED_COUNT, ARG_MIN_REPEAT_LENGTH, ARG_MAX_REPEAT_LENGTH, ARG_REPEAT_LENGTH, ARG_REPEAT_COUNT, ARG_REPEAT_EDIT, ARG_REPEAT_MATCHLEN, ARG_REPEAT_INDEL, ARG_WRAPPER, ARG_ASYMMETRIC_EXTEND, ARG_FORWARD_ONLY, ARG_CGTOTG, ARG_REPEAT_STR1, ARG_REPEAT_STR2, ARG_MAX_SEED_MM, ARG_MAX_SEED_REPEAT, ARG_MAX_SEED_EXTLEN, ARG_SAVE_SA, ARG_LOAD_SA, ARG_3N, ARG_BASE_CHANGE }; /** * Print a detailed usage message to the provided output stream. */ static void printUsage(ostream& out) { out << "HISAT2 version " << string(HISAT2_VERSION).c_str() << " by Chanhee Park and Daehwan Kim " << endl; string tool_name = "hisat2-repeat"; out << "Usage: " << tool_name << " [options]* " << endl << " reference_in comma-separated list of files with ref sequences" << endl << "Options:" << endl << " -c reference sequences given on cmd line (as" << endl << " )" << endl; if(wrapper == "basic-0") { out << " --large-index force generated index to be 'large', even if ref" << endl << " has fewer than 4 billion nucleotides" << endl; } out << " -a/--noauto disable automatic -p/--bmax/--dcv memory-fitting" << endl << " -p number of threads" << endl << " --bmax max bucket sz for blockwise suffix-array builder" << endl << " --bmaxdivn max bucket sz as divisor of ref len (default: 4)" << endl << " --dcv diff-cover period for blockwise (default: 1024)" << endl << " --nodc disable diff-cover (algorithm becomes quadratic)" << endl << " --seed-length seed length (default: 50)" << endl << " --seed-count seed count (default: 5)" << endl << " --min-repeat-length minimum repeat length (default: 100)" << endl << " --max-repeat-length maximum repeat length (default: 65535)" << endl << " --repeat-length -[,-] minimum-maximum repeat length pairs" << endl << " --repeat-count minimum repeat count (default: 5)" << endl << " --repeat-edit maximum repeat edit distance (default: 10)" << endl << " --repeat-matchlen " << endl << " --repeat-indel" << endl << " --repeat-str1" << endl << " --repeat-str2" << endl << " --asymmetric-extend extend seeds asymmetrically" << endl << " --forward-only use forward strand only" << endl << " --CGtoTG change CG to TG" << endl << " --max-seed-mm " << endl << " --max-seed-repeat " << endl << " --max-seed-extlen " << endl << " --save-sa" << endl << " --load-sa" << endl << " --3N make 3N repeat database" << endl << " --base-change the converted nucleotide and converted to nucleotide (default:C,T)" << endl << " -q/--quiet disable verbose output (for debugging)" << endl << " -h/--help print detailed description of tool and its options" << endl << " --usage print this usage message" << endl << " --version print version information and quit" << endl ; } static const char *short_options = "qrap:h?nscfl:i:o:t:h:3C"; static struct option long_options[] = { {(char*)"quiet", no_argument, 0, 'q'}, {(char*)"sanity", no_argument, 0, 's'}, {(char*)"threads", required_argument, 0, 'p'}, {(char*)"little", no_argument, &bigEndian, 0}, {(char*)"big", no_argument, &bigEndian, 1}, {(char*)"bmax", required_argument, 0, ARG_BMAX}, {(char*)"bmaxmultsqrt", required_argument, 0, ARG_BMAX_MULT}, {(char*)"bmaxdivn", required_argument, 0, ARG_BMAX_DIV}, {(char*)"dcv", required_argument, 0, ARG_DCV}, {(char*)"nodc", no_argument, &noDc, 1}, {(char*)"seed", required_argument, 0, ARG_SEED}, {(char*)"entiresa", no_argument, &entireSA, 1}, {(char*)"version", no_argument, &showVersion, 1}, {(char*)"noauto", no_argument, 0, 'a'}, {(char*)"noblocks", required_argument, 0, 'n'}, {(char*)"linerate", required_argument, 0, 'l'}, {(char*)"linesperside", required_argument, 0, 'i'}, {(char*)"usage", no_argument, 0, ARG_USAGE}, {(char*)"seed-length", required_argument, 0, ARG_SEED_LENGTH}, {(char*)"seed-count", required_argument, 0, ARG_SEED_COUNT}, {(char*)"min-repeat-length", required_argument, 0, ARG_MIN_REPEAT_LENGTH}, {(char*)"max-repeat-length", required_argument, 0, ARG_MAX_REPEAT_LENGTH}, {(char*)"repeat-length", required_argument, 0, ARG_REPEAT_LENGTH}, {(char*)"repeat-count", required_argument, 0, ARG_REPEAT_COUNT}, {(char*)"repeat-edit", required_argument, 0, ARG_REPEAT_EDIT}, {(char*)"repeat-matchlen",required_argument, 0, ARG_REPEAT_MATCHLEN}, {(char*)"repeat-indel", no_argument, 0, ARG_REPEAT_INDEL}, {(char*)"wrapper", required_argument, 0, ARG_WRAPPER}, {(char*)"asymmetric-extend", no_argument, 0, ARG_ASYMMETRIC_EXTEND}, {(char*)"forward-only", no_argument, 0, ARG_FORWARD_ONLY}, {(char*)"CGtoTG", no_argument, 0, ARG_CGTOTG}, {(char*)"repeat-str1", required_argument, 0, ARG_REPEAT_STR1}, {(char*)"repeat-str2", required_argument, 0, ARG_REPEAT_STR2}, {(char*)"max-seed-mm", required_argument, 0, ARG_MAX_SEED_MM}, {(char*)"max-seed-repeat",required_argument, 0, ARG_MAX_SEED_REPEAT}, {(char*)"max-seed-extlen",required_argument, 0, ARG_MAX_SEED_EXTLEN}, {(char*)"save-sa", no_argument, 0, ARG_SAVE_SA}, {(char*)"load-sa", no_argument, 0, ARG_LOAD_SA}, {(char*)"3N", no_argument, 0, ARG_3N}, {(char*)"base-change", required_argument, 0, ARG_BASE_CHANGE}, {(char*)0, 0, 0, 0} // terminator }; /** * Parse an int out of optarg and enforce that it be at least 'lower'; * if it is less than 'lower', then output the given error message and * exit with an error and a usage message. */ template static T parseNumber(const char *str, T lower, const char *errmsg) { char *endPtr= NULL; T t = (T)strtoll(str, &endPtr, 10); if(endPtr != NULL) { if(t < lower) { cerr << errmsg << endl; printUsage(cerr); throw 1; } return t; } cerr << errmsg << endl; printUsage(cerr); throw 1; return -1; } template static T parseNumber(T lower, const char *errmsg) { return parseNumber(optarg, lower, errmsg); } static void parsePair(EList >& repeat_pair) { string tok; istringstream ss(optarg); while(getline(ss, tok, ',')) { if(tok.empty()) { continue; } TIndexOffU min_len, max_len; size_t pos = tok.find('-'); if(pos == string::npos) { // min min_len = parseNumber(tok.c_str(), 1, "min-repeat-length must be at least 1"); max_len = numeric_limits::max(); } else if(pos == tok.length() - 1) { // min- min_len = parseNumber(tok.substr(0, pos).c_str(), 1, "min-repeat-length must be at least 1"); max_len = numeric_limits::max(); } else if(pos == 0) { // -max // not support? min_len = 100; max_len = parseNumber(tok.substr(pos + 1).c_str(), 1, "max-repeat-length must be at least 1"); } else { min_len = parseNumber(tok.substr(0, pos).c_str(), 1, "min-repeat-length must be at least 1"); max_len = parseNumber(tok.substr(pos + 1).c_str(), 1, "max-repeat-length must be at least 1"); } if(min_len > max_len) { printUsage(cerr); throw 1; } if(max_len > numeric_limits::max()) { printUsage(cerr); throw 1; } repeat_pair.push_back(pair(min_len, max_len)); } } /** * Read command-line arguments */ static void parseOptions(int argc, const char **argv) { int option_index = 0; int next_option; bool saw_max_repeat_matchlen = false; do { next_option = getopt_long( argc, const_cast(argv), short_options, long_options, &option_index); switch (next_option) { case ARG_WRAPPER: wrapper = optarg; break; case 'f': format = FASTA; break; case 'c': format = CMDLINE; break; //case 'p': packed = true; break; case 'C': cerr << "Error: -C specified but Bowtie 2 does not support colorspace input." << endl; throw 1; break; case 'l': lineRate = parseNumber(3, "-l/--lineRate arg must be at least 3"); lineRate_provided = true; break; case 'i': linesPerSide = parseNumber(1, "-i/--linesPerSide arg must be at least 1"); break; case 'o': offRate = parseNumber(0, "-o/--offRate arg must be at least 0"); break; case 'n': // all f-s is used to mean "not set", so put 'e' on end bmax = 0xfffffffe; break; case 'h': case ARG_USAGE: printUsage(cout); throw 0; break; case ARG_BMAX: bmax = parseNumber(1, "--bmax arg must be at least 1"); bmaxMultSqrt = OFF_MASK; // don't use multSqrt bmaxDivN = 0xffffffff; // don't use multSqrt break; case ARG_BMAX_MULT: bmaxMultSqrt = parseNumber(1, "--bmaxmultsqrt arg must be at least 1"); bmax = OFF_MASK; // don't use bmax bmaxDivN = 0xffffffff; // don't use multSqrt break; case ARG_BMAX_DIV: bmaxDivN = parseNumber(1, "--bmaxdivn arg must be at least 1"); bmax = OFF_MASK; // don't use bmax bmaxMultSqrt = OFF_MASK; // don't use multSqrt break; case ARG_DCV: dcv = parseNumber(3, "--dcv arg must be at least 3"); break; case ARG_SEED: seed = parseNumber(0, "--seed arg must be at least 0"); break; case ARG_SEED_LENGTH: seed_length = parseNumber(1, "--seed-length arg must be at least 1"); break; case ARG_SEED_COUNT: seed_count = parseNumber(2, "--repeat-count arg must be at least 2"); break; case ARG_MIN_REPEAT_LENGTH: min_repeat_length = parseNumber(1, "--min-repeat-length arg must be at least 1"); break; case ARG_MAX_REPEAT_LENGTH: max_repeat_length = parseNumber(1, "--max-repeat-length arg must be at least 1"); break; case ARG_REPEAT_LENGTH: parsePair(repeat_length_pair); break; case ARG_REPEAT_COUNT: repeat_count = parseNumber(2, "--repeat-count arg must be at least 2"); break; case ARG_REPEAT_EDIT: max_repeat_edit = parseNumber(0, "--repeat-edit arg must be at least 0"); break; case ARG_REPEAT_MATCHLEN: max_repeat_matchlen = parseNumber(0, "--repeat-matchlen arg must be at least 0"); saw_max_repeat_matchlen = true; break; case ARG_REPEAT_INDEL: repeat_indel = true; break; case ARG_ASYMMETRIC_EXTEND: symmetric_extend = false; break; case ARG_FORWARD_ONLY: forward_only = true; break; case ARG_CGTOTG: CGtoTG = true; break; case ARG_REPEAT_STR1: repeat_str1 = optarg; break; case ARG_REPEAT_STR2: repeat_str2 = optarg; break; case ARG_MAX_SEED_MM: max_seed_mm = parseNumber(1, "--max_seed_mm arg must be at least 1"); break; case ARG_MAX_SEED_REPEAT: max_seed_repeat = parseNumber(5, "--max_seed_repeat arg must be at least 5"); break; case ARG_MAX_SEED_EXTLEN: max_seed_extlen = parseNumber(0, "--max_seed_extlen arg must be at least 0"); break; case ARG_SAVE_SA: save_sa = true; break; case ARG_LOAD_SA: load_sa = true; break; case ARG_3N: { threeN = true; break; } case ARG_BASE_CHANGE: { EList args; tokenize(optarg, ",", args); if(args.size() != 2) { cerr << "Error: expected 2 comma-separated " << "arguments to --base-change option, got " << args.size() << endl; throw 1; } getConversion(args[0][0], args[1][0], convertedFrom, convertedTo); string s = "ACGT"; if ((s.find(convertedFrom) == std::string::npos) || (s.find(convertedTo) == std::string::npos)) { cerr << "Please enter the nucleotide in 'ACGT' for --base-change option." << endl; throw 1; } if (convertedFrom == convertedTo) { cerr << "Please enter two different base for --base-change option. If you wish to build the repeat database without nucleotide conversion, please do not use --base-change and --3N options." << endl; throw 1; } base_change_entered = true; } case 'a': autoMem = false; break; case 'q': verbose = false; break; case 's': sanityCheck = true; break; case 'p': nthreads = parseNumber(1, "-p arg must be at least 1"); break; case -1: /* Done with options. */ break; case 0: if (long_options[option_index].flag != 0) break; default: printUsage(cerr); throw 1; } } while(next_option != -1); if(bmax < 40) { cerr << "Warning: specified bmax is very small (" << bmax << "). This can lead to" << endl << "extremely slow performance and memory exhaustion. Perhaps you meant to specify" << endl << "a small --bmaxdivn?" << endl; } if(!saw_max_repeat_matchlen) { max_repeat_matchlen = min_repeat_length / 2; } } extern void initializeCntLut(); extern void initializeCntBit(); ConvertMatrix3N baseChange; /** * Drive the index construction process and optionally sanity-check the * result. */ template static void driver( const string& infile, EList& infiles, const string& outfile, bool packed, bool forward_only, bool CGtoTG) { initializeCntLut(); initializeCntBit(); EList is(MISC_CAT); bool bisulfite = false; bool nsToAs = false; RefReadInParams refparams(false, false /* reverse */, nsToAs, bisulfite); assert_gt(infiles.size(), 0); if(format == CMDLINE) { // Adapt sequence strings to stringstreams open for input stringstream *ss = new stringstream(); for(size_t i = 0; i < infiles.size(); i++) { (*ss) << ">" << i << endl << infiles[i].c_str() << endl; } FileBuf *fb = new FileBuf(ss); assert(fb != NULL); assert(!fb->eof()); assert(fb->get() == '>'); ASSERT_ONLY(fb->reset()); assert(!fb->eof()); is.push_back(fb); } else { // Adapt sequence files to ifstreams for(size_t i = 0; i < infiles.size(); i++) { FILE *f = fopen(infiles[i].c_str(), "r"); if (f == NULL) { cerr << "Error: could not open "<< infiles[i].c_str() << endl; throw 1; } FileBuf *fb = new FileBuf(f); assert(fb != NULL); if(fb->peek() == -1 || fb->eof()) { cerr << "Warning: Empty fasta file: '" << infile.c_str() << "'" << endl; continue; } assert(!fb->eof()); assert(fb->get() == '>'); ASSERT_ONLY(fb->reset()); assert(!fb->eof()); is.push_back(fb); } } if(is.empty()) { cerr << "Warning: All fasta inputs were empty" << endl; throw 1; } // Vector for the ordered list of "records" comprising the input // sequences. A record represents a stretch of unambiguous // characters in one of the input sequences. EList szs; EList ref_names; std::pair sztot; { if(verbose) cerr << "Reading reference sizes" << endl; Timer _t(cerr, " Time reading reference sizes: ", verbose); sztot = BitPairReference::szsFromFasta(is, "", bigEndian, refparams, szs, sanityCheck, &ref_names); } assert_gt(sztot.first, 0); assert_gt(sztot.second, 0); assert_gt(szs.size(), 0); // Compose text strings into single string cerr << "Calculating joined length" << endl; TIndexOffU jlen = 0; for(unsigned int i = 0; i < szs.size(); i++) { jlen += (TIndexOffU)szs[i].len; } // assert_geq(jlen, sztot); cerr << " Joined length: " << jlen << endl; TStr s; { bool both_strand = forward_only ? false : true; cerr << "Reserving space for joined string" << endl; cerr << "Joining reference sequences" << endl; Timer timer(cerr, " Time to join reference sequences: ", verbose); GFM::join( is, szs, (TIndexOffU) sztot.first, refparams, seed, s, both_strand, // include reverse complemented sequence CGtoTG); //Change CG to TG } TStr sOriginal; if (threeN) { baseChange.restoreNormal(); bool both_strand = forward_only ? false : true; for (int i = 0; i < is.size(); i++) { is[i]->reset(); } GFM::join( is, szs, (TIndexOffU) sztot.first, refparams, seed, sOriginal, both_strand, // include reverse complemented sequence CGtoTG); //Change CG to TG baseChange.restoreConversion(); long long int guessLen = s.length() / 2; for (TIndexOffU i = 0; i < guessLen; i++) { int nt = sOriginal[guessLen - i - 1]; assert_range(0, 3, nt); s[guessLen + i] = dnacomp[nt]; } } else { sOriginal = s; } // Successfully obtained joined reference string #ifndef NDEBUG if(forward_only) { assert_geq(s.length(), jlen); } else { assert_geq(s.length(), jlen << 1); } #endif BitPackedArray suffix_array; bool sa_file_exist = false; string sa_fname = outfile + ".rep.sa"; if(load_sa) { ifstream fp(sa_fname, std::ifstream::binary); sa_file_exist = fp.is_open(); } if(load_sa && sa_file_exist) { cerr << "Load SA from " << sa_fname << endl; suffix_array.readFile(sa_fname); } else { suffix_array.init(s.length() + 1); if(bmax != (TIndexOffU) OFF_MASK) { // VMSG_NL("bmax according to bmax setting: " << bmax); } else if(bmaxDivN != (uint32_t) OFF_MASK) { bmax = max(jlen / (bmaxDivN * nthreads), 1); // VMSG_NL("bmax according to bmaxDivN setting: " << bmax); } else { bmax = (uint32_t) sqrt(s.length()); // VMSG_NL("bmax defaulted to: " << bmax); } int iter = 0; bool first = true; bool passMemExc = false, sanity = false; // Look for bmax/dcv parameters that work. while(true) { if(!first && bmax < 40 && passMemExc) { cerr << "Could not find appropriate bmax/dcv settings for building this index." << endl; cerr << "Please try indexing this reference on a computer with more memory." << endl; if (sizeof(void *) == 4) { cerr << "If this computer has more than 4 GB of memory, try using a 64-bit executable;" << endl << "this executable is 32-bit." << endl; } throw 1; } if(dcv > 4096) dcv = 4096; if((iter % 6) == 5 && dcv < 4096 && dcv != 0) { dcv <<= 1; // double difference-cover period } else { bmax -= (bmax >> 2); // reduce by 25% } iter++; suffix_array.reset(); try { cerr << "Using parameters --bmax " << bmax << endl; if(dcv == 0) { cerr << " and *no difference cover*" << endl; } else { cerr << " --dcv " << dcv << endl; } { cerr << " Doing ahead-of-time memory usage test" << endl; // Make a quick-and-dirty attempt to force a bad_alloc iff // we would have thrown one eventually as part of // constructing the DifferenceCoverSample dcv <<= 1; TIndexOffU sz = (TIndexOffU) DifferenceCoverSample::simulateAllocs(s, dcv >> 1); if(nthreads > 1) sz *= (nthreads + 1); AutoArray tmp(sz, EBWT_CAT); dcv >>= 1; // Likewise with the KarkkainenBlockwiseSA sz = (TIndexOffU) KarkkainenBlockwiseSA::simulateAllocs(s, bmax); AutoArray tmp2(sz, EBWT_CAT); // Grab another 20 MB out of caution AutoArray extra(20 * 1024 * 1024, EBWT_CAT); // If we made it here without throwing bad_alloc, then we // passed the memory-usage stress test cerr << " Passed! Constructing with these parameters: --bmax " << bmax << " --dcv " << dcv << endl; cerr << "" << endl; } cerr << "Constructing suffix-array element generator" << endl; KarkkainenBlockwiseSA bsa(s, bmax, nthreads, dcv, seed, sanity, passMemExc, false /* verbose */, outfile); assert(bsa.suffixItrIsReset()); assert_eq(bsa.size(), s.length() + 1); TIndexOffU count = 0; while(count < s.length() + 1) { TIndexOffU saElt = bsa.nextSuffix(); count++; if(count && (count % 10000000 == 0)) { cerr << "SA count " << count << endl; } if(saElt == s.length()) { assert_eq(count, s.length() + 1); break; } suffix_array.pushBack(saElt); } break; } catch (bad_alloc &e) { if(passMemExc) { cerr << " Ran out of memory; automatically trying more memory-economical parameters." << endl; } else { cerr << "Out of memory while constructing suffix array. Please try using a smaller" << endl << "number of blocks by specifying a smaller --bmax or a larger --bmaxdivn" << endl; throw 1; } } first = false; } if(save_sa) { suffix_array.writeFile(sa_fname); } } cerr << "suffix_array: " << endl; suffix_array.dump(); // Build Repeats { if(repeat_length_pair.empty()) { repeat_length_pair.push_back(pair(min_repeat_length, max_repeat_length)); } RepeatParameter rp; rp.seed_len = seed_length; rp.seed_count = seed_count; rp.seed_mm = max_seed_mm; rp.max_edit = max_repeat_edit; rp.symmetric_extend = symmetric_extend; rp.extend_unit_len = max_seed_extlen; for(size_t i = 0; i < repeat_length_pair.size(); i++) { rp.min_repeat_len = repeat_length_pair[i].first; rp.max_repeat_len = repeat_length_pair[i].second; rp.repeat_count = repeat_count; rp.append_result = (i != 0); RepeatBuilder repeatBuilder(s, sOriginal, szs, ref_names, forward_only, outfile); cerr << "RepeatBuilder: " << outfile << " " << rp.min_repeat_len << "-" << rp.max_repeat_len << endl; { Timer _t(cerr, " Time reading suffix array: ", verbose); repeatBuilder.readSA(rp, suffix_array); } repeatBuilder.build(rp); repeatBuilder.saveFile(rp); } } } static const char *argv0 = NULL; extern "C" { /** * main function. Parses command-line arguments. */ int hisat2_repeat(int argc, const char **argv) { string outfile; try { // Reset all global state, including getopt state opterr = optind = 1; resetOptions(); string infile; EList infiles(MISC_CAT); parseOptions(argc, argv); argv0 = argv[0]; if(showVersion) { cout << argv0 << " version " << string(HISAT2_VERSION).c_str() << endl; if(sizeof(void*) == 4) { cout << "32-bit" << endl; } else if(sizeof(void*) == 8) { cout << "64-bit" << endl; } else { cout << "Neither 32- nor 64-bit: sizeof(void*) = " << sizeof(void*) << endl; } cout << "Built on " << BUILD_HOST << endl; cout << BUILD_TIME << endl; cout << "Compiler: " << COMPILER_VERSION << endl; cout << "Options: " << COMPILER_OPTIONS << endl; cout << "Sizeof {int, long, long long, void*, size_t, off_t}: {" << sizeof(int) << ", " << sizeof(long) << ", " << sizeof(long long) << ", " << sizeof(void *) << ", " << sizeof(size_t) << ", " << sizeof(off_t) << "}" << endl; return 0; } if (!threeN && base_change_entered) { cerr << "To build hisat-3n repeat database, please add argument --3N. To build the hisat2 repeat database, please remove the argument --base-change." << endl; printUsage(cerr); throw 1; } if (threeN) { convertedFromComplement = asc2dnacomp[convertedFrom]; convertedToComplement = asc2dnacomp[convertedTo]; } // Get input filename if(optind >= argc) { cerr << "No input sequence or sequence file specified!" << endl; printUsage(cerr); return 1; } infile = argv[optind++]; if(optind >= argc) { cerr << "No output file specified!" << endl; printUsage(cerr); return 1; } outfile = argv[optind++]; tokenize(infile, ",", infiles); if(infiles.size() < 1) { cerr << "Tokenized input file list was empty!" << endl; printUsage(cerr); return 1; } // Optionally summarize if(verbose) { cerr << "Settings:" << endl; // << " Output files: \"" << outfile.c_str() << ".*." << gfm_ext << "\"" << endl; cerr << " Endianness: " << (bigEndian? "big":"little") << endl << " Actual local endianness: " << (currentlyBigEndian()? "big":"little") << endl << " Sanity checking: " << (sanityCheck? "enabled":"disabled") << endl; #ifdef NDEBUG cerr << " Assertions: disabled" << endl; #else cerr << " Assertions: enabled" << endl; #endif cerr << " Random seed: " << seed << endl; cerr << " Sizeofs: void*:" << sizeof(void*) << ", int:" << sizeof(int) << ", long:" << sizeof(long) << ", size_t:" << sizeof(size_t) << endl; cerr << "Input files DNA, " << file_format_names[format].c_str() << ":" << endl; for(size_t i = 0; i < infiles.size(); i++) { cerr << " " << infiles[i].c_str() << endl; } } // Seed random number generator srand(seed); { Timer timer(cerr, "Total time for call to driver() for forward index: ", verbose); try { int nloop = threeN ? 2 : 1; // if threeN == true, nloop = 2. else one loop for (int i = 0; i < nloop; i++) { string tag = ""; if (threeN) { tag += ".3n."; if (i == 0) { tag += convertedFrom; tag += convertedTo; baseChange.convert(convertedFrom, convertedTo); } else { tag += convertedFromComplement; tag += convertedToComplement; baseChange.convert(convertedFromComplement, convertedToComplement); } string indexFilename = outfile + tag + ".rep.fa"; if (fileExist(indexFilename)) { cerr << "*** Find repeat database for " << outfile + tag << ",skip this repeat database building process." << endl; cerr << " To re-build your hisat-3n repeat database, please delete the old index manually before running hisat2-repeat or hisat-3n-build." << endl; continue; } } driver >(infile, infiles, outfile + tag, false, forward_only, CGtoTG); } } catch(bad_alloc& e) { if(autoMem) { cerr << "Switching to a packed string representation." << endl; } else { throw e; } } } return 0; } catch(std::exception& e) { cerr << "Error: Encountered exception: '" << e.what() << "'" << endl; cerr << "Command: "; for(int i = 0; i < argc; i++) cerr << argv[i] << " "; cerr << endl; return 1; } catch(int e) { if(e != 0) { cerr << "Error: Encountered internal HISAT2 exception (#" << e << ")" << endl; cerr << "Command: "; for(int i = 0; i < argc; i++) cerr << argv[i] << " "; cerr << endl; } return e; } } }