997 lines
36 KiB
C++
997 lines
36 KiB
C++
|
/*
|
|||
|
* Copyright 2018, Chanhee Park <parkchanhee@gmail.com> and Daehwan Kim <infphilo@gmail.com>
|
|||
|
*
|
|||
|
* This file is part of HISAT 2.
|
|||
|
*
|
|||
|
* HISAT 2 is free software: you can redistribute it and/or modify
|
|||
|
* it under the terms of the GNU General Public License as published by
|
|||
|
* the Free Software Foundation, either version 3 of the License, or
|
|||
|
* (at your option) any later version.
|
|||
|
*
|
|||
|
* HISAT 2 is distributed in the hope that it will be useful,
|
|||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|||
|
* GNU General Public License for more details.
|
|||
|
*
|
|||
|
* You should have received a copy of the GNU General Public License
|
|||
|
* along with HISAT 2. If not, see <http://www.gnu.org/licenses/>.
|
|||
|
*/
|
|||
|
|
|||
|
#include <iostream>
|
|||
|
#include <fstream>
|
|||
|
#include <string>
|
|||
|
#include <cassert>
|
|||
|
#include <getopt.h>
|
|||
|
#include <algorithm>
|
|||
|
#include "assert_helpers.h"
|
|||
|
#include "endian_swap.h"
|
|||
|
#include "formats.h"
|
|||
|
#include "sequence_io.h"
|
|||
|
#include "tokenize.h"
|
|||
|
#include "timer.h"
|
|||
|
#include "ref_read.h"
|
|||
|
#include "filebuf.h"
|
|||
|
#include "reference.h"
|
|||
|
#include "ds.h"
|
|||
|
#include "gfm.h"
|
|||
|
#include "aligner_sw.h"
|
|||
|
#include "aligner_result.h"
|
|||
|
#include "search_globals.h"
|
|||
|
#include "scoring.h"
|
|||
|
#include "mask.h"
|
|||
|
#include "repeat_builder.h"
|
|||
|
#include "utility_3n.h"
|
|||
|
|
|||
|
/**
|
|||
|
* \file Driver for the bowtie-build indexing tool.
|
|||
|
*/
|
|||
|
|
|||
|
#include <algorithm>
|
|||
|
#include <cstdlib>
|
|||
|
#include <fstream>
|
|||
|
#include <iostream>
|
|||
|
#include <vector>
|
|||
|
|
|||
|
MemoryTally gMemTally;
|
|||
|
// Build parameters
|
|||
|
int verbose;
|
|||
|
static int sanityCheck;
|
|||
|
static int format;
|
|||
|
static TIndexOffU bmax;
|
|||
|
static TIndexOffU bmaxMultSqrt;
|
|||
|
static uint32_t bmaxDivN;
|
|||
|
static int dcv;
|
|||
|
static int noDc;
|
|||
|
static int entireSA;
|
|||
|
static int seed;
|
|||
|
static int showVersion;
|
|||
|
// GFM parameters
|
|||
|
static int32_t lineRate;
|
|||
|
static bool lineRate_provided;
|
|||
|
static int32_t linesPerSide;
|
|||
|
static int32_t offRate;
|
|||
|
static int32_t ftabChars;
|
|||
|
static int32_t localOffRate;
|
|||
|
static int32_t localFtabChars;
|
|||
|
static int bigEndian;
|
|||
|
static bool autoMem;
|
|||
|
static int nthreads; // number of pthreads operating concurrently
|
|||
|
static string wrapper;
|
|||
|
static TIndexOffU seed_length;
|
|||
|
static TIndexOffU seed_count;
|
|||
|
static TIndexOffU repeat_count;
|
|||
|
static TIndexOffU min_repeat_length;
|
|||
|
static TIndexOffU max_repeat_length;
|
|||
|
static EList<pair<TIndexOffU, TIndexOffU> > repeat_length_pair;
|
|||
|
static TIndexOffU max_repeat_edit;
|
|||
|
static TIndexOffU max_repeat_matchlen;
|
|||
|
static bool symmetric_extend;
|
|||
|
static bool repeat_indel;
|
|||
|
static bool forward_only;
|
|||
|
static bool CGtoTG;
|
|||
|
static string repeat_str1;
|
|||
|
static string repeat_str2;
|
|||
|
TIndexOffU max_seed_mm;
|
|||
|
TIndexOffU max_seed_repeat;
|
|||
|
TIndexOffU max_seed_extlen;
|
|||
|
static bool save_sa;
|
|||
|
static bool load_sa;
|
|||
|
|
|||
|
bool threeN = false;
|
|||
|
char convertedFrom;
|
|||
|
char convertedTo;
|
|||
|
char convertedFromComplement;
|
|||
|
char convertedToComplement;
|
|||
|
bool base_change_entered;
|
|||
|
|
|||
|
static void resetOptions() {
|
|||
|
verbose = true; // be talkative (default)
|
|||
|
sanityCheck = 0; // do slow sanity checks
|
|||
|
format = FASTA; // input sequence format
|
|||
|
bmax = OFF_MASK; // max blockwise SA bucket size
|
|||
|
bmaxMultSqrt = OFF_MASK; // same, as multplier of sqrt(n)
|
|||
|
bmaxDivN = 4; // same, as divisor of n
|
|||
|
dcv = 1024; // bwise SA difference-cover sample sz
|
|||
|
noDc = 0; // disable difference-cover sample
|
|||
|
entireSA = 0; // 1 = disable blockwise SA
|
|||
|
seed = 0; // srandom seed
|
|||
|
showVersion = 0; // just print version and quit?
|
|||
|
// GFM parameters
|
|||
|
lineRate = GFM<TIndexOffU>::default_lineRate_gfm;
|
|||
|
lineRate_provided = false;
|
|||
|
linesPerSide = 1; // 1 64-byte line on a side
|
|||
|
offRate = 4; // sample 1 out of 16 SA elts
|
|||
|
ftabChars = 10; // 10 chars in initial lookup table
|
|||
|
localOffRate = 3;
|
|||
|
localFtabChars = 6;
|
|||
|
bigEndian = 0; // little endian
|
|||
|
autoMem = true; // automatically adjust memory usage parameters
|
|||
|
nthreads = 1;
|
|||
|
seed_length = 50;
|
|||
|
seed_count = 5;
|
|||
|
min_repeat_length = 100;
|
|||
|
max_repeat_length = numeric_limits<uint16_t>::max();
|
|||
|
repeat_length_pair.clear();
|
|||
|
repeat_count = 5;
|
|||
|
max_repeat_edit = 10;
|
|||
|
max_repeat_matchlen = min_repeat_length / 2; // half of repeat_length
|
|||
|
repeat_indel = false;
|
|||
|
symmetric_extend = true;
|
|||
|
forward_only = false;
|
|||
|
CGtoTG = false;
|
|||
|
max_seed_mm = 5;
|
|||
|
max_seed_repeat = 5;
|
|||
|
max_seed_extlen = 25;
|
|||
|
save_sa = false;
|
|||
|
load_sa = false;
|
|||
|
wrapper.clear();
|
|||
|
threeN = false;
|
|||
|
convertedFrom = 'C';
|
|||
|
convertedTo = 'T';
|
|||
|
convertedFromComplement = asc2dnacomp[convertedFrom];
|
|||
|
convertedToComplement = asc2dnacomp[convertedTo];
|
|||
|
base_change_entered = false;
|
|||
|
}
|
|||
|
|
|||
|
// Argument constants for getopts
|
|||
|
enum {
|
|||
|
ARG_BMAX = 256,
|
|||
|
ARG_BMAX_MULT,
|
|||
|
ARG_BMAX_DIV,
|
|||
|
ARG_DCV,
|
|||
|
ARG_SEED,
|
|||
|
ARG_CUTOFF,
|
|||
|
ARG_PMAP,
|
|||
|
ARG_NTOA,
|
|||
|
ARG_USAGE,
|
|||
|
ARG_REVERSE_EACH,
|
|||
|
ARG_SA,
|
|||
|
ARG_SEED_LENGTH,
|
|||
|
ARG_SEED_COUNT,
|
|||
|
ARG_MIN_REPEAT_LENGTH,
|
|||
|
ARG_MAX_REPEAT_LENGTH,
|
|||
|
ARG_REPEAT_LENGTH,
|
|||
|
ARG_REPEAT_COUNT,
|
|||
|
ARG_REPEAT_EDIT,
|
|||
|
ARG_REPEAT_MATCHLEN,
|
|||
|
ARG_REPEAT_INDEL,
|
|||
|
ARG_WRAPPER,
|
|||
|
ARG_ASYMMETRIC_EXTEND,
|
|||
|
ARG_FORWARD_ONLY,
|
|||
|
ARG_CGTOTG,
|
|||
|
ARG_REPEAT_STR1,
|
|||
|
ARG_REPEAT_STR2,
|
|||
|
ARG_MAX_SEED_MM,
|
|||
|
ARG_MAX_SEED_REPEAT,
|
|||
|
ARG_MAX_SEED_EXTLEN,
|
|||
|
ARG_SAVE_SA,
|
|||
|
ARG_LOAD_SA,
|
|||
|
ARG_3N,
|
|||
|
ARG_BASE_CHANGE
|
|||
|
};
|
|||
|
|
|||
|
/**
|
|||
|
* Print a detailed usage message to the provided output stream.
|
|||
|
*/
|
|||
|
static void printUsage(ostream& out) {
|
|||
|
out << "HISAT2 version " << string(HISAT2_VERSION).c_str() << " by Chanhee Park <parkchanhee@gmail.com> and Daehwan Kim <infphilo@gmail.com>" << endl;
|
|||
|
|
|||
|
string tool_name = "hisat2-repeat";
|
|||
|
out << "Usage: " << tool_name << " [options]* <reference_in>" << endl
|
|||
|
<< " reference_in comma-separated list of files with ref sequences" << endl
|
|||
|
<< "Options:" << endl
|
|||
|
<< " -c reference sequences given on cmd line (as" << endl
|
|||
|
<< " <reference_in>)" << endl;
|
|||
|
if(wrapper == "basic-0") {
|
|||
|
out << " --large-index force generated index to be 'large', even if ref" << endl
|
|||
|
<< " has fewer than 4 billion nucleotides" << endl;
|
|||
|
}
|
|||
|
out << " -a/--noauto disable automatic -p/--bmax/--dcv memory-fitting" << endl
|
|||
|
<< " -p <int> number of threads" << endl
|
|||
|
<< " --bmax <int> max bucket sz for blockwise suffix-array builder" << endl
|
|||
|
<< " --bmaxdivn <int> max bucket sz as divisor of ref len (default: 4)" << endl
|
|||
|
<< " --dcv <int> diff-cover period for blockwise (default: 1024)" << endl
|
|||
|
<< " --nodc disable diff-cover (algorithm becomes quadratic)" << endl
|
|||
|
<< " --seed-length <int> seed length (default: 50)" << endl
|
|||
|
<< " --seed-count <int> seed count (default: 5)" << endl
|
|||
|
<< " --min-repeat-length <int> minimum repeat length (default: 100)" << endl
|
|||
|
<< " --max-repeat-length <int> maximum repeat length (default: 65535)" << endl
|
|||
|
<< " --repeat-length <int>-<int>[,<int>-<int>] minimum-maximum repeat length pairs" << endl
|
|||
|
<< " --repeat-count <int> minimum repeat count (default: 5)" << endl
|
|||
|
<< " --repeat-edit <int> maximum repeat edit distance (default: 10)" << endl
|
|||
|
<< " --repeat-matchlen <int>" << endl
|
|||
|
<< " --repeat-indel" << endl
|
|||
|
<< " --repeat-str1" << endl
|
|||
|
<< " --repeat-str2" << endl
|
|||
|
<< " --asymmetric-extend extend seeds asymmetrically" << endl
|
|||
|
<< " --forward-only use forward strand only" << endl
|
|||
|
<< " --CGtoTG change CG to TG" << endl
|
|||
|
<< " --max-seed-mm <int>" << endl
|
|||
|
<< " --max-seed-repeat <int>" << endl
|
|||
|
<< " --max-seed-extlen <int>" << endl
|
|||
|
<< " --save-sa" << endl
|
|||
|
<< " --load-sa" << endl
|
|||
|
<< " --3N make 3N repeat database" << endl
|
|||
|
<< " --base-change <chr,chr> the converted nucleotide and converted to nucleotide (default:C,T)" << endl
|
|||
|
<< " -q/--quiet disable verbose output (for debugging)" << endl
|
|||
|
<< " -h/--help print detailed description of tool and its options" << endl
|
|||
|
<< " --usage print this usage message" << endl
|
|||
|
<< " --version print version information and quit" << endl
|
|||
|
;
|
|||
|
}
|
|||
|
|
|||
|
static const char *short_options = "qrap:h?nscfl:i:o:t:h:3C";
|
|||
|
|
|||
|
static struct option long_options[] = {
|
|||
|
{(char*)"quiet", no_argument, 0, 'q'},
|
|||
|
{(char*)"sanity", no_argument, 0, 's'},
|
|||
|
{(char*)"threads", required_argument, 0, 'p'},
|
|||
|
{(char*)"little", no_argument, &bigEndian, 0},
|
|||
|
{(char*)"big", no_argument, &bigEndian, 1},
|
|||
|
{(char*)"bmax", required_argument, 0, ARG_BMAX},
|
|||
|
{(char*)"bmaxmultsqrt", required_argument, 0, ARG_BMAX_MULT},
|
|||
|
{(char*)"bmaxdivn", required_argument, 0, ARG_BMAX_DIV},
|
|||
|
{(char*)"dcv", required_argument, 0, ARG_DCV},
|
|||
|
{(char*)"nodc", no_argument, &noDc, 1},
|
|||
|
{(char*)"seed", required_argument, 0, ARG_SEED},
|
|||
|
{(char*)"entiresa", no_argument, &entireSA, 1},
|
|||
|
{(char*)"version", no_argument, &showVersion, 1},
|
|||
|
{(char*)"noauto", no_argument, 0, 'a'},
|
|||
|
{(char*)"noblocks", required_argument, 0, 'n'},
|
|||
|
{(char*)"linerate", required_argument, 0, 'l'},
|
|||
|
{(char*)"linesperside", required_argument, 0, 'i'},
|
|||
|
{(char*)"usage", no_argument, 0, ARG_USAGE},
|
|||
|
{(char*)"seed-length", required_argument, 0, ARG_SEED_LENGTH},
|
|||
|
{(char*)"seed-count", required_argument, 0, ARG_SEED_COUNT},
|
|||
|
{(char*)"min-repeat-length", required_argument, 0, ARG_MIN_REPEAT_LENGTH},
|
|||
|
{(char*)"max-repeat-length", required_argument, 0, ARG_MAX_REPEAT_LENGTH},
|
|||
|
{(char*)"repeat-length", required_argument, 0, ARG_REPEAT_LENGTH},
|
|||
|
{(char*)"repeat-count", required_argument, 0, ARG_REPEAT_COUNT},
|
|||
|
{(char*)"repeat-edit", required_argument, 0, ARG_REPEAT_EDIT},
|
|||
|
{(char*)"repeat-matchlen",required_argument, 0, ARG_REPEAT_MATCHLEN},
|
|||
|
{(char*)"repeat-indel", no_argument, 0, ARG_REPEAT_INDEL},
|
|||
|
{(char*)"wrapper", required_argument, 0, ARG_WRAPPER},
|
|||
|
{(char*)"asymmetric-extend", no_argument, 0, ARG_ASYMMETRIC_EXTEND},
|
|||
|
{(char*)"forward-only", no_argument, 0, ARG_FORWARD_ONLY},
|
|||
|
{(char*)"CGtoTG", no_argument, 0, ARG_CGTOTG},
|
|||
|
{(char*)"repeat-str1", required_argument, 0, ARG_REPEAT_STR1},
|
|||
|
{(char*)"repeat-str2", required_argument, 0, ARG_REPEAT_STR2},
|
|||
|
{(char*)"max-seed-mm", required_argument, 0, ARG_MAX_SEED_MM},
|
|||
|
{(char*)"max-seed-repeat",required_argument, 0, ARG_MAX_SEED_REPEAT},
|
|||
|
{(char*)"max-seed-extlen",required_argument, 0, ARG_MAX_SEED_EXTLEN},
|
|||
|
{(char*)"save-sa", no_argument, 0, ARG_SAVE_SA},
|
|||
|
{(char*)"load-sa", no_argument, 0, ARG_LOAD_SA},
|
|||
|
{(char*)"3N", no_argument, 0, ARG_3N},
|
|||
|
{(char*)"base-change", required_argument, 0, ARG_BASE_CHANGE},
|
|||
|
{(char*)0, 0, 0, 0} // terminator
|
|||
|
};
|
|||
|
|
|||
|
/**
|
|||
|
* Parse an int out of optarg and enforce that it be at least 'lower';
|
|||
|
* if it is less than 'lower', then output the given error message and
|
|||
|
* exit with an error and a usage message.
|
|||
|
*/
|
|||
|
template<typename T>
|
|||
|
static T parseNumber(const char *str, T lower, const char *errmsg)
|
|||
|
{
|
|||
|
char *endPtr= NULL;
|
|||
|
T t = (T)strtoll(str, &endPtr, 10);
|
|||
|
if(endPtr != NULL) {
|
|||
|
if(t < lower) {
|
|||
|
cerr << errmsg << endl;
|
|||
|
printUsage(cerr);
|
|||
|
throw 1;
|
|||
|
}
|
|||
|
return t;
|
|||
|
}
|
|||
|
cerr << errmsg << endl;
|
|||
|
printUsage(cerr);
|
|||
|
throw 1;
|
|||
|
return -1;
|
|||
|
}
|
|||
|
|
|||
|
template<typename T>
|
|||
|
static T parseNumber(T lower, const char *errmsg) {
|
|||
|
return parseNumber(optarg, lower, errmsg);
|
|||
|
}
|
|||
|
|
|||
|
static void parsePair(EList<pair<TIndexOffU, TIndexOffU> >& repeat_pair)
|
|||
|
{
|
|||
|
string tok;
|
|||
|
istringstream ss(optarg);
|
|||
|
|
|||
|
while(getline(ss, tok, ',')) {
|
|||
|
if(tok.empty()) {
|
|||
|
continue;
|
|||
|
}
|
|||
|
|
|||
|
TIndexOffU min_len, max_len;
|
|||
|
|
|||
|
size_t pos = tok.find('-');
|
|||
|
if(pos == string::npos) {
|
|||
|
// min
|
|||
|
min_len = parseNumber<TIndexOffU>(tok.c_str(), 1, "min-repeat-length must be at least 1");
|
|||
|
max_len = numeric_limits<uint16_t>::max();
|
|||
|
} else if(pos == tok.length() - 1) {
|
|||
|
// min-
|
|||
|
min_len = parseNumber<TIndexOffU>(tok.substr(0, pos).c_str(), 1, "min-repeat-length must be at least 1");
|
|||
|
max_len = numeric_limits<uint16_t>::max();
|
|||
|
} else if(pos == 0) {
|
|||
|
// -max
|
|||
|
// not support?
|
|||
|
min_len = 100;
|
|||
|
max_len = parseNumber<TIndexOffU>(tok.substr(pos + 1).c_str(), 1, "max-repeat-length must be at least 1");
|
|||
|
} else {
|
|||
|
min_len = parseNumber<TIndexOffU>(tok.substr(0, pos).c_str(), 1, "min-repeat-length must be at least 1");
|
|||
|
max_len = parseNumber<TIndexOffU>(tok.substr(pos + 1).c_str(), 1, "max-repeat-length must be at least 1");
|
|||
|
}
|
|||
|
|
|||
|
if(min_len > max_len) {
|
|||
|
printUsage(cerr);
|
|||
|
throw 1;
|
|||
|
}
|
|||
|
if(max_len > numeric_limits<uint16_t>::max()) {
|
|||
|
printUsage(cerr);
|
|||
|
throw 1;
|
|||
|
}
|
|||
|
|
|||
|
repeat_pair.push_back(pair<TIndexOffU, TIndexOffU>(min_len, max_len));
|
|||
|
}
|
|||
|
}
|
|||
|
|
|||
|
/**
|
|||
|
* Read command-line arguments
|
|||
|
*/
|
|||
|
static void parseOptions(int argc, const char **argv) {
|
|||
|
int option_index = 0;
|
|||
|
int next_option;
|
|||
|
bool saw_max_repeat_matchlen = false;
|
|||
|
do {
|
|||
|
next_option = getopt_long(
|
|||
|
argc, const_cast<char**>(argv),
|
|||
|
short_options, long_options, &option_index);
|
|||
|
switch (next_option) {
|
|||
|
case ARG_WRAPPER:
|
|||
|
wrapper = optarg;
|
|||
|
break;
|
|||
|
case 'f': format = FASTA; break;
|
|||
|
case 'c': format = CMDLINE; break;
|
|||
|
//case 'p': packed = true; break;
|
|||
|
case 'C':
|
|||
|
cerr << "Error: -C specified but Bowtie 2 does not support colorspace input." << endl;
|
|||
|
throw 1;
|
|||
|
break;
|
|||
|
case 'l':
|
|||
|
lineRate = parseNumber<int>(3, "-l/--lineRate arg must be at least 3");
|
|||
|
lineRate_provided = true;
|
|||
|
break;
|
|||
|
case 'i':
|
|||
|
linesPerSide = parseNumber<int>(1, "-i/--linesPerSide arg must be at least 1");
|
|||
|
break;
|
|||
|
case 'o':
|
|||
|
offRate = parseNumber<int>(0, "-o/--offRate arg must be at least 0");
|
|||
|
break;
|
|||
|
case 'n':
|
|||
|
// all f-s is used to mean "not set", so put 'e' on end
|
|||
|
bmax = 0xfffffffe;
|
|||
|
break;
|
|||
|
case 'h':
|
|||
|
case ARG_USAGE:
|
|||
|
printUsage(cout);
|
|||
|
throw 0;
|
|||
|
break;
|
|||
|
case ARG_BMAX:
|
|||
|
bmax = parseNumber<TIndexOffU>(1, "--bmax arg must be at least 1");
|
|||
|
bmaxMultSqrt = OFF_MASK; // don't use multSqrt
|
|||
|
bmaxDivN = 0xffffffff; // don't use multSqrt
|
|||
|
break;
|
|||
|
case ARG_BMAX_MULT:
|
|||
|
bmaxMultSqrt = parseNumber<TIndexOffU>(1, "--bmaxmultsqrt arg must be at least 1");
|
|||
|
bmax = OFF_MASK; // don't use bmax
|
|||
|
bmaxDivN = 0xffffffff; // don't use multSqrt
|
|||
|
break;
|
|||
|
case ARG_BMAX_DIV:
|
|||
|
bmaxDivN = parseNumber<uint32_t>(1, "--bmaxdivn arg must be at least 1");
|
|||
|
bmax = OFF_MASK; // don't use bmax
|
|||
|
bmaxMultSqrt = OFF_MASK; // don't use multSqrt
|
|||
|
break;
|
|||
|
case ARG_DCV:
|
|||
|
dcv = parseNumber<int>(3, "--dcv arg must be at least 3");
|
|||
|
break;
|
|||
|
case ARG_SEED:
|
|||
|
seed = parseNumber<int>(0, "--seed arg must be at least 0");
|
|||
|
break;
|
|||
|
case ARG_SEED_LENGTH:
|
|||
|
seed_length = parseNumber<TIndexOffU>(1, "--seed-length arg must be at least 1");
|
|||
|
break;
|
|||
|
case ARG_SEED_COUNT:
|
|||
|
seed_count = parseNumber<TIndexOffU>(2, "--repeat-count arg must be at least 2");
|
|||
|
break;
|
|||
|
case ARG_MIN_REPEAT_LENGTH:
|
|||
|
min_repeat_length = parseNumber<TIndexOffU>(1, "--min-repeat-length arg must be at least 1");
|
|||
|
break;
|
|||
|
case ARG_MAX_REPEAT_LENGTH:
|
|||
|
max_repeat_length = parseNumber<TIndexOffU>(1, "--max-repeat-length arg must be at least 1");
|
|||
|
break;
|
|||
|
case ARG_REPEAT_LENGTH:
|
|||
|
parsePair(repeat_length_pair);
|
|||
|
break;
|
|||
|
case ARG_REPEAT_COUNT:
|
|||
|
repeat_count = parseNumber<TIndexOffU>(2, "--repeat-count arg must be at least 2");
|
|||
|
break;
|
|||
|
case ARG_REPEAT_EDIT:
|
|||
|
max_repeat_edit = parseNumber<TIndexOffU>(0, "--repeat-edit arg must be at least 0");
|
|||
|
break;
|
|||
|
case ARG_REPEAT_MATCHLEN:
|
|||
|
max_repeat_matchlen = parseNumber<TIndexOffU>(0, "--repeat-matchlen arg must be at least 0");
|
|||
|
saw_max_repeat_matchlen = true;
|
|||
|
break;
|
|||
|
case ARG_REPEAT_INDEL:
|
|||
|
repeat_indel = true;
|
|||
|
break;
|
|||
|
case ARG_ASYMMETRIC_EXTEND:
|
|||
|
symmetric_extend = false;
|
|||
|
break;
|
|||
|
case ARG_FORWARD_ONLY:
|
|||
|
forward_only = true;
|
|||
|
break;
|
|||
|
case ARG_CGTOTG:
|
|||
|
CGtoTG = true;
|
|||
|
break;
|
|||
|
case ARG_REPEAT_STR1:
|
|||
|
repeat_str1 = optarg;
|
|||
|
break;
|
|||
|
case ARG_REPEAT_STR2:
|
|||
|
repeat_str2 = optarg;
|
|||
|
break;
|
|||
|
case ARG_MAX_SEED_MM:
|
|||
|
max_seed_mm = parseNumber<TIndexOffU>(1, "--max_seed_mm arg must be at least 1");
|
|||
|
break;
|
|||
|
case ARG_MAX_SEED_REPEAT:
|
|||
|
max_seed_repeat = parseNumber<TIndexOffU>(5, "--max_seed_repeat arg must be at least 5");
|
|||
|
break;
|
|||
|
case ARG_MAX_SEED_EXTLEN:
|
|||
|
max_seed_extlen = parseNumber<TIndexOffU>(0, "--max_seed_extlen arg must be at least 0");
|
|||
|
break;
|
|||
|
case ARG_SAVE_SA:
|
|||
|
save_sa = true;
|
|||
|
break;
|
|||
|
case ARG_LOAD_SA:
|
|||
|
load_sa = true;
|
|||
|
break;
|
|||
|
case ARG_3N: {
|
|||
|
threeN = true;
|
|||
|
break;
|
|||
|
}
|
|||
|
case ARG_BASE_CHANGE: {
|
|||
|
EList<string> args;
|
|||
|
tokenize(optarg, ",", args);
|
|||
|
if(args.size() != 2) {
|
|||
|
cerr << "Error: expected 2 comma-separated "
|
|||
|
<< "arguments to --base-change option, got " << args.size() << endl;
|
|||
|
throw 1;
|
|||
|
}
|
|||
|
getConversion(args[0][0], args[1][0], convertedFrom, convertedTo);
|
|||
|
|
|||
|
string s = "ACGT";
|
|||
|
if ((s.find(convertedFrom) == std::string::npos) || (s.find(convertedTo) == std::string::npos)) {
|
|||
|
cerr << "Please enter the nucleotide in 'ACGT' for --base-change option." << endl;
|
|||
|
throw 1;
|
|||
|
}
|
|||
|
|
|||
|
if (convertedFrom == convertedTo) {
|
|||
|
cerr << "Please enter two different base for --base-change option. If you wish to build the repeat database without nucleotide conversion, please do not use --base-change and --3N options." << endl;
|
|||
|
throw 1;
|
|||
|
}
|
|||
|
|
|||
|
base_change_entered = true;
|
|||
|
}
|
|||
|
case 'a': autoMem = false; break;
|
|||
|
case 'q': verbose = false; break;
|
|||
|
case 's': sanityCheck = true; break;
|
|||
|
case 'p':
|
|||
|
nthreads = parseNumber<int>(1, "-p arg must be at least 1");
|
|||
|
break;
|
|||
|
|
|||
|
case -1: /* Done with options. */
|
|||
|
break;
|
|||
|
case 0:
|
|||
|
if (long_options[option_index].flag != 0)
|
|||
|
break;
|
|||
|
default:
|
|||
|
printUsage(cerr);
|
|||
|
throw 1;
|
|||
|
}
|
|||
|
} while(next_option != -1);
|
|||
|
|
|||
|
if(bmax < 40) {
|
|||
|
cerr << "Warning: specified bmax is very small (" << bmax << "). This can lead to" << endl
|
|||
|
<< "extremely slow performance and memory exhaustion. Perhaps you meant to specify" << endl
|
|||
|
<< "a small --bmaxdivn?" << endl;
|
|||
|
}
|
|||
|
|
|||
|
if(!saw_max_repeat_matchlen) {
|
|||
|
max_repeat_matchlen = min_repeat_length / 2;
|
|||
|
}
|
|||
|
}
|
|||
|
|
|||
|
extern void initializeCntLut();
|
|||
|
extern void initializeCntBit();
|
|||
|
|
|||
|
|
|||
|
ConvertMatrix3N baseChange;
|
|||
|
/**
|
|||
|
* Drive the index construction process and optionally sanity-check the
|
|||
|
* result.
|
|||
|
*/
|
|||
|
template<typename TStr>
|
|||
|
static void driver(
|
|||
|
const string& infile,
|
|||
|
EList<string>& infiles,
|
|||
|
const string& outfile,
|
|||
|
bool packed,
|
|||
|
bool forward_only,
|
|||
|
bool CGtoTG)
|
|||
|
{
|
|||
|
initializeCntLut();
|
|||
|
initializeCntBit();
|
|||
|
|
|||
|
EList<FileBuf*> is(MISC_CAT);
|
|||
|
bool bisulfite = false;
|
|||
|
bool nsToAs = false;
|
|||
|
RefReadInParams refparams(false, false /* reverse */, nsToAs, bisulfite);
|
|||
|
assert_gt(infiles.size(), 0);
|
|||
|
if(format == CMDLINE) {
|
|||
|
// Adapt sequence strings to stringstreams open for input
|
|||
|
stringstream *ss = new stringstream();
|
|||
|
for(size_t i = 0; i < infiles.size(); i++) {
|
|||
|
(*ss) << ">" << i << endl << infiles[i].c_str() << endl;
|
|||
|
}
|
|||
|
FileBuf *fb = new FileBuf(ss);
|
|||
|
assert(fb != NULL);
|
|||
|
assert(!fb->eof());
|
|||
|
assert(fb->get() == '>');
|
|||
|
ASSERT_ONLY(fb->reset());
|
|||
|
assert(!fb->eof());
|
|||
|
is.push_back(fb);
|
|||
|
} else {
|
|||
|
// Adapt sequence files to ifstreams
|
|||
|
for(size_t i = 0; i < infiles.size(); i++) {
|
|||
|
FILE *f = fopen(infiles[i].c_str(), "r");
|
|||
|
if (f == NULL) {
|
|||
|
cerr << "Error: could not open "<< infiles[i].c_str() << endl;
|
|||
|
throw 1;
|
|||
|
}
|
|||
|
FileBuf *fb = new FileBuf(f);
|
|||
|
assert(fb != NULL);
|
|||
|
if(fb->peek() == -1 || fb->eof()) {
|
|||
|
cerr << "Warning: Empty fasta file: '" << infile.c_str() << "'" << endl;
|
|||
|
continue;
|
|||
|
}
|
|||
|
assert(!fb->eof());
|
|||
|
assert(fb->get() == '>');
|
|||
|
ASSERT_ONLY(fb->reset());
|
|||
|
assert(!fb->eof());
|
|||
|
is.push_back(fb);
|
|||
|
}
|
|||
|
}
|
|||
|
if(is.empty()) {
|
|||
|
cerr << "Warning: All fasta inputs were empty" << endl;
|
|||
|
throw 1;
|
|||
|
}
|
|||
|
|
|||
|
// Vector for the ordered list of "records" comprising the input
|
|||
|
// sequences. A record represents a stretch of unambiguous
|
|||
|
// characters in one of the input sequences.
|
|||
|
EList<RefRecord> szs;
|
|||
|
EList<string> ref_names;
|
|||
|
std::pair<size_t, size_t> sztot;
|
|||
|
{
|
|||
|
if(verbose) cerr << "Reading reference sizes" << endl;
|
|||
|
Timer _t(cerr, " Time reading reference sizes: ", verbose);
|
|||
|
sztot = BitPairReference::szsFromFasta(is, "", bigEndian, refparams, szs, sanityCheck, &ref_names);
|
|||
|
}
|
|||
|
|
|||
|
assert_gt(sztot.first, 0);
|
|||
|
assert_gt(sztot.second, 0);
|
|||
|
assert_gt(szs.size(), 0);
|
|||
|
|
|||
|
// Compose text strings into single string
|
|||
|
cerr << "Calculating joined length" << endl;
|
|||
|
TIndexOffU jlen = 0;
|
|||
|
for(unsigned int i = 0; i < szs.size(); i++) {
|
|||
|
jlen += (TIndexOffU)szs[i].len;
|
|||
|
}
|
|||
|
// assert_geq(jlen, sztot);
|
|||
|
cerr << " Joined length: " << jlen << endl;
|
|||
|
TStr s;
|
|||
|
{
|
|||
|
bool both_strand = forward_only ? false : true;
|
|||
|
cerr << "Reserving space for joined string" << endl;
|
|||
|
cerr << "Joining reference sequences" << endl;
|
|||
|
Timer timer(cerr, " Time to join reference sequences: ", verbose);
|
|||
|
GFM<TIndexOffU>::join<TStr>(
|
|||
|
is,
|
|||
|
szs,
|
|||
|
(TIndexOffU) sztot.first,
|
|||
|
refparams,
|
|||
|
seed,
|
|||
|
s,
|
|||
|
both_strand, // include reverse complemented sequence
|
|||
|
CGtoTG); //Change CG to TG
|
|||
|
}
|
|||
|
|
|||
|
TStr sOriginal;
|
|||
|
if (threeN) {
|
|||
|
baseChange.restoreNormal();
|
|||
|
bool both_strand = forward_only ? false : true;
|
|||
|
for (int i = 0; i < is.size(); i++) {
|
|||
|
is[i]->reset();
|
|||
|
}
|
|||
|
GFM<TIndexOffU>::join<TStr>(
|
|||
|
is,
|
|||
|
szs,
|
|||
|
(TIndexOffU) sztot.first,
|
|||
|
refparams,
|
|||
|
seed,
|
|||
|
sOriginal,
|
|||
|
both_strand, // include reverse complemented sequence
|
|||
|
CGtoTG); //Change CG to TG
|
|||
|
baseChange.restoreConversion();
|
|||
|
|
|||
|
long long int guessLen = s.length() / 2;
|
|||
|
for (TIndexOffU i = 0; i < guessLen; i++) {
|
|||
|
int nt = sOriginal[guessLen - i - 1];
|
|||
|
assert_range(0, 3, nt);
|
|||
|
s[guessLen + i] = dnacomp[nt];
|
|||
|
}
|
|||
|
} else {
|
|||
|
sOriginal = s;
|
|||
|
}
|
|||
|
|
|||
|
// Successfully obtained joined reference string
|
|||
|
#ifndef NDEBUG
|
|||
|
if(forward_only) {
|
|||
|
assert_geq(s.length(), jlen);
|
|||
|
} else {
|
|||
|
assert_geq(s.length(), jlen << 1);
|
|||
|
}
|
|||
|
#endif
|
|||
|
|
|||
|
|
|||
|
BitPackedArray suffix_array;
|
|||
|
|
|||
|
bool sa_file_exist = false;
|
|||
|
string sa_fname = outfile + ".rep.sa";
|
|||
|
if(load_sa) {
|
|||
|
ifstream fp(sa_fname, std::ifstream::binary);
|
|||
|
sa_file_exist = fp.is_open();
|
|||
|
}
|
|||
|
|
|||
|
if(load_sa && sa_file_exist) {
|
|||
|
cerr << "Load SA from " << sa_fname << endl;
|
|||
|
suffix_array.readFile(sa_fname);
|
|||
|
} else {
|
|||
|
suffix_array.init(s.length() + 1);
|
|||
|
|
|||
|
if(bmax != (TIndexOffU) OFF_MASK) {
|
|||
|
// VMSG_NL("bmax according to bmax setting: " << bmax);
|
|||
|
} else if(bmaxDivN != (uint32_t) OFF_MASK) {
|
|||
|
bmax = max<uint32_t>(jlen / (bmaxDivN * nthreads), 1);
|
|||
|
// VMSG_NL("bmax according to bmaxDivN setting: " << bmax);
|
|||
|
} else {
|
|||
|
bmax = (uint32_t) sqrt(s.length());
|
|||
|
// VMSG_NL("bmax defaulted to: " << bmax);
|
|||
|
}
|
|||
|
int iter = 0;
|
|||
|
bool first = true;
|
|||
|
bool passMemExc = false, sanity = false;
|
|||
|
// Look for bmax/dcv parameters that work.
|
|||
|
while(true) {
|
|||
|
if(!first && bmax < 40 && passMemExc) {
|
|||
|
cerr << "Could not find appropriate bmax/dcv settings for building this index." << endl;
|
|||
|
cerr << "Please try indexing this reference on a computer with more memory." << endl;
|
|||
|
if (sizeof(void *) == 4) {
|
|||
|
cerr << "If this computer has more than 4 GB of memory, try using a 64-bit executable;" << endl
|
|||
|
<< "this executable is 32-bit." << endl;
|
|||
|
}
|
|||
|
throw 1;
|
|||
|
}
|
|||
|
if(dcv > 4096) dcv = 4096;
|
|||
|
if((iter % 6) == 5 && dcv < 4096 && dcv != 0) {
|
|||
|
dcv <<= 1; // double difference-cover period
|
|||
|
} else {
|
|||
|
bmax -= (bmax >> 2); // reduce by 25%
|
|||
|
}
|
|||
|
iter++;
|
|||
|
|
|||
|
suffix_array.reset();
|
|||
|
|
|||
|
try {
|
|||
|
cerr << "Using parameters --bmax " << bmax << endl;
|
|||
|
if(dcv == 0) {
|
|||
|
cerr << " and *no difference cover*" << endl;
|
|||
|
} else {
|
|||
|
cerr << " --dcv " << dcv << endl;
|
|||
|
}
|
|||
|
{
|
|||
|
cerr << " Doing ahead-of-time memory usage test" << endl;
|
|||
|
// Make a quick-and-dirty attempt to force a bad_alloc iff
|
|||
|
// we would have thrown one eventually as part of
|
|||
|
// constructing the DifferenceCoverSample
|
|||
|
dcv <<= 1;
|
|||
|
TIndexOffU sz = (TIndexOffU) DifferenceCoverSample<TStr>::simulateAllocs(s, dcv >> 1);
|
|||
|
if(nthreads > 1) sz *= (nthreads + 1);
|
|||
|
AutoArray<uint8_t> tmp(sz, EBWT_CAT);
|
|||
|
dcv >>= 1;
|
|||
|
// Likewise with the KarkkainenBlockwiseSA
|
|||
|
sz = (TIndexOffU) KarkkainenBlockwiseSA<TStr>::simulateAllocs(s, bmax);
|
|||
|
AutoArray<uint8_t> tmp2(sz, EBWT_CAT);
|
|||
|
// Grab another 20 MB out of caution
|
|||
|
AutoArray<uint32_t> extra(20 * 1024 * 1024, EBWT_CAT);
|
|||
|
// If we made it here without throwing bad_alloc, then we
|
|||
|
// passed the memory-usage stress test
|
|||
|
cerr << " Passed! Constructing with these parameters: --bmax " << bmax << " --dcv " << dcv
|
|||
|
<< endl;
|
|||
|
cerr << "" << endl;
|
|||
|
}
|
|||
|
cerr << "Constructing suffix-array element generator" << endl;
|
|||
|
|
|||
|
KarkkainenBlockwiseSA<TStr> bsa(s,
|
|||
|
bmax,
|
|||
|
nthreads,
|
|||
|
dcv,
|
|||
|
seed,
|
|||
|
sanity,
|
|||
|
passMemExc,
|
|||
|
false /* verbose */,
|
|||
|
outfile);
|
|||
|
|
|||
|
assert(bsa.suffixItrIsReset());
|
|||
|
assert_eq(bsa.size(), s.length() + 1);
|
|||
|
|
|||
|
TIndexOffU count = 0;
|
|||
|
|
|||
|
while(count < s.length() + 1) {
|
|||
|
TIndexOffU saElt = bsa.nextSuffix();
|
|||
|
count++;
|
|||
|
|
|||
|
if(count && (count % 10000000 == 0)) {
|
|||
|
cerr << "SA count " << count << endl;
|
|||
|
}
|
|||
|
|
|||
|
if(saElt == s.length()) {
|
|||
|
assert_eq(count, s.length() + 1);
|
|||
|
break;
|
|||
|
}
|
|||
|
|
|||
|
suffix_array.pushBack(saElt);
|
|||
|
}
|
|||
|
|
|||
|
break;
|
|||
|
|
|||
|
} catch (bad_alloc &e) {
|
|||
|
if(passMemExc) {
|
|||
|
cerr << " Ran out of memory; automatically trying more memory-economical parameters." << endl;
|
|||
|
} else {
|
|||
|
cerr << "Out of memory while constructing suffix array. Please try using a smaller" << endl
|
|||
|
<< "number of blocks by specifying a smaller --bmax or a larger --bmaxdivn" << endl;
|
|||
|
throw 1;
|
|||
|
}
|
|||
|
}
|
|||
|
first = false;
|
|||
|
}
|
|||
|
|
|||
|
if(save_sa) {
|
|||
|
suffix_array.writeFile(sa_fname);
|
|||
|
}
|
|||
|
}
|
|||
|
|
|||
|
cerr << "suffix_array: " << endl;
|
|||
|
suffix_array.dump();
|
|||
|
|
|||
|
|
|||
|
// Build Repeats
|
|||
|
{
|
|||
|
if(repeat_length_pair.empty()) {
|
|||
|
repeat_length_pair.push_back(pair<TIndexOffU, TIndexOffU>(min_repeat_length, max_repeat_length));
|
|||
|
}
|
|||
|
|
|||
|
RepeatParameter rp;
|
|||
|
rp.seed_len = seed_length;
|
|||
|
rp.seed_count = seed_count;
|
|||
|
rp.seed_mm = max_seed_mm;
|
|||
|
rp.max_edit = max_repeat_edit;
|
|||
|
rp.symmetric_extend = symmetric_extend;
|
|||
|
rp.extend_unit_len = max_seed_extlen;
|
|||
|
for(size_t i = 0; i < repeat_length_pair.size(); i++) {
|
|||
|
rp.min_repeat_len = repeat_length_pair[i].first;
|
|||
|
rp.max_repeat_len = repeat_length_pair[i].second;
|
|||
|
rp.repeat_count = repeat_count;
|
|||
|
rp.append_result = (i != 0);
|
|||
|
|
|||
|
RepeatBuilder<TStr> repeatBuilder(s,
|
|||
|
sOriginal,
|
|||
|
szs,
|
|||
|
ref_names,
|
|||
|
forward_only,
|
|||
|
outfile);
|
|||
|
cerr << "RepeatBuilder: " << outfile << " " << rp.min_repeat_len << "-" << rp.max_repeat_len << endl;
|
|||
|
|
|||
|
{
|
|||
|
Timer _t(cerr, " Time reading suffix array: ", verbose);
|
|||
|
repeatBuilder.readSA(rp, suffix_array);
|
|||
|
}
|
|||
|
|
|||
|
repeatBuilder.build(rp);
|
|||
|
repeatBuilder.saveFile(rp);
|
|||
|
}
|
|||
|
}
|
|||
|
}
|
|||
|
|
|||
|
static const char *argv0 = NULL;
|
|||
|
|
|||
|
extern "C" {
|
|||
|
/**
|
|||
|
* main function. Parses command-line arguments.
|
|||
|
*/
|
|||
|
int hisat2_repeat(int argc, const char **argv) {
|
|||
|
string outfile;
|
|||
|
try {
|
|||
|
// Reset all global state, including getopt state
|
|||
|
opterr = optind = 1;
|
|||
|
resetOptions();
|
|||
|
|
|||
|
string infile;
|
|||
|
EList<string> infiles(MISC_CAT);
|
|||
|
|
|||
|
parseOptions(argc, argv);
|
|||
|
argv0 = argv[0];
|
|||
|
if(showVersion) {
|
|||
|
cout << argv0 << " version " << string(HISAT2_VERSION).c_str() << endl;
|
|||
|
if(sizeof(void*) == 4) {
|
|||
|
cout << "32-bit" << endl;
|
|||
|
} else if(sizeof(void*) == 8) {
|
|||
|
cout << "64-bit" << endl;
|
|||
|
} else {
|
|||
|
cout << "Neither 32- nor 64-bit: sizeof(void*) = " << sizeof(void*) << endl;
|
|||
|
}
|
|||
|
cout << "Built on " << BUILD_HOST << endl;
|
|||
|
cout << BUILD_TIME << endl;
|
|||
|
cout << "Compiler: " << COMPILER_VERSION << endl;
|
|||
|
cout << "Options: " << COMPILER_OPTIONS << endl;
|
|||
|
cout << "Sizeof {int, long, long long, void*, size_t, off_t}: {"
|
|||
|
<< sizeof(int)
|
|||
|
<< ", " << sizeof(long) << ", " << sizeof(long long)
|
|||
|
<< ", " << sizeof(void *) << ", " << sizeof(size_t)
|
|||
|
<< ", " << sizeof(off_t) << "}" << endl;
|
|||
|
return 0;
|
|||
|
}
|
|||
|
|
|||
|
if (!threeN && base_change_entered) {
|
|||
|
cerr << "To build hisat-3n repeat database, please add argument --3N. To build the hisat2 repeat database, please remove the argument --base-change." << endl;
|
|||
|
printUsage(cerr);
|
|||
|
throw 1;
|
|||
|
}
|
|||
|
if (threeN) {
|
|||
|
convertedFromComplement = asc2dnacomp[convertedFrom];
|
|||
|
convertedToComplement = asc2dnacomp[convertedTo];
|
|||
|
}
|
|||
|
// Get input filename
|
|||
|
if(optind >= argc) {
|
|||
|
cerr << "No input sequence or sequence file specified!" << endl;
|
|||
|
printUsage(cerr);
|
|||
|
return 1;
|
|||
|
}
|
|||
|
infile = argv[optind++];
|
|||
|
|
|||
|
if(optind >= argc) {
|
|||
|
cerr << "No output file specified!" << endl;
|
|||
|
printUsage(cerr);
|
|||
|
return 1;
|
|||
|
}
|
|||
|
outfile = argv[optind++];
|
|||
|
|
|||
|
tokenize(infile, ",", infiles);
|
|||
|
if(infiles.size() < 1) {
|
|||
|
cerr << "Tokenized input file list was empty!" << endl;
|
|||
|
printUsage(cerr);
|
|||
|
return 1;
|
|||
|
}
|
|||
|
|
|||
|
// Optionally summarize
|
|||
|
if(verbose) {
|
|||
|
cerr << "Settings:" << endl;
|
|||
|
// << " Output files: \"" << outfile.c_str() << ".*." << gfm_ext << "\"" << endl;
|
|||
|
cerr << " Endianness: " << (bigEndian? "big":"little") << endl
|
|||
|
<< " Actual local endianness: " << (currentlyBigEndian()? "big":"little") << endl
|
|||
|
<< " Sanity checking: " << (sanityCheck? "enabled":"disabled") << endl;
|
|||
|
#ifdef NDEBUG
|
|||
|
cerr << " Assertions: disabled" << endl;
|
|||
|
#else
|
|||
|
cerr << " Assertions: enabled" << endl;
|
|||
|
#endif
|
|||
|
cerr << " Random seed: " << seed << endl;
|
|||
|
cerr << " Sizeofs: void*:" << sizeof(void*) << ", int:" << sizeof(int) << ", long:" << sizeof(long) << ", size_t:" << sizeof(size_t) << endl;
|
|||
|
cerr << "Input files DNA, " << file_format_names[format].c_str() << ":" << endl;
|
|||
|
for(size_t i = 0; i < infiles.size(); i++) {
|
|||
|
cerr << " " << infiles[i].c_str() << endl;
|
|||
|
}
|
|||
|
}
|
|||
|
// Seed random number generator
|
|||
|
srand(seed);
|
|||
|
{
|
|||
|
Timer timer(cerr, "Total time for call to driver() for forward index: ", verbose);
|
|||
|
try {
|
|||
|
int nloop = threeN ? 2 : 1; // if threeN == true, nloop = 2. else one loop
|
|||
|
for (int i = 0; i < nloop; i++) {
|
|||
|
string tag = "";
|
|||
|
if (threeN) {
|
|||
|
tag += ".3n.";
|
|||
|
if (i == 0) {
|
|||
|
tag += convertedFrom;
|
|||
|
tag += convertedTo;
|
|||
|
baseChange.convert(convertedFrom, convertedTo);
|
|||
|
} else {
|
|||
|
tag += convertedFromComplement;
|
|||
|
tag += convertedToComplement;
|
|||
|
baseChange.convert(convertedFromComplement, convertedToComplement);
|
|||
|
}
|
|||
|
|
|||
|
string indexFilename = outfile + tag + ".rep.fa";
|
|||
|
if (fileExist(indexFilename)) {
|
|||
|
cerr << "*** Find repeat database for " << outfile + tag << ",skip this repeat database building process." << endl;
|
|||
|
cerr << " To re-build your hisat-3n repeat database, please delete the old index manually before running hisat2-repeat or hisat-3n-build." << endl;
|
|||
|
continue;
|
|||
|
}
|
|||
|
}
|
|||
|
driver<SString<char> >(infile, infiles, outfile + tag, false, forward_only, CGtoTG);
|
|||
|
}
|
|||
|
|
|||
|
} catch(bad_alloc& e) {
|
|||
|
if(autoMem) {
|
|||
|
cerr << "Switching to a packed string representation." << endl;
|
|||
|
} else {
|
|||
|
throw e;
|
|||
|
}
|
|||
|
}
|
|||
|
}
|
|||
|
return 0;
|
|||
|
} catch(std::exception& e) {
|
|||
|
cerr << "Error: Encountered exception: '" << e.what() << "'" << endl;
|
|||
|
cerr << "Command: ";
|
|||
|
for(int i = 0; i < argc; i++) cerr << argv[i] << " ";
|
|||
|
cerr << endl;
|
|||
|
return 1;
|
|||
|
} catch(int e) {
|
|||
|
if(e != 0) {
|
|||
|
cerr << "Error: Encountered internal HISAT2 exception (#" << e << ")" << endl;
|
|||
|
cerr << "Command: ";
|
|||
|
for(int i = 0; i < argc; i++) cerr << argv[i] << " ";
|
|||
|
cerr << endl;
|
|||
|
}
|
|||
|
return e;
|
|||
|
}
|
|||
|
}
|
|||
|
}
|