917 lines
30 KiB
C++
917 lines
30 KiB
C++
|
/*
|
||
|
* Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
|
||
|
*
|
||
|
* This file is part of Bowtie 2.
|
||
|
*
|
||
|
* Bowtie 2 is free software: you can redistribute it and/or modify
|
||
|
* it under the terms of the GNU General Public License as published by
|
||
|
* the Free Software Foundation, either version 3 of the License, or
|
||
|
* (at your option) any later version.
|
||
|
*
|
||
|
* Bowtie 2 is distributed in the hope that it will be useful,
|
||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
|
* GNU General Public License for more details.
|
||
|
*
|
||
|
* You should have received a copy of the GNU General Public License
|
||
|
* along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
|
||
|
*/
|
||
|
|
||
|
#include <string>
|
||
|
#include <iostream>
|
||
|
#include <sstream>
|
||
|
#include <limits>
|
||
|
#include "ds.h"
|
||
|
#include "aligner_seed_policy.h"
|
||
|
#include "mem_ids.h"
|
||
|
|
||
|
using namespace std;
|
||
|
|
||
|
static int parseFuncType(const std::string& otype) {
|
||
|
string type = otype;
|
||
|
if(type == "C" || type == "Constant") {
|
||
|
return SIMPLE_FUNC_CONST;
|
||
|
} else if(type == "L" || type == "Linear") {
|
||
|
return SIMPLE_FUNC_LINEAR;
|
||
|
} else if(type == "S" || type == "Sqrt") {
|
||
|
return SIMPLE_FUNC_SQRT;
|
||
|
} else if(type == "G" || type == "Log") {
|
||
|
return SIMPLE_FUNC_LOG;
|
||
|
}
|
||
|
std::cerr << "Error: Bad function type '" << otype.c_str()
|
||
|
<< "'. Should be C (constant), L (linear), "
|
||
|
<< "S (square root) or G (natural log)." << std::endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
|
||
|
/**
 * Fill the function object `fv` from the comma-separated tokens held in
 * a variable named `ctoks`, which must be in scope where the macro is
 * expanded.  Tokens are positional and each one is optional:
 *
 *   ctoks[0]  function type, decoded by parseFuncType() -> fv.setType()
 *   ctoks[1]  constant term                            -> fv.setConst()
 *   ctoks[2]  coefficient                              -> fv.setCoeff()
 *   ctoks[3]  minimum                                  -> fv.setMin()
 *   ctoks[4]  maximum                                  -> fv.setMax()
 *
 * A field whose token is absent is left as it was.  Numeric tokens are
 * read with operator>>; a failed extraction is not reported.
 *
 * The fifth token used to be handed to setMin(), which clobbered the
 * minimum parsed from the fourth token and never set the maximum.
 */
#define PARSE_FUNC(fv) { \
	if(ctoks.size() >= 1) { \
		fv.setType(parseFuncType(ctoks[0])); \
	} \
	if(ctoks.size() >= 2) { \
		double co; \
		istringstream tmpss(ctoks[1]); \
		tmpss >> co; \
		fv.setConst(co); \
	} \
	if(ctoks.size() >= 3) { \
		double ce; \
		istringstream tmpss(ctoks[2]); \
		tmpss >> ce; \
		fv.setCoeff(ce); \
	} \
	if(ctoks.size() >= 4) { \
		double mn; \
		istringstream tmpss(ctoks[3]); \
		tmpss >> mn; \
		fv.setMin(mn); \
	} \
	if(ctoks.size() >= 5) { \
		double mx; \
		istringstream tmpss(ctoks[4]); \
		tmpss >> mx; \
		fv.setMax(mx); \
	} \
}
|
||
|
|
||
|
/**
|
||
|
* Parse alignment policy when provided in this format:
|
||
|
* <lab>=<val>;<lab>=<val>;<lab>=<val>...
|
||
|
*
|
||
|
* And label=value possibilities are:
|
||
|
*
|
||
|
* Bonus for a match
|
||
|
* -----------------
|
||
|
*
|
||
|
* MA=xx (default: MA=0, or MA=2 if --local is set)
|
||
|
*
|
||
|
* xx = Each position where equal read and reference characters match up
|
||
|
* in the alignment contributes this amount to the total score.
|
||
|
*
|
||
|
* Penalty for a mismatch
|
||
|
* ----------------------
|
||
|
*
|
||
|
* MMP={Cxx|Q|RQ} (default: MMP=C6)
|
||
|
*
|
||
|
* Cxx = Each mismatch costs xx. If MMP=Cxx is specified, quality
|
||
|
* values are ignored when assessing penalties for mismatches.
|
||
|
* Q = Each mismatch incurs a penalty equal to the mismatched base's
|
||
|
* quality value.
|
||
|
* R = Each mismatch incurs a penalty equal to the mismatched base's
|
||
|
* rounded quality value. Qualities are rounded off to the
|
||
|
* nearest 10, and qualities greater than 30 are rounded to 30.
|
||
|
*
|
||
|
* Penalty for position with N (in either read or reference)
|
||
|
* ---------------------------------------------------------
|
||
|
*
|
||
|
* NP={Cxx|Q|RQ} (default: NP=C1)
|
||
|
*
|
||
|
* Cxx = Each alignment position with an N in either the read or the
|
||
|
* reference costs xx. If NP=Cxx is specified, quality values are
|
||
|
* ignored when assessing penalties for Ns.
|
||
|
* Q = Each alignment position with an N in either the read or the
|
||
|
* reference incurs a penalty equal to the read base's quality
|
||
|
* value.
|
||
|
* R = Each alignment position with an N in either the read or the
|
||
|
* reference incurs a penalty equal to the read base's rounded
|
||
|
* quality value. Qualities are rounded off to the nearest 10,
|
||
|
* and qualities greater than 30 are rounded to 30.
|
||
|
*
|
||
|
* Penalty for a read gap
|
||
|
* ----------------------
|
||
|
*
|
||
|
* RDG=xx,yy (default: RDG=5,3)
|
||
|
*
|
||
|
* xx = Read gap open penalty.
|
||
|
* yy = Read gap extension penalty.
|
||
|
*
|
||
|
* Total cost incurred by a read gap = xx + (yy * gap length)
|
||
|
*
|
||
|
* Penalty for a reference gap
|
||
|
* ---------------------------
|
||
|
*
|
||
|
* RFG=xx,yy (default: RFG=5,3)
|
||
|
*
|
||
|
* xx = Reference gap open penalty.
|
||
|
* yy = Reference gap extension penalty.
|
||
|
*
|
||
|
* Total cost incurred by a reference gap = xx + (yy * gap length)
|
||
|
*
|
||
|
* Minimum score for valid alignment
|
||
|
* ---------------------------------
|
||
|
*
|
||
|
* MIN=xx,yy (defaults: MIN=-0.6,-0.6, or MIN=0.0,0.66 if --local is set)
|
||
|
*
|
||
|
* xx,yy = For a read of length N, the total score must be at least
|
||
|
* xx + (read length * yy) for the alignment to be valid. The
|
||
|
* total score is the sum of all negative penalties (from
|
||
|
* mismatches and gaps) and all positive bonuses. The minimum
|
||
|
* can be negative (and is by default in global alignment mode).
|
||
|
*
|
||
|
* Score floor for local alignment
|
||
|
* -------------------------------
|
||
|
*
|
||
|
* FL=xx,yy (defaults: FL=-Infinity,0.0, or FL=0.0,0.0 if --local is set)
|
||
|
*
|
||
|
* xx,yy = If a cell in the dynamic programming table has a score less
|
||
|
* than xx + (read length * yy), then no valid alignment can go
|
||
|
* through it. Defaults are highly recommended.
|
||
|
*
|
||
|
* N ceiling
|
||
|
* ---------
|
||
|
*
|
||
|
* NCEIL=xx,yy (default: NCEIL=0.0,0.15)
|
||
|
*
|
||
|
* xx,yy = For a read of length N, the number of alignment
|
||
|
* positions with an N in either the read or the
|
||
|
* reference cannot exceed
|
||
|
* ceiling = xx + (read length * yy). If the ceiling is
|
||
|
* exceeded, the alignment is considered invalid.
|
||
|
*
|
||
|
* Seeds
|
||
|
* -----
|
||
|
*
|
||
|
* SEED=mm,len,ival (default: SEED=0,22)
|
||
|
*
|
||
|
* mm = Maximum number of mismatches allowed within a seed.
|
||
|
* Must be >= 0 and <= 2. Note that 2-mismatch mode is
|
||
|
* not fully sensitive; i.e. some 2-mismatch seed
|
||
|
* alignments may be missed.
|
||
|
* len = Length of seed.
|
||
|
* ival = Interval between seeds. If not specified, seed
|
||
|
* interval is determined by IVAL.
|
||
|
*
|
||
|
* Seed interval
|
||
|
* -------------
|
||
|
*
|
||
|
* IVAL={L|S|C},xx,yy (default: IVAL=S,1.0,0.0)
|
||
|
*
|
||
|
* L = let interval between seeds be a linear function of the
|
||
|
* read length. xx and yy are the constant and linear
|
||
|
* coefficients respectively. In other words, the interval
|
||
|
* equals a * len + b, where len is the read length.
|
||
|
* Intervals less than 1 are rounded up to 1.
|
||
|
* S = let interval between seeds be a function of the square
|
||
|
* root of the read length. xx and yy are the
|
||
|
* coefficients. In other words, the interval equals
|
||
|
* a * sqrt(len) + b, where len is the read length.
|
||
|
* Intervals less than 1 are rounded up to 1.
|
||
|
* C = Like S but uses cube root of length instead of square
|
||
|
* root.
|
||
|
*
|
||
|
* Example 1:
|
||
|
*
|
||
|
* SEED=1,10,5 and read sequence is TGCTATCGTACGATCGTACA:
|
||
|
*
|
||
|
* The following seeds are extracted from the forward
|
||
|
* representation of the read and aligned to the reference
|
||
|
* allowing up to 1 mismatch:
|
||
|
*
|
||
|
* Read: TGCTATCGTACGATCGTACA
|
||
|
*
|
||
|
* Seed 1+: TGCTATCGTA
|
||
|
* Seed 2+: TCGTACGATC
|
||
|
* Seed 3+: CGATCGTACA
|
||
|
*
|
||
|
* ...and the following are extracted from the reverse-complement
|
||
|
* representation of the read and align to the reference allowing
|
||
|
* up to 1 mismatch:
|
||
|
*
|
||
|
* Seed 1-: TACGATAGCA
|
||
|
* Seed 2-: GATCGTACGA
|
||
|
* Seed 3-: TGTACGATCG
|
||
|
*
|
||
|
* Example 2:
|
||
|
*
|
||
|
* SEED=1,20,20 and read sequence is TGCTATCGTACGATC. The seed
|
||
|
* length is 20 but the read is only 15 characters long. In this
|
||
|
* case, Bowtie2 automatically shrinks the seed length to be equal
|
||
|
* to the read length.
|
||
|
*
|
||
|
* Read: TGCTATCGTACGATC
|
||
|
*
|
||
|
* Seed 1+: TGCTATCGTACGATC
|
||
|
* Seed 1-: GATCGTACGATAGCA
|
||
|
*
|
||
|
* Example 3:
|
||
|
*
|
||
|
* SEED=1,10,10 and read sequence is TGCTATCGTACGATC. Only one seed
|
||
|
* fits on the read; a second seed would overhang the end of the read
|
||
|
* by 5 positions. In this case, Bowtie2 extracts one seed.
|
||
|
*
|
||
|
* Read: TGCTATCGTACGATC
|
||
|
*
|
||
|
* Seed 1+: TGCTATCGTA
|
||
|
* Seed 1-: TACGATAGCA
|
||
|
*/
|
||
|
void SeedAlignmentPolicy::parseString(
|
||
|
const std::string& s,
|
||
|
bool local,
|
||
|
bool noisyHpolymer,
|
||
|
bool ignoreQuals,
|
||
|
int& bonusMatchType,
|
||
|
int& bonusMatch,
|
||
|
int& penMmcType,
|
||
|
int& penMmcMax,
|
||
|
int& penMmcMin,
|
||
|
int& penScMax,
|
||
|
int& penScMin,
|
||
|
int& penNType,
|
||
|
int& penN,
|
||
|
int& penRdExConst,
|
||
|
int& penRfExConst,
|
||
|
int& penRdExLinear,
|
||
|
int& penRfExLinear,
|
||
|
SimpleFunc& costMin,
|
||
|
SimpleFunc& nCeil,
|
||
|
bool& nCatPair,
|
||
|
int& multiseedMms,
|
||
|
int& multiseedLen,
|
||
|
SimpleFunc& multiseedIval,
|
||
|
size_t& failStreak,
|
||
|
size_t& seedRounds,
|
||
|
SimpleFunc* penCanIntronLen,
|
||
|
SimpleFunc* penNoncanIntronLen)
|
||
|
{
|
||
|
|
||
|
bonusMatchType = local ? DEFAULT_MATCH_BONUS_TYPE_LOCAL : DEFAULT_MATCH_BONUS_TYPE;
|
||
|
bonusMatch = local ? DEFAULT_MATCH_BONUS_LOCAL : DEFAULT_MATCH_BONUS;
|
||
|
penMmcType = ignoreQuals ? DEFAULT_MM_PENALTY_TYPE_IGNORE_QUALS :
|
||
|
DEFAULT_MM_PENALTY_TYPE;
|
||
|
penMmcMax = DEFAULT_MM_PENALTY_MAX;
|
||
|
penMmcMin = DEFAULT_MM_PENALTY_MIN;
|
||
|
penNType = DEFAULT_N_PENALTY_TYPE;
|
||
|
penN = DEFAULT_N_PENALTY;
|
||
|
|
||
|
penScMax = DEFAULT_SC_PENALTY_MAX;
|
||
|
penScMin = DEFAULT_SC_PENALTY_MIN;
|
||
|
|
||
|
const double DMAX = std::numeric_limits<double>::max();
|
||
|
costMin.init(
|
||
|
local ? SIMPLE_FUNC_LOG : SIMPLE_FUNC_LINEAR,
|
||
|
local ? DEFAULT_MIN_CONST_LOCAL : 0.0f,
|
||
|
local ? DEFAULT_MIN_LINEAR_LOCAL : -0.2f);
|
||
|
nCeil.init(
|
||
|
SIMPLE_FUNC_LINEAR, 0.0f, DMAX,
|
||
|
DEFAULT_N_CEIL_CONST, DEFAULT_N_CEIL_LINEAR);
|
||
|
multiseedIval.init(
|
||
|
DEFAULT_IVAL, 1.0f, DMAX,
|
||
|
DEFAULT_IVAL_B, DEFAULT_IVAL_A);
|
||
|
nCatPair = DEFAULT_N_CAT_PAIR;
|
||
|
|
||
|
if(!noisyHpolymer) {
|
||
|
penRdExConst = DEFAULT_READ_GAP_CONST;
|
||
|
penRdExLinear = DEFAULT_READ_GAP_LINEAR;
|
||
|
penRfExConst = DEFAULT_REF_GAP_CONST;
|
||
|
penRfExLinear = DEFAULT_REF_GAP_LINEAR;
|
||
|
} else {
|
||
|
penRdExConst = DEFAULT_READ_GAP_CONST_BADHPOLY;
|
||
|
penRdExLinear = DEFAULT_READ_GAP_LINEAR_BADHPOLY;
|
||
|
penRfExConst = DEFAULT_REF_GAP_CONST_BADHPOLY;
|
||
|
penRfExLinear = DEFAULT_REF_GAP_LINEAR_BADHPOLY;
|
||
|
}
|
||
|
|
||
|
multiseedMms = DEFAULT_SEEDMMS;
|
||
|
multiseedLen = DEFAULT_SEEDLEN;
|
||
|
|
||
|
EList<string> toks(MISC_CAT);
|
||
|
string tok;
|
||
|
istringstream ss(s);
|
||
|
int setting = 0;
|
||
|
// Get each ;-separated token
|
||
|
while(getline(ss, tok, ';')) {
|
||
|
setting++;
|
||
|
EList<string> etoks(MISC_CAT);
|
||
|
string etok;
|
||
|
// Divide into tokens on either side of =
|
||
|
istringstream ess(tok);
|
||
|
while(getline(ess, etok, '=')) {
|
||
|
etoks.push_back(etok);
|
||
|
}
|
||
|
// Must be exactly 1 =
|
||
|
if(etoks.size() != 2) {
|
||
|
cerr << "Error parsing alignment policy setting " << setting
|
||
|
<< "; must be bisected by = sign" << endl
|
||
|
<< "Policy: " << s.c_str() << endl;
|
||
|
assert(false); throw 1;
|
||
|
}
|
||
|
// LHS is tag, RHS value
|
||
|
string tag = etoks[0], val = etoks[1];
|
||
|
// Separate value into comma-separated tokens
|
||
|
EList<string> ctoks(MISC_CAT);
|
||
|
string ctok;
|
||
|
istringstream css(val);
|
||
|
while(getline(css, ctok, ',')) {
|
||
|
ctoks.push_back(ctok);
|
||
|
}
|
||
|
if(ctoks.size() == 0) {
|
||
|
cerr << "Error parsing alignment policy setting " << setting
|
||
|
<< "; RHS must have at least 1 token" << endl
|
||
|
<< "Policy: " << s.c_str() << endl;
|
||
|
assert(false); throw 1;
|
||
|
}
|
||
|
for(size_t i = 0; i < ctoks.size(); i++) {
|
||
|
if(ctoks[i].length() == 0) {
|
||
|
cerr << "Error parsing alignment policy setting " << setting
|
||
|
<< "; token " << i+1 << " on RHS had length=0" << endl
|
||
|
<< "Policy: " << s.c_str() << endl;
|
||
|
assert(false); throw 1;
|
||
|
}
|
||
|
}
|
||
|
// Bonus for a match
|
||
|
// MA=xx (default: MA=0, or MA=10 if --local is set)
|
||
|
if(tag == "MA") {
|
||
|
if(ctoks.size() != 1) {
|
||
|
cerr << "Error parsing alignment policy setting " << setting
|
||
|
<< "; RHS must have 1 token" << endl
|
||
|
<< "Policy: " << s.c_str() << endl;
|
||
|
assert(false); throw 1;
|
||
|
}
|
||
|
string tmp = ctoks[0];
|
||
|
istringstream tmpss(tmp);
|
||
|
tmpss >> bonusMatch;
|
||
|
}
|
||
|
// Scoring for mismatches
|
||
|
// MMP={Cxx|Q|RQ}
|
||
|
// Cxx = constant, where constant is integer xx
|
||
|
// Qxx = equal to quality, scaled
|
||
|
// R = equal to maq-rounded quality value (rounded to nearest
|
||
|
// 10, can't be greater than 30)
|
||
|
else if(tag == "MMP") {
|
||
|
if(ctoks.size() > 3) {
|
||
|
cerr << "Error parsing alignment policy setting "
|
||
|
<< "'" << tag.c_str() << "'"
|
||
|
<< "; RHS must have at most 3 tokens" << endl
|
||
|
<< "Policy: '" << s.c_str() << "'" << endl;
|
||
|
assert(false); throw 1;
|
||
|
}
|
||
|
if(ctoks[0][0] == 'C') {
|
||
|
string tmp = ctoks[0].substr(1);
|
||
|
// Parse constant penalty
|
||
|
istringstream tmpss(tmp);
|
||
|
tmpss >> penMmcMax;
|
||
|
penMmcMin = penMmcMax;
|
||
|
// Parse constant penalty
|
||
|
penMmcType = COST_MODEL_CONSTANT;
|
||
|
} else if(ctoks[0][0] == 'Q') {
|
||
|
if(ctoks.size() >= 2) {
|
||
|
string tmp = ctoks[1];
|
||
|
istringstream tmpss(tmp);
|
||
|
tmpss >> penMmcMax;
|
||
|
} else {
|
||
|
penMmcMax = DEFAULT_MM_PENALTY_MAX;
|
||
|
}
|
||
|
if(ctoks.size() >= 3) {
|
||
|
string tmp = ctoks[2];
|
||
|
istringstream tmpss(tmp);
|
||
|
tmpss >> penMmcMin;
|
||
|
} else {
|
||
|
penMmcMin = DEFAULT_MM_PENALTY_MIN;
|
||
|
}
|
||
|
if(penMmcMin > penMmcMax) {
|
||
|
cerr << "Error: Maximum mismatch penalty (" << penMmcMax
|
||
|
<< ") is less than minimum penalty (" << penMmcMin
|
||
|
<< endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
// Set type to =quality
|
||
|
penMmcType = COST_MODEL_QUAL;
|
||
|
} else if(ctoks[0][0] == 'R') {
|
||
|
// Set type to=Maq-quality
|
||
|
penMmcType = COST_MODEL_ROUNDED_QUAL;
|
||
|
} else {
|
||
|
cerr << "Error parsing alignment policy setting "
|
||
|
<< "'" << tag.c_str() << "'"
|
||
|
<< "; RHS must start with C, Q or R" << endl
|
||
|
<< "Policy: '" << s.c_str() << "'" << endl;
|
||
|
assert(false); throw 1;
|
||
|
}
|
||
|
}
|
||
|
else if(tag == "SCP") {
|
||
|
if(ctoks.size() > 3) {
|
||
|
cerr << "Error parsing alignment policy setting "
|
||
|
<< "'" << tag.c_str() << "'"
|
||
|
<< "; SCP must have at most 3 tokens" << endl
|
||
|
<< "Policy: '" << s.c_str() << "'" << endl;
|
||
|
assert(false); throw 1;
|
||
|
}
|
||
|
istringstream tmpMax(ctoks[1]);
|
||
|
tmpMax >> penScMax;
|
||
|
istringstream tmpMin(ctoks[1]);
|
||
|
tmpMin >> penScMin;
|
||
|
if(penScMin > penScMax) {
|
||
|
cerr << "max (" << penScMax << ") should be >= min (" << penScMin << ")" << endl;
|
||
|
assert(false); throw 1;
|
||
|
}
|
||
|
if(penScMin < 1) {
|
||
|
cerr << "min (" << penScMin << ") should be greater than 0" << endl;
|
||
|
assert(false); throw 1;
|
||
|
}
|
||
|
}
|
||
|
// Scoring for mismatches where read char=N
|
||
|
// NP={Cxx|Q|RQ}
|
||
|
// Cxx = constant, where constant is integer xx
|
||
|
// Q = equal to quality
|
||
|
// R = equal to maq-rounded quality value (rounded to nearest
|
||
|
// 10, can't be greater than 30)
|
||
|
else if(tag == "NP") {
|
||
|
if(ctoks.size() != 1) {
|
||
|
cerr << "Error parsing alignment policy setting "
|
||
|
<< "'" << tag.c_str() << "'"
|
||
|
<< "; RHS must have 1 token" << endl
|
||
|
<< "Policy: '" << s.c_str() << "'" << endl;
|
||
|
assert(false); throw 1;
|
||
|
}
|
||
|
if(ctoks[0][0] == 'C') {
|
||
|
string tmp = ctoks[0].substr(1);
|
||
|
// Parse constant penalty
|
||
|
istringstream tmpss(tmp);
|
||
|
tmpss >> penN;
|
||
|
// Parse constant penalty
|
||
|
penNType = COST_MODEL_CONSTANT;
|
||
|
} else if(ctoks[0][0] == 'Q') {
|
||
|
// Set type to =quality
|
||
|
penNType = COST_MODEL_QUAL;
|
||
|
} else if(ctoks[0][0] == 'R') {
|
||
|
// Set type to=Maq-quality
|
||
|
penNType = COST_MODEL_ROUNDED_QUAL;
|
||
|
} else {
|
||
|
cerr << "Error parsing alignment policy setting "
|
||
|
<< "'" << tag.c_str() << "'"
|
||
|
<< "; RHS must start with C, Q or R" << endl
|
||
|
<< "Policy: '" << s.c_str() << "'" << endl;
|
||
|
assert(false); throw 1;
|
||
|
}
|
||
|
}
|
||
|
// Scoring for read gaps
|
||
|
// RDG=xx,yy,zz
|
||
|
// xx = read gap open penalty
|
||
|
// yy = read gap extension penalty constant coefficient
|
||
|
// (defaults to open penalty)
|
||
|
// zz = read gap extension penalty linear coefficient
|
||
|
// (defaults to 0)
|
||
|
else if(tag == "RDG") {
|
||
|
if(ctoks.size() >= 1) {
|
||
|
istringstream tmpss(ctoks[0]);
|
||
|
tmpss >> penRdExConst;
|
||
|
} else {
|
||
|
penRdExConst = noisyHpolymer ?
|
||
|
DEFAULT_READ_GAP_CONST_BADHPOLY :
|
||
|
DEFAULT_READ_GAP_CONST;
|
||
|
}
|
||
|
if(ctoks.size() >= 2) {
|
||
|
istringstream tmpss(ctoks[1]);
|
||
|
tmpss >> penRdExLinear;
|
||
|
} else {
|
||
|
penRdExLinear = noisyHpolymer ?
|
||
|
DEFAULT_READ_GAP_LINEAR_BADHPOLY :
|
||
|
DEFAULT_READ_GAP_LINEAR;
|
||
|
}
|
||
|
}
|
||
|
// Scoring for reference gaps
|
||
|
// RFG=xx,yy,zz
|
||
|
// xx = ref gap open penalty
|
||
|
// yy = ref gap extension penalty constant coefficient
|
||
|
// (defaults to open penalty)
|
||
|
// zz = ref gap extension penalty linear coefficient
|
||
|
// (defaults to 0)
|
||
|
else if(tag == "RFG") {
|
||
|
if(ctoks.size() >= 1) {
|
||
|
istringstream tmpss(ctoks[0]);
|
||
|
tmpss >> penRfExConst;
|
||
|
} else {
|
||
|
penRfExConst = noisyHpolymer ?
|
||
|
DEFAULT_REF_GAP_CONST_BADHPOLY :
|
||
|
DEFAULT_REF_GAP_CONST;
|
||
|
}
|
||
|
if(ctoks.size() >= 2) {
|
||
|
istringstream tmpss(ctoks[1]);
|
||
|
tmpss >> penRfExLinear;
|
||
|
} else {
|
||
|
penRfExLinear = noisyHpolymer ?
|
||
|
DEFAULT_REF_GAP_LINEAR_BADHPOLY :
|
||
|
DEFAULT_REF_GAP_LINEAR;
|
||
|
}
|
||
|
}
|
||
|
// Minimum score as a function of read length
|
||
|
// MIN=xx,yy
|
||
|
// xx = constant coefficient
|
||
|
// yy = linear coefficient
|
||
|
else if(tag == "MIN") {
|
||
|
PARSE_FUNC(costMin);
|
||
|
}
|
||
|
// Per-read N ceiling as a function of read length
|
||
|
// NCEIL=xx,yy
|
||
|
// xx = N ceiling constant coefficient
|
||
|
// yy = N ceiling linear coefficient (set to 0 if unspecified)
|
||
|
else if(tag == "NCEIL") {
|
||
|
PARSE_FUNC(nCeil);
|
||
|
}
|
||
|
/*
|
||
|
* Seeds
|
||
|
* -----
|
||
|
*
|
||
|
* SEED=mm,len,ival (default: SEED=0,22)
|
||
|
*
|
||
|
* mm = Maximum number of mismatches allowed within a seed.
|
||
|
* Must be >= 0 and <= 2. Note that 2-mismatch mode is
|
||
|
* not fully sensitive; i.e. some 2-mismatch seed
|
||
|
* alignments may be missed.
|
||
|
* len = Length of seed.
|
||
|
* ival = Interval between seeds. If not specified, seed
|
||
|
* interval is determined by IVAL.
|
||
|
*/
|
||
|
else if(tag == "SEED") {
|
||
|
if(ctoks.size() > 2) {
|
||
|
cerr << "Error parsing alignment policy setting "
|
||
|
<< "'" << tag.c_str() << "'; RHS must have 1 or 2 tokens, "
|
||
|
<< "had " << ctoks.size() << ". "
|
||
|
<< "Policy: '" << s.c_str() << "'" << endl;
|
||
|
assert(false); throw 1;
|
||
|
}
|
||
|
if(ctoks.size() >= 1) {
|
||
|
istringstream tmpss(ctoks[0]);
|
||
|
tmpss >> multiseedMms;
|
||
|
if(multiseedMms > 1) {
|
||
|
cerr << "Error: -N was set to " << multiseedMms << ", but cannot be set greater than 1" << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
if(multiseedMms < 0) {
|
||
|
cerr << "Error: -N was set to a number less than 0 (" << multiseedMms << ")" << endl;
|
||
|
throw 1;
|
||
|
}
|
||
|
}
|
||
|
if(ctoks.size() >= 2) {
|
||
|
istringstream tmpss(ctoks[1]);
|
||
|
tmpss >> multiseedLen;
|
||
|
} else {
|
||
|
multiseedLen = DEFAULT_SEEDLEN;
|
||
|
}
|
||
|
}
|
||
|
else if(tag == "SEEDLEN") {
|
||
|
if(ctoks.size() > 1) {
|
||
|
cerr << "Error parsing alignment policy setting "
|
||
|
<< "'" << tag.c_str() << "'; RHS must have 1 token, "
|
||
|
<< "had " << ctoks.size() << ". "
|
||
|
<< "Policy: '" << s.c_str() << "'" << endl;
|
||
|
assert(false); throw 1;
|
||
|
}
|
||
|
if(ctoks.size() >= 1) {
|
||
|
istringstream tmpss(ctoks[0]);
|
||
|
tmpss >> multiseedLen;
|
||
|
}
|
||
|
}
|
||
|
else if(tag == "DPS") {
|
||
|
if(ctoks.size() > 1) {
|
||
|
cerr << "Error parsing alignment policy setting "
|
||
|
<< "'" << tag.c_str() << "'; RHS must have 1 token, "
|
||
|
<< "had " << ctoks.size() << ". "
|
||
|
<< "Policy: '" << s.c_str() << "'" << endl;
|
||
|
assert(false); throw 1;
|
||
|
}
|
||
|
if(ctoks.size() >= 1) {
|
||
|
istringstream tmpss(ctoks[0]);
|
||
|
tmpss >> failStreak;
|
||
|
}
|
||
|
}
|
||
|
else if(tag == "ROUNDS") {
|
||
|
if(ctoks.size() > 1) {
|
||
|
cerr << "Error parsing alignment policy setting "
|
||
|
<< "'" << tag.c_str() << "'; RHS must have 1 token, "
|
||
|
<< "had " << ctoks.size() << ". "
|
||
|
<< "Policy: '" << s.c_str() << "'" << endl;
|
||
|
assert(false); throw 1;
|
||
|
}
|
||
|
if(ctoks.size() >= 1) {
|
||
|
istringstream tmpss(ctoks[0]);
|
||
|
tmpss >> seedRounds;
|
||
|
}
|
||
|
}
|
||
|
/*
|
||
|
* Seed interval
|
||
|
* -------------
|
||
|
*
|
||
|
* IVAL={L|S|C},a,b (default: IVAL=S,1.0,0.0)
|
||
|
*
|
||
|
* L = let interval between seeds be a linear function of the
|
||
|
* read length. xx and yy are the constant and linear
|
||
|
* coefficients respectively. In other words, the interval
|
||
|
* equals a * len + b, where len is the read length.
|
||
|
* Intervals less than 1 are rounded up to 1.
|
||
|
* S = let interval between seeds be a function of the sqaure
|
||
|
* root of the read length. xx and yy are the
|
||
|
* coefficients. In other words, the interval equals
|
||
|
* a * sqrt(len) + b, where len is the read length.
|
||
|
* Intervals less than 1 are rounded up to 1.
|
||
|
* C = Like S but uses cube root of length instead of square
|
||
|
* root.
|
||
|
*/
|
||
|
else if(tag == "IVAL") {
|
||
|
PARSE_FUNC(multiseedIval);
|
||
|
}
|
||
|
else if(tag == "CANINTRONLEN") {
|
||
|
assert(penCanIntronLen != NULL);
|
||
|
PARSE_FUNC((*penCanIntronLen));
|
||
|
}
|
||
|
else if(tag == "NONCANINTRONLEN") {
|
||
|
assert(penNoncanIntronLen != NULL);
|
||
|
PARSE_FUNC((*penNoncanIntronLen));
|
||
|
}
|
||
|
else {
|
||
|
// Unknown tag
|
||
|
cerr << "Unexpected alignment policy setting "
|
||
|
<< "'" << tag.c_str() << "'" << endl
|
||
|
<< "Policy: '" << s.c_str() << "'" << endl;
|
||
|
assert(false); throw 1;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
#ifdef ALIGNER_SEED_POLICY_MAIN
|
||
|
int main() {
|
||
|
|
||
|
int bonusMatchType;
|
||
|
int bonusMatch;
|
||
|
int penMmcType;
|
||
|
int penMmc;
|
||
|
int penScMax;
|
||
|
int penScMin;
|
||
|
int penNType;
|
||
|
int penN;
|
||
|
int penRdExConst;
|
||
|
int penRfExConst;
|
||
|
int penRdExLinear;
|
||
|
int penRfExLinear;
|
||
|
SimpleFunc costMin;
|
||
|
SimpleFunc costFloor;
|
||
|
SimpleFunc nCeil;
|
||
|
bool nCatPair;
|
||
|
int multiseedMms;
|
||
|
int multiseedLen;
|
||
|
SimpleFunc msIval;
|
||
|
SimpleFunc posfrac;
|
||
|
SimpleFunc rowmult;
|
||
|
uint32_t mhits;
|
||
|
|
||
|
{
|
||
|
cout << "Case 1: Defaults 1 ... ";
|
||
|
const char *pol = "";
|
||
|
SeedAlignmentPolicy::parseString(
|
||
|
string(pol),
|
||
|
false, // --local?
|
||
|
false, // noisy homopolymers a la 454?
|
||
|
false, // ignore qualities?
|
||
|
bonusMatchType,
|
||
|
bonusMatch,
|
||
|
penMmcType,
|
||
|
penMmc,
|
||
|
penScMax,
|
||
|
penScMin,
|
||
|
penNType,
|
||
|
penN,
|
||
|
penRdExConst,
|
||
|
penRfExConst,
|
||
|
penRdExLinear,
|
||
|
penRfExLinear,
|
||
|
costMin,
|
||
|
costFloor,
|
||
|
nCeil,
|
||
|
nCatPair,
|
||
|
multiseedMms,
|
||
|
multiseedLen,
|
||
|
msIval,
|
||
|
mhits);
|
||
|
|
||
|
assert_eq(DEFAULT_MATCH_BONUS_TYPE, bonusMatchType);
|
||
|
assert_eq(DEFAULT_MATCH_BONUS, bonusMatch);
|
||
|
assert_eq(DEFAULT_MM_PENALTY_TYPE, penMmcType);
|
||
|
assert_eq(DEFAULT_MM_PENALTY_MAX, penMmcMax);
|
||
|
assert_eq(DEFAULT_MM_PENALTY_MIN, penMmcMin);
|
||
|
assert_eq(DEFAULT_N_PENALTY_TYPE, penNType);
|
||
|
assert_eq(DEFAULT_N_PENALTY, penN);
|
||
|
assert_eq(DEFAULT_MIN_CONST, costMin.getConst());
|
||
|
assert_eq(DEFAULT_MIN_LINEAR, costMin.getCoeff());
|
||
|
assert_eq(DEFAULT_FLOOR_CONST, costFloor.getConst());
|
||
|
assert_eq(DEFAULT_FLOOR_LINEAR, costFloor.getCoeff());
|
||
|
assert_eq(DEFAULT_N_CEIL_CONST, nCeil.getConst());
|
||
|
assert_eq(DEFAULT_N_CAT_PAIR, nCatPair);
|
||
|
|
||
|
assert_eq(DEFAULT_READ_GAP_CONST, penRdExConst);
|
||
|
assert_eq(DEFAULT_READ_GAP_LINEAR, penRdExLinear);
|
||
|
assert_eq(DEFAULT_REF_GAP_CONST, penRfExConst);
|
||
|
assert_eq(DEFAULT_REF_GAP_LINEAR, penRfExLinear);
|
||
|
assert_eq(DEFAULT_SEEDMMS, multiseedMms);
|
||
|
assert_eq(DEFAULT_SEEDLEN, multiseedLen);
|
||
|
assert_eq(DEFAULT_IVAL, msIval.getType());
|
||
|
assert_eq(DEFAULT_IVAL_A, msIval.getCoeff());
|
||
|
assert_eq(DEFAULT_IVAL_B, msIval.getConst());
|
||
|
|
||
|
cout << "PASSED" << endl;
|
||
|
}
|
||
|
|
||
|
{
|
||
|
cout << "Case 2: Defaults 2 ... ";
|
||
|
const char *pol = "";
|
||
|
SeedAlignmentPolicy::parseString(
|
||
|
string(pol),
|
||
|
false, // --local?
|
||
|
true, // noisy homopolymers a la 454?
|
||
|
false, // ignore qualities?
|
||
|
bonusMatchType,
|
||
|
bonusMatch,
|
||
|
penMmcType,
|
||
|
penMmc,
|
||
|
|
||
|
penNType,
|
||
|
penN,
|
||
|
penRdExConst,
|
||
|
penRfExConst,
|
||
|
penRdExLinear,
|
||
|
penRfExLinear,
|
||
|
costMin,
|
||
|
costFloor,
|
||
|
nCeil,
|
||
|
nCatPair,
|
||
|
multiseedMms,
|
||
|
multiseedLen,
|
||
|
msIval,
|
||
|
mhits);
|
||
|
|
||
|
assert_eq(DEFAULT_MATCH_BONUS_TYPE, bonusMatchType);
|
||
|
assert_eq(DEFAULT_MATCH_BONUS, bonusMatch);
|
||
|
assert_eq(DEFAULT_MM_PENALTY_TYPE, penMmcType);
|
||
|
assert_eq(DEFAULT_MM_PENALTY_MAX, penMmc);
|
||
|
assert_eq(DEFAULT_MM_PENALTY_MIN, penMmc);
|
||
|
assert_eq(DEFAULT_N_PENALTY_TYPE, penNType);
|
||
|
assert_eq(DEFAULT_N_PENALTY, penN);
|
||
|
assert_eq(DEFAULT_MIN_CONST, costMin.getConst());
|
||
|
assert_eq(DEFAULT_MIN_LINEAR, costMin.getCoeff());
|
||
|
assert_eq(DEFAULT_FLOOR_CONST, costFloor.getConst());
|
||
|
assert_eq(DEFAULT_FLOOR_LINEAR, costFloor.getCoeff());
|
||
|
assert_eq(DEFAULT_N_CEIL_CONST, nCeil.getConst());
|
||
|
assert_eq(DEFAULT_N_CAT_PAIR, nCatPair);
|
||
|
|
||
|
assert_eq(DEFAULT_READ_GAP_CONST_BADHPOLY, penRdExConst);
|
||
|
assert_eq(DEFAULT_READ_GAP_LINEAR_BADHPOLY, penRdExLinear);
|
||
|
assert_eq(DEFAULT_REF_GAP_CONST_BADHPOLY, penRfExConst);
|
||
|
assert_eq(DEFAULT_REF_GAP_LINEAR_BADHPOLY, penRfExLinear);
|
||
|
assert_eq(DEFAULT_SEEDMMS, multiseedMms);
|
||
|
assert_eq(DEFAULT_SEEDLEN, multiseedLen);
|
||
|
assert_eq(DEFAULT_IVAL, msIval.getType());
|
||
|
assert_eq(DEFAULT_IVAL_A, msIval.getCoeff());
|
||
|
assert_eq(DEFAULT_IVAL_B, msIval.getConst());
|
||
|
|
||
|
cout << "PASSED" << endl;
|
||
|
}
|
||
|
|
||
|
{
|
||
|
cout << "Case 3: Defaults 3 ... ";
|
||
|
const char *pol = "";
|
||
|
SeedAlignmentPolicy::parseString(
|
||
|
string(pol),
|
||
|
true, // --local?
|
||
|
false, // noisy homopolymers a la 454?
|
||
|
false, // ignore qualities?
|
||
|
bonusMatchType,
|
||
|
bonusMatch,
|
||
|
penMmcType,
|
||
|
penMmc,
|
||
|
penNType,
|
||
|
penN,
|
||
|
penRdExConst,
|
||
|
penRfExConst,
|
||
|
penRdExLinear,
|
||
|
penRfExLinear,
|
||
|
costMin,
|
||
|
costFloor,
|
||
|
nCeil,
|
||
|
nCatPair,
|
||
|
multiseedMms,
|
||
|
multiseedLen,
|
||
|
msIval,
|
||
|
mhits);
|
||
|
|
||
|
assert_eq(DEFAULT_MATCH_BONUS_TYPE_LOCAL, bonusMatchType);
|
||
|
assert_eq(DEFAULT_MATCH_BONUS_LOCAL, bonusMatch);
|
||
|
assert_eq(DEFAULT_MM_PENALTY_TYPE, penMmcType);
|
||
|
assert_eq(DEFAULT_MM_PENALTY_MAX, penMmcMax);
|
||
|
assert_eq(DEFAULT_MM_PENALTY_MIN, penMmcMin);
|
||
|
assert_eq(DEFAULT_N_PENALTY_TYPE, penNType);
|
||
|
assert_eq(DEFAULT_N_PENALTY, penN);
|
||
|
assert_eq(DEFAULT_MIN_CONST_LOCAL, costMin.getConst());
|
||
|
assert_eq(DEFAULT_MIN_LINEAR_LOCAL, costMin.getCoeff());
|
||
|
assert_eq(DEFAULT_FLOOR_CONST_LOCAL, costFloor.getConst());
|
||
|
assert_eq(DEFAULT_FLOOR_LINEAR_LOCAL, costFloor.getCoeff());
|
||
|
assert_eq(DEFAULT_N_CEIL_CONST, nCeil.getConst());
|
||
|
assert_eq(DEFAULT_N_CEIL_LINEAR, nCeil.getCoeff());
|
||
|
assert_eq(DEFAULT_N_CAT_PAIR, nCatPair);
|
||
|
|
||
|
assert_eq(DEFAULT_READ_GAP_CONST, penRdExConst);
|
||
|
assert_eq(DEFAULT_READ_GAP_LINEAR, penRdExLinear);
|
||
|
assert_eq(DEFAULT_REF_GAP_CONST, penRfExConst);
|
||
|
assert_eq(DEFAULT_REF_GAP_LINEAR, penRfExLinear);
|
||
|
assert_eq(DEFAULT_SEEDMMS, multiseedMms);
|
||
|
assert_eq(DEFAULT_SEEDLEN, multiseedLen);
|
||
|
assert_eq(DEFAULT_IVAL, msIval.getType());
|
||
|
assert_eq(DEFAULT_IVAL_A, msIval.getCoeff());
|
||
|
assert_eq(DEFAULT_IVAL_B, msIval.getConst());
|
||
|
|
||
|
cout << "PASSED" << endl;
|
||
|
}
|
||
|
|
||
|
{
|
||
|
cout << "Case 4: Simple string 1 ... ";
|
||
|
const char *pol = "MMP=C44;MA=4;RFG=24,12;FL=C,8;RDG=2;NP=C4;MIN=C,7";
|
||
|
SeedAlignmentPolicy::parseString(
|
||
|
string(pol),
|
||
|
true, // --local?
|
||
|
false, // noisy homopolymers a la 454?
|
||
|
false, // ignore qualities?
|
||
|
bonusMatchType,
|
||
|
bonusMatch,
|
||
|
penMmcType,
|
||
|
penMmc,
|
||
|
penNType,
|
||
|
penN,
|
||
|
penRdExConst,
|
||
|
penRfExConst,
|
||
|
penRdExLinear,
|
||
|
penRfExLinear,
|
||
|
costMin,
|
||
|
costFloor,
|
||
|
nCeil,
|
||
|
nCatPair,
|
||
|
multiseedMms,
|
||
|
multiseedLen,
|
||
|
msIval,
|
||
|
mhits);
|
||
|
|
||
|
assert_eq(COST_MODEL_CONSTANT, bonusMatchType);
|
||
|
assert_eq(4, bonusMatch);
|
||
|
assert_eq(COST_MODEL_CONSTANT, penMmcType);
|
||
|
assert_eq(44, penMmc);
|
||
|
assert_eq(COST_MODEL_CONSTANT, penNType);
|
||
|
assert_eq(4.0f, penN);
|
||
|
assert_eq(7, costMin.getConst());
|
||
|
assert_eq(DEFAULT_MIN_LINEAR_LOCAL, costMin.getCoeff());
|
||
|
assert_eq(8, costFloor.getConst());
|
||
|
assert_eq(DEFAULT_FLOOR_LINEAR_LOCAL, costFloor.getCoeff());
|
||
|
assert_eq(DEFAULT_N_CEIL_CONST, nCeil.getConst());
|
||
|
assert_eq(DEFAULT_N_CEIL_LINEAR, nCeil.getCoeff());
|
||
|
assert_eq(DEFAULT_N_CAT_PAIR, nCatPair);
|
||
|
|
||
|
assert_eq(2.0f, penRdExConst);
|
||
|
assert_eq(DEFAULT_READ_GAP_LINEAR, penRdExLinear);
|
||
|
assert_eq(24.0f, penRfExConst);
|
||
|
assert_eq(12.0f, penRfExLinear);
|
||
|
assert_eq(DEFAULT_SEEDMMS, multiseedMms);
|
||
|
assert_eq(DEFAULT_SEEDLEN, multiseedLen);
|
||
|
assert_eq(DEFAULT_IVAL, msIval.getType());
|
||
|
assert_eq(DEFAULT_IVAL_A, msIval.getCoeff());
|
||
|
assert_eq(DEFAULT_IVAL_B, msIval.getConst());
|
||
|
|
||
|
cout << "PASSED" << endl;
|
||
|
}
|
||
|
}
|
||
|
#endif /*def ALIGNER_SEED_POLICY_MAIN*/
|