532 lines
14 KiB
C
532 lines
14 KiB
C
|
/*
|
||
|
* Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
|
||
|
*
|
||
|
* This file is part of Bowtie 2.
|
||
|
*
|
||
|
* Bowtie 2 is free software: you can redistribute it and/or modify
|
||
|
* it under the terms of the GNU General Public License as published by
|
||
|
* the Free Software Foundation, either version 3 of the License, or
|
||
|
* (at your option) any later version.
|
||
|
*
|
||
|
* Bowtie 2 is distributed in the hope that it will be useful,
|
||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
|
* GNU General Public License for more details.
|
||
|
*
|
||
|
* You should have received a copy of the GNU General Public License
|
||
|
* along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
|
||
|
*/
|
||
|
|
||
|
/*
|
||
|
* unique.h
|
||
|
*
|
||
|
* Encapsulates objects and routines for determining whether and to
|
||
|
* what extend the best alignment for a read is "unique." In the
|
||
|
* simplest scenario, uniqueness is determined by whether we found only
|
||
|
* one alignment. More complex scenarios might assign a uniqueness
|
||
|
* score based that is a function of (of a summarized version of): all
|
||
|
* the alignments found and their scores.
|
||
|
*
|
||
|
* Since mapping quality is related to uniqueness, objects and
|
||
|
* routings for calculating mapping quality are also included here.
|
||
|
*/
|
||
|
|
||
|
#ifndef UNIQUE_H_
|
||
|
#define UNIQUE_H_
|
||
|
|
||
|
#include <string>
|
||
|
#include "aligner_result.h"
|
||
|
#include "simple_func.h"
|
||
|
#include "util.h"
|
||
|
#include "scoring.h"
|
||
|
|
||
|
typedef int64_t TMapq;
|
||
|
|
||
|
/**
|
||
|
* Class that returns yes-or-no answers to the question of whether a
|
||
|
*/
|
||
|
class Uniqueness {
|
||
|
public:
|
||
|
|
||
|
/**
|
||
|
* Given an AlnSetSumm, determine if the best alignment is "unique"
|
||
|
* according to some definition.
|
||
|
*/
|
||
|
static bool bestIsUnique(
|
||
|
const AlnSetSumm& s,
|
||
|
const AlnFlags& flags,
|
||
|
bool mate1,
|
||
|
size_t rdlen,
|
||
|
size_t ordlen,
|
||
|
char *inps)
|
||
|
{
|
||
|
assert(!s.empty());
|
||
|
return !VALID_AL_SCORE(s.secbest(mate1));
|
||
|
}
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* Collection of routines for calculating mapping quality.
|
||
|
*/
|
||
|
class Mapq {
|
||
|
|
||
|
public:
|
||
|
|
||
|
virtual ~Mapq() { }
|
||
|
|
||
|
virtual TMapq mapq(
|
||
|
const AlnSetSumm& s,
|
||
|
const AlnFlags& flags,
|
||
|
bool mate1,
|
||
|
size_t rdlen,
|
||
|
size_t ordlen,
|
||
|
char *inps) const = 0;
|
||
|
};
|
||
|
|
||
|
extern const TMapq unp_nosec_perf;
|
||
|
extern const TMapq unp_nosec[11];
|
||
|
extern const TMapq unp_sec_perf[11];
|
||
|
extern const TMapq unp_sec[11][11];
|
||
|
extern const TMapq pair_nosec_perf;
|
||
|
|
||
|
/**
|
||
|
* V3 of the MAPQ calculator
|
||
|
*/
|
||
|
class BowtieMapq3 : public Mapq {
|
||
|
|
||
|
public:
|
||
|
|
||
|
BowtieMapq3(
|
||
|
const SimpleFunc& scoreMin,
|
||
|
const Scoring& sc) :
|
||
|
scoreMin_(scoreMin),
|
||
|
sc_(sc)
|
||
|
{ }
|
||
|
|
||
|
virtual ~BowtieMapq3() { }
|
||
|
|
||
|
/**
|
||
|
* Given an AlnSetSumm, return a mapping quality calculated.
|
||
|
*/
|
||
|
virtual TMapq mapq(
|
||
|
const AlnSetSumm& s,
|
||
|
const AlnFlags& flags,
|
||
|
bool mate1,
|
||
|
size_t rdlen,
|
||
|
size_t ordlen,
|
||
|
char *inps) // put string representation of inputs here
|
||
|
const
|
||
|
{
|
||
|
if(s.paired()) {
|
||
|
return pair_nosec_perf;
|
||
|
} else {
|
||
|
bool hasSecbest = VALID_AL_SCORE(s.secbest(mate1));
|
||
|
if(!flags.canMax() && !s.exhausted(mate1) && !hasSecbest) {
|
||
|
return 255;
|
||
|
}
|
||
|
TAlScore scMax = (TAlScore)sc_.perfectScore(rdlen);
|
||
|
TAlScore scMin = scoreMin_.f<TAlScore>((float)rdlen);
|
||
|
assert_geq(scMax, scMin);
|
||
|
TAlScore best = scMax - s.best(mate1).score(); // best score (lower=better)
|
||
|
size_t best_bin = (size_t)((double)best * (10.0 / (double)(scMax - scMin)) + 0.5);
|
||
|
assert_geq(best_bin, 0);
|
||
|
assert_lt(best_bin, 11);
|
||
|
if(hasSecbest) {
|
||
|
assert_geq(s.best(mate1).score(), s.secbest(mate1).score());
|
||
|
size_t diff = s.best(mate1).score() - s.secbest(mate1).score();
|
||
|
size_t diff_bin = (size_t)((double)diff * (10.0 / (double)(scMax - scMin)) + 0.5);
|
||
|
assert_geq(diff_bin, 0);
|
||
|
assert_lt(diff_bin, 11);
|
||
|
// A valid second-best alignment was found
|
||
|
if(best == scMax) {
|
||
|
// Best alignment has perfect score
|
||
|
return unp_sec_perf[best_bin];
|
||
|
} else {
|
||
|
// Best alignment has less than perfect score
|
||
|
return unp_sec[diff_bin][best_bin];
|
||
|
}
|
||
|
} else {
|
||
|
// No valid second-best alignment was found
|
||
|
if(best == scMax) {
|
||
|
// Best alignment has perfect score
|
||
|
return unp_nosec_perf;
|
||
|
} else {
|
||
|
// Best alignment has less than perfect score
|
||
|
return unp_nosec[best_bin];
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
protected:
|
||
|
|
||
|
SimpleFunc scoreMin_;
|
||
|
const Scoring& sc_;
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* V2 of the MAPQ calculator
|
||
|
*/
|
||
|
class BowtieMapq2 : public Mapq {
|
||
|
|
||
|
public:
|
||
|
|
||
|
BowtieMapq2(
|
||
|
const SimpleFunc& scoreMin,
|
||
|
const Scoring& sc) :
|
||
|
scoreMin_(scoreMin),
|
||
|
sc_(sc)
|
||
|
{ }
|
||
|
|
||
|
virtual ~BowtieMapq2() { }
|
||
|
|
||
|
/**
|
||
|
* Given an AlnSetSumm, return a mapping quality calculated.
|
||
|
*/
|
||
|
virtual TMapq mapq(
|
||
|
const AlnSetSumm& s,
|
||
|
const AlnFlags& flags,
|
||
|
bool mate1,
|
||
|
size_t rdlen,
|
||
|
size_t ordlen,
|
||
|
char *inps) // put string representation of inputs here
|
||
|
const
|
||
|
{
|
||
|
// Did the read have a second-best alignment?
|
||
|
bool hasSecbest = s.paired() ?
|
||
|
VALID_AL_SCORE(s.secbestPaired()) :
|
||
|
VALID_AL_SCORE(s.secbest(mate1));
|
||
|
|
||
|
// for hisat
|
||
|
bool equalSecbest = false;
|
||
|
if(hasSecbest) {
|
||
|
if(s.paired()) {
|
||
|
assert_geq(s.bestPaired(), s.secbestPaired());
|
||
|
equalSecbest = s.bestPaired() == s.secbestPaired();
|
||
|
} else {
|
||
|
assert_geq(s.best(mate1), s.secbest(mate1));
|
||
|
equalSecbest = s.best(mate1) == s.secbest(mate1);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// This corresponds to a scenario where we found one and only one
|
||
|
// alignment but didn't really look for a second one
|
||
|
if(!flags.canMax() &&
|
||
|
!s.exhausted(mate1) &&
|
||
|
(!hasSecbest || !equalSecbest)) {
|
||
|
return 60;
|
||
|
}
|
||
|
// scPer = score of a perfect match
|
||
|
TAlScore scPer = (TAlScore)sc_.perfectScore(rdlen);
|
||
|
if(s.paired()) {
|
||
|
scPer += (TAlScore)sc_.perfectScore(ordlen);
|
||
|
}
|
||
|
// scMin = score of a just barely valid match
|
||
|
TAlScore scMin = scoreMin_.f<TAlScore>((float)rdlen);
|
||
|
if(s.paired()) {
|
||
|
scMin += scoreMin_.f<TAlScore>((float)ordlen);
|
||
|
}
|
||
|
TAlScore secbest = scMin-1;
|
||
|
TAlScore diff = (scPer - scMin); // scores can vary by up to this much
|
||
|
TMapq ret = 0;
|
||
|
TAlScore best = s.paired() ?
|
||
|
s.bestPaired().score() : s.best(mate1).score();
|
||
|
// best score but normalized so that 0 = worst valid score
|
||
|
TAlScore bestOver = best - scMin;
|
||
|
if(sc_.monotone) {
|
||
|
// End-to-end alignment
|
||
|
if(!hasSecbest) {
|
||
|
if (bestOver >= diff * (double)0.8f) ret = 42;
|
||
|
else if(bestOver >= diff * (double)0.7f) ret = 40;
|
||
|
else if(bestOver >= diff * (double)0.6f) ret = 24;
|
||
|
else if(bestOver >= diff * (double)0.5f) ret = 23;
|
||
|
else if(bestOver >= diff * (double)0.4f) ret = 8;
|
||
|
else if(bestOver >= diff * (double)0.3f) ret = 3;
|
||
|
else ret = 0;
|
||
|
} else {
|
||
|
secbest = s.paired() ?
|
||
|
s.secbestPaired().score() : s.secbest(mate1).score();
|
||
|
TAlScore bestdiff = abs(abs(static_cast<long>(best))-abs(static_cast<long>(secbest)));
|
||
|
if(bestdiff >= diff * (double)0.9f) {
|
||
|
if(bestOver == diff) {
|
||
|
ret = 39;
|
||
|
} else {
|
||
|
ret = 33;
|
||
|
}
|
||
|
} else if(bestdiff >= diff * (double)0.8f) {
|
||
|
if(bestOver == diff) {
|
||
|
ret = 38;
|
||
|
} else {
|
||
|
ret = 27;
|
||
|
}
|
||
|
} else if(bestdiff >= diff * (double)0.7f) {
|
||
|
if(bestOver == diff) {
|
||
|
ret = 37;
|
||
|
} else {
|
||
|
ret = 26;
|
||
|
}
|
||
|
} else if(bestdiff >= diff * (double)0.6f) {
|
||
|
if(bestOver == diff) {
|
||
|
ret = 36;
|
||
|
} else {
|
||
|
ret = 22;
|
||
|
}
|
||
|
} else if(bestdiff >= diff * (double)0.5f) {
|
||
|
// Top third is still pretty good
|
||
|
if (bestOver == diff) {
|
||
|
ret = 35;
|
||
|
} else if(bestOver >= diff * (double)0.84f) {
|
||
|
ret = 25;
|
||
|
} else if(bestOver >= diff * (double)0.68f) {
|
||
|
ret = 16;
|
||
|
} else {
|
||
|
ret = 5;
|
||
|
}
|
||
|
} else if(bestdiff >= diff * (double)0.4f) {
|
||
|
// Top third is still pretty good
|
||
|
if (bestOver == diff) {
|
||
|
ret = 34;
|
||
|
} else if(bestOver >= diff * (double)0.84f) {
|
||
|
ret = 21;
|
||
|
} else if(bestOver >= diff * (double)0.68f) {
|
||
|
ret = 14;
|
||
|
} else {
|
||
|
ret = 4;
|
||
|
}
|
||
|
} else if(bestdiff >= diff * (double)0.3f) {
|
||
|
// Top third is still pretty good
|
||
|
if (bestOver == diff) {
|
||
|
ret = 32;
|
||
|
} else if(bestOver >= diff * (double)0.88f) {
|
||
|
ret = 18;
|
||
|
} else if(bestOver >= diff * (double)0.67f) {
|
||
|
ret = 15;
|
||
|
} else {
|
||
|
ret = 3;
|
||
|
}
|
||
|
} else if(bestdiff >= diff * (double)0.2f) {
|
||
|
// Top third is still pretty good
|
||
|
if (bestOver == diff) {
|
||
|
ret = 31;
|
||
|
} else if(bestOver >= diff * (double)0.88f) {
|
||
|
ret = 17;
|
||
|
} else if(bestOver >= diff * (double)0.67f) {
|
||
|
ret = 11;
|
||
|
} else {
|
||
|
ret = 0;
|
||
|
}
|
||
|
} else if(bestdiff >= diff * (double)0.1f) {
|
||
|
// Top third is still pretty good
|
||
|
if (bestOver == diff) {
|
||
|
ret = 30;
|
||
|
} else if(bestOver >= diff * (double)0.88f) {
|
||
|
ret = 12;
|
||
|
} else if(bestOver >= diff * (double)0.67f) {
|
||
|
ret = 7;
|
||
|
} else {
|
||
|
ret = 0;
|
||
|
}
|
||
|
} else if(bestdiff > 0) {
|
||
|
// Top third is still pretty good
|
||
|
if(bestOver >= diff * (double)0.67f) {
|
||
|
ret = 6;
|
||
|
} else {
|
||
|
ret = 2;
|
||
|
}
|
||
|
} else {
|
||
|
assert_eq(bestdiff, 0);
|
||
|
// Top third is still pretty good
|
||
|
if(bestOver >= diff * (double)0.67f) {
|
||
|
ret = 1;
|
||
|
} else {
|
||
|
ret = 0;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
} else {
|
||
|
// Local alignment
|
||
|
if(!hasSecbest) {
|
||
|
if (bestOver >= diff * (double)0.8f) ret = 44;
|
||
|
else if(bestOver >= diff * (double)0.7f) ret = 42;
|
||
|
else if(bestOver >= diff * (double)0.6f) ret = 41;
|
||
|
else if(bestOver >= diff * (double)0.5f) ret = 36;
|
||
|
else if(bestOver >= diff * (double)0.4f) ret = 28;
|
||
|
else if(bestOver >= diff * (double)0.3f) ret = 24;
|
||
|
else ret = 22;
|
||
|
} else {
|
||
|
secbest = s.paired() ?
|
||
|
s.secbestPaired().score() : s.secbest(mate1).score();
|
||
|
TAlScore bestdiff = abs(abs(static_cast<long>(best))-abs(static_cast<long>(secbest)));
|
||
|
if (bestdiff >= diff * (double)0.9f) ret = 40;
|
||
|
else if(bestdiff >= diff * (double)0.8f) ret = 39;
|
||
|
else if(bestdiff >= diff * (double)0.7f) ret = 38;
|
||
|
else if(bestdiff >= diff * (double)0.6f) ret = 37;
|
||
|
else if(bestdiff >= diff * (double)0.5f) {
|
||
|
if (bestOver == diff) ret = 35;
|
||
|
else if(bestOver >= diff * (double)0.50f) ret = 25;
|
||
|
else ret = 20;
|
||
|
} else if(bestdiff >= diff * (double)0.4f) {
|
||
|
if (bestOver == diff) ret = 34;
|
||
|
else if(bestOver >= diff * (double)0.50f) ret = 21;
|
||
|
else ret = 19;
|
||
|
} else if(bestdiff >= diff * (double)0.3f) {
|
||
|
if (bestOver == diff) ret = 33;
|
||
|
else if(bestOver >= diff * (double)0.5f) ret = 18;
|
||
|
else ret = 16;
|
||
|
} else if(bestdiff >= diff * (double)0.2f) {
|
||
|
if (bestOver == diff) ret = 32;
|
||
|
else if(bestOver >= diff * (double)0.5f) ret = 17;
|
||
|
else ret = 12;
|
||
|
} else if(bestdiff >= diff * (double)0.1f) {
|
||
|
if (bestOver == diff) ret = 31;
|
||
|
else if(bestOver >= diff * (double)0.5f) ret = 14;
|
||
|
else ret = 9;
|
||
|
} else if(bestdiff > 0) {
|
||
|
if(bestOver >= diff * (double)0.5f) ret = 11;
|
||
|
else ret = 2;
|
||
|
} else {
|
||
|
assert_eq(bestdiff, 0);
|
||
|
if(bestOver >= diff * (double)0.5f) ret = 1;
|
||
|
else ret = 0;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
// Note: modifications to inps must be synchronized
|
||
|
//if(inps != NULL) {
|
||
|
// inps = itoa10<TAlScore>(best, inps);
|
||
|
// *inps++ = ',';
|
||
|
// inps = itoa10<TAlScore>(secbest, inps);
|
||
|
// *inps++ = ',';
|
||
|
// inps = itoa10<TMapq>(ret, inps);
|
||
|
//}
|
||
|
return ret;
|
||
|
}
|
||
|
|
||
|
protected:
|
||
|
|
||
|
SimpleFunc scoreMin_;
|
||
|
const Scoring& sc_;
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* TODO: Do BowtieMapq on a per-thread basis prior to the mutex'ed output
|
||
|
* function.
|
||
|
*
|
||
|
* topCoeff :: top_coeff
|
||
|
* botCoeff :: bot_coeff
|
||
|
* mx :: mapqMax
|
||
|
* horiz :: mapqHorizon (sort of)
|
||
|
*
|
||
|
* sc1 <- tab$sc1
|
||
|
* sc2 <- tab$sc2
|
||
|
* mapq <- rep(mx, length(sc1))
|
||
|
* diff_top <- ifelse(sc1 != best & sc2 != best, abs(best - abs(pmax(sc1, sc2))), 0)
|
||
|
* mapq <- mapq - diff_top * top_coeff
|
||
|
* diff_bot <- ifelse(sc2 != horiz, abs(abs(sc2) - abs(horiz)), 0)
|
||
|
* mapq <- mapq - diff_bot * bot_coeff
|
||
|
* mapq <- round(pmax(0, pmin(mx, mapq)))
|
||
|
* tab$mapq <- mapq
|
||
|
*/
|
||
|
class BowtieMapq : public Mapq {
|
||
|
|
||
|
public:
|
||
|
|
||
|
BowtieMapq(
|
||
|
const SimpleFunc& scoreMin,
|
||
|
const Scoring& sc) :
|
||
|
scoreMin_(scoreMin),
|
||
|
sc_(sc)
|
||
|
{ }
|
||
|
|
||
|
virtual ~BowtieMapq() { }
|
||
|
|
||
|
/**
|
||
|
* Given an AlnSetSumm, return a mapping quality calculated.
|
||
|
*/
|
||
|
virtual TMapq mapq(
|
||
|
const AlnSetSumm& s,
|
||
|
const AlnFlags& flags,
|
||
|
bool mate1,
|
||
|
size_t rdlen,
|
||
|
size_t ordlen,
|
||
|
char *inps) // put string representation of inputs here
|
||
|
const
|
||
|
{
|
||
|
bool hasSecbest = VALID_AL_SCORE(s.secbest(mate1));
|
||
|
if(!flags.canMax() && !s.exhausted(mate1) && !hasSecbest) {
|
||
|
return 255;
|
||
|
}
|
||
|
TAlScore scPer = (TAlScore)sc_.perfectScore(rdlen);
|
||
|
TAlScore scMin = scoreMin_.f<TAlScore>((float)rdlen);
|
||
|
TAlScore secbest = scMin-1;
|
||
|
TAlScore diff = (scPer - scMin);
|
||
|
float sixth_2 = (float)(scPer - diff * (double)0.1666f * 2);
|
||
|
float sixth_3 = (float)(scPer - diff * (double)0.1666f * 3);
|
||
|
TMapq ret = 0;
|
||
|
TAlScore best = s.best(mate1).score();
|
||
|
if(!hasSecbest) {
|
||
|
// Top third?
|
||
|
if(best >= sixth_2) {
|
||
|
ret = 37;
|
||
|
}
|
||
|
// Otherwise in top half?
|
||
|
else if(best >= sixth_3) {
|
||
|
ret = 25;
|
||
|
}
|
||
|
// Otherwise has no second-best?
|
||
|
else {
|
||
|
ret = 10;
|
||
|
}
|
||
|
} else {
|
||
|
secbest = s.secbest(mate1).score();
|
||
|
TAlScore bestdiff = abs(abs(static_cast<long>(best))-abs(static_cast<long>(secbest)));
|
||
|
if(bestdiff >= diff * 0.1666 * 5) {
|
||
|
ret = 6;
|
||
|
} else if(bestdiff >= diff * 0.1666 * 4) {
|
||
|
ret = 5;
|
||
|
} else if(bestdiff >= diff * 0.1666 * 3) {
|
||
|
ret = 4;
|
||
|
} else if(bestdiff >= diff * 0.1666 * 2) {
|
||
|
ret = 3;
|
||
|
} else if(bestdiff >= diff * 0.1666 * 1) {
|
||
|
ret = 2;
|
||
|
} else {
|
||
|
ret = 1;
|
||
|
}
|
||
|
}
|
||
|
// Note: modifications to inps must be synchronized
|
||
|
//if(inps != NULL) {
|
||
|
// inps = itoa10<TAlScore>(best, inps);
|
||
|
// *inps++ = ',';
|
||
|
// inps = itoa10<TAlScore>(secbest, inps);
|
||
|
// *inps++ = ',';
|
||
|
// inps = itoa10<TMapq>(ret, inps);
|
||
|
//}
|
||
|
return ret;
|
||
|
}
|
||
|
|
||
|
protected:
|
||
|
|
||
|
SimpleFunc scoreMin_;
|
||
|
const Scoring& sc_;
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* Create and return new MAPQ calculating object.
|
||
|
*/
|
||
|
static inline Mapq *new_mapq(
|
||
|
int version,
|
||
|
const SimpleFunc& scoreMin,
|
||
|
const Scoring& sc)
|
||
|
{
|
||
|
if(version == 3) {
|
||
|
return new BowtieMapq3(scoreMin, sc);
|
||
|
} else if(version == 2) {
|
||
|
return new BowtieMapq2(scoreMin, sc);
|
||
|
} else {
|
||
|
return new BowtieMapq(scoreMin, sc);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
#endif /*ndef UNIQUE_H_*/
|