353 lines
11 KiB
C
353 lines
11 KiB
C
|
/*
|
||
|
* Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
|
||
|
*
|
||
|
* This file is part of Bowtie 2.
|
||
|
*
|
||
|
* Bowtie 2 is free software: you can redistribute it and/or modify
|
||
|
* it under the terms of the GNU General Public License as published by
|
||
|
* the Free Software Foundation, either version 3 of the License, or
|
||
|
* (at your option) any later version.
|
||
|
*
|
||
|
* Bowtie 2 is distributed in the hope that it will be useful,
|
||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
|
* GNU General Public License for more details.
|
||
|
*
|
||
|
* You should have received a copy of the GNU General Public License
|
||
|
* along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
|
||
|
*/
|
||
|
|
||
|
#ifndef ALIGNER_METRICS_H_
|
||
|
#define ALIGNER_METRICS_H_
|
||
|
|
||
|
#include <math.h>
|
||
|
#include <iostream>
|
||
|
#include "alphabet.h"
|
||
|
#include "timer.h"
|
||
|
#include "sstring.h"
|
||
|
|
||
|
using namespace std;
|
||
|
|
||
|
/**
|
||
|
* Borrowed from http://www.johndcook.com/standard_deviation.html,
|
||
|
* which in turn is borrowed from Knuth.
|
||
|
*/
|
||
|
class RunningStat {
|
||
|
public:
|
||
|
RunningStat() : m_n(0), m_tot(0.0) { }
|
||
|
|
||
|
void clear() {
|
||
|
m_n = 0;
|
||
|
m_tot = 0.0;
|
||
|
}
|
||
|
|
||
|
void push(float x) {
|
||
|
m_n++;
|
||
|
m_tot += x;
|
||
|
// See Knuth TAOCP vol 2, 3rd edition, page 232
|
||
|
if (m_n == 1) {
|
||
|
m_oldM = m_newM = x;
|
||
|
m_oldS = 0.0;
|
||
|
} else {
|
||
|
m_newM = m_oldM + (x - m_oldM)/m_n;
|
||
|
m_newS = m_oldS + (x - m_oldM)*(x - m_newM);
|
||
|
// set up for next iteration
|
||
|
m_oldM = m_newM;
|
||
|
m_oldS = m_newS;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
int num() const {
|
||
|
return m_n;
|
||
|
}
|
||
|
|
||
|
double tot() const {
|
||
|
return m_tot;
|
||
|
}
|
||
|
|
||
|
double mean() const {
|
||
|
return (m_n > 0) ? m_newM : 0.0;
|
||
|
}
|
||
|
|
||
|
double variance() const {
|
||
|
return ( (m_n > 1) ? m_newS/(m_n - 1) : 0.0 );
|
||
|
}
|
||
|
|
||
|
double stddev() const {
|
||
|
return sqrt(variance());
|
||
|
}
|
||
|
|
||
|
private:
|
||
|
int m_n;
|
||
|
double m_tot;
|
||
|
double m_oldM, m_newM, m_oldS, m_newS;
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* Encapsulates a set of metrics that we would like an aligner to keep
|
||
|
* track of, so that we can possibly use it to diagnose performance
|
||
|
* issues.
|
||
|
*/
|
||
|
class AlignerMetrics {
|
||
|
|
||
|
public:
|
||
|
|
||
|
AlignerMetrics() :
|
||
|
curBacktracks_(0),
|
||
|
curBwtOps_(0),
|
||
|
first_(true),
|
||
|
curIsLowEntropy_(false),
|
||
|
curIsHomoPoly_(false),
|
||
|
curHadRanges_(false),
|
||
|
curNumNs_(0),
|
||
|
reads_(0),
|
||
|
homoReads_(0),
|
||
|
lowEntReads_(0),
|
||
|
hiEntReads_(0),
|
||
|
alignedReads_(0),
|
||
|
unalignedReads_(0),
|
||
|
threeOrMoreNReads_(0),
|
||
|
lessThanThreeNRreads_(0),
|
||
|
bwtOpsPerRead_(),
|
||
|
backtracksPerRead_(),
|
||
|
bwtOpsPerHomoRead_(),
|
||
|
backtracksPerHomoRead_(),
|
||
|
bwtOpsPerLoEntRead_(),
|
||
|
backtracksPerLoEntRead_(),
|
||
|
bwtOpsPerHiEntRead_(),
|
||
|
backtracksPerHiEntRead_(),
|
||
|
bwtOpsPerAlignedRead_(),
|
||
|
backtracksPerAlignedRead_(),
|
||
|
bwtOpsPerUnalignedRead_(),
|
||
|
backtracksPerUnalignedRead_(),
|
||
|
bwtOpsPer0nRead_(),
|
||
|
backtracksPer0nRead_(),
|
||
|
bwtOpsPer1nRead_(),
|
||
|
backtracksPer1nRead_(),
|
||
|
bwtOpsPer2nRead_(),
|
||
|
backtracksPer2nRead_(),
|
||
|
bwtOpsPer3orMoreNRead_(),
|
||
|
backtracksPer3orMoreNRead_(),
|
||
|
timer_(cout, "", false)
|
||
|
{ }
|
||
|
|
||
|
void printSummary() {
|
||
|
if(!first_) {
|
||
|
finishRead();
|
||
|
}
|
||
|
cout << "AlignerMetrics:" << endl;
|
||
|
cout << " # Reads: " << reads_ << endl;
|
||
|
float hopct = (reads_ > 0) ? (((float)homoReads_)/((float)reads_)) : (0.0f);
|
||
|
hopct *= 100.0f;
|
||
|
cout << " % homo-polymeric: " << (hopct) << endl;
|
||
|
float lopct = (reads_ > 0) ? ((float)lowEntReads_/(float)(reads_)) : (0.0f);
|
||
|
lopct *= 100.0f;
|
||
|
cout << " % low-entropy: " << (lopct) << endl;
|
||
|
float unpct = (reads_ > 0) ? ((float)unalignedReads_/(float)(reads_)) : (0.0f);
|
||
|
unpct *= 100.0f;
|
||
|
cout << " % unaligned: " << (unpct) << endl;
|
||
|
float npct = (reads_ > 0) ? ((float)threeOrMoreNReads_/(float)(reads_)) : (0.0f);
|
||
|
npct *= 100.0f;
|
||
|
cout << " % with 3 or more Ns: " << (npct) << endl;
|
||
|
cout << endl;
|
||
|
cout << " Total BWT ops: avg: " << bwtOpsPerRead_.mean() << ", stddev: " << bwtOpsPerRead_.stddev() << endl;
|
||
|
cout << " Total Backtracks: avg: " << backtracksPerRead_.mean() << ", stddev: " << backtracksPerRead_.stddev() << endl;
|
||
|
time_t elapsed = timer_.elapsed();
|
||
|
cout << " BWT ops per second: " << (bwtOpsPerRead_.tot()/elapsed) << endl;
|
||
|
cout << " Backtracks per second: " << (backtracksPerRead_.tot()/elapsed) << endl;
|
||
|
cout << endl;
|
||
|
cout << " Homo-poly:" << endl;
|
||
|
cout << " BWT ops: avg: " << bwtOpsPerHomoRead_.mean() << ", stddev: " << bwtOpsPerHomoRead_.stddev() << endl;
|
||
|
cout << " Backtracks: avg: " << backtracksPerHomoRead_.mean() << ", stddev: " << backtracksPerHomoRead_.stddev() << endl;
|
||
|
cout << " Low-entropy:" << endl;
|
||
|
cout << " BWT ops: avg: " << bwtOpsPerLoEntRead_.mean() << ", stddev: " << bwtOpsPerLoEntRead_.stddev() << endl;
|
||
|
cout << " Backtracks: avg: " << backtracksPerLoEntRead_.mean() << ", stddev: " << backtracksPerLoEntRead_.stddev() << endl;
|
||
|
cout << " High-entropy:" << endl;
|
||
|
cout << " BWT ops: avg: " << bwtOpsPerHiEntRead_.mean() << ", stddev: " << bwtOpsPerHiEntRead_.stddev() << endl;
|
||
|
cout << " Backtracks: avg: " << backtracksPerHiEntRead_.mean() << ", stddev: " << backtracksPerHiEntRead_.stddev() << endl;
|
||
|
cout << endl;
|
||
|
cout << " Unaligned:" << endl;
|
||
|
cout << " BWT ops: avg: " << bwtOpsPerUnalignedRead_.mean() << ", stddev: " << bwtOpsPerUnalignedRead_.stddev() << endl;
|
||
|
cout << " Backtracks: avg: " << backtracksPerUnalignedRead_.mean() << ", stddev: " << backtracksPerUnalignedRead_.stddev() << endl;
|
||
|
cout << " Aligned:" << endl;
|
||
|
cout << " BWT ops: avg: " << bwtOpsPerAlignedRead_.mean() << ", stddev: " << bwtOpsPerAlignedRead_.stddev() << endl;
|
||
|
cout << " Backtracks: avg: " << backtracksPerAlignedRead_.mean() << ", stddev: " << backtracksPerAlignedRead_.stddev() << endl;
|
||
|
cout << endl;
|
||
|
cout << " 0 Ns:" << endl;
|
||
|
cout << " BWT ops: avg: " << bwtOpsPer0nRead_.mean() << ", stddev: " << bwtOpsPer0nRead_.stddev() << endl;
|
||
|
cout << " Backtracks: avg: " << backtracksPer0nRead_.mean() << ", stddev: " << backtracksPer0nRead_.stddev() << endl;
|
||
|
cout << " 1 N:" << endl;
|
||
|
cout << " BWT ops: avg: " << bwtOpsPer1nRead_.mean() << ", stddev: " << bwtOpsPer1nRead_.stddev() << endl;
|
||
|
cout << " Backtracks: avg: " << backtracksPer1nRead_.mean() << ", stddev: " << backtracksPer1nRead_.stddev() << endl;
|
||
|
cout << " 2 Ns:" << endl;
|
||
|
cout << " BWT ops: avg: " << bwtOpsPer2nRead_.mean() << ", stddev: " << bwtOpsPer2nRead_.stddev() << endl;
|
||
|
cout << " Backtracks: avg: " << backtracksPer2nRead_.mean() << ", stddev: " << backtracksPer2nRead_.stddev() << endl;
|
||
|
cout << " >2 Ns:" << endl;
|
||
|
cout << " BWT ops: avg: " << bwtOpsPer3orMoreNRead_.mean() << ", stddev: " << bwtOpsPer3orMoreNRead_.stddev() << endl;
|
||
|
cout << " Backtracks: avg: " << backtracksPer3orMoreNRead_.mean() << ", stddev: " << backtracksPer3orMoreNRead_.stddev() << endl;
|
||
|
cout << endl;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
*
|
||
|
*/
|
||
|
void nextRead(const BTDnaString& read) {
|
||
|
if(!first_) {
|
||
|
finishRead();
|
||
|
}
|
||
|
first_ = false;
|
||
|
//float ent = entropyDna5(read);
|
||
|
float ent = 0.0f;
|
||
|
curIsLowEntropy_ = (ent < 0.75f);
|
||
|
curIsHomoPoly_ = (ent < 0.001f);
|
||
|
curHadRanges_ = false;
|
||
|
curBwtOps_ = 0;
|
||
|
curBacktracks_ = 0;
|
||
|
// Count Ns
|
||
|
curNumNs_ = 0;
|
||
|
const size_t len = read.length();
|
||
|
for(size_t i = 0; i < len; i++) {
|
||
|
if((int)read[i] == 4) curNumNs_++;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
*
|
||
|
*/
|
||
|
void setReadHasRange() {
|
||
|
curHadRanges_ = true;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Commit the running statistics for this read to
|
||
|
*/
|
||
|
void finishRead() {
|
||
|
reads_++;
|
||
|
if(curIsHomoPoly_) homoReads_++;
|
||
|
else if(curIsLowEntropy_) lowEntReads_++;
|
||
|
else hiEntReads_++;
|
||
|
if(curHadRanges_) alignedReads_++;
|
||
|
else unalignedReads_++;
|
||
|
bwtOpsPerRead_.push((float)curBwtOps_);
|
||
|
backtracksPerRead_.push((float)curBacktracks_);
|
||
|
// Drill down by entropy
|
||
|
if(curIsHomoPoly_) {
|
||
|
bwtOpsPerHomoRead_.push((float)curBwtOps_);
|
||
|
backtracksPerHomoRead_.push((float)curBacktracks_);
|
||
|
} else if(curIsLowEntropy_) {
|
||
|
bwtOpsPerLoEntRead_.push((float)curBwtOps_);
|
||
|
backtracksPerLoEntRead_.push((float)curBacktracks_);
|
||
|
} else {
|
||
|
bwtOpsPerHiEntRead_.push((float)curBwtOps_);
|
||
|
backtracksPerHiEntRead_.push((float)curBacktracks_);
|
||
|
}
|
||
|
// Drill down by whether it aligned
|
||
|
if(curHadRanges_) {
|
||
|
bwtOpsPerAlignedRead_.push((float)curBwtOps_);
|
||
|
backtracksPerAlignedRead_.push((float)curBacktracks_);
|
||
|
} else {
|
||
|
bwtOpsPerUnalignedRead_.push((float)curBwtOps_);
|
||
|
backtracksPerUnalignedRead_.push((float)curBacktracks_);
|
||
|
}
|
||
|
if(curNumNs_ == 0) {
|
||
|
lessThanThreeNRreads_++;
|
||
|
bwtOpsPer0nRead_.push((float)curBwtOps_);
|
||
|
backtracksPer0nRead_.push((float)curBacktracks_);
|
||
|
} else if(curNumNs_ == 1) {
|
||
|
lessThanThreeNRreads_++;
|
||
|
bwtOpsPer1nRead_.push((float)curBwtOps_);
|
||
|
backtracksPer1nRead_.push((float)curBacktracks_);
|
||
|
} else if(curNumNs_ == 2) {
|
||
|
lessThanThreeNRreads_++;
|
||
|
bwtOpsPer2nRead_.push((float)curBwtOps_);
|
||
|
backtracksPer2nRead_.push((float)curBacktracks_);
|
||
|
} else {
|
||
|
threeOrMoreNReads_++;
|
||
|
bwtOpsPer3orMoreNRead_.push((float)curBwtOps_);
|
||
|
backtracksPer3orMoreNRead_.push((float)curBacktracks_);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Running-total of the number of backtracks and BWT ops for the
|
||
|
// current read
|
||
|
uint32_t curBacktracks_;
|
||
|
uint32_t curBwtOps_;
|
||
|
|
||
|
protected:
|
||
|
|
||
|
bool first_;
|
||
|
|
||
|
// true iff the current read is low entropy
|
||
|
bool curIsLowEntropy_;
|
||
|
// true if current read is all 1 char (or very close)
|
||
|
bool curIsHomoPoly_;
|
||
|
// true iff the current read has had one or more ranges reported
|
||
|
bool curHadRanges_;
|
||
|
// number of Ns in current read
|
||
|
int curNumNs_;
|
||
|
|
||
|
// # reads
|
||
|
uint32_t reads_;
|
||
|
// # homo-poly reads
|
||
|
uint32_t homoReads_;
|
||
|
// # low-entropy reads
|
||
|
uint32_t lowEntReads_;
|
||
|
// # high-entropy reads
|
||
|
uint32_t hiEntReads_;
|
||
|
// # reads with alignments
|
||
|
uint32_t alignedReads_;
|
||
|
// # reads without alignments
|
||
|
uint32_t unalignedReads_;
|
||
|
// # reads with 3 or more Ns
|
||
|
uint32_t threeOrMoreNReads_;
|
||
|
// # reads with < 3 Ns
|
||
|
uint32_t lessThanThreeNRreads_;
|
||
|
|
||
|
// Distribution of BWT operations per read
|
||
|
RunningStat bwtOpsPerRead_;
|
||
|
RunningStat backtracksPerRead_;
|
||
|
|
||
|
// Distribution of BWT operations per homo-poly read
|
||
|
RunningStat bwtOpsPerHomoRead_;
|
||
|
RunningStat backtracksPerHomoRead_;
|
||
|
|
||
|
// Distribution of BWT operations per low-entropy read
|
||
|
RunningStat bwtOpsPerLoEntRead_;
|
||
|
RunningStat backtracksPerLoEntRead_;
|
||
|
|
||
|
// Distribution of BWT operations per high-entropy read
|
||
|
RunningStat bwtOpsPerHiEntRead_;
|
||
|
RunningStat backtracksPerHiEntRead_;
|
||
|
|
||
|
// Distribution of BWT operations per read that "aligned" (for
|
||
|
// which a range was arrived at - range may not have necessarily
|
||
|
// lead to an alignment)
|
||
|
RunningStat bwtOpsPerAlignedRead_;
|
||
|
RunningStat backtracksPerAlignedRead_;
|
||
|
|
||
|
// Distribution of BWT operations per read that didn't align
|
||
|
RunningStat bwtOpsPerUnalignedRead_;
|
||
|
RunningStat backtracksPerUnalignedRead_;
|
||
|
|
||
|
// Distribution of BWT operations/backtracks per read with no Ns
|
||
|
RunningStat bwtOpsPer0nRead_;
|
||
|
RunningStat backtracksPer0nRead_;
|
||
|
|
||
|
// Distribution of BWT operations/backtracks per read with one N
|
||
|
RunningStat bwtOpsPer1nRead_;
|
||
|
RunningStat backtracksPer1nRead_;
|
||
|
|
||
|
// Distribution of BWT operations/backtracks per read with two Ns
|
||
|
RunningStat bwtOpsPer2nRead_;
|
||
|
RunningStat backtracksPer2nRead_;
|
||
|
|
||
|
// Distribution of BWT operations/backtracks per read with three or
|
||
|
// more Ns
|
||
|
RunningStat bwtOpsPer3orMoreNRead_;
|
||
|
RunningStat backtracksPer3orMoreNRead_;
|
||
|
|
||
|
Timer timer_;
|
||
|
};
|
||
|
|
||
|
#endif /* ALIGNER_METRICS_H_ */
|