hisat-3n/alignment_3n.h
2025-01-18 21:09:52 +08:00

1215 lines
40 KiB
C++

/*
* Copyright 2020, Yun (Leo) Zhang <imzhangyun@gmail.com>
*
* This file is part of HISAT-3N.
*
* HISAT-3N is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* HISAT-3N is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with HISAT-3N. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef HISAT2_ALIGNMENT_3N_H
#define HISAT2_ALIGNMENT_3N_H
#include <string>
#include <vector>
#include <fstream>
#include <limits>
#include <algorithm>
#include "sstring.h"
#include "util.h"
#include "hisat2lib/ht2.h"
#include "read.h"
#include "outq.h"
#include "reference.h"
#include <unistd.h>
#include <queue>
#include "position_3n.h"
#include "utility_3n.h"
#include "simple_func.h"
extern char usrInput_convertedFrom;
extern char usrInput_convertedTo;
extern char usrInput_convertedFromComplement;
extern char usrInput_convertedToComplement;
extern SimpleFunc scoreMin; // minimum valid score as function of read len
extern int penMmcMax; // max mm penalty
extern char hs3N_convertedFrom;
extern char hs3N_convertedTo;
extern char hs3N_convertedFromComplement;
extern char hs3N_convertedToComplement;
extern vector<ht2_handle_t> repeatHandles;
extern struct ht2_index_getrefnames_result *refNameMap;
extern int repeatLimit;
extern bool uniqueOutputOnly;
extern int directional3NMapping;
using namespace std;
struct ReportingMetrics;
/**
* the data structure to store all information of one alignment result.
*/
class Alignment {
public:
// basic information
BTString readName;
int flag;
BTString chromosomeName;
int chromosomeIndex; // the chromosome index to use getStretch() function
long long int location;
BTString MAPQ;
BTString cigarString;
vector<Cigar> cigarSegments;
int cigarLength; // this is the length that read cover genome.
BTString pairToChromosome;
long long int pairToLocation;
long long int pairingDistance;
BTString readSequence;
BTString readQuality;
//tags
int AS; // alignment score
int NH; // number of alignment
int XM; // number of mismatch
int NM; // edit distance
int YS; // mate's AS
BTString MD;
BTString YT; //"UU" for single-end. "CP" for concordant alignment, "DP" for disconcordant alignment, "UP" for else.
// special tags in HISAT-3N
int Yf; // number of conversion.
int Zf; // number of unconverted base.
char YZ; // this tag shows alignment strand: check makeYZ function for the classification rule
// + for REF strand (conversionCount[0] is equal or smaller than conversionCount[1]),
// - for REF-RC strand (conversionCount[1] is smaller)
// unChanged tags
BTString unChangedTags;
BTString passThroughLine; // this is controlled by print_xr_ in SamConfig
// for pairScore calculation
static const int maxPairDistance = 500000;
static const int penaltyFreeDistance_DNA = 1000;
static const int penaltyFreeDistance_RNA = 100000;
static const int distancePenaltyFraction_DNA = 100;
static const int distancePenaltyFraction_RNA = 1000;
static const int ASPenalty = 100;
static const int concordantScoreBounce = 500000;
// intermediate variable
bool outputted = false; // whether the alignment is outputted.
bool DNA = false;
int cycle_3N; // indicate which cycle_3N make this alignment result. 0 or 3 for repeatHandles[0], else repeatHandles[1]
bool paired;
bool forward;
bool mapped;
bool concordant;
int64_t MinimumScore;
int pairSegment; // 0 for first segment, 1 for second segment.
struct ht2_repeat_expand_result *repeatResult = nullptr;
int pairScore; // to identify the better pair
bool mateMapped; // to adjust the YT tag
bool repeat;
bool pairToRepeat;
RepeatMappingPositions repeatPositions; // to store the expanded repeat information
int conversionCount[2] = {0}; // there are two type of conversion could happen, save the number of conversion separately.
int unConversionCount[2] = {0}; // save the unconverted base count.
string intToBase = "ACGTN";
void initialize() {
readName.clear();
flag = -1;
chromosomeName.clear();
chromosomeIndex = -1;
location = 0;
MAPQ.clear();
cigarString.clear();
cigarSegments.clear();
cigarLength = 0;
pairToChromosome.clear();
pairToLocation = 0;
pairingDistance = 0;
readSequence.clear();
readQuality.clear();
AS = numeric_limits<int>::min();
NH = 0;
XM = 0;
NM = 0;
YS = 0;
MD.clear();
YT.clear();
Yf = 0;
Zf = 0;
unChangedTags.clear();
outputted = false;
DNA = false;
cycle_3N = -1;
paired = false;
forward = false;
mapped = false;
concordant = false;
pairSegment = 0;
if (repeatResult != nullptr) {
free(repeatResult);
repeatResult = nullptr;
}
pairScore = numeric_limits<int>::min();
mateMapped = false;
repeat = false;
pairToRepeat = false;
repeatPositions.initialize();
conversionCount[0] = 0;
conversionCount[1] = 0;
unConversionCount[0] = 0;
unConversionCount[1] = 0;
passThroughLine.clear();
}
Alignment() {
initialize();
}
~Alignment() {
if (repeatResult != nullptr) free(repeatResult);
}
/**
* change YS tag for output.
*/
void setYS (Alignment* input) {
YS = input->AS;
}
void setYS(RepeatMappingPosition* input) {
YS = input->AS;
}
/**
* change concordant status and flag
*/
void setConcordant(bool concordant_) {
concordant = concordant_;
if (concordant) {
flag |= 2;
} else {
flag &= ~((int)2);
}
}
/**
* change mateMapped status and flag
*/
void setMateMappingFlag(long long int *mateLocation) {
if (mateLocation == NULL) { return; }
mateMapped = *mateLocation != 0;
if ((flag&8) && mateMapped) flag -= 8;
else if (!(flag&8) && !mateMapped) flag += 8;
}
/**
* change YT tag base on mateMapped and concordant information.
*/
void setYT() {
if (paired) {
if (!mateMapped) {
YT = "UP";
return;
}
if (concordant) { YT = "CP"; }
else { YT = "DP"; }
} else {
YT = "UU";
}
}
/**
* update NH and MAPQ by number of alignment.
*/
void updateNH(int nAlignment) {
if (!mapped) return;
NH = nAlignment;
if (nAlignment == 0) return;
else if (nAlignment == 1) MAPQ = "60";
else MAPQ = "1";
}
/**
* extract information from flag, change flag to secondary alignment.
*/
void extractFlagInfo() {
paired = (flag & 1) != 0;
forward = (flag & 16) == 0;
if ((flag & 256) == 0) { // change all read to secondary alignment
flag += 256;
}
mapped = (flag & 4) == 0;
if (flag & 128) {
pairSegment = 1;
} else {
pairSegment = 0; // it could be the first pair segment or it is single read.
}
concordant = (flag & 2) != 0;
if (!mapped) {
repeat = false;
}
MinimumScore = scoreMin.f<TAlScore>(readSequence.length());
}
/**
* calculate the pairScore for a pair of alignment result. Output pair Score and number of pair.
* Do not update their pairScore.
*/
int calculatePairScore(Alignment *inputAlignment, int &nPair);
/**
* make YZ tag.
* if (no conversion) or (conversion type 0 == conversion type 1) or (directional mapping):
* if the (pairSegment is 0) && (forward) => YZ = REF (+)
* if the (pairSegment is 0) && (reverse) => YZ = REF-RC (-)
* if the (pairSegment is 1) && (forward) => YZ = REF-RC (-)
* if the (pairSegment is 1) && (reverse) => YZ = REF (+)
* if the conversion type 0 is less, the read is mapped to REF (+).
* if the conversion type 1 is less, the read is mapped to REF-RC (-).
*/
void makeYZ(char &YZ_string) {
if (directional3NMapping == 2){
if (pairSegment == 0 && forward) {
YZ_string = '-';
} else if (pairSegment == 0 && !forward) {
YZ_string = '+';
} else if (pairSegment == 1 && forward) {
YZ_string = '+';
} else if (pairSegment == 1 && !forward) {
YZ_string = '-';
}
} else if (directional3NMapping == 1 || (conversionCount[0] == 0 && conversionCount[1] == 0) || conversionCount[0] == conversionCount[1]){
if (pairSegment == 0 && forward) {
YZ_string = '+';
} else if (pairSegment == 0 && !forward) {
YZ_string = '-';
} else if (pairSegment == 1 && forward) {
YZ_string = '-';
} else if (pairSegment == 1 && !forward) {
YZ_string = '+';
}
} else if (conversionCount[0] >= conversionCount[1]) {
YZ_string = '+';
} else {
YZ_string = '-';
}
}
/**
* make Yf tag, Yf tag is to count the number of conversion in the string.
* use YZ tag to decide which type of conversion is legal conversion.
*/
int makeYf(char YZTag) {
int outYf = -1;
if (YZTag == '+'){
outYf = conversionCount[0];
} else if (YZTag == '-'){
outYf = conversionCount[1];
}
assert(outYf >= 0);
return outYf;
}
/**
* make Zf tag, Zf tag is to count the number of un-converted base in the string.
* use YZ tag to decide which type of conversion is legal conversion.
*/
int makeZf(char YZTag) {
int outZf = -1;
if (YZTag == '+'){
outZf = unConversionCount[0];
} else if (YZTag == '-'){
outZf = unConversionCount[1];
}
assert (outZf >= 0);
return outZf;
}
/**
* expand the repeat mapping location and construct MD for each location.
* return ture if there is any mapping location pass the filter, else, false.
*/
bool constructRepeatMD(BitPairReference* bitReference, MappingPositions &alignmentPositions) {
if (!mapped) {
return true;
}
// expand the repeat locations
ht2_error_t err = ht2_repeat_expand((cycle_3N == 0 || cycle_3N == 3) ? repeatHandles[0] : repeatHandles[1],
chromosomeName.toZBuf(),
location - 1,
readSequence.length(),
&repeatResult);
BTString chromosomeRepeat;
long long int locationRepeat;
for (int i = 0; i < repeatResult->count; i++) {
struct ht2_position *pos = &repeatResult->positions[i];
chromosomeRepeat = refNameMap->names[pos->chr_id];
for (int j = 0; j < chromosomeRepeat.length(); j++) {
if (chromosomeRepeat[j] == ' ') {
chromosomeRepeat.trimEnd(chromosomeRepeat.length() - j);
break;
}
}
locationRepeat = (pos->pos) + 1;
bool genomeForward = pos->direction == 0;
if (!genomeForward) { continue; } // if the repeat mapping direction is different to the designed direction, ignore it.
// if the mapping location is already exist, continue.
if (alignmentPositions.positionExist(chromosomeRepeat, locationRepeat, pairSegment)){
continue;
}
// get reference sequence
ASSERT_ONLY(SStringExpandable<uint32_t> destU32);
SStringExpandable<char> raw_refbuf;
raw_refbuf.resize(cigarLength + 16);
raw_refbuf.clear();
int off = bitReference->getStretch(
reinterpret_cast<uint32_t*>(raw_refbuf.wbuf()),
(size_t)pos->chr_id,
(size_t)max<int>(locationRepeat-1, 0),
(size_t)cigarLength ASSERT_ONLY(, destU32));
char* refSeq = raw_refbuf.wbuf() + off;
BTString refSequence;
refSequence.resize(cigarLength);
for (int j = 0; j < cigarLength; j++) {
refSequence.set(intToBase[*(refSeq + j)], j);
}
// check whether the refSequence is exist. if do, directly append the repeat.
int repeatPositionsIndex;
if (repeatPositions.sequenceExist(refSequence, repeatPositionsIndex)) {
repeatPositions.append(chromosomeRepeat, locationRepeat, repeatPositionsIndex);
continue;
}
BTString newMD;
int newMismatch = 0;
char repeatYZ;
int repeatYf;
int repeatZf;
if (!constructRepeatMD(refSequence, newMD, newMismatch, repeatYf, repeatZf, repeatYZ)) {
continue;
}
int newXM = XM + newMismatch;
int newNM = NM + newMismatch;
int newAS = AS - penMmcMax * newMismatch;
if (newAS < MinimumScore)
{
continue;
}
repeatPositions.append(locationRepeat, chromosomeRepeat, refSequence,newAS, newMD, newXM, newNM, repeatYf, repeatZf, repeatYZ);
// if there are too many mappingPosition exist return.
if (repeatPositions.size() >= repeatLimit || alignmentPositions.size() > repeatLimit) {
return true;
}
}
if (repeatPositions.size() == 0) {
return false;
} else {
return true;
}
}
/**
* for each repeat mapping position, construct its MD
* return true if the mapping result does not have a lot of mismatch, else return false.
*/
bool constructRepeatMD(BTString &refSeq, BTString &newMD_String, int &newMismatch, int& repeatYf, int& repeatZf, char &repeatYZ) {
char buf[1024];
conversionCount[0] = 0;
conversionCount[1] = 0;
unConversionCount[0] = 0;
unConversionCount[1] = 0;
int readPos = 0;
long long int refPos = 0;
int count = 0;
int newXM = 0;
char cigarSymbol;
int cigarLen;
for (int i = 0; i < cigarSegments.size(); i++) {
cigarSymbol = cigarSegments[i].getLabel();
cigarLen = cigarSegments[i].getLen();
if (cigarSymbol == 'S') {
readPos += cigarLen;
} else if (cigarSymbol == 'N') {
refPos += cigarLen;
} else if (cigarSymbol == 'M') {
for (int j = 0; j < cigarLen; j++) {
char readChar = readSequence[readPos];
char refChar = refSeq[refPos];
if (readChar == refChar) {
if (refChar == usrInput_convertedFrom)
{
unConversionCount[0]++;
}
else if (refChar == usrInput_convertedFromComplement)
{
unConversionCount[1]++;
}
count++;
} else {// mismatch
// output matched count
if (count != 0) {
itoa10<int>(count, buf);
newMD_String.append(buf);
count = 0;
}
// output mismatch
if (!newMD_String.empty() && isalpha(newMD_String[newMD_String.length()-1])) {
newMD_String.append('0');
}
if ((readChar == usrInput_convertedTo) && (refChar == usrInput_convertedFrom)) {
conversionCount[0]++;
} else if ((readChar == usrInput_convertedToComplement) && (refChar == usrInput_convertedFromComplement)) {
conversionCount[1]++;
} else {
// real mismatch
newXM++;
}
newMD_String.append(refChar);
}
readPos++;
refPos++;
}
} else if (cigarSymbol == 'I') {
readPos += cigarLen;
} else if (cigarSymbol == 'D') {
newMD_String.append('^');
for (int j = 0; j < cigarLen; j++) {
newMD_String.append(refSeq[refPos]);
refPos++;
}
}
}
if (count != 0) {
itoa10<int>(count, buf);
newMD_String.append(buf);
}
if (isalpha(newMD_String[0])) { newMD_String.insert('0', 0); }
if (isalpha(newMD_String[newMD_String.length()-1])) { newMD_String.append('0'); }
makeYZ(repeatYZ);
int badConversion = 0;
// identify the bad conversion number based on repeatYZ;
if (repeatYZ == '+') {
badConversion = conversionCount[1];
} else {
badConversion = conversionCount[0];
}
repeatYf = makeYf(repeatYZ);
repeatZf = makeZf(repeatZf);
newXM += badConversion;
newMismatch = newXM - XM;
if (newMismatch < 0){
newMismatch = 0;
}
return true;
}
/**
* for each non-repeat mapping position, construct its MD
* return true if the mapping result does not have a lot of mismatch, else return false.
*/
bool constructMD(BitPairReference* bitReference) {
if (!mapped) {
return true;
}
char buf[1024];
MD.clear();
ASSERT_ONLY(SStringExpandable<uint32_t> destU32);
SStringExpandable<char> raw_refbuf;
raw_refbuf.resize(cigarLength + 16);
raw_refbuf.clear();
int off = bitReference->getStretch(
reinterpret_cast<uint32_t*>(raw_refbuf.wbuf()),
(size_t)chromosomeIndex,
(size_t)max<int>(location-1, 0),
(size_t)cigarLength ASSERT_ONLY(, destU32));
char* refSeq = raw_refbuf.wbuf() + off;
int readPos = 0;
long long int refPos = 0;
int count = 0;
int newXM = 0;
char cigarSymbol;
int cigarLen;
for (int i = 0; i < cigarSegments.size(); i++) {
cigarSymbol = cigarSegments[i].getLabel();
cigarLen = cigarSegments[i].getLen();
if (cigarSymbol == 'S') {
readPos += cigarLen;
} else if (cigarSymbol == 'N') {
refPos += cigarLen;
} else if (cigarSymbol == 'M') {
for (int j = 0; j < cigarLen; j++) {
char readChar = readSequence[readPos];
char refChar = intToBase[*(refSeq + refPos)];
if (readChar == refChar) {
if (refChar == usrInput_convertedFrom)
{
unConversionCount[0]++;
}
else if (refChar == usrInput_convertedFromComplement)
{
unConversionCount[1]++;
}
count++;
} else {// mismatch
// output matched count
if (count != 0) {
itoa10<int>(count, buf);
MD.append(buf);
count = 0;
}
// output mismatch
if (!MD.empty() && isalpha(MD[MD.length()-1])) {
MD.append('0');
}
if ((readChar == usrInput_convertedTo) && (refChar == usrInput_convertedFrom)) {
conversionCount[0]++;
} else if ((readChar == usrInput_convertedToComplement) && (refChar == usrInput_convertedFromComplement)) {
conversionCount[1]++;
} else {
// real mismatch
newXM++;
}
MD.append(refChar);
}
readPos++;
refPos++;
}
} else if (cigarSymbol == 'I') {
readPos += cigarLen;
} else if (cigarSymbol == 'D') {
if (count != 0) {
itoa10<int>(count, buf);
MD.append(buf);
count = 0;
}
MD.append('^');
for (int j = 0; j < cigarLen; j++) {
MD.append(intToBase[*(refSeq + refPos)]);
refPos++;
}
}
}
if (count != 0) {
itoa10<int>(count, buf);
MD.append(buf);
}
if (isalpha(MD[0])) { MD.insert('0', 0); }
if (isalpha(MD[MD.length()-1])) { MD.append('0'); }
makeYZ(YZ);
int badConversion = 0;
// identify the bad conversion number based on YZ tag;
if (YZ == '+') {
badConversion = conversionCount[1];
} else {
badConversion = conversionCount[0];
}
Yf = makeYf(YZ);
Zf = makeZf(YZ);
newXM += badConversion;
newXM -= XM;
if (newXM < 0){
newXM = 0;
}
NM += newXM;
XM += newXM;
AS = AS - penMmcMax * newXM;
if (AS < MinimumScore)
{
return false;
}
BTString tmp;
if (pairToRepeat) {
repeatPositions.append(location, chromosomeName, tmp, AS, MD, XM, NM, Yf, Zf, YZ);
}
return true;
}
/**
* output the tags for non-repeat alignment.
*/
void outputTags(BTString& o) {
char buf[1024];
if (mapped) {
o.append('\t');
// AS
assert(AS <= 0);
o.append("AS:i:");
itoa10<int>(AS, buf);
o.append(buf);
o.append('\t');
// NH
assert(NH > 0);
o.append("NH:i:");
itoa10<int>(NH, buf);
o.append(buf);
o.append('\t');
// XM
assert(XM >= 0);
o.append("XM:i:");
itoa10<int>(XM, buf);
o.append(buf);
o.append('\t');
// NM
assert(NM >= 0);
o.append("NM:i:");
itoa10<int>(NM, buf);
o.append(buf);
o.append('\t');
// MD
assert(!MD.empty());
o.append("MD:Z:");
o.append(MD.toZBuf());
o.append('\t');
// YS
if (paired && mateMapped) {
o.append("YS:i:");
itoa10<int>(YS, buf);
o.append(buf);
o.append('\t');
}
// YZ
o.append("YZ:A:");
o.append(YZ);
o.append('\t');
// Yf
o.append("Yf:i:");
itoa10<int>(Yf, buf);
o.append(buf);
o.append('\t');
//Zf
o.append("Zf:i:");
itoa10<int>(Zf, buf);
o.append(buf);
}
// unchanged Tags
if (!unChangedTags.empty()) {
o.append('\t');
o.append(unChangedTags.toZBuf());
}
o.append(passThroughLine.toZBuf());
}
/**
* output the tags for repeat alignment.
*/
void outputTags(BTString& o, RepeatMappingPosition* repeatInfo){
// this function is for repeat alignment output.
char buf[1024];
o.append('\t');
// AS
assert(AS <= 0);
o.append("AS:i:");
itoa10<int>(repeatInfo->AS, buf);
o.append(buf);
o.append('\t');
// NH
assert(NH > 0);
o.append("NH:i:");
itoa10<int>(NH, buf);
o.append(buf);
o.append('\t');
// XM
assert(XM >= 0);
o.append("XM:i:");
itoa10<int>(repeatInfo->XM, buf);
o.append(buf);
o.append('\t');
// NM
assert(NM >= 0);
o.append("NM:i:");
itoa10<int>(repeatInfo->NM, buf);
o.append(buf);
o.append('\t');
// MD
assert(!MD.empty());
o.append("MD:Z:");
o.append(repeatInfo->MD.toZBuf());
o.append('\t');
// YS
if (paired) {
o.append("YS:i:");
itoa10<int>(YS, buf);
o.append(buf);
o.append('\t');
}
//YT
o.append("YT:Z:");
o.append(YT.toZBuf());
o.append('\t');
// YS
if (paired && mateMapped) {
o.append("YS:i:");
itoa10<int>(YS, buf);
o.append(buf);
o.append('\t');
}
// YZ
o.append("YZ:A:");
o.append(repeatInfo->YZ);
o.append('\t');
// Yf
o.append("Yf:i:");
itoa10<int>(repeatInfo->Yf, buf);
o.append(buf);
o.append('\t');
// Zf
o.append("Zf:i:");
itoa10<int>(repeatInfo->Zf, buf);
o.append(buf);
// unchanged Tags
if (!unChangedTags.empty()) {
o.append('\t');
o.append(unChangedTags.toZBuf());
}
o.append(passThroughLine.toZBuf());
}
/**
* output alignment. this function is for both repeat and non-repeat alignment.
*/
void outputAlignment (BTString& o, RepeatMappingPosition* repeatInfo, long long int* oppoLocation, bool& primaryAlignment) {
BTString* outputChromosome;
long long int* outputLocation;
if (repeatInfo == NULL) {
if (outputted) { return; }
outputted = true;
outputChromosome = &chromosomeName;
outputLocation = &location;
} else {
if (repeatInfo->outputted) { return; }
repeatInfo->outputted = true;
outputChromosome = &repeatInfo->repeatChromosome;
outputLocation = &repeatInfo->repeatLocation;
}
//setMateMappingFlag(oppoLocation);
setYT();
char buf[1024];
// readName
o.append(readName.toZBuf());
o.append('\t');
// flag, if it is primary alignment, -256
assert(flag >=0);
itoa10<int>(flag-primaryAlignment*256, buf);
o.append(buf);
o.append('\t');
// chromosome
assert(!outputChromosome->empty());
o.append(outputChromosome->toZBuf());
o.append('\t');
// location
assert(*outputLocation >= 0);
itoa10<int>(*outputLocation, buf);
o.append(buf);
o.append('\t');
//MAPQ
o.append(MAPQ.toZBuf());
o.append('\t');
// cigar
o.append(cigarString.toZBuf());
o.append('\t');
// pair to chromosome
if (paired && *oppoLocation!=0) {
o.append("=");
o.append('\t');
} else {
o.append("*");
o.append('\t');
}
// pair to location
if (paired) {
itoa10<int>(*oppoLocation, buf);
o.append(buf);
o.append('\t');
} else {
o.append('0');
o.append('\t');
}
// pairing distance
if (paired) {
itoa10<int>(*oppoLocation - *outputLocation, buf);
o.append(buf);
o.append('\t');
} else {
o.append('0');
o.append('\t');
}
// read sequence
o.append(readSequence.toZBuf());
o.append('\t');
// read quality
o.append(readQuality.toZBuf());
// make sure there is no '\t' at the beginning of unChangedTags
while (!unChangedTags.empty() && unChangedTags[0] == '\t') {
unChangedTags.remove(0);
}
// tags
if (repeatInfo == NULL) {
outputTags(o);
o.append('\n');
} else {
outputTags(o, repeatInfo->flagInfoIndex == -1?repeatInfo:&repeatPositions.positions[repeatInfo->flagInfoIndex]);
o.append('\n');
}
}
/**
* return true if two location is concordant.
* return false, if there are not concordant or too far (>maxPairDistance).
*/
static bool isConcordant(long long int location1, bool &forward1, long long int readLength1, long long int location2, bool &forward2, long long int readLength2);
/**
* this is the basic function to calculate DNA pair score.
* if the distance between 2 alignments is more than penaltyFreeDistance_DNA, we reduce the score by the distance/100.
* if two alignment is concordant we add concordantScoreBounce to make sure to select the concordant pair as best pair.
*/
static int calculatePairScore_DNA (long long int &location0, int& AS0, bool& forward0, long long int readLength0, long long int &location1, int &AS1, bool &forward1, long long int readLength1, bool& concordant);
/**
* this is the basic function to calculate RNA pair score.
* if the distance between 2 alignments is more than penaltyFreeDistance_RNA, we reduce the score by the distance/1000.
* if two alignment is concordant we add concordantScoreBounce to make sure to select the concordant pair as best pair.
*/
static int calculatePairScore_RNA (long long int &location0, int& XM0, bool& forward0, long long int readLength0, long long int &location1, int &XM1, bool &forward1, long long int readLength1, bool& concordant);
};
/**
* the data structure to store, process, and output all Alignment
*/
class Alignments {
public:
vector<Alignment*> alignments; // pool to store current alignment result.
vector<Alignment*> freeAlignments; // free pointer pool for new alignment result. after output a alignment, return the pointer back to this pool.
TReadId previousReadID;
MappingPositions alignmentPositions; // the pool to save all alignment position
BTString readName[2]; // the read name could be different for segment 1 and segment 2.
BTDnaString readSequence[2]; // save the read sequence for output.
BTString qualityScore[2]; // save the quality score for output.
bool paired;
const int repeatPoolLimit = 20; // this is the maximum number of repeat alignment we allowed.
bool multipleAligned; // check whether we have multiple alignment, it is work unique mode.
const int maxPairScore = 500000; // maximum pair score, if pairScore == maxPairScore, both math are perfect match and the pairDistance is small.
BitPairReference* bitReference; // bit pair reference sequence
bool DNA;
int nRepeatAlignment; // count number of repeat alignment we received, for short sequence we could receive a lot of repeat alignment result.
BTString passThroughLines[2];
void initialize() {
alignmentPositions.initialize();
paired = false;
multipleAligned = false;
nRepeatAlignment = 0;
for (int i = 0; i < 2; i++) {
readName[i].clear();
readSequence[i].clear();
qualityScore[i].clear();
passThroughLines[i].clear();
}
for (int i = 0; i < alignments.size(); i++) {
alignments[i]->initialize();
freeAlignments.push_back(alignments[i]);
}
alignments.clear();
}
Alignments(BitPairReference* ref, bool inputDNA): bitReference(ref), DNA(inputDNA) {
initialize();
}
~Alignments() {
while (!freeAlignments.empty()) {
delete freeAlignments.back();
freeAlignments.pop_back();
}
for (int i = 0; i < alignments.size(); i++) {
delete alignments[i];
}
}
/**
* get sequence for rd. if it already exist, ignore it.
*/
void getSequence(const Read& rd) {
int pairSegment = rd.mate == 0? rd.mate : rd.mate-1;
if (readName[pairSegment].empty()) { readName[pairSegment] = rd.name; }
if (readSequence[pairSegment].empty()) { readSequence[pairSegment] = rd.originalFw; }
if (qualityScore[pairSegment].empty()) { qualityScore[pairSegment] = rd.qual; }
}
/**
* return true if we want to receive more new alignment.
*/
bool acceptNewAlignment() {
if (uniqueOutputOnly && multipleAligned ||
alignmentPositions.nBestSingle >= repeatLimit ||
nRepeatAlignment > repeatPoolLimit ||
alignmentPositions.nBestPair >= repeatLimit) {
return false;
}
return true;
}
/**
* return the alignment back to freeAlignment pool.
*/
void returnToFreeAlignments (Alignment*& currentAlignment) {
currentAlignment->initialize();
freeAlignments.push_back(currentAlignment);
}
/**
* get a Alignment pointer from freeAlignments, if freeAlignment is empty, make a new Alignment.
*/
void getFreeAlignmentPointer(Alignment*& newAlignment) {
if (!freeAlignments.empty()) {
newAlignment = freeAlignments.back();
freeAlignments.pop_back();
} else {
newAlignment = new Alignment();
}
}
/**
* receive alignment information from AlnSink3NSam::appendMate() and append it to alignment pool.
*/
void append(Alignment *newAlignment) {
newAlignment->extractFlagInfo();
paired = newAlignment->paired;
newAlignment->DNA = DNA;
if (passThroughLines[newAlignment->pairSegment].empty()) {
passThroughLines[newAlignment->pairSegment] = newAlignment->passThroughLine;
}
// check if the alignment is already exist. if exist, ignore it.
if (!alignmentPositions.append(newAlignment)) {
alignments.push_back(newAlignment);
return;
}
// construct MD tag and check if the alignment has too many mismatch, if do, ignore it.
if (newAlignment->repeat) {
if (!newAlignment->constructRepeatMD(bitReference, alignmentPositions)) {
alignmentPositions.badAligned();
alignments.push_back(newAlignment);
return;
}
nRepeatAlignment++; // for each repeat alignment, record it.
} else {
// check mismatch, update tags
if (!newAlignment->constructMD(bitReference)) {
alignmentPositions.badAligned();
alignments.push_back(newAlignment);
return;
}
}
// update pair score or AS, for output using.
// if the new alignment has lower paring score or AS than bestPairScore or bestAS, ignore it.
if (paired) {
if (!alignmentPositions.updatePairScore()) {
alignments.push_back(newAlignment);
return;
}
if (alignmentPositions.bestPairScore == maxPairScore && alignmentPositions.nBestPair > 1) {
multipleAligned = true;
}
} else {
if (!alignmentPositions.updateAS()) {
alignments.push_back(newAlignment);
return;
}
if (alignmentPositions.bestAS == 0 && alignmentPositions.nBestSingle > 1) {
multipleAligned = true;
}
}
alignments.push_back(newAlignment);
}
/**
* if there is no alignment, output unAlignment result.
* this function is important when hisat2 give mapped result, but it does not pass my filter (has too many mismatch).
*/
void outputUnAlignmentRead(BTString& o) {
if (paired) {
for (int i = 0; i < 2; i++) {
assert(!readName[i].empty());
uint ReadNameLength = readName[i].length();
if (readName[i].length() > 255)
{
ReadNameLength = 255;
}
for (int j = 0; j < ReadNameLength; j++)
{
if(isspace(readName[i][j])) {
break;
}
o.append(readName[i][j]);
}
o.append("\t");
string flag = (i == 0) ? "77" : "141";
o.append(flag.c_str());
o.append("\t*\t0\t0\t*\t*\t0\t0\t");
o.append(readSequence[i].toZBuf());
o.append("\t");
o.append(qualityScore[i].toZBuf());
o.append("\tYT:Z:UP");
o.append(passThroughLines[i].toZBuf());
o.append('\n');
}
} else {
assert(!readName[0].empty());
string ReadName = "";
uint ReadNameLength = readName[0].length();
if (readName[0].length() > 255)
{
ReadNameLength = 255;
}
for (int j = 0; j < ReadNameLength; j++)
{
if(isspace(readName[0][j])) {
break;
}
o.append(readName[0][j]);
}
o.append("\t4\t*\t0\t0\t*\t*\t0\t0\t");
o.append(readSequence[0].toZBuf());
o.append("\t");
o.append(qualityScore[0].toZBuf());
o.append("\tYT:Z:UU");
o.append(passThroughLines[0].toZBuf());
o.append('\n');
}
}
/**
* report alignment statistics for single-end alignment
*/
void reportStats_single(ReportingMetrics& met);
/**
* report alignment statistics for paired-end alignment
*/
void reportStats_paired(ReportingMetrics& met);
/**
* output single-end alignment reuslts
*/
void output_single(BTString& o,
ReportingMetrics& met) {
reportStats_single(met);
// output
if (uniqueOutputOnly && (alignmentPositions.nBestSingle != 1 || multipleAligned)) {
// do not output anything
} else if (alignments.empty() || alignmentPositions.nBestSingle == 0) {
// make a unalignment result and output it.
outputUnAlignmentRead(o);
} else {
// output
alignmentPositions.outputSingle(o);
}
}
/**
* output paired-end alignment reuslts
*/
void output_paired(BTString& o,
ReportingMetrics& met) {
reportStats_paired(met);
if ((uniqueOutputOnly && (alignmentPositions.nBestPair != 1 || multipleAligned))) {
// do not report anything
} else if (alignments.empty() ||
alignmentPositions.nBestPair == 0 ||
alignmentPositions.bestPairScore == numeric_limits<int>::min()) {
// make a unalignment result and output it.
outputUnAlignmentRead(o);
} else {
// output
alignmentPositions.outputPair(o);
}
}
/**
* output function will be redirected to output_single or output_paired
*/
void output(ReportingMetrics& met,
BTString& o) {
if (paired) {
output_paired(o, met);
} else {
output_single(o,met);
}
initialize();
}
};
#endif //HISAT2_ALIGNMENT_3N_H