/* * Copyright 2020, Yun (Leo) Zhang * * This file is part of HISAT-3N. * * HISAT-3N is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * HISAT-3N is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with HISAT-3N. If not, see . */ #ifndef HISAT2_ALIGNMENT_3N_H #define HISAT2_ALIGNMENT_3N_H #include #include #include #include #include #include "sstring.h" #include "util.h" #include "hisat2lib/ht2.h" #include "read.h" #include "outq.h" #include "reference.h" #include #include #include "position_3n.h" #include "utility_3n.h" #include "simple_func.h" extern char usrInput_convertedFrom; extern char usrInput_convertedTo; extern char usrInput_convertedFromComplement; extern char usrInput_convertedToComplement; extern SimpleFunc scoreMin; // minimum valid score as function of read len extern int penMmcMax; // max mm penalty extern char hs3N_convertedFrom; extern char hs3N_convertedTo; extern char hs3N_convertedFromComplement; extern char hs3N_convertedToComplement; extern vector repeatHandles; extern struct ht2_index_getrefnames_result *refNameMap; extern int repeatLimit; extern bool uniqueOutputOnly; extern int directional3NMapping; using namespace std; struct ReportingMetrics; /** * the data structure to store all information of one alignment result. */ class Alignment { public: // basic information BTString readName; int flag; BTString chromosomeName; int chromosomeIndex; // the chromosome index to use getStretch() function long long int location; BTString MAPQ; BTString cigarString; vector cigarSegments; int cigarLength; // this is the length that read cover genome. BTString pairToChromosome; long long int pairToLocation; long long int pairingDistance; BTString readSequence; BTString readQuality; //tags int AS; // alignment score int NH; // number of alignment int XM; // number of mismatch int NM; // edit distance int YS; // mate's AS BTString MD; BTString YT; //"UU" for single-end. "CP" for concordant alignment, "DP" for disconcordant alignment, "UP" for else. // special tags in HISAT-3N int Yf; // number of conversion. int Zf; // number of unconverted base. char YZ; // this tag shows alignment strand: check makeYZ function for the classification rule // + for REF strand (conversionCount[0] is equal or smaller than conversionCount[1]), // - for REF-RC strand (conversionCount[1] is smaller) // unChanged tags BTString unChangedTags; BTString passThroughLine; // this is controlled by print_xr_ in SamConfig // for pairScore calculation static const int maxPairDistance = 500000; static const int penaltyFreeDistance_DNA = 1000; static const int penaltyFreeDistance_RNA = 100000; static const int distancePenaltyFraction_DNA = 100; static const int distancePenaltyFraction_RNA = 1000; static const int ASPenalty = 100; static const int concordantScoreBounce = 500000; // intermediate variable bool outputted = false; // whether the alignment is outputted. bool DNA = false; int cycle_3N; // indicate which cycle_3N make this alignment result. 0 or 3 for repeatHandles[0], else repeatHandles[1] bool paired; bool forward; bool mapped; bool concordant; int64_t MinimumScore; int pairSegment; // 0 for first segment, 1 for second segment. struct ht2_repeat_expand_result *repeatResult = nullptr; int pairScore; // to identify the better pair bool mateMapped; // to adjust the YT tag bool repeat; bool pairToRepeat; RepeatMappingPositions repeatPositions; // to store the expanded repeat information int conversionCount[2] = {0}; // there are two type of conversion could happen, save the number of conversion separately. int unConversionCount[2] = {0}; // save the unconverted base count. string intToBase = "ACGTN"; void initialize() { readName.clear(); flag = -1; chromosomeName.clear(); chromosomeIndex = -1; location = 0; MAPQ.clear(); cigarString.clear(); cigarSegments.clear(); cigarLength = 0; pairToChromosome.clear(); pairToLocation = 0; pairingDistance = 0; readSequence.clear(); readQuality.clear(); AS = numeric_limits::min(); NH = 0; XM = 0; NM = 0; YS = 0; MD.clear(); YT.clear(); Yf = 0; Zf = 0; unChangedTags.clear(); outputted = false; DNA = false; cycle_3N = -1; paired = false; forward = false; mapped = false; concordant = false; pairSegment = 0; if (repeatResult != nullptr) { free(repeatResult); repeatResult = nullptr; } pairScore = numeric_limits::min(); mateMapped = false; repeat = false; pairToRepeat = false; repeatPositions.initialize(); conversionCount[0] = 0; conversionCount[1] = 0; unConversionCount[0] = 0; unConversionCount[1] = 0; passThroughLine.clear(); } Alignment() { initialize(); } ~Alignment() { if (repeatResult != nullptr) free(repeatResult); } /** * change YS tag for output. */ void setYS (Alignment* input) { YS = input->AS; } void setYS(RepeatMappingPosition* input) { YS = input->AS; } /** * change concordant status and flag */ void setConcordant(bool concordant_) { concordant = concordant_; if (concordant) { flag |= 2; } else { flag &= ~((int)2); } } /** * change mateMapped status and flag */ void setMateMappingFlag(long long int *mateLocation) { if (mateLocation == NULL) { return; } mateMapped = *mateLocation != 0; if ((flag&8) && mateMapped) flag -= 8; else if (!(flag&8) && !mateMapped) flag += 8; } /** * change YT tag base on mateMapped and concordant information. */ void setYT() { if (paired) { if (!mateMapped) { YT = "UP"; return; } if (concordant) { YT = "CP"; } else { YT = "DP"; } } else { YT = "UU"; } } /** * update NH and MAPQ by number of alignment. */ void updateNH(int nAlignment) { if (!mapped) return; NH = nAlignment; if (nAlignment == 0) return; else if (nAlignment == 1) MAPQ = "60"; else MAPQ = "1"; } /** * extract information from flag, change flag to secondary alignment. */ void extractFlagInfo() { paired = (flag & 1) != 0; forward = (flag & 16) == 0; if ((flag & 256) == 0) { // change all read to secondary alignment flag += 256; } mapped = (flag & 4) == 0; if (flag & 128) { pairSegment = 1; } else { pairSegment = 0; // it could be the first pair segment or it is single read. } concordant = (flag & 2) != 0; if (!mapped) { repeat = false; } MinimumScore = scoreMin.f(readSequence.length()); } /** * calculate the pairScore for a pair of alignment result. Output pair Score and number of pair. * Do not update their pairScore. */ int calculatePairScore(Alignment *inputAlignment, int &nPair); /** * make YZ tag. * if (no conversion) or (conversion type 0 == conversion type 1) or (directional mapping): * if the (pairSegment is 0) && (forward) => YZ = REF (+) * if the (pairSegment is 0) && (reverse) => YZ = REF-RC (-) * if the (pairSegment is 1) && (forward) => YZ = REF-RC (-) * if the (pairSegment is 1) && (reverse) => YZ = REF (+) * if the conversion type 0 is less, the read is mapped to REF (+). * if the conversion type 1 is less, the read is mapped to REF-RC (-). */ void makeYZ(char &YZ_string) { if (directional3NMapping == 2){ if (pairSegment == 0 && forward) { YZ_string = '-'; } else if (pairSegment == 0 && !forward) { YZ_string = '+'; } else if (pairSegment == 1 && forward) { YZ_string = '+'; } else if (pairSegment == 1 && !forward) { YZ_string = '-'; } } else if (directional3NMapping == 1 || (conversionCount[0] == 0 && conversionCount[1] == 0) || conversionCount[0] == conversionCount[1]){ if (pairSegment == 0 && forward) { YZ_string = '+'; } else if (pairSegment == 0 && !forward) { YZ_string = '-'; } else if (pairSegment == 1 && forward) { YZ_string = '-'; } else if (pairSegment == 1 && !forward) { YZ_string = '+'; } } else if (conversionCount[0] >= conversionCount[1]) { YZ_string = '+'; } else { YZ_string = '-'; } } /** * make Yf tag, Yf tag is to count the number of conversion in the string. * use YZ tag to decide which type of conversion is legal conversion. */ int makeYf(char YZTag) { int outYf = -1; if (YZTag == '+'){ outYf = conversionCount[0]; } else if (YZTag == '-'){ outYf = conversionCount[1]; } assert(outYf >= 0); return outYf; } /** * make Zf tag, Zf tag is to count the number of un-converted base in the string. * use YZ tag to decide which type of conversion is legal conversion. */ int makeZf(char YZTag) { int outZf = -1; if (YZTag == '+'){ outZf = unConversionCount[0]; } else if (YZTag == '-'){ outZf = unConversionCount[1]; } assert (outZf >= 0); return outZf; } /** * expand the repeat mapping location and construct MD for each location. * return ture if there is any mapping location pass the filter, else, false. */ bool constructRepeatMD(BitPairReference* bitReference, MappingPositions &alignmentPositions) { if (!mapped) { return true; } // expand the repeat locations ht2_error_t err = ht2_repeat_expand((cycle_3N == 0 || cycle_3N == 3) ? repeatHandles[0] : repeatHandles[1], chromosomeName.toZBuf(), location - 1, readSequence.length(), &repeatResult); BTString chromosomeRepeat; long long int locationRepeat; for (int i = 0; i < repeatResult->count; i++) { struct ht2_position *pos = &repeatResult->positions[i]; chromosomeRepeat = refNameMap->names[pos->chr_id]; for (int j = 0; j < chromosomeRepeat.length(); j++) { if (chromosomeRepeat[j] == ' ') { chromosomeRepeat.trimEnd(chromosomeRepeat.length() - j); break; } } locationRepeat = (pos->pos) + 1; bool genomeForward = pos->direction == 0; if (!genomeForward) { continue; } // if the repeat mapping direction is different to the designed direction, ignore it. // if the mapping location is already exist, continue. if (alignmentPositions.positionExist(chromosomeRepeat, locationRepeat, pairSegment)){ continue; } // get reference sequence ASSERT_ONLY(SStringExpandable destU32); SStringExpandable raw_refbuf; raw_refbuf.resize(cigarLength + 16); raw_refbuf.clear(); int off = bitReference->getStretch( reinterpret_cast(raw_refbuf.wbuf()), (size_t)pos->chr_id, (size_t)max(locationRepeat-1, 0), (size_t)cigarLength ASSERT_ONLY(, destU32)); char* refSeq = raw_refbuf.wbuf() + off; BTString refSequence; refSequence.resize(cigarLength); for (int j = 0; j < cigarLength; j++) { refSequence.set(intToBase[*(refSeq + j)], j); } // check whether the refSequence is exist. if do, directly append the repeat. int repeatPositionsIndex; if (repeatPositions.sequenceExist(refSequence, repeatPositionsIndex)) { repeatPositions.append(chromosomeRepeat, locationRepeat, repeatPositionsIndex); continue; } BTString newMD; int newMismatch = 0; char repeatYZ; int repeatYf; int repeatZf; if (!constructRepeatMD(refSequence, newMD, newMismatch, repeatYf, repeatZf, repeatYZ)) { continue; } int newXM = XM + newMismatch; int newNM = NM + newMismatch; int newAS = AS - penMmcMax * newMismatch; if (newAS < MinimumScore) { continue; } repeatPositions.append(locationRepeat, chromosomeRepeat, refSequence,newAS, newMD, newXM, newNM, repeatYf, repeatZf, repeatYZ); // if there are too many mappingPosition exist return. if (repeatPositions.size() >= repeatLimit || alignmentPositions.size() > repeatLimit) { return true; } } if (repeatPositions.size() == 0) { return false; } else { return true; } } /** * for each repeat mapping position, construct its MD * return true if the mapping result does not have a lot of mismatch, else return false. */ bool constructRepeatMD(BTString &refSeq, BTString &newMD_String, int &newMismatch, int& repeatYf, int& repeatZf, char &repeatYZ) { char buf[1024]; conversionCount[0] = 0; conversionCount[1] = 0; unConversionCount[0] = 0; unConversionCount[1] = 0; int readPos = 0; long long int refPos = 0; int count = 0; int newXM = 0; char cigarSymbol; int cigarLen; for (int i = 0; i < cigarSegments.size(); i++) { cigarSymbol = cigarSegments[i].getLabel(); cigarLen = cigarSegments[i].getLen(); if (cigarSymbol == 'S') { readPos += cigarLen; } else if (cigarSymbol == 'N') { refPos += cigarLen; } else if (cigarSymbol == 'M') { for (int j = 0; j < cigarLen; j++) { char readChar = readSequence[readPos]; char refChar = refSeq[refPos]; if (readChar == refChar) { if (refChar == usrInput_convertedFrom) { unConversionCount[0]++; } else if (refChar == usrInput_convertedFromComplement) { unConversionCount[1]++; } count++; } else {// mismatch // output matched count if (count != 0) { itoa10(count, buf); newMD_String.append(buf); count = 0; } // output mismatch if (!newMD_String.empty() && isalpha(newMD_String[newMD_String.length()-1])) { newMD_String.append('0'); } if ((readChar == usrInput_convertedTo) && (refChar == usrInput_convertedFrom)) { conversionCount[0]++; } else if ((readChar == usrInput_convertedToComplement) && (refChar == usrInput_convertedFromComplement)) { conversionCount[1]++; } else { // real mismatch newXM++; } newMD_String.append(refChar); } readPos++; refPos++; } } else if (cigarSymbol == 'I') { readPos += cigarLen; } else if (cigarSymbol == 'D') { newMD_String.append('^'); for (int j = 0; j < cigarLen; j++) { newMD_String.append(refSeq[refPos]); refPos++; } } } if (count != 0) { itoa10(count, buf); newMD_String.append(buf); } if (isalpha(newMD_String[0])) { newMD_String.insert('0', 0); } if (isalpha(newMD_String[newMD_String.length()-1])) { newMD_String.append('0'); } makeYZ(repeatYZ); int badConversion = 0; // identify the bad conversion number based on repeatYZ; if (repeatYZ == '+') { badConversion = conversionCount[1]; } else { badConversion = conversionCount[0]; } repeatYf = makeYf(repeatYZ); repeatZf = makeZf(repeatZf); newXM += badConversion; newMismatch = newXM - XM; if (newMismatch < 0){ newMismatch = 0; } return true; } /** * for each non-repeat mapping position, construct its MD * return true if the mapping result does not have a lot of mismatch, else return false. */ bool constructMD(BitPairReference* bitReference) { if (!mapped) { return true; } char buf[1024]; MD.clear(); ASSERT_ONLY(SStringExpandable destU32); SStringExpandable raw_refbuf; raw_refbuf.resize(cigarLength + 16); raw_refbuf.clear(); int off = bitReference->getStretch( reinterpret_cast(raw_refbuf.wbuf()), (size_t)chromosomeIndex, (size_t)max(location-1, 0), (size_t)cigarLength ASSERT_ONLY(, destU32)); char* refSeq = raw_refbuf.wbuf() + off; int readPos = 0; long long int refPos = 0; int count = 0; int newXM = 0; char cigarSymbol; int cigarLen; for (int i = 0; i < cigarSegments.size(); i++) { cigarSymbol = cigarSegments[i].getLabel(); cigarLen = cigarSegments[i].getLen(); if (cigarSymbol == 'S') { readPos += cigarLen; } else if (cigarSymbol == 'N') { refPos += cigarLen; } else if (cigarSymbol == 'M') { for (int j = 0; j < cigarLen; j++) { char readChar = readSequence[readPos]; char refChar = intToBase[*(refSeq + refPos)]; if (readChar == refChar) { if (refChar == usrInput_convertedFrom) { unConversionCount[0]++; } else if (refChar == usrInput_convertedFromComplement) { unConversionCount[1]++; } count++; } else {// mismatch // output matched count if (count != 0) { itoa10(count, buf); MD.append(buf); count = 0; } // output mismatch if (!MD.empty() && isalpha(MD[MD.length()-1])) { MD.append('0'); } if ((readChar == usrInput_convertedTo) && (refChar == usrInput_convertedFrom)) { conversionCount[0]++; } else if ((readChar == usrInput_convertedToComplement) && (refChar == usrInput_convertedFromComplement)) { conversionCount[1]++; } else { // real mismatch newXM++; } MD.append(refChar); } readPos++; refPos++; } } else if (cigarSymbol == 'I') { readPos += cigarLen; } else if (cigarSymbol == 'D') { if (count != 0) { itoa10(count, buf); MD.append(buf); count = 0; } MD.append('^'); for (int j = 0; j < cigarLen; j++) { MD.append(intToBase[*(refSeq + refPos)]); refPos++; } } } if (count != 0) { itoa10(count, buf); MD.append(buf); } if (isalpha(MD[0])) { MD.insert('0', 0); } if (isalpha(MD[MD.length()-1])) { MD.append('0'); } makeYZ(YZ); int badConversion = 0; // identify the bad conversion number based on YZ tag; if (YZ == '+') { badConversion = conversionCount[1]; } else { badConversion = conversionCount[0]; } Yf = makeYf(YZ); Zf = makeZf(YZ); newXM += badConversion; newXM -= XM; if (newXM < 0){ newXM = 0; } NM += newXM; XM += newXM; AS = AS - penMmcMax * newXM; if (AS < MinimumScore) { return false; } BTString tmp; if (pairToRepeat) { repeatPositions.append(location, chromosomeName, tmp, AS, MD, XM, NM, Yf, Zf, YZ); } return true; } /** * output the tags for non-repeat alignment. */ void outputTags(BTString& o) { char buf[1024]; if (mapped) { o.append('\t'); // AS assert(AS <= 0); o.append("AS:i:"); itoa10(AS, buf); o.append(buf); o.append('\t'); // NH assert(NH > 0); o.append("NH:i:"); itoa10(NH, buf); o.append(buf); o.append('\t'); // XM assert(XM >= 0); o.append("XM:i:"); itoa10(XM, buf); o.append(buf); o.append('\t'); // NM assert(NM >= 0); o.append("NM:i:"); itoa10(NM, buf); o.append(buf); o.append('\t'); // MD assert(!MD.empty()); o.append("MD:Z:"); o.append(MD.toZBuf()); o.append('\t'); // YS if (paired && mateMapped) { o.append("YS:i:"); itoa10(YS, buf); o.append(buf); o.append('\t'); } // YZ o.append("YZ:A:"); o.append(YZ); o.append('\t'); // Yf o.append("Yf:i:"); itoa10(Yf, buf); o.append(buf); o.append('\t'); //Zf o.append("Zf:i:"); itoa10(Zf, buf); o.append(buf); } // unchanged Tags if (!unChangedTags.empty()) { o.append('\t'); o.append(unChangedTags.toZBuf()); } o.append(passThroughLine.toZBuf()); } /** * output the tags for repeat alignment. */ void outputTags(BTString& o, RepeatMappingPosition* repeatInfo){ // this function is for repeat alignment output. char buf[1024]; o.append('\t'); // AS assert(AS <= 0); o.append("AS:i:"); itoa10(repeatInfo->AS, buf); o.append(buf); o.append('\t'); // NH assert(NH > 0); o.append("NH:i:"); itoa10(NH, buf); o.append(buf); o.append('\t'); // XM assert(XM >= 0); o.append("XM:i:"); itoa10(repeatInfo->XM, buf); o.append(buf); o.append('\t'); // NM assert(NM >= 0); o.append("NM:i:"); itoa10(repeatInfo->NM, buf); o.append(buf); o.append('\t'); // MD assert(!MD.empty()); o.append("MD:Z:"); o.append(repeatInfo->MD.toZBuf()); o.append('\t'); // YS if (paired) { o.append("YS:i:"); itoa10(YS, buf); o.append(buf); o.append('\t'); } //YT o.append("YT:Z:"); o.append(YT.toZBuf()); o.append('\t'); // YS if (paired && mateMapped) { o.append("YS:i:"); itoa10(YS, buf); o.append(buf); o.append('\t'); } // YZ o.append("YZ:A:"); o.append(repeatInfo->YZ); o.append('\t'); // Yf o.append("Yf:i:"); itoa10(repeatInfo->Yf, buf); o.append(buf); o.append('\t'); // Zf o.append("Zf:i:"); itoa10(repeatInfo->Zf, buf); o.append(buf); // unchanged Tags if (!unChangedTags.empty()) { o.append('\t'); o.append(unChangedTags.toZBuf()); } o.append(passThroughLine.toZBuf()); } /** * output alignment. this function is for both repeat and non-repeat alignment. */ void outputAlignment (BTString& o, RepeatMappingPosition* repeatInfo, long long int* oppoLocation, bool& primaryAlignment) { BTString* outputChromosome; long long int* outputLocation; if (repeatInfo == NULL) { if (outputted) { return; } outputted = true; outputChromosome = &chromosomeName; outputLocation = &location; } else { if (repeatInfo->outputted) { return; } repeatInfo->outputted = true; outputChromosome = &repeatInfo->repeatChromosome; outputLocation = &repeatInfo->repeatLocation; } //setMateMappingFlag(oppoLocation); setYT(); char buf[1024]; // readName o.append(readName.toZBuf()); o.append('\t'); // flag, if it is primary alignment, -256 assert(flag >=0); itoa10(flag-primaryAlignment*256, buf); o.append(buf); o.append('\t'); // chromosome assert(!outputChromosome->empty()); o.append(outputChromosome->toZBuf()); o.append('\t'); // location assert(*outputLocation >= 0); itoa10(*outputLocation, buf); o.append(buf); o.append('\t'); //MAPQ o.append(MAPQ.toZBuf()); o.append('\t'); // cigar o.append(cigarString.toZBuf()); o.append('\t'); // pair to chromosome if (paired && *oppoLocation!=0) { o.append("="); o.append('\t'); } else { o.append("*"); o.append('\t'); } // pair to location if (paired) { itoa10(*oppoLocation, buf); o.append(buf); o.append('\t'); } else { o.append('0'); o.append('\t'); } // pairing distance if (paired) { itoa10(*oppoLocation - *outputLocation, buf); o.append(buf); o.append('\t'); } else { o.append('0'); o.append('\t'); } // read sequence o.append(readSequence.toZBuf()); o.append('\t'); // read quality o.append(readQuality.toZBuf()); // make sure there is no '\t' at the beginning of unChangedTags while (!unChangedTags.empty() && unChangedTags[0] == '\t') { unChangedTags.remove(0); } // tags if (repeatInfo == NULL) { outputTags(o); o.append('\n'); } else { outputTags(o, repeatInfo->flagInfoIndex == -1?repeatInfo:&repeatPositions.positions[repeatInfo->flagInfoIndex]); o.append('\n'); } } /** * return true if two location is concordant. * return false, if there are not concordant or too far (>maxPairDistance). */ static bool isConcordant(long long int location1, bool &forward1, long long int readLength1, long long int location2, bool &forward2, long long int readLength2); /** * this is the basic function to calculate DNA pair score. * if the distance between 2 alignments is more than penaltyFreeDistance_DNA, we reduce the score by the distance/100. * if two alignment is concordant we add concordantScoreBounce to make sure to select the concordant pair as best pair. */ static int calculatePairScore_DNA (long long int &location0, int& AS0, bool& forward0, long long int readLength0, long long int &location1, int &AS1, bool &forward1, long long int readLength1, bool& concordant); /** * this is the basic function to calculate RNA pair score. * if the distance between 2 alignments is more than penaltyFreeDistance_RNA, we reduce the score by the distance/1000. * if two alignment is concordant we add concordantScoreBounce to make sure to select the concordant pair as best pair. */ static int calculatePairScore_RNA (long long int &location0, int& XM0, bool& forward0, long long int readLength0, long long int &location1, int &XM1, bool &forward1, long long int readLength1, bool& concordant); }; /** * the data structure to store, process, and output all Alignment */ class Alignments { public: vector alignments; // pool to store current alignment result. vector freeAlignments; // free pointer pool for new alignment result. after output a alignment, return the pointer back to this pool. TReadId previousReadID; MappingPositions alignmentPositions; // the pool to save all alignment position BTString readName[2]; // the read name could be different for segment 1 and segment 2. BTDnaString readSequence[2]; // save the read sequence for output. BTString qualityScore[2]; // save the quality score for output. bool paired; const int repeatPoolLimit = 20; // this is the maximum number of repeat alignment we allowed. bool multipleAligned; // check whether we have multiple alignment, it is work unique mode. const int maxPairScore = 500000; // maximum pair score, if pairScore == maxPairScore, both math are perfect match and the pairDistance is small. BitPairReference* bitReference; // bit pair reference sequence bool DNA; int nRepeatAlignment; // count number of repeat alignment we received, for short sequence we could receive a lot of repeat alignment result. BTString passThroughLines[2]; void initialize() { alignmentPositions.initialize(); paired = false; multipleAligned = false; nRepeatAlignment = 0; for (int i = 0; i < 2; i++) { readName[i].clear(); readSequence[i].clear(); qualityScore[i].clear(); passThroughLines[i].clear(); } for (int i = 0; i < alignments.size(); i++) { alignments[i]->initialize(); freeAlignments.push_back(alignments[i]); } alignments.clear(); } Alignments(BitPairReference* ref, bool inputDNA): bitReference(ref), DNA(inputDNA) { initialize(); } ~Alignments() { while (!freeAlignments.empty()) { delete freeAlignments.back(); freeAlignments.pop_back(); } for (int i = 0; i < alignments.size(); i++) { delete alignments[i]; } } /** * get sequence for rd. if it already exist, ignore it. */ void getSequence(const Read& rd) { int pairSegment = rd.mate == 0? rd.mate : rd.mate-1; if (readName[pairSegment].empty()) { readName[pairSegment] = rd.name; } if (readSequence[pairSegment].empty()) { readSequence[pairSegment] = rd.originalFw; } if (qualityScore[pairSegment].empty()) { qualityScore[pairSegment] = rd.qual; } } /** * return true if we want to receive more new alignment. */ bool acceptNewAlignment() { if (uniqueOutputOnly && multipleAligned || alignmentPositions.nBestSingle >= repeatLimit || nRepeatAlignment > repeatPoolLimit || alignmentPositions.nBestPair >= repeatLimit) { return false; } return true; } /** * return the alignment back to freeAlignment pool. */ void returnToFreeAlignments (Alignment*& currentAlignment) { currentAlignment->initialize(); freeAlignments.push_back(currentAlignment); } /** * get a Alignment pointer from freeAlignments, if freeAlignment is empty, make a new Alignment. */ void getFreeAlignmentPointer(Alignment*& newAlignment) { if (!freeAlignments.empty()) { newAlignment = freeAlignments.back(); freeAlignments.pop_back(); } else { newAlignment = new Alignment(); } } /** * receive alignment information from AlnSink3NSam::appendMate() and append it to alignment pool. */ void append(Alignment *newAlignment) { newAlignment->extractFlagInfo(); paired = newAlignment->paired; newAlignment->DNA = DNA; if (passThroughLines[newAlignment->pairSegment].empty()) { passThroughLines[newAlignment->pairSegment] = newAlignment->passThroughLine; } // check if the alignment is already exist. if exist, ignore it. if (!alignmentPositions.append(newAlignment)) { alignments.push_back(newAlignment); return; } // construct MD tag and check if the alignment has too many mismatch, if do, ignore it. if (newAlignment->repeat) { if (!newAlignment->constructRepeatMD(bitReference, alignmentPositions)) { alignmentPositions.badAligned(); alignments.push_back(newAlignment); return; } nRepeatAlignment++; // for each repeat alignment, record it. } else { // check mismatch, update tags if (!newAlignment->constructMD(bitReference)) { alignmentPositions.badAligned(); alignments.push_back(newAlignment); return; } } // update pair score or AS, for output using. // if the new alignment has lower paring score or AS than bestPairScore or bestAS, ignore it. if (paired) { if (!alignmentPositions.updatePairScore()) { alignments.push_back(newAlignment); return; } if (alignmentPositions.bestPairScore == maxPairScore && alignmentPositions.nBestPair > 1) { multipleAligned = true; } } else { if (!alignmentPositions.updateAS()) { alignments.push_back(newAlignment); return; } if (alignmentPositions.bestAS == 0 && alignmentPositions.nBestSingle > 1) { multipleAligned = true; } } alignments.push_back(newAlignment); } /** * if there is no alignment, output unAlignment result. * this function is important when hisat2 give mapped result, but it does not pass my filter (has too many mismatch). */ void outputUnAlignmentRead(BTString& o) { if (paired) { for (int i = 0; i < 2; i++) { assert(!readName[i].empty()); uint ReadNameLength = readName[i].length(); if (readName[i].length() > 255) { ReadNameLength = 255; } for (int j = 0; j < ReadNameLength; j++) { if(isspace(readName[i][j])) { break; } o.append(readName[i][j]); } o.append("\t"); string flag = (i == 0) ? "77" : "141"; o.append(flag.c_str()); o.append("\t*\t0\t0\t*\t*\t0\t0\t"); o.append(readSequence[i].toZBuf()); o.append("\t"); o.append(qualityScore[i].toZBuf()); o.append("\tYT:Z:UP"); o.append(passThroughLines[i].toZBuf()); o.append('\n'); } } else { assert(!readName[0].empty()); string ReadName = ""; uint ReadNameLength = readName[0].length(); if (readName[0].length() > 255) { ReadNameLength = 255; } for (int j = 0; j < ReadNameLength; j++) { if(isspace(readName[0][j])) { break; } o.append(readName[0][j]); } o.append("\t4\t*\t0\t0\t*\t*\t0\t0\t"); o.append(readSequence[0].toZBuf()); o.append("\t"); o.append(qualityScore[0].toZBuf()); o.append("\tYT:Z:UU"); o.append(passThroughLines[0].toZBuf()); o.append('\n'); } } /** * report alignment statistics for single-end alignment */ void reportStats_single(ReportingMetrics& met); /** * report alignment statistics for paired-end alignment */ void reportStats_paired(ReportingMetrics& met); /** * output single-end alignment reuslts */ void output_single(BTString& o, ReportingMetrics& met) { reportStats_single(met); // output if (uniqueOutputOnly && (alignmentPositions.nBestSingle != 1 || multipleAligned)) { // do not output anything } else if (alignments.empty() || alignmentPositions.nBestSingle == 0) { // make a unalignment result and output it. outputUnAlignmentRead(o); } else { // output alignmentPositions.outputSingle(o); } } /** * output paired-end alignment reuslts */ void output_paired(BTString& o, ReportingMetrics& met) { reportStats_paired(met); if ((uniqueOutputOnly && (alignmentPositions.nBestPair != 1 || multipleAligned))) { // do not report anything } else if (alignments.empty() || alignmentPositions.nBestPair == 0 || alignmentPositions.bestPairScore == numeric_limits::min()) { // make a unalignment result and output it. outputUnAlignmentRead(o); } else { // output alignmentPositions.outputPair(o); } } /** * output function will be redirected to output_single or output_paired */ void output(ReportingMetrics& met, BTString& o) { if (paired) { output_paired(o, met); } else { output_single(o,met); } initialize(); } }; #endif //HISAT2_ALIGNMENT_3N_H