/* * Copyright 2020, Yun (Leo) Zhang * * This file is part of HISAT-3N. * * HISAT-3N is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * HISAT-3N is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with HISAT-3N. If not, see . */ #ifndef ALIGNMENT_3N_TABLE_H #define ALIGNMENT_3N_TABLE_H #include #include "utility_3n_table.h" extern bool uniqueOnly; extern bool multipleOnly; extern char convertFrom; extern char convertTo; extern char convertFromComplement; extern char convertToComplement; using namespace std; /** * the class to store information from one SAM line */ class Alignment { public: string chromosome; long long int location; long long int mateLocation; int flag; bool mapped; char strand; string sequence; string quality; bool unique; string mapQ; int NH; vector bases; CIGAR cigarString; MD_tag MD; unsigned long long readNameID; int sequenceCoveredLength; // the sum of number is cigarString; bool overlap; // if the segment could overlap with the mate segment. bool paired; void initialize() { chromosome.clear(); location = -1; mateLocation = -1; flag = -1; mapped = false; MD.initialize(); cigarString.initialize(); sequence.clear(); quality.clear(); unique = false; mapQ.clear(); NH = -1; bases.clear(); readNameID = 0; sequenceCoveredLength = 0; overlap = false; paired = false; } /** * for start position in input Line, check if it contain the target information. */ bool startWith(string* inputLine, int startPosition, string tag){ for (int i = 0; i < tag.size(); i++){ if (inputLine->at(startPosition+i) != tag[i]){ return false; } } return true; } /** * generate a hash value for readName */ void getNameHash(string& readName) { readNameID = 0; int a = 63689; for (size_t i = 0; i < readName.size(); i++) { readNameID = (readNameID * a) + (int)readName[i]; } } /** * extract the information from SAM line to Alignment. */ void parseInfo(string* line) { int startPosition = 0; int endPosition = 0; int count = 0; while ((endPosition = line->find("\t", startPosition)) != string::npos) { if (count == 0) { string readName = line->substr(startPosition, endPosition - startPosition); getNameHash(readName); } else if (count == 1) { flag = stoi(line->substr(startPosition, endPosition - startPosition)); mapped = (flag & 4) == 0; paired = (flag & 1) != 0; } else if (count == 2) { chromosome = line->substr(startPosition, endPosition - startPosition); } else if (count == 3) { location = stoll(line->substr(startPosition, endPosition - startPosition)); } else if (count == 4) { mapQ = line->substr(startPosition, endPosition - startPosition); if (mapQ == "1") { unique = false; } else { unique = true; } } else if (count == 5) { cigarString.loadString(line->substr(startPosition, endPosition - startPosition)); } else if (count == 7) { mateLocation = stoll(line->substr(startPosition, endPosition - startPosition)); } else if (count == 9) { sequence = line->substr(startPosition, endPosition - startPosition); } else if (count == 10) { quality = line->substr(startPosition, endPosition - startPosition); } else if (count > 10) { if (startWith(line, startPosition, "MD")) { MD.loadString(line->substr(startPosition + 5, endPosition - startPosition - 5)); } else if (startWith(line, startPosition, "NM")) { NH = stoi(line->substr(startPosition + 5, endPosition - startPosition - 5)); } else if (startWith(line, startPosition, "YZ")) { strand = line->at(endPosition-1); } } startPosition = endPosition + 1; count++; } if (startWith(line, startPosition, "MD")) { MD.loadString(line->substr(startPosition + 5, endPosition - startPosition - 5)); } else if (startWith(line, startPosition, "NM")) { NH = stoi(line->substr(startPosition + 5, endPosition - startPosition - 5)); } else if (startWith(line, startPosition, "YZ")) { strand = line->at(endPosition-1); } } /** * change the overlap = true, if the read is not uniquely mapped or the read segment is overlap to it's mate. */ void checkOverlap() { if (!unique) { overlap = true; } else { if (paired && (location + sequenceCoveredLength >= mateLocation)) { overlap = true; } else { overlap = false; } } } /** * parse the sam line to alignment information */ void parse(string* line) { initialize(); parseInfo(line); if ((uniqueOnly && !unique) || (multipleOnly && unique)) { return; } appendBase(); } /** * scan all base in read sequence label them if they are qualified. */ void appendBase() { if (!mapped || sequenceCoveredLength > 500000) { // if the read's intron longer than 500,000 ignore this read return; } bases.reserve(sequence.size()); for (int i = 0; i < sequence.size(); i++) { bases.emplace_back(i); } int pos = adjustPos(); string match; while (MD.getNextSegment(match)) { if (isdigit(match.front())) { // the first char of match is digit this is match int len = stoi(match); for (int i = 0; i < len; i++) { while (bases[pos].remove) { pos++; } if ((strand == '+' && sequence[pos] == convertFrom) || (strand == '-' && sequence[pos] == convertFromComplement)) { bases[pos].setQual(quality[pos], false); } else { bases[pos].remove = true; } pos ++; } } else if (isalpha(match.front())) { // this is mismatch or conversion char refBase = match.front(); // for + strand, it should have C->T change // for - strand, it should have G->A change while (bases[pos].remove) { pos++; } if ((strand == '+' && refBase == convertFrom && sequence[pos] == convertTo) || (strand == '-' && refBase == convertFromComplement && sequence[pos] == convertToComplement)){ bases[pos].setQual(quality[pos], true); } else { bases[pos].remove = true; } pos ++; } else { // deletion. do nothing. } } } /** * adjust the reference position in bases */ int adjustPos() { int readPos = 0; int returnPos = 0; int seqLength = sequence.size(); char cigarSymbol; int cigarLen; sequenceCoveredLength = 0; while (cigarString.getNextSegment(cigarLen, cigarSymbol)) { sequenceCoveredLength += cigarLen; if (cigarSymbol == 'S') { if (readPos == 0) { // soft clip is at the begin of the read returnPos = cigarLen; for (int i = cigarLen; i < seqLength; i++) { bases[i].refPos -= cigarLen; } } else { // soft clip is at the end of the read // do nothing } readPos += cigarLen; } else if (cigarSymbol == 'N') { for (int i = readPos; i < seqLength; i++) { bases[i].refPos += cigarLen; } } else if (cigarSymbol == 'M') { for (int i = readPos; i < readPos+cigarLen; i++) { bases[i].remove = false; } readPos += cigarLen; } else if (cigarSymbol == 'I') { for (int i = readPos + cigarLen; i < seqLength; i++) { bases[i].refPos -= cigarLen; } readPos += cigarLen; } else if (cigarSymbol == 'D') { for (int i = readPos; i < seqLength; i++) { bases[i].refPos += cigarLen; } } } return returnPos; } }; #endif //ALIGNMENT_3N_TABLE_H