hisat-3n/alignment_3n_table.h

288 lines
9.5 KiB
C
Raw Permalink Normal View History

2025-01-18 13:09:52 +00:00
/*
* Copyright 2020, Yun (Leo) Zhang <imzhangyun@gmail.com>
*
* This file is part of HISAT-3N.
*
* HISAT-3N is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* HISAT-3N is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with HISAT-3N. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef ALIGNMENT_3N_TABLE_H
#define ALIGNMENT_3N_TABLE_H
#include <string>
#include "utility_3n_table.h"
extern bool uniqueOnly;
extern bool multipleOnly;
extern char convertFrom;
extern char convertTo;
extern char convertFromComplement;
extern char convertToComplement;
using namespace std;
/**
* the class to store information from one SAM line
*/
class Alignment {
public:
string chromosome;
long long int location;
long long int mateLocation;
int flag;
bool mapped;
char strand;
string sequence;
string quality;
bool unique;
string mapQ;
int NH;
vector<PosQuality> bases;
CIGAR cigarString;
MD_tag MD;
unsigned long long readNameID;
int sequenceCoveredLength; // the sum of number is cigarString;
bool overlap; // if the segment could overlap with the mate segment.
bool paired;
void initialize() {
chromosome.clear();
location = -1;
mateLocation = -1;
flag = -1;
mapped = false;
MD.initialize();
cigarString.initialize();
sequence.clear();
quality.clear();
unique = false;
mapQ.clear();
NH = -1;
bases.clear();
readNameID = 0;
sequenceCoveredLength = 0;
overlap = false;
paired = false;
}
/**
* for start position in input Line, check if it contain the target information.
*/
bool startWith(string* inputLine, int startPosition, string tag){
for (int i = 0; i < tag.size(); i++){
if (inputLine->at(startPosition+i) != tag[i]){
return false;
}
}
return true;
}
/**
* generate a hash value for readName
*/
void getNameHash(string& readName) {
readNameID = 0;
int a = 63689;
for (size_t i = 0; i < readName.size(); i++) {
readNameID = (readNameID * a) + (int)readName[i];
}
}
/**
* extract the information from SAM line to Alignment.
*/
void parseInfo(string* line) {
int startPosition = 0;
int endPosition = 0;
int count = 0;
while ((endPosition = line->find("\t", startPosition)) != string::npos) {
if (count == 0) {
string readName = line->substr(startPosition, endPosition - startPosition);
getNameHash(readName);
} else if (count == 1) {
flag = stoi(line->substr(startPosition, endPosition - startPosition));
mapped = (flag & 4) == 0;
paired = (flag & 1) != 0;
} else if (count == 2) {
chromosome = line->substr(startPosition, endPosition - startPosition);
} else if (count == 3) {
location = stoll(line->substr(startPosition, endPosition - startPosition));
} else if (count == 4) {
mapQ = line->substr(startPosition, endPosition - startPosition);
if (mapQ == "1") {
unique = false;
} else {
unique = true;
}
} else if (count == 5) {
cigarString.loadString(line->substr(startPosition, endPosition - startPosition));
} else if (count == 7) {
mateLocation = stoll(line->substr(startPosition, endPosition - startPosition));
} else if (count == 9) {
sequence = line->substr(startPosition, endPosition - startPosition);
} else if (count == 10) {
quality = line->substr(startPosition, endPosition - startPosition);
} else if (count > 10) {
if (startWith(line, startPosition, "MD")) {
MD.loadString(line->substr(startPosition + 5, endPosition - startPosition - 5));
} else if (startWith(line, startPosition, "NM")) {
NH = stoi(line->substr(startPosition + 5, endPosition - startPosition - 5));
} else if (startWith(line, startPosition, "YZ")) {
strand = line->at(endPosition-1);
}
}
startPosition = endPosition + 1;
count++;
}
if (startWith(line, startPosition, "MD")) {
MD.loadString(line->substr(startPosition + 5, endPosition - startPosition - 5));
} else if (startWith(line, startPosition, "NM")) {
NH = stoi(line->substr(startPosition + 5, endPosition - startPosition - 5));
} else if (startWith(line, startPosition, "YZ")) {
strand = line->at(endPosition-1);
}
}
/**
* change the overlap = true, if the read is not uniquely mapped or the read segment is overlap to it's mate.
*/
void checkOverlap() {
if (!unique) {
overlap = true;
} else {
if (paired && (location + sequenceCoveredLength >= mateLocation)) {
overlap = true;
} else {
overlap = false;
}
}
}
/**
* parse the sam line to alignment information
*/
void parse(string* line) {
initialize();
parseInfo(line);
if ((uniqueOnly && !unique) || (multipleOnly && unique)) {
return;
}
appendBase();
}
/**
* scan all base in read sequence label them if they are qualified.
*/
void appendBase() {
if (!mapped || sequenceCoveredLength > 500000) { // if the read's intron longer than 500,000 ignore this read
return;
}
bases.reserve(sequence.size());
for (int i = 0; i < sequence.size(); i++) {
bases.emplace_back(i);
}
int pos = adjustPos();
string match;
while (MD.getNextSegment(match)) {
if (isdigit(match.front())) { // the first char of match is digit this is match
int len = stoi(match);
for (int i = 0; i < len; i++) {
while (bases[pos].remove) {
pos++;
}
if ((strand == '+' && sequence[pos] == convertFrom) ||
(strand == '-' && sequence[pos] == convertFromComplement)) {
bases[pos].setQual(quality[pos], false);
} else {
bases[pos].remove = true;
}
pos ++;
}
} else if (isalpha(match.front())) { // this is mismatch or conversion
char refBase = match.front();
// for + strand, it should have C->T change
// for - strand, it should have G->A change
while (bases[pos].remove) {
pos++;
}
if ((strand == '+' && refBase == convertFrom && sequence[pos] == convertTo) ||
(strand == '-' && refBase == convertFromComplement && sequence[pos] == convertToComplement)){
bases[pos].setQual(quality[pos], true);
} else {
bases[pos].remove = true;
}
pos ++;
} else { // deletion. do nothing.
}
}
}
/**
* adjust the reference position in bases
*/
int adjustPos() {
int readPos = 0;
int returnPos = 0;
int seqLength = sequence.size();
char cigarSymbol;
int cigarLen;
sequenceCoveredLength = 0;
while (cigarString.getNextSegment(cigarLen, cigarSymbol)) {
sequenceCoveredLength += cigarLen;
if (cigarSymbol == 'S') {
if (readPos == 0) { // soft clip is at the begin of the read
returnPos = cigarLen;
for (int i = cigarLen; i < seqLength; i++) {
bases[i].refPos -= cigarLen;
}
} else { // soft clip is at the end of the read
// do nothing
}
readPos += cigarLen;
} else if (cigarSymbol == 'N') {
for (int i = readPos; i < seqLength; i++) {
bases[i].refPos += cigarLen;
}
} else if (cigarSymbol == 'M') {
for (int i = readPos; i < readPos+cigarLen; i++) {
bases[i].remove = false;
}
readPos += cigarLen;
} else if (cigarSymbol == 'I') {
for (int i = readPos + cigarLen; i < seqLength; i++) {
bases[i].refPos -= cigarLen;
}
readPos += cigarLen;
} else if (cigarSymbol == 'D') {
for (int i = readPos; i < seqLength; i++) {
bases[i].refPos += cigarLen;
}
}
}
return returnPos;
}
};
#endif //ALIGNMENT_3N_TABLE_H