hisat-3n/edit.cpp
2025-01-18 21:09:52 +08:00

502 lines
12 KiB
C++

/*
* Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
*
* This file is part of Bowtie 2.
*
* Bowtie 2 is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Bowtie 2 is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
*/
#include <iostream>
#include "edit.h"
using namespace std;
/**
* Print a single edit to a std::ostream. Format is
* (pos):(ref chr)>(read chr). Where 'pos' is an offset from the 5'
* end of the read, and the ref and read chrs are expressed w/r/t the
* Watson strand.
*/
ostream& operator<< (ostream& os, const Edit& e) {
if(e.type != EDIT_TYPE_SPL) {
os << e.pos << ":" << (char)e.chr << ">" << (char)e.qchr;
} else {
os << e.pos << ":" << e.splLen;
}
return os;
}
/**
* Print a list of edits to a std::ostream, separated by commas.
*/
void Edit::print(ostream& os, const EList<Edit>& edits, char delim) {
for(size_t i = 0; i < edits.size(); i++) {
os << edits[i];
if(i < edits.size()-1) os << delim;
}
}
void Edit::complement(EList<Edit> &edits)
{
for(size_t i = 0; i < edits.size(); i++) {
Edit& e = edits[i];
if(e.type != EDIT_TYPE_SPL) {
assert_in(e.chr, "ACMGRSVTWYHKDBN-");
assert_in(e.qchr, "ACGTN-");
e.chr = asc2dnacomp[e.chr];
e.qchr = asc2dnacomp[e.qchr];
}
}
}
/**
* Flip all the edits.pos fields so that they're with respect to
* the other end of the read (of length 'sz').
*/
void Edit::invertPoss(
EList<Edit>& edits,
size_t sz,
size_t ei,
size_t en,
bool sort)
{
// Invert elements
size_t ii = 0;
for(size_t i = ei; i < ei + en/2; i++) {
Edit tmp = edits[i];
edits[i] = edits[ei + en - ii - 1];
edits[ei + en - ii - 1] = tmp;
ii++;
}
for(size_t i = ei; i < ei + en; i++) {
assert(edits[i].pos < sz ||
(edits[i].isReadGap() && edits[i].pos == sz));
// Adjust pos
if(edits[i].isReadGap() || edits[i].isSpliced()) {
edits[i].pos = (uint32_t)(sz - edits[i].pos);
} else {
edits[i].pos = (uint32_t)(sz - edits[i].pos - 1);
}
// Adjust pos2
if(edits[i].isReadGap()) {
int64_t pos2diff = (int64_t)(uint64_t)edits[i].pos2 - (int64_t)((uint64_t)std::numeric_limits<uint32_t>::max() >> 1);
int64_t pos2new = (int64_t)(uint64_t)edits[i].pos2 - 2*pos2diff;
assert(pos2diff == 0 || (uint32_t)pos2new != (std::numeric_limits<uint32_t>::max() >> 1));
edits[i].pos2 = (uint32_t)pos2new;
}
}
if(sort) {
// Edits might not necessarily be in same order after inversion
edits.sortPortion(ei, en);
#ifndef NDEBUG
for(size_t i = ei + 1; i < ei + en; i++) {
assert_geq(edits[i].pos, edits[i-1].pos);
}
#endif
}
}
/**
* For now, we pretend that the alignment is in the forward orientation
* and that the Edits are listed from left- to right-hand side.
*/
void Edit::printQAlign(
std::ostream& os,
const BTDnaString& read,
const EList<Edit>& edits)
{
printQAlign(os, "", read, edits);
}
/**
* For now, we pretend that the alignment is in the forward orientation
* and that the Edits are listed from left- to right-hand side.
*/
void Edit::printQAlignNoCheck(
std::ostream& os,
const BTDnaString& read,
const EList<Edit>& edits)
{
printQAlignNoCheck(os, "", read, edits);
}
/**
* For now, we pretend that the alignment is in the forward orientation
* and that the Edits are listed from left- to right-hand side.
*/
void Edit::printQAlign(
std::ostream& os,
const char *prefix,
const BTDnaString& read,
const EList<Edit>& edits)
{
size_t eidx = 0;
os << prefix;
// Print read
for(size_t i = 0; i < read.length(); i++) {
bool del = false, mm = false;
while(eidx < edits.size() && edits[eidx].pos == i) {
if(edits[eidx].isReadGap()) {
os << '-';
} else if(edits[eidx].isRefGap()) {
del = true;
assert_eq((int)edits[eidx].qchr, read.toChar(i));
os << read.toChar(i);
} else {
mm = true;
assert(edits[eidx].isMismatch());
assert_eq((int)edits[eidx].qchr, read.toChar(i));
os << (char)edits[eidx].qchr;
}
eidx++;
}
if(!del && !mm) os << read.toChar(i);
}
os << endl;
os << prefix;
eidx = 0;
// Print match bars
for(size_t i = 0; i < read.length(); i++) {
bool del = false, mm = false;
while(eidx < edits.size() && edits[eidx].pos == i) {
if(edits[eidx].isReadGap()) {
os << ' ';
} else if(edits[eidx].isRefGap()) {
del = true;
os << ' ';
} else {
mm = true;
assert(edits[eidx].isMismatch());
os << ' ';
}
eidx++;
}
if(!del && !mm) os << '|';
}
os << endl;
os << prefix;
eidx = 0;
// Print reference
for(size_t i = 0; i < read.length(); i++) {
bool del = false, mm = false;
while(eidx < edits.size() && edits[eidx].pos == i) {
if(edits[eidx].isReadGap()) {
os << (char)edits[eidx].chr;
} else if(edits[eidx].isRefGap()) {
del = true;
os << '-';
} else {
mm = true;
assert(edits[eidx].isMismatch());
os << (char)edits[eidx].chr;
}
eidx++;
}
if(!del && !mm) os << read.toChar(i);
}
os << endl;
}
/**
* For now, we pretend that the alignment is in the forward orientation
* and that the Edits are listed from left- to right-hand side.
*/
void Edit::printQAlignNoCheck(
std::ostream& os,
const char *prefix,
const BTDnaString& read,
const EList<Edit>& edits)
{
size_t eidx = 0;
os << prefix;
// Print read
for(size_t i = 0; i < read.length(); i++) {
bool del = false, mm = false;
while(eidx < edits.size() && edits[eidx].pos == i) {
if(edits[eidx].isReadGap()) {
os << '-';
} else if(edits[eidx].isRefGap()) {
del = true;
os << read.toChar(i);
} else {
mm = true;
os << (char)edits[eidx].qchr;
}
eidx++;
}
if(!del && !mm) os << read.toChar(i);
}
os << endl;
os << prefix;
eidx = 0;
// Print match bars
for(size_t i = 0; i < read.length(); i++) {
bool del = false, mm = false;
while(eidx < edits.size() && edits[eidx].pos == i) {
if(edits[eidx].isReadGap()) {
os << ' ';
} else if(edits[eidx].isRefGap()) {
del = true;
os << ' ';
} else {
mm = true;
os << ' ';
}
eidx++;
}
if(!del && !mm) os << '|';
}
os << endl;
os << prefix;
eidx = 0;
// Print reference
for(size_t i = 0; i < read.length(); i++) {
bool del = false, mm = false;
while(eidx < edits.size() && edits[eidx].pos == i) {
if(edits[eidx].isReadGap()) {
os << (char)edits[eidx].chr;
} else if(edits[eidx].isRefGap()) {
del = true;
os << '-';
} else {
mm = true;
os << (char)edits[eidx].chr;
}
eidx++;
}
if(!del && !mm) os << read.toChar(i);
}
os << endl;
}
/**
* Sort the edits in the provided list.
*/
void Edit::sort(EList<Edit>& edits) {
edits.sort(); // simple!
}
/**
* Given a read string and some edits, generate and append the corresponding
* reference string to 'ref'. If read aligned to the Watson strand, the caller
* should pass the original read sequence and original edits. If a read
* aligned to the Crick strand, the caller should pass the reverse complement
* of the read and a version of the edits list that has had Edit:invertPoss
* called on it to cause edits to be listed in 3'-to-5' order.
*/
void Edit::toRef(
const BTDnaString& read,
const EList<Edit>& edits,
BTDnaString& ref,
bool fw,
size_t trim5,
size_t trim3)
{
// edits should be sorted
size_t eidx = 0;
// Print reference
const size_t rdlen = read.length();
size_t trimBeg = fw ? trim5 : trim3;
size_t trimEnd = fw ? trim3 : trim5;
assert(Edit::repOk(edits, read, fw, trim5, trim3));
if(!fw) {
invertPoss(const_cast<EList<Edit>&>(edits), read.length()-trimBeg-trimEnd, false);
}
for(size_t i = 0; i < rdlen; i++) {
ASSERT_ONLY(int c = read[i]);
assert_range(0, 4, c);
bool del = false, mm = false;
bool append = i >= trimBeg && rdlen - i - 1 >= trimEnd;
bool appendIns = i >= trimBeg && rdlen - i >= trimEnd;
while(eidx < edits.size() && edits[eidx].pos+trimBeg == i) {
if(edits[eidx].isReadGap()) {
// Inserted characters come before the position's
// character
if(appendIns) {
ref.appendChar((char)edits[eidx].chr);
}
} else if(edits[eidx].isRefGap()) {
assert_eq("ACGTN"[c], edits[eidx].qchr);
del = true;
} else if(edits[eidx].isMismatch()){
mm = true;
assert(edits[eidx].qchr != edits[eidx].chr || edits[eidx].qchr == 'N');
assert_eq("ACGTN"[c], edits[eidx].qchr);
if(append) {
ref.appendChar((char)edits[eidx].chr);
}
}
eidx++;
}
if(!del && !mm) {
if(append) {
ref.append(read[i]);
}
}
}
if(trimEnd == 0) {
while(eidx < edits.size()) {
assert_gt(rdlen, edits[eidx].pos);
if(edits[eidx].isReadGap()) {
ref.appendChar((char)edits[eidx].chr);
}
eidx++;
}
}
if(!fw) {
invertPoss(const_cast<EList<Edit>&>(edits), read.length()-trimBeg-trimEnd, false);
}
}
#ifndef NDEBUG
/**
* Check that the edit is internally consistent.
*/
bool Edit::repOk() const {
assert(inited());
// Ref and read characters cannot be the same unless they're both Ns
if(type != EDIT_TYPE_SPL) {
assert(qchr != chr || qchr == 'N');
// Type must match characters
assert(isRefGap() || chr != '-');
assert(isReadGap() || qchr != '-');
assert(!isMismatch() || (qchr != '-' && chr != '-'));
} else {
assert_gt(splLen, 0);
}
return true;
}
/**
* Given a list of edits and a DNA string representing the query
* sequence, check that the edits are consistent with respect to the
* query.
*/
bool Edit::repOk(
const EList<Edit>& edits,
const BTDnaString& s,
bool fw,
size_t trimBeg,
size_t trimEnd)
{
if(!fw) {
invertPoss(const_cast<EList<Edit>&>(edits), s.length()-trimBeg-trimEnd, false);
swap(trimBeg, trimEnd);
}
for(size_t i = 0; i < edits.size(); i++) {
const Edit& e = edits[i];
size_t pos = e.pos;
if(i > 0) {
assert_geq(pos, edits[i-1].pos);
}
bool del = false, mm = false;
while(i < edits.size() && edits[i].pos == pos) {
const Edit& ee = edits[i];
assert_lt(ee.pos, s.length());
if(ee.type != EDIT_TYPE_SPL) {
if(ee.qchr != '-') {
assert(ee.isRefGap() || ee.isMismatch());
//assert_eq((int)ee.qchr, s.toChar(ee.pos+trimBeg));
}
}
if(ee.isMismatch()) {
assert(!mm);
mm = true;
assert(!del);
} else if(ee.isReadGap()) {
assert(!mm);
} else if(ee.isRefGap()) {
assert(!mm);
assert(!del);
del = true;
} else if(ee.isSpliced()) {
}
i++;
}
}
if(!fw) {
invertPoss(const_cast<EList<Edit>&>(edits), s.length()-trimBeg-trimEnd, false);
}
return true;
}
#endif
/**
* Merge second argument into the first. Assume both are sorted to
* begin with.
*/
void Edit::merge(EList<Edit>& dst, const EList<Edit>& src) {
size_t di = 0, si = 0;
while(di < dst.size()) {
if(src[si].pos < dst[di].pos) {
dst.insert(src[si], di);
si++; di++;
} else if(src[si].pos == dst[di].pos) {
// There can be two inserts at a given position, but we
// can't merge them because there's no way to know their
// order
assert(src[si].isReadGap() != dst[di].isReadGap());
if(src[si].isReadGap()) {
dst.insert(src[si], di);
si++; di++;
} else if(dst[di].isReadGap()) {
di++;
}
}
}
while(si < src.size()) dst.push_back(src[si++]);
}
/**
* Clip off some of the low-numbered positions.
*/
void Edit::clipLo(EList<Edit>& ed, size_t len, size_t amt) {
size_t nrm = 0;
for(size_t i = 0; i < ed.size(); i++) {
assert_lt(ed[i].pos, len);
if(ed[i].pos < amt) {
nrm++;
} else {
// Shift everyone else up
ed[i].pos -= (uint32_t)amt;
}
}
ed.erase(0, nrm);
}
/**
* Clip off some of the high-numbered positions.
*/
void Edit::clipHi(EList<Edit>& ed, size_t len, size_t amt) {
assert_leq(amt, len);
size_t max = len - amt;
size_t nrm = 0;
for(size_t i = 0; i < ed.size(); i++) {
size_t ii = ed.size() - i - 1;
assert_lt(ed[ii].pos, len);
if(ed[ii].pos > max) {
nrm++;
} else if(ed[ii].pos == max && !ed[ii].isReadGap()) {
nrm++;
} else {
break;
}
}
ed.resize(ed.size() - nrm);
}