hisat-3n/edit.h

402 lines
9.9 KiB
C
Raw Permalink Normal View History

2025-01-18 13:09:52 +00:00
/*
* Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
*
* This file is part of Bowtie 2.
*
* Bowtie 2 is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Bowtie 2 is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef EDIT_H_
#define EDIT_H_
#include <iostream>
#include <stdint.h>
#include <limits>
#include "assert_helpers.h"
#include "filebuf.h"
#include "sstring.h"
#include "ds.h"
#include "splice_site.h"
/**
* 3 types of edits; mismatch (substitution), insertion in the
* reference, deletion in the reference.
*/
enum {
EDIT_TYPE_READ_GAP = 1,
EDIT_TYPE_REF_GAP,
EDIT_TYPE_MM,
EDIT_TYPE_SNP,
EDIT_TYPE_SPL, // splicing of pre-messenger RNAs into messenger RNAs
};
/**
* Encapsulates an edit between the read sequence and the reference sequence.
* We obey a few conventions when populating its fields. The fields are:
*
* uint8_t chr; // reference character involved (for subst and ins)
* uint8_t qchr; // read character involved (for subst and del)
* uint8_t type; // 1 -> mm, 2 -> SNP, 3 -> ins, 4 -> del
* uint32_t pos; // position w/r/t search root
*
* One convention is that pos is always an offset w/r/t the 5' end of the read.
*
* Another is that chr and qchr are expressed in terms of the nucleotides on
* the forward version of the read. So if we're aligning the reverse
* complement of the read, and an A in the reverse complement mismatches a C in
* the reference, chr should be G and qchr should be T.
*/
struct Edit {
Edit() { reset(); }
Edit(
uint32_t po,
int ch,
int qc,
int ty,
bool chrs = true,
uint32_t snp = std::numeric_limits<uint32_t>::max())
{
init(po, ch, qc, ty, chrs, snp);
}
Edit(
uint32_t po,
int ch,
int qc,
int ty,
uint32_t sl,
uint8_t sdir,
bool knowns,
bool chrs = true)
{
init(po, ch, qc, ty, sl, sdir, knowns, chrs);
}
/**
* Reset Edit to uninitialized state.
*/
void reset() {
pos = pos2 = std::numeric_limits<uint32_t>::max();
chr = qchr = type = 0;
splLen = 0;
splDir = SPL_UNKNOWN;
knownSpl = false;
snpID = std::numeric_limits<uint32_t>::max();
}
/**
* Return true iff the Edit is initialized.
*/
bool inited() const {
return pos != std::numeric_limits<uint32_t>::max();
}
/**
* Initialize a new Edit.
*/
void init(
uint32_t po,
int ch,
int qc,
int ty,
bool chrs = true,
uint32_t snp = std::numeric_limits<uint32_t>::max())
{
chr = ch;
qchr = qc;
type = ty;
splLen = 0;
splDir = SPL_UNKNOWN;
pos = po;
if(qc == '-') {
// Read gap
pos2 = std::numeric_limits<uint32_t>::max() >> 1;
} else {
pos2 = std::numeric_limits<uint32_t>::max();
}
snpID = snp;
if(!chrs) {
assert_range(0, 4, (int)chr);
assert_range(0, 4, (int)qchr);
chr = "ACGTN"[chr];
qchr = "ACGTN"[qchr];
}
#ifndef NDEBUG
if(type != EDIT_TYPE_SPL) {
assert_in(chr, "ACMGRSVTWYHKDBN-");
assert_in(qchr, "ACGTN-");
assert(chr != qchr || chr == 'N');
}
#endif
assert(inited());
}
/**
* Initialize a new Edit.
*/
void init(
uint32_t po,
int ch,
int qc,
int ty,
uint32_t sl,
uint32_t sdir,
bool knowns,
bool chrs = true)
{
assert_eq(ty, EDIT_TYPE_SPL);
init(po, ch, qc, ty, chrs);
splLen = sl;
splDir = sdir;
knownSpl = knowns;
}
/**
* Return true iff one part of the edit or the other has an 'N'.
*/
bool hasN() const {
assert(inited());
return chr == 'N' || qchr == 'N';
}
/**
* Edit less-than overload.
*/
int operator< (const Edit &rhs) const {
assert(inited());
if(pos < rhs.pos) return 1;
if(pos > rhs.pos) return 0;
if(pos2 < rhs.pos2) return 1;
if(pos2 > rhs.pos2) return 0;
if(type < rhs.type) return 1;
if(type > rhs.type) return 0;
if(chr < rhs.chr) return 1;
if(chr > rhs.chr) return 0;
return (qchr < rhs.qchr)? 1 : 0;
}
/**
* Edit equals overload.
*/
int operator== (const Edit &rhs) const {
assert(inited());
if(type != rhs.type)
return false;
if(pos != rhs.pos)
return false;
if(type != EDIT_TYPE_SPL) {
return chr == rhs.chr && qchr == rhs.qchr;
} else {
return pos2 == rhs.pos2 &&
splLen == rhs.splLen &&
splDir == rhs.splDir /* &&
knownSpl == rhs.knownSpl */;
}
}
/**
* Return true iff this Edit is an initialized insertion.
*/
bool isReadGap() const {
assert(inited());
return type == EDIT_TYPE_READ_GAP;
}
/**
* Return true iff this Edit is an initialized deletion.
*/
bool isRefGap() const {
assert(inited());
return type == EDIT_TYPE_REF_GAP;
}
/**
* Return true if this Edit is either an initialized deletion or an
* initialized insertion.
*/
bool isGap() const {
assert(inited());
return (type == EDIT_TYPE_REF_GAP || type == EDIT_TYPE_READ_GAP);
}
bool isSpliced() const {
assert(inited());
return type == EDIT_TYPE_SPL;
}
/**
* Return the number of gaps in the given edit list.
*/
static size_t numGaps(const EList<Edit>& es) {
size_t gaps = 0;
for(size_t i = 0; i < es.size(); i++) {
if(es[i].isGap()) gaps++;
}
return gaps;
}
/**
* Return true iff this Edit is an initialized mismatch.
*/
bool isMismatch() const {
assert(inited());
return type == EDIT_TYPE_MM;
}
/**
* Sort the edits in the provided list.
*/
static void sort(EList<Edit>& edits);
/**
* Flip all the edits.pos fields so that they're with respect to
* the other end of the read (of length 'sz').
*/
static void invertPoss(
EList<Edit>& edits,
size_t sz,
size_t ei,
size_t en,
bool sort = false);
/**
* Flip all the edits.pos fields so that they're with respect to
* the other end of the read (of length 'sz').
*/
static void invertPoss(EList<Edit>& edits, size_t sz, bool sort = false) {
invertPoss(edits, sz, 0, edits.size(), sort);
}
static void complement(EList<Edit>& edits);
/**
* Clip off some of the low-numbered positions.
*/
static void clipLo(EList<Edit>& edits, size_t len, size_t amt);
/**
* Clip off some of the high-numbered positions.
*/
static void clipHi(EList<Edit>& edits, size_t len, size_t amt);
/**
* Given a read string and some edits, generate and append the
* corresponding reference string to 'ref'.
*/
static void toRef(
const BTDnaString& read,
const EList<Edit>& edits,
BTDnaString& ref,
bool fw = true,
size_t trim5 = 0,
size_t trim3 = 0);
/**
* Given a string and its edits with respect to some other string,
* print the alignment between the strings with the strings stacked
* vertically, with vertical bars denoting matches.
*/
static void printQAlign(
std::ostream& os,
const BTDnaString& read,
const EList<Edit>& edits);
/**
* Given a string and its edits with respect to some other string,
* print the alignment between the strings with the strings stacked
* vertically, with vertical bars denoting matches. Add 'prefix'
* before each line of output.
*/
static void printQAlign(
std::ostream& os,
const char *prefix,
const BTDnaString& read,
const EList<Edit>& edits);
/**
* Given a string and its edits with respect to some other string,
* print the alignment between the strings with the strings stacked
* vertically, with vertical bars denoting matches.
*/
static void printQAlignNoCheck(
std::ostream& os,
const BTDnaString& read,
const EList<Edit>& edits);
/**
* Given a string and its edits with respect to some other string,
* print the alignment between the strings with the strings stacked
* vertically, with vertical bars denoting matches. Add 'prefix'
* before each line of output.
*/
static void printQAlignNoCheck(
std::ostream& os,
const char *prefix,
const BTDnaString& read,
const EList<Edit>& edits);
#ifndef NDEBUG
bool repOk() const;
/**
* Given a list of edits and a DNA string representing the query
* sequence, check that the edits are consistent with respect to the
* query.
*/
static bool repOk(
const EList<Edit>& edits,
const BTDnaString& s,
bool fw = true,
size_t trim5 = 0,
size_t trim3 = 0);
#endif
uint8_t chr; // reference character involved (for subst and ins)
uint8_t qchr; // read character involved (for subst and del)
uint8_t type; // 1 -> mm, 2 -> SNP, 3 -> ins, 4 -> del
uint32_t pos; // position w/r/t search root
uint32_t pos2; // Second int to take into account when sorting. Useful for
// sorting read gap edits that are all part of the same long
// gap.
uint32_t splLen; // skip over the genome due to an intron
uint8_t splDir;
bool knownSpl;
int64_t donor_seq;
int64_t acceptor_seq;
uint32_t snpID; // snp ID
friend std::ostream& operator<< (std::ostream& os, const Edit& e);
/**
* Print a comma-separated list of Edits to given output stream.
*/
static void print(
std::ostream& os,
const EList<Edit>& edits,
char delim = '\t');
/**
* Merge second argument into the first. Assume both are sorted to
* begin with.
*/
static void merge(EList<Edit>& dst, const EList<Edit>& src);
};
#endif /* EDIT_H_ */