hisat-3n/alt.h
2025-01-18 21:09:52 +08:00

295 lines
8.3 KiB
C++

/*
* Copyright 2015, Daehwan Kim <infphilo@gmail.com>
*
* This file is part of HISAT 2.
*
* HISAT 2 is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* HISAT 2 is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with HISAT 2. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef ALT_H_
#define ALT_H_
#include <iostream>
#include <fstream>
#include <limits>
#include "assert_helpers.h"
#include "word_io.h"
#include "mem_ids.h"
using namespace std;
enum ALT_TYPE {
ALT_NONE = 0,
ALT_SNP_SGL, // single nucleotide substitution
ALT_SNP_INS, // small insertion wrt reference genome
ALT_SNP_DEL, // small deletion wrt reference genome
ALT_SNP_ALT, // alternative sequence (to be implemented ...)
ALT_SPLICESITE,
ALT_EXON
};
template <typename index_t>
struct ALT {
ALT() {
reset();
}
void reset() {
type = ALT_NONE;
pos = len = 0;
seq = 0;
}
ALT_TYPE type;
union {
index_t pos;
index_t left;
};
union {
index_t len;
index_t right;
};
union {
uint64_t seq; // used to store 32 bp, but it can be used to store a pointer to EList<uint64_t>
struct {
union {
bool fw;
bool reversed;
};
bool excluded;
};
};
public:
// in order to support a sequence longer than 32 bp
bool snp() const { return type == ALT_SNP_SGL || type == ALT_SNP_DEL || type == ALT_SNP_INS; }
bool splicesite() const { return type == ALT_SPLICESITE; }
bool mismatch() const { return type == ALT_SNP_SGL; }
bool gap() const { return type == ALT_SNP_DEL || type == ALT_SNP_INS || type == ALT_SPLICESITE; }
bool deletion() const { return type == ALT_SNP_DEL; }
bool insertion() const { return type == ALT_SNP_INS; }
bool exon() const { return type == ALT_EXON; }
bool operator< (const ALT& o) const {
if(pos != o.pos) return pos < o.pos;
if(type != o.type) {
if(type == ALT_NONE || o.type == ALT_NONE) {
return type == ALT_NONE;
}
if(type == ALT_SNP_INS) return true;
else if(o.type == ALT_SNP_INS) return false;
return type < o.type;
}
if(len != o.len) return len < o.len;
if(seq != o.seq) return seq < o.seq;
return false;
}
bool compatibleWith(const ALT& o) const {
if(pos == o.pos) return false;
// sort the two SNPs
const ALT& a = (pos < o.pos ? *this : o);
const ALT& b = (pos < o.pos ? o : *this);
if(a.snp()) {
if(a.type == ALT_SNP_DEL || a.type == ALT_SNP_INS) {
if(b.pos <= a.pos + a.len) {
return false;
}
}
} else if(a.splicesite()) {
if(b.pos <= a.right + 2) {
return false;
}
} else {
assert(false);
}
return true;
}
bool isSame(const ALT& o) const {
if(type != o.type)
return false;
if(type == ALT_SNP_SGL) {
return pos == o.pos && seq == o.seq;
} else if(type == ALT_SNP_DEL || type == ALT_SNP_INS || type == ALT_SPLICESITE) {
if(type == ALT_SNP_INS) {
if(seq != o.seq)
return false;
}
if(reversed == o.reversed) {
return pos == o.pos && len == o.len;
} else {
if(reversed) {
return pos - len + 1 == o.pos && len == o.len;
} else {
return pos == o.pos - o.len + 1 && len == o.len;
}
}
} else {
assert(false);
}
return true;
}
#ifndef NDEBUG
bool repOk() const {
if(type == ALT_SNP_SGL) {
if(len != 1) {
assert(false);
return false;
}
if(seq > 3) {
assert(false);
return false;
}
} else if(type == ALT_SNP_DEL) {
if(len <= 0) {
assert(false);
return false;
}
if(seq != 0) {
assert(false);
return false;
}
} else if(type == ALT_SNP_INS) {
if(len <= 0) {
assert(false);
return false;
}
} else if(type == ALT_SPLICESITE) {
assert_lt(left, right);
assert_leq(fw, 1);
}else {
assert(false);
return false;
}
return true;
}
#endif
bool write(ofstream& f_out, bool bigEndian) const {
writeIndex<index_t>(f_out, pos, bigEndian);
writeU32(f_out, type, bigEndian);
writeIndex<index_t>(f_out, len, bigEndian);
writeIndex<uint64_t>(f_out, seq, bigEndian);
return true;
}
bool read(ifstream& f_in, bool bigEndian) {
pos = readIndex<index_t>(f_in, bigEndian);
type = (ALT_TYPE)readU32(f_in, bigEndian);
assert_neq(type, ALT_SNP_ALT);
len = readIndex<index_t>(f_in, bigEndian);
seq = readIndex<uint64_t>(f_in, bigEndian);
return true;
}
};
template <typename index_t>
struct Haplotype {
Haplotype() {
reset();
}
void reset() {
left = right = 0;
alts.clear();
}
index_t left;
index_t right;
EList<index_t, 1> alts;
bool operator< (const Haplotype& o) const {
if(left != o.left) return left < o.left;
if(right != o.right) return right < o.right;
return false;
}
bool write(ofstream& f_out, bool bigEndian) const {
writeIndex<index_t>(f_out, left, bigEndian);
writeIndex<index_t>(f_out, right, bigEndian);
writeIndex<index_t>(f_out, alts.size(), bigEndian);
for(index_t i = 0; i < alts.size(); i++) {
writeIndex<index_t>(f_out, alts[i], bigEndian);
}
return true;
}
bool read(ifstream& f_in, bool bigEndian) {
left = readIndex<index_t>(f_in, bigEndian);
right = readIndex<index_t>(f_in, bigEndian);
assert_leq(left, right);
index_t num_alts = readIndex<index_t>(f_in, bigEndian);
alts.resizeExact(num_alts); alts.clear();
for(index_t i = 0; i < num_alts; i++) {
alts.push_back(readIndex<index_t>(f_in, bigEndian));
}
return true;
}
};
template <typename index_t>
class ALTDB {
public:
ALTDB() :
_snp(false),
_ss(false),
_exon(false)
{}
virtual ~ALTDB() {}
bool hasSNPs() const { return _snp; }
bool hasSpliceSites() const { return _ss; }
bool hasExons() const { return _exon; }
void setSNPs(bool snp) { _snp = snp; }
void setSpliceSites(bool ss) { _ss = ss; }
void setExons(bool exon) { _exon = exon; }
EList<ALT<index_t> >& alts() { return _alts; }
EList<string>& altnames() { return _altnames; }
EList<Haplotype<index_t> >& haplotypes() { return _haplotypes; }
EList<index_t>& haplotype_maxrights() { return _haplotype_maxrights; }
const EList<ALT<index_t> >& alts() const { return _alts; }
const EList<string>& altnames() const { return _altnames; }
const EList<Haplotype<index_t> >& haplotypes() const { return _haplotypes; }
const EList<index_t>& haplotype_maxrights() const { return _haplotype_maxrights; }
private:
bool _snp;
bool _ss;
bool _exon;
EList<ALT<index_t> > _alts;
EList<string> _altnames;
EList<Haplotype<index_t> > _haplotypes;
EList<index_t> _haplotype_maxrights;
};
#endif /*ifndef ALT_H_*/