616 lines
18 KiB
C++
616 lines
18 KiB
C++
/*
|
|
* Copyright 2013, Daehwan Kim <infphilo@gmail.com>
|
|
*
|
|
* This file is part of HISAT.
|
|
*
|
|
* HISAT is free software: you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation, either version 3 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* HISAT is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with HISAT. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
#ifndef SPLICE_SITE_H_
|
|
#define SPLICE_SITE_H_
|
|
|
|
#include <stdint.h>
|
|
#include <iostream>
|
|
#include <fstream>
|
|
#include <limits>
|
|
#include "assert_helpers.h"
|
|
#include "mem_ids.h"
|
|
#include "ref_coord.h"
|
|
#include "ds.h"
|
|
#include "read.h"
|
|
#include "reference.h"
|
|
#include "hier_idx_common.h"
|
|
#include "gfm.h"
|
|
#include "alt.h"
|
|
|
|
enum {
|
|
SPL_UNKNOWN = 1,
|
|
SPL_FW,
|
|
SPL_RC,
|
|
SPL_SEMI_FW,
|
|
SPL_SEMI_RC,
|
|
};
|
|
|
|
using namespace std;
|
|
|
|
// #define NEW_PROB_MODEL
|
|
// the following parameters are borrowed from Yeo and Burge 2004 in Journal of Computational Biology
|
|
const size_t donor_exonic_len = 3;
|
|
const size_t donor_intronic_len = 6;
|
|
const size_t donor_len = donor_exonic_len + donor_intronic_len;
|
|
|
|
#if defined(NEW_PROB_MODEL)
|
|
const size_t acceptor_intronic_len = 20;
|
|
const size_t acceptor_exonic_len = 3;
|
|
const size_t acceptor_len = acceptor_intronic_len + acceptor_exonic_len;
|
|
#else
|
|
// the following parameters are borrowed from Ch 3. in Bioinformatics - From Genomes to Drugs. Volume I: Basic Technologies (Victor Solovyev)
|
|
const size_t acceptor_intronic_len = 14;
|
|
const size_t acceptor_exonic_len = 1;
|
|
const size_t acceptor_len = acceptor_intronic_len + acceptor_exonic_len;
|
|
const size_t acceptor_len1 = acceptor_len / 2;
|
|
const size_t acceptor_len2 = acceptor_len - acceptor_len1;
|
|
|
|
// the following parameters are borrowed from Yeo and Burge 2004 in Journal of Computational Biology
|
|
const float background_prob[4] = {0.27f, 0.23f, 0.23f, 0.27f};
|
|
|
|
extern float donor_prob[4][donor_len];
|
|
extern float acceptor_prob[4][acceptor_len];
|
|
|
|
extern float donor_prob_sum[1 << (donor_len << 1)];
|
|
extern float acceptor_prob_sum1[1 << (acceptor_len1 << 1)];
|
|
extern float acceptor_prob_sum2[1 << (acceptor_len2 << 1)];
|
|
|
|
#endif
|
|
const size_t intronic_len = max<size_t>(donor_intronic_len, acceptor_intronic_len);
|
|
|
|
extern void init_junction_prob();
|
|
|
|
/**
|
|
*
|
|
*/
|
|
class SpliceSitePos {
|
|
|
|
public:
|
|
|
|
SpliceSitePos() { reset(); }
|
|
|
|
SpliceSitePos(const SpliceSitePos& c) { init(c); }
|
|
|
|
SpliceSitePos(uint32_t ref, uint32_t left, uint32_t right, uint8_t splDir, bool exon = false)
|
|
{
|
|
init(ref, left, right, splDir, exon);
|
|
}
|
|
|
|
/**
|
|
* Copy given fields into this Coord.
|
|
*/
|
|
void init(uint32_t ref, uint32_t left, uint32_t right, uint8_t splDir, bool exon = false) {
|
|
_ref = ref;
|
|
_left = left;
|
|
_right = right;
|
|
_splDir = splDir;
|
|
_exon = exon;
|
|
}
|
|
|
|
/**
|
|
* Copy contents of given Coord into this one.
|
|
*/
|
|
void init(const SpliceSitePos& c) {
|
|
_ref = c._ref;
|
|
_left = c._left;
|
|
_right = c._right;
|
|
_splDir = c._splDir;
|
|
_exon = c._exon;
|
|
}
|
|
|
|
/**
|
|
* Return true iff this Coord is identical to the given Coord.
|
|
*/
|
|
bool operator==(const SpliceSitePos& o) const {
|
|
assert(inited());
|
|
assert(o.inited());
|
|
return _ref == o._ref &&
|
|
_left == o._left &&
|
|
_right == o._right &&
|
|
_splDir == o._splDir &&
|
|
_exon == o._exon;
|
|
|
|
}
|
|
|
|
/**
|
|
* Return true iff this Coord is less than the given Coord. One Coord is
|
|
* less than another if (a) its reference id is less, (b) its orientation is
|
|
* less, or (c) its offset is less.
|
|
*/
|
|
bool operator<(const SpliceSitePos& o) const {
|
|
if(_ref < o._ref) return true;
|
|
if(_ref > o._ref) return false;
|
|
if(_left < o._left) return true;
|
|
if(_left > o._left) return false;
|
|
if(_right < o._right) return true;
|
|
if(_right > o._right) return false;
|
|
if(_splDir < o._splDir) return true;
|
|
if(_splDir > o._splDir) return false;
|
|
if(_exon != o._exon) return _exon;
|
|
return false;
|
|
}
|
|
|
|
/**
|
|
* Return the opposite result from operator<.
|
|
*/
|
|
bool operator>=(const SpliceSitePos& o) const {
|
|
return !((*this) < o);
|
|
}
|
|
|
|
/**
|
|
* Return true iff this Coord is greater than the given Coord. One Coord
|
|
* is greater than another if (a) its reference id is greater, (b) its
|
|
* orientation is greater, or (c) its offset is greater.
|
|
*/
|
|
bool operator>(const SpliceSitePos& o) const {
|
|
if(_ref > o._ref) return true;
|
|
if(_ref < o._ref) return false;
|
|
if(_left > o._left) return true;
|
|
if(_left < o._left) return false;
|
|
if(_right > o._right) return true;
|
|
if(_right < o._right) return false;
|
|
if(_splDir > o._splDir) return true;
|
|
if(_splDir < o._splDir) return false;
|
|
if(_exon != o._exon) return !_exon;
|
|
return false;
|
|
}
|
|
|
|
/**
|
|
* Return the opposite result from operator>.
|
|
*/
|
|
bool operator<=(const SpliceSitePos& o) const {
|
|
return !((*this) > o);
|
|
}
|
|
|
|
/**
|
|
* Reset this coord to uninitialized state.
|
|
*/
|
|
virtual void reset() {
|
|
_ref = std::numeric_limits<uint32_t>::max();
|
|
_left = std::numeric_limits<uint32_t>::max();
|
|
_right = std::numeric_limits<uint32_t>::max();
|
|
_splDir = SPL_UNKNOWN;
|
|
_exon = false;
|
|
}
|
|
|
|
/**
|
|
* Return true iff this Coord is initialized (i.e. ref and off have both
|
|
* been set since the last call to reset()).
|
|
*/
|
|
bool inited() const {
|
|
if(_ref != std::numeric_limits<uint32_t>::max() &&
|
|
_left != std::numeric_limits<uint32_t>::max() &&
|
|
_right != std::numeric_limits<uint32_t>::max())
|
|
{
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
#ifndef NDEBUG
|
|
/**
|
|
* Check that coord is internally consistent.
|
|
*/
|
|
bool repOk() const {
|
|
if(_ref == std::numeric_limits<uint32_t>::max() ||
|
|
_left == std::numeric_limits<uint32_t>::max() ||
|
|
_right == std::numeric_limits<uint32_t>::max()) {
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
#endif
|
|
|
|
/**
|
|
* Check whether an interval defined by this coord and having
|
|
* length 'len' is contained within an interval defined by
|
|
* 'inbegin' and 'inend'.
|
|
*/
|
|
#if 0
|
|
bool within(int64_t len, int64_t inbegin, int64_t inend) const {
|
|
return off_ >= inbegin && off_ + len <= inend;
|
|
}
|
|
#endif
|
|
|
|
uint32_t ref() const { return _ref; }
|
|
uint32_t left() const { return _left; }
|
|
uint32_t right() const { return _right; }
|
|
uint8_t splDir() const { return _splDir; }
|
|
bool canonical() const { return _splDir == SPL_FW || _splDir == SPL_RC; }
|
|
uint32_t intron_len() const { return _right - _left - 1; }
|
|
bool exon() const { return _exon; }
|
|
|
|
protected:
|
|
|
|
uint32_t _ref; // which reference?
|
|
uint32_t _left; // 0-based offset of the right most base of the left flanking exon
|
|
uint32_t _right; // 0-based offset of the left most base of the right flanking exon
|
|
uint8_t _splDir;
|
|
bool _exon;
|
|
};
|
|
|
|
/**
|
|
*
|
|
*/
|
|
class SpliceSite : public SpliceSitePos {
|
|
|
|
public:
|
|
|
|
SpliceSite() { reset(); }
|
|
|
|
SpliceSite(const SpliceSite& c) { init(c); }
|
|
|
|
SpliceSite(uint32_t ref,
|
|
uint32_t left,
|
|
uint32_t right,
|
|
uint8_t splDir,
|
|
bool exon = false,
|
|
bool fromFile = false,
|
|
bool known = false)
|
|
{
|
|
init(ref, left, right, splDir, exon, fromFile, known);
|
|
}
|
|
|
|
/**
|
|
* Copy given fields into this Coord.
|
|
*/
|
|
void init(uint32_t ref,
|
|
uint32_t left,
|
|
uint32_t right,
|
|
uint8_t splDir,
|
|
bool exon = false,
|
|
bool fromFile = false,
|
|
bool known = false)
|
|
{
|
|
SpliceSitePos::init(ref, left, right, splDir, exon);
|
|
|
|
// _donordint = 0;
|
|
// _acceptordint = 0;
|
|
// _leftseq = 0;
|
|
// _rightseq = 0;
|
|
_leftext = 0;
|
|
_rightext = 0;
|
|
_numreads = 0;
|
|
_editdist = 0;
|
|
// _probscore = 0.0f;
|
|
_readid = 0;
|
|
_exon = exon;
|
|
_fromfile = fromFile;
|
|
_known = known;
|
|
}
|
|
|
|
/**
|
|
* Copy contents of given Coord into this one.
|
|
*/
|
|
void init(const SpliceSite& c) {
|
|
SpliceSitePos::init(c);
|
|
|
|
// _donordint = c._donordint;
|
|
// _acceptordint = c._acceptordint;
|
|
// _leftseq = c._leftseq;
|
|
// _rightseq = c._rightseq;
|
|
_leftext = c._leftext;
|
|
_rightext = c._rightext;
|
|
_numreads = c._numreads;
|
|
_editdist = c._editdist;
|
|
// _probscore = c._probscore;
|
|
_readid = c._readid;
|
|
_fromfile = c._fromfile;
|
|
_known = c._known;
|
|
}
|
|
|
|
/**
|
|
* Reset this coord to uninitialized state.
|
|
*/
|
|
virtual void reset() {
|
|
SpliceSitePos::reset();
|
|
|
|
// _donordint = 0;
|
|
// _acceptordint = 0;
|
|
// _leftseq = 0;
|
|
// _rightseq = 0;
|
|
_leftext = 0;
|
|
_rightext = 0;
|
|
_numreads = 0;
|
|
_editdist = 0;
|
|
// _probscore = 0.0;
|
|
_readid = 0;
|
|
_fromfile = false;
|
|
_known = false;
|
|
}
|
|
|
|
// uint8_t donordint() const { return _donordint; }
|
|
// uint8_t acceptordint() const { return _acceptordint; }
|
|
// uint64_t leftseq() const { return _leftseq; }
|
|
// uint64_t rightseq() const { return _rightseq; }
|
|
|
|
uint32_t leftext() const { return _leftext; }
|
|
uint32_t rightext() const { return _rightext; }
|
|
|
|
uint64_t numreads() const { return _numreads; }
|
|
uint32_t editdist() const { return _editdist; }
|
|
// float probscore() const { return _probscore; }
|
|
|
|
public:
|
|
|
|
// uint8_t _donordint; // 3' dinucleotide on Watson strand
|
|
// uint8_t _acceptordint; // 5' dinucleotide on Watson strand
|
|
// uint64_t _leftseq; // left 32bp flanking seq on Watson strand
|
|
// uint64_t _rightseq; // right 32bp flanking seq on Watson strand
|
|
|
|
uint32_t _leftext;
|
|
uint32_t _rightext;
|
|
|
|
uint64_t _numreads; // number of reads spanning this splice site
|
|
|
|
uint32_t _editdist;
|
|
|
|
// float _probscore;
|
|
|
|
uint64_t _readid;
|
|
bool _fromfile;
|
|
bool _known;
|
|
};
|
|
|
|
std::ostream& operator<<(std::ostream& out, const SpliceSite& c);
|
|
|
|
/**
|
|
*
|
|
*/
|
|
class Exon {
|
|
public:
|
|
|
|
Exon() { reset(); }
|
|
|
|
Exon(const Exon& c) { init(c); }
|
|
|
|
Exon(uint32_t ref, uint32_t left, uint32_t right, bool fw)
|
|
{
|
|
init(ref, left, right, fw);
|
|
}
|
|
|
|
/**
|
|
* Copy given fields into this Coord.
|
|
*/
|
|
void init(uint32_t ref, uint32_t left, uint32_t right, bool fw) {
|
|
_ref = ref;
|
|
_left = left;
|
|
_right = right;
|
|
_fw = fw;
|
|
}
|
|
|
|
/**
|
|
* Copy contents of given Coord into this one.
|
|
*/
|
|
void init(const Exon& c) {
|
|
_ref = c._ref;
|
|
_left = c._left;
|
|
_right = c._right;
|
|
_fw = c._fw;
|
|
}
|
|
|
|
/**
|
|
* Return true iff this Coord is identical to the given Coord.
|
|
*/
|
|
bool operator==(const Exon& o) const {
|
|
assert(inited());
|
|
assert(o.inited());
|
|
return _ref == o._ref &&
|
|
_left == o._left &&
|
|
_right == o._right &&
|
|
_fw == o._fw;
|
|
}
|
|
|
|
/**
|
|
* Return true iff this Coord is less than the given Coord. One Coord is
|
|
* less than another if (a) its reference id is less, (b) its orientation is
|
|
* less, or (c) its offset is less.
|
|
*/
|
|
bool operator<(const Exon& o) const {
|
|
if(_ref < o._ref) return true;
|
|
if(_ref > o._ref) return false;
|
|
if(_left < o._left) return true;
|
|
if(_left > o._left) return false;
|
|
if(_right < o._right) return true;
|
|
if(_right > o._right) return false;
|
|
if(_fw != o._fw) return _fw;
|
|
return false;
|
|
}
|
|
|
|
/**
|
|
* Return the opposite result from operator<.
|
|
*/
|
|
bool operator>=(const Exon& o) const {
|
|
return !((*this) < o);
|
|
}
|
|
|
|
/**
|
|
* Return true iff this Coord is greater than the given Coord. One Coord
|
|
* is greater than another if (a) its reference id is greater, (b) its
|
|
* orientation is greater, or (c) its offset is greater.
|
|
*/
|
|
bool operator>(const Exon& o) const {
|
|
if(_ref > o._ref) return true;
|
|
if(_ref < o._ref) return false;
|
|
if(_left > o._left) return true;
|
|
if(_left < o._left) return false;
|
|
if(_right > o._right) return true;
|
|
if(_right < o._right) return false;
|
|
if(_fw != o._fw) return !_fw;
|
|
return false;
|
|
}
|
|
|
|
/**
|
|
* Return the opposite result from operator>.
|
|
*/
|
|
bool operator<=(const Exon& o) const {
|
|
return !((*this) > o);
|
|
}
|
|
|
|
/**
|
|
* Reset this coord to uninitialized state.
|
|
*/
|
|
void reset() {
|
|
_ref = std::numeric_limits<uint32_t>::max();
|
|
_left = std::numeric_limits<uint32_t>::max();
|
|
_right = std::numeric_limits<uint32_t>::max();
|
|
_fw = true;
|
|
}
|
|
|
|
/**
|
|
* Return true iff this Coord is initialized (i.e. ref and off have both
|
|
* been set since the last call to reset()).
|
|
*/
|
|
bool inited() const {
|
|
if(_ref != std::numeric_limits<uint32_t>::max() &&
|
|
_left != std::numeric_limits<uint32_t>::max() &&
|
|
_right != std::numeric_limits<uint32_t>::max())
|
|
{
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
#ifndef NDEBUG
|
|
/**
|
|
* Check that coord is internally consistent.
|
|
*/
|
|
bool repOk() const {
|
|
if(_ref == std::numeric_limits<uint32_t>::max() ||
|
|
_left == std::numeric_limits<uint32_t>::max() ||
|
|
_right == std::numeric_limits<uint32_t>::max()) {
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
#endif
|
|
|
|
uint32_t ref() const { return _ref; }
|
|
uint32_t left() const { return _left; }
|
|
uint32_t right() const { return _right; }
|
|
bool fw() const { return _fw; }
|
|
|
|
protected:
|
|
uint32_t _ref; // which reference?
|
|
uint32_t _left; // 0-based offset of the right most base of the left flanking exon
|
|
uint32_t _right; // 0-based offset of the left most base of the right flanking exon
|
|
bool _fw; // true -> Watson strand
|
|
};
|
|
|
|
class AlnRes;
|
|
|
|
class SpliceSiteDB {
|
|
public:
|
|
typedef RedBlackNode<SpliceSitePos, uint32_t> Node;
|
|
|
|
public:
|
|
SpliceSiteDB(
|
|
const BitPairReference& refs,
|
|
const EList<string>& refnames,
|
|
bool threadSafe = true,
|
|
bool write = false,
|
|
bool read = false);
|
|
~SpliceSiteDB();
|
|
|
|
bool addSpliceSite(
|
|
const Read& rd,
|
|
const AlnRes& rs,
|
|
uint32_t minAnchorLen = 15);
|
|
|
|
static float probscore(
|
|
int64_t donor_seq,
|
|
int64_t acceptor_seq);
|
|
|
|
size_t size(uint64_t ref) const;
|
|
bool empty(uint64_t ref) const;
|
|
|
|
bool empty() { return _empty; }
|
|
|
|
bool write() const { return _write; }
|
|
bool read() const { return _read; }
|
|
|
|
bool getSpliceSite(SpliceSite& ss) const;
|
|
void getLeftSpliceSites(uint32_t ref, uint32_t left, uint32_t range, EList<SpliceSite>& spliceSites) const;
|
|
void getRightSpliceSites(uint32_t ref, uint32_t right, uint32_t range, EList<SpliceSite>& spliceSites) const;
|
|
bool hasSpliceSites(uint32_t ref, uint32_t left1, uint32_t right1, uint32_t left2, uint32_t right2, bool includeNovel = false) const;
|
|
bool insideExon(uint32_t ref, uint32_t left, uint32_t right) const;
|
|
|
|
void print(ofstream& out);
|
|
void read(const GFM<TIndexOffU>& gfm, const EList<ALT<TIndexOffU> >& alts);
|
|
void read(ifstream& in, bool known = false);
|
|
|
|
private:
|
|
void getSpliceSites_recur(
|
|
const RedBlackNode<SpliceSitePos, uint32_t> *node,
|
|
uint32_t left,
|
|
uint32_t right,
|
|
EList<SpliceSite>& spliceSites) const;
|
|
|
|
bool hasSpliceSites_recur(
|
|
const RedBlackNode<SpliceSitePos, uint32_t> *node,
|
|
uint32_t left,
|
|
uint32_t right,
|
|
bool includeNovel) const;
|
|
|
|
const RedBlackNode<SpliceSitePos, uint32_t>* getSpliceSite_temp(const SpliceSitePos& ssp) const;
|
|
|
|
void print_recur(
|
|
const RedBlackNode<SpliceSitePos, uint32_t> *node,
|
|
ofstream& out,
|
|
const uint32_t numreads_cutoff,
|
|
const uint32_t numreads_cutoff2,
|
|
EList<SpliceSite>& ss_list);
|
|
|
|
Pool& pool(uint64_t ref);
|
|
|
|
void print_impl(
|
|
ofstream& out,
|
|
EList<SpliceSite>& ss_list,
|
|
const SpliceSite* ss = NULL);
|
|
|
|
private:
|
|
uint64_t _numRefs;
|
|
EList<string> _refnames;
|
|
|
|
EList<RedBlack<SpliceSitePos, uint32_t>* > _fwIndex;
|
|
EList<RedBlack<SpliceSitePos, uint32_t>* > _bwIndex;
|
|
|
|
ELList<SpliceSite> _spliceSites;
|
|
|
|
ELList<Pool*> _pool; // dispenses memory pages
|
|
|
|
bool _write;
|
|
bool _read;
|
|
|
|
EList<MUTEX_T> _mutex;
|
|
bool _threadSafe;
|
|
|
|
SStringExpandable<char> raw_refbuf;
|
|
ASSERT_ONLY(SStringExpandable<uint32_t> destU32);
|
|
BTDnaString donorstr;
|
|
BTDnaString acceptorstr;
|
|
|
|
bool _empty;
|
|
|
|
EList<Exon> _exons;
|
|
};
|
|
|
|
#endif /*ifndef SPLICE_SITE_H_*/
|