hisat-3n/dp_framer.h
2025-01-18 21:09:52 +08:00

262 lines
9.1 KiB
C++

/*
* Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
*
* This file is part of Bowtie 2.
*
* Bowtie 2 is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Bowtie 2 is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* dp_framer.h
*
* Classes and routines for framing dynamic programming problems. There are 2
* basic types of dynamic programming problems solved in Bowtie 2:
*
* 1. Seed extension: we found a seed hit using Burrows-Wheeler techniques and
* now we would like to extend it into a full alignment by doing dynamic
* programming in the vicinity of the seed hit.
*
* 2. Mate finding: we found a full alignment for one mate in a pair and now we
* would like to extend it into a full alignment by doing dynamic
* programming in the area prescribed by the maximum and minimum fragment
* lengths.
*
* By "framing" the dynamic programming problem, we mean that all of the
* following DP inputs are calculated:
*
* 1. The width of the parallelogram/rectangle to explore.
* 2. The 0-based offset of the reference position associated with the leftmost
* diagonal/column in the parallelogram/rectangle to explore
* 3. An EList<bool> of length=width encoding which columns the alignment may
* start in
* 4. An EList<bool> of length=width encoding which columns the alignment may
* end in
*/
#ifndef DP_FRAMER_H_
#define DP_FRAMER_H_
#include <stdint.h>
#include "ds.h"
#include "ref_coord.h"
/**
* Describes a dynamic programming rectangle.
*
* Only knows about reference offsets, not reference sequences.
*/
struct DPRect {

    /**
     * Construct a rectangle with every field set to zero.
     *
     * All members are initialized (including refl_pretrim, refr_pretrim and
     * maxgap) so that entirelyTrimmed() and initIval() never read
     * indeterminate values on a freshly constructed object.
     *
     * 'cat' is accepted for backward compatibility and is not used.
     */
    DPRect(int cat = 0) /*: st(cat), en(cat)*/ :
        refl(0),
        refr(0),
        refl_pretrim(0),
        refr_pretrim(0),
        triml(0),
        trimr(0),
        corel(0),
        corer(0),
        maxgap(0)
    {
        (void)cat;
    }

    int64_t refl;         // leftmost ref offset involved post trimming (incl)
    int64_t refr;         // rightmost ref offset involved post trimming (incl)

    int64_t refl_pretrim; // leftmost ref offset involved pre trimming (incl)
    int64_t refr_pretrim; // rightmost ref offset involved pre trimming (incl)

    size_t  triml;        // positions trimmed from LHS
    size_t  trimr;        // positions trimmed from RHS

    // If "core" diagonals are specified, then any alignment reported has to
    // overlap one of the core diagonals.  This is to avoid the situation where
    // an alignment is reported that overlaps a better-scoring alignment that
    // falls partially outside the rectangle.  This is used in both seed
    // extensions and in mate finding.  Filtering based on the core diagonals
    // should happen in the backtrace routine.  I.e. it should simply never
    // return an alignment that doesn't overlap a core diagonal, even if there
    // is such an alignment and it's valid.

    size_t  corel; // offset of column where leftmost "core" diagonal starts
    size_t  corer; // offset of column where rightmost "core" diagonal starts
    // [corel, corer] is an inclusive range and offsets are with respect to the
    // original, untrimmed rectangle.

    size_t  maxgap; // max # gaps - width of the gap bands

    /**
     * Return true iff the combined effect of triml and trimr is to trim away
     * the entire rectangle, i.e. iff refr < refl.
     *
     * In debug builds this also checks that the answer agrees with comparing
     * the pre-trimming width against triml + trimr.
     */
    bool entirelyTrimmed() const {
        bool tr = refr < refl;
        ASSERT_ONLY(size_t width = (size_t)(refr_pretrim - refl_pretrim + 1));
        assert(tr == (width <= triml + trimr));
        return tr;
    }

#ifndef NDEBUG
    /**
     * Debug-only sanity check: the core diagonal range must not be inverted.
     * Always returns true; a violation trips the assertion instead.
     */
    bool repOk() const {
        assert_geq(corer, corel);
        return true;
    }
#endif

    /**
     * Set the given interval to the range of diagonals that are "covered" by
     * this dynamic programming problem.
     *
     * The interval's offset becomes refl_pretrim + corel and its length
     * becomes corer - corel + 1 (the inclusive core range).  Only 'iv' is
     * modified; this rectangle is left unchanged.
     */
    void initIval(Interval& iv) const {
        iv.setOff(refl_pretrim + (int64_t)corel);
        iv.setLen(corer - corel + 1);
    }
};
/**
* Encapsulates routines for calculating parameters for the various types of
* dynamic programming problems solved in Bowtie2.
*/
class DynProgFramer {

public:

    /**
     * Construct a framer, storing 'trimToRef' in trimToRef_.
     *
     * NOTE(review): how the flag is consumed is decided by the out-of-line
     * framing routines, which are not visible in this header.
     */
    DynProgFramer(bool trimToRef) : trimToRef_(trimToRef) { }

    /**
     * Similar to frameSeedExtensionParallelogram but we're being somewhat more
     * inclusive in order to ensure all characters along the "width" in the
     * last row are exhaustively scored.
     *
     * The framed problem is written to 'rect'.
     * NOTE(review): the meaning of the bool result is defined by the
     * out-of-line implementation - confirm there.
     */
    bool frameSeedExtensionRect(
        int64_t off,      // ref offset implied by seed hit assuming no gaps
        size_t rdlen,     // length of read sequence used in DP table (so len
                          // of +1 nucleotide sequence for colorspace reads)
        int64_t reflen,   // length of reference sequence aligned to
        size_t maxrdgap,  // max # of read gaps permitted in opp mate alignment
        size_t maxrfgap,  // max # of ref gaps permitted in opp mate alignment
        int64_t maxns,    // # Ns permitted
        size_t maxhalf,   // max width in either direction
        DPRect& rect);    // out: DP rectangle

    /**
     * Given information about an anchor mate hit, and information deduced by
     * PairedEndPolicy about where the opposite mate can begin and end given
     * the fragment length range, return parameters for the dynamic programming
     * problem to solve.
     *
     * This is a thin dispatcher: it forwards every argument unchanged to
     * frameFindMateAnchorLeftRect when 'anchorLeft' is true and to
     * frameFindMateAnchorRightRect otherwise, and returns that call's result.
     */
    bool frameFindMateRect(
        bool anchorLeft,  // true iff anchor alignment is to the left
        int64_t ll,       // leftmost Watson off for LHS of opp alignment
        int64_t lr,       // rightmost Watson off for LHS of opp alignment
        int64_t rl,       // leftmost Watson off for RHS of opp alignment
        int64_t rr,       // rightmost Watson off for RHS of opp alignment
        size_t rdlen,     // length of opposite mate
        int64_t reflen,   // length of reference sequence aligned to
        size_t maxrdgap,  // max # of read gaps permitted in opp mate alignment
        size_t maxrfgap,  // max # of ref gaps permitted in opp mate alignment
        int64_t maxns,    // max # Ns permitted
        size_t maxhalf,   // max width in either direction
        DPRect& rect)     // out: DP rectangle
        const
    {
        if(anchorLeft) {
            return frameFindMateAnchorLeftRect(
                ll,
                lr,
                rl,
                rr,
                rdlen,
                reflen,
                maxrdgap,
                maxrfgap,
                maxns,
                maxhalf,
                rect);
        } else {
            return frameFindMateAnchorRightRect(
                ll,
                lr,
                rl,
                rr,
                rdlen,
                reflen,
                maxrdgap,
                maxrfgap,
                maxns,
                maxhalf,
                rect);
        }
    }

    /**
     * Given information about an anchor mate hit, and information deduced by
     * PairedEndPolicy about where the opposite mate can begin and end given
     * the fragment length range, return parameters for the dynamic programming
     * problem to solve.
     *
     * This is the variant frameFindMateRect selects when the anchor
     * alignment is to the left.  The framed problem is written to 'rect'.
     */
    bool frameFindMateAnchorLeftRect(
        int64_t ll,       // leftmost Watson off for LHS of opp alignment
        int64_t lr,       // rightmost Watson off for LHS of opp alignment
        int64_t rl,       // leftmost Watson off for RHS of opp alignment
        int64_t rr,       // rightmost Watson off for RHS of opp alignment
        size_t rdlen,     // length of opposite mate
        int64_t reflen,   // length of reference sequence aligned to
        size_t maxrdgap,  // max # of read gaps permitted in opp mate alignment
        size_t maxrfgap,  // max # of ref gaps permitted in opp mate alignment
        int64_t maxns,    // max # Ns permitted in alignment
        size_t maxhalf,   // max width in either direction
        DPRect& rect)     // out: DP rectangle
        const;

    /**
     * Given information about an anchor mate hit, and information deduced by
     * PairedEndPolicy about where the opposite mate can begin and end given
     * the fragment length range, return parameters for the dynamic programming
     * problem to solve.
     *
     * This is the variant frameFindMateRect selects when the anchor
     * alignment is not to the left.  The framed problem is written to 'rect'.
     */
    bool frameFindMateAnchorRightRect(
        int64_t ll,       // leftmost Watson off for LHS of opp alignment
        int64_t lr,       // rightmost Watson off for LHS of opp alignment
        int64_t rl,       // leftmost Watson off for RHS of opp alignment
        int64_t rr,       // rightmost Watson off for RHS of opp alignment
        size_t rdlen,     // length of opposite mate
        int64_t reflen,   // length of reference sequence aligned to
        size_t maxrdgap,  // max # of read gaps permitted in opp mate alignment
        size_t maxrfgap,  // max # of ref gaps permitted in opp mate alignment
        int64_t maxns,    // max # Ns permitted in alignment
        size_t maxhalf,   // max width in either direction
        DPRect& rect)     // out: DP rectangle
        const;

protected:

    /**
     * Compute how far the reference window [refl, refr] overhangs either end
     * of a reference of length 'reflen'.
     *
     * On return, 'trimup' holds the number of positions by which refl falls
     * before offset 0, and 'trimdn' holds the number of positions by which
     * refr extends past the last reference offset (reflen - 1).  Either is 0
     * when there is no overhang on that side.
     *
     * The window itself is not clamped: refl and refr are only read.  Nothing
     * is returned; callers that need to know whether any width remains must
     * compare the trims against the window width themselves.
     */
    void trimToRef(
        size_t reflen,    // in: length of reference sequence aligned to
        int64_t& refl,    // in: ref pos of upper LHS of parallelogram
        int64_t& refr,    // in: ref pos of lower RHS of parallelogram
        size_t& trimup,   // out: number of bases trimmed from upstream end
        size_t& trimdn)   // out: number of bases trimmed from downstream end
    {
        // Always assign the out parameters so a caller that passes
        // uninitialized variables never reads an indeterminate value.
        trimup = 0;
        trimdn = 0;
        if(refl < 0) {
            trimup = (size_t)(-refl);
        }
        if(refr >= (int64_t)reflen) {
            // Subtract in the signed domain to avoid a mixed
            // signed/unsigned expression; the value is unchanged.
            trimdn = (size_t)(refr - (int64_t)reflen + 1);
        }
    }

    bool trimToRef_;  // flag supplied to the constructor
};
#endif /*ndef DP_FRAMER_H_*/