hisat-3n/dp_framer.cpp
2025-01-18 21:09:52 +08:00

911 lines
36 KiB
C++

/*
* Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
*
* This file is part of Bowtie 2.
*
* Bowtie 2 is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Bowtie 2 is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
*/
#include "dp_framer.h"
using namespace std;
/**
* Set up variables that describe the shape of a dynamic programming matrix to
* be filled in. The matrix is built around the diagonal containing the seed
* hit: the "seed diagonal". The N diagonals to the right of the seed diagonal
* are the "RHS gap" diagonals, where N is the maximum number of read or
* reference gaps permitted (whichever is larger). The N diagonals to the left
* of the seed diagonal are the "LHS gap" diagonals.
*
* The way the rectangle is currently formulated, there are another N diagonals
* to the left of the "LHS gap" diagonals called the "LHS extra diagonals". It
* might also be possible to split the "extra diagonals" into two subsets and
* place them both to the left of the LHS gap diagonals and to the right of the
* RHS gap diagonals.
*
* The purpose of arranging and these groupings of diagonals is that a subset
* of them, the "core diagonals", can now be considered "covered." By
* "covered" I mean that any alignment that overlaps a cell in any of the core
* diagonals cannot possibly overlap another, higher-scoring alignment that
* falls partially outside the rectangle.
*
* Say the read is 5 characters long, the maximum number of read or ref gaps is
* 2, and the seed hit puts the main diagonal at offset 10 in the reference.
* The larger rectangle explored looks like this:
*
* off=10, maxgap=2
*
* Ref 1
* off: 67890123456 0: seed diagonal
* **OO0oo++---- o: "RHS gap" diagonals
* -**OO0oo++--- O: "LHS gap" diagonals
* --**OO0oo++-- *: "LHS extra" diagonals
* ---**OO0oo++- +: "RHS extra" diagonals
* ----**OO0oo++ -: cells that can't possibly be involved in a valid
* alignment that overlaps one of the core diagonals
*
* The "core diagonals" are marked with 0's, O's or o's.
*
* A caveat is that, for performance reasons, we place an upper limit on N -
* the maximum number of read or reference gaps. It is constrained to be no
* greater than 'maxgap'. This means that in some situations, we may report an
* alignment that spuriously trumps a better alignment that falls partially
* outside the rectangle. Also, we may fail to find a valid alignment with
* more than 'maxgap' gaps.
*
* Another issue is trimming: if the seed hit is sufficiently close to one or
* both ends of the reference sequence, and either (a) overhang is not
* permitted, or (b) the number of Ns permitted is less than the number of
* columns that overhang the reference, then we want to exclude the trimmed
* columns from the rectangle.
*
* We need to return enough information so that downstream routines can fully
* understand the shape of the rectangle, which diagonals are which (esp. which
* are the "core" diagonals, since we needn't examine any more seed hits from
* those columns in the future), and how the rectangle is trimmed. The
* information returned should be compatible with the sort of information
* returned by the routines that set up rectangles for mate finding.
*/
bool DynProgFramer::frameSeedExtensionRect(
int64_t off, // ref offset implied by seed hit assuming no gaps
size_t rdlen, // length of read sequence used in DP table (so len
// of +1 nucleotide sequence for colorspace reads)
int64_t reflen, // length of reference sequence aligned to
size_t maxrdgap, // max # of read gaps permitted in opp mate alignment
size_t maxrfgap, // max # of ref gaps permitted in opp mate alignment
int64_t maxns, // # Ns permitted
size_t maxhalf, // max width in either direction
DPRect& rect) // out: DP rectangle
{
assert_gt(rdlen, 0);
assert_gt(reflen, 0);
// Set N, the maximum number of reference or read gaps permitted, whichever
// is larger. Also, enforce ceiling: can't be larger than 'maxhalf'.
size_t maxgap = max(maxrdgap, maxrfgap);
maxgap = min(maxgap, maxhalf);
// Leave room for "LHS gap" and "LHS extra" diagonals
int64_t refl = off - 2 * maxgap; // inclusive
// Leave room for "RHS gap" and "RHS extra" diagonals
int64_t refr = off + (rdlen - 1) + 2 * maxgap; // inclusive
size_t triml = 0, trimr = 0;
// Check if we have to trim to fit the extents of the reference
if(trimToRef_) {
maxns = 0; // no leeway
} else if(maxns == (int64_t)rdlen) {
maxns--;
}
// Trim from RHS of rectangle
if(refr >= reflen + maxns) {
trimr = (size_t)(refr - (reflen + maxns - 1));
}
// Trim from LHS of rectangle
if(refl < -maxns) {
triml = (size_t)(-refl) - (size_t)maxns;
}
rect.refl_pretrim = refl;
rect.refr_pretrim = refr;
rect.refl = refl + triml;
rect.refr = refr - trimr;
rect.triml = triml;
rect.trimr = trimr;
rect.maxgap = maxgap;
// Remember which diagonals are "core" as offsets from the LHS of the
// untrimmed rectangle
rect.corel = maxgap;
rect.corer = rect.corel + 2 * maxgap; // inclusive
assert(rect.repOk());
return !rect.entirelyTrimmed();
}
/**
* Set up variables that describe the shape of a dynamic programming matrix to
* be filled in. The matrix is built around the diagonals that terminate in
* the range of columns where the RHS of the opposite mate must fall in order
* to satisfy the fragment-length constraint. These are the "mate" diagonals
* and they also happen to be the "core" diagonals in this case.
*
* The N diagonals to the right of the mate diagonals are the "RHS gap"
* diagonals, where N is the maximum number of read or reference gaps permitted
* (whichever is larger). The N diagonals to the left of the mate diagonals
* are the "LHS gap" diagonals.
*
* The purpose of arranging and these groupings of diagonals is that a subset
* of them, the "core diagonals", can now be considered "covered." By
* "covered" I mean that any alignment that overlaps a cell in any of the core
* diagonals cannot possibly overlap another, higher-scoring alignment that
* falls partially outside the rectangle.
*
* |Anchor|
* o---------OO0000000000000oo------ 0: mate diagonal (also core diags!)
* -o---------OO0000000000000oo----- o: "RHS gap" diagonals
* --o---------OO0000000000000oo---- O: "LHS gap" diagonals
* ---oo--------OO0000000000000oo--- *: "LHS extra" diagonals
* -----o--------OO0000000000000oo-- -: cells that can't possibly be
* ------o--------OO0000000000000oo- involved in a valid alignment that
* -------o--------OO0000000000000oo overlaps one of the core diagonals
* XXXXXXXXXXXXX
* | RHS Range |
* ^ ^
* rl rr
*
* The "core diagonals" are marked with 0s.
*
* A caveat is that, for performance reasons, we place an upper limit on N -
* the maximum number of read or reference gaps. It is constrained to be no
* greater than 'maxgap'. This means that in some situations, we may report an
* alignment that spuriously trumps a better alignment that falls partially
* outside the rectangle. Also, we may fail to find a valid alignment with
* more than 'maxgap' gaps.
*
* Another issue is trimming: if the seed hit is sufficiently close to one or
* both ends of the reference sequence, and either (a) overhang is not
* permitted, or (b) the number of Ns permitted is less than the number of
* columns that overhang the reference, then we want to exclude the trimmed
* columns from the rectangle.
*/
bool DynProgFramer::frameFindMateAnchorLeftRect(
int64_t ll, // leftmost Watson off for LHS of opp alignment
int64_t lr, // rightmost Watson off for LHS of opp alignment
int64_t rl, // leftmost Watson off for RHS of opp alignment
int64_t rr, // rightmost Watson off for RHS of opp alignment
size_t rdlen, // length of opposite mate
int64_t reflen, // length of reference sequence aligned to
size_t maxrdgap, // max # of read gaps permitted in opp mate alignment
size_t maxrfgap, // max # of ref gaps permitted in opp mate alignment
int64_t maxns, // max # ns permitted in the alignment
size_t maxhalf, // max width in either direction
DPRect& rect) // out: DP rectangle
const
{
assert_geq(lr, ll); // LHS rightmost must be >= LHS leftmost
assert_geq(rr, rl); // RHS rightmost must be >= RHS leftmost
assert_geq(rr, lr); // RHS rightmost must be >= LHS rightmost
assert_geq(rl, ll); // RHS leftmost must be >= LHS leftmost
assert_gt(rdlen, 0);
assert_gt(reflen, 0);
size_t triml = 0, trimr = 0;
size_t maxgap = max(maxrdgap, maxrfgap);
maxgap = max(maxgap, maxhalf);
// Amount of padding we have to add to account for the fact that alignments
// ending between en_left/en_right might start in various columns in the
// first row
int64_t pad_left = maxgap;
int64_t pad_right = maxgap;
int64_t en_left = rl;
int64_t en_right = rr;
int64_t st_left = en_left - (rdlen-1);
ASSERT_ONLY(int64_t st_right = en_right - (rdlen-1));
int64_t en_right_pad = en_right + pad_right;
ASSERT_ONLY(int64_t en_left_pad = en_left - pad_left);
ASSERT_ONLY(int64_t st_right_pad = st_right + pad_right);
int64_t st_left_pad = st_left - pad_left;
assert_leq(st_left, en_left);
assert_geq(en_right, st_right);
assert_leq(st_left_pad, en_left_pad);
assert_geq(en_right_pad, st_right_pad);
int64_t refl = st_left_pad;
int64_t refr = en_right_pad;
if(trimToRef_) {
maxns = 0;
} else if(maxns == (int64_t)rdlen) {
maxns--;
}
// Trim from the RHS of the rectangle?
if(refr >= reflen + maxns) {
trimr = (size_t)(refr - (reflen + maxns - 1));
}
// Trim from the LHS of the rectangle?
if(refl < -maxns) {
triml = (size_t)(-refl) - (size_t)maxns;
}
size_t width = (size_t)(refr - refl + 1);
rect.refl_pretrim = refl;
rect.refr_pretrim = refr;
rect.refl = refl + triml;
rect.refr = refr - trimr;
rect.triml = triml;
rect.trimr = trimr;
rect.maxgap = maxgap;
rect.corel = maxgap;
rect.corer = width - maxgap - 1; // inclusive
assert(rect.repOk());
return !rect.entirelyTrimmed();
}
/**
* Set up variables that describe the shape of a dynamic programming matrix to
* be filled in. The matrix is built around the diagonals that begin in the
* range of columns where the LHS of the opposite mate must fall in order to
* satisfy the fragment-length constraint. These are the "mate" diagonals and
* they also happen to be the "core" diagonals in this case.
*
* The N diagonals to the right of the mate diagonals are the "RHS gap"
* diagonals, where N is the maximum number of read or reference gaps permitted
* (whichever is larger). The N diagonals to the left of the mate diagonals
* are the "LHS gap" diagonals.
*
* The purpose of arranging and these groupings of diagonals is that a subset
* of them, the "core diagonals", can now be considered "covered." By
* "covered" I mean that any alignment that overlaps a cell in any of the core
* diagonals cannot possibly overlap another, higher-scoring alignment that
* falls partially outside the rectangle.
*
* ll lr
* v v
* | LHS Range |
* XXXXXXXXXXXXX |Anchor|
* OO0000000000000oo--------o-------- 0: mate diagonal (also core diags!)
* -OO0000000000000oo--------o------- o: "RHS gap" diagonals
* --OO0000000000000oo--------o------ O: "LHS gap" diagonals
* ---OO0000000000000oo--------oo---- *: "LHS extra" diagonals
* ----OO0000000000000oo---------o--- -: cells that can't possibly be
* -----OO0000000000000oo---------o-- involved in a valid alignment that
* ------OO0000000000000oo---------o- overlaps one of the core diagonals
*
* The "core diagonals" are marked with 0s.
*
* A caveat is that, for performance reasons, we place an upper limit on N -
* the maximum number of read or reference gaps. It is constrained to be no
* greater than 'maxgap'. This means that in some situations, we may report an
* alignment that spuriously trumps a better alignment that falls partially
* outside the rectangle. Also, we may fail to find a valid alignment with
* more than 'maxgap' gaps.
*
* Another issue is trimming: if the seed hit is sufficiently close to one or
* both ends of the reference sequence, and either (a) overhang is not
* permitted, or (b) the number of Ns permitted is less than the number of
* columns that overhang the reference, then we want to exclude the trimmed
* columns from the rectangle.
*/
bool DynProgFramer::frameFindMateAnchorRightRect(
int64_t ll, // leftmost Watson off for LHS of opp alignment
int64_t lr, // rightmost Watson off for LHS of opp alignment
int64_t rl, // leftmost Watson off for RHS of opp alignment
int64_t rr, // rightmost Watson off for RHS of opp alignment
size_t rdlen, // length of opposite mate
int64_t reflen, // length of reference sequence aligned to
size_t maxrdgap, // max # of read gaps permitted in opp mate alignment
size_t maxrfgap, // max # of ref gaps permitted in opp mate alignment
int64_t maxns, // max # ns permitted in the alignment
size_t maxhalf, // max width in either direction
DPRect& rect) // out: DP rectangle
const
{
assert_geq(lr, ll);
assert_geq(rr, rl);
assert_geq(rr, lr);
assert_geq(rl, ll);
assert_gt(rdlen, 0);
assert_gt(reflen, 0);
size_t triml = 0, trimr = 0;
size_t maxgap = max(maxrdgap, maxrfgap);
maxgap = max(maxgap, maxhalf);
int64_t pad_left = maxgap;
int64_t pad_right = maxgap;
int64_t st_left = ll;
int64_t st_right = lr;
ASSERT_ONLY(int64_t en_left = st_left + (rdlen-1));
int64_t en_right = st_right + (rdlen-1);
int64_t en_right_pad = en_right + pad_right;
ASSERT_ONLY(int64_t en_left_pad = en_left - pad_left);
ASSERT_ONLY(int64_t st_right_pad = st_right + pad_right);
int64_t st_left_pad = st_left - pad_left;
assert_leq(st_left, en_left);
assert_geq(en_right, st_right);
assert_leq(st_left_pad, en_left_pad);
assert_geq(en_right_pad, st_right_pad);
// We have enough info to deduce where the boundaries of our rectangle
// should be. Finalize the boundaries, ignoring reference trimming for now
int64_t refl = st_left_pad;
int64_t refr = en_right_pad;
if(trimToRef_) {
maxns = 0;
} else if(maxns == (int64_t)rdlen) {
maxns--;
}
// Trim from the RHS of the rectangle?
if(refr >= reflen + maxns) {
trimr = (size_t)(refr - (reflen + maxns - 1));
}
// Trim from the LHS of the rectangle?
if(refl < -maxns) {
triml = (size_t)(-refl) - (size_t)maxns;
}
size_t width = (size_t)(refr - refl + 1);
rect.refl_pretrim = refl;
rect.refr_pretrim = refr;
rect.refl = refl + triml;
rect.refr = refr - trimr;
rect.triml = triml;
rect.trimr = trimr;
rect.maxgap = maxgap;
rect.corel = maxgap;
rect.corer = width - maxgap - 1; // inclusive
assert(rect.repOk());
return !rect.entirelyTrimmed();
}
#ifdef MAIN_DP_FRAMER
#include <iostream>
static void testCaseFindMateAnchorLeft(
const char *testName,
bool trimToRef,
int64_t ll,
int64_t lr,
int64_t rl,
int64_t rr,
size_t rdlen,
size_t reflen,
size_t maxrdgap,
size_t maxrfgap,
size_t ex_width,
size_t ex_solwidth,
size_t ex_trimup,
size_t ex_trimdn,
int64_t ex_refl,
int64_t ex_refr,
const char *ex_st, // string of '0'/'1' chars
const char *ex_en) // string of '0'/'1' chars
{
cerr << testName << "...";
DynProgFramer fr(trimToRef);
size_t width, solwidth;
int64_t refl, refr;
EList<bool> st, en;
size_t trimup, trimdn;
size_t maxhalf = 500;
size_t maxgaps = 0;
fr.frameFindMateAnchorLeft(
ll, // leftmost Watson off for LHS of opp alignment
lr, // rightmost Watson off for LHS of opp alignment
rl, // leftmost Watson off for RHS of opp alignment
rr, // rightmost Watson off for RHS of opp alignment
rdlen, // length of opposite mate
reflen, // length of reference sequence aligned to
maxrdgap, // max # of read gaps permitted in opp mate alignment
maxrfgap, // max # of ref gaps permitted in opp mate alignment
maxns, // max # Ns permitted
maxhalf, // max width in either direction
width, // out: calculated width stored here
maxgaps, // out: max # gaps
trimup, // out: number of bases trimmed from upstream end
trimdn, // out: number of bases trimmed from downstream end
refl, // out: ref pos of upper LHS of parallelogram
refr, // out: ref pos of lower RHS of parallelogram
st, // out: legal starting columns stored here
en); // out: legal ending columns stored here
assert_eq(ex_width, width);
assert_eq(ex_solwidth, solwidth);
assert_eq(ex_trimup, trimup);
assert_eq(ex_trimdn, trimdn);
assert_eq(ex_refl, refl);
assert_eq(ex_refr, refr);
for(size_t i = 0; i < width; i++) {
assert_eq((ex_st[i] == '1'), st[i]);
assert_eq((ex_en[i] == '1'), en[i]);
}
cerr << "PASSED" << endl;
}
static void testCaseFindMateAnchorRight(
const char *testName,
bool trimToRef,
int64_t ll,
int64_t lr,
int64_t rl,
int64_t rr,
size_t rdlen,
size_t reflen,
size_t maxrdgap,
size_t maxrfgap,
size_t ex_width,
size_t ex_solwidth,
size_t ex_trimup,
size_t ex_trimdn,
int64_t ex_refl,
int64_t ex_refr,
const char *ex_st, // string of '0'/'1' chars
const char *ex_en) // string of '0'/'1' chars
{
cerr << testName << "...";
DynProgFramer fr(trimToRef);
size_t width, solwidth;
size_t maxgaps;
int64_t refl, refr;
EList<bool> st, en;
size_t trimup, trimdn;
size_t maxhalf = 500;
fr.frameFindMateAnchorRight(
ll, // leftmost Watson off for LHS of opp alignment
lr, // rightmost Watson off for LHS of opp alignment
rl, // leftmost Watson off for RHS of opp alignment
rr, // rightmost Watson off for RHS of opp alignment
rdlen, // length of opposite mate
reflen, // length of reference sequence aligned to
maxrdgap, // max # of read gaps permitted in opp mate alignment
maxrfgap, // max # of ref gaps permitted in opp mate alignment
maxns, // max # Ns permitted
maxhalf, // max width in either direction
width, // out: calculated width stored here
maxgaps, // out: calcualted max # gaps
trimup, // out: number of bases trimmed from upstream end
trimdn, // out: number of bases trimmed from downstream end
refl, // out: ref pos of upper LHS of parallelogram
refr, // out: ref pos of lower RHS of parallelogram
st, // out: legal starting columns stored here
en); // out: legal ending columns stored here
assert_eq(ex_width, width);
assert_eq(ex_trimup, trimup);
assert_eq(ex_trimdn, trimdn);
assert_eq(ex_refl, refl);
assert_eq(ex_refr, refr);
for(size_t i = 0; i < width; i++) {
assert_eq((ex_st[i] == '1'), st[i]);
assert_eq((ex_en[i] == '1'), en[i]);
}
cerr << "PASSED" << endl;
}
int main(void) {
///////////////////////////
//
// ANCHOR ON THE LEFT
//
///////////////////////////
// -------------
// o o
// o o
// o o
// o o
// <<<------->>>
// 012345678901234567890
// 0 1 2
testCaseFindMateAnchorLeft(
"FindMateAnchorLeft1",
false, // trim to reference
3, // left offset of upper parallelogram extent
15, // right offset of upper parallelogram extent
10, // left offset of lower parallelogram extent
16, // right offset of lower parallelogram extent
5, // length of opposite mate
30, // length of reference sequence aligned to
3, // max # of read gaps permitted in opp mate alignment
3, // max # of ref gaps permitted in opp mate alignment
13, // expected width
0, // expected # bases trimmed from upstream end
0, // expected # bases trimmed from downstream end
3, // ref offset of upstream column
19, // ref offset of downstream column
"1111111111111", // expected starting bools
"0001111111000"); // expected ending bools
// *******
// <<===-----
// o o
// o o
// o o
// o o
// <<=----->>
// *******
// 012345678901234567890
// 0 1 2
testCaseFindMateAnchorLeft(
"FindMateAnchorLeft2",
false, // trim to reference
9, // left offset of left upper parallelogram extent
14, // right offset of left upper parallelogram extent
10, // left offset of left lower parallelogram extent
15, // right offset of left lower parallelogram extent
5, // length of opposite mate
30, // length of reference sequence aligned to
2, // max # of read gaps permitted in opp mate alignment
2, // max # of ref gaps permitted in opp mate alignment
7, // expected width
3, // expected # bases trimmed from upstream end
0, // expected # bases trimmed from downstream end
7, // ref offset of upstream column
17, // ref offset of downstream column
"0011111", // expected starting bools
"1111100"); // expected ending bools
// *******
// <<===--->>
// o o
// o o
// o o
// o o
// o o
// <<=----->>
// *******
// 01234567890123456xxxx
// 0 1 2
testCaseFindMateAnchorLeft(
"FindMateAnchorLeft3",
true, // trim to reference
9, // left offset of left upper parallelogram extent
14, // right offset of left upper parallelogram extent
10, // left offset of left lower parallelogram extent
15, // right offset of left lower parallelogram extent
5, // length of opposite mate
17, // length of reference sequence aligned to
2, // max # of read gaps permitted in opp mate alignment
2, // max # of ref gaps permitted in opp mate alignment
7, // expected width
3, // expected # bases trimmed from upstream end
0, // expected # bases trimmed from downstream end
7, // ref offset of upstream column
17, // ref offset of downstream column
"0011111", // expected starting bools
"1111100"); // expected ending bools
// ******
// <<===-----
// o o
// o o
// o o
// o o
// <<=----=>>
// ******
// 012345678901234xxxxxx
// 0 1 2
testCaseFindMateAnchorLeft(
"FindMateAnchorLeft4",
true, // trim to reference
9, // left offset of left upper parallelogram extent
14, // right offset of left upper parallelogram extent
10, // left offset of left lower parallelogram extent
15, // right offset of left lower parallelogram extent
5, // length of opposite mate
15, // length of reference sequence aligned to
2, // max # of read gaps permitted in opp mate alignment
2, // max # of ref gaps permitted in opp mate alignment
6, // expected width
3, // expected # bases trimmed from upstream end
1, // expected # bases trimmed from downstream end
7, // ref offset of upstream column
16, // ref offset of downstream column
"001111", // expected starting bools
"111100"); // expected ending bools
// -1 0 2
// xxxxxxxxxx012345678xx
//
// *******
// <<===-----
// o o
// o o
// o o
// o o
// o o
// <<=----->>
// *******
//
// xxxxxxxxxx012345678xx
// -1 0 2
testCaseFindMateAnchorLeft(
"FindMateAnchorLeft5",
true, // trim to reference
1, // left offset of left upper parallelogram extent
7, // right offset of left upper parallelogram extent
2, // left offset of left lower parallelogram extent
7, // right offset of left lower parallelogram extent
5, // length of opposite mate
9, // length of reference sequence aligned to
2, // max # of read gaps permitted in opp mate alignment
2, // max # of ref gaps permitted in opp mate alignment
7, // expected width
3, // expected # bases trimmed from upstream end
0, // expected # bases trimmed from downstream end
-1, // ref offset of upstream column
9, // ref offset of downstream column
"0011111", // expected starting bools
"1111100"); // expected ending bools
// <<<<==-===>>
// o o
// o o
// o o
// o o
// <<<<------>>
// ******
// 012345678901234567890
// 0 1 2
testCaseFindMateAnchorLeft(
"FindMateAnchorLeft6",
false, // trim to reference
8, // left offset of left upper parallelogram extent
8, // right offset of left upper parallelogram extent
10, // left offset of left lower parallelogram extent
15, // right offset of left lower parallelogram extent
5, // length of opposite mate
30, // length of reference sequence aligned to
4, // max # of read gaps permitted in opp mate alignment
2, // max # of ref gaps permitted in opp mate alignment
6, // expected width
4, // expected # bases trimmed from upstream end
2, // expected # bases trimmed from downstream end
6, // ref offset of upstream column
15, // ref offset of downstream column
"001000", // expected starting bools
"111111"); // expected ending bools
///////////////////////////
//
// ANCHOR ON THE RIGHT
//
///////////////////////////
// <<<------->>>
// o o
// o o
// o o
// o o
// <<<------->>>
// 012345678901234567890123456789
// 0 1 2
testCaseFindMateAnchorRight(
"FindMateAnchorRight1",
false, // trim to reference
10, // left offset of left upper parallelogram extent
16, // right offset of left upper parallelogram extent
11, // left offset of left lower parallelogram extent
23, // right offset of left lower parallelogram extent
5, // length of opposite mate
30, // length of reference sequence aligned to
3, // max # of read gaps permitted in opp mate alignment
3, // max # of ref gaps permitted in opp mate alignment
13, // expected width
0, // expected # bases trimmed from upstream end
0, // expected # bases trimmed from downstream end
7, // ref offset of upstream column
23, // ref offset of downstream column
"0001111111000", // expected starting bools
"1111111111111"); // expected ending bools
// 0 1 2
// 012345678901234567890
// *******
// <<------>>
// o o
// o o
// o o
// o o
// <<===--->>
// *******
// 012345678901234567890
// 0 1 2
testCaseFindMateAnchorRight(
"FindMateAnchorRight2",
false, // trim to reference
6, // left offset of left upper parallelogram extent
11, // right offset of left upper parallelogram extent
13, // left offset of left lower parallelogram extent
18, // right offset of left lower parallelogram extent
5, // length of opposite mate
30, // length of reference sequence aligned to
2, // max # of read gaps permitted in opp mate alignment
2, // max # of ref gaps permitted in opp mate alignment
7, // expected width
3, // expected # bases trimmed from upstream end
0, // expected # bases trimmed from downstream end
7, // ref offset of upstream column
17, // ref offset of downstream column
"1111100", // expected starting bools
"0011111"); // expected ending bools
// Reference trimming takes off the left_pad of the left mate
//
// *******
// <<------>>
// o o
// o o
// o o
// o o
// o o
// <<===--->>
// *******
// 0123456789012345678901234567890
// -1 0 1 2
testCaseFindMateAnchorRight(
"FindMateAnchorRight3",
true, // trim to reference
0, // left offset of left upper parallelogram extent
5, // right offset of left upper parallelogram extent
7, // left offset of left lower parallelogram extent
11, // right offset of left lower parallelogram extent
5, // length of opposite mate
30, // length of reference sequence aligned to
2, // max # of read gaps permitted in opp mate alignment
2, // max # of ref gaps permitted in opp mate alignment
7, // expected width
3, // expected # bases trimmed from upstream end
0, // expected # bases trimmed from downstream end
1, // ref offset of upstream column
11, // ref offset of downstream column
"1111100", // expected starting bools
"0011111"); // expected ending bools
// Reference trimming takes off the leftmost 5 positions of the left mate,
// and takes 1 from the right mate
//
// *****
// <<------>>
// o o
// o o
// o o
// o o
// o o
// <<===--->>
// *****
// 0987654321012345678901234567890
// -1 0 1 2
testCaseFindMateAnchorRight(
"FindMateAnchorRight4",
true, // trim to reference
-3, // left offset of left upper parallelogram extent
2, // right offset of left upper parallelogram extent
4, // left offset of left lower parallelogram extent
10, // right offset of left lower parallelogram extent
5, // length of opposite mate
30, // length of reference sequence aligned to
2, // max # of read gaps permitted in opp mate alignment
2, // max # of ref gaps permitted in opp mate alignment
5, // expected width
5, // expected # bases trimmed from upstream end
0, // expected # bases trimmed from downstream end
0, // ref offset of upstream column
8, // ref offset of downstream column
"11100", // expected starting bools
"11111"); // expected ending bools
// Reference trimming takes off the leftmost 5 positions of the left mate,
// and takes 1 from the left of the right mate. Also, it takes 2 from the
// right of the right mate.
//
// ***
// <<------>>
// o o
// o o
// o o
// o o
// o o
// <<===--->>
// ***
// 0987654321012345678901234567890
// -1 0 1 2
testCaseFindMateAnchorRight(
"FindMateAnchorRight5",
true, // trim to reference
-3, // left offset of left upper parallelogram extent
2, // right offset of left upper parallelogram extent
4, // left offset of left lower parallelogram extent
10, // right offset of left lower parallelogram extent
5, // length of opposite mate
7, // length of reference sequence aligned to
2, // max # of read gaps permitted in opp mate alignment
2, // max # of ref gaps permitted in opp mate alignment
3, // expected width
5, // expected # bases trimmed from upstream end
2, // expected # bases trimmed from downstream end
0, // ref offset of upstream column
6, // ref offset of downstream column
"111", // expected starting bools
"111"); // expected ending bools
// ******
// <<------>>>>
// o o
// o o
// o o
// o o
// <<====-=>>>>
// ******
// 012345678901234567890
// 0 1 2
testCaseFindMateAnchorRight(
"FindMateAnchorRight6",
false, // trim to reference
6, // left offset of left upper parallelogram extent
11, // right offset of left upper parallelogram extent
14, // left offset of left lower parallelogram extent
14, // right offset of left lower parallelogram extent
5, // length of opposite mate
30, // length of reference sequence aligned to
4, // max # of read gaps permitted in opp mate alignment
2, // max # of ref gaps permitted in opp mate alignment
6, // expected width
2, // expected # bases trimmed from upstream end
4, // expected # bases trimmed from downstream end
6, // ref offset of upstream column
15, // ref offset of downstream column
"111111", // expected starting bools
"000010"); // expected ending bools
// ****
// <<<<==---->>
// o o
// o o
// o o
// o o
// o o
// <<<<====-=>>
// ****
// 012345678901234567890
// 0 1 2
testCaseFindMateAnchorRight(
"FindMateAnchorRight7",
false, // trim to reference
6, // left offset of left upper parallelogram extent
11, // right offset of left upper parallelogram extent
14, // left offset of left lower parallelogram extent
14, // right offset of left lower parallelogram extent
5, // length of opposite mate
30, // length of reference sequence aligned to
2, // max # of read gaps permitted in opp mate alignment
4, // max # of ref gaps permitted in opp mate alignment
4, // expected width
6, // expected # bases trimmed from upstream end
2, // expected # bases trimmed from downstream end
8, // ref offset of upstream column
15, // ref offset of downstream column
"1111", // expected starting bools
"0010"); // expected ending bools
testCaseFindMateAnchorRight(
"FindMateAnchorRight8",
true, // trim to reference
-37, // left offset of left upper parallelogram extent
13, // right offset of left upper parallelogram extent
-37, // left offset of left lower parallelogram extent
52, // right offset of left lower parallelogram extent
10, // length of opposite mate
53, // length of reference sequence aligned to
0, // max # of read gaps permitted in opp mate alignment
0, // max # of ref gaps permitted in opp mate alignment
14, // expected width
37, // expected # bases trimmed from upstream end
0, // expected # bases trimmed from downstream end
0, // ref offset of upstream column
22, // ref offset of downstream column
"11111111111111", // expected starting bools
"11111111111111");// expected ending bools
}
#endif /*def MAIN_DP_FRAMER*/