/*
 * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
 *
 * This file is part of Bowtie 2.
 *
 * Bowtie 2 is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Bowtie 2 is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
 */

#include "aligner_bt.h"
#include "mask.h"

using namespace std;

// Mark (rowc, colc) as visited for this backtrace.  If the cell was already
// visited by a previous backtrace, set 'abort' and bail out of the enclosing
// function: re-walking a cell would re-derive an alignment we already have.
// Relies on 'abort', 'sawcell_', 'local', 'prob_' and 'hefc' being in scope at
// the expansion site.
#define CHECK_ROW_COL(rowc, colc) \
	if(rowc >= 0 && colc >= 0) { \
		if(!sawcell_[colc].insert(rowc)) { \
			/* was already in there */ \
			abort = true; \
			return; \
		} \
		assert(local || prob_.cper_->debugCell(rowc, colc, hefc)); \
	}

/**
 * Fill in a triangle of the DP table and backtrace from the given cell to
 * a cell in the previous checkpoint, or to the terminal cell.
 *
 * The checkpointer stores every per()-th anti-diagonal of the full dynamic
 * programming matrix.  To backtrace between two checkpointed anti-diagonals
 * we re-fill just the triangle of cells reachable between them (one diagonal
 * per iteration, shrinking in breadth), then walk backwards through that
 * triangle using the move masks recorded in each cell.
 */
void BtBranchTracer::triangleFill(
	int64_t rw,          // row of cell to backtrace from
	int64_t cl,          // column of cell to backtrace from
	int hef,             // cell to backtrace from is H (0), E (1), or F (2)
	TAlScore targ,       // score of cell to backtrace from
	TAlScore targ_final, // score of alignment we're looking for
	RandomSource& rnd,   // pseudo-random generator
	int64_t& row_new,    // out: row we ended up in after backtrace
	int64_t& col_new,    // out: column we ended up in after backtrace
	int& hef_new,        // out: H/E/F after backtrace
	TAlScore& targ_new,  // out: score up to cell we ended up in
	bool& done,          // out: finished tracing out an alignment?
	bool& abort)         // out: aborted b/c cell was seen before?
{
	assert_geq(rw, 0);
	assert_geq(cl, 0);
	assert_range(0, 2, hef);
	assert_lt(rw, (int64_t)prob_.qrylen_);
	assert_lt(cl, (int64_t)prob_.reflen_);
	assert(prob_.usecp_ && prob_.fill_);
	int64_t row = rw, col = cl;
	const int64_t colmin = 0;
	const int64_t rowmin = 0;
	const int64_t colmax = prob_.reflen_ - 1;
	const int64_t rowmax = prob_.qrylen_ - 1;
	assert_leq(prob_.reflen_, (TRefOff)sawcell_.size());
	assert_leq(col, (int64_t)prob_.cper_->hicol());
	assert_geq(col, (int64_t)prob_.cper_->locol());
	assert_geq(prob_.cper_->per(), 2);
	// 'mod' = distance (in anti-diagonals) from the previous checkpointed
	// anti-diagonal; per() is a power of 2 so the mask works as a modulus.
	size_t mod = (row + col) & prob_.cper_->lomask();
	assert_lt(mod, prob_.cper_->per());
	// Allocate room for diags
	size_t depth = mod+1;
	assert_leq(depth, prob_.cper_->per());
	size_t breadth = depth;
	tri_.resize(depth);
	// Allocate room for each diag; diagonals shrink by one cell each step
	// toward the target cell.
	for(size_t i = 0; i < depth; i++) {
		tri_[i].resize(breadth - i);
	}
	bool upperleft = false;
	size_t off = (row + col) >> prob_.cper_->perpow2();
	if(off == 0) {
		// Triangle abuts the upper-left corner: there is no previous
		// checkpoint to seed from.
		upperleft = true;
	} else {
		off--;
	}
	const TAlScore sc_rdo = prob_.sc_->readGapOpen();
	const TAlScore sc_rde = prob_.sc_->readGapExtend();
	const TAlScore sc_rfo = prob_.sc_->refGapOpen();
	const TAlScore sc_rfe = prob_.sc_->refGapExtend();
	const bool local = !prob_.sc_->monotone;
	int64_t row_lo = row - (int64_t)mod;
	const CpQuad *prev2 = NULL, *prev1 = NULL;
	if(!upperleft) {
		// Read-only pointer to cells in diagonal -2.  Start one row above the
		// target row.
		prev2 = prob_.cper_->qdiag1sPtr() + (off * prob_.cper_->nrow() + row_lo - 1);
		// Read-only pointer to cells in diagonal -1.  Start one row above the
		// target row.
		prev1 = prob_.cper_->qdiag2sPtr() + (off * prob_.cper_->nrow() + row_lo - 1);
#ifndef NDEBUG
		if(row >= (int64_t)mod) {
			size_t rowc = row - mod, colc = col;
			if(rowc > 0 && prob_.cper_->isCheckpointed(rowc-1, colc)) {
				TAlScore al = prev1[0].sc[0];
				if(al == MIN_I16) al = MIN_I64;
				assert_eq(prob_.cper_->scoreTriangle(rowc-1, colc, 0), al);
			}
			if(rowc > 0 && colc > 0 && prob_.cper_->isCheckpointed(rowc-1, colc-1)) {
				TAlScore al = prev2[0].sc[0];
				if(al == MIN_I16) al = MIN_I64;
				assert_eq(prob_.cper_->scoreTriangle(rowc-1, colc-1, 0), al);
			}
		}
#endif
	}
	// Pointer to cells in current diagonal
	// For each diagonal we need to fill in
	for(size_t i = 0; i < depth; i++) {
		CpQuad * cur = tri_[i].ptr();
		CpQuad * curc = cur;
		size_t doff = mod - i; // # diagonals we are away from target diag
		//assert_geq(row, (int64_t)doff);
		int64_t rowc = row - doff;
		int64_t colc = col;
		size_t neval = 0; // # cells evaluated in this diag
		ASSERT_ONLY(const CpQuad *last = NULL);
		// Fill this diagonal from upper right to lower left
		for(size_t j = 0; j < breadth; j++) {
			if(rowc >= rowmin && rowc <= rowmax &&
			   colc >= colmin && colc <= colmax)
			{
				neval++;
				int64_t fromend = prob_.qrylen_ - rowc - 1;
				bool allowGaps = fromend >= prob_.sc_->gapbar && rowc >= prob_.sc_->gapbar;
				// Fill this cell
				// Some things we might want to calculate about this cell up front:
				// 1. How many matches are possible from this cell to the cell in
				//    row, col, in case this allows us to prune
				// Get character from read
				int qc = prob_.qry_[rowc];
				// Get quality value from read
				int qq = prob_.qual_[rowc];
				assert_geq(qq, 33);
				// Get character from reference
				int rc = prob_.ref_[colc];
				assert_range(0, 16, rc);
				int16_t sc_diag = prob_.sc_->score(qc, rc, qq - 33);
				int16_t sc_h_up = MIN_I16;
				int16_t sc_f_up = MIN_I16;
				int16_t sc_h_lf = MIN_I16;
				int16_t sc_e_lf = MIN_I16;
				if(allowGaps) {
					if(rowc > 0) {
						// Candidates coming from the cell above: open a ref
						// gap from H, or extend one from F.
						assert(local || prev1[j+0].sc[2] < 0);
						if(prev1[j+0].sc[0] > MIN_I16) {
							sc_h_up = prev1[j+0].sc[0] - sc_rfo;
							if(local) sc_h_up = max(sc_h_up, 0);
						}
						if(prev1[j+0].sc[2] > MIN_I16) {
							sc_f_up = prev1[j+0].sc[2] - sc_rfe;
							if(local) sc_f_up = max(sc_f_up, 0);
						}
#ifndef NDEBUG
						TAlScore hup = prev1[j+0].sc[0];
						TAlScore fup = prev1[j+0].sc[2];
						if(hup == MIN_I16) hup = MIN_I64;
						if(fup == MIN_I16) fup = MIN_I64;
						if(local) {
							hup = max<TAlScore>(hup, 0);
							fup = max<TAlScore>(fup, 0);
						}
						if(prob_.cper_->isCheckpointed(rowc-1, colc)) {
							assert_eq(hup, prob_.cper_->scoreTriangle(rowc-1, colc, 0));
							assert_eq(fup, prob_.cper_->scoreTriangle(rowc-1, colc, 2));
						}
#endif
					}
					if(colc > 0) {
						// Candidates coming from the cell to the left: open a
						// read gap from H, or extend one from E.
						assert(local || prev1[j+1].sc[1] < 0);
						if(prev1[j+1].sc[0] > MIN_I16) {
							sc_h_lf = prev1[j+1].sc[0] - sc_rdo;
							if(local) sc_h_lf = max(sc_h_lf, 0);
						}
						if(prev1[j+1].sc[1] > MIN_I16) {
							sc_e_lf = prev1[j+1].sc[1] - sc_rde;
							if(local) sc_e_lf = max(sc_e_lf, 0);
						}
#ifndef NDEBUG
						TAlScore hlf = prev1[j+1].sc[0];
						TAlScore elf = prev1[j+1].sc[1];
						if(hlf == MIN_I16) hlf = MIN_I64;
						if(elf == MIN_I16) elf = MIN_I64;
						if(local) {
							hlf = max<TAlScore>(hlf, 0);
							elf = max<TAlScore>(elf, 0);
						}
						if(prob_.cper_->isCheckpointed(rowc, colc-1)) {
							assert_eq(hlf, prob_.cper_->scoreTriangle(rowc, colc-1, 0));
							assert_eq(elf, prob_.cper_->scoreTriangle(rowc, colc-1, 1));
						}
#endif
					}
				}
				assert(rowc <= 1 || colc <= 0 || prev2 != NULL);
				// Diagonal (match/mismatch) candidate from the cell up-left.
				int16_t sc_h_dg = ((rowc > 0 && colc > 0) ? prev2[j+0].sc[0] : 0);
				if(colc == 0 && rowc > 0 && !local) {
					sc_h_dg = MIN_I16;
				}
				if(sc_h_dg > MIN_I16) {
					sc_h_dg += sc_diag;
				}
				if(local) sc_h_dg = max(sc_h_dg, 0);
				// cerr << sc_diag << " " << sc_h_dg << " " << sc_h_up << " " << sc_f_up << " " << sc_h_lf << " " << sc_e_lf << endl;
				int mask = 0;
				// Calculate best ways into H, E, F cells starting with H.
				// Mask bits:
				// H: 1=diag, 2=hhoriz, 4=ehoriz, 8=hvert, 16=fvert
				// E: 32=hhoriz, 64=ehoriz
				// F: 128=hvert, 256=fvert
				// NOTE(review): the operands below are int16_t, so comparisons
				// against MIN_I64 are always true; presumably MIN_I16 was
				// intended — confirm against upstream before changing, since
				// it affects which mask bits are recorded.
				int16_t sc_best = sc_h_dg;
				if(sc_h_dg > MIN_I64) {
					mask = 1;
				}
				if(colc > 0 && sc_h_lf >= sc_best && sc_h_lf > MIN_I64) {
					if(sc_h_lf > sc_best) mask = 0;
					mask |= 2;
					sc_best = sc_h_lf;
				}
				if(colc > 0 && sc_e_lf >= sc_best && sc_e_lf > MIN_I64) {
					if(sc_e_lf > sc_best) mask = 0;
					mask |= 4;
					sc_best = sc_e_lf;
				}
				if(rowc > 0 && sc_h_up >= sc_best && sc_h_up > MIN_I64) {
					if(sc_h_up > sc_best) mask = 0;
					mask |= 8;
					sc_best = sc_h_up;
				}
				if(rowc > 0 && sc_f_up >= sc_best && sc_f_up > MIN_I64) {
					if(sc_f_up > sc_best) mask = 0;
					mask |= 16;
					sc_best = sc_f_up;
				}
				// Calculate best way into E cell
				int16_t sc_e_best = sc_h_lf;
				if(colc > 0) {
					if(sc_h_lf >= sc_e_lf && sc_h_lf > MIN_I64) {
						if(sc_h_lf == sc_e_lf) {
							mask |= 64;
						}
						mask |= 32;
					} else if(sc_e_lf > MIN_I64) {
						sc_e_best = sc_e_lf;
						mask |= 64;
					}
				}
				if(sc_e_best > sc_best) {
					sc_best = sc_e_best;
					mask &= ~31; // don't go diagonal
				}
				// Calculate best way into F cell
				int16_t sc_f_best = sc_h_up;
				if(rowc > 0) {
					if(sc_h_up >= sc_f_up && sc_h_up > MIN_I64) {
						if(sc_h_up == sc_f_up) {
							mask |= 256;
						}
						mask |= 128;
					} else if(sc_f_up > MIN_I64) {
						sc_f_best = sc_f_up;
						mask |= 256;
					}
				}
				if(sc_f_best > sc_best) {
					sc_best = sc_f_best;
					mask &= ~127; // don't go horizontal or diagonal
				}
				// Install results in cur
				assert(!prob_.sc_->monotone || sc_best <= 0);
				assert(!prob_.sc_->monotone || sc_e_best <= 0);
				assert(!prob_.sc_->monotone || sc_f_best <= 0);
				curc->sc[0] = sc_best;
				assert( local || sc_e_best < 0);
				assert( local || sc_f_best < 0);
				assert(!local || sc_e_best >= 0 || sc_e_best == MIN_I16);
				assert(!local || sc_f_best >= 0 || sc_f_best == MIN_I16);
				curc->sc[1] = sc_e_best;
				curc->sc[2] = sc_f_best;
				curc->sc[3] = mask;
				// cerr << curc->sc[0] << " " << curc->sc[1] << " " << curc->sc[2] << " " << curc->sc[3] << endl;
				ASSERT_ONLY(last = curc);
#ifndef NDEBUG
				if(prob_.cper_->isCheckpointed(rowc, colc)) {
					if(local) {
						sc_e_best = max(sc_e_best, (int16_t)0);
						sc_f_best = max(sc_f_best, (int16_t)0);
					}
					TAlScore sc_best64   = sc_best;   if(sc_best   == MIN_I16) sc_best64   = MIN_I64;
					TAlScore sc_e_best64 = sc_e_best; if(sc_e_best == MIN_I16) sc_e_best64 = MIN_I64;
					TAlScore sc_f_best64 = sc_f_best; if(sc_f_best == MIN_I16) sc_f_best64 = MIN_I64;
					assert_eq(prob_.cper_->scoreTriangle(rowc, colc, 0), sc_best64);
					assert_eq(prob_.cper_->scoreTriangle(rowc, colc, 1), sc_e_best64);
					assert_eq(prob_.cper_->scoreTriangle(rowc, colc, 2), sc_f_best64);
				}
#endif
			}
			// Update row, col
			assert_lt(rowc, (int64_t)prob_.qrylen_);
			rowc++;
			colc--;
			curc++;
		} // for(size_t j = 0; j < breadth; j++)
		if(i == depth-1) {
			// Final iteration
			assert(last != NULL);
			assert_eq(1, neval);
			assert_neq(0, last->sc[3]);
			assert_eq(targ, last->sc[hef]);
		} else {
			breadth--;
			// Diagonals i-1 and i become the "previous" diagonals for the
			// next iteration.
			prev2 = prev1 + 1;
			prev1 = cur;
		}
	} // for(size_t i = 0; i < depth; i++)
	//
	// Now backtrack through the triangle.  Abort as soon as we enter a cell
	// that was visited by a previous backtrace.
	//
	int64_t rowc = row, colc = col;
	size_t curid;
	int hefc = hef;
	if(bs_.empty()) {
		// Start an initial branch
		CHECK_ROW_COL(rowc, colc);
		curid = bs_.alloc();
		assert_eq(0, curid);
		Edit e;
		bs_[curid].init(
			prob_,
			0,      // parent ID
			0,      // penalty
			0,      // score_en
			rowc,   // row
			colc,   // col
			e,      // edit
			0,      // hef
			true,   // I am the root
			false); // don't try to extend with exact matches
		bs_[curid].len_ = 0;
	} else {
		curid = bs_.size()-1;
	}
	size_t idx_orig = (row + col) >> prob_.cper_->perpow2();
	while(true) {
		// What depth are we?
		size_t mod = (rowc + colc) & prob_.cper_->lomask();
		assert_lt(mod, prob_.cper_->per());
		// Convert to depth index within the filled triangle
		CpQuad * cur = tri_[mod].ptr();
		int64_t row_off = rowc - row_lo - mod;
		assert(!local || cur[row_off].sc[0] > 0);
		assert_geq(row_off, 0);
		int mask = cur[row_off].sc[3];
		assert_gt(mask, 0);
		int sel = -1;
		// Select what type of move to make, which depends on whether we're
		// currently in H, E, F:
		if(hefc == 0) {
			if(       (mask &   1) != 0) {
				// diagonal
				sel = 0;
			} else if((mask &   8) != 0) {
				// up to H
				sel = 3;
			} else if((mask &  16) != 0) {
				// up to F
				sel = 4;
			} else if((mask &   2) != 0) {
				// left to H
				sel = 1;
			} else if((mask &   4) != 0) {
				// left to E
				sel = 2;
			}
		} else if(hefc == 1) {
			if(       (mask &  32) != 0) {
				// left to H
				sel = 5;
			} else if((mask &  64) != 0) {
				// left to E
				sel = 6;
			}
		} else {
			assert_eq(2, hefc);
			if(       (mask & 128) != 0) {
				// up to H
				sel = 7;
			} else if((mask & 256) != 0) {
				// up to F
				sel = 8;
			}
		}
		assert_geq(sel, 0);
		// Get character from read
		int qc = prob_.qry_[rowc], qq = prob_.qual_[rowc];
		// Get character from reference
		int rc = prob_.ref_[colc];
		assert_range(0, 16, rc);
		// Now that we know what type of move to make, make it, updating our
		// row and column and updating the branch.
		if(sel == 0) {
			assert_geq(rowc, 0);
			assert_geq(colc, 0);
			TAlScore scd = prob_.sc_->score(qc, rc, qq - 33);
			if((rc & (1 << qc)) == 0) {
				// Mismatch
				size_t id = curid;
				// Check if the previous branch was the initial (bottommost)
				// branch with no matches.  If so, the mismatch should be added
				// to the initial branch, instead of starting a new branch.
				bool empty = (bs_[curid].len_ == 0 && curid == 0);
				if(!empty) {
					id = bs_.alloc();
				}
				Edit e((int)rowc, mask2dna[rc], "ACGTN"[qc], EDIT_TYPE_MM);
				assert_lt(scd, 0);
				TAlScore score_en = bs_[curid].score_st_ + scd;
				bs_[id].init(
					prob_,
					curid,    // parent ID
					-scd,     // penalty
					score_en, // score_en
					rowc,     // row
					colc,     // col
					e,        // edit
					hefc,     // hef
					empty,    // root?
					false);   // don't try to extend with exact matches
				//assert(!local || bs_[id].score_st_ >= 0);
				curid = id;
			} else {
				// Match
				bs_[curid].score_st_ += prob_.sc_->match();
				bs_[curid].len_++;
				assert_leq((int64_t)bs_[curid].len_, bs_[curid].row_ + 1);
			}
			rowc--;
			colc--;
			assert(local || bs_[curid].score_st_ >= targ_final);
			hefc = 0;
		} else if((sel >= 1 && sel <= 2) || (sel >= 5 && sel <= 6)) {
			assert_gt(colc, 0);
			// Read gap
			size_t id = bs_.alloc();
			Edit e((int)rowc+1, mask2dna[rc], '-', EDIT_TYPE_READ_GAP);
			TAlScore gapp = prob_.sc_->readGapOpen();
			// Extension penalty applies if the branch we're leaving begins
			// with a read gap and has accumulated no matches yet.
			if(bs_[curid].len_ == 0 && bs_[curid].e_.inited() && bs_[curid].e_.isReadGap()) {
				gapp = prob_.sc_->readGapExtend();
			}
			TAlScore score_en = bs_[curid].score_st_ - gapp;
			bs_[id].init(
				prob_,
				curid,    // parent ID
				gapp,     // penalty
				score_en, // score_en
				rowc,     // row
				colc-1,   // col
				e,        // edit
				hefc,     // hef
				false,    // root?
				false);   // don't try to extend with exact matches
			colc--;
			curid = id;
			assert( local || bs_[curid].score_st_ >= targ_final);
			//assert(!local || bs_[curid].score_st_ >= 0);
			if(sel == 1 || sel == 5) {
				hefc = 0;
			} else {
				hefc = 1;
			}
		} else {
			assert_gt(rowc, 0);
			// Reference gap
			size_t id = bs_.alloc();
			Edit e((int)rowc, '-', "ACGTN"[qc], EDIT_TYPE_REF_GAP);
			TAlScore gapp = prob_.sc_->refGapOpen();
			if(bs_[curid].len_ == 0 && bs_[curid].e_.inited() && bs_[curid].e_.isRefGap()) {
				gapp = prob_.sc_->refGapExtend();
			}
			TAlScore score_en = bs_[curid].score_st_ - gapp;
			bs_[id].init(
				prob_,
				curid,    // parent ID
				gapp,     // penalty
				score_en, // score_en
				rowc-1,   // row
				colc,     // col
				e,        // edit
				hefc,     // hef
				false,    // root?
				false);   // don't try to extend with exact matches
			rowc--;
			curid = id;
			//assert(!local || bs_[curid].score_st_ >= 0);
			if(sel == 3 || sel == 7) {
				hefc = 0;
			} else {
				hefc = 2;
			}
		}
		CHECK_ROW_COL(rowc, colc);
		size_t mod_new = (rowc + colc) & prob_.cper_->lomask();
		size_t idx = (rowc + colc) >> prob_.cper_->perpow2();
		assert_lt(mod_new, prob_.cper_->per());
		int64_t row_off_new = rowc - row_lo - mod_new;
		CpQuad * cur_new = NULL;
		if(colc >= 0 && rowc >= 0 && idx == idx_orig) {
			cur_new = tri_[mod_new].ptr();
		}
		// Did we cross into the region covered by the previous checkpoint?
		bool hit_new_tri = (idx < idx_orig && colc >= 0 && rowc >= 0);
		// Check whether we made it to the top row or to a cell with score 0
		if(colc < 0 || rowc < 0 ||
		   (cur_new != NULL && (local && cur_new[row_off_new].sc[0] == 0)))
		{
			done = true;
			assert(bs_[curid].isSolution(prob_));
			addSolution(curid);
#ifndef NDEBUG
			// A check to see if any two adjacent branches in the backtrace
			// overlap.  If they do, the whole alignment will be filtered out
			// in trySolution(...)
			size_t cur = curid;
			if(!bs_[cur].root_) {
				size_t next = bs_[cur].parentId_;
				while(!bs_[next].root_) {
					assert_neq(cur, next);
					if(bs_[next].len_ != 0 || bs_[cur].len_ == 0) {
						assert(!bs_[cur].overlap(prob_, bs_[next]));
					}
					cur = next;
					next = bs_[cur].parentId_;
				}
			}
#endif
			return;
		}
		if(hit_new_tri) {
			// Hand the caller the coordinates of the checkpointed cell we
			// reached so it can resume with the next triangle.
			assert(rowc < 0 || colc < 0 || prob_.cper_->isCheckpointed(rowc, colc));
			row_new = rowc; col_new = colc;
			hef_new = hefc;
			done = false;
			if(rowc < 0 || colc < 0) {
				assert(local);
				targ_new = 0;
			} else {
				targ_new = prob_.cper_->scoreTriangle(rowc, colc, hefc);
			}
			if(local && targ_new == 0) {
				done = true;
				assert(bs_[curid].isSolution(prob_));
				addSolution(curid);
			}
			assert((row_new >= 0 && col_new >= 0) || done);
			return;
		}
	}
	assert(false);
}

// Compare a freshly computed score 'ss' against the debug copy of the full DP
// matrix kept by the checkpointer (debug builds only).  'local' must be in
// scope at the expansion site.
#ifndef NDEBUG
#define DEBUG_CHECK(ss, row, col, hef) { \
	if(prob_.cper_->debug() && row >= 0 && col >= 0) { \
		TAlScore s = ss; \
		if(s == MIN_I16) s = MIN_I64; \
		if(local && s < 0) s = 0; \
		TAlScore deb = prob_.cper_->debugCell(row, col, hef); \
		if(local && deb < 0) deb = 0; \
		assert_eq(s, \
		          deb); \
	} \
}
#else
#define DEBUG_CHECK(ss, row, col, hef)
#endif

/**
 * Fill in a square of the DP table and backtrace from the given cell to
 * a cell in the previous checkpoint, or to the terminal cell.
 *
 * Unlike triangleFill, the checkpoints here are whole rows ('qrows_') and
 * SSE-packed columns ('qcols_') of the matrix, so we re-fill a per() x per()
 * square seeded from the checkpointed row above and column to the left, then
 * walk backwards through it using the recorded move masks.
 */
void BtBranchTracer::squareFill(
	int64_t rw,          // row of cell to backtrace from
	int64_t cl,          // column of cell to backtrace from
	int hef,             // cell to backtrace from is H (0), E (1), or F (2)
	TAlScore targ,       // score of cell to backtrace from
	TAlScore targ_final, // score of alignment we're looking for
	RandomSource& rnd,   // pseudo-random generator
	int64_t& row_new,    // out: row we ended up in after backtrace
	int64_t& col_new,    // out: column we ended up in after backtrace
	int& hef_new,        // out: H/E/F after backtrace
	TAlScore& targ_new,  // out: score up to cell we ended up in
	bool& done,          // out: finished tracing out an alignment?
	bool& abort)         // out: aborted b/c cell was seen before?
{
	assert_geq(rw, 0);
	assert_geq(cl, 0);
	assert_range(0, 2, hef);
	assert_lt(rw, (int64_t)prob_.qrylen_);
	assert_lt(cl, (int64_t)prob_.reflen_);
	assert(prob_.usecp_ && prob_.fill_);
	const bool is8_ = prob_.cper_->is8_;
	int64_t row = rw, col = cl;
	assert_leq(prob_.reflen_, (TRefOff)sawcell_.size());
	assert_leq(col, (int64_t)prob_.cper_->hicol());
	assert_geq(col, (int64_t)prob_.cper_->locol());
	assert_geq(prob_.cper_->per(), 2);
	// Position of the target cell within its per() x per() square
	size_t xmod = col & prob_.cper_->lomask();
	size_t ymod = row & prob_.cper_->lomask();
	size_t xdiv = col >> prob_.cper_->perpow2();
	size_t ydiv = row >> prob_.cper_->perpow2();
	size_t sq_ncol = xmod+1, sq_nrow = ymod+1;
	sq_.resize(sq_ncol * sq_nrow);
	bool upper = ydiv == 0;
	bool left  = xdiv == 0;
	const TAlScore sc_rdo = prob_.sc_->readGapOpen();
	const TAlScore sc_rde = prob_.sc_->readGapExtend();
	const TAlScore sc_rfo = prob_.sc_->refGapOpen();
	const TAlScore sc_rfe = prob_.sc_->refGapExtend();
	const bool local = !prob_.sc_->monotone;
	const CpQuad *qup = NULL;
	const __m128i *qlf = NULL;
	size_t per = prob_.cper_->per_;
	ASSERT_ONLY(size_t nrow = prob_.cper_->nrow());
	size_t ncol = prob_.cper_->ncol();
	assert_eq(prob_.qrylen_, nrow);
	assert_eq(prob_.reflen_, (TRefOff)ncol);
	size_t niter = prob_.cper_->niter_;
	if(!upper) {
		// Seed from the checkpointed row directly above this square
		qup = prob_.cper_->qrows_.ptr() + (ncol * (ydiv-1)) + xdiv * per;
	}
	if(!left) {
		// Set up the column pointers to point to the first __m128i word in the
		// relevant column
		size_t off = (niter << 2) * (xdiv-1);
		qlf = prob_.cper_->qcols_.ptr() + off;
	}
	size_t xedge = xdiv * per; // absolute offset of leftmost cell in square
	size_t yedge = ydiv * per; // absolute offset of topmost cell in square
	size_t xi = xedge, yi = yedge; // iterators for columns, rows
	size_t ii = 0; // iterator into packed square
	// Iterate over rows, then over columns
	size_t m128mod = yi % prob_.cper_->niter_;
	size_t m128div = yi / prob_.cper_->niter_;
	int16_t sc_h_dg_lastrow = MIN_I16;
	for(size_t i = 0; i <= ymod; i++, yi++) {
		assert_lt(yi, nrow);
		xi = xedge;
		// Handling for first column is done outside the loop
		size_t fromend = prob_.qrylen_ - yi - 1;
		bool allowGaps = fromend >= (size_t)prob_.sc_->gapbar && yi >= (size_t)prob_.sc_->gapbar;
		// Get character, quality from read
		int qc = prob_.qry_[yi], qq = prob_.qual_[yi];
		assert_geq(qq, 33);
		int16_t sc_h_lf_last = MIN_I16;
		int16_t sc_e_lf_last = MIN_I16;
		for(size_t j = 0; j <= xmod; j++, xi++) {
			assert_lt(xi, ncol);
			// Get character from reference
			int rc = prob_.ref_[xi];
			assert_range(0, 16, rc);
			int16_t sc_diag = prob_.sc_->score(qc, rc, qq - 33);
			// Raw neighbor scores ...
			int16_t sc_h_up = MIN_I16, sc_f_up = MIN_I16,
			        sc_h_lf = MIN_I16, sc_e_lf = MIN_I16,
			        sc_h_dg = MIN_I16;
			// ... and the same after charging the relevant gap penalty
			int16_t sc_h_up_c = MIN_I16, sc_f_up_c = MIN_I16,
			        sc_h_lf_c = MIN_I16, sc_e_lf_c = MIN_I16,
			        sc_h_dg_c = MIN_I16;
			if(yi == 0) {
				// If I'm in the first row or column set it to 0
				sc_h_dg = 0;
			} else if(xi == 0) {
				// Do nothing; leave it at min
				if(local) {
					sc_h_dg = 0;
				}
			} else if(i == 0 && j == 0) {
				// Otherwise, if I'm in the upper-left square corner, I can get
				// it from the checkpoint
				sc_h_dg = qup[-1].sc[0];
			} else if(j == 0) {
				// Otherwise, if I'm in the leftmost cell of this row, I can
				// get it from sc_h_lf in first column of previous row
				sc_h_dg = sc_h_dg_lastrow;
			} else {
				// Otherwise, I can get it from qup
				sc_h_dg = qup[j-1].sc[0];
			}
			if(yi > 0 && xi > 0) DEBUG_CHECK(sc_h_dg, yi-1, xi-1, 2);
			// If we're in the leftmost column, calculate sc_h_lf regardless of
			// allowGaps.
			if(j == 0 && xi > 0) {
				// Get values for left neighbors from the checkpoint.  The
				// SSE column checkpoints are stored biased; the arithmetic
				// below presumably undoes the 0xff (8-bit) / 0x7fff-0x8000
				// (16-bit) bias used by the SSE kernels — confirm against the
				// checkpointer's encoding.
				if(is8_) {
					size_t vecoff = (m128mod << 6) + m128div;
					sc_e_lf = ((uint8_t*)(qlf + 0))[vecoff];
					sc_h_lf = ((uint8_t*)(qlf + 2))[vecoff];
					if(local) {
						// No adjustment
					} else {
						if(sc_h_lf == 0) sc_h_lf = MIN_I16;
						else             sc_h_lf -= 0xff;
						if(sc_e_lf == 0) sc_e_lf = MIN_I16;
						else             sc_e_lf -= 0xff;
					}
				} else {
					size_t vecoff = (m128mod << 5) + m128div;
					sc_e_lf = ((int16_t*)(qlf + 0))[vecoff];
					sc_h_lf = ((int16_t*)(qlf + 2))[vecoff];
					if(local) {
						sc_h_lf += 0x8000; assert_geq(sc_h_lf, 0);
						sc_e_lf += 0x8000; assert_geq(sc_e_lf, 0);
					} else {
						if(sc_h_lf != MIN_I16) sc_h_lf -= 0x7fff;
						if(sc_e_lf != MIN_I16) sc_e_lf -= 0x7fff;
					}
				}
				DEBUG_CHECK(sc_e_lf, yi, xi-1, 0);
				DEBUG_CHECK(sc_h_lf, yi, xi-1, 2);
				// Remember H from the first column; it becomes the diagonal
				// seed for the leftmost cell of the next row.
				sc_h_dg_lastrow = sc_h_lf;
			}
			if(allowGaps) {
				if(j == 0 /* at left edge */ && xi > 0 /* not extreme */) {
					sc_h_lf_c = sc_h_lf;
					sc_e_lf_c = sc_e_lf;
					if(sc_h_lf_c != MIN_I16) sc_h_lf_c -= sc_rdo;
					if(sc_e_lf_c != MIN_I16) sc_e_lf_c -= sc_rde;
					assert_leq(sc_h_lf_c, prob_.cper_->perf_);
					assert_leq(sc_e_lf_c, prob_.cper_->perf_);
				} else if(xi > 0) {
					// Get values for left neighbors from the previous iteration
					if(sc_h_lf_last != MIN_I16) {
						sc_h_lf = sc_h_lf_last;
						sc_h_lf_c = sc_h_lf - sc_rdo;
					}
					if(sc_e_lf_last != MIN_I16) {
						sc_e_lf = sc_e_lf_last;
						sc_e_lf_c = sc_e_lf - sc_rde;
					}
				}
				if(yi > 0 /* not extreme */) {
					// Get column values
					assert(qup != NULL);
					assert(local || qup[j].sc[2] < 0);
					if(qup[j].sc[0] > MIN_I16) {
						DEBUG_CHECK(qup[j].sc[0], yi-1, xi, 2);
						sc_h_up = qup[j].sc[0];
						sc_h_up_c = sc_h_up - sc_rfo;
					}
					if(qup[j].sc[2] > MIN_I16) {
						DEBUG_CHECK(qup[j].sc[2], yi-1, xi, 1);
						sc_f_up = qup[j].sc[2];
						sc_f_up_c = sc_f_up - sc_rfe;
					}
				}
				if(local) {
					sc_h_up_c = max(sc_h_up_c, (int16_t)0);
					sc_f_up_c = max(sc_f_up_c, (int16_t)0);
					sc_h_lf_c = max(sc_h_lf_c, (int16_t)0);
					sc_e_lf_c = max(sc_e_lf_c, (int16_t)0);
				}
			}
			if(sc_h_dg > MIN_I16) {
				sc_h_dg_c = sc_h_dg + sc_diag;
			}
			if(local) sc_h_dg_c = max(sc_h_dg_c, (int16_t)0);
			int mask = 0;
			// Calculate best ways into H, E, F cells starting with H.
			// Mask bits:
			// H: 1=diag, 2=hhoriz, 4=ehoriz, 8=hvert, 16=fvert
			// E: 32=hhoriz, 64=ehoriz
			// F: 128=hvert, 256=fvert
			// NOTE(review): operands below are int16_t, so the MIN_I64
			// comparisons are always true; presumably MIN_I16 was intended —
			// same pattern as in triangleFill; confirm upstream.
			int16_t sc_best = sc_h_dg_c;
			if(sc_h_dg_c > MIN_I64) {
				mask = 1;
			}
			if(xi > 0 && sc_h_lf_c >= sc_best && sc_h_lf_c > MIN_I64) {
				if(sc_h_lf_c > sc_best) mask = 0;
				mask |= 2;
				sc_best = sc_h_lf_c;
			}
			if(xi > 0 && sc_e_lf_c >= sc_best && sc_e_lf_c > MIN_I64) {
				if(sc_e_lf_c > sc_best) mask = 0;
				mask |= 4;
				sc_best = sc_e_lf_c;
			}
			if(yi > 0 && sc_h_up_c >= sc_best && sc_h_up_c > MIN_I64) {
				if(sc_h_up_c > sc_best) mask = 0;
				mask |= 8;
				sc_best = sc_h_up_c;
			}
			if(yi > 0 && sc_f_up_c >= sc_best && sc_f_up_c > MIN_I64) {
				if(sc_f_up_c > sc_best) mask = 0;
				mask |= 16;
				sc_best = sc_f_up_c;
			}
			// Calculate best way into E cell
			int16_t sc_e_best = sc_h_lf_c;
			if(xi > 0) {
				if(sc_h_lf_c >= sc_e_lf_c && sc_h_lf_c > MIN_I64) {
					if(sc_h_lf_c == sc_e_lf_c) {
						mask |= 64;
					}
					mask |= 32;
				} else if(sc_e_lf_c > MIN_I64) {
					sc_e_best = sc_e_lf_c;
					mask |= 64;
				}
			}
			if(sc_e_best > sc_best) {
				sc_best = sc_e_best;
				mask &= ~31; // don't go diagonal
			}
			// Calculate best way into F cell
			int16_t sc_f_best = sc_h_up_c;
			if(yi > 0) {
				if(sc_h_up_c >= sc_f_up_c && sc_h_up_c > MIN_I64) {
					if(sc_h_up_c == sc_f_up_c) {
						mask |= 256;
					}
					mask |= 128;
				} else if(sc_f_up_c > MIN_I64) {
					sc_f_best = sc_f_up_c;
					mask |= 256;
				}
			}
			if(sc_f_best > sc_best) {
				sc_best = sc_f_best;
				mask &= ~127; // don't go horizontal or diagonal
			}
			// Install results in cur
			assert( local || sc_best <= 0);
			sq_[ii+j].sc[0] = sc_best;
			assert( local || sc_e_best < 0);
			assert( local || sc_f_best < 0);
			assert(!local || sc_e_best >= 0 || sc_e_best == MIN_I16);
			assert(!local || sc_f_best >= 0 || sc_f_best == MIN_I16);
			sq_[ii+j].sc[1] = sc_e_best;
			sq_[ii+j].sc[2] = sc_f_best;
			sq_[ii+j].sc[3] = mask;
			DEBUG_CHECK(sq_[ii+j].sc[0], yi, xi, 2); // H
			DEBUG_CHECK(sq_[ii+j].sc[1], yi, xi, 0); // E
			DEBUG_CHECK(sq_[ii+j].sc[2], yi, xi, 1); // F
			// Update sc_h_lf_last, sc_e_lf_last
			sc_h_lf_last = sc_best;
			sc_e_lf_last = sc_e_best;
		}
		// Update m128mod, m128div
		m128mod++;
		if(m128mod == prob_.cper_->niter_) {
			m128mod = 0;
			m128div++;
		}
		// update qup: subsequent rows take their "above" values from the row
		// of sq_ we just filled rather than from the checkpoint
		ii += sq_ncol;
		// dimensions of sq_
		qup = sq_.ptr() + sq_ncol * i;
	}
	assert_eq(targ, sq_[ymod * sq_ncol + xmod].sc[hef]);
	//
	// Now backtrack through the square.  Abort as soon as we enter a cell
	// that was visited by a previous backtrace.
	//
	int64_t rowc = row, colc = col;
	size_t curid;
	int hefc = hef;
	if(bs_.empty()) {
		// Start an initial branch
		CHECK_ROW_COL(rowc, colc);
		curid = bs_.alloc();
		assert_eq(0, curid);
		Edit e;
		bs_[curid].init(
			prob_,
			0,      // parent ID
			0,      // penalty
			0,      // score_en
			rowc,   // row
			colc,   // col
			e,      // edit
			0,      // hef
			true,   // root?
			false); // don't try to extend with exact matches
		bs_[curid].len_ = 0;
	} else {
		curid = bs_.size()-1;
	}
	size_t ymodTimesNcol = ymod * sq_ncol;
	while(true) {
		// What depth are we?
		assert_eq(ymodTimesNcol, ymod * sq_ncol);
		CpQuad * cur = sq_.ptr() + ymodTimesNcol + xmod;
		int mask = cur->sc[3];
		assert_gt(mask, 0);
		int sel = -1;
		// Select what type of move to make, which depends on whether we're
		// currently in H, E, F:
		if(hefc == 0) {
			if(       (mask &   1) != 0) {
				// diagonal
				sel = 0;
			} else if((mask &   8) != 0) {
				// up to H
				sel = 3;
			} else if((mask &  16) != 0) {
				// up to F
				sel = 4;
			} else if((mask &   2) != 0) {
				// left to H
				sel = 1;
			} else if((mask &   4) != 0) {
				// left to E
				sel = 2;
			}
		} else if(hefc == 1) {
			if(       (mask &  32) != 0) {
				// left to H
				sel = 5;
			} else if((mask &  64) != 0) {
				// left to E
				sel = 6;
			}
		} else {
			assert_eq(2, hefc);
			if(       (mask & 128) != 0) {
				// up to H
				sel = 7;
			} else if((mask & 256) != 0) {
				// up to F
				sel = 8;
			}
		}
		assert_geq(sel, 0);
		// Get character from read
		int qc = prob_.qry_[rowc], qq = prob_.qual_[rowc];
		// Get character from reference
		int rc = prob_.ref_[colc];
		assert_range(0, 16, rc);
		// xexit/yexit: did this move carry us out of the filled square?
		bool xexit = false, yexit = false;
		// Now that we know what type of move to make, make it, updating our
		// row and column and updating the branch.
		if(sel == 0) {
			assert_geq(rowc, 0);
			assert_geq(colc, 0);
			TAlScore scd = prob_.sc_->score(qc, rc, qq - 33);
			if((rc & (1 << qc)) == 0) {
				// Mismatch
				size_t id = curid;
				// Check if the previous branch was the initial (bottommost)
				// branch with no matches.  If so, the mismatch should be added
				// to the initial branch, instead of starting a new branch.
				bool empty = (bs_[curid].len_ == 0 && curid == 0);
				if(!empty) {
					id = bs_.alloc();
				}
				Edit e((int)rowc, mask2dna[rc], "ACGTN"[qc], EDIT_TYPE_MM);
				assert_lt(scd, 0);
				TAlScore score_en = bs_[curid].score_st_ + scd;
				bs_[id].init(
					prob_,
					curid,    // parent ID
					-scd,     // penalty
					score_en, // score_en
					rowc,     // row
					colc,     // col
					e,        // edit
					hefc,     // hef
					empty,    // root?
					false);   // don't try to extend with exact matches
				curid = id;
				//assert(!local || bs_[curid].score_st_ >= 0);
			} else {
				// Match
				bs_[curid].score_st_ += prob_.sc_->match();
				bs_[curid].len_++;
				assert_leq((int64_t)bs_[curid].len_, bs_[curid].row_ + 1);
			}
			if(xmod == 0) xexit = true;
			if(ymod == 0) yexit = true;
			rowc--;
			ymod--;
			ymodTimesNcol -= sq_ncol;
			colc--;
			xmod--;
			assert(local || bs_[curid].score_st_ >= targ_final);
			hefc = 0;
		} else if((sel >= 1 && sel <= 2) || (sel >= 5 && sel <= 6)) {
			assert_gt(colc, 0);
			// Read gap
			size_t id = bs_.alloc();
			Edit e((int)rowc+1, mask2dna[rc], '-', EDIT_TYPE_READ_GAP);
			TAlScore gapp = prob_.sc_->readGapOpen();
			if(bs_[curid].len_ == 0 && bs_[curid].e_.inited() && bs_[curid].e_.isReadGap()) {
				gapp = prob_.sc_->readGapExtend();
			}
			//assert(!local || bs_[curid].score_st_ >= gapp);
			TAlScore score_en = bs_[curid].score_st_ - gapp;
			bs_[id].init(
				prob_,
				curid,    // parent ID
				gapp,     // penalty
				score_en, // score_en
				rowc,     // row
				colc-1,   // col
				e,        // edit
				hefc,     // hef
				false,    // root?
				false);   // don't try to extend with exact matches
			if(xmod == 0) xexit = true;
			colc--;
			xmod--;
			curid = id;
			assert( local || bs_[curid].score_st_ >= targ_final);
			//assert(!local || bs_[curid].score_st_ >= 0);
			if(sel == 1 || sel == 5) {
				hefc = 0;
			} else {
				hefc = 1;
			}
		} else {
			assert_gt(rowc, 0);
			// Reference gap
			size_t id = bs_.alloc();
			Edit e((int)rowc, '-', "ACGTN"[qc], EDIT_TYPE_REF_GAP);
			TAlScore gapp = prob_.sc_->refGapOpen();
			if(bs_[curid].len_ == 0 && bs_[curid].e_.inited() && bs_[curid].e_.isRefGap()) {
				gapp = prob_.sc_->refGapExtend();
			}
			//assert(!local || bs_[curid].score_st_ >= gapp);
			TAlScore score_en = bs_[curid].score_st_ - gapp;
			bs_[id].init(
				prob_,
				curid,    // parent ID
				gapp,     // penalty
				score_en, // score_en
				rowc-1,   // row
				colc,     // col
				e,        // edit
				hefc,     // hef
				false,    // root?
				false);   // don't try to extend with exact matches
			if(ymod == 0) yexit = true;
			rowc--;
			ymod--;
			ymodTimesNcol -= sq_ncol;
			curid = id;
			assert( local || bs_[curid].score_st_ >= targ_final);
			//assert(!local || bs_[curid].score_st_ >= 0);
			if(sel == 3 || sel == 7) {
				hefc = 0;
			} else {
				hefc = 2;
			}
		}
		CHECK_ROW_COL(rowc, colc);
		CpQuad * cur_new = NULL;
		if(!xexit && !yexit) {
			cur_new = sq_.ptr() + ymodTimesNcol + xmod;
		}
		// Check whether we made it to the top row or to a cell with score 0
		if(colc < 0 || rowc < 0 ||
		   (cur_new != NULL && local && cur_new->sc[0] == 0))
		{
			done = true;
			assert(bs_[curid].isSolution(prob_));
			addSolution(curid);
#ifndef NDEBUG
			// A check to see if any two adjacent branches in the backtrace
			// overlap.  If they do, the whole alignment will be filtered out
			// in trySolution(...)
			size_t cur = curid;
			if(!bs_[cur].root_) {
				size_t next = bs_[cur].parentId_;
				while(!bs_[next].root_) {
					assert_neq(cur, next);
					if(bs_[next].len_ != 0 || bs_[cur].len_ == 0) {
						assert(!bs_[cur].overlap(prob_, bs_[next]));
					}
					cur = next;
					next = bs_[cur].parentId_;
				}
			}
#endif
			return;
		}
		assert(!xexit || hefc == 0 || hefc == 1);
		assert(!yexit || hefc == 0 || hefc == 2);
		if(xexit || yexit) {
			// We left the square; hand the caller the coordinates to resume
			// from in the previous checkpointed square.
			//assert(rowc < 0 || colc < 0 || prob_.cper_->isCheckpointed(rowc, colc));
			row_new = rowc; col_new = colc;
			hef_new = hefc;
			done = false;
			if(rowc < 0 || colc < 0) {
				assert(local);
				targ_new = 0;
			} else {
				// TODO: Don't use scoreSquare
				targ_new = prob_.cper_->scoreSquare(rowc, colc, hefc);
				assert(local || targ_new >= targ);
				assert(local || targ_new >= targ_final);
			}
			if(local && targ_new == 0) {
				assert_eq(0, hefc);
				done = true;
				assert(bs_[curid].isSolution(prob_));
				addSolution(curid);
			}
			assert((row_new >= 0 && col_new >= 0) || done);
			return;
		}
	}
	assert(false);
}

/**
 * Caller gives us score_en, row and col.  We figure out score_st and len_
 * by comparing characters from the strings.
 *
 * If this branch comes after a mismatch, (row, col) describe the cell that the
 * mismatch occurs in.
 * len_ is initially set to 1, and the next cell we test
 * is the next cell up and to the left (row-1, col-1).
 *
 * If this branch comes after a read gap, (row, col) describe the leftmost cell
 * involved in the gap.  len_ is initially set to 0, and the next cell we test
 * is the current cell (row, col).
 *
 * If this branch comes after a reference gap, (row, col) describe the upper
 * cell involved in the gap.  len_ is initially set to 0, and the next cell we
 * test is the current cell (row, col).
 */
void BtBranch::init(
	const BtBranchProblem& prob, // problem this branch belongs to
	size_t parentId,             // id of parent branch in the branch pool
	TAlScore penalty,            // penalty for the edit ending the parent
	TAlScore score_en,           // score at this branch's lower-right end
	int64_t row,                 // row of this branch's lower-right cell
	int64_t col,                 // column of this branch's lower-right cell
	Edit e,                      // edit that ends the parent branch
	int hef,                     // H (0), E (1) or F (2) at branch end
	bool root,                   // true iff this is the root branch
	bool extend)                 // walk up-left over exact matches?
{
	score_en_ = score_en;
	penalty_ = penalty;
	score_st_ = score_en_;
	row_ = row;
	col_ = col;
	parentId_ = parentId;
	e_ = e;
	root_ = root;
	assert(!root_ || parentId == 0);
	assert_lt(row, (int64_t)prob.qrylen_);
	assert_lt(col, (int64_t)prob.reflen_);
	// First match to check is diagonally above and to the left of the cell
	// where the edit occurs
	int64_t rowc = row;
	int64_t colc = col;
	len_ = 0;
	if(e.inited() && e.isMismatch()) {
		rowc--; colc--;
		len_ = 1;
	}
	int64_t match = prob.sc_->match();
	bool cp = prob.usecp_;
	size_t iters = 0;
	curtailed_ = false;
	if(extend) {
		// Greedily consume exact matches up the diagonal, accumulating the
		// match bonus into score_st_ as we go.
		while(rowc >= 0 && colc >= 0) {
			int rfm = prob.ref_[colc];
			assert_range(0, 16, rfm);
			int rdc = prob.qry_[rowc];
			bool matches = (rfm & (1 << rdc)) != 0;
			if(!matches) {
				// What's the mismatch penalty?
				break;
			}
			// Get score from checkpointer
			score_st_ += match;
			if(cp && rowc - 1 >= 0 && colc - 1 >= 0 &&
			   prob.cper_->isCheckpointed(rowc - 1, colc - 1))
			{
				// Possibly prune: even the best checkpointed score leading
				// into the next cell can't reach the target.
				// NOTE(review): cpsc is int16_t but scoreTriangle returns
				// TAlScore (64-bit); the narrowing is presumably safe because
				// checkpointed triangle scores fit in 16 bits — confirm.
				int16_t cpsc;
				cpsc = prob.cper_->scoreTriangle(rowc - 1, colc - 1, hef);
				if(cpsc + score_st_ < prob.targ_) {
					curtailed_ = true;
					break;
				}
			}
			iters++;
			rowc--;
			colc--;
		}
	}
	assert_geq(rowc, -1);
	assert_geq(colc, -1);
	// Branch length = # of diagonal cells covered, including the edit cell
	len_ = (int64_t)row - rowc;
	assert_leq((int64_t)len_, row_+1);
	assert_leq((int64_t)len_, col_+1);
	assert_leq((int64_t)score_st_, (int64_t)prob.qrylen_ * match);
}

/**
 * Given a potential branch to add to the queue, see if we can follow the
 * branch a little further first.  If it's still valid, or if we reach a
 * choice between valid outgoing paths, go ahead and add it to the queue.
 */
void BtBranchTracer::examineBranch(
	int64_t row,      // row of cell the branch starts in
	int64_t col,      // column of cell the branch starts in
	const Edit& e,    // edit that begins the branch
	TAlScore pen,     // penalty associated with edit
	TAlScore sc,      // score at the branch's starting cell
	size_t parentId)  // id of the branch we're leaving
{
	size_t id = bs_.alloc();
	bs_[id].init(prob_, parentId, pen, sc, row, col, e, 0, false, true);
	if(bs_[id].isSolution(prob_)) {
		assert(bs_[id].isValid(prob_));
		addSolution(id);
	} else {
		// Check if this branch is legit
		if(bs_[id].isValid(prob_)) {
			add(id);
		} else {
			// Not viable; return the freshly allocated branch to the pool
			bs_.pop();
		}
	}
}

/**
 * Take all possible ways of leaving the given branch and add them to the
 * branch queue.
 */
void BtBranchTracer::addOffshoots(size_t bid) {
	BtBranch& b = bs_[bid];
	TAlScore sc = b.score_en_;
	int64_t match = prob_.sc_->match();
	// In local mode no partial alignment may dip below 0
	int64_t scoreFloor = prob_.sc_->monotone ? MIN_I64 : 0;
	bool cp = prob_.usecp_; // Are there any checkpoints?
ASSERT_ONLY(TAlScore perfectScore = prob_.sc_->perfectScore(prob_.qrylen_)); assert_leq(prob_.targ_, perfectScore); // For each cell in the branch for(size_t i = 0 ; i < b.len_; i++) { assert_leq((int64_t)i, b.row_+1); assert_leq((int64_t)i, b.col_+1); int64_t row = b.row_ - i, col = b.col_ - i; int64_t bonusLeft = (row + 1) * match; int64_t fromend = prob_.qrylen_ - row - 1; bool allowGaps = fromend >= prob_.sc_->gapbar && row >= prob_.sc_->gapbar; if(allowGaps && row >= 0 && col >= 0) { if(col > 0) { // Try a read gap - it's either an extension or an open bool extend = b.e_.inited() && b.e_.isReadGap() && i == 0; TAlScore rdgapPen = extend ? prob_.sc_->readGapExtend() : prob_.sc_->readGapOpen(); bool prune = false; assert_gt(rdgapPen, 0); if(cp && prob_.cper_->isCheckpointed(row, col - 1)) { // Possibly prune int16_t cpsc = (int16_t)prob_.cper_->scoreTriangle(row, col - 1, 0); assert_leq(cpsc, perfectScore); assert_geq(prob_.sc_->readGapOpen(), prob_.sc_->readGapExtend()); TAlScore bonus = prob_.sc_->readGapOpen() - prob_.sc_->readGapExtend(); assert_geq(bonus, 0); if(cpsc + bonus + sc - rdgapPen < prob_.targ_) { prune = true; } } if(prune) { if(extend) { nrdexPrune_++; } else { nrdopPrune_++; } } else if(sc - rdgapPen >= scoreFloor && sc - rdgapPen + bonusLeft >= prob_.targ_) { // Yes, we can introduce a read gap here Edit e((int)row + 1, mask2dna[(int)prob_.ref_[col]], '-', EDIT_TYPE_READ_GAP); assert(e.isReadGap()); examineBranch(row, col - 1, e, rdgapPen, sc - rdgapPen, bid); if(extend) { nrdex_++; } else { nrdop_++; } } } if(row > 0) { // Try a reference gap - it's either an extension or an open bool extend = b.e_.inited() && b.e_.isRefGap() && i == 0; TAlScore rfgapPen = (b.e_.inited() && b.e_.isRefGap()) ? 
prob_.sc_->refGapExtend() : prob_.sc_->refGapOpen(); bool prune = false; assert_gt(rfgapPen, 0); if(cp && prob_.cper_->isCheckpointed(row - 1, col)) { // Possibly prune int16_t cpsc = (int16_t)prob_.cper_->scoreTriangle(row - 1, col, 0); assert_leq(cpsc, perfectScore); assert_geq(prob_.sc_->refGapOpen(), prob_.sc_->refGapExtend()); TAlScore bonus = prob_.sc_->refGapOpen() - prob_.sc_->refGapExtend(); assert_geq(bonus, 0); if(cpsc + bonus + sc - rfgapPen < prob_.targ_) { prune = true; } } if(prune) { if(extend) { nrfexPrune_++; } else { nrfopPrune_++; } } else if(sc - rfgapPen >= scoreFloor && sc - rfgapPen + bonusLeft >= prob_.targ_) { // Yes, we can introduce a ref gap here Edit e((int)row, '-', "ACGTN"[(int)prob_.qry_[row]], EDIT_TYPE_REF_GAP); assert(e.isRefGap()); examineBranch(row - 1, col, e, rfgapPen, sc - rfgapPen, bid); if(extend) { nrfex_++; } else { nrfop_++; } } } } // If we're at the top of the branch but not yet at the top of // the DP table, a mismatch branch is also possible. 
if(i == b.len_ && !b.curtailed_ && row >= 0 && col >= 0) { int rfm = prob_.ref_[col]; assert_lt(row, (int64_t)prob_.qrylen_); int rdc = prob_.qry_[row]; int rdq = prob_.qual_[row]; int scdiff = prob_.sc_->score(rdc, rfm, rdq - 33); assert_lt(scdiff, 0); // at end of branch, so can't match bool prune = false; if(cp && row > 0 && col > 0 && prob_.cper_->isCheckpointed(row - 1, col - 1)) { // Possibly prune int16_t cpsc = prob_.cper_->scoreTriangle(row - 1, col - 1, 0); assert_leq(cpsc, perfectScore); assert_leq(cpsc + scdiff + sc, perfectScore); if(cpsc + scdiff + sc < prob_.targ_) { prune = true; } } if(prune) { nmm_++; } else { // Yes, we can introduce a mismatch here if(sc + scdiff >= scoreFloor && sc + scdiff + bonusLeft >= prob_.targ_) { Edit e((int)row, mask2dna[rfm], "ACGTN"[rdc], EDIT_TYPE_MM); bool nmm = (mask2dna[rfm] == 'N' || rdc > 4); assert_neq(e.chr, e.qchr); assert_lt(scdiff, 0); examineBranch(row - 1, col - 1, e, -scdiff, sc + scdiff, bid); if(nmm) { nnmm_++; } else { nmm_++; } } } } sc += match; } } /** * Sort unsorted branches, merge them with master sorted list. */ void BtBranchTracer::flushUnsorted() { if(unsorted_.empty()) { return; } unsorted_.sort(); unsorted_.reverse(); #ifndef NDEBUG for(size_t i = 1; i < unsorted_.size(); i++) { assert_leq(bs_[unsorted_[i].second].score_st_, bs_[unsorted_[i-1].second].score_st_); } #endif EList *src2 = sortedSel_ ? &sorted1_ : &sorted2_; EList *dest = sortedSel_ ? &sorted2_ : &sorted1_; // Merge src1 and src2 into dest dest->clear(); size_t cur1 = 0, cur2 = cur_; while(cur1 < unsorted_.size() || cur2 < src2->size()) { // Take from 1 or 2 next? 
bool take1 = true; if(cur1 == unsorted_.size()) { take1 = false; } else if(cur2 == src2->size()) { take1 = true; } else { assert_neq(unsorted_[cur1].second, (*src2)[cur2]); take1 = bs_[unsorted_[cur1].second] < bs_[(*src2)[cur2]]; } if(take1) { dest->push_back(unsorted_[cur1++].second); // Take from list 1 } else { dest->push_back((*src2)[cur2++]); // Take from list 2 } } assert_eq(cur1, unsorted_.size()); assert_eq(cur2, src2->size()); sortedSel_ = !sortedSel_; cur_ = 0; unsorted_.clear(); } /** * Try all the solutions accumulated so far. Solutions might be rejected * if they, for instance, overlap a previous solution, have too many Ns, * fail to overlap a core diagonal, etc. */ bool BtBranchTracer::trySolutions( bool lookForOlap, SwResult& res, size_t& off, size_t& nrej, RandomSource& rnd, bool& success) { if(solutions_.size() > 0) { for(size_t i = 0; i < solutions_.size(); i++) { int ret = trySolution(solutions_[i], lookForOlap, res, off, nrej, rnd); if(ret == BT_FOUND) { success = true; return true; // there were solutions and one was good } } solutions_.clear(); success = false; return true; // there were solutions but none were good } return false; // there were no solutions to check } /** * Given the id of a branch that completes a successful backtrace, turn the * chain of branches into */ int BtBranchTracer::trySolution( size_t id, bool lookForOlap, SwResult& res, size_t& off, size_t& nrej, RandomSource& rnd) { AlnScore score; BtBranch *br = &bs_[id]; // 'br' corresponds to the leftmost edit in a right-to-left // chain of edits. 
EList& ned = res.alres.ned(); const BtBranch *cur = br, *prev = NULL; size_t ns = 0, nrefns = 0; size_t ngap = 0; while(true) { if(cur->e_.inited()) { if(cur->e_.isMismatch()) { if(cur->e_.qchr == 'N' || cur->e_.chr == 'N') { ns++; } } else if(cur->e_.isGap()) { ngap++; } if(cur->e_.chr == 'N') { nrefns++; } ned.push_back(cur->e_); } if(cur->root_) { break; } cur = &bs_[cur->parentId_]; } if(ns > prob_.nceil_) { // Alignment has too many Ns in it! res.reset(); assert(res.alres.ned().empty()); nrej++; return BT_REJECTED_N; } // Update 'seenPaths_' cur = br; bool rejSeen = false; // set =true if we overlap prev path bool rejCore = true; // set =true if we don't touch core diag while(true) { // Consider row, col, len, then do something int64_t row = cur->row_, col = cur->col_; assert_lt(row, (int64_t)prob_.qrylen_); size_t fromend = prob_.qrylen_ - row - 1; size_t diag = fromend + col; // Calculate the diagonal within the *trimmed* rectangle, // i.e. the rectangle we dealt with in align, gather and // backtrack. int64_t diagi = col - row; // Now adjust to the diagonal within the *untrimmed* // rectangle by adding on the amount trimmed from the left. diagi += prob_.rect_->triml; assert_lt(diag, seenPaths_.size()); // Does it overlap a core diagonal? if(diagi >= 0) { size_t diag = (size_t)diagi; if(diag >= prob_.rect_->corel && diag <= prob_.rect_->corer) { // Yes it does - it's OK rejCore = false; } } if(lookForOlap) { int64_t newlo, newhi; if(cur->len_ == 0) { if(prev != NULL && prev->len_ > 0) { // If there's a gap at the base of a non-0 length branch, the // gap will appear to overlap the branch if we give it length 1. newhi = newlo = 0; } else { // Read or ref gap with no matches coming off of it newlo = row; newhi = row + 1; } } else { // Diagonal with matches newlo = row - (cur->len_ - 1); newhi = row + 1; } assert_geq(newlo, 0); assert_geq(newhi, 0); // Does the diagonal cover cells? 
if(newhi > newlo) { // Check whether there is any overlap with previously traversed // cells bool added = false; const size_t sz = seenPaths_[diag].size(); for(size_t i = 0; i < sz; i++) { // Does the new interval overlap this already-seen // interval? Also of interest: does it abut this // already-seen interval? If so, we should merge them. size_t lo = seenPaths_[diag][i].first; size_t hi = seenPaths_[diag][i].second; assert_lt(lo, hi); size_t lo_sm = newlo, hi_sm = newhi; if(hi - lo < hi_sm - lo_sm) { swap(lo, lo_sm); swap(hi, hi_sm); } if((lo <= lo_sm && hi > lo_sm) || (lo < hi_sm && hi >= hi_sm)) { // One or both of the shorter interval's end points // are contained in the longer interval - so they // overlap. rejSeen = true; // Merge them into one longer interval seenPaths_[diag][i].first = min(lo, lo_sm); seenPaths_[diag][i].second = max(hi, hi_sm); #ifndef NDEBUG for(int64_t ii = seenPaths_[diag][i].first; ii < (int64_t)seenPaths_[diag][i].second; ii++) { //cerr << "trySolution rejected (" << ii << ", " << (ii + col - row) << ")" << endl; } #endif added = true; break; } else if(hi == lo_sm || lo == hi_sm) { // Merge them into one longer interval seenPaths_[diag][i].first = min(lo, lo_sm); seenPaths_[diag][i].second = max(hi, hi_sm); #ifndef NDEBUG for(int64_t ii = seenPaths_[diag][i].first; ii < (int64_t)seenPaths_[diag][i].second; ii++) { //cerr << "trySolution rejected (" << ii << ", " << (ii + col - row) << ")" << endl; } #endif added = true; // Keep going in case it overlaps one of the other // intervals } } if(!added) { seenPaths_[diag].push_back(make_pair(newlo, newhi)); } } } // After the merging that may have occurred above, it's no // longer guarnateed that all the overlapping intervals in // the list have been merged. That's OK though. We'll // still get correct answers to overlap queries. 
if(cur->root_) { assert_eq(0, cur->parentId_); break; } prev = cur; cur = &bs_[cur->parentId_]; } // while(cur->e_.inited()) if(rejSeen) { res.reset(); assert(res.alres.ned().empty()); nrej++; return BT_NOT_FOUND; } if(rejCore) { res.reset(); assert(res.alres.ned().empty()); nrej++; return BT_REJECTED_CORE_DIAG; } off = br->leftmostCol(); score.score_ = prob_.targ_; score.ns_ = ns; score.gaps_ = ngap; res.alres.setScore(score); res.alres.setRefNs(nrefns); size_t trimBeg = br->uppermostRow(); size_t trimEnd = prob_.qrylen_ - prob_.row_ - 1; assert_leq(trimBeg, prob_.qrylen_); assert_leq(trimEnd, prob_.qrylen_); TRefOff refoff = off + prob_.refoff_ + prob_.rect_->refl; res.alres.setShape( prob_.refid_, // ref id refoff, // 0-based ref offset prob_.treflen(), // ref length prob_.fw_, // aligned to Watson? prob_.qrylen_, // read length 0, // read id true, // pretrim soft? 0, // pretrim 5' end 0, // pretrim 3' end true, // alignment trim soft? prob_.fw_ ? trimBeg : trimEnd, // alignment trim 5' end prob_.fw_ ? trimEnd : trimBeg); // alignment trim 3' end return BT_FOUND; } /** * Get the next valid alignment given a backtrace problem. Return false * if there is no valid solution. Use a backtracking search to find the * solution. This can be very slow. */ bool BtBranchTracer::nextAlignmentBacktrace( size_t maxiter, SwResult& res, size_t& off, size_t& nrej, size_t& niter, RandomSource& rnd) { assert(!empty() || !emptySolution()); assert(prob_.inited()); // There's a subtle case where we might fail to backtracing in // local-alignment mode. The basic fact to remember is that when we're // backtracing from the highest-scoring cell in the table, we're guaranteed // to be able to backtrace without ever dipping below 0. But if we're // backtracing from a cell other than the highest-scoring cell in the // table, we might dip below 0. Dipping below 0 implies that there's a // shorted local alignment with a better score. 
In which case, it's // perfectly fair for us to abandon any path that dips below the floor, and // this might result in the queue becoming empty before we finish. bool result = false; niter = 0; while(!empty()) { if(trySolutions(true, res, off, nrej, rnd, result)) { return result; } if(niter++ >= maxiter) { break; } size_t brid = best(rnd); // put best branch in 'br' assert(!seen_.contains(brid)); ASSERT_ONLY(seen_.insert(brid)); #if 0 BtBranch *br = &bs_[brid]; cerr << brid << ": targ:" << prob_.targ_ << ", sc:" << br->score_st_ << ", row:" << br->uppermostRow() << ", nmm:" << nmm_ << ", nnmm:" << nnmm_ << ", nrdop:" << nrdop_ << ", nrfop:" << nrfop_ << ", nrdex:" << nrdex_ << ", nrfex:" << nrfex_ << ", nrdop_pr: " << nrdopPrune_ << ", nrfop_pr: " << nrfopPrune_ << ", nrdex_pr: " << nrdexPrune_ << ", nrfex_pr: " << nrfexPrune_ << endl; #endif addOffshoots(brid); } if(trySolutions(true, res, off, nrej, rnd, result)) { return result; } return false; } /** * Get the next valid alignment given a backtrace problem. Return false * if there is no valid solution. Use a triangle-fill backtrace to find * the solution. This is usually fast (it's O(m + n)). */ bool BtBranchTracer::nextAlignmentFill( size_t maxiter, SwResult& res, size_t& off, size_t& nrej, size_t& niter, RandomSource& rnd) { assert(prob_.inited()); assert(!emptySolution()); bool result = false; if(trySolutions(false, res, off, nrej, rnd, result)) { return result; } return false; } /** * Get the next valid alignment given the backtrace problem. 
Return false * if there is no valid solution, e.g., if */ bool BtBranchTracer::nextAlignment( size_t maxiter, SwResult& res, size_t& off, size_t& nrej, size_t& niter, RandomSource& rnd) { if(prob_.fill_) { return nextAlignmentFill( maxiter, res, off, nrej, niter, rnd); } else { return nextAlignmentBacktrace( maxiter, res, off, nrej, niter, rnd); } } #ifdef MAIN_ALIGNER_BT #include int main(int argc, char **argv) { size_t off = 0; RandomSource rnd(77); BtBranchTracer tr; Scoring sc = Scoring::base1(); SwResult res; tr.init( "ACGTACGT", // in: read sequence "IIIIIIII", // in: quality sequence 8, // in: read sequence length "ACGTACGT", // in: reference sequence 8, // in: reference sequence length 0, // in: reference id 0, // in: reference offset true, // in: orientation sc, // in: scoring scheme 0, // in: N ceiling 8, // in: alignment score 7, // start in this row 7, // start in this column rnd); // random gen, to choose among equal paths size_t nrej = 0; tr.nextAlignment( res, off, nrej, rnd); } #endif /*def MAIN_ALIGNER_BT*/