/* * Copyright 2011, Ben Langmead * * This file is part of Bowtie 2. * * Bowtie 2 is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Bowtie 2 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Bowtie 2. If not, see . */ #ifndef SSE_UTIL_H_ #define SSE_UTIL_H_ #include "assert_helpers.h" #include "ds.h" #include "limit.h" #include #include class EList_m128i { public: /** * Allocate initial default of S elements. */ explicit EList_m128i(int cat = 0) : cat_(cat), last_alloc_(NULL), list_(NULL), sz_(0), cur_(0) { assert_geq(cat, 0); } /** * Destructor. */ ~EList_m128i() { free(); } /** * Return number of elements. */ inline size_t size() const { return cur_; } /** * Return number of elements allocated. */ inline size_t capacity() const { return sz_; } /** * Ensure that there is sufficient capacity to expand to include * 'thresh' more elements without having to expand. */ inline void ensure(size_t thresh) { if(list_ == NULL) lazyInit(); expandCopy(cur_ + thresh); } /** * Ensure that there is sufficient capacity to include 'newsz' elements. * If there isn't enough capacity right now, expand capacity to exactly * equal 'newsz'. */ inline void reserveExact(size_t newsz) { if(list_ == NULL) lazyInitExact(newsz); expandCopyExact(newsz); } /** * Return true iff there are no elements. */ inline bool empty() const { return cur_ == 0; } /** * Return true iff list hasn't been initialized yet. */ inline bool null() const { return list_ == NULL; } /** * If size is less than requested size, resize up to at least sz * and set cur_ to requested sz. */ void resize(size_t sz) { if(sz > 0 && list_ == NULL) lazyInit(); if(sz <= cur_) { cur_ = sz; return; } if(sz_ < sz) { expandCopy(sz); } cur_ = sz; } /** * Zero out contents of vector. */ void zero() { if(cur_ > 0) { memset(list_, 0, cur_ * sizeof(__m128i)); } } /** * If size is less than requested size, resize up to at least sz * and set cur_ to requested sz. Do not copy the elements over. */ void resizeNoCopy(size_t sz) { if(sz > 0 && list_ == NULL) lazyInit(); if(sz <= cur_) { cur_ = sz; return; } if(sz_ < sz) { expandNoCopy(sz); } cur_ = sz; } /** * If size is less than requested size, resize up to exactly sz and set * cur_ to requested sz. */ void resizeExact(size_t sz) { if(sz > 0 && list_ == NULL) lazyInitExact(sz); if(sz <= cur_) { cur_ = sz; return; } if(sz_ < sz) expandCopyExact(sz); cur_ = sz; } /** * Make the stack empty. */ void clear() { cur_ = 0; // re-use stack memory // Don't clear heap; re-use it } /** * Return a reference to the ith element. */ inline __m128i& operator[](size_t i) { assert_lt(i, cur_); return list_[i]; } /** * Return a reference to the ith element. */ inline __m128i operator[](size_t i) const { assert_lt(i, cur_); return list_[i]; } /** * Return a reference to the ith element. */ inline __m128i& get(size_t i) { return operator[](i); } /** * Return a reference to the ith element. */ inline __m128i get(size_t i) const { return operator[](i); } /** * Return a pointer to the beginning of the buffer. */ __m128i *ptr() { return list_; } /** * Return a const pointer to the beginning of the buffer. */ const __m128i *ptr() const { return list_; } /** * Return memory category. */ int cat() const { return cat_; } private: /** * Initialize memory for EList. */ void lazyInit() { assert(list_ == NULL); list_ = alloc(sz_); } /** * Initialize exactly the prescribed number of elements for EList. */ void lazyInitExact(size_t sz) { assert_gt(sz, 0); assert(list_ == NULL); sz_ = sz; list_ = alloc(sz); } /** * Allocate a T array of length sz_ and store in list_. Also, * tally into the global memory tally. */ __m128i *alloc(size_t sz) { __m128i* last_alloc_; try { last_alloc_ = new __m128i[sz + 2]; } catch(std::bad_alloc& e) { std::cerr << "Error: Out of memory allocating " << sz << " __m128i's for DP matrix: '" << e.what() << "'" << std::endl; throw e; } __m128i* tmp = last_alloc_; size_t tmpint = (size_t)tmp; // Align it! if((tmpint & 0xf) != 0) { tmpint += 15; tmpint &= (~0xf); tmp = reinterpret_cast<__m128i*>(tmpint); } assert_eq(0, (tmpint & 0xf)); // should be 16-byte aligned assert(tmp != NULL); gMemTally.add(cat_, sz); return tmp; } /** * Allocate a T array of length sz_ and store in list_. Also, * tally into the global memory tally. */ void free() { if(list_ != NULL) { delete[] last_alloc_; gMemTally.del(cat_, sz_); list_ = NULL; sz_ = cur_ = 0; } } /** * Expand the list_ buffer until it has at least 'thresh' elements. Size * increases quadratically with number of expansions. Copy old contents * into new buffer using operator=. */ void expandCopy(size_t thresh) { if(thresh <= sz_) return; size_t newsz = (sz_ * 2)+1; while(newsz < thresh) newsz *= 2; expandCopyExact(newsz); } /** * Expand the list_ buffer until it has exactly 'newsz' elements. Copy * old contents into new buffer using operator=. */ void expandCopyExact(size_t newsz) { if(newsz <= sz_) return; __m128i* tmp = alloc(newsz); assert(tmp != NULL); size_t cur = cur_; if(list_ != NULL) { for(size_t i = 0; i < cur_; i++) { // Note: operator= is used tmp[i] = list_[i]; } free(); } list_ = tmp; sz_ = newsz; cur_ = cur; } /** * Expand the list_ buffer until it has at least 'thresh' elements. * Size increases quadratically with number of expansions. Don't copy old * contents into the new buffer. */ void expandNoCopy(size_t thresh) { assert(list_ != NULL); if(thresh <= sz_) return; size_t newsz = (sz_ * 2)+1; while(newsz < thresh) newsz *= 2; expandNoCopyExact(newsz); } /** * Expand the list_ buffer until it has exactly 'newsz' elements. Don't * copy old contents into the new buffer. */ void expandNoCopyExact(size_t newsz) { assert(list_ != NULL); assert_gt(newsz, 0); free(); __m128i* tmp = alloc(newsz); assert(tmp != NULL); list_ = tmp; sz_ = newsz; assert_gt(sz_, 0); } int cat_; // memory category, for accounting purposes __m128i* last_alloc_; // what new[] originally returns __m128i *list_; // list ptr, aligned version of what new[] returns size_t sz_; // capacity size_t cur_; // occupancy (AKA size) }; struct CpQuad { CpQuad() { reset(); } void reset() { sc[0] = sc[1] = sc[2] = sc[3] = 0; } bool operator==(const CpQuad& o) const { return sc[0] == o.sc[0] && sc[1] == o.sc[1] && sc[2] == o.sc[2] && sc[3] == o.sc[3]; } int16_t sc[4]; }; /** * Encapsulates a collection of checkpoints. Assumes the scheme is to * checkpoint adjacent pairs of anti-diagonals. */ class Checkpointer { public: Checkpointer() { reset(); } /** * Set the checkpointer up for a new rectangle. */ void init( size_t nrow, // # of rows size_t ncol, // # of columns size_t perpow2, // checkpoint every 1 << perpow2 diags (& next) int64_t perfectScore, // what is a perfect score? for sanity checks bool is8, // 8-bit? bool doTri, // triangle shaped? bool local, // is alignment local? for sanity checks bool debug) // gather debug checkpoints? { assert_gt(perpow2, 0); nrow_ = nrow; ncol_ = ncol; perpow2_ = perpow2; per_ = 1 << perpow2; lomask_ = ~(0xffffffff << perpow2); perf_ = perfectScore; local_ = local; ndiag_ = (ncol + nrow - 1 + 1) / per_; locol_ = MAX_SIZE_T; hicol_ = MIN_SIZE_T; // debug_ = debug; debug_ = true; commitMap_.clear(); firstCommit_ = true; size_t perword = (is8 ? 16 : 8); is8_ = is8; niter_ = ((nrow_ + perword - 1) / perword); if(doTri) { // Save a pair of anti-diagonals every per_ anti-diagonals for // backtrace purposes qdiag1s_.resize(ndiag_ * nrow_); qdiag2s_.resize(ndiag_ * nrow_); } else { // Save every per_ columns and rows for backtrace purposes qrows_.resize((nrow_ / per_) * ncol_); qcols_.resize((ncol_ / per_) * (niter_ << 2)); } if(debug_) { // Save all columns for debug purposes qcolsD_.resize(ncol_ * (niter_ << 2)); } } /** * Return true iff we've been collecting debug cells. */ bool debug() const { return debug_; } /** * Check whether the given score matches the saved score at row, col, hef. */ int64_t debugCell(size_t row, size_t col, int hef) const { assert(debug_); const __m128i* ptr = qcolsD_.ptr() + hef; // Fast forward to appropriate column ptr += ((col * niter_) << 2); size_t mod = row % niter_; // which m128i size_t div = row / niter_; // offset into m128i // Fast forward to appropriate word ptr += (mod << 2); // Extract score int16_t sc = (is8_ ? ((uint8_t*)ptr)[div] : ((int16_t*)ptr)[div]); int64_t asc = MIN_I64; // Convert score if(is8_) { if(local_) { asc = sc; } else { if(sc == 0) asc = MIN_I64; else asc = sc - 0xff; } } else { if(local_) { asc = sc + 0x8000; } else { if(sc != MIN_I16) asc = sc - 0x7fff; } } return asc; } /** * Return true iff the given row/col is checkpointed. */ bool isCheckpointed(size_t row, size_t col) const { assert_leq(col, hicol_); assert_geq(col, locol_); size_t mod = (row + col) & lomask_; assert_lt(mod, per_); return mod >= per_ - 2; } /** * Return the checkpointed H, E, or F score from the given cell. */ inline int64_t scoreTriangle(size_t row, size_t col, int hef) const { assert(isCheckpointed(row, col)); bool diag1 = ((row + col) & lomask_) == per_ - 2; size_t off = (row + col) >> perpow2_; if(diag1) { if(qdiag1s_[off * nrow_ + row].sc[hef] == MIN_I16) { return MIN_I64; } else { return qdiag1s_[off * nrow_ + row].sc[hef]; } } else { if(qdiag2s_[off * nrow_ + row].sc[hef] == MIN_I16) { return MIN_I64; } else { return qdiag2s_[off * nrow_ + row].sc[hef]; } } } /** * Return the checkpointed H, E, or F score from the given cell. */ inline int64_t scoreSquare(size_t row, size_t col, int hef) const { // Is it in a checkpointed row? Note that checkpointed rows don't // necessarily have the horizontal contributions calculated, so we want // to use the column info in that case. if((row & lomask_) == lomask_ && hef != 1) { int64_t sc = qrows_[(row >> perpow2_) * ncol_ + col].sc[hef]; if(sc == MIN_I16) return MIN_I64; return sc; } hef--; if(hef == -1) hef = 2; // It must be in a checkpointed column assert_eq(lomask_, (col & lomask_)); // Fast forward to appropriate column const __m128i* ptr = qcols_.ptr() + hef; ptr += (((col >> perpow2_) * niter_) << 2); size_t mod = row % niter_; // which m128i size_t div = row / niter_; // offset into m128i // Fast forward to appropriate word ptr += (mod << 2); // Extract score int16_t sc = (is8_ ? ((uint8_t*)ptr)[div] : ((int16_t*)ptr)[div]); int64_t asc = MIN_I64; // Convert score if(is8_) { if(local_) { asc = sc; } else { if(sc == 0) asc = MIN_I64; else asc = sc - 0xff; } } else { if(local_) { asc = sc + 0x8000; } else { if(sc != MIN_I16) asc = sc - 0x7fff; } } return asc; } /** * Given a column of filled-in cells, save the checkpointed cells in cs_. */ void commitCol(__m128i *pvH, __m128i *pvE, __m128i *pvF, size_t coli); /** * Reset the state of the Checkpointer. */ void reset() { perpow2_ = per_ = lomask_ = nrow_ = ncol_ = 0; local_ = false; niter_ = ndiag_ = locol_ = hicol_ = 0; perf_ = 0; firstCommit_ = true; is8_ = debug_ = false; } /** * Return true iff the Checkpointer has been initialized. */ bool inited() const { return nrow_ > 0; } size_t per() const { return per_; } size_t perpow2() const { return perpow2_; } size_t lomask() const { return lomask_; } size_t locol() const { return locol_; } size_t hicol() const { return hicol_; } size_t nrow() const { return nrow_; } size_t ncol() const { return ncol_; } const CpQuad* qdiag1sPtr() const { return qdiag1s_.ptr(); } const CpQuad* qdiag2sPtr() const { return qdiag2s_.ptr(); } size_t perpow2_; // 1 << perpow2_ - 2 is the # of uncheckpointed // anti-diags between checkpointed anti-diag pairs size_t per_; // 1 << perpow2_ size_t lomask_; // mask for extracting low bits size_t nrow_; // # rows in current rectangle size_t ncol_; // # cols in current rectangle int64_t perf_; // perfect score bool local_; // local alignment? size_t ndiag_; // # of double-diags size_t locol_; // leftmost column committed size_t hicol_; // rightmost column committed // Map for committing scores from vector columns to checkpointed diagonals EList commitMap_; bool firstCommit_; EList qdiag1s_; // checkpoint H/E/F values for diagonal 1 EList qdiag2s_; // checkpoint H/E/F values for diagonal 2 EList qrows_; // checkpoint H/E/F values for rows // We store columns in this way to reduce overhead of populating them bool is8_; // true -> fill used 8-bit cells size_t niter_; // # __m128i words per column EList_m128i qcols_; // checkpoint E/F/H values for select columns bool debug_; // get debug checkpoints? (i.e. fill qcolsD_?) EList_m128i qcolsD_; // checkpoint E/F/H values for all columns (debug) }; #endif