hisat-3n/sse_util.h

575 lines
14 KiB
C
Raw Normal View History

2025-01-18 13:09:52 +00:00
/*
* Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
*
* This file is part of Bowtie 2.
*
* Bowtie 2 is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Bowtie 2 is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef SSE_UTIL_H_
#define SSE_UTIL_H_
#include "assert_helpers.h"
#include "ds.h"
#include "limit.h"
#include <iostream>
#include <emmintrin.h>
/**
 * Expandable list of __m128i words whose element storage always starts on a
 * 16-byte boundary.
 *
 * The buffer is obtained with new[] (over-allocated by two elements) and the
 * pointer handed to users (list_) is the first 16-byte-aligned address inside
 * that block.  The pointer originally returned by new[] is remembered in
 * last_alloc_ so the block can be released with delete[].
 *
 * Copying an EList_m128i makes a deep copy; each object owns its own block.
 */
class EList_m128i {
public:

	/**
	 * Construct an empty list in memory category 'cat'.  No memory is
	 * allocated until the list is first grown.
	 */
	explicit EList_m128i(int cat = 0) :
		cat_(cat), last_alloc_(NULL), list_(NULL), sz_(0), cur_(0)
	{
		assert_geq(cat, 0);
	}

	/**
	 * Copy constructor.  Allocates a fresh buffer with the same capacity as
	 * 'o' and copies its first size() elements, so the two objects never
	 * share (and never double-free) a block.
	 */
	EList_m128i(const EList_m128i& o) :
		cat_(o.cat_), last_alloc_(NULL), list_(NULL), sz_(0), cur_(0)
	{
		copyFrom(o);
	}

	/**
	 * Copy assignment.  Releases the current buffer, then deep-copies 'o'
	 * (including its memory category).
	 */
	EList_m128i& operator=(const EList_m128i& o) {
		if(this != &o) {
			free();
			cat_ = o.cat_;
			copyFrom(o);
		}
		return *this;
	}

	/**
	 * Destructor.  Releases the buffer, if any.
	 */
	~EList_m128i() { free(); }

	/**
	 * Return number of elements.
	 */
	inline size_t size() const { return cur_; }

	/**
	 * Return number of elements allocated.
	 */
	inline size_t capacity() const { return sz_; }

	/**
	 * Ensure that there is sufficient capacity to expand to include
	 * 'thresh' more elements without having to expand.
	 */
	inline void ensure(size_t thresh) {
		if(list_ == NULL) lazyInit();
		expandCopy(cur_ + thresh);
	}

	/**
	 * Ensure that there is sufficient capacity to include 'newsz' elements.
	 * If there isn't enough capacity right now, expand capacity to exactly
	 * equal 'newsz'.
	 */
	inline void reserveExact(size_t newsz) {
		if(list_ == NULL) lazyInitExact(newsz);
		expandCopyExact(newsz);
	}

	/**
	 * Return true iff there are no elements.
	 */
	inline bool empty() const { return cur_ == 0; }

	/**
	 * Return true iff list hasn't been initialized yet.
	 */
	inline bool null() const { return list_ == NULL; }

	/**
	 * Set the size to 'sz'.  If that exceeds the current capacity, the
	 * buffer is grown to at least 'sz' elements and the existing elements
	 * are copied over.  Newly exposed elements are not initialized.
	 */
	void resize(size_t sz) {
		if(sz > 0 && list_ == NULL) lazyInit();
		if(sz <= cur_) {
			cur_ = sz;
			return;
		}
		if(sz_ < sz) {
			expandCopy(sz);
		}
		cur_ = sz;
	}

	/**
	 * Zero out the first size() elements.
	 */
	void zero() {
		if(cur_ > 0) {
			memset(list_, 0, cur_ * sizeof(__m128i));
		}
	}

	/**
	 * Set the size to 'sz'.  If that exceeds the current capacity, the
	 * buffer is replaced by one of at least 'sz' elements; the old contents
	 * are NOT carried over.
	 */
	void resizeNoCopy(size_t sz) {
		if(sz > 0 && list_ == NULL) lazyInit();
		if(sz <= cur_) {
			cur_ = sz;
			return;
		}
		if(sz_ < sz) {
			expandNoCopy(sz);
		}
		cur_ = sz;
	}

	/**
	 * Set the size to 'sz'.  If that exceeds the current capacity, the
	 * capacity is grown to exactly 'sz' and existing elements are copied.
	 */
	void resizeExact(size_t sz) {
		if(sz > 0 && list_ == NULL) lazyInitExact(sz);
		if(sz <= cur_) {
			cur_ = sz;
			return;
		}
		if(sz_ < sz) expandCopyExact(sz);
		cur_ = sz;
	}

	/**
	 * Make the list empty.  The buffer is kept for re-use.
	 */
	void clear() {
		cur_ = 0;
	}

	/**
	 * Return a reference to the ith element.
	 */
	inline __m128i& operator[](size_t i) {
		assert_lt(i, cur_);
		return list_[i];
	}

	/**
	 * Return a copy of the ith element.
	 */
	inline __m128i operator[](size_t i) const {
		assert_lt(i, cur_);
		return list_[i];
	}

	/**
	 * Return a reference to the ith element.
	 */
	inline __m128i& get(size_t i) {
		return operator[](i);
	}

	/**
	 * Return a copy of the ith element.
	 */
	inline __m128i get(size_t i) const {
		return operator[](i);
	}

	/**
	 * Return a pointer to the beginning of the (aligned) buffer.
	 */
	__m128i *ptr() { return list_; }

	/**
	 * Return a const pointer to the beginning of the (aligned) buffer.
	 */
	const __m128i *ptr() const { return list_; }

	/**
	 * Return memory category.
	 */
	int cat() const { return cat_; }

private:

	/**
	 * Deep-copy the contents of 'o' into this object.  This object must not
	 * currently own a buffer.  If 'o' has no buffer, this one stays null.
	 */
	void copyFrom(const EList_m128i& o) {
		assert(list_ == NULL);
		if(o.list_ == NULL) return;
		__m128i* raw = NULL;
		__m128i* aligned = alloc(o.sz_, raw);
		for(size_t i = 0; i < o.cur_; i++) {
			aligned[i] = o.list_[i];
		}
		last_alloc_ = raw;
		list_ = aligned;
		sz_ = o.sz_;
		cur_ = o.cur_;
	}

	/**
	 * Allocate the initial buffer, with the current capacity sz_.
	 */
	void lazyInit() {
		assert(list_ == NULL);
		list_ = alloc(sz_, last_alloc_);
	}

	/**
	 * Allocate the initial buffer with exactly 'sz' elements of capacity.
	 */
	void lazyInitExact(size_t sz) {
		assert_gt(sz, 0);
		assert(list_ == NULL);
		// Allocate first so that sz_ is only updated if allocation succeeds
		list_ = alloc(sz, last_alloc_);
		sz_ = sz;
	}

	/**
	 * Allocate room for 'sz' 16-byte-aligned __m128i elements and tally the
	 * allocation into the global memory tally.
	 *
	 * Returns the aligned pointer to use for element access.  'raw' is set
	 * to the pointer returned by new[], which is the one that must later be
	 * passed to delete[].  'raw' is only written if allocation succeeds.
	 *
	 * Prints a message to stderr and rethrows if new[] throws
	 * std::bad_alloc.
	 */
	__m128i *alloc(size_t sz, __m128i*& raw) {
		__m128i* block = NULL;
		try {
			// Two spare elements leave room to round the start address up
			block = new __m128i[sz + 2];
		} catch(std::bad_alloc& e) {
			std::cerr << "Error: Out of memory allocating " << sz << " __m128i's for DP matrix: '" << e.what() << "'" << std::endl;
			throw; // rethrow the original exception object
		}
		__m128i* aligned = block;
		size_t addr = reinterpret_cast<size_t>(block);
		// Align it!
		if((addr & 0xf) != 0) {
			addr += 15;
			addr &= ~static_cast<size_t>(0xf);
			aligned = reinterpret_cast<__m128i*>(addr);
		}
		assert_eq(0, (addr & 0xf)); // should be 16-byte aligned
		assert(aligned != NULL);
		gMemTally.add(cat_, sz);
		raw = block;
		return aligned;
	}

	/**
	 * Release the buffer (if any), remove it from the global memory tally
	 * and return the list to the null, zero-capacity state.
	 */
	void free() {
		if(list_ != NULL) {
			// Must delete what new[] returned, not the aligned pointer
			delete[] last_alloc_;
			gMemTally.del(cat_, sz_);
			last_alloc_ = NULL;
			list_ = NULL;
			sz_ = cur_ = 0;
		}
	}

	/**
	 * Expand the list_ buffer until it has at least 'thresh' elements.
	 * Capacity grows geometrically (roughly doubling each step).  Old
	 * contents are copied into the new buffer using operator=.
	 */
	void expandCopy(size_t thresh) {
		if(thresh <= sz_) return;
		size_t newsz = (sz_ * 2)+1;
		while(newsz < thresh) newsz *= 2;
		expandCopyExact(newsz);
	}

	/**
	 * Expand the list_ buffer until it has exactly 'newsz' elements.  Copy
	 * old contents into new buffer using operator=.
	 */
	void expandCopyExact(size_t newsz) {
		if(newsz <= sz_) return;
		// Keep the new block's raw pointer in a local until the old block
		// has been released; free() deletes whatever last_alloc_ holds.
		__m128i* newRaw = NULL;
		__m128i* newList = alloc(newsz, newRaw);
		assert(newList != NULL);
		const size_t cur = cur_;
		if(list_ != NULL) {
			for(size_t i = 0; i < cur; i++) {
				// Note: operator= is used
				newList[i] = list_[i];
			}
			free(); // releases the old block; also zeroes sz_ and cur_
		}
		last_alloc_ = newRaw;
		list_ = newList;
		sz_ = newsz;
		cur_ = cur;
	}

	/**
	 * Expand the list_ buffer until it has at least 'thresh' elements.
	 * Capacity grows geometrically (roughly doubling each step).  Old
	 * contents are not copied into the new buffer.
	 */
	void expandNoCopy(size_t thresh) {
		assert(list_ != NULL);
		if(thresh <= sz_) return;
		size_t newsz = (sz_ * 2)+1;
		while(newsz < thresh) newsz *= 2;
		expandNoCopyExact(newsz);
	}

	/**
	 * Replace the list_ buffer with one of exactly 'newsz' elements.  Old
	 * contents are discarded and the size is reset to 0.
	 */
	void expandNoCopyExact(size_t newsz) {
		assert(list_ != NULL);
		assert_gt(newsz, 0);
		free();
		list_ = alloc(newsz, last_alloc_);
		assert(list_ != NULL);
		sz_ = newsz;
		assert_gt(sz_, 0);
	}

	int      cat_;        // memory category, for accounting purposes
	__m128i* last_alloc_; // what new[] originally returned; passed to delete[]
	__m128i *list_;       // list ptr, aligned version of what new[] returns
	size_t   sz_;         // capacity
	size_t   cur_;        // occupancy (AKA size)
};
/**
 * Four 16-bit checkpoint scores stored together.  Checkpointer indexes the
 * sc array with its 'hef' argument when reading back a checkpointed cell.
 */
struct CpQuad {

	/**
	 * Construct with all four scores set to 0.
	 */
	CpQuad() { reset(); }

	/**
	 * Set all four scores to 0.
	 */
	void reset() {
		for(size_t i = 0; i < 4; i++) {
			sc[i] = 0;
		}
	}

	/**
	 * Return true iff all four scores equal those of 'o'.
	 */
	bool operator==(const CpQuad& o) const {
		for(size_t i = 0; i < 4; i++) {
			if(sc[i] != o.sc[i]) return false;
		}
		return true;
	}

	int16_t sc[4]; // the four scores
};
/**
* Encapsulates a collection of checkpoints. Assumes the scheme is to
* checkpoint adjacent pairs of anti-diagonals.
*/
class Checkpointer {
public:
Checkpointer() { reset(); }
/**
* Set the checkpointer up for a new rectangle.
*/
void init(
size_t nrow, // # of rows
size_t ncol, // # of columns
size_t perpow2, // checkpoint every 1 << perpow2 diags (& next)
int64_t perfectScore, // what is a perfect score? for sanity checks
bool is8, // 8-bit?
bool doTri, // triangle shaped?
bool local, // is alignment local? for sanity checks
bool debug) // gather debug checkpoints?
{
assert_gt(perpow2, 0);
nrow_ = nrow;
ncol_ = ncol;
perpow2_ = perpow2;
per_ = 1 << perpow2;
lomask_ = ~(0xffffffff << perpow2);
perf_ = perfectScore;
local_ = local;
ndiag_ = (ncol + nrow - 1 + 1) / per_;
locol_ = MAX_SIZE_T;
hicol_ = MIN_SIZE_T;
// debug_ = debug;
debug_ = true;
commitMap_.clear();
firstCommit_ = true;
size_t perword = (is8 ? 16 : 8);
is8_ = is8;
niter_ = ((nrow_ + perword - 1) / perword);
if(doTri) {
// Save a pair of anti-diagonals every per_ anti-diagonals for
// backtrace purposes
qdiag1s_.resize(ndiag_ * nrow_);
qdiag2s_.resize(ndiag_ * nrow_);
} else {
// Save every per_ columns and rows for backtrace purposes
qrows_.resize((nrow_ / per_) * ncol_);
qcols_.resize((ncol_ / per_) * (niter_ << 2));
}
if(debug_) {
// Save all columns for debug purposes
qcolsD_.resize(ncol_ * (niter_ << 2));
}
}
/**
* Return true iff we've been collecting debug cells.
*/
bool debug() const { return debug_; }
/**
* Check whether the given score matches the saved score at row, col, hef.
*/
int64_t debugCell(size_t row, size_t col, int hef) const {
assert(debug_);
const __m128i* ptr = qcolsD_.ptr() + hef;
// Fast forward to appropriate column
ptr += ((col * niter_) << 2);
size_t mod = row % niter_; // which m128i
size_t div = row / niter_; // offset into m128i
// Fast forward to appropriate word
ptr += (mod << 2);
// Extract score
int16_t sc = (is8_ ? ((uint8_t*)ptr)[div] : ((int16_t*)ptr)[div]);
int64_t asc = MIN_I64;
// Convert score
if(is8_) {
if(local_) {
asc = sc;
} else {
if(sc == 0) asc = MIN_I64;
else asc = sc - 0xff;
}
} else {
if(local_) {
asc = sc + 0x8000;
} else {
if(sc != MIN_I16) asc = sc - 0x7fff;
}
}
return asc;
}
/**
* Return true iff the given row/col is checkpointed.
*/
bool isCheckpointed(size_t row, size_t col) const {
assert_leq(col, hicol_);
assert_geq(col, locol_);
size_t mod = (row + col) & lomask_;
assert_lt(mod, per_);
return mod >= per_ - 2;
}
/**
* Return the checkpointed H, E, or F score from the given cell.
*/
inline int64_t scoreTriangle(size_t row, size_t col, int hef) const {
assert(isCheckpointed(row, col));
bool diag1 = ((row + col) & lomask_) == per_ - 2;
size_t off = (row + col) >> perpow2_;
if(diag1) {
if(qdiag1s_[off * nrow_ + row].sc[hef] == MIN_I16) {
return MIN_I64;
} else {
return qdiag1s_[off * nrow_ + row].sc[hef];
}
} else {
if(qdiag2s_[off * nrow_ + row].sc[hef] == MIN_I16) {
return MIN_I64;
} else {
return qdiag2s_[off * nrow_ + row].sc[hef];
}
}
}
/**
* Return the checkpointed H, E, or F score from the given cell.
*/
inline int64_t scoreSquare(size_t row, size_t col, int hef) const {
// Is it in a checkpointed row? Note that checkpointed rows don't
// necessarily have the horizontal contributions calculated, so we want
// to use the column info in that case.
if((row & lomask_) == lomask_ && hef != 1) {
int64_t sc = qrows_[(row >> perpow2_) * ncol_ + col].sc[hef];
if(sc == MIN_I16) return MIN_I64;
return sc;
}
hef--;
if(hef == -1) hef = 2;
// It must be in a checkpointed column
assert_eq(lomask_, (col & lomask_));
// Fast forward to appropriate column
const __m128i* ptr = qcols_.ptr() + hef;
ptr += (((col >> perpow2_) * niter_) << 2);
size_t mod = row % niter_; // which m128i
size_t div = row / niter_; // offset into m128i
// Fast forward to appropriate word
ptr += (mod << 2);
// Extract score
int16_t sc = (is8_ ? ((uint8_t*)ptr)[div] : ((int16_t*)ptr)[div]);
int64_t asc = MIN_I64;
// Convert score
if(is8_) {
if(local_) {
asc = sc;
} else {
if(sc == 0) asc = MIN_I64;
else asc = sc - 0xff;
}
} else {
if(local_) {
asc = sc + 0x8000;
} else {
if(sc != MIN_I16) asc = sc - 0x7fff;
}
}
return asc;
}
/**
* Given a column of filled-in cells, save the checkpointed cells in cs_.
*/
void commitCol(__m128i *pvH, __m128i *pvE, __m128i *pvF, size_t coli);
/**
* Reset the state of the Checkpointer.
*/
void reset() {
perpow2_ = per_ = lomask_ = nrow_ = ncol_ = 0;
local_ = false;
niter_ = ndiag_ = locol_ = hicol_ = 0;
perf_ = 0;
firstCommit_ = true;
is8_ = debug_ = false;
}
/**
* Return true iff the Checkpointer has been initialized.
*/
bool inited() const {
return nrow_ > 0;
}
size_t per() const { return per_; }
size_t perpow2() const { return perpow2_; }
size_t lomask() const { return lomask_; }
size_t locol() const { return locol_; }
size_t hicol() const { return hicol_; }
size_t nrow() const { return nrow_; }
size_t ncol() const { return ncol_; }
const CpQuad* qdiag1sPtr() const { return qdiag1s_.ptr(); }
const CpQuad* qdiag2sPtr() const { return qdiag2s_.ptr(); }
size_t perpow2_; // 1 << perpow2_ - 2 is the # of uncheckpointed
// anti-diags between checkpointed anti-diag pairs
size_t per_; // 1 << perpow2_
size_t lomask_; // mask for extracting low bits
size_t nrow_; // # rows in current rectangle
size_t ncol_; // # cols in current rectangle
int64_t perf_; // perfect score
bool local_; // local alignment?
size_t ndiag_; // # of double-diags
size_t locol_; // leftmost column committed
size_t hicol_; // rightmost column committed
// Map for committing scores from vector columns to checkpointed diagonals
EList<size_t> commitMap_;
bool firstCommit_;
EList<CpQuad> qdiag1s_; // checkpoint H/E/F values for diagonal 1
EList<CpQuad> qdiag2s_; // checkpoint H/E/F values for diagonal 2
EList<CpQuad> qrows_; // checkpoint H/E/F values for rows
// We store columns in this way to reduce overhead of populating them
bool is8_; // true -> fill used 8-bit cells
size_t niter_; // # __m128i words per column
EList_m128i qcols_; // checkpoint E/F/H values for select columns
bool debug_; // get debug checkpoints? (i.e. fill qcolsD_?)
EList_m128i qcolsD_; // checkpoint E/F/H values for all columns (debug)
};
#endif