hisat-3n/filebuf.h
2025-01-18 21:09:52 +08:00

719 lines
16 KiB
C++

/*
* Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
*
* This file is part of Bowtie 2.
*
* Bowtie 2 is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Bowtie 2 is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef FILEBUF_H_
#define FILEBUF_H_
#include <iostream>
#include <fstream>
#include <string>
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <stdexcept>
#include "assert_helpers.h"
/**
* Simple, fast helper for determining if a character is a newline.
*/
static inline bool isnewline(int c) {
	// Carriage return and line feed are the only end-of-line characters.
	switch(c) {
		case '\n':
		case '\r':
			return true;
		default:
			return false;
	}
}
/**
* Simple, fast helper for determining if a character is a non-newline
* whitespace character.
*/
static inline bool isspace_notnl(int c) {
	// Newlines count as whitespace for isspace(), so rule them out first.
	if(isnewline(c)) {
		return false;
	}
	return isspace(c) != 0;
}
/**
* Simple wrapper for a FILE*, istream or ifstream that reads it in chunks
* using fread and keeps those chunks in a buffer. It also services calls to
* get(), peek() and gets() from the buffer, reading in additional chunks when
* necessary.
*
* Helper functions do things like parse strings, numbers, and FASTA records.
*
*
*/
class FileBuf {
public:
	/**
	 * Create a buffer with no stream attached.  Attach one with newFile()
	 * before reading.
	 */
	FileBuf() {
		init();
	}

	/**
	 * Create a buffer that reads from the C stream 'in'.
	 */
	FileBuf(FILE *in) {
		init();
		_in = in;
		assert(_in != NULL);
	}

	/**
	 * Create a buffer that reads from the ifstream 'inf'.
	 */
	FileBuf(std::ifstream *inf) {
		init();
		_inf = inf;
		assert(_inf != NULL);
	}

	/**
	 * Create a buffer that reads from the istream 'ins'.
	 */
	FileBuf(std::istream *ins) {
		init();
		_ins = ins;
		assert(_ins != NULL);
	}

	/**
	 * Return true iff there is a stream ready to read.
	 */
	bool isOpen() {
		return _in != NULL || _inf != NULL || _ins != NULL;
	}

	/**
	 * Close the input stream (if that's possible) and detach from it.
	 * Detaching makes a repeated close() a no-op instead of a second
	 * fclose() on the same FILE*, and makes isOpen() report false.
	 * stdin and plain istreams can't be closed from here and stay
	 * attached.
	 */
	void close() {
		if(_in != NULL && _in != stdin) {
			fclose(_in);
			_in = NULL;
		} else if(_inf != NULL) {
			_inf->close();
			_inf = NULL;
		} else {
			// can't close stdin or _ins
		}
	}

	/**
	 * Get the next character of input and advance.  Returns -1 once the
	 * input is exhausted.  Each character handed out is also recorded in
	 * the last-N-chars buffer until that buffer is full.
	 */
	int get() {
		assert(_in != NULL || _inf != NULL || _ins != NULL);
		int c = peek();
		if(c != -1) {
			_cur++;
			if(_lastn_cur < LASTN_BUF_SZ) _lastn_buf[_lastn_cur++] = (char)c;
		}
		return c;
	}

	/**
	 * Return true iff all input is exhausted.
	 */
	bool eof() {
		return (_cur == _buf_sz) && _done;
	}

	/**
	 * Initialize the buffer with a new C-style file.
	 */
	void newFile(FILE *in) {
		_in = in;
		_inf = NULL;
		_ins = NULL;
		rewindBuffer();
	}

	/**
	 * Initialize the buffer with a new ifstream.
	 */
	void newFile(std::ifstream *inf) {
		_in = NULL;
		_inf = inf;
		_ins = NULL;
		rewindBuffer();
	}

	/**
	 * Initialize the buffer with a new istream.
	 */
	void newFile(std::istream *ins) {
		_in = NULL;
		_inf = NULL;
		_ins = ins;
		rewindBuffer();
	}

	/**
	 * Restore state as though we just started reading the input
	 * stream.  If no stream is attached only the buffer state is
	 * reset.  The last-N-chars buffer is left alone; use resetLastN()
	 * for that.
	 */
	void reset() {
		if(_inf != NULL) {
			_inf->clear();
			_inf->seekg(0, std::ios::beg);
		} else if(_ins != NULL) {
			_ins->clear();
			_ins->seekg(0, std::ios::beg);
		} else if(_in != NULL) {
			rewind(_in);
		}
		rewindBuffer();
	}

	/**
	 * Peek at the next character of the input stream without
	 * advancing.  Typically we can simply read it from the buffer.
	 * Occasionally we'll need to read in a new buffer's worth of data.
	 * Returns -1 once the input is exhausted.
	 */
	int peek() {
		assert(_in != NULL || _inf != NULL || _ins != NULL);
		assert_leq(_cur, _buf_sz);
		if(_cur == _buf_sz) {
			if(_done) {
				// We already exhausted the input stream
				return -1;
			}
			// Read a new buffer's worth of data
			if(_inf != NULL) {
				_inf->read((char*)_buf, BUF_SZ);
				_buf_sz = (size_t)_inf->gcount();
			} else if(_ins != NULL) {
				_ins->read((char*)_buf, BUF_SZ);
				_buf_sz = (size_t)_ins->gcount();
			} else if(_in != NULL) {
				_buf_sz = fread(_buf, 1, BUF_SZ, _in);
			} else {
				// No stream attached (e.g. after close()); report
				// end-of-input rather than reading through NULL
				_buf_sz = 0;
			}
			_cur = 0;
			if(_buf_sz == 0) {
				// Exhausted, and we have nothing to return to the
				// caller
				_done = true;
				return -1;
			} else if(_buf_sz < BUF_SZ) {
				// A short read is taken to mean this was the last
				// chunk
				_done = true;
			}
		}
		return (int)_buf[_cur];
	}

	/**
	 * Store a string of characters from the input file into 'buf',
	 * until we see a newline, EOF, or until 'len'-1 characters have been
	 * stored.  'buf' is always 0-terminated and the number of characters
	 * stored (not counting the terminator) is returned.
	 *
	 * If a newline ends the string, the whole run of \r's and \n's is
	 * consumed.  If the buffer fills up first, the remaining characters
	 * of the line are left in the stream for the next call; none are
	 * discarded.  When 'len' is 0 nothing is read or written.
	 */
	size_t gets(char *buf, size_t len) {
		if(len == 0) {
			// No room even for the terminator
			return 0;
		}
		size_t stored = 0;
		while(true) {
			int c = peek();
			if(c == -1) {
				// End-of-file
				break;
			}
			if(isnewline(c)) {
				// End of string; skip over all end-of-line characters
				// so the next get() is after them
				do {
					get();
					c = peek();
				} while(isnewline(c));
				break;
			}
			if(stored == len-1) {
				// Out of room; leave c unread for the next call
				break;
			}
			get();
			buf[stored++] = (char)c;
		}
		buf[stored] = '\0';
		return stored;
	}

	/**
	 * Copy up to 'len' characters from the input into 'buf', newlines
	 * included, stopping early only at EOF.  'buf' is not 0-terminated.
	 * Returns the number of characters copied.
	 */
	size_t get(char *buf, size_t len) {
		for(size_t i = 0; i < len; i++) {
			int c = get();
			if(c == -1) return i;
			buf[i] = (char)c;
		}
		return len;
	}

	static const size_t LASTN_BUF_SZ = 8 * 1024;

	/**
	 * Keep get()ing characters until a non-whitespace character (or
	 * -1) is reached, and return it.
	 */
	int getPastWhitespace() {
		int c;
		while(isspace(c = get()) && c != -1);
		return c;
	}

	/**
	 * Keep get()ing characters until we've passed over the next
	 * string of newline characters (\r's and \n's) or -1 is reached.
	 * Returns the first character after the newlines (already
	 * consumed), or -1.
	 */
	int getPastNewline() {
		int c = get();
		while(!isnewline(c) && c != -1) c = get();
		while(isnewline(c)) c = get();
		assert_neq(c, '\r');
		assert_neq(c, '\n');
		return c;
	}

	/**
	 * Like getPastNewline(), except that the first character is
	 * inspected with peek() before being consumed.  Despite the name,
	 * the character returned has already been consumed by get().
	 */
	int peekPastNewline() {
		int c = peek();
		while(!isnewline(c) && c != -1) c = get();
		while(isnewline(c)) c = get();
		assert_neq(c, '\r');
		assert_neq(c, '\n');
		return c;
	}

	/**
	 * Keep peek()ing then get()ing characters until the next return
	 * from peek() is just after the last newline of the line.  Returns
	 * that not-yet-consumed character, or -1.
	 */
	int peekUptoNewline() {
		int c = peek();
		while(!isnewline(c) && c != -1) {
			get(); c = peek();
		}
		while(isnewline(c)) {
			get();
			c = peek();
		}
		assert_neq(c, '\r');
		assert_neq(c, '\n');
		return c;
	}

	/**
	 * Parse a FASTA record.  Name characters are stored into 'name' and
	 * sequence characters into 'seq' through operator[], starting at
	 * index 0; the string types are presumably expected to make room
	 * for that (TODO confirm against callers).  If gotCaret is true,
	 * assume the file cursor has already moved just past the starting
	 * '>' character.
	 */
	template <typename TNameStr, typename TSeqStr>
	void parseFastaRecord(
		TNameStr& name,
		TSeqStr&  seq,
		bool      gotCaret = false)
	{
		int c = skipFastaLead(gotCaret);
		size_t namecur = 0, seqcur = 0;
		// c is the first character of the fasta name record, or is the first
		// newline character if the name record is empty
		while(!isnewline(c) && c != -1) {
			name[namecur++] = c; get(); c = peek();
		}
		// sequence consists of all the non-whitespace characters between here
		// and the next caret
		while(true) {
			// skip over whitespace
			while(isspace(c)) { get(); c = peek(); }
			// if we see caret or EOF, break
			if(c == '>' || c == -1) break;
			// append and continue
			seq[seqcur++] = c;
			get(); c = peek();
		}
	}

	/**
	 * Parse a FASTA record and report the length of its name in
	 * 'nameLen' and of its sequence in 'seqLen'.  If gotCaret is true,
	 * assume the file cursor has already moved just past the starting
	 * '>' character.
	 */
	void parseFastaRecordLength(
		size_t& nameLen,
		size_t& seqLen,
		bool    gotCaret = false)
	{
		nameLen = seqLen = 0;
		int c = skipFastaLead(gotCaret);
		// c is the first character of the fasta name record, or is the first
		// newline character if the name record is empty
		while(!isnewline(c) && c != -1) {
			nameLen++; get(); c = peek();
		}
		// sequence consists of all the non-whitespace characters between here
		// and the next caret
		while(true) {
			// skip over whitespace
			while(isspace(c)) { get(); c = peek(); }
			// if we see caret or EOF, break
			if(c == '>' || c == -1) break;
			// count and continue
			seqLen++;
			get(); c = peek();
		}
	}

	/**
	 * Reset to the beginning of the last-N-chars buffer.
	 */
	void resetLastN() {
		_lastn_cur = 0;
	}

	/**
	 * Copy the characters in the last-N-chars buffer (those recorded
	 * since the last reset) into the provided buffer and return how
	 * many were copied.  'buf' must have room for up to LASTN_BUF_SZ
	 * characters.
	 */
	size_t copyLastN(char *buf) {
		memcpy(buf, _lastn_buf, _lastn_cur);
		return _lastn_cur;
	}

	/**
	 * Get const pointer to the last-N-chars buffer.
	 */
	const char *lastN() const {
		return _lastn_buf;
	}

	/**
	 * Get current size of the last-N-chars buffer.
	 */
	size_t lastNLen() const {
		return _lastn_cur;
	}

private:

	/**
	 * Put every member into its initial, no-stream-attached state.
	 */
	void init() {
		_in = NULL;
		_inf = NULL;
		_ins = NULL;
		_cur = _buf_sz = BUF_SZ;
		_done = false;
		_lastn_cur = 0;
		// no need to clear _buf[]
	}

	/**
	 * Mark the chunk buffer as fully consumed so that the next peek()
	 * reads a fresh chunk from the stream.
	 */
	void rewindBuffer() {
		_cur = BUF_SZ;
		_buf_sz = BUF_SZ;
		_done = false;
	}

	/**
	 * Skip the non-newline whitespace in front of a FASTA name and, when
	 * gotCaret is false, any '>' characters as well.  Returns the next
	 * character without consuming it.
	 */
	int skipFastaLead(bool gotCaret) {
		int c = peek();
		while(isspace_notnl(c) || (!gotCaret && c == '>')) {
			get();
			c = peek();
		}
		return c;
	}

	static const size_t BUF_SZ = 256 * 1024;
	FILE          *_in;   // C stream, or NULL
	std::ifstream *_inf;  // ifstream, or NULL
	std::istream  *_ins;  // istream, or NULL
	size_t         _cur;    // offset of the next character within _buf
	size_t         _buf_sz; // number of valid characters in _buf
	bool           _done;   // true once the stream has no more chunks
	uint8_t        _buf[BUF_SZ]; // (large) input buffer
	size_t         _lastn_cur;   // number of characters held in _lastn_buf
	char           _lastn_buf[LASTN_BUF_SZ]; // buffer of the last N chars dispensed
};
/**
* Wrapper for a buffered output stream that writes bitpairs.
*/
class BitpairOutFileBuf {
public:
	/**
	 * Open a new output stream to a file with given name.  If the file
	 * can't be opened, a message is printed to stderr and the int 1 is
	 * thrown.
	 */
	BitpairOutFileBuf(const char *in) : bpPtr_(0), cur_(0) {
		assert(in != NULL);
		out_ = fopen(in, "wb");
		if(out_ == NULL) {
			std::cerr << "Error: Could not open bitpair-output file " << in << std::endl;
			throw 1;
		}
		memset(buf_, 0, BUF_SZ);
	}

	/**
	 * Write a single bitpair (a value in [0, 3]) into the buf.  Four
	 * bitpairs are packed per octet, the first one in the two lowest
	 * bits.  Flush the buffer if it's full; a failed flush prints a
	 * message to stderr and throws the int 1.
	 */
	void write(int bp) {
		assert(out_ != NULL);
		assert(bp >= 0 && bp < 4);
		buf_[cur_] = (unsigned char)(buf_[cur_] | (bp << bpPtr_));
		if(bpPtr_ == 6) {
			// Octet is full; move on to the next one
			bpPtr_ = 0;
			cur_++;
			if(cur_ == BUF_SZ) {
				// Flush the buffer
				if(!fwrite((const void *)buf_, BUF_SZ, 1, out_)) {
					std::cerr << "Error writing to the reference index file (.4.ebwt)" << std::endl;
					throw 1;
				}
				// Reset to beginning of the buffer
				cur_ = 0;
			}
			// Initialize next octet to 0
			buf_[cur_] = 0;
		} else {
			bpPtr_ += 2;
		}
	}

	/**
	 * Write any remaining bitpairs and then close the output.  Calling
	 * close() again afterwards does nothing.  The file is closed even
	 * if the final write fails; in that case, or if closing itself
	 * fails, a message is printed to stderr and the int 1 is thrown.
	 */
	void close() {
		if(out_ == NULL) {
			// Already closed
			return;
		}
		bool ok = true;
		if(cur_ > 0 || bpPtr_ > 0) {
			// With bpPtr_ == 0 the octet at cur_ holds nothing yet, so
			// only the cur_ octets before it need to be written
			size_t nbytes = (bpPtr_ == 0) ? cur_ : cur_ + 1;
			ok = (fwrite((const void *)buf_, nbytes, 1, out_) == 1);
		}
		if(fclose(out_) != 0) {
			ok = false;
		}
		out_ = NULL;
		cur_ = 0;
		bpPtr_ = 0;
		if(!ok) {
			std::cerr << "Error writing to the reference index file (.4.ebwt)" << std::endl;
			throw 1;
		}
	}

private:
	static const size_t BUF_SZ = 128 * 1024;
	FILE         *out_;   // destination file, or NULL once closed
	int           bpPtr_; // bit offset (0, 2, 4 or 6) of the next bitpair in buf_[cur_]
	size_t        cur_;   // index of the octet currently being filled
	unsigned char buf_[BUF_SZ]; // (large) output buffer
};
/**
* Wrapper for a buffered output stream that writes characters and
* other data types. This class is *not* synchronized; the caller is
* responsible for synchronization.
*/
class OutFileBuf {
public:
	/**
	 * Open a new output stream to a file with given name and ask the C
	 * library for a 10 MB stream buffer.  The name is copied, so it
	 * stays valid after the caller's string goes away.  If the file
	 * can't be opened, a message is printed to stderr and the int 1 is
	 * thrown.
	 */
	OutFileBuf(const std::string& out, bool binary = false) :
		name_(out), cur_(0), closed_(false)
	{
		out_ = fopen(out.c_str(), binary ? "wb" : "w");
		if(out_ == NULL) {
			std::cerr << "Error: Could not open alignment output file " << out.c_str() << std::endl;
			throw 1;
		}
		if(setvbuf(out_, NULL, _IOFBF, 10* 1024* 1024))
			std::cerr << "Warning: Could not allocate the proper buffer size for output file stream. " << std::endl;
	}

	/**
	 * Open a new output stream to a file with given name.  If the file
	 * can't be opened, a message is printed to stderr and the int 1 is
	 * thrown.
	 */
	OutFileBuf(const char *out, bool binary = false) :
		name_(out != NULL ? out : ""), cur_(0), closed_(false)
	{
		assert(out != NULL);
		out_ = fopen(out, binary ? "wb" : "w");
		if(out_ == NULL) {
			std::cerr << "Error: Could not open alignment output file " << out << std::endl;
			throw 1;
		}
	}

	/**
	 * Open a new output stream to standard out.
	 */
	OutFileBuf() : name_("cout"), cur_(0), closed_(false) {
		out_ = stdout;
	}

	/**
	 * Close buffer when object is destroyed.  close() throws if the
	 * final flush fails, and since C++11 an exception leaving a
	 * destructor terminates the program; call close() explicitly
	 * beforehand to be able to handle that failure.
	 */
	~OutFileBuf() { close(); }

	/**
	 * Redirect output to a file with given name.  Anything still
	 * buffered is first flushed to the previous destination, which is
	 * then closed (unless it is stdout).  If the new file can't be
	 * opened, a message is printed to stderr, the int 1 is thrown and
	 * the previous destination is left untouched.
	 */
	void setFile(const char *out, bool binary = false) {
		assert(out != NULL);
		FILE *fresh = fopen(out, binary ? "wb" : "w");
		if(fresh == NULL) {
			std::cerr << "Error: Could not open alignment output file " << out << std::endl;
			throw 1;
		}
		try {
			close();
		} catch(...) {
			// Don't leak the file we just opened
			fclose(fresh);
			throw;
		}
		out_ = fresh;
		name_ = out;
		reset();
	}

	/**
	 * Write a single character into the write buffer and, if
	 * necessary, flush.
	 */
	void write(char c) {
		assert(!closed_);
		if(cur_ == BUF_SZ) flush();
		buf_[cur_++] = c;
	}

	/**
	 * Write a c++ string to the write buffer and, if necessary, flush.
	 */
	void writeString(const std::string& s) {
		writeChars(s.data(), s.length());
	}

	/**
	 * Write a string-like object, one that provides length() and
	 * toZBuf(), to the write buffer and, if necessary, flush.
	 */
	template<typename T>
	void writeString(const T& s) {
		writeChars(s.toZBuf(), s.length());
	}

	/**
	 * Write 'len' characters starting at 's' to the write buffer and, if
	 * necessary, flush.  Data that doesn't fit alongside what is already
	 * buffered causes a flush first; data at least as large as the whole
	 * buffer is then written straight to the file.  A failed write
	 * prints a message to stderr and throws the int 1.
	 */
	void writeChars(const char * s, size_t len) {
		assert(!closed_);
		if(len == 0) {
			return;
		}
		if(cur_ + len > BUF_SZ) {
			if(cur_ > 0) flush();
			if(len >= BUF_SZ) {
				// Too large to buffer; write it through
				writeOrThrow(s, len);
			} else {
				// Buffer is empty after the flush
				memcpy(buf_, s, len);
				cur_ = len;
			}
		} else {
			memcpy(&buf_[cur_], s, len);
			cur_ += len;
		}
		assert(cur_ <= BUF_SZ);
	}

	/**
	 * Write a 0-terminated C string to the output stream.
	 */
	void writeChars(const char * s) {
		writeChars(s, strlen(s));
	}

	/**
	 * Write any remaining buffered characters and then close the
	 * output.  Calling close() again afterwards does nothing.  The file
	 * is closed even if the final write fails; in that case, or if
	 * closing itself fails, a message is printed to stderr and the
	 * int 1 is thrown.  stdout is never closed.
	 */
	void close() {
		if(closed_) return;
		// Mark closed up front so a failure below can't lead to a
		// second attempt on the same stream
		closed_ = true;
		bool ok = true;
		if(cur_ > 0) {
			ok = (fwrite((const void *)buf_, cur_, 1, out_) == 1);
			cur_ = 0;
		}
		if(out_ != stdout) {
			if(fclose(out_) != 0) {
				ok = false;
			}
			out_ = NULL;
		}
		if(!ok) {
			std::cerr << "Error while flushing and closing output" << std::endl;
			throw 1;
		}
	}

	/**
	 * Reset so that the next write is as though it's the first.
	 */
	void reset() {
		cur_ = 0;
		closed_ = false;
	}

	/**
	 * Hand everything in the write buffer to the underlying C stream.
	 * Does nothing when the buffer is empty.  A failed write prints a
	 * message to stderr and throws the int 1, leaving the buffer
	 * contents in place.
	 */
	void flush() {
		if(cur_ == 0) {
			// fwrite() of zero bytes returns 0, which would otherwise
			// be mistaken for a failure
			return;
		}
		writeOrThrow(buf_, cur_);
		cur_ = 0;
	}

	/**
	 * Return true iff this stream is closed.
	 */
	bool closed() const {
		return closed_;
	}

	/**
	 * Return the filename.  The pointer stays valid until the next
	 * setFile() or until this object is destroyed.
	 */
	const char *name() {
		return name_.c_str();
	}

private:

	/**
	 * Write 'len' (> 0) bytes at 'p' to the underlying C stream; on
	 * failure print a message to stderr and throw the int 1.
	 */
	void writeOrThrow(const void *p, size_t len) {
		if(fwrite(p, len, 1, out_) != 1) {
			std::cerr << "Error while flushing and closing output" << std::endl;
			throw 1;
		}
	}

	static const size_t BUF_SZ = 16 * 1024;

	std::string name_;   // own copy of the destination's name
	FILE       *out_;    // destination stream
	size_t      cur_;    // number of characters held in buf_
	char        buf_[BUF_SZ]; // output buffer
	bool        closed_; // true once close() has run
};
#endif /*ndef FILEBUF_H_*/