/* * Copyright 2011, Ben Langmead * * This file is part of Bowtie 2. * * Bowtie 2 is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Bowtie 2 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Bowtie 2. If not, see . */ #ifndef FILEBUF_H_ #define FILEBUF_H_ #include #include #include #include #include #include #include #include "assert_helpers.h" /** * Simple, fast helper for determining if a character is a newline. */ static inline bool isnewline(int c) { return c == '\r' || c == '\n'; } /** * Simple, fast helper for determining if a character is a non-newline * whitespace character. */ static inline bool isspace_notnl(int c) { return isspace(c) && !isnewline(c); } /** * Simple wrapper for a FILE*, istream or ifstream that reads it in chunks * using fread and keeps those chunks in a buffer. It also services calls to * get(), peek() and gets() from the buffer, reading in additional chunks when * necessary. * * Helper functions do things like parse strings, numbers, and FASTA records. * * */ class FileBuf { public: FileBuf() { init(); } FileBuf(FILE *in) { init(); _in = in; assert(_in != NULL); } FileBuf(std::ifstream *inf) { init(); _inf = inf; assert(_inf != NULL); } FileBuf(std::istream *ins) { init(); _ins = ins; assert(_ins != NULL); } /** * Return true iff there is a stream ready to read. */ bool isOpen() { return _in != NULL || _inf != NULL || _ins != NULL; } /** * Close the input stream (if that's possible) */ void close() { if(_in != NULL && _in != stdin) { fclose(_in); } else if(_inf != NULL) { _inf->close(); } else { // can't close _ins } } /** * Get the next character of input and advance. */ int get() { assert(_in != NULL || _inf != NULL || _ins != NULL); int c = peek(); if(c != -1) { _cur++; if(_lastn_cur < LASTN_BUF_SZ) _lastn_buf[_lastn_cur++] = c; } return c; } /** * Return true iff all input is exhausted. */ bool eof() { return (_cur == _buf_sz) && _done; } /** * Initialize the buffer with a new C-style file. */ void newFile(FILE *in) { _in = in; _inf = NULL; _ins = NULL; _cur = BUF_SZ; _buf_sz = BUF_SZ; _done = false; } /** * Initialize the buffer with a new ifstream. */ void newFile(std::ifstream *__inf) { _in = NULL; _inf = __inf; _ins = NULL; _cur = BUF_SZ; _buf_sz = BUF_SZ; _done = false; } /** * Initialize the buffer with a new istream. */ void newFile(std::istream *__ins) { _in = NULL; _inf = NULL; _ins = __ins; _cur = BUF_SZ; _buf_sz = BUF_SZ; _done = false; } /** * Restore state as though we just started reading the input * stream. */ void reset() { if(_inf != NULL) { _inf->clear(); _inf->seekg(0, std::ios::beg); } else if(_ins != NULL) { _ins->clear(); _ins->seekg(0, std::ios::beg); } else { rewind(_in); } _cur = BUF_SZ; _buf_sz = BUF_SZ; _done = false; } /** * Peek at the next character of the input stream without * advancing. Typically we can simple read it from the buffer. * Occasionally we'll need to read in a new buffer's worth of data. */ int peek() { assert(_in != NULL || _inf != NULL || _ins != NULL); assert_leq(_cur, _buf_sz); if(_cur == _buf_sz) { if(_done) { // We already exhausted the input stream return -1; } // Read a new buffer's worth of data else { // Get the next chunk if(_inf != NULL) { _inf->read((char*)_buf, BUF_SZ); _buf_sz = _inf->gcount(); } else if(_ins != NULL) { _ins->read((char*)_buf, BUF_SZ); _buf_sz = _ins->gcount(); } else { assert(_in != NULL); _buf_sz = fread(_buf, 1, BUF_SZ, _in); } _cur = 0; if(_buf_sz == 0) { // Exhausted, and we have nothing to return to the // caller _done = true; return -1; } else if(_buf_sz < BUF_SZ) { // Exhausted _done = true; } } } return (int)_buf[_cur]; } /** * Store a string of characters from the input file into 'buf', * until we see a newline, EOF, or until 'len' characters have been * read. */ size_t gets(char *buf, size_t len) { size_t stored = 0; while(true) { int c = get(); if(c == -1) { // End-of-file buf[stored] = '\0'; return stored; } if(stored == len-1 || isnewline(c)) { // End of string buf[stored] = '\0'; // Skip over all end-of-line characters int pc = peek(); while(isnewline(pc)) { get(); // discard pc = peek(); } // Next get() will be after all newline characters return stored; } buf[stored++] = (char)c; } } /** * Store a string of characters from the input file into 'buf', * until we see a newline, EOF, or until 'len' characters have been * read. */ size_t get(char *buf, size_t len) { size_t stored = 0; for(size_t i = 0; i < len; i++) { int c = get(); if(c == -1) return i; buf[stored++] = (char)c; } return len; } static const size_t LASTN_BUF_SZ = 8 * 1024; /** * Keep get()ing characters until a non-whitespace character (or * -1) is reached, and return it. */ int getPastWhitespace() { int c; while(isspace(c = get()) && c != -1); return c; } /** * Keep get()ing characters until a we've passed over the next * string of newline characters (\r's and \n's) or -1 is reached, * and return it. */ int getPastNewline() { int c = get(); while(!isnewline(c) && c != -1) c = get(); while(isnewline(c)) c = get(); assert_neq(c, '\r'); assert_neq(c, '\n'); return c; } /** * Keep get()ing characters until a we've passed over the next * string of newline characters (\r's and \n's) or -1 is reached, * and return it. */ int peekPastNewline() { int c = peek(); while(!isnewline(c) && c != -1) c = get(); while(isnewline(c)) c = get(); assert_neq(c, '\r'); assert_neq(c, '\n'); return c; } /** * Keep peek()ing then get()ing characters until the next return * from peek() is just after the last newline of the line. */ int peekUptoNewline() { int c = peek(); while(!isnewline(c) && c != -1) { get(); c = peek(); } while(isnewline(c)) { get(); c = peek(); } assert_neq(c, '\r'); assert_neq(c, '\n'); return c; } /** * Parse a FASTA record. Append name characters to 'name' and and append * all sequence characters to 'seq'. If gotCaret is true, assuming the * file cursor has already moved just past the starting '>' character. */ template void parseFastaRecord( TNameStr& name, TSeqStr& seq, bool gotCaret = false) { int c; if(!gotCaret) { // Skip over caret and non-newline whitespace c = peek(); while(isspace_notnl(c) || c == '>') { get(); c = peek(); } } else { // Skip over non-newline whitespace c = peek(); while(isspace_notnl(c)) { get(); c = peek(); } } size_t namecur = 0, seqcur = 0; // c is the first character of the fasta name record, or is the first // newline character if the name record is empty while(!isnewline(c) && c != -1) { name[namecur++] = c; get(); c = peek(); } // sequence consists of all the non-whitespace characters between here // and the next caret while(true) { // skip over whitespace while(isspace(c)) { get(); c = peek(); } // if we see caret or EOF, break if(c == '>' || c == -1) break; // append and continue seq[seqcur++] = c; get(); c = peek(); } } /** * Parse a FASTA record and return its length. If gotCaret is true, * assuming the file cursor has already moved just past the starting '>' * character. */ void parseFastaRecordLength( size_t& nameLen, size_t& seqLen, bool gotCaret = false) { int c; nameLen = seqLen = 0; if(!gotCaret) { // Skip over caret and non-newline whitespace c = peek(); while(isspace_notnl(c) || c == '>') { get(); c = peek(); } } else { // Skip over non-newline whitespace c = peek(); while(isspace_notnl(c)) { get(); c = peek(); } } // c is the first character of the fasta name record, or is the first // newline character if the name record is empty while(!isnewline(c) && c != -1) { nameLen++; get(); c = peek(); } // sequence consists of all the non-whitespace characters between here // and the next caret while(true) { // skip over whitespace while(isspace(c)) { get(); c = peek(); } // if we see caret or EOF, break if(c == '>' || c == -1) break; // append and continue seqLen++; get(); c = peek(); } } /** * Reset to the beginning of the last-N-chars buffer. */ void resetLastN() { _lastn_cur = 0; } /** * Copy the last several characters in the last-N-chars buffer * (since the last reset) into the provided buffer. */ size_t copyLastN(char *buf) { memcpy(buf, _lastn_buf, _lastn_cur); return _lastn_cur; } /** * Get const pointer to the last-N-chars buffer. */ const char *lastN() const { return _lastn_buf; } /** * Get current size of the last-N-chars buffer. */ size_t lastNLen() const { return _lastn_cur; } private: void init() { _in = NULL; _inf = NULL; _ins = NULL; _cur = _buf_sz = BUF_SZ; _done = false; _lastn_cur = 0; // no need to clear _buf[] } static const size_t BUF_SZ = 256 * 1024; FILE *_in; std::ifstream *_inf; std::istream *_ins; size_t _cur; size_t _buf_sz; bool _done; uint8_t _buf[BUF_SZ]; // (large) input buffer size_t _lastn_cur; char _lastn_buf[LASTN_BUF_SZ]; // buffer of the last N chars dispensed }; /** * Wrapper for a buffered output stream that writes bitpairs. */ class BitpairOutFileBuf { public: /** * Open a new output stream to a file with given name. */ BitpairOutFileBuf(const char *in) : bpPtr_(0), cur_(0) { assert(in != NULL); out_ = fopen(in, "wb"); if(out_ == NULL) { std::cerr << "Error: Could not open bitpair-output file " << in << std::endl; throw 1; } memset(buf_, 0, BUF_SZ); } /** * Write a single bitpair into the buf. Flush the buffer if it's * full. */ void write(int bp) { assert_lt(bp, 4); assert_geq(bp, 0); buf_[cur_] |= (bp << bpPtr_); if(bpPtr_ == 6) { bpPtr_ = 0; cur_++; if(cur_ == BUF_SZ) { // Flush the buffer if(!fwrite((const void *)buf_, BUF_SZ, 1, out_)) { std::cerr << "Error writing to the reference index file (.4.ebwt)" << std::endl; throw 1; } // Reset to beginning of the buffer cur_ = 0; } // Initialize next octet to 0 buf_[cur_] = 0; } else { bpPtr_ += 2; } } /** * Write any remaining bitpairs and then close the input */ void close() { if(cur_ > 0 || bpPtr_ > 0) { if(bpPtr_ == 0) cur_--; if(!fwrite((const void *)buf_, cur_ + 1, 1, out_)) { std::cerr << "Error writing to the reference index file (.4.ebwt)" << std::endl; throw 1; } } fclose(out_); } private: static const size_t BUF_SZ = 128 * 1024; FILE *out_; int bpPtr_; size_t cur_; char buf_[BUF_SZ]; // (large) input buffer }; /** * Wrapper for a buffered output stream that writes characters and * other data types. This class is *not* synchronized; the caller is * responsible for synchronization. */ class OutFileBuf { public: /** * Open a new output stream to a file with given name. */ OutFileBuf(const std::string& out, bool binary = false) : name_(out.c_str()), cur_(0), closed_(false) { out_ = fopen(out.c_str(), binary ? "wb" : "w"); if(out_ == NULL) { std::cerr << "Error: Could not open alignment output file " << out.c_str() << std::endl; throw 1; } if(setvbuf(out_, NULL, _IOFBF, 10* 1024* 1024)) std::cerr << "Warning: Could not allocate the proper buffer size for output file stream. " << std::endl; } /** * Open a new output stream to a file with given name. */ OutFileBuf(const char *out, bool binary = false) : name_(out), cur_(0), closed_(false) { assert(out != NULL); out_ = fopen(out, binary ? "wb" : "w"); if(out_ == NULL) { std::cerr << "Error: Could not open alignment output file " << out << std::endl; throw 1; } } /** * Open a new output stream to standard out. */ OutFileBuf() : name_("cout"), cur_(0), closed_(false) { out_ = stdout; } /** * Close buffer when object is destroyed. */ ~OutFileBuf() { close(); } /** * Open a new output stream to a file with given name. */ void setFile(const char *out, bool binary = false) { assert(out != NULL); out_ = fopen(out, binary ? "wb" : "w"); if(out_ == NULL) { std::cerr << "Error: Could not open alignment output file " << out << std::endl; throw 1; } reset(); } /** * Write a single character into the write buffer and, if * necessary, flush. */ void write(char c) { assert(!closed_); if(cur_ == BUF_SZ) flush(); buf_[cur_++] = c; } /** * Write a c++ string to the write buffer and, if necessary, flush. */ void writeString(const std::string& s) { assert(!closed_); size_t slen = s.length(); if(cur_ + slen > BUF_SZ) { if(cur_ > 0) flush(); if(slen >= BUF_SZ) { fwrite(s.c_str(), slen, 1, out_); } else { memcpy(&buf_[cur_], s.data(), slen); assert_eq(0, cur_); cur_ = slen; } } else { memcpy(&buf_[cur_], s.data(), slen); cur_ += slen; } assert_leq(cur_, BUF_SZ); } /** * Write a c++ string to the write buffer and, if necessary, flush. */ template void writeString(const T& s) { assert(!closed_); size_t slen = s.length(); if(cur_ + slen > BUF_SZ) { if(cur_ > 0) flush(); if(slen >= BUF_SZ) { fwrite(s.toZBuf(), slen, 1, out_); } else { memcpy(&buf_[cur_], s.toZBuf(), slen); assert_eq(0, cur_); cur_ = slen; } } else { memcpy(&buf_[cur_], s.toZBuf(), slen); cur_ += slen; } assert_leq(cur_, BUF_SZ); } /** * Write a c++ string to the write buffer and, if necessary, flush. */ void writeChars(const char * s, size_t len) { assert(!closed_); if(cur_ + len > BUF_SZ) { if(cur_ > 0) flush(); if(len >= BUF_SZ) { fwrite(s, len, 1, out_); } else { memcpy(&buf_[cur_], s, len); assert_eq(0, cur_); cur_ = len; } } else { memcpy(&buf_[cur_], s, len); cur_ += len; } assert_leq(cur_, BUF_SZ); } /** * Write a 0-terminated C string to the output stream. */ void writeChars(const char * s) { writeChars(s, strlen(s)); } /** * Write any remaining bitpairs and then close the input */ void close() { if(closed_) return; if(cur_ > 0) flush(); closed_ = true; if(out_ != stdout) { fclose(out_); } } /** * Reset so that the next write is as though it's the first. */ void reset() { cur_ = 0; closed_ = false; } void flush() { if(!fwrite((const void *)buf_, cur_, 1, out_)) { std::cerr << "Error while flushing and closing output" << std::endl; throw 1; } cur_ = 0; } /** * Return true iff this stream is closed. */ bool closed() const { return closed_; } /** * Return the filename. */ const char *name() { return name_; } private: static const size_t BUF_SZ = 16 * 1024; const char *name_; FILE *out_; size_t cur_; char buf_[BUF_SZ]; // (large) input buffer bool closed_; }; #endif /*ndef FILEBUF_H_*/