2798 lines
103 KiB
C++
2798 lines
103 KiB
C++
/*
|
|
* Copyright 2015, Daehwan Kim <infphilo@gmail.com>, Joe Paggi <jpaggi@mit.edu>
|
|
*
|
|
* This file is part of HISAT 2.
|
|
*
|
|
* HISAT 2 is free software: you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation, either version 3 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* HISAT 2 is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with HISAT 2. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
#ifndef GBWT_GRAPH_H_
|
|
#define GBWT_GRAPH_H_
|
|
|
|
#include <string>
|
|
#include <stdexcept>
|
|
#include <iostream>
|
|
#include <fstream>
|
|
#include <stdlib.h>
|
|
#include <map>
|
|
#include <deque>
|
|
#include <time.h>
|
|
#include "alt.h"
|
|
#include "radix_sort.h"
|
|
|
|
// Reference:
|
|
// Jouni Sirén, Niko Välimäki, and Veli Mäkinen: Indexing Graphs for Path Queries with Applications in Genome Research.
|
|
// IEEE/ACM Transactions on Computational Biology and Bioinformatics 11(2):375-388, 2014.
|
|
// http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6698337
|
|
|
|
//--------------------------------------------------------------------------
|
|
|
|
/**
 * Exception thrown when the constructed reference graph turns out to be a
 * plain linear sequence, i.e. it contains no variants or splice sites.
 *
 * The base class is spelled std::exception explicitly so this definition
 * does not depend on a using-directive pulled in by another header.
 */
struct NongraphException : public std::exception
{
    /// @return the fixed, statically allocated message "Nongraph exception".
    const char* what () const throw ()
    {
        return "Nongraph exception";
    }
};
|
|
|
|
/**
 * Exception whose message is "Explosion exception".
 *
 * NOTE(review): no throw site is visible in this part of the file;
 * presumably it signals that graph construction grew out of control --
 * confirm against the code that throws it.
 *
 * The base class is spelled std::exception explicitly so this definition
 * does not depend on a using-directive pulled in by another header.
 */
struct ExplosionException : public std::exception
{
    /// @return the fixed, statically allocated message "Explosion exception".
    const char* what () const throw ()
    {
        return "Explosion exception";
    }
};
|
|
|
|
template <typename index_t> class PathGraph;
|
|
|
|
// Note: the following code is based on Sirén's work, GCSA (see the reference above).
|
|
template <typename index_t>
|
|
class RefGraph {
|
|
friend class PathGraph<index_t>;
|
|
public:
|
|
struct Node {
|
|
char label; // ACGT + Y(head) + Z(tail)
|
|
index_t value; // location in a whole genome
|
|
|
|
Node() { reset(); }
|
|
Node(char label_, index_t value_) : label(label_), value(value_) {}
|
|
void reset() { label = 0; value = 0; }
|
|
|
|
bool write(ofstream& f_out, bool bigEndian) const {
|
|
writeIndex<index_t>(f_out, value, bigEndian);
|
|
writeU16(f_out, label, bigEndian);
|
|
return true;
|
|
}
|
|
|
|
bool read(ifstream& f_in, bool bigEndian) {
|
|
value = readIndex<index_t>(f_in, bigEndian);
|
|
label = (char)readU16(f_in, bigEndian);
|
|
return true;
|
|
}
|
|
|
|
bool operator== (const Node& o) const {
|
|
if(value != o.value) return false;
|
|
if(label != o.label) return false;
|
|
return true;
|
|
}
|
|
|
|
bool operator< (const Node& o) const {
|
|
if(value != o.value) return value < o.value;
|
|
return label < o.label;
|
|
}
|
|
};
|
|
|
|
struct Edge {
|
|
index_t from; // from Node
|
|
index_t to; // to Node
|
|
|
|
Edge() {}
|
|
Edge(index_t from_, index_t to_) : from(from_), to(to_) {}
|
|
|
|
bool write(ofstream& f_out, bool bigEndian) const {
|
|
writeIndex<index_t>(f_out, from, bigEndian);
|
|
writeIndex<index_t>(f_out, to, bigEndian);
|
|
return true;
|
|
}
|
|
|
|
bool read(ifstream& f_in, bool bigEndian) {
|
|
from = readIndex<index_t>(f_in, bigEndian);
|
|
to = readIndex<index_t>(f_in, bigEndian);
|
|
return true;
|
|
}
|
|
|
|
bool operator< (const Edge& o) const {
|
|
if(from != o.from) return from < o.from;
|
|
return to < o.to;
|
|
}
|
|
};
|
|
|
|
static index_t EdgeTo (Edge& a) {
|
|
return a.to;
|
|
}
|
|
|
|
struct EdgeFromCmp {
|
|
bool operator() (const Edge& a, const Edge& b) const {
|
|
return a.from < b.from;
|
|
}
|
|
};
|
|
|
|
struct EdgeToCmp {
|
|
bool operator() (const Edge& a, const Edge& b) const {
|
|
return a.to < b.to;
|
|
};
|
|
};
|
|
|
|
public:
|
|
RefGraph(const SString<char>& s,
|
|
const EList<RefRecord>& szs,
|
|
const EList<ALT<index_t> >& alts,
|
|
const EList<Haplotype<index_t> >& haplotypes,
|
|
const string& out_fname,
|
|
int nthreads_,
|
|
bool verbose);
|
|
|
|
bool repOk() { return true; }
|
|
|
|
void write(const string& fname, bool bigEndian) {
|
|
ofstream rg_file(fname.c_str(), ios::binary);
|
|
if(!rg_file.good()) {
|
|
cerr << "Could not open file for writing a reference graph: \"" << fname << "\"" << endl;
|
|
throw 1;
|
|
}
|
|
writeIndex<index_t>(rg_file, (index_t)nodes.size(), bigEndian);
|
|
for(index_t i = 0; i < nodes.size(); i++) {
|
|
nodes[i].write(rg_file, bigEndian);
|
|
}
|
|
writeIndex<index_t>(rg_file, (index_t)edges.size(), bigEndian);
|
|
for(index_t i = 0; i < edges.size(); i++) {
|
|
edges[i].write(rg_file, bigEndian);
|
|
}
|
|
rg_file.close();
|
|
}
|
|
|
|
void nullify() {
|
|
nodes.nullify();
|
|
edges.nullify();
|
|
}
|
|
|
|
void read(const string& fname, bool bigEndian) {
|
|
ifstream rg_file(fname.c_str(), ios::binary);
|
|
if(!rg_file.good()) {
|
|
cerr << "Could not open file for reading a reference graph: \"" << fname << "\"" << endl;
|
|
throw 1;
|
|
}
|
|
index_t num_nodes = readIndex<index_t>(rg_file, bigEndian);
|
|
nodes.resizeNoCopyExact(num_nodes);
|
|
for(index_t i = 0; i < num_nodes; i++) {
|
|
nodes[i].read(rg_file, bigEndian);
|
|
}
|
|
index_t num_edges = readIndex<index_t>(rg_file, bigEndian);
|
|
edges.resizeNoCopyExact(num_edges);
|
|
for(index_t i = 0; i < num_edges; i++) {
|
|
edges[i].read(rg_file, bigEndian);
|
|
}
|
|
rg_file.close();
|
|
}
|
|
|
|
private:
|
|
static bool isReverseDeterministic(EList<Node>& nodes, EList<Edge>& edges);
|
|
static void reverseDeterminize(EList<Node>& nodes, EList<Edge>& edges, index_t& lastNode, index_t lastNode_add = 0);
|
|
|
|
static void sortEdgesFrom(EList<Edge>& edges) {
|
|
std::sort(edges.begin(), edges.end(), EdgeFromCmp());
|
|
}
|
|
static void sortEdgesTo(EList<Edge>& edges) {
|
|
std::sort(edges.begin(), edges.end(), EdgeToCmp());
|
|
}
|
|
|
|
// Return edge ranges [begin, end)
|
|
static pair<index_t, index_t> findEdges(const EList<Edge>& edges, index_t node, bool from);
|
|
static pair<index_t, index_t> findEdgesFrom(const EList<Edge>& edges, index_t node) {
|
|
return findEdges(edges, node, true);
|
|
}
|
|
static pair<index_t, index_t> findEdgesTo(const EList<Edge>& edges, index_t node) {
|
|
return findEdges(edges, node, false);
|
|
}
|
|
static pair<index_t, index_t> getNextEdgeRange(const EList<Edge>& sep_edges, pair<index_t, index_t> range, bool from) {
|
|
if(range.second >= sep_edges.size()) {
|
|
return pair<index_t, index_t>(0, 0);
|
|
}
|
|
range.first = range.second; range.second++;
|
|
|
|
if(from) {
|
|
while(range.second < sep_edges.size() && sep_edges[range.second].from == sep_edges[range.first].from) {
|
|
range.second++;
|
|
}
|
|
} else {
|
|
while(range.second < sep_edges.size() && sep_edges[range.second].to == sep_edges[range.first].to) {
|
|
range.second++;
|
|
}
|
|
}
|
|
return range;
|
|
}
|
|
|
|
private:
|
|
struct ThreadParam {
|
|
// input
|
|
index_t thread_id;
|
|
RefGraph<index_t>* refGraph;
|
|
const SString<char>* s;
|
|
const EList<ALT<index_t> >* alts;
|
|
const EList<Haplotype<index_t> >* haplotypes;
|
|
string out_fname;
|
|
bool bigEndian;
|
|
|
|
// output
|
|
index_t num_nodes;
|
|
index_t num_edges;
|
|
index_t lastNode;
|
|
bool multipleHeadNodes;
|
|
};
|
|
static void buildGraph_worker(void* vp);
|
|
|
|
private:
|
|
EList<RefRecord> szs;
|
|
EList<RefRecord> tmp_szs;
|
|
|
|
EList<Node> nodes;
|
|
EList<Edge> edges;
|
|
index_t lastNode; // Z
|
|
|
|
int nthreads;
|
|
|
|
#ifndef NDEBUG
|
|
bool debug;
|
|
#endif
|
|
|
|
private:
|
|
// Following composite nodes and edges are used to reverse-determinize an automaton.
|
|
struct CompositeNodeIDs {
|
|
index_t id;
|
|
EList<index_t> add_ids;
|
|
|
|
CompositeNodeIDs() {
|
|
id = (index_t)INDEX_MAX;
|
|
}
|
|
|
|
bool operator<(const CompositeNodeIDs& o) const {
|
|
if(id != o.id) return id < o.id;
|
|
if(add_ids.size() != o.add_ids.size()) return add_ids.size() < o.add_ids.size();
|
|
for(index_t i = 0; i < add_ids.size(); i++) {
|
|
assert_lt(i, o.add_ids.size());
|
|
if(add_ids[i] != o.add_ids[i]) return add_ids[i] < o.add_ids[i];
|
|
}
|
|
return false;
|
|
}
|
|
|
|
index_t size() const {
|
|
if(id == (index_t)INDEX_MAX) return 0;
|
|
return (index_t)add_ids.size() + 1;
|
|
}
|
|
index_t getID(index_t i) const {
|
|
if(i == 0) return id;
|
|
else {
|
|
i -= 1;
|
|
assert_lt(i, add_ids.size());
|
|
return add_ids[i];
|
|
}
|
|
}
|
|
void push_back(index_t node_id) {
|
|
if(id == (index_t)INDEX_MAX) id = node_id;
|
|
else add_ids.push_back(node_id);
|
|
}
|
|
};
|
|
|
|
struct CompositeNode {
|
|
CompositeNodeIDs nodes;
|
|
index_t id;
|
|
char label;
|
|
index_t value;
|
|
|
|
CompositeNode() { reset(); }
|
|
|
|
CompositeNode(char label_, index_t node_id) :
|
|
id(0), label(label_)
|
|
{
|
|
nodes.push_back(node_id);
|
|
}
|
|
|
|
Node getNode() const {
|
|
return Node(label, value);
|
|
}
|
|
|
|
void reset() {
|
|
nodes.id = (index_t)INDEX_MAX;
|
|
nodes.add_ids.clear();
|
|
id = 0;
|
|
label = 0;
|
|
value = 0;
|
|
}
|
|
};
|
|
|
|
struct CompositeEdge {
|
|
index_t from;
|
|
index_t to;
|
|
|
|
CompositeEdge() : from(0), to(0) {}
|
|
CompositeEdge(index_t from_, index_t to_) : from(from_), to(to_) {}
|
|
|
|
Edge getEdge(const EList<CompositeNode>& nodes) const
|
|
{
|
|
assert_lt(from, nodes.size());
|
|
const CompositeNode& from_node = nodes[from];
|
|
assert_lt(to, nodes.size());
|
|
const CompositeNode& to_node = nodes[to];
|
|
return Edge(from_node.id, to_node.id);
|
|
}
|
|
|
|
bool operator < (const CompositeEdge& o) const
|
|
{
|
|
return from < o.from;
|
|
}
|
|
};
|
|
|
|
struct TempNodeLabelCmp {
|
|
TempNodeLabelCmp(const EList<Node>& nodes_) : nodes(nodes_) {}
|
|
bool operator() (index_t a, index_t b) const {
|
|
assert_lt(a, nodes.size());
|
|
assert_lt(b, nodes.size());
|
|
return nodes[a].label < nodes[b].label;
|
|
}
|
|
|
|
const EList<Node>& nodes;
|
|
};
|
|
};
|
|
|
|
/**
|
|
* Load reference sequence file and alt information.
|
|
* Construct a reference graph
|
|
*/
|
|
template <typename index_t>
|
|
RefGraph<index_t>::RefGraph(const SString<char>& s,
|
|
const EList<RefRecord>& szs,
|
|
const EList<ALT<index_t> >& alts,
|
|
const EList<Haplotype<index_t> >& haplotypes,
|
|
const string& out_fname,
|
|
int nthreads_,
|
|
bool verbose)
|
|
: lastNode(0), nthreads(nthreads_)
|
|
{
|
|
const bool bigEndian = false;
|
|
|
|
assert_gt(nthreads, 0);
|
|
assert_gt(szs.size(), 0);
|
|
index_t jlen = (index_t)s.length();
|
|
|
|
#ifndef NDEBUG
|
|
debug = (jlen <= 20);
|
|
#endif
|
|
|
|
// a memory-efficient way to create a population graph with known ALTs
|
|
bool frag_automaton = jlen >= (1 << 16);
|
|
if(frag_automaton) {
|
|
{
|
|
EList<pair<index_t, index_t> > alt_ranges; // each range inclusive
|
|
for(index_t i = 0; i < alts.size(); i++) {
|
|
const ALT<index_t>& alt = alts[i];
|
|
index_t left_relax = 128, right_relax = 128;
|
|
pair<index_t, index_t> range;
|
|
range.first = alt.pos > left_relax ? alt.pos - left_relax - 1 : 0;
|
|
if(alt.type == ALT_SNP_SGL) {
|
|
range.second = alt.pos + 1;
|
|
} else if(alt.type == ALT_SNP_DEL) {
|
|
assert_gt(alt.len, 0);
|
|
range.second = alt.pos + alt.len;
|
|
} else if(alt.type == ALT_SNP_INS) {
|
|
assert_gt(alt.len, 0);
|
|
range.second = alt.pos;
|
|
} else if (alt.type == ALT_SPLICESITE) {
|
|
assert_lt(alt.left, alt.right);
|
|
range.second = alt.right + 1;
|
|
} else {
|
|
assert(alt.exon());
|
|
continue;
|
|
}
|
|
range.second += right_relax;
|
|
|
|
if(alt_ranges.empty() || alt_ranges.back().second + 1 < range.first) {
|
|
alt_ranges.push_back(range);
|
|
} else {
|
|
assert_leq(alt_ranges.back().first, range.first);
|
|
if(alt_ranges.back().second < range.second) {
|
|
alt_ranges.back().second = range.second;
|
|
}
|
|
}
|
|
}
|
|
|
|
index_t chunk_size = 1 << 20;
|
|
index_t pos = 0, range_idx = 0;
|
|
for(index_t i = 0; i < szs.size(); i++) {
|
|
if(szs[i].len == 0) continue;
|
|
if(szs[i].len <= chunk_size) {
|
|
tmp_szs.push_back(szs[i]);
|
|
pos += szs[i].len;
|
|
} else {
|
|
index_t num_chunks = (szs[i].len + chunk_size - 1) / chunk_size;
|
|
assert_gt(num_chunks, 1);
|
|
index_t modified_chunk_size = szs[i].len / num_chunks;
|
|
index_t after_pos = pos + szs[i].len;
|
|
ASSERT_ONLY(index_t sum_len = 0);
|
|
while(pos < after_pos) {
|
|
index_t target_pos = pos + modified_chunk_size;
|
|
if(target_pos < after_pos) {
|
|
for(; range_idx < alt_ranges.size(); range_idx++) {
|
|
if(target_pos < alt_ranges[range_idx].first) break;
|
|
}
|
|
pair<index_t, index_t> alt_free_range;
|
|
if(range_idx == 0) {
|
|
alt_free_range.first = 0;
|
|
} else {
|
|
alt_free_range.first = alt_ranges[range_idx - 1].second + 1;
|
|
if(alt_free_range.first >= jlen) {
|
|
alt_free_range.first = jlen - 1;
|
|
}
|
|
}
|
|
|
|
if(range_idx == alt_ranges.size()) {
|
|
alt_free_range.second = jlen - 1;
|
|
} else {
|
|
alt_free_range.second = alt_ranges[range_idx].first - 1;
|
|
}
|
|
|
|
assert_leq(alt_free_range.first, alt_free_range.second);
|
|
if(target_pos < alt_free_range.first) target_pos = alt_free_range.first;
|
|
if(target_pos > after_pos) target_pos = after_pos;
|
|
} else {
|
|
target_pos = after_pos;
|
|
}
|
|
|
|
tmp_szs.expand();
|
|
tmp_szs.back().len = target_pos - pos;
|
|
tmp_szs.back().off = 0;
|
|
pos = target_pos;
|
|
ASSERT_ONLY(sum_len += tmp_szs.back().len);
|
|
}
|
|
assert_eq(pos, after_pos);
|
|
assert_eq(sum_len, szs[i].len);
|
|
}
|
|
}
|
|
#ifndef NDEBUG
|
|
index_t modified_jlen = 0;
|
|
for(index_t i = 0; i < tmp_szs.size(); i++) {
|
|
modified_jlen += tmp_szs[i].len;
|
|
}
|
|
assert_eq(modified_jlen, jlen);
|
|
#endif
|
|
}
|
|
|
|
if(nthreads > (int)tmp_szs.size()) {
|
|
nthreads = (int)tmp_szs.size();
|
|
}
|
|
assert_gt(nthreads, 0);
|
|
AutoArray<tthread::thread*> threads(nthreads);
|
|
EList<ThreadParam> threadParams;
|
|
for(index_t i = 0; i < (index_t)nthreads; i++) {
|
|
threadParams.expand();
|
|
threadParams.back().thread_id = i;
|
|
threadParams.back().refGraph = this;
|
|
threadParams.back().s = &s;
|
|
threadParams.back().alts = &alts;
|
|
threadParams.back().haplotypes = &haplotypes;
|
|
threadParams.back().out_fname = out_fname;
|
|
threadParams.back().bigEndian = bigEndian;
|
|
threadParams.back().num_nodes = 0;
|
|
threadParams.back().num_edges = 0;
|
|
threadParams.back().lastNode = 0;
|
|
threadParams.back().multipleHeadNodes = false;
|
|
if(nthreads == 1) {
|
|
buildGraph_worker((void*)&threadParams.back());
|
|
} else {
|
|
threads[i] = new tthread::thread(buildGraph_worker, (void*)&threadParams.back());
|
|
}
|
|
}
|
|
|
|
if(nthreads > 1) {
|
|
for(index_t i = 0; i < (index_t)nthreads; i++)
|
|
threads[i]->join();
|
|
}
|
|
|
|
index_t num_nodes = 0, num_edges = 0;
|
|
for(index_t i = 0; i < threadParams.size(); i++) {
|
|
num_nodes += threadParams[i].num_nodes;
|
|
num_edges += threadParams[i].num_edges;
|
|
// Make room for edges spanning graphs by different threads
|
|
if(i > 0) {
|
|
num_edges += 16;
|
|
}
|
|
}
|
|
nodes.resizeExact(num_nodes); nodes.clear();
|
|
edges.resizeExact(num_edges); edges.clear();
|
|
|
|
// Read all the nodes and edges
|
|
EList<index_t> tail_nodes;
|
|
bool multipleHeadNodes = false;
|
|
for(index_t i = 0; i < threadParams.size(); i++) {
|
|
if(threadParams[i].multipleHeadNodes) multipleHeadNodes = true;
|
|
std::ostringstream number; number << i;
|
|
const string rg_fname = out_fname + "." + number.str() + ".rf";
|
|
ifstream rg_in_file(rg_fname.c_str(), ios::binary);
|
|
if(!rg_in_file.good()) {
|
|
cerr << "Could not open file for reading a reference graph: \"" << rg_fname << "\"" << endl;
|
|
throw 1;
|
|
}
|
|
index_t curr_num_nodes = (index_t)nodes.size();
|
|
ASSERT_ONLY(index_t curr_num_edges = (index_t)edges.size());
|
|
ASSERT_ONLY(index_t num_spanning_edges = 0);
|
|
// Read nodes to be connected to last nodes in a previous thread
|
|
if(i > 0) {
|
|
assert_gt(tail_nodes.size(), 0)
|
|
index_t num_head_nodes = readIndex<index_t>(rg_in_file, bigEndian);
|
|
for(index_t j = 0; j < num_head_nodes; j++) {
|
|
index_t head_node = readIndex<index_t>(rg_in_file, bigEndian);
|
|
for(index_t k = 0; k < tail_nodes.size(); k++) {
|
|
edges.expand();
|
|
edges.back().from = tail_nodes[k];
|
|
edges.back().to = head_node + curr_num_nodes;
|
|
ASSERT_ONLY(num_spanning_edges++);
|
|
}
|
|
}
|
|
}
|
|
while(!rg_in_file.eof()) {
|
|
index_t tmp_num_nodes = readIndex<index_t>(rg_in_file, bigEndian);
|
|
for(index_t j = 0; j < tmp_num_nodes; j++) {
|
|
nodes.expand();
|
|
nodes.back().read(rg_in_file, bigEndian);
|
|
}
|
|
index_t tmp_num_edges = readIndex<index_t>(rg_in_file, bigEndian);
|
|
for(index_t j = 0; j < tmp_num_edges; j++) {
|
|
edges.expand();
|
|
edges.back().read(rg_in_file, bigEndian);
|
|
edges.back().from += curr_num_nodes;
|
|
edges.back().to += curr_num_nodes;
|
|
}
|
|
|
|
if(nodes.size() >= curr_num_nodes + threadParams[i].num_nodes) {
|
|
assert_eq(nodes.size(), curr_num_nodes + threadParams[i].num_nodes);
|
|
assert_eq(edges.size(), curr_num_edges + num_spanning_edges + threadParams[i].num_edges);
|
|
// Read last nodes in this thread
|
|
tail_nodes.clear();
|
|
if(i + 1 < (index_t)nthreads) {
|
|
index_t num_tail_nodes = readIndex<index_t>(rg_in_file, bigEndian);
|
|
for(index_t j = 0; j < num_tail_nodes; j++) {
|
|
index_t tail_node = readIndex<index_t>(rg_in_file, bigEndian);
|
|
tail_nodes.push_back(tail_node + curr_num_nodes);
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
rg_in_file.close();
|
|
std::remove(rg_fname.c_str());
|
|
if(i + 1 == (index_t)nthreads) {
|
|
lastNode = threadParams.back().lastNode + curr_num_nodes;
|
|
assert_lt(lastNode, nodes.size());
|
|
assert_eq(nodes[lastNode].label, 'Z');
|
|
}
|
|
}
|
|
|
|
if(s.length() + 2 == nodes.size() && nodes.size() == edges.size() + 1) {
|
|
cerr << "Warning: no variants or splice sites in this graph" << endl;
|
|
throw NongraphException();
|
|
}
|
|
|
|
if(multipleHeadNodes) {
|
|
if(!isReverseDeterministic(nodes, edges)) {
|
|
if(verbose) cerr << "\tis not reverse-deterministic, so reverse-determinize..." << endl;
|
|
reverseDeterminize(nodes, edges, lastNode);
|
|
}
|
|
}
|
|
assert(isReverseDeterministic(nodes, edges));
|
|
} else { // this is memory-consuming, but simple to implement
|
|
index_t num_predicted_nodes = (index_t)(jlen * 1.2);
|
|
nodes.reserveExact(num_predicted_nodes);
|
|
edges.reserveExact(num_predicted_nodes);
|
|
|
|
// Created head node
|
|
nodes.expand();
|
|
nodes.back().label = 'Y';
|
|
nodes.back().value = 0;
|
|
// Create nodes and edges corresponding to a reference genome
|
|
for(size_t i = 0; i < s.length(); i++) {
|
|
nodes.expand();
|
|
nodes.back().label = "ACGT"[(int)s[i]];
|
|
nodes.back().value = (index_t)i;
|
|
|
|
assert_geq(nodes.size(), 2);
|
|
edges.expand();
|
|
edges.back().from = (index_t)nodes.size() - 2;
|
|
edges.back().to = (index_t)nodes.size() - 1;
|
|
}
|
|
|
|
// Create tail node
|
|
nodes.expand();
|
|
nodes.back().label = 'Z';
|
|
nodes.back().value = (index_t)s.length();
|
|
lastNode = (index_t)nodes.size() - 1;
|
|
edges.expand();
|
|
edges.back().from = (index_t)nodes.size() - 2;
|
|
edges.back().to = (index_t)nodes.size() - 1;
|
|
|
|
// Create nodes and edges for haplotypes
|
|
for(index_t i = 0; i < haplotypes.size(); i++) {
|
|
const Haplotype<index_t>& haplotype = haplotypes[i];
|
|
const EList<index_t, 1>& snpIDs = haplotype.alts;
|
|
assert_gt(snpIDs.size(), 0);
|
|
assert_lt(haplotype.right, s.length());
|
|
bool pass = true;
|
|
for(index_t s = 0; s < snpIDs.size(); s++) {
|
|
index_t snpID = snpIDs[s];
|
|
assert_lt(snpID, alts.size());
|
|
const ALT<index_t>& snp = alts[snpID];
|
|
assert(snp.snp());
|
|
if(s + 1 >= snpIDs.size()) break;
|
|
index_t snpID2 = snpIDs[s+1];
|
|
assert_lt(snpID2, alts.size());
|
|
const ALT<index_t>& snp2 = alts[snpID2];
|
|
assert(snp2.snp());
|
|
if(snp.type == ALT_SNP_INS) {
|
|
if(snp.pos > snp2.pos) {
|
|
pass = false;
|
|
break;
|
|
}
|
|
} else if(snp.type == ALT_SNP_DEL) {
|
|
if(snp2.type == ALT_SNP_DEL) {
|
|
if(snp.pos + snp.len >= snp2.pos) {
|
|
pass = false;
|
|
break;
|
|
}
|
|
} else {
|
|
if(snp.pos + snp.len - 1 >= snp2.pos) {
|
|
pass = false;
|
|
break;
|
|
}
|
|
}
|
|
} else {
|
|
if(snp.pos >= snp2.pos) {
|
|
pass = false;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
if(!pass) continue;
|
|
|
|
index_t prev_ALT_type = ALT_NONE;
|
|
index_t ID_i = 0;
|
|
for(index_t j = haplotype.left; j <= haplotype.right; j++) {
|
|
if(prev_ALT_type == ALT_SNP_INS) j--;
|
|
const ALT<index_t>* altp = (ID_i < snpIDs.size() ? &(alts[snpIDs[ID_i]]) : NULL);
|
|
assert(altp == NULL || altp->pos >= j);
|
|
if(altp != NULL && altp->pos == j) {
|
|
const ALT<index_t>& alt = *altp;
|
|
assert_lt(alt.pos, s.length());
|
|
assert(alt.snp());
|
|
if(alt.type == ALT_SNP_SGL) {
|
|
assert_eq(alt.len, 1);
|
|
nodes.expand();
|
|
assert_lt(alt.seq, 4);
|
|
assert_neq(alt.seq & 0x3, s[alt.pos]);
|
|
nodes.back().label = "ACGT"[alt.seq];
|
|
nodes.back().value = alt.pos;
|
|
if(prev_ALT_type != ALT_SNP_DEL) {
|
|
edges.expand();
|
|
if(j == haplotype.left) {
|
|
edges.back().from = alt.pos;
|
|
} else {
|
|
assert_gt(nodes.size(), 2);
|
|
edges.back().from = (index_t)nodes.size() - 2;
|
|
}
|
|
edges.back().to = (index_t)nodes.size() - 1;
|
|
}
|
|
if(j == haplotype.right) {
|
|
edges.expand();
|
|
edges.back().from = (index_t)nodes.size() - 1;
|
|
edges.back().to = alt.pos + 2;
|
|
}
|
|
}
|
|
else if(alt.type == ALT_SNP_DEL) {
|
|
assert_gt(alt.len, 0);
|
|
assert_leq(alt.pos + alt.len, s.length());
|
|
edges.expand();
|
|
if(j == haplotype.left) {
|
|
edges.back().from = alt.pos;
|
|
} else {
|
|
edges.back().from = (index_t)nodes.size() - 1;
|
|
}
|
|
j += (alt.len - 1);
|
|
assert_leq(j, haplotype.right);
|
|
if(j == haplotype.right) {
|
|
edges.back().to = alt.pos + alt.len + 1;
|
|
} else {
|
|
edges.back().to = (index_t)nodes.size();
|
|
}
|
|
} else {
|
|
assert_eq(alt.type, ALT_SNP_INS)
|
|
assert_gt(alt.len, 0);
|
|
for(size_t k = 0; k < alt.len; k++) {
|
|
uint64_t bp = alt.seq >> ((alt.len - k - 1) << 1);
|
|
bp &= 0x3;
|
|
char ch = "ACGT"[bp];
|
|
nodes.expand();
|
|
nodes.back().label = ch;
|
|
nodes.back().value = (index_t)INDEX_MAX;
|
|
if(prev_ALT_type == ALT_SNP_DEL && k == 0) continue;
|
|
edges.expand();
|
|
edges.back().from = ((k == 0 && j == haplotype.left) ? alt.pos : (index_t)nodes.size() - 2);
|
|
edges.back().to = (index_t)nodes.size() - 1;
|
|
}
|
|
if(j == haplotype.right) {
|
|
edges.expand();
|
|
edges.back().from = (index_t)nodes.size() - 1;
|
|
edges.back().to = alt.pos + 1;
|
|
}
|
|
}
|
|
ID_i++;
|
|
prev_ALT_type = alt.type;
|
|
} else {
|
|
int nt = s[j];
|
|
assert_lt(nt, 4);
|
|
nodes.expand();
|
|
nodes.back().label = "ACGT"[nt];
|
|
nodes.back().value = j;
|
|
if(prev_ALT_type != ALT_SNP_DEL) {
|
|
edges.expand();
|
|
if(j == haplotype.left && prev_ALT_type == ALT_NONE) {
|
|
edges.back().from = j;
|
|
} else {
|
|
edges.back().from = (index_t)nodes.size() - 2;
|
|
}
|
|
edges.back().to = (index_t)nodes.size() - 1;
|
|
}
|
|
if(j == haplotype.right) {
|
|
edges.expand();
|
|
edges.back().from = (index_t)nodes.size() - 1;
|
|
edges.back().to = j + 2;
|
|
}
|
|
prev_ALT_type = ALT_SNP_SGL;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Create nodes and edges for splice sites
|
|
for(size_t i = 0; i < alts.size(); i++) {
|
|
const ALT<index_t>& alt = alts[i];
|
|
if(alt.pos >= s.length()) break;
|
|
if(alt.type != ALT_SPLICESITE) continue;
|
|
if(alt.excluded) continue;
|
|
assert_lt(alt.left, alt.right);
|
|
edges.expand();
|
|
edges.back().from = alt.left;
|
|
edges.back().to = alt.right + 2;
|
|
}
|
|
|
|
if(s.length() + 2 == nodes.size() && nodes.size() == edges.size() + 1) {
|
|
throw NongraphException();
|
|
}
|
|
|
|
if(!isReverseDeterministic(nodes, edges)) {
|
|
if(verbose) cerr << "\tis not reverse-deterministic, so reverse-determinize..." << endl;
|
|
reverseDeterminize(nodes, edges, lastNode);
|
|
assert(isReverseDeterministic(nodes, edges));
|
|
}
|
|
}
|
|
|
|
#ifndef NDEBUG
|
|
if(debug) {
|
|
cout << "num nodes: " << nodes.size() << endl;
|
|
for(index_t i = 0; i < nodes.size(); i++) {
|
|
const Node& n = nodes[i];
|
|
cout << i << "\t" << n.label << "\t" << n.value << endl;
|
|
}
|
|
|
|
sort(edges.begin(), edges.end());
|
|
cout << "num edges: " << edges.size() << endl;
|
|
for(index_t i = 0; i < edges.size(); i++) {
|
|
const Edge& e = edges[i];
|
|
cout << i << "\t" << e.from << " --> " << e.to << endl;
|
|
}
|
|
}
|
|
#endif
|
|
}
|
|
|
|
template <typename index_t>
|
|
pair<index_t, index_t> RefGraph<index_t>::findEdges(const EList<Edge>& edges, index_t node, bool from) {
|
|
pair<index_t, index_t> range(0, 0);
|
|
assert_gt(edges.size(), 0);
|
|
|
|
// Find lower bound
|
|
index_t low = 0, high = (index_t)edges.size() - 1;
|
|
index_t temp;
|
|
while(low < high) {
|
|
index_t mid = low + (high - low) / 2;
|
|
temp = (from ? edges[mid].from : edges[mid].to);
|
|
if(node == temp) {
|
|
high = mid;
|
|
} else if(node < temp) {
|
|
if(mid == 0) {
|
|
return pair<index_t, index_t>(0, 0);
|
|
}
|
|
|
|
high = mid - 1;
|
|
} else {
|
|
low = mid + 1;
|
|
}
|
|
}
|
|
temp = (from ? edges[low].from : edges[low].to);
|
|
if(node == temp) {
|
|
range.first = low;
|
|
} else {
|
|
return range;
|
|
}
|
|
|
|
// Find upper bound
|
|
high = (index_t)edges.size() - 1;
|
|
while(low < high)
|
|
{
|
|
index_t mid = low + (high - low + 1) / 2;
|
|
temp = (from ? edges[mid].from : edges[mid].to);
|
|
if(node == temp) {
|
|
low = mid;
|
|
} else {
|
|
assert_lt(node, temp);
|
|
high = mid - 1;
|
|
}
|
|
}
|
|
#ifndef NDEBUG
|
|
temp = (from ? edges[high].from : edges[high].to);
|
|
assert_eq(node, temp);
|
|
#endif
|
|
range.second = high + 1;
|
|
return range;
|
|
}
|
|
|
|
template <typename index_t>
|
|
void RefGraph<index_t>::buildGraph_worker(void* vp) {
|
|
ThreadParam* threadParam = (ThreadParam*)vp;
|
|
RefGraph<index_t>& refGraph = *(threadParam->refGraph);
|
|
|
|
const SString<char>& s = *(threadParam->s);
|
|
index_t jlen = (index_t)s.length();
|
|
|
|
const EList<ALT<index_t> >& alts = *(threadParam->alts);
|
|
const EList<Haplotype<index_t> >& haplotypes = *(threadParam->haplotypes);
|
|
|
|
EList<Node> nodes; EList<Edge> edges;
|
|
const EList<RefRecord>& tmp_szs = refGraph.tmp_szs;
|
|
|
|
index_t thread_id = threadParam->thread_id;
|
|
index_t nthreads = refGraph.nthreads;
|
|
std::ostringstream number; number << thread_id;
|
|
const string rg_fname = threadParam->out_fname + "." + number.str() + ".rf";
|
|
ofstream rg_out_file(rg_fname.c_str(), ios::binary);
|
|
if(!rg_out_file.good()) {
|
|
cerr << "Could not open file for writing a reference graph: \"" << rg_fname << "\"" << endl;
|
|
throw 1;
|
|
}
|
|
|
|
#ifndef NDEBUG
|
|
set<index_t> snp_set;
|
|
#endif
|
|
|
|
const bool bigEndian = threadParam->bigEndian;
|
|
|
|
index_t& lastNode = threadParam->lastNode;
|
|
|
|
index_t& num_nodes = threadParam->num_nodes;
|
|
index_t& num_edges = threadParam->num_edges;
|
|
index_t szs_idx = 0, szs_idx_end = (index_t)tmp_szs.size();
|
|
if(threadParam->thread_id != 0) {
|
|
szs_idx = (index_t)((tmp_szs.size() / nthreads) * thread_id);
|
|
}
|
|
if(thread_id + 1 < nthreads) {
|
|
szs_idx_end = (index_t)((tmp_szs.size() / nthreads) * (thread_id + 1));
|
|
}
|
|
|
|
index_t curr_pos = 0;
|
|
for(index_t i = 0; i < szs_idx; i++) {
|
|
curr_pos += tmp_szs[i].len;
|
|
}
|
|
EList<index_t> prev_tail_nodes;
|
|
index_t alt_idx = 0, haplotype_idx = 0;
|
|
for(; szs_idx < szs_idx_end; szs_idx++) {
|
|
index_t curr_len = tmp_szs[szs_idx].len;
|
|
if(curr_len <= 0) continue;
|
|
index_t num_predicted_nodes = (index_t)(curr_len * 1.2);
|
|
nodes.resizeExact(num_predicted_nodes); nodes.clear();
|
|
edges.resizeExact(num_predicted_nodes); edges.clear();
|
|
|
|
// Created head node
|
|
nodes.expand();
|
|
nodes.back().label = 'Y';
|
|
nodes.back().value = 0;
|
|
|
|
// Create nodes and edges corresponding to a reference genome
|
|
assert_leq(curr_pos + curr_len, s.length());
|
|
for(size_t i = curr_pos; i < curr_pos + curr_len; i++) {
|
|
nodes.expand();
|
|
nodes.back().label = "ACGT"[(int)s[i]];
|
|
nodes.back().value = (index_t)i;
|
|
assert_geq(nodes.size(), 2);
|
|
edges.expand();
|
|
edges.back().from = (index_t)nodes.size() - 2;
|
|
edges.back().to = (index_t)nodes.size() - 1;
|
|
}
|
|
|
|
// Create tail node
|
|
nodes.expand();
|
|
nodes.back().label = 'Z';
|
|
nodes.back().value = (index_t)s.length();
|
|
lastNode = (index_t)nodes.size() - 1;
|
|
edges.expand();
|
|
edges.back().from = (index_t)nodes.size() - 2;
|
|
edges.back().to = (index_t)nodes.size() - 1;
|
|
ASSERT_ONLY(index_t backbone_nodes = (index_t)nodes.size());
|
|
|
|
// Create nodes and edges for haplotypes
|
|
for(; haplotype_idx < haplotypes.size(); haplotype_idx++) {
|
|
const Haplotype<index_t>& haplotype = haplotypes[haplotype_idx];
|
|
if(haplotype.left < curr_pos) continue;
|
|
if(haplotype.right >= curr_pos + curr_len) break;
|
|
const EList<index_t, 1>& snpIDs = haplotype.alts;
|
|
assert_gt(snpIDs.size(), 0);
|
|
bool pass = true;
|
|
for(index_t s = 0; s < snpIDs.size(); s++) {
|
|
index_t snpID = snpIDs[s];
|
|
assert_lt(snpID, alts.size());
|
|
const ALT<index_t>& snp = alts[snpID];
|
|
assert(snp.snp());
|
|
if(s + 1 >= snpIDs.size()) break;
|
|
index_t snpID2 = snpIDs[s+1];
|
|
assert_lt(snpID2, alts.size());
|
|
const ALT<index_t>& snp2 = alts[snpID2];
|
|
assert(snp2.snp());
|
|
if(snp.type == ALT_SNP_INS) {
|
|
if(snp.pos > snp2.pos) {
|
|
pass = false;
|
|
break;
|
|
}
|
|
} else if(snp.type == ALT_SNP_DEL) {
|
|
if(snp2.type == ALT_SNP_DEL) {
|
|
if(snp.pos + snp.len >= snp2.pos) {
|
|
pass = false;
|
|
break;
|
|
}
|
|
} else {
|
|
if(snp.pos + snp.len - 1 >= snp2.pos) {
|
|
pass = false;
|
|
break;
|
|
}
|
|
}
|
|
} else {
|
|
if(snp.pos >= snp2.pos) {
|
|
pass = false;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
if(!pass) continue;
|
|
|
|
index_t prev_ALT_type = ALT_NONE;
|
|
index_t ID_i = 0;
|
|
for(index_t j = haplotype.left; j <= haplotype.right; j++) {
|
|
if(prev_ALT_type == ALT_SNP_INS) j--;
|
|
const ALT<index_t>* altp = (ID_i < snpIDs.size() ? &(alts[snpIDs[ID_i]]) : NULL);
|
|
assert(altp == NULL || altp->pos >= j);
|
|
if(altp != NULL && altp->pos == j) {
|
|
const ALT<index_t>& alt = *altp;
|
|
assert_lt(alt.pos, s.length());
|
|
assert(alt.snp());
|
|
if(alt.type == ALT_SNP_SGL) {
|
|
assert_eq(alt.len, 1);
|
|
nodes.expand();
|
|
assert_lt(alt.seq, 4);
|
|
assert_neq(alt.seq & 0x3, s[alt.pos]);
|
|
nodes.back().label = "ACGT"[alt.seq];
|
|
nodes.back().value = alt.pos;
|
|
if(prev_ALT_type != ALT_SNP_DEL) {
|
|
edges.expand();
|
|
if(j == haplotype.left) {
|
|
edges.back().from = alt.pos - curr_pos;
|
|
assert_lt(edges.back().from, backbone_nodes);
|
|
} else {
|
|
assert_gt(nodes.size(), 2);
|
|
edges.back().from = (index_t)nodes.size() - 2;
|
|
}
|
|
edges.back().to = (index_t)nodes.size() - 1;
|
|
}
|
|
if(j == haplotype.right) {
|
|
edges.expand();
|
|
edges.back().from = (index_t)nodes.size() - 1;
|
|
edges.back().to = alt.pos - curr_pos + 2;
|
|
assert_lt(edges.back().to, backbone_nodes);
|
|
}
|
|
}
|
|
else if(alt.type == ALT_SNP_DEL) {
|
|
assert_gt(alt.len, 0);
|
|
assert_leq(alt.pos - curr_pos + alt.len, s.length());
|
|
edges.expand();
|
|
if(j == haplotype.left) {
|
|
edges.back().from = alt.pos - curr_pos;
|
|
assert_lt(edges.back().from, backbone_nodes);
|
|
} else {
|
|
edges.back().from = (index_t)nodes.size() - 1;
|
|
}
|
|
j += (alt.len - 1);
|
|
assert_leq(j, haplotype.right);
|
|
if(j == haplotype.right) {
|
|
edges.back().to = alt.pos - curr_pos + alt.len + 1;
|
|
assert_lt(edges.back().to, backbone_nodes);
|
|
} else {
|
|
edges.back().to = (index_t)nodes.size();
|
|
}
|
|
} else {
|
|
assert_eq(alt.type, ALT_SNP_INS)
|
|
assert_gt(alt.len, 0);
|
|
for(size_t k = 0; k < alt.len; k++) {
|
|
uint64_t bp = alt.seq >> ((alt.len - k - 1) << 1);
|
|
bp &= 0x3;
|
|
char ch = "ACGT"[bp];
|
|
nodes.expand();
|
|
nodes.back().label = ch;
|
|
nodes.back().value = (index_t)INDEX_MAX;
|
|
if(prev_ALT_type == ALT_SNP_DEL && k == 0) continue;
|
|
edges.expand();
|
|
edges.back().from = ((k == 0 && j == haplotype.left) ? alt.pos - curr_pos : (index_t)nodes.size() - 2);
|
|
edges.back().to = (index_t)nodes.size() - 1;
|
|
}
|
|
if(j == haplotype.right) {
|
|
edges.expand();
|
|
edges.back().from = (index_t)nodes.size() - 1;
|
|
edges.back().to = alt.pos - curr_pos + 1;
|
|
}
|
|
}
|
|
#ifndef NDEBUG
|
|
snp_set.insert(snpIDs[ID_i]);
|
|
#endif
|
|
ID_i++;
|
|
prev_ALT_type = alt.type;
|
|
} else {
|
|
int nt = s[j];
|
|
assert_lt(nt, 4);
|
|
nodes.expand();
|
|
nodes.back().label = "ACGT"[nt];
|
|
nodes.back().value = j;
|
|
if(prev_ALT_type != ALT_SNP_DEL) {
|
|
edges.expand();
|
|
if(j == haplotype.left && prev_ALT_type == ALT_NONE) {
|
|
edges.back().from = j - curr_pos;
|
|
assert_lt(edges.back().from, backbone_nodes);
|
|
} else {
|
|
edges.back().from = (index_t)nodes.size() - 2;
|
|
}
|
|
edges.back().to = (index_t)nodes.size() - 1;
|
|
}
|
|
if(j == haplotype.right) {
|
|
edges.expand();
|
|
edges.back().from = (index_t)nodes.size() - 1;
|
|
edges.back().to = j - curr_pos + 2;
|
|
assert_lt(edges.back().to, backbone_nodes);
|
|
}
|
|
prev_ALT_type = ALT_SNP_SGL;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Create nodes and edges for splice sites
|
|
for(; alt_idx < alts.size(); alt_idx++) {
|
|
const ALT<index_t>& alt = alts[alt_idx];
|
|
if(alt.pos < curr_pos) continue;
|
|
if(alt.pos >= curr_pos + curr_len) break;
|
|
if(!alt.splicesite()) continue;
|
|
if(alt.excluded) continue;
|
|
assert_lt(alt.left, alt.right);
|
|
edges.expand();
|
|
edges.back().from = alt.left - curr_pos;
|
|
edges.back().to = alt.right - curr_pos + 2;
|
|
assert_lt(edges.back().from, backbone_nodes);
|
|
assert_lt(edges.back().to, backbone_nodes);
|
|
}
|
|
|
|
#ifndef NDEBUG
|
|
if(refGraph.debug) {
|
|
cerr << "Nodes:" << endl;
|
|
for(size_t i = 0; i < nodes.size(); i++) {
|
|
const Node& node = nodes[i];
|
|
cerr << "\t" << i << "\t" << node.label << "\t" << node.value << endl;
|
|
}
|
|
cerr << endl;
|
|
cerr << "Edges: " << endl;
|
|
for(size_t i = 0; i < edges.size(); i++) {
|
|
const Edge& edge = edges[i];
|
|
cerr << "\t" << i << "\t" << edge.from << " --> " << edge.to << endl;
|
|
}
|
|
cerr << endl;
|
|
}
|
|
#endif
|
|
|
|
if(!isReverseDeterministic(nodes, edges)) {
|
|
reverseDeterminize(nodes, edges, lastNode, curr_pos > 0 ? curr_pos + 1 : 0);
|
|
assert(isReverseDeterministic(nodes, edges));
|
|
}
|
|
|
|
// Identify head
|
|
index_t head_node = (index_t)nodes.size();
|
|
for(index_t i = 0; i < nodes.size(); i++) {
|
|
if(nodes[i].label == 'Y') {
|
|
head_node = i;
|
|
break;
|
|
}
|
|
}
|
|
assert_lt(head_node, nodes.size());
|
|
index_t tail_node = lastNode; assert_lt(tail_node, nodes.size());
|
|
|
|
// Update edges
|
|
const index_t invalid = (index_t)INDEX_MAX;
|
|
bool head_off = curr_pos > 0, tail_off = curr_pos + curr_len < jlen;
|
|
for(index_t i = 0; i < edges.size(); i++) {
|
|
index_t from = edges[i].from;
|
|
from = from + num_nodes;
|
|
if(head_off && edges[i].from > head_node) from -= 1;
|
|
if(tail_off && edges[i].from > tail_node) from -= 1;
|
|
if(head_off && edges[i].from == head_node) {
|
|
edges[i].from = invalid;
|
|
} else {
|
|
edges[i].from = from;
|
|
}
|
|
|
|
index_t to = edges[i].to;
|
|
to = to + num_nodes;
|
|
if(head_off && edges[i].to > head_node) to -= 1;
|
|
if(tail_off && edges[i].to > tail_node) to -= 1;
|
|
if(tail_off && edges[i].to == tail_node) {
|
|
edges[i].to = invalid;
|
|
} else {
|
|
edges[i].to = to;
|
|
}
|
|
}
|
|
head_node = tail_node = invalid;
|
|
// Also update lastNode
|
|
if(!tail_off) {
|
|
lastNode += num_nodes;
|
|
if(head_off) lastNode -= 1;
|
|
}
|
|
|
|
// Connect head nodes with tail nodes in the previous automaton
|
|
index_t num_head_nodes = 0;
|
|
index_t tmp_num_edges = (index_t)edges.size();
|
|
if(head_off) {
|
|
EList<index_t> nodes_to_head;
|
|
for(index_t i = 0; i < tmp_num_edges; i++) {
|
|
if(edges[i].from == head_node) {
|
|
num_head_nodes++;
|
|
if(prev_tail_nodes.size() > 0) {
|
|
for(index_t j = 0; j < prev_tail_nodes.size(); j++) {
|
|
edges.expand();
|
|
edges.back().from = prev_tail_nodes[j];
|
|
edges.back().to = edges[i].to;
|
|
assert_lt(edges.back().from, edges.back().to);
|
|
}
|
|
} else {
|
|
nodes_to_head.push_back(edges[i].to);
|
|
}
|
|
}
|
|
}
|
|
|
|
if(nodes_to_head.size() > 0) {
|
|
assert_gt(thread_id, 0);
|
|
assert_eq(prev_tail_nodes.size(), 0);
|
|
writeIndex<index_t>(rg_out_file, (index_t)nodes_to_head.size(), bigEndian);
|
|
for(index_t i = 0; i < nodes_to_head.size(); i++) {
|
|
writeIndex<index_t>(rg_out_file, nodes_to_head[i], bigEndian);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Need to check if it's reverse-deterministic
|
|
if(num_head_nodes > 1) {
|
|
threadParam->multipleHeadNodes = true;
|
|
}
|
|
|
|
// List tail nodes
|
|
prev_tail_nodes.clear();
|
|
if(tail_off) {
|
|
for(index_t i = 0; i < tmp_num_edges; i++) {
|
|
if(edges[i].to == tail_node) {
|
|
prev_tail_nodes.push_back(edges[i].from);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Write nodes and edges
|
|
index_t tmp_num_nodes = (index_t)nodes.size();
|
|
assert_gt(tmp_num_nodes, 2);
|
|
if(head_off) tmp_num_nodes--;
|
|
if(tail_off) tmp_num_nodes--;
|
|
writeIndex<index_t>(rg_out_file, tmp_num_nodes, bigEndian);
|
|
ASSERT_ONLY(index_t num_nodes_written = 0);
|
|
for(index_t i = 0; i < nodes.size(); i++) {
|
|
if(head_off && nodes[i].label == 'Y') continue;
|
|
if(tail_off && nodes[i].label == 'Z') continue;
|
|
nodes[i].write(rg_out_file, bigEndian);
|
|
ASSERT_ONLY(num_nodes_written++);
|
|
}
|
|
assert_eq(tmp_num_nodes, num_nodes_written);
|
|
tmp_num_edges = (index_t)edges.size();
|
|
assert_geq(tmp_num_edges, num_head_nodes + prev_tail_nodes.size());
|
|
if(head_off) tmp_num_edges -= num_head_nodes;
|
|
if(tail_off) tmp_num_edges -= prev_tail_nodes.size();
|
|
writeIndex<index_t>(rg_out_file, tmp_num_edges, bigEndian);
|
|
ASSERT_ONLY(index_t num_edges_written = 0);
|
|
for(index_t i = 0; i < edges.size(); i++) {
|
|
if(head_off && edges[i].from == head_node) continue;
|
|
if(tail_off && edges[i].to == tail_node) continue;
|
|
edges[i].write(rg_out_file, bigEndian);
|
|
ASSERT_ONLY(num_edges_written++);
|
|
}
|
|
assert_eq(tmp_num_edges, num_edges_written);
|
|
|
|
// Clear nodes and edges
|
|
nodes.clear(); edges.clear();
|
|
|
|
curr_pos += curr_len;
|
|
num_nodes += tmp_num_nodes;
|
|
num_edges += tmp_num_edges;
|
|
}
|
|
|
|
if(nthreads > 1 && thread_id + 1 < (index_t)nthreads && prev_tail_nodes.size() > 0) {
|
|
writeIndex<index_t>(rg_out_file, (index_t)prev_tail_nodes.size(), bigEndian);
|
|
for(index_t i = 0; i < prev_tail_nodes.size(); i++) {
|
|
writeIndex<index_t>(rg_out_file, prev_tail_nodes[i], bigEndian);
|
|
}
|
|
}
|
|
|
|
// Close out file handle
|
|
rg_out_file.close();
|
|
}
|
|
|
|
template <typename index_t>
|
|
bool RefGraph<index_t>::isReverseDeterministic(EList<Node>& nodes, EList<Edge>& edges)
|
|
{
|
|
if(edges.size() <= 0) return true;
|
|
|
|
// Sort edges by "to" nodes
|
|
sortEdgesTo(edges);
|
|
|
|
index_t curr_to = (index_t)INDEX_MAX;
|
|
EList<bool> seen; seen.resize(5); seen.fillZero();
|
|
for(index_t i = 0; i < edges.size(); i++) {
|
|
index_t from = edges[i].from;
|
|
assert_lt(from, nodes.size());
|
|
char nt = nodes[from].label;
|
|
assert(nt == 'A' || nt == 'C' || nt == 'G' || nt == 'T' || nt == 'Y');
|
|
if(nt == 'Y') nt = 4;
|
|
else nt = asc2dna[(int)nt];
|
|
assert_lt(nt, seen.size());
|
|
if(curr_to != edges[i].to) {
|
|
curr_to = edges[i].to;
|
|
seen.fillZero();
|
|
seen[nt] = true;
|
|
} else {
|
|
if(seen[nt]) {
|
|
return false;
|
|
}
|
|
seen[nt] = true;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
template <typename index_t>
|
|
void RefGraph<index_t>::reverseDeterminize(EList<Node>& nodes, EList<Edge>& edges, index_t& lastNode, index_t lastNode_add)
|
|
{
|
|
EList<CompositeNode> cnodes; cnodes.ensure(nodes.size());
|
|
map<CompositeNodeIDs, index_t> cnode_map;
|
|
deque<index_t> active_cnodes;
|
|
EList<CompositeEdge> cedges; cedges.ensure(edges.size());
|
|
|
|
// Start from the final node ('Z')
|
|
assert_lt(lastNode, nodes.size());
|
|
const Node& last_node = nodes[lastNode];
|
|
cnodes.expand();
|
|
cnodes.back().reset();
|
|
cnodes.back().label = last_node.label;
|
|
cnodes.back().value = last_node.value;
|
|
cnodes.back().nodes.push_back(lastNode);
|
|
active_cnodes.push_back(0);
|
|
cnode_map[cnodes.back().nodes] = 0;
|
|
sortEdgesTo(edges);
|
|
|
|
index_t firstNode = 0; // Y -> ... -> Z
|
|
EList<index_t> predecessors;
|
|
while(!active_cnodes.empty()) {
|
|
index_t cnode_id = active_cnodes.front(); active_cnodes.pop_front();
|
|
assert_lt(cnode_id, cnodes.size());
|
|
|
|
// Find predecessors of this composite node
|
|
predecessors.clear();
|
|
for(size_t i = 0; i < cnodes[cnode_id].nodes.size(); i++) {
|
|
index_t node_id = cnodes[cnode_id].nodes.getID((index_t)i);
|
|
pair<index_t, index_t> edge_range = findEdgesTo(edges, node_id);
|
|
assert_leq(edge_range.first, edge_range.second);
|
|
assert_leq(edge_range.second, edges.size());
|
|
for(index_t j = edge_range.first; j < edge_range.second; j++) {
|
|
assert_eq(edges[j].to, node_id);
|
|
predecessors.push_back(edges[j].from);
|
|
}
|
|
}
|
|
|
|
if(predecessors.size() >= 2) {
|
|
// Remove redundant nodes
|
|
predecessors.sort();
|
|
index_t new_size = (index_t)(unique(predecessors.begin(), predecessors.end()) - predecessors.begin());
|
|
predecessors.resize(new_size);
|
|
|
|
// Create composite nodes by labels
|
|
stable_sort(predecessors.begin(), predecessors.end(), TempNodeLabelCmp(nodes));
|
|
}
|
|
|
|
for(size_t i = 0; i < predecessors.size();) {
|
|
index_t node_id = predecessors[i];
|
|
assert_lt(node_id, nodes.size());
|
|
const Node& node = nodes[node_id]; i++;
|
|
|
|
cnodes.expand();
|
|
cnodes.back().reset();
|
|
cnodes.back().label = node.label;
|
|
cnodes.back().value = node.value;
|
|
cnodes.back().nodes.push_back(node_id);
|
|
|
|
if(node.label == 'Y' && firstNode == 0) {
|
|
firstNode = (index_t)cnodes.size() - 1;
|
|
}
|
|
|
|
while(i < predecessors.size()) {
|
|
index_t next_node_id = predecessors[i];
|
|
assert_lt(next_node_id, nodes.size());
|
|
const Node& next_node = nodes[next_node_id];
|
|
if(next_node.label != node.label) break;
|
|
cnodes.back().nodes.push_back(next_node_id);
|
|
if(next_node.value != (index_t)INDEX_MAX) {
|
|
if(cnodes.back().value == (index_t)INDEX_MAX) {
|
|
cnodes.back().value = next_node.value;
|
|
} else {
|
|
cnodes.back().value = max(cnodes.back().value, next_node.value);
|
|
}
|
|
}
|
|
i++;
|
|
}
|
|
|
|
// Create edges from this new composite node to current composite node
|
|
typename map<CompositeNodeIDs, index_t>::iterator existing = cnode_map.find(cnodes.back().nodes);
|
|
if(existing == cnode_map.end()) {
|
|
cnode_map[cnodes.back().nodes] = (index_t)cnodes.size() - 1;
|
|
active_cnodes.push_back((index_t)cnodes.size() - 1);
|
|
cedges.push_back(CompositeEdge((index_t)cnodes.size() - 1, cnode_id));
|
|
} else {
|
|
cnodes.pop_back();
|
|
cedges.push_back(CompositeEdge((*existing).second, cnode_id));
|
|
}
|
|
|
|
// Increment indegree
|
|
cnodes[cnode_id].id++;
|
|
}
|
|
}
|
|
|
|
// Interchange from and to
|
|
for(index_t i = 0; i < cedges.size(); i++) {
|
|
index_t tmp = cedges[i].from;
|
|
cedges[i].from = cedges[i].to;
|
|
cedges[i].to = tmp;
|
|
}
|
|
sort(cedges.begin(), cedges.end());
|
|
active_cnodes.push_back(0);
|
|
while(!active_cnodes.empty()) {
|
|
index_t cnode_id = active_cnodes.front(); active_cnodes.pop_front();
|
|
assert_lt(cnode_id, cnodes.size());
|
|
const CompositeNode& cnode = cnodes[cnode_id];
|
|
index_t i = (index_t)cedges.bsearchLoBound(CompositeEdge(cnode_id, 0));
|
|
while(i < cedges.size()) {
|
|
assert_geq(cedges[i].from, cnode_id);
|
|
if(cedges[i].from != cnode_id) break;
|
|
index_t predecessor_cnode_id = cedges[i].to;
|
|
assert_lt(predecessor_cnode_id, cnodes.size());
|
|
CompositeNode& predecessor_cnode = cnodes[predecessor_cnode_id];
|
|
if(cnode.value == predecessor_cnode.value + 1) {
|
|
active_cnodes.push_back(predecessor_cnode_id);
|
|
break;
|
|
}
|
|
i++;
|
|
}
|
|
}
|
|
// Restore from and to by interchanging them
|
|
for(index_t i = 0; i < cedges.size(); i++) {
|
|
index_t tmp = cedges[i].from;
|
|
cedges[i].from = cedges[i].to;
|
|
cedges[i].to = tmp;
|
|
}
|
|
|
|
// Create new nodes
|
|
lastNode = 0; // Invalidate lastNode
|
|
nodes.resizeExact(cnodes.size()); nodes.clear();
|
|
assert_neq(firstNode, 0);
|
|
assert_lt(firstNode, cnodes.size());
|
|
CompositeNode& first_node = cnodes[firstNode];
|
|
first_node.id = 0;
|
|
nodes.expand();
|
|
nodes.back() = first_node.getNode();
|
|
active_cnodes.push_back(firstNode);
|
|
sort(cedges.begin(), cedges.end());
|
|
while(!active_cnodes.empty()) {
|
|
index_t cnode_id = active_cnodes.front(); active_cnodes.pop_front();
|
|
assert_lt(cnode_id, cnodes.size());
|
|
index_t i = (index_t)cedges.bsearchLoBound(CompositeEdge(cnode_id, 0));
|
|
while(i < cedges.size()) {
|
|
assert_geq(cedges[i].from, cnode_id);
|
|
if(cedges[i].from != cnode_id) break;
|
|
index_t successor_cnode_id = cedges[i].to;
|
|
assert_lt(successor_cnode_id, cnodes.size());
|
|
CompositeNode& successor_cnode = cnodes[successor_cnode_id];
|
|
assert_gt(successor_cnode.id, 0);
|
|
successor_cnode.id--;
|
|
if(successor_cnode.id == 0) {
|
|
active_cnodes.push_back(successor_cnode_id);
|
|
successor_cnode.id = (index_t)nodes.size();
|
|
nodes.expand();
|
|
nodes.back() = successor_cnode.getNode();
|
|
if(nodes.back().label == 'Z') {
|
|
assert_eq(lastNode, 0);
|
|
assert_gt(nodes.size(), 1);
|
|
lastNode = (index_t)nodes.size() - 1;
|
|
}
|
|
}
|
|
i++;
|
|
}
|
|
}
|
|
|
|
// Create new edges
|
|
edges.resizeExact(cedges.size()); edges.clear();
|
|
for(index_t i = 0; i < cedges.size(); i++) {
|
|
const CompositeEdge& edge = cedges[i];
|
|
edges.expand();
|
|
edges.back() = edge.getEdge(cnodes);
|
|
}
|
|
sortEdgesFrom(edges);
|
|
|
|
#if 0
|
|
#ifndef NDEBUG
|
|
if(debug) {
|
|
cerr << "Nodes:" << endl;
|
|
for(size_t i = 0; i < nodes.size(); i++) {
|
|
const Node& node = nodes[i];
|
|
cerr << "\t" << i << "\t" << node.label << "\t" << node.value << (node.backbone ? "\tbackbone" : "") << endl;
|
|
}
|
|
cerr << endl;
|
|
cerr << "Edges: " << endl;
|
|
for(size_t i = 0; i < edges.size(); i++) {
|
|
const Edge& edge = edges[i];
|
|
cerr << "\t" << i << "\t" << edge.from << " --> " << edge.to << endl;
|
|
}
|
|
cerr << endl;
|
|
}
|
|
#endif
|
|
#endif
|
|
}
|
|
|
|
template <typename index_t>
|
|
class PathGraph {
|
|
public:
|
|
struct PathNode {
|
|
index_t from;
|
|
index_t to;
|
|
pair<index_t, index_t> key;
|
|
|
|
void setSorted() { to = (index_t)INDEX_MAX; }
|
|
bool isSorted() const { return to == (index_t)INDEX_MAX; }
|
|
|
|
index_t value() const { return to; }
|
|
index_t outdegree() const { return key.first; }
|
|
|
|
bool operator< (const PathNode& o) const {
|
|
return key < o.key;
|
|
};
|
|
};
|
|
|
|
static inline index_t PathNodeFrom (PathNode& a) {
|
|
return a.from;
|
|
}
|
|
|
|
static inline index_t PathNodeKey (PathNode& a) {
|
|
return a.key.first;
|
|
}
|
|
|
|
struct PathNodeKeySecondCmp {
|
|
bool operator() (const PathNode& a, const PathNode& b) const {
|
|
return a.key.second < b.key.second;
|
|
}
|
|
};
|
|
|
|
struct PathNodeFromCmp {
|
|
bool operator() (const PathNode& a, const PathNode& b) const {
|
|
return a.from < b.from;
|
|
}
|
|
};
|
|
|
|
struct PathNodeToCmp {
|
|
bool operator() (const PathNode& a, const PathNode& b) const {
|
|
return a.to < b.to;
|
|
}
|
|
};
|
|
|
|
struct PathEdge {
|
|
index_t from;
|
|
union {
|
|
index_t to;
|
|
index_t ranking;
|
|
};
|
|
char label;
|
|
|
|
PathEdge() { reset(); }
|
|
|
|
PathEdge(index_t from_, index_t ranking_, char label_) : from(from_), ranking(ranking_), label(label_) {}
|
|
|
|
void reset() {
|
|
from = 0;
|
|
ranking = 0;
|
|
label = 0;
|
|
}
|
|
|
|
bool operator< (const PathEdge& o) const {
|
|
return label < o.label || (label == o.label && ranking < o.ranking);
|
|
};
|
|
|
|
};
|
|
|
|
static inline index_t PathEdgeTo (PathEdge& a) {
|
|
return a.to;
|
|
}
|
|
|
|
struct PathEdgeFromCmp {
|
|
bool operator() (const PathEdge& a, const PathEdge& b) const {
|
|
return a.from < b.from || (a.from == b.from && a.to < b.to);
|
|
}
|
|
};
|
|
|
|
struct PathEdgeToCmp {
|
|
bool operator() (const PathEdge& a, const PathEdge& b) const {
|
|
return a.to < b.to || (a.to == b.to && a.from < b.from);
|
|
}
|
|
};
|
|
|
|
public:
|
|
// Create a new graph in which paths are represented using nodes
|
|
PathGraph(
|
|
RefGraph<index_t>& parent,
|
|
const string& base_fname,
|
|
size_t max_num_nodes_ = std::numeric_limits<size_t>::max(),
|
|
int nthreads_ = 1,
|
|
bool verbose_ = false);
|
|
|
|
~PathGraph() {}
|
|
|
|
void printInfo();
|
|
|
|
bool generateEdges(RefGraph<index_t>& parent);
|
|
|
|
index_t getNumNodes() const { return (index_t)nodes.size(); }
|
|
index_t getNumEdges() const { return (index_t)edges.size(); }
|
|
|
|
bool isSorted() const { return sorted; }
|
|
|
|
bool nextRow(int& gbwtChar, int& F, int& M, index_t& pos) {
|
|
if(report_node_idx >= nodes.size()) return false;
|
|
bool firstOutEdge = false;
|
|
if(report_edge_range.first >= report_edge_range.second) {
|
|
report_edge_range = getEdges(report_node_idx, false /* from? */);
|
|
firstOutEdge = true;
|
|
if(report_node_idx == 0) {
|
|
report_M = pair<index_t, index_t>(0, 0);
|
|
}
|
|
}
|
|
assert_lt(report_edge_range.first, report_edge_range.second);
|
|
assert_lt(report_edge_range.first, edges.size());
|
|
const PathEdge& edge = edges[report_edge_range.first];
|
|
gbwtChar = edge.label;
|
|
assert_lt(report_node_idx, nodes.size());
|
|
F = (firstOutEdge ? 1 : 0);
|
|
|
|
report_edge_range.first++;
|
|
if(report_edge_range.first >= report_edge_range.second) {
|
|
report_node_idx++;
|
|
}
|
|
assert_lt(report_M.first, nodes.size());
|
|
pos = nodes[report_M.first].to;
|
|
M = (report_M.second == 0 ? 1 : 0);
|
|
report_M.second++;
|
|
if(report_M.second >= nodes[report_M.first].key.first) {
|
|
report_M.first++;
|
|
report_M.second = 0;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
index_t nextFLocation() {
|
|
if(report_F_node_idx >= nodes.size()) return (index_t)INDEX_MAX;
|
|
index_t ret = report_F_location;
|
|
pair<index_t, index_t> edge_range = getEdges(report_F_node_idx, false /* from? */);
|
|
report_F_node_idx++;
|
|
assert_lt(edge_range.first, edge_range.second);
|
|
report_F_location += (edge_range.second - edge_range.first);
|
|
return ret;
|
|
}
|
|
|
|
private:
|
|
void makeFromRef(RefGraph<index_t>& base);
|
|
void generationOne();
|
|
void earlyGeneration();
|
|
void firstPruneGeneration();
|
|
void lateGeneration();
|
|
|
|
void mergeUpdateRank();
|
|
pair<index_t, index_t> nextMaximalSet(pair<index_t, index_t> range);
|
|
pair<index_t, index_t> getEdges(index_t node, bool by_from); // Create index first.
|
|
|
|
struct CreateNewNodesParams {
|
|
PathNode* st;
|
|
PathNode* en;
|
|
PathNode* curr;
|
|
index_t* sub_temp_nodes;
|
|
PathGraph<index_t>* graph;
|
|
};
|
|
static void createNewNodesCounter(void* vp);
|
|
static void createNewNodesMaker(void* vp);
|
|
void createNewNodes();
|
|
|
|
struct GenEdgesParams {
|
|
typename RefGraph<index_t>::Edge* st;
|
|
typename RefGraph<index_t>::Edge* en;
|
|
EList<index_t, 6>* label_index;
|
|
EList<PathNode>* nodes;
|
|
EList<PathEdge>* edges;
|
|
EList<typename RefGraph<index_t>::Node>* ref_nodes;
|
|
};
|
|
static void generateEdgesCounter(void* vp);
|
|
static void generateEdgesMaker(void* vp);
|
|
|
|
private:
|
|
int nthreads;
|
|
bool verbose;
|
|
EList<PathNode> from_table;
|
|
EList<PathNode> past_nodes;
|
|
EList<PathNode> nodes;
|
|
EList<PathEdge> edges;
|
|
index_t ranks;
|
|
index_t max_from; //number of nodes in RefGraph
|
|
index_t temp_nodes; // Total number of nodes created before sorting.
|
|
index_t generation; // Sorted by paths of length 2^generation.
|
|
bool sorted;
|
|
|
|
// For reporting GBWT char, F, and M values
|
|
index_t report_node_idx;
|
|
pair<index_t, index_t> report_edge_range;
|
|
pair<index_t, index_t> report_M;
|
|
// For reporting location in F corresponding to 1 bit in M
|
|
index_t report_F_node_idx;
|
|
index_t report_F_location;
|
|
|
|
size_t max_num_nodes;
|
|
|
|
// following variables are for debugging purposes
|
|
#ifndef NDEBUG
|
|
bool debug;
|
|
#endif
|
|
|
|
EList<char> bwt_string;
|
|
EList<char> F_array;
|
|
EList<char> M_array;
|
|
EList<index_t> bwt_counts;
|
|
|
|
// brute-force implementations
|
|
index_t select(const EList<char>& array, index_t p, char c) {
|
|
if(p <= 0) return 0;
|
|
for(index_t i = 0; i < array.size(); i++) {
|
|
if(array[i] == c) {
|
|
assert_gt(p, 0);
|
|
p--;
|
|
if(p == 0)
|
|
return i;
|
|
}
|
|
}
|
|
return (index_t)array.size();
|
|
}
|
|
|
|
index_t select1(const EList<char>& array, index_t p) {
|
|
return select(array, p, 1);
|
|
}
|
|
|
|
index_t rank(const EList<char>& array, index_t p, char c) {
|
|
index_t count = 0;
|
|
assert_lt(p, array.size());
|
|
for(index_t i = 0; i <= p; i++) {
|
|
if(array[i] == c)
|
|
count++;
|
|
}
|
|
return count;
|
|
}
|
|
|
|
index_t rank1(const EList<char>& array, index_t p) {
|
|
return rank(array, p, 1);
|
|
}
|
|
|
|
// for debugging purposes
|
|
#ifndef NDEBUG
|
|
public: EList<pair<index_t, index_t> > ftab;
|
|
#endif
|
|
};
|
|
|
|
//creates prefix-sorted PathGraph Nodes given a reverse determinized RefGraph
|
|
//outputs nodes sorted by their from attribute
|
|
template <typename index_t>
|
|
PathGraph<index_t>::PathGraph(
|
|
RefGraph<index_t>& base,
|
|
const string& base_fname,
|
|
size_t max_num_nodes_,
|
|
int nthreads_,
|
|
bool verbose_) :
|
|
nthreads(nthreads_), verbose(verbose_),
|
|
ranks(0), temp_nodes(0), generation(0), sorted(false),
|
|
report_node_idx(0), report_edge_range(pair<index_t, index_t>(0, 0)), report_M(pair<index_t, index_t>(0, 0)),
|
|
report_F_node_idx(0), report_F_location(0),
|
|
max_num_nodes(max_num_nodes_)
|
|
{
|
|
#ifndef NDEBUG
|
|
debug = base.nodes.size() <= 20;
|
|
#endif
|
|
// Fill nodes with a PathNode for each edge in base.edges.
|
|
// Set max_from.
|
|
makeFromRef(base);
|
|
|
|
// Write RefGraph into a file
|
|
const bool file_rf = base.nodes.size() > (1 << 22);
|
|
const bool bigEndian = false;
|
|
const string rf_fname = base_fname + ".rf";
|
|
if(file_rf) {
|
|
base.write(rf_fname, bigEndian);
|
|
base.nullify();
|
|
}
|
|
|
|
// In the first generation the nodes enter, not quite sorted by from.
|
|
// We use a counting sort to sort the nodes, otherwise same as early generation.
|
|
generationOne();
|
|
// In early generations no nodes become sorted.
|
|
// Therefore, we skip the pruning step and leave the
|
|
// nodes sorted by from.
|
|
while(generation < 3) {
|
|
earlyGeneration();
|
|
}
|
|
// On the first generation we perform a pruning step,
|
|
// we are forced to sort the entire list of nodes by rank
|
|
// in order to perform pruning step.
|
|
firstPruneGeneration();
|
|
// In later generations, most nodes are already sorted, so we
|
|
// perform a more expensive random access join with nodes in rank order
|
|
// in return for avoiding having to sort by rank in order to prune nodes.
|
|
while(!isSorted()) {
|
|
lateGeneration();
|
|
}
|
|
// In the generateEdges method it is convenient to begin with nodes sorted by from.
|
|
// We perform this action here, while we still have past_nodes allocated to avoid
|
|
// an in-place sort.
|
|
nodes.resizeNoCopyExact(past_nodes.size());
|
|
radix_sort_copy<PathNode, PathNodeFromCmp, index_t>(past_nodes.begin(), past_nodes.end(), nodes.ptr(),
|
|
&PathNodeFrom, max_from, nthreads);
|
|
past_nodes.nullify();
|
|
from_table.nullify();
|
|
|
|
if(file_rf) {
|
|
base.read(rf_fname, bigEndian);
|
|
std::remove(rf_fname.c_str());
|
|
}
|
|
}
|
|
|
|
//make original unsorted PathNodes given a RefGraph
|
|
template <typename index_t>
|
|
void PathGraph<index_t>::makeFromRef(RefGraph<index_t>& base) {
|
|
// Create a path node per edge with a key set to from node's label
|
|
temp_nodes = (index_t)base.edges.size() + 1;
|
|
max_from = 0;
|
|
nodes.reserveExact(temp_nodes);
|
|
for(index_t i = 0; i < base.edges.size(); i++) {
|
|
const typename RefGraph<index_t>::Edge& e = base.edges[i];
|
|
nodes.expand();
|
|
nodes.back().from = e.from;
|
|
if(e.from > max_from) max_from = e.from;
|
|
nodes.back().to = e.to;
|
|
|
|
switch(base.nodes[e.from].label) {
|
|
case 'A':
|
|
nodes.back().key = pair<index_t, index_t>(0, 0);
|
|
break;
|
|
case 'C':
|
|
nodes.back().key = pair<index_t, index_t>(1, 0);
|
|
break;
|
|
case 'G':
|
|
nodes.back().key = pair<index_t, index_t>(2, 0);
|
|
break;
|
|
case 'T':
|
|
nodes.back().key = pair<index_t, index_t>(3, 0);
|
|
break;
|
|
case 'Y':
|
|
nodes.back().key = pair<index_t, index_t>(4, 0);
|
|
break;
|
|
default:
|
|
assert(false);
|
|
throw 1;
|
|
}
|
|
}
|
|
// Final node.
|
|
assert_lt(base.lastNode, base.nodes.size());
|
|
assert_eq(base.nodes[base.lastNode].label, 'Z');
|
|
nodes.expand();
|
|
nodes.back().from = nodes.back().to = base.lastNode;
|
|
if(base.lastNode > max_from) max_from = base.lastNode;
|
|
nodes.back().key = pair<index_t, index_t>(5, 0);
|
|
printInfo();
|
|
}
|
|
|
|
template <typename index_t>
|
|
void PathGraph<index_t>::generationOne() {
|
|
//nodes enter almost sorted by from
|
|
//this is only generation method that whose
|
|
// incoming nodes are in the nodes EList
|
|
generation++;
|
|
//Sort nodes by from using counting sort
|
|
//Copy into past_nodes in the process
|
|
//first count number with each from value
|
|
for(PathNode* node = nodes.begin(); node != nodes.end(); node++) {
|
|
nodes[node->from].key.second++;
|
|
}
|
|
//convert into an index
|
|
index_t tot = nodes[0].key.second;
|
|
nodes[0].key.second = 0;
|
|
for(index_t i = 1; i < max_from + 2; i++) {
|
|
tot += nodes[i].key.second;
|
|
nodes[i].key.second = tot - nodes[i].key.second;
|
|
}
|
|
// use past_nodes as from_table
|
|
past_nodes.resizeExact(nodes.size());
|
|
for(PathNode* node = nodes.begin(); node != nodes.end(); node++) {
|
|
past_nodes[nodes[node->from].key.second++] = *node;
|
|
}
|
|
//reset index
|
|
for(index_t i = max_from + 1; i > 0; i--) {
|
|
past_nodes[i].key.second = nodes[i - 1].key.second;
|
|
}
|
|
past_nodes[0].key.second = 0;
|
|
//Now query direct-access table
|
|
createNewNodes();
|
|
printInfo();
|
|
past_nodes.swap(nodes);
|
|
}
|
|
|
|
template <typename index_t>
|
|
void PathGraph<index_t>::earlyGeneration() {
|
|
//past_nodes enter sorted by from
|
|
//do not yet need to perform pruning step
|
|
generation++;
|
|
for(index_t i = 0; i < past_nodes.size(); i++) {
|
|
past_nodes[past_nodes[i].from + 1].key.second = i + 1;
|
|
}
|
|
createNewNodes();
|
|
printInfo();
|
|
past_nodes.swap(nodes);
|
|
}
|
|
|
|
template <typename index_t>
|
|
void PathGraph<index_t>::firstPruneGeneration() {
|
|
//past_nodes enter sorted by from
|
|
//first generation where we need to perform pruning step
|
|
// results in us needing to sort entirety of nodes after they are made
|
|
generation++;
|
|
//here past_nodes is already sorted by .from
|
|
// first count where to start each from value
|
|
time_t start = time(0);
|
|
//Build from_index
|
|
for(index_t i = 0; i < past_nodes.size(); i++) {
|
|
past_nodes[past_nodes[i].from + 1].key.second = i + 1;
|
|
}
|
|
if(verbose) cerr << "BUILT FROM_INDEX: " << time(0) - start << endl;
|
|
start = time(0);
|
|
// Now query against direct-access table
|
|
createNewNodes();
|
|
past_nodes.resizeNoCopyExact(nodes.size());
|
|
|
|
if(verbose) cerr << "RESIZE NODES: " << time(0) - start << endl;
|
|
start = time(0);
|
|
|
|
//max_rank always corresponds to repeated Z's
|
|
// Z is mapped to 0x101
|
|
// therefore max rank = 101101101101101101101101 = (101) 8 times
|
|
index_t max_rank = 11983725;
|
|
radix_sort_copy<PathNode, less<PathNode>, index_t>(nodes.begin(), nodes.end(), past_nodes.ptr(),
|
|
&PathNodeKey, max_rank, nthreads);
|
|
|
|
if(verbose) cerr << "SORT NODES: " << time(0) - start << endl;
|
|
start = time(0);
|
|
|
|
nodes.swap(past_nodes);
|
|
mergeUpdateRank();
|
|
|
|
if(verbose) cerr << "MERGE, UPDATE RANK: " << time(0) - start << endl;
|
|
start = time(0);
|
|
|
|
printInfo();
|
|
past_nodes.swap(nodes);
|
|
}
|
|
|
|
template <typename index_t>
|
|
void PathGraph<index_t>::lateGeneration() {
|
|
//past_nodes enter sorted by rank
|
|
//build direct-access table sorted by from,
|
|
//but query with original nodes sorted by rank
|
|
//since nodes we query with are sorted by rank,
|
|
// the nodes produced are automatically sorted by key.first
|
|
// therefore we only need to sort clusters with same key.first
|
|
generation++;
|
|
time_t overall = time(0);
|
|
time_t indiv = time(0);
|
|
assert_gt(nthreads, 0);
|
|
assert_neq(past_nodes.size(), ranks);
|
|
from_table.resizeNoCopy(past_nodes.size());
|
|
|
|
if(verbose) cerr << "ALLOCATE FROM_TABLE: " << time(0) - indiv << endl;
|
|
indiv = time(0);
|
|
|
|
radix_sort_copy<PathNode, PathNodeFromCmp, index_t>(past_nodes.begin(), past_nodes.end(), from_table.ptr(),
|
|
&PathNodeFrom, max_from, nthreads);
|
|
|
|
if(verbose) cerr << "BUILD TABLE: " << time(0) - indiv << endl;
|
|
indiv = time(0);
|
|
|
|
//Build from_index
|
|
index_t from_table_size = from_table.size();
|
|
for(index_t i = 0; i < from_table_size; i++) {
|
|
if(from_table[i].from + 1 >= from_table.size()) {
|
|
from_table.resize(from_table[i].from + 2);
|
|
}
|
|
from_table[from_table[i].from + 1].key.second = i + 1;
|
|
}
|
|
|
|
if(verbose) cerr << "BUILD INDEX: " << time(0) - indiv << endl;
|
|
|
|
createNewNodes();
|
|
|
|
indiv = time(0);
|
|
|
|
mergeUpdateRank();
|
|
|
|
if(from_table_size != from_table.size()) {
|
|
assert_lt(from_table_size, from_table.size());
|
|
from_table.resize(from_table_size);
|
|
}
|
|
|
|
if(verbose) cerr << "MERGEUPDATERANK: " << time(0) - indiv << endl;
|
|
if(verbose) cerr << "TOTAL TIME: " << time(0) - overall << endl;
|
|
|
|
if(ranks >= (index_t)max_num_nodes) {
|
|
throw ExplosionException();
|
|
}
|
|
|
|
printInfo();
|
|
past_nodes.swap(nodes);
|
|
}
|
|
|
|
//-----------------------------------------------------------------------------------------------
|
|
|
|
template <typename index_t>
|
|
void PathGraph<index_t>::createNewNodesCounter(void* vp) {
|
|
CreateNewNodesParams* params = (CreateNewNodesParams*)vp;
|
|
PathNode* st = params->st;
|
|
PathNode* en = params->en;
|
|
PathGraph<index_t>& graph = *(params->graph);
|
|
|
|
size_t count = 0;
|
|
if(graph.generation > 4) {
|
|
for(PathNode* node = st; node != en; node++) {
|
|
if(node->isSorted()) {
|
|
count++;
|
|
} else {
|
|
count += graph.from_table[node->to + 1].key.second - graph.from_table[node->to].key.second;
|
|
}
|
|
}
|
|
} else {
|
|
for(PathNode* node = st; node != en; node++) {
|
|
count += graph.past_nodes[node->to + 1].key.second - graph.past_nodes[node->to].key.second;
|
|
}
|
|
}
|
|
*(params->sub_temp_nodes) = (index_t)count;
|
|
|
|
//check for overflow
|
|
if(count > (index_t)-1) {
|
|
cerr << "exceeded integer bounds, remove adjacent SNPs, use haplotypes, or switch to a large index (--large-index)" << endl;
|
|
throw 1;
|
|
}
|
|
}
|
|
template <typename index_t>
|
|
void PathGraph<index_t>::createNewNodesMaker(void* vp) {
|
|
CreateNewNodesParams* params = (CreateNewNodesParams*)vp;
|
|
PathNode* st = params->st;
|
|
PathNode* en = params->en;
|
|
PathNode* curr = params->curr;
|
|
PathGraph<index_t>& graph = *(params->graph);
|
|
if(graph.generation > 4) {
|
|
for(PathNode* node = st; node != en; node++) {
|
|
if(node->isSorted()) {
|
|
*curr++ = *node;
|
|
} else {
|
|
for(index_t j = graph.from_table[node->to].key.second; j < graph.from_table[node->to + 1].key.second; j++) {
|
|
curr->from = node->from;
|
|
curr->to = graph.from_table[j].to;
|
|
(curr++)->key = pair<index_t, index_t>(node->key.first, graph.from_table[j].key.first);
|
|
}
|
|
}
|
|
}
|
|
} else if(graph.generation == 4) {
|
|
for(PathNode* node = st; node != en; node++) {
|
|
for(index_t j = graph.past_nodes[node->to].key.second; j < graph.past_nodes[node->to + 1].key.second; j++) {
|
|
curr->from = node->from;
|
|
curr->to = graph.past_nodes[j].to;
|
|
(curr++)->key = pair<index_t, index_t>(node->key.first, graph.past_nodes[j].key.first);
|
|
}
|
|
}
|
|
} else {
|
|
for(PathNode* node = st; node != en; node++) {
|
|
for(index_t j = graph.past_nodes[node->to].key.second; j < graph.past_nodes[node->to + 1].key.second; j++) {
|
|
curr->from = node->from;
|
|
curr->to = graph.past_nodes[j].to;
|
|
index_t bit_shift = 1 << (graph.generation - 1);
|
|
bit_shift = (bit_shift << 1) + bit_shift;
|
|
(curr++)->key = pair<index_t, index_t>((node->key.first << bit_shift) + graph.past_nodes[j].key.first, 0);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
template <typename index_t>
|
|
void PathGraph<index_t>::createNewNodes() {
|
|
time_t indiv = time(0);
|
|
AutoArray<tthread::thread*> threads(nthreads);
|
|
EList<CreateNewNodesParams> params; params.resizeExact(nthreads);
|
|
EList<index_t> sub_temp_nodes; sub_temp_nodes.resizeExact(nthreads); sub_temp_nodes.fillZero();
|
|
PathNode* st = past_nodes.begin();
|
|
PathNode* en = st + past_nodes.size() / nthreads;
|
|
for(int i = 0; i < nthreads; i++) {
|
|
params[i].sub_temp_nodes = &sub_temp_nodes[i];
|
|
params[i].st = st;
|
|
params[i].en = en;
|
|
params[i].graph = this;
|
|
if(nthreads == 1) {
|
|
createNewNodesCounter((void*)¶ms[0]);
|
|
} else {
|
|
threads[i] = new tthread::thread(&createNewNodesCounter, (void*)¶ms[i]);
|
|
}
|
|
st = en;
|
|
if(i + 2 == nthreads) {
|
|
en = past_nodes.end();
|
|
} else {
|
|
en = st + past_nodes.size() / nthreads;
|
|
}
|
|
}
|
|
|
|
if(nthreads > 1) {
|
|
for(int i = 0; i < nthreads; i++)
|
|
threads[i]->join();
|
|
}
|
|
if(verbose) cerr << "COUNTED NEW NODES: " << time(0) - indiv << endl;
|
|
indiv = time(0);
|
|
//update all label indexes
|
|
temp_nodes = 0;
|
|
for(int i = 0; i < nthreads; i++) {
|
|
// done to check if we exceed index_t range
|
|
size_t val = (size_t)temp_nodes + (size_t)sub_temp_nodes[i];
|
|
if(val > (index_t)-1) {
|
|
cerr << "exceeded integer bounds, remove adjacent SNPs, use haplotypes, or switch to a large index (--large-index)" << endl;
|
|
throw 1;
|
|
}
|
|
temp_nodes = (index_t)val;
|
|
}
|
|
if(verbose) cerr << "COUNTED TEMP NODES: " << time(0) - indiv << endl;
|
|
indiv = time(0);
|
|
nodes.resizeNoCopyExact(temp_nodes);
|
|
if(verbose) cerr << "RESIZED NODES: " << time(0) - indiv << endl;
|
|
indiv = time(0);
|
|
temp_nodes = 0;
|
|
for(int i = 0; i < nthreads; i++) {
|
|
params[i].curr = nodes.begin() + temp_nodes;
|
|
temp_nodes += sub_temp_nodes[i];
|
|
}
|
|
if(verbose) cerr << "RESIZED NODES: " << time(0) - indiv << endl;
|
|
indiv = time(0);
|
|
//make new nodes
|
|
for(int i = 0; i < nthreads; i++) {
|
|
if(nthreads == 1) {
|
|
createNewNodesMaker((void*)¶ms[0]);
|
|
} else {
|
|
threads[i] = new tthread::thread(&createNewNodesMaker, (void*)¶ms[i]);
|
|
}
|
|
}
|
|
|
|
if(nthreads > 1) {
|
|
for(int i = 0; i < nthreads; i++)
|
|
threads[i]->join();
|
|
}
|
|
if(verbose) cerr << "MADE NEW NODES: " << time(0) - indiv << endl;
|
|
indiv = time(0);
|
|
}
|
|
|
|
//------------------------------------------------------------------------------------
|
|
|
|
template <typename index_t>
|
|
void PathGraph<index_t>::mergeUpdateRank()
|
|
{
|
|
if(generation == 4) {
|
|
// Merge equivalent nodes
|
|
index_t curr = 0;
|
|
pair<index_t, index_t> range(0, 0); // Empty range
|
|
while(true) {
|
|
range = nextMaximalSet(range);
|
|
if(range.first >= range.second)
|
|
break;
|
|
nodes[curr] = nodes[range.first]; curr++;
|
|
}
|
|
nodes.resize(curr);
|
|
|
|
// Set nodes that become sorted as sorted
|
|
PathNode* candidate = &nodes.front();
|
|
pair<index_t, index_t> key = candidate->key; ranks = 1;
|
|
for(index_t i = 1; i < nodes.size(); i++) {
|
|
if(nodes[i].key != key) {
|
|
if(candidate != NULL) {
|
|
candidate->setSorted();
|
|
}
|
|
candidate = &nodes[i];
|
|
key = candidate->key; ranks++;
|
|
} else {
|
|
candidate = NULL;
|
|
}
|
|
}
|
|
if(candidate != NULL) {
|
|
candidate->setSorted();
|
|
}
|
|
ranks = 0;
|
|
key = nodes.front().key;
|
|
for(index_t i = 0; i < nodes.size(); i++) {
|
|
PathNode& node = nodes[i];
|
|
if(node.key != key) {
|
|
key = node.key;
|
|
ranks++;
|
|
}
|
|
node.key = pair<index_t, index_t>(ranks, 0);
|
|
}
|
|
ranks++;
|
|
} else {
|
|
PathNode* block_start = nodes.begin();
|
|
PathNode* curr = nodes.begin();
|
|
PathNode* node = nodes.begin();
|
|
ranks = 0;
|
|
do {
|
|
node++;
|
|
if(node == nodes.end() || node->key.first != block_start->key.first) {
|
|
if(node - block_start == 1) {
|
|
block_start->key.first = ranks++;
|
|
*curr++ = *block_start;
|
|
} else {
|
|
sort(block_start, node, PathNodeKeySecondCmp());
|
|
while(block_start != node) {
|
|
//extend shift while share same key
|
|
index_t shift = 1;
|
|
while(block_start + shift != node && block_start->key == (block_start + shift)->key) {
|
|
shift++;
|
|
}
|
|
//check if all share .from
|
|
//if they share same from, then they are a mergable set
|
|
bool merge = true;
|
|
for(PathNode* n = block_start; n != (block_start + shift); n++) {
|
|
if(n->from != block_start->from) {
|
|
merge = false;
|
|
break;
|
|
}
|
|
}
|
|
//if not mergable, just write all to array
|
|
if(!merge) {
|
|
for(PathNode* n = block_start; n != (block_start + shift); n++) {
|
|
n->key.first = ranks;
|
|
*curr++ = *n;
|
|
}
|
|
ranks++;
|
|
} else if(curr == nodes.begin() || !(curr - 1)->isSorted() || (curr - 1)->from != block_start->from) {
|
|
block_start->setSorted();
|
|
block_start->key.first = ranks++;
|
|
*curr++ = *block_start;
|
|
}
|
|
block_start += shift;
|
|
}
|
|
// if we are at the last node or the last node is mergable into the previous node, we are done
|
|
if(node == nodes.end()) break;
|
|
if(node + 1 == nodes.end()) {
|
|
assert(curr >= nodes.begin() + 1);
|
|
if((curr - 1)->isSorted() && node->from == (curr - 1)->from)
|
|
break;
|
|
}
|
|
// check if we can safely merge the node immediately following the unsorted cluster into the previous node
|
|
// must be that:
|
|
// 1) node is not itself part of an unsorted cluster
|
|
// 2) the previous node is sorted
|
|
// 3) the nodes share the same from attribute
|
|
assert(node + 1 < nodes.end());
|
|
if(node->key.first != (node + 1)->key.first) {
|
|
assert(curr >= nodes.begin() + 1);
|
|
if((curr - 1)->isSorted() && node->from == (curr - 1)->from)
|
|
node++;
|
|
}
|
|
}
|
|
block_start = node;
|
|
}
|
|
} while(node != nodes.end());
|
|
nodes.resizeExact((index_t)(curr - nodes.begin()));
|
|
}
|
|
// if all nodes have unique rank we are done!
|
|
if(ranks == nodes.size()) sorted = true;
|
|
}
|
|
|
|
|
|
// Returns the next maximal mergeable set of PathNodes.
|
|
// A set of PathNodes sharing adjacent keys is mergeable, if each of the
|
|
// PathNodes begins in the same GraphNode, and no other PathNode shares
|
|
// the key. If the maximal set is empty, returns the next PathNode.
|
|
template <typename index_t>
|
|
pair<index_t, index_t> PathGraph<index_t>::nextMaximalSet(pair<index_t, index_t> range) {
|
|
if(range.second >= nodes.size()) {
|
|
return pair<index_t, index_t>(0, 0);
|
|
}
|
|
range.first = range.second;
|
|
range.second = range.first + 1;
|
|
if(range.first > 0 && nodes[range.first - 1].key == nodes[range.first].key) {
|
|
return range;
|
|
}
|
|
|
|
for(index_t i = range.second; i < nodes.size(); i++) {
|
|
if(nodes[i - 1].key != nodes[i].key) {
|
|
range.second = i;
|
|
}
|
|
if(nodes[i].from != nodes[range.first].from) {
|
|
return range;
|
|
}
|
|
}
|
|
range.second = (index_t)nodes.size();
|
|
return range;
|
|
}
|
|
|
|
//-----------------------------------------------------------------------------------------
|
|
|
|
template <typename index_t>
|
|
void PathGraph<index_t>::printInfo()
|
|
{
|
|
if(verbose) {
|
|
cerr << "Generation " << generation
|
|
<< " (" << temp_nodes << " -> " << nodes.size() << " nodes, "
|
|
<< ranks << " ranks)" << endl;
|
|
}
|
|
}
|
|
|
|
//------------------------------------------------------------------------------------------
|
|
|
|
template <typename index_t>
|
|
void PathGraph<index_t>::generateEdgesCounter(void* vp) {
|
|
GenEdgesParams* params = (GenEdgesParams*)vp;
|
|
typename RefGraph<index_t>::Edge* st = params->st;
|
|
typename RefGraph<index_t>::Edge* en = params->en;
|
|
EList<index_t, 6>& label_index = *(params->label_index);
|
|
EList<typename RefGraph<index_t>::Node>& ref_nodes = *(params->ref_nodes);
|
|
EList<PathNode>& nodes = *(params->nodes);
|
|
//first count edges, fill out label_index
|
|
for(typename RefGraph<index_t>::Edge* edge = st; edge != en; edge++) {
|
|
char curr_label = ref_nodes[edge->from].label;
|
|
int curr_label_index;
|
|
switch(curr_label) {
|
|
case 'A': curr_label_index = 0; break;
|
|
case 'C': curr_label_index = 1; break;
|
|
case 'G': curr_label_index = 2; break;
|
|
case 'T': curr_label_index = 3; break;
|
|
case 'Y': curr_label_index = 4; break;
|
|
case 'Z': curr_label_index = 5; break;
|
|
default: assert(false); throw 1;
|
|
}
|
|
assert_lt(edge->to + 1, nodes.size());
|
|
assert_lt(nodes[edge->to].key.second, nodes[edge->to + 1].key.second);
|
|
label_index[curr_label_index] += nodes[edge->to + 1].key.second - nodes[edge->to].key.second;
|
|
}
|
|
}
|
|
|
|
template <typename index_t>
|
|
void PathGraph<index_t>::generateEdgesMaker(void* vp) {
|
|
GenEdgesParams* params = (GenEdgesParams*)vp;
|
|
typename RefGraph<index_t>::Edge* st = params->st;
|
|
typename RefGraph<index_t>::Edge* en = params->en;
|
|
EList<index_t, 6>& label_index = *(params->label_index);
|
|
EList<typename RefGraph<index_t>::Node>& ref_nodes = *(params->ref_nodes);
|
|
EList<PathEdge>& edges = *(params->edges);
|
|
EList<PathNode>& nodes = *(params->nodes);
|
|
for(typename RefGraph<index_t>::Edge* edge = st; edge != en; edge++) {
|
|
char curr_label = ref_nodes[edge->from].label;
|
|
int curr_label_index;
|
|
switch(curr_label) {
|
|
case 'A': curr_label_index = 0; break;
|
|
case 'C': curr_label_index = 1; break;
|
|
case 'G': curr_label_index = 2; break;
|
|
case 'T': curr_label_index = 3; break;
|
|
case 'Y': curr_label_index = 4; break;
|
|
case 'Z': curr_label_index = 5; break;
|
|
default: assert(false); throw 1;
|
|
}
|
|
for(index_t j = nodes[edge->to].key.second; j < nodes[edge->to + 1].key.second; j++) {
|
|
edges[label_index[curr_label_index]++] = PathEdge(edge->from, nodes[j].key.first, curr_label);
|
|
}
|
|
}
|
|
}
|
|
|
|
template <typename index_t>
|
|
bool PathGraph<index_t>::generateEdges(RefGraph<index_t>& base)
|
|
{
|
|
//entering we have:
|
|
// nodes - sorted by from
|
|
// edges - empty
|
|
// base.nodes - almost sorted by from/to
|
|
// base.edges - almost sorted by from/to
|
|
|
|
//need to join:
|
|
// nodes.from -> base.nodes[]
|
|
// nodes.from -> base.edges.to
|
|
// nodes.from -> edges.from
|
|
|
|
if(!sorted) return false;
|
|
|
|
time_t indiv = time(0);
|
|
time_t overall = time(0);
|
|
|
|
//replace nodes.to with genomic position
|
|
//fast because both roughly ordered by from
|
|
for(PathNode* node = nodes.begin(); node != nodes.end(); node++) {
|
|
node->to = base.nodes[node->from].value;
|
|
}
|
|
|
|
if(verbose) cerr << "NODE.TO -> GENOME POS: " << time(0) - indiv << endl;
|
|
indiv = time(0);
|
|
|
|
// build an index for nodes
|
|
index_t node_size = nodes.size();
|
|
for(index_t i = 0; i < node_size; i++) {
|
|
// very rare case where the number of prefix-sorted nodes is smaller than the number of the initial nodes
|
|
// , which could happen with a very small graph and a variant as follows
|
|
// ATAGAGCAGTTCTGAAAAACACTTTTTGTTGAATCTGCAAG(T)GGACATTTGGATAGATTTGAAGATTTCGTTGGAAACGGGAATATCTTCATATCAAATG
|
|
// (G)
|
|
// where G(T) and G(G) will be combined as there is no other node that intervene those two path nodes.
|
|
if(nodes[i].from + 1 >= nodes.size()) {
|
|
nodes.resize(nodes[i].from + 2);
|
|
}
|
|
nodes[nodes[i].from + 1].key.second = i + 1;
|
|
}
|
|
|
|
if(verbose) cerr << "BUILD FROM_INDEX " << time(0) - indiv << endl;
|
|
indiv = time(0);
|
|
|
|
// Now join nodes.from to edges.to
|
|
// fast because base.edges roughly sorted by to
|
|
|
|
//count number of edges
|
|
AutoArray<tthread::thread*> threads(nthreads);
|
|
EList<GenEdgesParams> params; params.resizeExact(nthreads);
|
|
ELList<index_t, 6> label_index; label_index.resize(nthreads);
|
|
typename RefGraph<index_t>::Edge* st = base.edges.begin();
|
|
typename RefGraph<index_t>::Edge* en = st + base.edges.size() / nthreads;
|
|
for(int i = 0; i < nthreads; i++) {
|
|
label_index[i].resizeExact(6);
|
|
label_index[i].fillZero();
|
|
params[i].label_index = &label_index[i];
|
|
params[i].st = st;
|
|
params[i].en = en;
|
|
params[i].nodes = &nodes;
|
|
params[i].edges = &edges;
|
|
params[i].ref_nodes = &base.nodes;
|
|
if(nthreads == 1) {
|
|
generateEdgesCounter((void*)¶ms[0]);
|
|
} else {
|
|
threads[i] = new tthread::thread(&generateEdgesCounter, (void*)¶ms[i]);
|
|
}
|
|
st = en;
|
|
if(i + 2 == nthreads) {
|
|
en = base.edges.end();
|
|
} else {
|
|
en = st + base.edges.size() / nthreads;
|
|
}
|
|
}
|
|
|
|
if(nthreads > 1) {
|
|
for(int i = 0; i < nthreads; i++)
|
|
threads[i]->join();
|
|
}
|
|
|
|
if(verbose) cerr << "COUNTED NEW EDGES: " << time(0) - indiv << endl;
|
|
indiv = time(0);
|
|
//update all label indexes
|
|
index_t tot = label_index[0][0];
|
|
label_index[0][0] = 0;
|
|
for(int i = 1; i < nthreads; i++) {
|
|
tot += label_index[i][0];
|
|
label_index[i][0] = tot - label_index[i][0];
|
|
}
|
|
for(int j = 1; j < 6; j++) {
|
|
for(int i = 0; i < nthreads; i++) {
|
|
tot += label_index[i][j];
|
|
label_index[i][j] = tot - label_index[i][j];
|
|
}
|
|
}
|
|
edges.resizeExact(tot);
|
|
//make new edges
|
|
for(int i = 0; i < nthreads; i++) {
|
|
if(nthreads == 1) {
|
|
generateEdgesMaker((void*)¶ms[0]);
|
|
} else {
|
|
threads[i] = new tthread::thread(&generateEdgesMaker, (void*)¶ms[i]);
|
|
}
|
|
}
|
|
|
|
if(nthreads > 1) {
|
|
for(int i = 0; i < nthreads; i++) {
|
|
threads[i]->join();
|
|
}
|
|
}
|
|
base.nullify();
|
|
|
|
// delete unused nodes
|
|
if(node_size != nodes.size()) {
|
|
assert_lt(node_size, nodes.size());
|
|
nodes.resize(node_size);
|
|
}
|
|
|
|
if(verbose) cerr << "MADE NEW EDGES: " << time(0) - indiv << endl;
|
|
indiv = time(0);
|
|
|
|
EList<index_t, 6>& index = label_index[nthreads - 1];
|
|
|
|
EList<PathEdge> temp_edges; temp_edges.resizeExact(edges.size());
|
|
|
|
radix_sort_copy<PathEdge, less<PathEdge>, index_t>(edges.begin() , edges.begin() + index[0], temp_edges.ptr(),
|
|
&PathEdgeTo, (index_t)nodes.size(), nthreads);
|
|
radix_sort_copy<PathEdge, less<PathEdge>, index_t>(edges.begin() + index[0], edges.begin() + index[1], temp_edges.ptr() + index[0],
|
|
&PathEdgeTo, (index_t)nodes.size(), nthreads);
|
|
radix_sort_copy<PathEdge, less<PathEdge>, index_t>(edges.begin() + index[1], edges.begin() + index[2], temp_edges.ptr() + index[1],
|
|
&PathEdgeTo, (index_t)nodes.size(), nthreads);
|
|
radix_sort_copy<PathEdge, less<PathEdge>, index_t>(edges.begin() + index[2], edges.begin() + index[3], temp_edges.ptr() + index[2],
|
|
&PathEdgeTo, (index_t)nodes.size(), nthreads);
|
|
for(index_t i = index[3]; i < edges.size(); i++) {
|
|
temp_edges[i] = edges[i];
|
|
}
|
|
sort(temp_edges.begin() + index[3], temp_edges.begin() + index[4]);
|
|
sort(temp_edges.begin() + index[4], temp_edges.begin() + index[5]);
|
|
edges.xfer(temp_edges);
|
|
|
|
if(verbose) cerr << "SORTED NEW EDGES: " << time(0) - indiv << endl;
|
|
indiv = time(0);
|
|
|
|
EList<PathNode> past_nodes; past_nodes.resizeExact(nodes.size());
|
|
radix_sort_copy<PathNode, less<PathNode>, index_t>(nodes.begin(), nodes.end(), past_nodes.ptr(), &PathNodeKey, ranks, nthreads);
|
|
nodes.xfer(past_nodes);
|
|
|
|
if(verbose) cerr << "RE-SORTED NODES: " << time(0) - indiv << endl;
|
|
indiv = time(0);
|
|
|
|
#ifndef NDEBUG
|
|
if(debug) {
|
|
cerr << "just after creating path edges" << endl;
|
|
cerr << "Ref edges" << endl;
|
|
for(size_t i = 0; i < base.edges.size(); i++) {
|
|
const typename RefGraph<index_t>::Edge& edge = base.edges[i];
|
|
cerr << "\t" << i << "\t" << edge.from << " --> " << edge.to << endl;
|
|
}
|
|
|
|
cerr << "Path nodes" << endl;
|
|
for(size_t i = 0; i < nodes.size(); i++) {
|
|
const PathNode& node = nodes[i];
|
|
cerr << "\t" << i << "\t(" << node.key.first << ", " << node.key.second << ")\t"
|
|
<< node.from << " --> " << node.to << endl;
|
|
}
|
|
|
|
cerr << "Path edges" << endl;
|
|
for(size_t i = 0; i < edges.size(); i++) {
|
|
const PathEdge& edge = edges[i];
|
|
cerr << "\t" << i << "\tfrom: " << edge.from << "\tranking: " << edge.ranking << "\t" << edge.label << endl;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
#ifndef NDEBUG
|
|
|
|
// Switch char array[x][y]; to char** array;
|
|
if(debug) {
|
|
cerr << "after sorting nodes by ranking and edges by label and ranking" << endl;
|
|
cerr << "Path nodes" << endl;
|
|
for(size_t i = 0; i < nodes.size(); i++) {
|
|
const PathNode& node = nodes[i];
|
|
cerr << "\t" << i << "\t(" << node.key.first << ", " << node.key.second << ")\t"
|
|
<< node.from << " --> " << node.to << endl;
|
|
}
|
|
|
|
cerr << "Path edges" << endl;
|
|
for(size_t i = 0; i < edges.size(); i++) {
|
|
const PathEdge& edge = edges[i];
|
|
cerr << "\t" << i << "\tfrom: " << edge.from << "\tranking: " << edge.ranking << "\t" << edge.label << endl;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
// Sets PathNode.to = GraphNode.value and PathNode.key.first to outdegree
|
|
// Replaces (from.from, to) with (from, to)
|
|
{
|
|
PathNode* node = nodes.begin(); node->key.first = 0;
|
|
PathEdge* edge = edges.begin();
|
|
while(node != nodes.end() && edge != edges.end()) {
|
|
if(edge->from == node->from) {
|
|
edge->from = (index_t)(node - nodes.begin()); edge++;
|
|
node->key.first++;
|
|
} else {
|
|
node++; node->key.first = 0;
|
|
}
|
|
}
|
|
}
|
|
|
|
if(verbose) cerr << "PROCESS EDGES: " << time(0) - indiv << endl;
|
|
indiv = time(0);
|
|
|
|
// Remove 'Y' node
|
|
assert_gt(nodes.size(), 2);
|
|
nodes.back().key.first = nodes[nodes.size() - 2].key.first;
|
|
nodes[nodes.size() - 2] = nodes.back();
|
|
nodes.pop_back();
|
|
// Adjust edges accordingly
|
|
for(size_t i = 0; i < edges.size(); i++) {
|
|
PathEdge& edge = edges[i];
|
|
if(edge.label == 'Y') {
|
|
edge.label = 'Z';
|
|
} else if(edge.ranking >= nodes.size()) {
|
|
assert_eq(edge.ranking, nodes.size());
|
|
edge.ranking -= 1;
|
|
}
|
|
}
|
|
if(verbose) cerr << "REMOVE Y: " << time(0) - indiv << endl;
|
|
indiv = time(0);
|
|
|
|
#ifndef NDEBUG
|
|
if(debug) {
|
|
cerr << "Path nodes" << endl;
|
|
for(size_t i = 0; i < nodes.size(); i++) {
|
|
const PathNode& node = nodes[i];
|
|
cerr << "\t" << i << "\t(" << node.key.first << ", " << node.key.second << ")\t"
|
|
<< node.from << " --> " << node.to << endl;
|
|
}
|
|
|
|
cerr << "Path edges" << endl;
|
|
for(size_t i = 0; i < edges.size(); i++) {
|
|
const PathEdge& edge = edges[i];
|
|
cerr << "\t" << i << "\tfrom: " << edge.from << "\tranking: " << edge.ranking << "\t" << edge.label << endl;
|
|
}
|
|
}
|
|
#endif
|
|
temp_edges.resizeExact(edges.size());
|
|
radix_sort_copy<PathEdge, PathEdgeToCmp, index_t>(edges.begin(), edges.end(), temp_edges.ptr(), &PathEdgeTo, (index_t)nodes.size(), nthreads);
|
|
edges.xfer(temp_edges);
|
|
for(index_t i = 0; i < edges.size(); i++) {
|
|
nodes[edges[i].ranking].key.second = i + 1;
|
|
}
|
|
|
|
if(verbose) cerr << "SORT, Make index: " << time(0) - indiv << endl;
|
|
if(verbose) cerr << "TOTAL: " << time(0) - overall << endl;
|
|
|
|
return true;
|
|
|
|
//-----------------------------------------------------------------------------------------------------
|
|
bwt_string.clear();
|
|
F_array.clear();
|
|
M_array.clear();
|
|
bwt_counts.resizeExact(5); bwt_counts.fillZero();
|
|
for(index_t node = 0; node < nodes.size(); node++) {
|
|
pair<index_t, index_t> edge_range = getEdges(node, false /* from? */);
|
|
for(index_t i = edge_range.first; i < edge_range.second; i++) {
|
|
assert_lt(i, edges.size());
|
|
char label = edges[i].label;
|
|
if(label == 'Y') {
|
|
label = 'Z';
|
|
}
|
|
bwt_string.push_back(label);
|
|
F_array.push_back(i == edge_range.first ? 1 : 0);
|
|
|
|
if(label != 'Z') {
|
|
char nt = asc2dna[(int)label];
|
|
assert_lt(nt + 1, bwt_counts.size());
|
|
bwt_counts[nt + 1]++;
|
|
}
|
|
}
|
|
for(index_t i = 0; i < nodes[node].key.first; i++) {
|
|
M_array.push_back(i == 0 ? 1 : 0);
|
|
}
|
|
}
|
|
assert_gt(bwt_string.size(), 0);
|
|
assert_eq(bwt_string.size(), F_array.size());
|
|
assert_eq(bwt_string.size(), M_array.size());
|
|
|
|
for(size_t i = 0; i < bwt_counts.size(); i++) {
|
|
if(i > 0) bwt_counts[i] += bwt_counts[i - 1];
|
|
}
|
|
|
|
#ifndef NDEBUG
|
|
if(debug) {
|
|
cerr << "Path nodes (final)" << endl;
|
|
for(size_t i = 0; i < nodes.size(); i++) {
|
|
const PathNode& node = nodes[i];
|
|
cerr << "\t" << i << "\t(" << node.key.first << ", " << node.key.second << ")\t"
|
|
<< node.from << " --> " << node.to << endl;
|
|
}
|
|
|
|
cerr << "Path edges (final)" << endl;
|
|
for(size_t i = 0; i < edges.size(); i++) {
|
|
const PathEdge& edge = edges[i];
|
|
cerr << "\t" << i << "\tfrom: " << edge.from << "\tranking: " << edge.ranking << "\t" << edge.label << endl;
|
|
}
|
|
|
|
cerr << "i\tBWT\tF\tM" << endl;
|
|
for(index_t i = 0; i < bwt_string.size(); i++) {
|
|
cerr << i << "\t" << bwt_string[i] << "\t" // BWT char
|
|
<< (int)F_array[i] << "\t" // F bit value
|
|
<< (int)M_array[i] << endl; // M bit value
|
|
}
|
|
|
|
for(size_t i = 0; i < bwt_counts.size(); i++) {
|
|
cerr << i << "\t" << bwt_counts[i] << endl;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
// Test searches, based on paper_example
|
|
#if 1
|
|
EList<string> queries; EList<index_t> answers;
|
|
# if 1
|
|
# if 1
|
|
queries.push_back("GACGT"); answers.push_back(9);
|
|
queries.push_back("GATGT"); answers.push_back(9);
|
|
queries.push_back("GACT"); answers.push_back(9);
|
|
queries.push_back("ATGT"); answers.push_back(4);
|
|
queries.push_back("GTAC"); answers.push_back(10);
|
|
queries.push_back("ACTG"); answers.push_back(3);
|
|
# else
|
|
// rs55902548, at 402, ref, alt, unknown alt
|
|
queries.push_back("GGCAGCTCCCATGGGTACACACTGGGCCCAGAACTGGGATGGAGGATGCA");
|
|
// queries.push_back("GGCAGCTCCCATGGGTACACACTGGTCCCAGAACTGGGATGGAGGATGCA");
|
|
// queries.push_back("GGCAGCTCCCATGGGTACACACTGGACCCAGAACTGGGATGGAGGATGCA");
|
|
|
|
// rs5759268, at 926787, ref, alt, unknown alt
|
|
// queries.push_back("AAATTGCTCAGCCTTGTGCTGTGCACACCTGGTTCTCTTTCCAGTGTTAT");
|
|
// queries.push_back("AAATTGCTCAGCCTTGTGCTGTGCATACCTGGTTCTCTTTCCAGTGTTAT");
|
|
// queries.push_back("AAATTGCTCAGCCTTGTGCTGTGCAGACCTGGTTCTCTTTCCAGTGTTAT");
|
|
# endif
|
|
|
|
for(size_t q = 0; q < queries.size(); q++) {
|
|
const string& query = queries[q];
|
|
assert_gt(query.length(), 0);
|
|
index_t top = 0, bot = edges.size();
|
|
index_t node_top = 0, node_bot = 0;
|
|
cerr << "Aligning " << query << endl;
|
|
index_t i = 0;
|
|
for(; i < query.length(); i++) {
|
|
if(top >= bot) break;
|
|
int nt = query[query.length() - i - 1];
|
|
nt = asc2dna[nt];
|
|
assert_lt(nt, 4);
|
|
|
|
cerr << "\t" << i << "\tBWT range: [" << top << ", " << bot << ")" << endl;
|
|
|
|
top = bwt_counts[(int)nt] + (top <= 0 ? 0 : rank(bwt_string, top - 1, "ACGT"[nt]));
|
|
bot = bwt_counts[(int)nt] + rank(bwt_string, bot - 1, "ACGT"[nt]);
|
|
cerr << "\t\tLF BWT range: [" << top << ", " << bot << ")" << endl;
|
|
|
|
node_top = rank1(M_array, top) - 1;
|
|
node_bot = rank1(M_array, bot - 1);
|
|
cerr << "\t\tnode range: [" << node_top << ", " << node_bot << ")" << endl;
|
|
|
|
top = select1(F_array, node_top + 1);
|
|
bot = select1(F_array, node_bot + 1);
|
|
}
|
|
cerr << "\t" << i << "\tBWT range: [" << top << ", " << bot << ")" << endl;
|
|
// assert_eq(top, answers[q]);
|
|
cerr << "finished... ";
|
|
if(node_top < node_bot && node_top < nodes.size()) {
|
|
index_t pos = nodes[node_top].to;
|
|
index_t gpos = pos;
|
|
const EList<RefRecord>& szs = base.szs;
|
|
for(index_t i = 0; i < szs.size(); i++) {
|
|
gpos += szs[i].off;
|
|
if(pos < szs[i].len) break;
|
|
pos -= szs[i].len;
|
|
}
|
|
|
|
cerr << "being aligned at " << gpos;
|
|
}
|
|
cerr << endl << endl;
|
|
}
|
|
# endif
|
|
|
|
// See inconsistencies between F and M arraystimy thread
|
|
# if 0
|
|
cerr << endl << endl;
|
|
EList<index_t> tmp_F;
|
|
for(index_t i = 0; i < F_array.size(); i++) {
|
|
if(F_array[i] == 1) tmp_F.push_back(i);
|
|
}
|
|
|
|
EList<index_t> tmp_M;
|
|
for(index_t i = 0; i < M_array.size(); i++) {
|
|
if(M_array[i] == 1) tmp_M.push_back(i);
|
|
}
|
|
|
|
index_t max_diff = 0;
|
|
assert_eq(tmp_F.size(), tmp_M.size());
|
|
for(index_t i = 0; i < tmp_F.size(); i++) {
|
|
index_t diff = (tmp_F[i] >= tmp_M[i] ? tmp_F[i] - tmp_M[i] : tmp_M[i] - tmp_F[i]);
|
|
if(diff > max_diff) {
|
|
max_diff = diff;
|
|
cerr << i << "\tdiff: " << max_diff << "\t" << (tmp_F[i] >= tmp_M[i] ? "+" : "-") << endl;
|
|
}
|
|
}
|
|
cerr << "Final: " << tmp_F.back() << " vs. " << tmp_M.back() << endl;
|
|
# endif
|
|
|
|
#endif
|
|
return true;
|
|
}
|
|
|
|
//--------------------------------------------------------------------------
|
|
|
|
template <typename index_t>
|
|
pair<index_t, index_t> PathGraph<index_t>::getEdges(index_t node, bool by_from) {
|
|
if(node >= nodes.size()) {
|
|
cerr << "Error: Trying to get edges " << (by_from ? "from " : "to ") << node << endl;
|
|
}
|
|
if(nodes[node].key.second == 0) {
|
|
return pair<index_t, index_t>(0, 0);
|
|
}
|
|
if(node == 0) {
|
|
return pair<index_t, index_t>(0, nodes[node].key.second);
|
|
} else {
|
|
return pair<index_t, index_t>(nodes[node - 1].key.second, nodes[node].key.second);
|
|
}
|
|
}
|
|
|
|
#endif /*GBWT_GRAPH_H_*/
|