/* * Copyright 2015, Daehwan Kim , Joe Paggi * * This file is part of HISAT 2. * * HISAT 2 is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * HISAT 2 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with HISAT 2. If not, see . */ #ifndef GBWT_GRAPH_H_ #define GBWT_GRAPH_H_ #include #include #include #include #include #include #include #include #include "alt.h" #include "radix_sort.h" // Reference: // Jouni Sirén, Niko Välimäki, and Veli Mäkinen: Indexing Graphs for Path Queries with Applications in Genome Research. // IEEE/ACM Transactions on Computational Biology and Bioinformatics 11(2):375-388, 2014. // http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6698337 //-------------------------------------------------------------------------- struct NongraphException : public exception { const char* what () const throw () { return "Nongraph exception"; } }; struct ExplosionException : public exception { const char* what () const throw () { return "Explosion exception"; } }; template class PathGraph; // Note: I wrote the following codes based on Siren's work, gcsa (see the reference above). 
template class RefGraph { friend class PathGraph; public: struct Node { char label; // ACGT + Y(head) + Z(tail) index_t value; // location in a whole genome Node() { reset(); } Node(char label_, index_t value_) : label(label_), value(value_) {} void reset() { label = 0; value = 0; } bool write(ofstream& f_out, bool bigEndian) const { writeIndex(f_out, value, bigEndian); writeU16(f_out, label, bigEndian); return true; } bool read(ifstream& f_in, bool bigEndian) { value = readIndex(f_in, bigEndian); label = (char)readU16(f_in, bigEndian); return true; } bool operator== (const Node& o) const { if(value != o.value) return false; if(label != o.label) return false; return true; } bool operator< (const Node& o) const { if(value != o.value) return value < o.value; return label < o.label; } }; struct Edge { index_t from; // from Node index_t to; // to Node Edge() {} Edge(index_t from_, index_t to_) : from(from_), to(to_) {} bool write(ofstream& f_out, bool bigEndian) const { writeIndex(f_out, from, bigEndian); writeIndex(f_out, to, bigEndian); return true; } bool read(ifstream& f_in, bool bigEndian) { from = readIndex(f_in, bigEndian); to = readIndex(f_in, bigEndian); return true; } bool operator< (const Edge& o) const { if(from != o.from) return from < o.from; return to < o.to; } }; static index_t EdgeTo (Edge& a) { return a.to; } struct EdgeFromCmp { bool operator() (const Edge& a, const Edge& b) const { return a.from < b.from; } }; struct EdgeToCmp { bool operator() (const Edge& a, const Edge& b) const { return a.to < b.to; }; }; public: RefGraph(const SString& s, const EList& szs, const EList >& alts, const EList >& haplotypes, const string& out_fname, int nthreads_, bool verbose); bool repOk() { return true; } void write(const string& fname, bool bigEndian) { ofstream rg_file(fname.c_str(), ios::binary); if(!rg_file.good()) { cerr << "Could not open file for writing a reference graph: \"" << fname << "\"" << endl; throw 1; } writeIndex(rg_file, (index_t)nodes.size(), 
bigEndian); for(index_t i = 0; i < nodes.size(); i++) { nodes[i].write(rg_file, bigEndian); } writeIndex(rg_file, (index_t)edges.size(), bigEndian); for(index_t i = 0; i < edges.size(); i++) { edges[i].write(rg_file, bigEndian); } rg_file.close(); } void nullify() { nodes.nullify(); edges.nullify(); } void read(const string& fname, bool bigEndian) { ifstream rg_file(fname.c_str(), ios::binary); if(!rg_file.good()) { cerr << "Could not open file for reading a reference graph: \"" << fname << "\"" << endl; throw 1; } index_t num_nodes = readIndex(rg_file, bigEndian); nodes.resizeNoCopyExact(num_nodes); for(index_t i = 0; i < num_nodes; i++) { nodes[i].read(rg_file, bigEndian); } index_t num_edges = readIndex(rg_file, bigEndian); edges.resizeNoCopyExact(num_edges); for(index_t i = 0; i < num_edges; i++) { edges[i].read(rg_file, bigEndian); } rg_file.close(); } private: static bool isReverseDeterministic(EList& nodes, EList& edges); static void reverseDeterminize(EList& nodes, EList& edges, index_t& lastNode, index_t lastNode_add = 0); static void sortEdgesFrom(EList& edges) { std::sort(edges.begin(), edges.end(), EdgeFromCmp()); } static void sortEdgesTo(EList& edges) { std::sort(edges.begin(), edges.end(), EdgeToCmp()); } // Return edge ranges [begin, end) static pair findEdges(const EList& edges, index_t node, bool from); static pair findEdgesFrom(const EList& edges, index_t node) { return findEdges(edges, node, true); } static pair findEdgesTo(const EList& edges, index_t node) { return findEdges(edges, node, false); } static pair getNextEdgeRange(const EList& sep_edges, pair range, bool from) { if(range.second >= sep_edges.size()) { return pair(0, 0); } range.first = range.second; range.second++; if(from) { while(range.second < sep_edges.size() && sep_edges[range.second].from == sep_edges[range.first].from) { range.second++; } } else { while(range.second < sep_edges.size() && sep_edges[range.second].to == sep_edges[range.first].to) { range.second++; } } return 
range; } private: struct ThreadParam { // input index_t thread_id; RefGraph* refGraph; const SString* s; const EList >* alts; const EList >* haplotypes; string out_fname; bool bigEndian; // output index_t num_nodes; index_t num_edges; index_t lastNode; bool multipleHeadNodes; }; static void buildGraph_worker(void* vp); private: EList szs; EList tmp_szs; EList nodes; EList edges; index_t lastNode; // Z int nthreads; #ifndef NDEBUG bool debug; #endif private: // Following composite nodes and edges are used to reverse-determinize an automaton. struct CompositeNodeIDs { index_t id; EList add_ids; CompositeNodeIDs() { id = (index_t)INDEX_MAX; } bool operator<(const CompositeNodeIDs& o) const { if(id != o.id) return id < o.id; if(add_ids.size() != o.add_ids.size()) return add_ids.size() < o.add_ids.size(); for(index_t i = 0; i < add_ids.size(); i++) { assert_lt(i, o.add_ids.size()); if(add_ids[i] != o.add_ids[i]) return add_ids[i] < o.add_ids[i]; } return false; } index_t size() const { if(id == (index_t)INDEX_MAX) return 0; return (index_t)add_ids.size() + 1; } index_t getID(index_t i) const { if(i == 0) return id; else { i -= 1; assert_lt(i, add_ids.size()); return add_ids[i]; } } void push_back(index_t node_id) { if(id == (index_t)INDEX_MAX) id = node_id; else add_ids.push_back(node_id); } }; struct CompositeNode { CompositeNodeIDs nodes; index_t id; char label; index_t value; CompositeNode() { reset(); } CompositeNode(char label_, index_t node_id) : id(0), label(label_) { nodes.push_back(node_id); } Node getNode() const { return Node(label, value); } void reset() { nodes.id = (index_t)INDEX_MAX; nodes.add_ids.clear(); id = 0; label = 0; value = 0; } }; struct CompositeEdge { index_t from; index_t to; CompositeEdge() : from(0), to(0) {} CompositeEdge(index_t from_, index_t to_) : from(from_), to(to_) {} Edge getEdge(const EList& nodes) const { assert_lt(from, nodes.size()); const CompositeNode& from_node = nodes[from]; assert_lt(to, nodes.size()); const CompositeNode& 
to_node = nodes[to]; return Edge(from_node.id, to_node.id); } bool operator < (const CompositeEdge& o) const { return from < o.from; } }; struct TempNodeLabelCmp { TempNodeLabelCmp(const EList& nodes_) : nodes(nodes_) {} bool operator() (index_t a, index_t b) const { assert_lt(a, nodes.size()); assert_lt(b, nodes.size()); return nodes[a].label < nodes[b].label; } const EList& nodes; }; }; /** * Load reference sequence file and alt information. * Construct a reference graph */ template RefGraph::RefGraph(const SString& s, const EList& szs, const EList >& alts, const EList >& haplotypes, const string& out_fname, int nthreads_, bool verbose) : lastNode(0), nthreads(nthreads_) { const bool bigEndian = false; assert_gt(nthreads, 0); assert_gt(szs.size(), 0); index_t jlen = (index_t)s.length(); #ifndef NDEBUG debug = (jlen <= 20); #endif // a memory-efficient way to create a population graph with known ALTs bool frag_automaton = jlen >= (1 << 16); if(frag_automaton) { { EList > alt_ranges; // each range inclusive for(index_t i = 0; i < alts.size(); i++) { const ALT& alt = alts[i]; index_t left_relax = 128, right_relax = 128; pair range; range.first = alt.pos > left_relax ? 
alt.pos - left_relax - 1 : 0; if(alt.type == ALT_SNP_SGL) { range.second = alt.pos + 1; } else if(alt.type == ALT_SNP_DEL) { assert_gt(alt.len, 0); range.second = alt.pos + alt.len; } else if(alt.type == ALT_SNP_INS) { assert_gt(alt.len, 0); range.second = alt.pos; } else if (alt.type == ALT_SPLICESITE) { assert_lt(alt.left, alt.right); range.second = alt.right + 1; } else { assert(alt.exon()); continue; } range.second += right_relax; if(alt_ranges.empty() || alt_ranges.back().second + 1 < range.first) { alt_ranges.push_back(range); } else { assert_leq(alt_ranges.back().first, range.first); if(alt_ranges.back().second < range.second) { alt_ranges.back().second = range.second; } } } index_t chunk_size = 1 << 20; index_t pos = 0, range_idx = 0; for(index_t i = 0; i < szs.size(); i++) { if(szs[i].len == 0) continue; if(szs[i].len <= chunk_size) { tmp_szs.push_back(szs[i]); pos += szs[i].len; } else { index_t num_chunks = (szs[i].len + chunk_size - 1) / chunk_size; assert_gt(num_chunks, 1); index_t modified_chunk_size = szs[i].len / num_chunks; index_t after_pos = pos + szs[i].len; ASSERT_ONLY(index_t sum_len = 0); while(pos < after_pos) { index_t target_pos = pos + modified_chunk_size; if(target_pos < after_pos) { for(; range_idx < alt_ranges.size(); range_idx++) { if(target_pos < alt_ranges[range_idx].first) break; } pair alt_free_range; if(range_idx == 0) { alt_free_range.first = 0; } else { alt_free_range.first = alt_ranges[range_idx - 1].second + 1; if(alt_free_range.first >= jlen) { alt_free_range.first = jlen - 1; } } if(range_idx == alt_ranges.size()) { alt_free_range.second = jlen - 1; } else { alt_free_range.second = alt_ranges[range_idx].first - 1; } assert_leq(alt_free_range.first, alt_free_range.second); if(target_pos < alt_free_range.first) target_pos = alt_free_range.first; if(target_pos > after_pos) target_pos = after_pos; } else { target_pos = after_pos; } tmp_szs.expand(); tmp_szs.back().len = target_pos - pos; tmp_szs.back().off = 0; pos = 
target_pos; ASSERT_ONLY(sum_len += tmp_szs.back().len); } assert_eq(pos, after_pos); assert_eq(sum_len, szs[i].len); } } #ifndef NDEBUG index_t modified_jlen = 0; for(index_t i = 0; i < tmp_szs.size(); i++) { modified_jlen += tmp_szs[i].len; } assert_eq(modified_jlen, jlen); #endif } if(nthreads > (int)tmp_szs.size()) { nthreads = (int)tmp_szs.size(); } assert_gt(nthreads, 0); AutoArray threads(nthreads); EList threadParams; for(index_t i = 0; i < (index_t)nthreads; i++) { threadParams.expand(); threadParams.back().thread_id = i; threadParams.back().refGraph = this; threadParams.back().s = &s; threadParams.back().alts = &alts; threadParams.back().haplotypes = &haplotypes; threadParams.back().out_fname = out_fname; threadParams.back().bigEndian = bigEndian; threadParams.back().num_nodes = 0; threadParams.back().num_edges = 0; threadParams.back().lastNode = 0; threadParams.back().multipleHeadNodes = false; if(nthreads == 1) { buildGraph_worker((void*)&threadParams.back()); } else { threads[i] = new tthread::thread(buildGraph_worker, (void*)&threadParams.back()); } } if(nthreads > 1) { for(index_t i = 0; i < (index_t)nthreads; i++) threads[i]->join(); } index_t num_nodes = 0, num_edges = 0; for(index_t i = 0; i < threadParams.size(); i++) { num_nodes += threadParams[i].num_nodes; num_edges += threadParams[i].num_edges; // Make room for edges spanning graphs by different threads if(i > 0) { num_edges += 16; } } nodes.resizeExact(num_nodes); nodes.clear(); edges.resizeExact(num_edges); edges.clear(); // Read all the nodes and edges EList tail_nodes; bool multipleHeadNodes = false; for(index_t i = 0; i < threadParams.size(); i++) { if(threadParams[i].multipleHeadNodes) multipleHeadNodes = true; std::ostringstream number; number << i; const string rg_fname = out_fname + "." 
+ number.str() + ".rf"; ifstream rg_in_file(rg_fname.c_str(), ios::binary); if(!rg_in_file.good()) { cerr << "Could not open file for reading a reference graph: \"" << rg_fname << "\"" << endl; throw 1; } index_t curr_num_nodes = (index_t)nodes.size(); ASSERT_ONLY(index_t curr_num_edges = (index_t)edges.size()); ASSERT_ONLY(index_t num_spanning_edges = 0); // Read nodes to be connected to last nodes in a previous thread if(i > 0) { assert_gt(tail_nodes.size(), 0) index_t num_head_nodes = readIndex(rg_in_file, bigEndian); for(index_t j = 0; j < num_head_nodes; j++) { index_t head_node = readIndex(rg_in_file, bigEndian); for(index_t k = 0; k < tail_nodes.size(); k++) { edges.expand(); edges.back().from = tail_nodes[k]; edges.back().to = head_node + curr_num_nodes; ASSERT_ONLY(num_spanning_edges++); } } } while(!rg_in_file.eof()) { index_t tmp_num_nodes = readIndex(rg_in_file, bigEndian); for(index_t j = 0; j < tmp_num_nodes; j++) { nodes.expand(); nodes.back().read(rg_in_file, bigEndian); } index_t tmp_num_edges = readIndex(rg_in_file, bigEndian); for(index_t j = 0; j < tmp_num_edges; j++) { edges.expand(); edges.back().read(rg_in_file, bigEndian); edges.back().from += curr_num_nodes; edges.back().to += curr_num_nodes; } if(nodes.size() >= curr_num_nodes + threadParams[i].num_nodes) { assert_eq(nodes.size(), curr_num_nodes + threadParams[i].num_nodes); assert_eq(edges.size(), curr_num_edges + num_spanning_edges + threadParams[i].num_edges); // Read last nodes in this thread tail_nodes.clear(); if(i + 1 < (index_t)nthreads) { index_t num_tail_nodes = readIndex(rg_in_file, bigEndian); for(index_t j = 0; j < num_tail_nodes; j++) { index_t tail_node = readIndex(rg_in_file, bigEndian); tail_nodes.push_back(tail_node + curr_num_nodes); } } break; } } rg_in_file.close(); std::remove(rg_fname.c_str()); if(i + 1 == (index_t)nthreads) { lastNode = threadParams.back().lastNode + curr_num_nodes; assert_lt(lastNode, nodes.size()); assert_eq(nodes[lastNode].label, 'Z'); } } 
if(s.length() + 2 == nodes.size() && nodes.size() == edges.size() + 1) { cerr << "Warning: no variants or splice sites in this graph" << endl; throw NongraphException(); } if(multipleHeadNodes) { if(!isReverseDeterministic(nodes, edges)) { if(verbose) cerr << "\tis not reverse-deterministic, so reverse-determinize..." << endl; reverseDeterminize(nodes, edges, lastNode); } } assert(isReverseDeterministic(nodes, edges)); } else { // this is memory-consuming, but simple to implement index_t num_predicted_nodes = (index_t)(jlen * 1.2); nodes.reserveExact(num_predicted_nodes); edges.reserveExact(num_predicted_nodes); // Created head node nodes.expand(); nodes.back().label = 'Y'; nodes.back().value = 0; // Create nodes and edges corresponding to a reference genome for(size_t i = 0; i < s.length(); i++) { nodes.expand(); nodes.back().label = "ACGT"[(int)s[i]]; nodes.back().value = (index_t)i; assert_geq(nodes.size(), 2); edges.expand(); edges.back().from = (index_t)nodes.size() - 2; edges.back().to = (index_t)nodes.size() - 1; } // Create tail node nodes.expand(); nodes.back().label = 'Z'; nodes.back().value = (index_t)s.length(); lastNode = (index_t)nodes.size() - 1; edges.expand(); edges.back().from = (index_t)nodes.size() - 2; edges.back().to = (index_t)nodes.size() - 1; // Create nodes and edges for haplotypes for(index_t i = 0; i < haplotypes.size(); i++) { const Haplotype& haplotype = haplotypes[i]; const EList& snpIDs = haplotype.alts; assert_gt(snpIDs.size(), 0); assert_lt(haplotype.right, s.length()); bool pass = true; for(index_t s = 0; s < snpIDs.size(); s++) { index_t snpID = snpIDs[s]; assert_lt(snpID, alts.size()); const ALT& snp = alts[snpID]; assert(snp.snp()); if(s + 1 >= snpIDs.size()) break; index_t snpID2 = snpIDs[s+1]; assert_lt(snpID2, alts.size()); const ALT& snp2 = alts[snpID2]; assert(snp2.snp()); if(snp.type == ALT_SNP_INS) { if(snp.pos > snp2.pos) { pass = false; break; } } else if(snp.type == ALT_SNP_DEL) { if(snp2.type == ALT_SNP_DEL) { 
if(snp.pos + snp.len >= snp2.pos) { pass = false; break; } } else { if(snp.pos + snp.len - 1 >= snp2.pos) { pass = false; break; } } } else { if(snp.pos >= snp2.pos) { pass = false; break; } } } if(!pass) continue; index_t prev_ALT_type = ALT_NONE; index_t ID_i = 0; for(index_t j = haplotype.left; j <= haplotype.right; j++) { if(prev_ALT_type == ALT_SNP_INS) j--; const ALT* altp = (ID_i < snpIDs.size() ? &(alts[snpIDs[ID_i]]) : NULL); assert(altp == NULL || altp->pos >= j); if(altp != NULL && altp->pos == j) { const ALT& alt = *altp; assert_lt(alt.pos, s.length()); assert(alt.snp()); if(alt.type == ALT_SNP_SGL) { assert_eq(alt.len, 1); nodes.expand(); assert_lt(alt.seq, 4); assert_neq(alt.seq & 0x3, s[alt.pos]); nodes.back().label = "ACGT"[alt.seq]; nodes.back().value = alt.pos; if(prev_ALT_type != ALT_SNP_DEL) { edges.expand(); if(j == haplotype.left) { edges.back().from = alt.pos; } else { assert_gt(nodes.size(), 2); edges.back().from = (index_t)nodes.size() - 2; } edges.back().to = (index_t)nodes.size() - 1; } if(j == haplotype.right) { edges.expand(); edges.back().from = (index_t)nodes.size() - 1; edges.back().to = alt.pos + 2; } } else if(alt.type == ALT_SNP_DEL) { assert_gt(alt.len, 0); assert_leq(alt.pos + alt.len, s.length()); edges.expand(); if(j == haplotype.left) { edges.back().from = alt.pos; } else { edges.back().from = (index_t)nodes.size() - 1; } j += (alt.len - 1); assert_leq(j, haplotype.right); if(j == haplotype.right) { edges.back().to = alt.pos + alt.len + 1; } else { edges.back().to = (index_t)nodes.size(); } } else { assert_eq(alt.type, ALT_SNP_INS) assert_gt(alt.len, 0); for(size_t k = 0; k < alt.len; k++) { uint64_t bp = alt.seq >> ((alt.len - k - 1) << 1); bp &= 0x3; char ch = "ACGT"[bp]; nodes.expand(); nodes.back().label = ch; nodes.back().value = (index_t)INDEX_MAX; if(prev_ALT_type == ALT_SNP_DEL && k == 0) continue; edges.expand(); edges.back().from = ((k == 0 && j == haplotype.left) ? 
alt.pos : (index_t)nodes.size() - 2); edges.back().to = (index_t)nodes.size() - 1; } if(j == haplotype.right) { edges.expand(); edges.back().from = (index_t)nodes.size() - 1; edges.back().to = alt.pos + 1; } } ID_i++; prev_ALT_type = alt.type; } else { int nt = s[j]; assert_lt(nt, 4); nodes.expand(); nodes.back().label = "ACGT"[nt]; nodes.back().value = j; if(prev_ALT_type != ALT_SNP_DEL) { edges.expand(); if(j == haplotype.left && prev_ALT_type == ALT_NONE) { edges.back().from = j; } else { edges.back().from = (index_t)nodes.size() - 2; } edges.back().to = (index_t)nodes.size() - 1; } if(j == haplotype.right) { edges.expand(); edges.back().from = (index_t)nodes.size() - 1; edges.back().to = j + 2; } prev_ALT_type = ALT_SNP_SGL; } } } // Create nodes and edges for splice sites for(size_t i = 0; i < alts.size(); i++) { const ALT& alt = alts[i]; if(alt.pos >= s.length()) break; if(alt.type != ALT_SPLICESITE) continue; if(alt.excluded) continue; assert_lt(alt.left, alt.right); edges.expand(); edges.back().from = alt.left; edges.back().to = alt.right + 2; } if(s.length() + 2 == nodes.size() && nodes.size() == edges.size() + 1) { throw NongraphException(); } if(!isReverseDeterministic(nodes, edges)) { if(verbose) cerr << "\tis not reverse-deterministic, so reverse-determinize..." 
<< endl; reverseDeterminize(nodes, edges, lastNode); assert(isReverseDeterministic(nodes, edges)); } } #ifndef NDEBUG if(debug) { cout << "num nodes: " << nodes.size() << endl; for(index_t i = 0; i < nodes.size(); i++) { const Node& n = nodes[i]; cout << i << "\t" << n.label << "\t" << n.value << endl; } sort(edges.begin(), edges.end()); cout << "num edges: " << edges.size() << endl; for(index_t i = 0; i < edges.size(); i++) { const Edge& e = edges[i]; cout << i << "\t" << e.from << " --> " << e.to << endl; } } #endif } template pair RefGraph::findEdges(const EList& edges, index_t node, bool from) { pair range(0, 0); assert_gt(edges.size(), 0); // Find lower bound index_t low = 0, high = (index_t)edges.size() - 1; index_t temp; while(low < high) { index_t mid = low + (high - low) / 2; temp = (from ? edges[mid].from : edges[mid].to); if(node == temp) { high = mid; } else if(node < temp) { if(mid == 0) { return pair(0, 0); } high = mid - 1; } else { low = mid + 1; } } temp = (from ? edges[low].from : edges[low].to); if(node == temp) { range.first = low; } else { return range; } // Find upper bound high = (index_t)edges.size() - 1; while(low < high) { index_t mid = low + (high - low + 1) / 2; temp = (from ? edges[mid].from : edges[mid].to); if(node == temp) { low = mid; } else { assert_lt(node, temp); high = mid - 1; } } #ifndef NDEBUG temp = (from ? 
edges[high].from : edges[high].to); assert_eq(node, temp); #endif range.second = high + 1; return range; } template void RefGraph::buildGraph_worker(void* vp) { ThreadParam* threadParam = (ThreadParam*)vp; RefGraph& refGraph = *(threadParam->refGraph); const SString& s = *(threadParam->s); index_t jlen = (index_t)s.length(); const EList >& alts = *(threadParam->alts); const EList >& haplotypes = *(threadParam->haplotypes); EList nodes; EList edges; const EList& tmp_szs = refGraph.tmp_szs; index_t thread_id = threadParam->thread_id; index_t nthreads = refGraph.nthreads; std::ostringstream number; number << thread_id; const string rg_fname = threadParam->out_fname + "." + number.str() + ".rf"; ofstream rg_out_file(rg_fname.c_str(), ios::binary); if(!rg_out_file.good()) { cerr << "Could not open file for writing a reference graph: \"" << rg_fname << "\"" << endl; throw 1; } #ifndef NDEBUG set snp_set; #endif const bool bigEndian = threadParam->bigEndian; index_t& lastNode = threadParam->lastNode; index_t& num_nodes = threadParam->num_nodes; index_t& num_edges = threadParam->num_edges; index_t szs_idx = 0, szs_idx_end = (index_t)tmp_szs.size(); if(threadParam->thread_id != 0) { szs_idx = (index_t)((tmp_szs.size() / nthreads) * thread_id); } if(thread_id + 1 < nthreads) { szs_idx_end = (index_t)((tmp_szs.size() / nthreads) * (thread_id + 1)); } index_t curr_pos = 0; for(index_t i = 0; i < szs_idx; i++) { curr_pos += tmp_szs[i].len; } EList prev_tail_nodes; index_t alt_idx = 0, haplotype_idx = 0; for(; szs_idx < szs_idx_end; szs_idx++) { index_t curr_len = tmp_szs[szs_idx].len; if(curr_len <= 0) continue; index_t num_predicted_nodes = (index_t)(curr_len * 1.2); nodes.resizeExact(num_predicted_nodes); nodes.clear(); edges.resizeExact(num_predicted_nodes); edges.clear(); // Created head node nodes.expand(); nodes.back().label = 'Y'; nodes.back().value = 0; // Create nodes and edges corresponding to a reference genome assert_leq(curr_pos + curr_len, s.length()); for(size_t 
i = curr_pos; i < curr_pos + curr_len; i++) { nodes.expand(); nodes.back().label = "ACGT"[(int)s[i]]; nodes.back().value = (index_t)i; assert_geq(nodes.size(), 2); edges.expand(); edges.back().from = (index_t)nodes.size() - 2; edges.back().to = (index_t)nodes.size() - 1; } // Create tail node nodes.expand(); nodes.back().label = 'Z'; nodes.back().value = (index_t)s.length(); lastNode = (index_t)nodes.size() - 1; edges.expand(); edges.back().from = (index_t)nodes.size() - 2; edges.back().to = (index_t)nodes.size() - 1; ASSERT_ONLY(index_t backbone_nodes = (index_t)nodes.size()); // Create nodes and edges for haplotypes for(; haplotype_idx < haplotypes.size(); haplotype_idx++) { const Haplotype& haplotype = haplotypes[haplotype_idx]; if(haplotype.left < curr_pos) continue; if(haplotype.right >= curr_pos + curr_len) break; const EList& snpIDs = haplotype.alts; assert_gt(snpIDs.size(), 0); bool pass = true; for(index_t s = 0; s < snpIDs.size(); s++) { index_t snpID = snpIDs[s]; assert_lt(snpID, alts.size()); const ALT& snp = alts[snpID]; assert(snp.snp()); if(s + 1 >= snpIDs.size()) break; index_t snpID2 = snpIDs[s+1]; assert_lt(snpID2, alts.size()); const ALT& snp2 = alts[snpID2]; assert(snp2.snp()); if(snp.type == ALT_SNP_INS) { if(snp.pos > snp2.pos) { pass = false; break; } } else if(snp.type == ALT_SNP_DEL) { if(snp2.type == ALT_SNP_DEL) { if(snp.pos + snp.len >= snp2.pos) { pass = false; break; } } else { if(snp.pos + snp.len - 1 >= snp2.pos) { pass = false; break; } } } else { if(snp.pos >= snp2.pos) { pass = false; break; } } } if(!pass) continue; index_t prev_ALT_type = ALT_NONE; index_t ID_i = 0; for(index_t j = haplotype.left; j <= haplotype.right; j++) { if(prev_ALT_type == ALT_SNP_INS) j--; const ALT* altp = (ID_i < snpIDs.size() ? 
&(alts[snpIDs[ID_i]]) : NULL); assert(altp == NULL || altp->pos >= j); if(altp != NULL && altp->pos == j) { const ALT& alt = *altp; assert_lt(alt.pos, s.length()); assert(alt.snp()); if(alt.type == ALT_SNP_SGL) { assert_eq(alt.len, 1); nodes.expand(); assert_lt(alt.seq, 4); assert_neq(alt.seq & 0x3, s[alt.pos]); nodes.back().label = "ACGT"[alt.seq]; nodes.back().value = alt.pos; if(prev_ALT_type != ALT_SNP_DEL) { edges.expand(); if(j == haplotype.left) { edges.back().from = alt.pos - curr_pos; assert_lt(edges.back().from, backbone_nodes); } else { assert_gt(nodes.size(), 2); edges.back().from = (index_t)nodes.size() - 2; } edges.back().to = (index_t)nodes.size() - 1; } if(j == haplotype.right) { edges.expand(); edges.back().from = (index_t)nodes.size() - 1; edges.back().to = alt.pos - curr_pos + 2; assert_lt(edges.back().to, backbone_nodes); } } else if(alt.type == ALT_SNP_DEL) { assert_gt(alt.len, 0); assert_leq(alt.pos - curr_pos + alt.len, s.length()); edges.expand(); if(j == haplotype.left) { edges.back().from = alt.pos - curr_pos; assert_lt(edges.back().from, backbone_nodes); } else { edges.back().from = (index_t)nodes.size() - 1; } j += (alt.len - 1); assert_leq(j, haplotype.right); if(j == haplotype.right) { edges.back().to = alt.pos - curr_pos + alt.len + 1; assert_lt(edges.back().to, backbone_nodes); } else { edges.back().to = (index_t)nodes.size(); } } else { assert_eq(alt.type, ALT_SNP_INS) assert_gt(alt.len, 0); for(size_t k = 0; k < alt.len; k++) { uint64_t bp = alt.seq >> ((alt.len - k - 1) << 1); bp &= 0x3; char ch = "ACGT"[bp]; nodes.expand(); nodes.back().label = ch; nodes.back().value = (index_t)INDEX_MAX; if(prev_ALT_type == ALT_SNP_DEL && k == 0) continue; edges.expand(); edges.back().from = ((k == 0 && j == haplotype.left) ? 
alt.pos - curr_pos : (index_t)nodes.size() - 2); edges.back().to = (index_t)nodes.size() - 1; } if(j == haplotype.right) { edges.expand(); edges.back().from = (index_t)nodes.size() - 1; edges.back().to = alt.pos - curr_pos + 1; } } #ifndef NDEBUG snp_set.insert(snpIDs[ID_i]); #endif ID_i++; prev_ALT_type = alt.type; } else { int nt = s[j]; assert_lt(nt, 4); nodes.expand(); nodes.back().label = "ACGT"[nt]; nodes.back().value = j; if(prev_ALT_type != ALT_SNP_DEL) { edges.expand(); if(j == haplotype.left && prev_ALT_type == ALT_NONE) { edges.back().from = j - curr_pos; assert_lt(edges.back().from, backbone_nodes); } else { edges.back().from = (index_t)nodes.size() - 2; } edges.back().to = (index_t)nodes.size() - 1; } if(j == haplotype.right) { edges.expand(); edges.back().from = (index_t)nodes.size() - 1; edges.back().to = j - curr_pos + 2; assert_lt(edges.back().to, backbone_nodes); } prev_ALT_type = ALT_SNP_SGL; } } } // Create nodes and edges for splice sites for(; alt_idx < alts.size(); alt_idx++) { const ALT& alt = alts[alt_idx]; if(alt.pos < curr_pos) continue; if(alt.pos >= curr_pos + curr_len) break; if(!alt.splicesite()) continue; if(alt.excluded) continue; assert_lt(alt.left, alt.right); edges.expand(); edges.back().from = alt.left - curr_pos; edges.back().to = alt.right - curr_pos + 2; assert_lt(edges.back().from, backbone_nodes); assert_lt(edges.back().to, backbone_nodes); } #ifndef NDEBUG if(refGraph.debug) { cerr << "Nodes:" << endl; for(size_t i = 0; i < nodes.size(); i++) { const Node& node = nodes[i]; cerr << "\t" << i << "\t" << node.label << "\t" << node.value << endl; } cerr << endl; cerr << "Edges: " << endl; for(size_t i = 0; i < edges.size(); i++) { const Edge& edge = edges[i]; cerr << "\t" << i << "\t" << edge.from << " --> " << edge.to << endl; } cerr << endl; } #endif if(!isReverseDeterministic(nodes, edges)) { reverseDeterminize(nodes, edges, lastNode, curr_pos > 0 ? 
curr_pos + 1 : 0); assert(isReverseDeterministic(nodes, edges)); } // Identify head index_t head_node = (index_t)nodes.size(); for(index_t i = 0; i < nodes.size(); i++) { if(nodes[i].label == 'Y') { head_node = i; break; } } assert_lt(head_node, nodes.size()); index_t tail_node = lastNode; assert_lt(tail_node, nodes.size()); // Update edges const index_t invalid = (index_t)INDEX_MAX; bool head_off = curr_pos > 0, tail_off = curr_pos + curr_len < jlen; for(index_t i = 0; i < edges.size(); i++) { index_t from = edges[i].from; from = from + num_nodes; if(head_off && edges[i].from > head_node) from -= 1; if(tail_off && edges[i].from > tail_node) from -= 1; if(head_off && edges[i].from == head_node) { edges[i].from = invalid; } else { edges[i].from = from; } index_t to = edges[i].to; to = to + num_nodes; if(head_off && edges[i].to > head_node) to -= 1; if(tail_off && edges[i].to > tail_node) to -= 1; if(tail_off && edges[i].to == tail_node) { edges[i].to = invalid; } else { edges[i].to = to; } } head_node = tail_node = invalid; // Also update lastNode if(!tail_off) { lastNode += num_nodes; if(head_off) lastNode -= 1; } // Connect head nodes with tail nodes in the previous automaton index_t num_head_nodes = 0; index_t tmp_num_edges = (index_t)edges.size(); if(head_off) { EList nodes_to_head; for(index_t i = 0; i < tmp_num_edges; i++) { if(edges[i].from == head_node) { num_head_nodes++; if(prev_tail_nodes.size() > 0) { for(index_t j = 0; j < prev_tail_nodes.size(); j++) { edges.expand(); edges.back().from = prev_tail_nodes[j]; edges.back().to = edges[i].to; assert_lt(edges.back().from, edges.back().to); } } else { nodes_to_head.push_back(edges[i].to); } } } if(nodes_to_head.size() > 0) { assert_gt(thread_id, 0); assert_eq(prev_tail_nodes.size(), 0); writeIndex(rg_out_file, (index_t)nodes_to_head.size(), bigEndian); for(index_t i = 0; i < nodes_to_head.size(); i++) { writeIndex(rg_out_file, nodes_to_head[i], bigEndian); } } } // Need to check if it's reverse-deterministic 
if(num_head_nodes > 1) { threadParam->multipleHeadNodes = true; } // List tail nodes prev_tail_nodes.clear(); if(tail_off) { for(index_t i = 0; i < tmp_num_edges; i++) { if(edges[i].to == tail_node) { prev_tail_nodes.push_back(edges[i].from); } } } // Write nodes and edges index_t tmp_num_nodes = (index_t)nodes.size(); assert_gt(tmp_num_nodes, 2); if(head_off) tmp_num_nodes--; if(tail_off) tmp_num_nodes--; writeIndex(rg_out_file, tmp_num_nodes, bigEndian); ASSERT_ONLY(index_t num_nodes_written = 0); for(index_t i = 0; i < nodes.size(); i++) { if(head_off && nodes[i].label == 'Y') continue; if(tail_off && nodes[i].label == 'Z') continue; nodes[i].write(rg_out_file, bigEndian); ASSERT_ONLY(num_nodes_written++); } assert_eq(tmp_num_nodes, num_nodes_written); tmp_num_edges = (index_t)edges.size(); assert_geq(tmp_num_edges, num_head_nodes + prev_tail_nodes.size()); if(head_off) tmp_num_edges -= num_head_nodes; if(tail_off) tmp_num_edges -= prev_tail_nodes.size(); writeIndex(rg_out_file, tmp_num_edges, bigEndian); ASSERT_ONLY(index_t num_edges_written = 0); for(index_t i = 0; i < edges.size(); i++) { if(head_off && edges[i].from == head_node) continue; if(tail_off && edges[i].to == tail_node) continue; edges[i].write(rg_out_file, bigEndian); ASSERT_ONLY(num_edges_written++); } assert_eq(tmp_num_edges, num_edges_written); // Clear nodes and edges nodes.clear(); edges.clear(); curr_pos += curr_len; num_nodes += tmp_num_nodes; num_edges += tmp_num_edges; } if(nthreads > 1 && thread_id + 1 < (index_t)nthreads && prev_tail_nodes.size() > 0) { writeIndex(rg_out_file, (index_t)prev_tail_nodes.size(), bigEndian); for(index_t i = 0; i < prev_tail_nodes.size(); i++) { writeIndex(rg_out_file, prev_tail_nodes[i], bigEndian); } } // Close out file handle rg_out_file.close(); } template bool RefGraph::isReverseDeterministic(EList& nodes, EList& edges) { if(edges.size() <= 0) return true; // Sort edges by "to" nodes sortEdgesTo(edges); index_t curr_to = (index_t)INDEX_MAX; EList seen; 
seen.resize(5);
seen.fillZero();
// Edges are grouped by .to after sortEdgesTo(); within each group, a repeated
// predecessor label means two in-edges are indistinguishable -> not reverse
// deterministic.
for(index_t i = 0; i < edges.size(); i++) {
    index_t from = edges[i].from;
    assert_lt(from, nodes.size());
    char nt = nodes[from].label;
    assert(nt == 'A' || nt == 'C' || nt == 'G' || nt == 'T' || nt == 'Y');
    // Map label to a slot: head sentinel 'Y' -> 4, bases via asc2dna -> 0..3.
    if(nt == 'Y') nt = 4;
    else nt = asc2dna[(int)nt];
    assert_lt(nt, seen.size());
    if(curr_to != edges[i].to) {
        // New target node: reset the per-target label set.
        curr_to = edges[i].to;
        seen.fillZero();
        seen[nt] = true;
    } else {
        if(seen[nt]) {
            // Duplicate predecessor label for the same target.
            return false;
        }
        seen[nt] = true;
    }
}
return true;
}

// Rewrites (nodes, edges) into an equivalent reverse-deterministic graph by
// merging same-labeled predecessors into composite nodes (subset construction
// run backwards from the final 'Z' node). Updates lastNode to the new index of
// the 'Z' node. lastNode_add is not referenced in the visible body —
// NOTE(review): confirm whether it is vestigial.
template void RefGraph::reverseDeterminize(EList& nodes, EList& edges, index_t& lastNode, index_t lastNode_add) {
    EList cnodes;                 // composite nodes under construction
    cnodes.ensure(nodes.size());
    map cnode_map;                // set-of-original-nodes -> composite node id
    deque active_cnodes;          // work queue (BFS over composites)
    EList cedges;                 // composite edges
    cedges.ensure(edges.size());
    // Start from the final node ('Z')
    assert_lt(lastNode, nodes.size());
    const Node& last_node = nodes[lastNode];
    cnodes.expand();
    cnodes.back().reset();
    cnodes.back().label = last_node.label;
    cnodes.back().value = last_node.value;
    cnodes.back().nodes.push_back(lastNode);
    active_cnodes.push_back(0);
    cnode_map[cnodes.back().nodes] = 0;
    sortEdgesTo(edges);           // enables findEdgesTo() range lookups below
    index_t firstNode = 0; // Y -> ...
// -> Z   (tail of the "Y -> ... -> Z" comment, split across the chunk boundary)
EList predecessors;
while(!active_cnodes.empty()) {
    index_t cnode_id = active_cnodes.front();
    active_cnodes.pop_front();
    assert_lt(cnode_id, cnodes.size());
    // Find predecessors of this composite node
    predecessors.clear();
    for(size_t i = 0; i < cnodes[cnode_id].nodes.size(); i++) {
        index_t node_id = cnodes[cnode_id].nodes.getID((index_t)i);
        pair edge_range = findEdgesTo(edges, node_id);
        assert_leq(edge_range.first, edge_range.second);
        assert_leq(edge_range.second, edges.size());
        for(index_t j = edge_range.first; j < edge_range.second; j++) {
            assert_eq(edges[j].to, node_id);
            predecessors.push_back(edges[j].from);
        }
    }
    if(predecessors.size() >= 2) {
        // Remove redundant nodes
        predecessors.sort();
        index_t new_size = (index_t)(unique(predecessors.begin(), predecessors.end()) - predecessors.begin());
        predecessors.resize(new_size);
        // Create composite nodes by labels
        // (stable sort groups equal-labeled predecessors while preserving order)
        stable_sort(predecessors.begin(), predecessors.end(), TempNodeLabelCmp(nodes));
    }
    // Each run of equal-labeled predecessors becomes one composite node.
    for(size_t i = 0; i < predecessors.size();) {
        index_t node_id = predecessors[i];
        assert_lt(node_id, nodes.size());
        const Node& node = nodes[node_id];
        i++;
        cnodes.expand();
        cnodes.back().reset();
        cnodes.back().label = node.label;
        cnodes.back().value = node.value;
        cnodes.back().nodes.push_back(node_id);
        // Remember the first composite holding the head sentinel 'Y'.
        if(node.label == 'Y' && firstNode == 0) {
            firstNode = (index_t)cnodes.size() - 1;
        }
        // Absorb the rest of the same-label run into this composite.
        while(i < predecessors.size()) {
            index_t next_node_id = predecessors[i];
            assert_lt(next_node_id, nodes.size());
            const Node& next_node = nodes[next_node_id];
            if(next_node.label != node.label) break;
            cnodes.back().nodes.push_back(next_node_id);
            // Track the maximum valid genomic position among merged nodes;
            // INDEX_MAX marks "no position".
            if(next_node.value != (index_t)INDEX_MAX) {
                if(cnodes.back().value == (index_t)INDEX_MAX) {
                    cnodes.back().value = next_node.value;
                } else {
                    cnodes.back().value = max(cnodes.back().value, next_node.value);
                }
            }
            i++;
        }
        // Create edges from this new composite node to current composite node
        // Deduplicate composites by their underlying node set.
        typename map::iterator existing = cnode_map.find(cnodes.back().nodes);
        if(existing == cnode_map.end()) {
cnode_map[cnodes.back().nodes] = (index_t)cnodes.size() - 1;
active_cnodes.push_back((index_t)cnodes.size() - 1);
cedges.push_back(CompositeEdge((index_t)cnodes.size() - 1, cnode_id));
} else {
    // Identical composite already exists: reuse it and drop the duplicate.
    cnodes.pop_back();
    cedges.push_back(CompositeEdge((*existing).second, cnode_id));
}
// Increment indegree
// (cnode.id doubles as an in-degree counter until final numbering below)
cnodes[cnode_id].id++;
}
}
// Interchange from and to
// Temporarily reverse edge direction so bsearchLoBound can look up edges by
// their (original) source node.
for(index_t i = 0; i < cedges.size(); i++) {
    index_t tmp = cedges[i].from;
    cedges[i].from = cedges[i].to;
    cedges[i].to = tmp;
}
sort(cedges.begin(), cedges.end());
// Walk backwards from 'Z' following predecessors whose value is exactly one
// less — NOTE(review): appears to trace a backbone path; confirm intent.
active_cnodes.push_back(0);
while(!active_cnodes.empty()) {
    index_t cnode_id = active_cnodes.front();
    active_cnodes.pop_front();
    assert_lt(cnode_id, cnodes.size());
    const CompositeNode& cnode = cnodes[cnode_id];
    index_t i = (index_t)cedges.bsearchLoBound(CompositeEdge(cnode_id, 0));
    while(i < cedges.size()) {
        assert_geq(cedges[i].from, cnode_id);
        if(cedges[i].from != cnode_id) break;
        index_t predecessor_cnode_id = cedges[i].to;
        assert_lt(predecessor_cnode_id, cnodes.size());
        CompositeNode& predecessor_cnode = cnodes[predecessor_cnode_id];
        if(cnode.value == predecessor_cnode.value + 1) {
            active_cnodes.push_back(predecessor_cnode_id);
            break;
        }
        i++;
    }
}
// Restore from and to by interchanging them
for(index_t i = 0; i < cedges.size(); i++) {
    index_t tmp = cedges[i].from;
    cedges[i].from = cedges[i].to;
    cedges[i].to = tmp;
}
// Create new nodes
// Emit composites in topological order (Kahn-style, using the in-degree
// counts accumulated in cnode.id), starting from the 'Y' composite.
lastNode = 0; // Invalidate lastNode
nodes.resizeExact(cnodes.size());
nodes.clear();
assert_neq(firstNode, 0);
assert_lt(firstNode, cnodes.size());
CompositeNode& first_node = cnodes[firstNode];
first_node.id = 0;
nodes.expand();
nodes.back() = first_node.getNode();
active_cnodes.push_back(firstNode);
sort(cedges.begin(), cedges.end());
while(!active_cnodes.empty()) {
    index_t cnode_id = active_cnodes.front();
    active_cnodes.pop_front();
    assert_lt(cnode_id, cnodes.size());
    index_t i = (index_t)cedges.bsearchLoBound(CompositeEdge(cnode_id, 0));
    while(i < cedges.size()) {
        assert_geq(cedges[i].from, cnode_id);
if(cedges[i].from != cnode_id) break;
index_t successor_cnode_id = cedges[i].to;
assert_lt(successor_cnode_id, cnodes.size());
CompositeNode& successor_cnode = cnodes[successor_cnode_id];
assert_gt(successor_cnode.id, 0);
// Decrement remaining in-degree; when it hits zero the successor can be
// emitted, and its id is repurposed as its final node index.
successor_cnode.id--;
if(successor_cnode.id == 0) {
    active_cnodes.push_back(successor_cnode_id);
    successor_cnode.id = (index_t)nodes.size();
    nodes.expand();
    nodes.back() = successor_cnode.getNode();
    if(nodes.back().label == 'Z') {
        assert_eq(lastNode, 0);
        assert_gt(nodes.size(), 1);
        lastNode = (index_t)nodes.size() - 1;
    }
}
i++;
}
}
// Create new edges
// Translate composite edges back to plain edges via the final node indices.
edges.resizeExact(cedges.size());
edges.clear();
for(index_t i = 0; i < cedges.size(); i++) {
    const CompositeEdge& edge = cedges[i];
    edges.expand();
    edges.back() = edge.getEdge(cnodes);
}
sortEdgesFrom(edges);
// Disabled debug dump of the determinized graph.
#if 0
#ifndef NDEBUG
if(debug) {
    cerr << "Nodes:" << endl;
    for(size_t i = 0; i < nodes.size(); i++) {
        const Node& node = nodes[i];
        cerr << "\t" << i << "\t" << node.label << "\t" << node.value << (node.backbone ? "\tbackbone" : "") << endl;
    }
    cerr << endl;
    cerr << "Edges: " << endl;
    for(size_t i = 0; i < edges.size(); i++) {
        const Edge& edge = edges[i];
        cerr << "\t" << i << "\t" << edge.from << " --> " << edge.to << endl;
    }
    cerr << endl;
}
#endif
#endif
}

// Graph of prefix-sorted paths built from a RefGraph (GCSA-style, see the
// reference at the top of the file). Node keys are iteratively doubled until
// every node's path prefix is unique ("prefix-sorted").
template class PathGraph {
public:
    // A path node: a path of length 2^generation through the RefGraph.
    struct PathNode {
        index_t from;   // RefGraph node the path starts at
        index_t to;     // RefGraph node the path currently ends at; reused as
                        // a "sorted" flag (INDEX_MAX) and later as genomic pos
        pair key;       // sort key; meaning changes per generation
        void setSorted()      { to = (index_t)INDEX_MAX; }
        bool isSorted() const { return to == (index_t)INDEX_MAX; }
        index_t value() const { return to; }
        index_t outdegree() const { return key.first; }
        bool operator< (const PathNode& o) const {
            return key < o.key;
        };
    };
    // Key extractors used by radix_sort_copy.
    static inline index_t PathNodeFrom (PathNode& a) {
        return a.from;
    }
    static inline index_t PathNodeKey  (PathNode& a) {
        return a.key.first;
    }
    struct PathNodeKeySecondCmp {
        bool operator() (const PathNode& a, const PathNode& b) const {
            return a.key.second < b.key.second;
        }
    };
    struct PathNodeFromCmp {
        bool operator() (const PathNode& a, const PathNode& b) const {
            return a.from < b.from;
        }
    };
    struct
PathNodeToCmp {
    bool operator() (const PathNode& a, const PathNode& b) const {
        return a.to < b.to;
    }
};
// Edge between path nodes; 'to' and 'ranking' share storage because an edge
// target is identified by rank once nodes are sorted.
struct PathEdge {
    index_t from;
    union {
        index_t to;
        index_t ranking;
    };
    char label;  // ACGT, Y, or Z label of the source
    PathEdge() { reset(); }
    PathEdge(index_t from_, index_t ranking_, char label_) : from(from_), ranking(ranking_), label(label_) {}
    void reset() {
        from = 0;
        ranking = 0;
        label = 0;
    }
    // Order by label first, then rank — the GBWT/BWT column order.
    bool operator< (const PathEdge& o) const {
        return label < o.label || (label == o.label && ranking < o.ranking);
    };
};
static inline index_t PathEdgeTo (PathEdge& a) {
    return a.to;
}
struct PathEdgeFromCmp {
    bool operator() (const PathEdge& a, const PathEdge& b) const {
        return a.from < b.from || (a.from == b.from && a.to < b.to);
    }
};
struct PathEdgeToCmp {
    bool operator() (const PathEdge& a, const PathEdge& b) const {
        return a.to < b.to || (a.to == b.to && a.from < b.from);
    }
};
public:
// Create a new graph in which paths are represented using nodes
PathGraph( RefGraph& parent, const string& base_fname, size_t max_num_nodes_ = std::numeric_limits::max(), int nthreads_ = 1, bool verbose_ = false);
~PathGraph() {}
void printInfo();
bool generateEdges(RefGraph& parent);
index_t getNumNodes() const { return (index_t)nodes.size(); }
index_t getNumEdges() const { return (index_t)edges.size(); }
bool isSorted() const { return sorted; }
// Streams out one GBWT row per call: the edge label, the F-column bit, the
// M-column bit and the genomic position. Returns false when exhausted.
bool nextRow(int& gbwtChar, int& F, int& M, index_t& pos) {
    if(report_node_idx >= nodes.size()) return false;
    bool firstOutEdge = false;
    if(report_edge_range.first >= report_edge_range.second) {
        // Advance to the out-edge range of the next node.
        report_edge_range = getEdges(report_node_idx, false /* from? */);
        firstOutEdge = true;
        if(report_node_idx == 0) {
            report_M = pair(0, 0);
        }
    }
    assert_lt(report_edge_range.first, report_edge_range.second);
    assert_lt(report_edge_range.first, edges.size());
    const PathEdge& edge = edges[report_edge_range.first];
    gbwtChar = edge.label;
    assert_lt(report_node_idx, nodes.size());
    // F bit is 1 only for a node's first out-edge.
    F = (firstOutEdge ?
1 : 0);
report_edge_range.first++;
if(report_edge_range.first >= report_edge_range.second) {
    report_node_idx++;
}
assert_lt(report_M.first, nodes.size());
// nodes[].to holds the genomic position at reporting time (set in generateEdges).
pos = nodes[report_M.first].to;
// M bit is 1 for the first in-edge of each node; key.first holds in-degree here.
M = (report_M.second == 0 ? 1 : 0);
report_M.second++;
if(report_M.second >= nodes[report_M.first].key.first) {
    report_M.first++;
    report_M.second = 0;
}
return true;
}
// Returns the F-array location corresponding to the next node's first
// out-edge (INDEX_MAX when exhausted).
index_t nextFLocation() {
    if(report_F_node_idx >= nodes.size()) return (index_t)INDEX_MAX;
    index_t ret = report_F_location;
    pair edge_range = getEdges(report_F_node_idx, false /* from? */);
    report_F_node_idx++;
    assert_lt(edge_range.first, edge_range.second);
    report_F_location += (edge_range.second - edge_range.first);
    return ret;
}
private:
// Construction pipeline (see the constructor for sequencing).
void makeFromRef(RefGraph& base);
void generationOne();
void earlyGeneration();
void firstPruneGeneration();
void lateGeneration();
void mergeUpdateRank();
pair nextMaximalSet(pair range);
pair getEdges(index_t node, bool by_from);
// Create index first.
// Per-thread work descriptor for createNewNodes().
struct CreateNewNodesParams {
    PathNode* st;          // first node of this thread's slice
    PathNode* en;          // one past the last node of the slice
    PathNode* curr;        // output cursor into the new nodes array
    index_t* sub_temp_nodes; // out: number of nodes this slice will produce
    PathGraph* graph;
};
static void createNewNodesCounter(void* vp);
static void createNewNodesMaker(void* vp);
void createNewNodes();
// Per-thread work descriptor for generateEdges().
struct GenEdgesParams {
    typename RefGraph::Edge* st;
    typename RefGraph::Edge* en;
    EList* label_index;    // per-label output offsets into edges
    EList* nodes;
    EList* edges;
    EList::Node>* ref_nodes;
};
static void generateEdgesCounter(void* vp);
static void generateEdgesMaker(void* vp);
private:
int nthreads;
bool verbose;
EList from_table;        // nodes sorted by .from, used as a join table
EList past_nodes;        // previous generation's nodes
EList nodes;             // current generation's nodes
EList edges;
index_t ranks;           // number of distinct ranks after the last merge
index_t max_from; //number of nodes in RefGraph
index_t temp_nodes; // Total number of nodes created before sorting.
index_t generation; // Sorted by paths of length 2^generation.
bool sorted; // For reporting GBWT char, F, and M values index_t report_node_idx; pair report_edge_range; pair report_M; // For reporting location in F corresponding to 1 bit in M index_t report_F_node_idx; index_t report_F_location; size_t max_num_nodes; // following variables are for debugging purposes #ifndef NDEBUG bool debug; #endif EList bwt_string; EList F_array; EList M_array; EList bwt_counts; // brute-force implementations index_t select(const EList& array, index_t p, char c) { if(p <= 0) return 0; for(index_t i = 0; i < array.size(); i++) { if(array[i] == c) { assert_gt(p, 0); p--; if(p == 0) return i; } } return (index_t)array.size(); } index_t select1(const EList& array, index_t p) { return select(array, p, 1); } index_t rank(const EList& array, index_t p, char c) { index_t count = 0; assert_lt(p, array.size()); for(index_t i = 0; i <= p; i++) { if(array[i] == c) count++; } return count; } index_t rank1(const EList& array, index_t p) { return rank(array, p, 1); } // for debugging purposes #ifndef NDEBUG public: EList > ftab; #endif }; //creates prefix-sorted PathGraph Nodes given a reverse determinized RefGraph //outputs nodes sorted by their from attribute template PathGraph::PathGraph( RefGraph& base, const string& base_fname, size_t max_num_nodes_, int nthreads_, bool verbose_) : nthreads(nthreads_), verbose(verbose_), ranks(0), temp_nodes(0), generation(0), sorted(false), report_node_idx(0), report_edge_range(pair(0, 0)), report_M(pair(0, 0)), report_F_node_idx(0), report_F_location(0), max_num_nodes(max_num_nodes_) { #ifndef NDEBUG debug = base.nodes.size() <= 20; #endif // Fill nodes with a PathNode for each edge in base.edges. // Set max_from. 
makeFromRef(base);

// Write RefGraph into a file
// For large graphs (> 2^22 nodes), spill the RefGraph to disk and free its
// memory while the PathGraph is being built; it is re-read at the end.
const bool file_rf = base.nodes.size() > (1 << 22);
const bool bigEndian = false;
const string rf_fname = base_fname + ".rf";
if(file_rf) {
    base.write(rf_fname, bigEndian);
    base.nullify();
}
// In the first generation the nodes enter, not quite sorted by from.
// We use a counting sort to sort the nodes, otherwise same as early generation.
generationOne();
// In early generations no nodes become sorted.
// Therefore, we skip the pruning step and leave the
// nodes sorted by from.
while(generation < 3) {
    earlyGeneration();
}
// On the first generation we perform a pruning step,
// we are forced to sort the entire list of nodes by rank
// in order to perform pruning step.
firstPruneGeneration();
// In later generations, most nodes are already sorted, so we
// perform a more expensive random access join with nodes in rank order
// in return for avoiding having to sort by rank in order to prune nodes.
while(!isSorted()) {
    lateGeneration();
}
// In the generateEdges method it is convenient to begin with nodes sorted by from.
// We perform this action here, while we still have past_nodes allocated to avoid
// an in-place sort.
nodes.resizeNoCopyExact(past_nodes.size());
radix_sort_copy(past_nodes.begin(), past_nodes.end(), nodes.ptr(), &PathNodeFrom, max_from, nthreads);
// Release construction-time buffers.
past_nodes.nullify();
from_table.nullify();
// Reload the RefGraph if it was spilled to disk, then delete the temp file.
if(file_rf) {
    base.read(rf_fname, bigEndian);
    std::remove(rf_fname.c_str());
}
}

//make original unsorted PathNodes given a RefGraph
// One PathNode per RefGraph edge (key = source label mapped to 0..4), plus a
// final PathNode for the 'Z' node (key 5). Also records max_from.
template void PathGraph::makeFromRef(RefGraph& base) {
    // Create a path node per edge with a key set to from node's label
    temp_nodes = (index_t)base.edges.size() + 1;
    max_from = 0;
    nodes.reserveExact(temp_nodes);
    for(index_t i = 0; i < base.edges.size(); i++) {
        const typename RefGraph::Edge& e = base.edges[i];
        nodes.expand();
        nodes.back().from = e.from;
        if(e.from > max_from) max_from = e.from;
        nodes.back().to = e.to;
        // Label -> initial 3-bit key: A=0, C=1, G=2, T=3, Y=4 (Z=5 below).
        switch(base.nodes[e.from].label) {
        case 'A': nodes.back().key = pair(0, 0); break;
        case 'C': nodes.back().key = pair(1, 0); break;
        case 'G': nodes.back().key = pair(2, 0); break;
        case 'T': nodes.back().key = pair(3, 0); break;
        case 'Y': nodes.back().key = pair(4, 0); break;
        default: assert(false); throw 1;
        }
    }
    // Final node.
assert_lt(base.lastNode, base.nodes.size());
assert_eq(base.nodes[base.lastNode].label, 'Z');
// Self-loop PathNode for the terminal 'Z' node with label key 5.
nodes.expand();
nodes.back().from = nodes.back().to = base.lastNode;
if(base.lastNode > max_from) max_from = base.lastNode;
nodes.back().key = pair(5, 0);
printInfo();
}

template void PathGraph::generationOne() {
    //nodes enter almost sorted by from
    //this is only generation method that whose
    // incoming nodes are in the nodes EList
    generation++;
    //Sort nodes by from using counting sort
    //Copy into past_nodes in the process
    //first count number with each from value
    // (key.second of nodes[] is repurposed as the counting-sort histogram,
    // indexed by each node's .from value)
    for(PathNode* node = nodes.begin(); node != nodes.end(); node++) {
        nodes[node->from].key.second++;
    }
    //convert into an index
    // exclusive prefix sum: key.second becomes the start offset per from value
    index_t tot = nodes[0].key.second;
    nodes[0].key.second = 0;
    for(index_t i = 1; i < max_from + 2; i++) {
        tot += nodes[i].key.second;
        nodes[i].key.second = tot - nodes[i].key.second;
    }
    // use past_nodes as from_table
    past_nodes.resizeExact(nodes.size());
    for(PathNode* node = nodes.begin(); node != nodes.end(); node++) {
        past_nodes[nodes[node->from].key.second++] = *node;
    }
    //reset index
    // shift offsets back so past_nodes[f].key.second is the start of from==f
    // (these entries serve as the direct-access index for createNewNodes)
    for(index_t i = max_from + 1; i > 0; i--) {
        past_nodes[i].key.second = nodes[i - 1].key.second;
    }
    past_nodes[0].key.second = 0;
    //Now query direct-access table
    createNewNodes();
    printInfo();
    past_nodes.swap(nodes);
}

template void PathGraph::earlyGeneration() {
    //past_nodes enter sorted by from
    //do not yet need to perform pruning step
    generation++;
    // Build the from-index in place: key.second of entry (from+1) marks the
    // end of the run of nodes with that from value.
    for(index_t i = 0; i < past_nodes.size(); i++) {
        past_nodes[past_nodes[i].from + 1].key.second = i + 1;
    }
    createNewNodes();
    printInfo();
    past_nodes.swap(nodes);
}

template void PathGraph::firstPruneGeneration() {
    //past_nodes enter sorted by from
    //first generation where we need to perform pruning step
    // results in us needing to sort entirety of nodes after they are made
    generation++;
    //here past_nodes is already sorted by .from
    // first count where to start each from value
    time_t start = time(0);
    //Build from_index
    for(index_t i = 0; i < past_nodes.size(); i++) {
past_nodes[past_nodes[i].from + 1].key.second = i + 1;
}
if(verbose) cerr << "BUILT FROM_INDEX: " << time(0) - start << endl;
start = time(0);
// Now query against direct-access table
createNewNodes();
past_nodes.resizeNoCopyExact(nodes.size());
if(verbose) cerr << "RESIZE NODES: " << time(0) - start << endl;
start = time(0);
//max_rank always corresponds to repeated Z's
// Z is mapped to 0x101
// therefore max rank = 101101101101101101101101 = (101) 8 times
index_t max_rank = 11983725;
radix_sort_copy, index_t>(nodes.begin(), nodes.end(), past_nodes.ptr(), &PathNodeKey, max_rank, nthreads);
if(verbose) cerr << "SORT NODES: " << time(0) - start << endl;
start = time(0);
nodes.swap(past_nodes);
mergeUpdateRank();
if(verbose) cerr << "MERGE, UPDATE RANK: " << time(0) - start << endl;
start = time(0);
printInfo();
past_nodes.swap(nodes);
}

template void PathGraph::lateGeneration() {
    //past_nodes enter sorted by rank
    //build direct-access table sorted by from,
    //but query with original nodes sorted by rank
    //since nodes we query with are sorted by rank,
    // the nodes produced are automatically sorted by key.first
    // therefore we only need to sort clusters with same key.first
    generation++;
    time_t overall = time(0);
    time_t indiv = time(0);
    assert_gt(nthreads, 0);
    // If every node already had a unique rank we would be sorted and
    // shouldn't be here.
    assert_neq(past_nodes.size(), ranks);
    from_table.resizeNoCopy(past_nodes.size());
    if(verbose) cerr << "ALLOCATE FROM_TABLE: " << time(0) - indiv << endl;
    indiv = time(0);
    radix_sort_copy(past_nodes.begin(), past_nodes.end(), from_table.ptr(), &PathNodeFrom, max_from, nthreads);
    if(verbose) cerr << "BUILD TABLE: " << time(0) - indiv << endl;
    indiv = time(0);
    //Build from_index
    // Grow the table if an index entry would land past the end; extra slots
    // are trimmed again after mergeUpdateRank().
    index_t from_table_size = from_table.size();
    for(index_t i = 0; i < from_table_size; i++) {
        if(from_table[i].from + 1 >= from_table.size()) {
            from_table.resize(from_table[i].from + 2);
        }
        from_table[from_table[i].from + 1].key.second = i + 1;
    }
    if(verbose) cerr << "BUILD INDEX: " << time(0) - indiv << endl;
    createNewNodes();
    indiv = time(0);
mergeUpdateRank();
// Trim any slots added while building the from-index above.
if(from_table_size != from_table.size()) {
    assert_lt(from_table_size, from_table.size());
    from_table.resize(from_table_size);
}
if(verbose) cerr << "MERGEUPDATERANK: " << time(0) - indiv << endl;
if(verbose) cerr << "TOTAL TIME: " << time(0) - overall << endl;
// Guard against combinatorial explosion of path nodes (caller catches this).
if(ranks >= (index_t)max_num_nodes) {
    throw ExplosionException();
}
printInfo();
past_nodes.swap(nodes);
}

//-----------------------------------------------------------------------------------------------
// Thread body: counts how many PathNodes this slice will produce in the next
// generation (sorted nodes pass through 1:1; unsorted ones fan out to every
// node reachable via the from-index join).
template void PathGraph::createNewNodesCounter(void* vp) {
    CreateNewNodesParams* params = (CreateNewNodesParams*)vp;
    PathNode* st = params->st;
    PathNode* en = params->en;
    PathGraph& graph = *(params->graph);
    size_t count = 0;
    if(graph.generation > 4) {
        // Late generations join against from_table.
        for(PathNode* node = st; node != en; node++) {
            if(node->isSorted()) {
                count++;
            } else {
                count += graph.from_table[node->to + 1].key.second - graph.from_table[node->to].key.second;
            }
        }
    } else {
        // Early generations join against past_nodes (indexed by from).
        for(PathNode* node = st; node != en; node++) {
            count += graph.past_nodes[node->to + 1].key.second - graph.past_nodes[node->to].key.second;
        }
    }
    *(params->sub_temp_nodes) = (index_t)count;
    //check for overflow
    if(count > (index_t)-1) {
        cerr << "exceeded integer bounds, remove adjacent SNPs, use haplotypes, or switch to a large index (--large-index)" << endl;
        throw 1;
    }
}

// Thread body: materializes the new generation's PathNodes counted above into
// the pre-sized output slice at params->curr.
template void PathGraph::createNewNodesMaker(void* vp) {
    CreateNewNodesParams* params = (CreateNewNodesParams*)vp;
    PathNode* st = params->st;
    PathNode* en = params->en;
    PathNode* curr = params->curr;
    PathGraph& graph = *(params->graph);
    if(graph.generation > 4) {
        for(PathNode* node = st; node != en; node++) {
            if(node->isSorted()) {
                // Already prefix-sorted: carry over unchanged.
                *curr++ = *node;
            } else {
                // Extend the path by each continuation starting at node->to;
                // new key = (old rank, continuation's rank).
                for(index_t j = graph.from_table[node->to].key.second; j < graph.from_table[node->to + 1].key.second; j++) {
                    curr->from = node->from;
                    curr->to = graph.from_table[j].to;
                    (curr++)->key = pair(node->key.first, graph.from_table[j].key.first);
                }
            }
        }
    } else if(graph.generation == 4) {
        for(PathNode* node = st; node != en; node++) {
for(index_t j = graph.past_nodes[node->to].key.second; j < graph.past_nodes[node->to + 1].key.second; j++) {
    curr->from = node->from;
    curr->to = graph.past_nodes[j].to;
    // Generation 4: keep both halves of the key for the upcoming full sort.
    (curr++)->key = pair(node->key.first, graph.past_nodes[j].key.first);
}
}
} else {
// Generations < 4: pack the two keys into one integer by shifting the old
// key left 3 * 2^(generation-1) bits (labels occupy 3 bits each).
for(PathNode* node = st; node != en; node++) {
    for(index_t j = graph.past_nodes[node->to].key.second; j < graph.past_nodes[node->to + 1].key.second; j++) {
        curr->from = node->from;
        curr->to = graph.past_nodes[j].to;
        index_t bit_shift = 1 << (graph.generation - 1);
        bit_shift = (bit_shift << 1) + bit_shift; // bit_shift *= 3
        (curr++)->key = pair((node->key.first << bit_shift) + graph.past_nodes[j].key.first, 0);
    }
}
}
}

// Doubles path length for every node: counts the next generation's size per
// thread slice, allocates once, then builds the nodes in parallel.
// (NOTE: encoding repair — the source chunk's "¶ms" is mojibake for
// "&params"; restored below.)
template void PathGraph::createNewNodes() {
    time_t indiv = time(0);
    AutoArray threads(nthreads);
    EList params;
    params.resizeExact(nthreads);
    EList sub_temp_nodes;
    sub_temp_nodes.resizeExact(nthreads);
    sub_temp_nodes.fillZero();
    // Slice past_nodes evenly across threads; last thread absorbs remainder.
    PathNode* st = past_nodes.begin();
    PathNode* en = st + past_nodes.size() / nthreads;
    for(int i = 0; i < nthreads; i++) {
        params[i].sub_temp_nodes = &sub_temp_nodes[i];
        params[i].st = st;
        params[i].en = en;
        params[i].graph = this;
        if(nthreads == 1) {
            createNewNodesCounter((void*)&params[0]);
        } else {
            threads[i] = new tthread::thread(&createNewNodesCounter, (void*)&params[i]);
        }
        st = en;
        if(i + 2 == nthreads) {
            en = past_nodes.end();
        } else {
            en = st + past_nodes.size() / nthreads;
        }
    }
    if(nthreads > 1) {
        for(int i = 0; i < nthreads; i++) threads[i]->join();
    }
    if(verbose) cerr << "COUNTED NEW NODES: " << time(0) - indiv << endl;
    indiv = time(0);
    //update all label indexes
    temp_nodes = 0;
    for(int i = 0; i < nthreads; i++) {
        // done to check if we exceed index_t range
        size_t val = (size_t)temp_nodes + (size_t)sub_temp_nodes[i];
        if(val > (index_t)-1) {
            cerr << "exceeded integer bounds, remove adjacent SNPs, use haplotypes, or switch to a large index (--large-index)" << endl;
            throw 1;
        }
        temp_nodes = (index_t)val;
    }
    if(verbose) cerr << "COUNTED TEMP NODES: " << time(0) - indiv << endl;
    indiv = time(0);
nodes.resizeNoCopyExact(temp_nodes);
if(verbose) cerr << "RESIZED NODES: " << time(0) - indiv << endl;
indiv = time(0);
// Compute each thread's output offset from the per-slice counts.
temp_nodes = 0;
for(int i = 0; i < nthreads; i++) {
    params[i].curr = nodes.begin() + temp_nodes;
    temp_nodes += sub_temp_nodes[i];
}
if(verbose) cerr << "RESIZED NODES: " << time(0) - indiv << endl;
indiv = time(0);
//make new nodes
// (NOTE: encoding repair — "¶ms" in the source chunk restored to "&params".)
for(int i = 0; i < nthreads; i++) {
    if(nthreads == 1) {
        createNewNodesMaker((void*)&params[0]);
    } else {
        threads[i] = new tthread::thread(&createNewNodesMaker, (void*)&params[i]);
    }
}
if(nthreads > 1) {
    for(int i = 0; i < nthreads; i++) threads[i]->join();
}
if(verbose) cerr << "MADE NEW NODES: " << time(0) - indiv << endl;
indiv = time(0);
}

//------------------------------------------------------------------------------------
// Merges equal-keyed nodes, assigns consecutive ranks, and marks nodes whose
// rank is now unique as sorted. Generation 4 does a full merge over fully
// sorted nodes; later generations only fix up clusters sharing key.first.
template void PathGraph::mergeUpdateRank() {
    if(generation == 4) {
        // Merge equivalent nodes
        index_t curr = 0;
        pair range(0, 0); // Empty range
        while(true) {
            range = nextMaximalSet(range);
            if(range.first >= range.second) break;
            // Keep one representative per maximal mergeable set.
            nodes[curr] = nodes[range.first];
            curr++;
        }
        nodes.resize(curr);
        // Set nodes that become sorted as sorted
        // A node is sorted when no neighbor shares its key.
        PathNode* candidate = &nodes.front();
        pair key = candidate->key;
        ranks = 1;
        for(index_t i = 1; i < nodes.size(); i++) {
            if(nodes[i].key != key) {
                if(candidate != NULL) {
                    candidate->setSorted();
                }
                candidate = &nodes[i];
                key = candidate->key;
                ranks++;
            } else {
                candidate = NULL;
            }
        }
        if(candidate != NULL) {
            candidate->setSorted();
        }
        // Replace keys with dense ranks (equal keys -> equal rank).
        ranks = 0;
        key = nodes.front().key;
        for(index_t i = 0; i < nodes.size(); i++) {
            PathNode& node = nodes[i];
            if(node.key != key) {
                key = node.key;
                ranks++;
            }
            node.key = pair(ranks, 0);
        }
        ranks++;
    } else {
        // Later generations: nodes are already ordered by key.first; walk
        // blocks of equal key.first and re-rank/merge inside each block.
        PathNode* block_start = nodes.begin();
        PathNode* curr = nodes.begin();
        PathNode* node = nodes.begin();
        ranks = 0;
        do {
            node++;
            if(node == nodes.end() || node->key.first != block_start->key.first) {
                if(node - block_start == 1) {
                    // Singleton block keeps its (new) rank as-is.
                    block_start->key.first = ranks++;
                    *curr++ = *block_start;
                } else {
                    // Order the block by the secondary key before scanning runs.
                    sort(block_start, node, PathNodeKeySecondCmp());
while(block_start != node) {
    //extend shift while share same key
    index_t shift = 1;
    while(block_start + shift != node && block_start->key == (block_start + shift)->key) {
        shift++;
    }
    //check if all share .from
    //if they share same from, then they are a mergable set
    bool merge = true;
    for(PathNode* n = block_start; n != (block_start + shift); n++) {
        if(n->from != block_start->from) {
            merge = false;
            break;
        }
    }
    //if not mergable, just write all to array
    if(!merge) {
        for(PathNode* n = block_start; n != (block_start + shift); n++) {
            n->key.first = ranks;
            *curr++ = *n;
        }
        ranks++;
    } else if(curr == nodes.begin() || !(curr - 1)->isSorted() || (curr - 1)->from != block_start->from) {
        // Mergeable set: emit a single sorted representative, unless the
        // previous output node already covers the same from (then drop it).
        block_start->setSorted();
        block_start->key.first = ranks++;
        *curr++ = *block_start;
    }
    block_start += shift;
}
// if we are at the last node or the last node is mergable into the previous node, we are done
if(node == nodes.end()) break;
if(node + 1 == nodes.end()) {
    assert(curr >= nodes.begin() + 1);
    if((curr - 1)->isSorted() && node->from == (curr - 1)->from) break;
}
// check if we can safely merge the node immediately following the unsorted cluster into the previous node
// must be that:
// 1) node is not itself part of an unsorted cluster
// 2) the previous node is sorted
// 3) the nodes share the same from attribute
assert(node + 1 < nodes.end());
if(node->key.first != (node + 1)->key.first) {
    assert(curr >= nodes.begin() + 1);
    if((curr - 1)->isSorted() && node->from == (curr - 1)->from) node++;
}
}
block_start = node;
}
} while(node != nodes.end());
nodes.resizeExact((index_t)(curr - nodes.begin()));
}
// if all nodes have unique rank we are done!
if(ranks == nodes.size()) sorted = true;
}

// Returns the next maximal mergeable set of PathNodes.
// A set of PathNodes sharing adjacent keys is mergeable, if each of the
// PathNodes begins in the same GraphNode, and no other PathNode shares
// the key. If the maximal set is empty, returns the next PathNode.
template pair PathGraph::nextMaximalSet(pair range) { if(range.second >= nodes.size()) { return pair(0, 0); } range.first = range.second; range.second = range.first + 1; if(range.first > 0 && nodes[range.first - 1].key == nodes[range.first].key) { return range; } for(index_t i = range.second; i < nodes.size(); i++) { if(nodes[i - 1].key != nodes[i].key) { range.second = i; } if(nodes[i].from != nodes[range.first].from) { return range; } } range.second = (index_t)nodes.size(); return range; } //----------------------------------------------------------------------------------------- template void PathGraph::printInfo() { if(verbose) { cerr << "Generation " << generation << " (" << temp_nodes << " -> " << nodes.size() << " nodes, " << ranks << " ranks)" << endl; } } //------------------------------------------------------------------------------------------ template void PathGraph::generateEdgesCounter(void* vp) { GenEdgesParams* params = (GenEdgesParams*)vp; typename RefGraph::Edge* st = params->st; typename RefGraph::Edge* en = params->en; EList& label_index = *(params->label_index); EList::Node>& ref_nodes = *(params->ref_nodes); EList& nodes = *(params->nodes); //first count edges, fill out label_index for(typename RefGraph::Edge* edge = st; edge != en; edge++) { char curr_label = ref_nodes[edge->from].label; int curr_label_index; switch(curr_label) { case 'A': curr_label_index = 0; break; case 'C': curr_label_index = 1; break; case 'G': curr_label_index = 2; break; case 'T': curr_label_index = 3; break; case 'Y': curr_label_index = 4; break; case 'Z': curr_label_index = 5; break; default: assert(false); throw 1; } assert_lt(edge->to + 1, nodes.size()); assert_lt(nodes[edge->to].key.second, nodes[edge->to + 1].key.second); label_index[curr_label_index] += nodes[edge->to + 1].key.second - nodes[edge->to].key.second; } } template void PathGraph::generateEdgesMaker(void* vp) { GenEdgesParams* params = (GenEdgesParams*)vp; typename RefGraph::Edge* st = params->st; 
typename RefGraph::Edge* en = params->en; EList& label_index = *(params->label_index); EList::Node>& ref_nodes = *(params->ref_nodes); EList& edges = *(params->edges); EList& nodes = *(params->nodes); for(typename RefGraph::Edge* edge = st; edge != en; edge++) { char curr_label = ref_nodes[edge->from].label; int curr_label_index; switch(curr_label) { case 'A': curr_label_index = 0; break; case 'C': curr_label_index = 1; break; case 'G': curr_label_index = 2; break; case 'T': curr_label_index = 3; break; case 'Y': curr_label_index = 4; break; case 'Z': curr_label_index = 5; break; default: assert(false); throw 1; } for(index_t j = nodes[edge->to].key.second; j < nodes[edge->to + 1].key.second; j++) { edges[label_index[curr_label_index]++] = PathEdge(edge->from, nodes[j].key.first, curr_label); } } } template bool PathGraph::generateEdges(RefGraph& base) { //entering we have: // nodes - sorted by from // edges - empty // base.nodes - almost sorted by from/to // base.edges - almost sorted by from/to //need to join: // nodes.from -> base.nodes[] // nodes.from -> base.edges.to // nodes.from -> edges.from if(!sorted) return false; time_t indiv = time(0); time_t overall = time(0); //replace nodes.to with genomic position //fast because both roughly ordered by from for(PathNode* node = nodes.begin(); node != nodes.end(); node++) { node->to = base.nodes[node->from].value; } if(verbose) cerr << "NODE.TO -> GENOME POS: " << time(0) - indiv << endl; indiv = time(0); // build an index for nodes index_t node_size = nodes.size(); for(index_t i = 0; i < node_size; i++) { // very rare case where the number of prefix-sorted nodes is smaller than the number of the initial nodes // , which could happen with a very small graph and a variant as follows // ATAGAGCAGTTCTGAAAAACACTTTTTGTTGAATCTGCAAG(T)GGACATTTGGATAGATTTGAAGATTTCGTTGGAAACGGGAATATCTTCATATCAAATG // (G) // where G(T) and G(G) will be combined as there is no other node that intervene those two path nodes. 
if(nodes[i].from + 1 >= nodes.size()) { nodes.resize(nodes[i].from + 2); } nodes[nodes[i].from + 1].key.second = i + 1; } if(verbose) cerr << "BUILD FROM_INDEX " << time(0) - indiv << endl; indiv = time(0); // Now join nodes.from to edges.to // fast because base.edges roughly sorted by to //count number of edges AutoArray threads(nthreads); EList params; params.resizeExact(nthreads); ELList label_index; label_index.resize(nthreads); typename RefGraph::Edge* st = base.edges.begin(); typename RefGraph::Edge* en = st + base.edges.size() / nthreads; for(int i = 0; i < nthreads; i++) { label_index[i].resizeExact(6); label_index[i].fillZero(); params[i].label_index = &label_index[i]; params[i].st = st; params[i].en = en; params[i].nodes = &nodes; params[i].edges = &edges; params[i].ref_nodes = &base.nodes; if(nthreads == 1) { generateEdgesCounter((void*)¶ms[0]); } else { threads[i] = new tthread::thread(&generateEdgesCounter, (void*)¶ms[i]); } st = en; if(i + 2 == nthreads) { en = base.edges.end(); } else { en = st + base.edges.size() / nthreads; } } if(nthreads > 1) { for(int i = 0; i < nthreads; i++) threads[i]->join(); } if(verbose) cerr << "COUNTED NEW EDGES: " << time(0) - indiv << endl; indiv = time(0); //update all label indexes index_t tot = label_index[0][0]; label_index[0][0] = 0; for(int i = 1; i < nthreads; i++) { tot += label_index[i][0]; label_index[i][0] = tot - label_index[i][0]; } for(int j = 1; j < 6; j++) { for(int i = 0; i < nthreads; i++) { tot += label_index[i][j]; label_index[i][j] = tot - label_index[i][j]; } } edges.resizeExact(tot); //make new edges for(int i = 0; i < nthreads; i++) { if(nthreads == 1) { generateEdgesMaker((void*)¶ms[0]); } else { threads[i] = new tthread::thread(&generateEdgesMaker, (void*)¶ms[i]); } } if(nthreads > 1) { for(int i = 0; i < nthreads; i++) { threads[i]->join(); } } base.nullify(); // delete unused nodes if(node_size != nodes.size()) { assert_lt(node_size, nodes.size()); nodes.resize(node_size); } if(verbose) cerr 
<< "MADE NEW EDGES: " << time(0) - indiv << endl; indiv = time(0); EList& index = label_index[nthreads - 1]; EList temp_edges; temp_edges.resizeExact(edges.size()); radix_sort_copy, index_t>(edges.begin() , edges.begin() + index[0], temp_edges.ptr(), &PathEdgeTo, (index_t)nodes.size(), nthreads); radix_sort_copy, index_t>(edges.begin() + index[0], edges.begin() + index[1], temp_edges.ptr() + index[0], &PathEdgeTo, (index_t)nodes.size(), nthreads); radix_sort_copy, index_t>(edges.begin() + index[1], edges.begin() + index[2], temp_edges.ptr() + index[1], &PathEdgeTo, (index_t)nodes.size(), nthreads); radix_sort_copy, index_t>(edges.begin() + index[2], edges.begin() + index[3], temp_edges.ptr() + index[2], &PathEdgeTo, (index_t)nodes.size(), nthreads); for(index_t i = index[3]; i < edges.size(); i++) { temp_edges[i] = edges[i]; } sort(temp_edges.begin() + index[3], temp_edges.begin() + index[4]); sort(temp_edges.begin() + index[4], temp_edges.begin() + index[5]); edges.xfer(temp_edges); if(verbose) cerr << "SORTED NEW EDGES: " << time(0) - indiv << endl; indiv = time(0); EList past_nodes; past_nodes.resizeExact(nodes.size()); radix_sort_copy, index_t>(nodes.begin(), nodes.end(), past_nodes.ptr(), &PathNodeKey, ranks, nthreads); nodes.xfer(past_nodes); if(verbose) cerr << "RE-SORTED NODES: " << time(0) - indiv << endl; indiv = time(0); #ifndef NDEBUG if(debug) { cerr << "just after creating path edges" << endl; cerr << "Ref edges" << endl; for(size_t i = 0; i < base.edges.size(); i++) { const typename RefGraph::Edge& edge = base.edges[i]; cerr << "\t" << i << "\t" << edge.from << " --> " << edge.to << endl; } cerr << "Path nodes" << endl; for(size_t i = 0; i < nodes.size(); i++) { const PathNode& node = nodes[i]; cerr << "\t" << i << "\t(" << node.key.first << ", " << node.key.second << ")\t" << node.from << " --> " << node.to << endl; } cerr << "Path edges" << endl; for(size_t i = 0; i < edges.size(); i++) { const PathEdge& edge = edges[i]; cerr << "\t" << i << 
"\tfrom: " << edge.from << "\tranking: " << edge.ranking << "\t" << edge.label << endl; } } #endif #ifndef NDEBUG // Switch char array[x][y]; to char** array; if(debug) { cerr << "after sorting nodes by ranking and edges by label and ranking" << endl; cerr << "Path nodes" << endl; for(size_t i = 0; i < nodes.size(); i++) { const PathNode& node = nodes[i]; cerr << "\t" << i << "\t(" << node.key.first << ", " << node.key.second << ")\t" << node.from << " --> " << node.to << endl; } cerr << "Path edges" << endl; for(size_t i = 0; i < edges.size(); i++) { const PathEdge& edge = edges[i]; cerr << "\t" << i << "\tfrom: " << edge.from << "\tranking: " << edge.ranking << "\t" << edge.label << endl; } } #endif // Sets PathNode.to = GraphNode.value and PathNode.key.first to outdegree // Replaces (from.from, to) with (from, to) { PathNode* node = nodes.begin(); node->key.first = 0; PathEdge* edge = edges.begin(); while(node != nodes.end() && edge != edges.end()) { if(edge->from == node->from) { edge->from = (index_t)(node - nodes.begin()); edge++; node->key.first++; } else { node++; node->key.first = 0; } } } if(verbose) cerr << "PROCESS EDGES: " << time(0) - indiv << endl; indiv = time(0); // Remove 'Y' node assert_gt(nodes.size(), 2); nodes.back().key.first = nodes[nodes.size() - 2].key.first; nodes[nodes.size() - 2] = nodes.back(); nodes.pop_back(); // Adjust edges accordingly for(size_t i = 0; i < edges.size(); i++) { PathEdge& edge = edges[i]; if(edge.label == 'Y') { edge.label = 'Z'; } else if(edge.ranking >= nodes.size()) { assert_eq(edge.ranking, nodes.size()); edge.ranking -= 1; } } if(verbose) cerr << "REMOVE Y: " << time(0) - indiv << endl; indiv = time(0); #ifndef NDEBUG if(debug) { cerr << "Path nodes" << endl; for(size_t i = 0; i < nodes.size(); i++) { const PathNode& node = nodes[i]; cerr << "\t" << i << "\t(" << node.key.first << ", " << node.key.second << ")\t" << node.from << " --> " << node.to << endl; } cerr << "Path edges" << endl; for(size_t i = 0; i < 
edges.size(); i++) { const PathEdge& edge = edges[i]; cerr << "\t" << i << "\tfrom: " << edge.from << "\tranking: " << edge.ranking << "\t" << edge.label << endl; } } #endif temp_edges.resizeExact(edges.size()); radix_sort_copy(edges.begin(), edges.end(), temp_edges.ptr(), &PathEdgeTo, (index_t)nodes.size(), nthreads); edges.xfer(temp_edges); for(index_t i = 0; i < edges.size(); i++) { nodes[edges[i].ranking].key.second = i + 1; } if(verbose) cerr << "SORT, Make index: " << time(0) - indiv << endl; if(verbose) cerr << "TOTAL: " << time(0) - overall << endl; return true; //----------------------------------------------------------------------------------------------------- bwt_string.clear(); F_array.clear(); M_array.clear(); bwt_counts.resizeExact(5); bwt_counts.fillZero(); for(index_t node = 0; node < nodes.size(); node++) { pair edge_range = getEdges(node, false /* from? */); for(index_t i = edge_range.first; i < edge_range.second; i++) { assert_lt(i, edges.size()); char label = edges[i].label; if(label == 'Y') { label = 'Z'; } bwt_string.push_back(label); F_array.push_back(i == edge_range.first ? 1 : 0); if(label != 'Z') { char nt = asc2dna[(int)label]; assert_lt(nt + 1, bwt_counts.size()); bwt_counts[nt + 1]++; } } for(index_t i = 0; i < nodes[node].key.first; i++) { M_array.push_back(i == 0 ? 
1 : 0); } } assert_gt(bwt_string.size(), 0); assert_eq(bwt_string.size(), F_array.size()); assert_eq(bwt_string.size(), M_array.size()); for(size_t i = 0; i < bwt_counts.size(); i++) { if(i > 0) bwt_counts[i] += bwt_counts[i - 1]; } #ifndef NDEBUG if(debug) { cerr << "Path nodes (final)" << endl; for(size_t i = 0; i < nodes.size(); i++) { const PathNode& node = nodes[i]; cerr << "\t" << i << "\t(" << node.key.first << ", " << node.key.second << ")\t" << node.from << " --> " << node.to << endl; } cerr << "Path edges (final)" << endl; for(size_t i = 0; i < edges.size(); i++) { const PathEdge& edge = edges[i]; cerr << "\t" << i << "\tfrom: " << edge.from << "\tranking: " << edge.ranking << "\t" << edge.label << endl; } cerr << "i\tBWT\tF\tM" << endl; for(index_t i = 0; i < bwt_string.size(); i++) { cerr << i << "\t" << bwt_string[i] << "\t" // BWT char << (int)F_array[i] << "\t" // F bit value << (int)M_array[i] << endl; // M bit value } for(size_t i = 0; i < bwt_counts.size(); i++) { cerr << i << "\t" << bwt_counts[i] << endl; } } #endif // Test searches, based on paper_example #if 1 EList queries; EList answers; # if 1 # if 1 queries.push_back("GACGT"); answers.push_back(9); queries.push_back("GATGT"); answers.push_back(9); queries.push_back("GACT"); answers.push_back(9); queries.push_back("ATGT"); answers.push_back(4); queries.push_back("GTAC"); answers.push_back(10); queries.push_back("ACTG"); answers.push_back(3); # else // rs55902548, at 402, ref, alt, unknown alt queries.push_back("GGCAGCTCCCATGGGTACACACTGGGCCCAGAACTGGGATGGAGGATGCA"); // queries.push_back("GGCAGCTCCCATGGGTACACACTGGTCCCAGAACTGGGATGGAGGATGCA"); // queries.push_back("GGCAGCTCCCATGGGTACACACTGGACCCAGAACTGGGATGGAGGATGCA"); // rs5759268, at 926787, ref, alt, unknown alt // queries.push_back("AAATTGCTCAGCCTTGTGCTGTGCACACCTGGTTCTCTTTCCAGTGTTAT"); // queries.push_back("AAATTGCTCAGCCTTGTGCTGTGCATACCTGGTTCTCTTTCCAGTGTTAT"); // queries.push_back("AAATTGCTCAGCCTTGTGCTGTGCAGACCTGGTTCTCTTTCCAGTGTTAT"); # endif 
for(size_t q = 0; q < queries.size(); q++) { const string& query = queries[q]; assert_gt(query.length(), 0); index_t top = 0, bot = edges.size(); index_t node_top = 0, node_bot = 0; cerr << "Aligning " << query << endl; index_t i = 0; for(; i < query.length(); i++) { if(top >= bot) break; int nt = query[query.length() - i - 1]; nt = asc2dna[nt]; assert_lt(nt, 4); cerr << "\t" << i << "\tBWT range: [" << top << ", " << bot << ")" << endl; top = bwt_counts[(int)nt] + (top <= 0 ? 0 : rank(bwt_string, top - 1, "ACGT"[nt])); bot = bwt_counts[(int)nt] + rank(bwt_string, bot - 1, "ACGT"[nt]); cerr << "\t\tLF BWT range: [" << top << ", " << bot << ")" << endl; node_top = rank1(M_array, top) - 1; node_bot = rank1(M_array, bot - 1); cerr << "\t\tnode range: [" << node_top << ", " << node_bot << ")" << endl; top = select1(F_array, node_top + 1); bot = select1(F_array, node_bot + 1); } cerr << "\t" << i << "\tBWT range: [" << top << ", " << bot << ")" << endl; // assert_eq(top, answers[q]); cerr << "finished... "; if(node_top < node_bot && node_top < nodes.size()) { index_t pos = nodes[node_top].to; index_t gpos = pos; const EList& szs = base.szs; for(index_t i = 0; i < szs.size(); i++) { gpos += szs[i].off; if(pos < szs[i].len) break; pos -= szs[i].len; } cerr << "being aligned at " << gpos; } cerr << endl << endl; } # endif // See inconsistencies between F and M arraystimy thread # if 0 cerr << endl << endl; EList tmp_F; for(index_t i = 0; i < F_array.size(); i++) { if(F_array[i] == 1) tmp_F.push_back(i); } EList tmp_M; for(index_t i = 0; i < M_array.size(); i++) { if(M_array[i] == 1) tmp_M.push_back(i); } index_t max_diff = 0; assert_eq(tmp_F.size(), tmp_M.size()); for(index_t i = 0; i < tmp_F.size(); i++) { index_t diff = (tmp_F[i] >= tmp_M[i] ? tmp_F[i] - tmp_M[i] : tmp_M[i] - tmp_F[i]); if(diff > max_diff) { max_diff = diff; cerr << i << "\tdiff: " << max_diff << "\t" << (tmp_F[i] >= tmp_M[i] ? "+" : "-") << endl; } } cerr << "Final: " << tmp_F.back() << " vs. 
" << tmp_M.back() << endl; # endif #endif return true; } //-------------------------------------------------------------------------- template pair PathGraph::getEdges(index_t node, bool by_from) { if(node >= nodes.size()) { cerr << "Error: Trying to get edges " << (by_from ? "from " : "to ") << node << endl; } if(nodes[node].key.second == 0) { return pair(0, 0); } if(node == 0) { return pair(0, nodes[node].key.second); } else { return pair(nodes[node - 1].key.second, nodes[node].key.second); } } #endif /*GBWT_GRAPH_H_*/