#!/usr/bin/env python import sys, os, subprocess import multiprocessing import string, re import platform from datetime import datetime, date, time import copy from argparse import ArgumentParser, FileType from multiprocessing import Process import bisect mp_mode = False mp_num = 1 cigar_re = re.compile('\d+\w') osx_mode = False if sys.platform == 'darwin': osx_mode = True """ """ def parse_mem_usage(resource): if osx_mode: resource = resource.strip().split('\n') for line in resource: if line.find('maximum resident set size') != -1: return int(line.split()[0]) / 1024 else: resource = resource.split(' ') for line in resource: idx = line.find('maxresident') if idx != -1: return line[:idx] return '0' """ """ def reverse_complement(seq): result = "" for nt in seq: base = nt if nt == 'A': base = 'T' elif nt == 'a': base = 't' elif nt == 'C': base = 'G' elif nt == 'c': base = 'g' elif nt == 'G': base = 'C' elif nt == 'g': base = 'c' elif nt == 'T': base = 'A' elif nt == 't': base = 'a' result = base + result return result """ RepeatDB """ class RepeatAllele: def __init__(self): self.repeat_name = '' self.allele_idx = 0 self.repeat_pos = 0 self.repeat_length = 0 self.positions = [] return def __repr__(self): return '[' + ','.join([str(self.repeat_name), str(self.allele_idx), str(self.repeat_pos), str(self.repeat_length), str(len(self.positions))]) + ']' def add_position(self, chr, pos, strand): self.positions.append([chr, pos, strand]) def __lt__(self, other): if self.repeat_pos < other.repeat_pos: return True elif self.repeat_pos == other.repeat_pos: return self.repeat_length < other.repeat_length else: return False class Repeat: def __init__(self): self.repeat_name = '' self.repeat_length = 0 self.repeat_pos = 0 self.allele = [] return def add_allele(self, allele_idx, repeatAllele): #self.allele[allele_idx] = repeatAllele self.allele.append(repeatAllele) def allele_sort(self): self.allele = sorted(self.allele) def cmp_repeatmap(a, b): if a[0] < b[0]: return -1 elif a[0] == b[0]: return 0 else: return 1 def read_len_cigar(cigar_str): cigars = cigar_re.findall(cigar_str) cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars] read_len = 0 for cigar in cigars: cigar_op, length = cigar if cigar_op in "MISH": read_len += int(length) return read_len def read_repeatdb(repeat_filename): repeat_db = {} if os.path.exists(repeat_filename): for line in open(repeat_filename, 'r'): if line[0] == '>': line = line.strip()[1:] name, rptRefName, pos, rep_len, _, _ = line.split()[:6] pos = int(pos) rep_len = int(rep_len) rptName, allele_idx = name.split('*')[0:2] allele_idx = int(allele_idx) repeatAllele = RepeatAllele() repeatAllele.repeat_name = rptName repeatAllele.allele_idx = allele_idx repeatAllele.repeat_pos = pos repeatAllele.repeat_length = rep_len if rptRefName not in repeat_db: # new rptRefName repeat_db[rptRefName] = {} if rptName not in repeat_db[rptRefName]: # new rptName assert allele_idx == 0 repeat_db[rptRefName][rptName] = Repeat() repeat_db[rptRefName][rptName].repeat_name = rptName repeat_db[rptRefName][rptName].add_allele(allele_idx, repeatAllele) else: coords = line.split() for coord in coords: chr, pos, strand = coord.split(':') pos = int(pos) repeat_db[rptRefName][rptName].allele[allele_idx].add_position(chr, pos, strand) else: print >> sys.stderr, 'Cannot open file', repeat_filename print >> sys.stderr, 'Build repeatMap' repeat_map = {} for rptRefName, repeats in repeat_db.items(): #print 'Processing', rptRefName repeat_pos_list = [] for repeatName, repeat in repeats.items(): #print 'Common Allele:', repeatName, repeat.repeat_name repeat_left = sys.maxint repeat_right = 0 #for allele_id, repeatAllele in repeat.allele.items(): for repeatAllele in repeat.allele: left = repeatAllele.repeat_pos right = left + repeatAllele.repeat_length if left < repeat_left: repeat_left = left if right > repeat_right: repeat_right = right repeat.repeat_pos = repeat_left repeat.repeat_length = repeat_right - repeat_left #print repeat.allele #repeat.allele_sort() #print repeat.allele #print repeat_left, repeat_right repeat_pos_list.append((repeat_right, repeatName)) repeat_map[rptRefName] = sorted(repeat_pos_list, cmp=cmp_repeatmap) #print repeat_map[rptRefName] return repeat_db, repeat_map def find_leftmost_pos(rmap, left): pos = bisect.bisect_left(rmap, (left, None)) #print pos if pos == len(rmap): return pos if rmap[pos][0] == left: while pos < len(rmap): if rmap[pos][0] != left: break pos += 1 return pos def repeat_to_genome_pos(repeat_db, repeat_map, rptRefName, pos, cigar_str = ''): assert rptRefName in repeat_db readlen = read_len_cigar(cigar_str) #readlen = 101 # pos in sam-result. pos is 1-based left = pos - 1 right = left + readlen repeats = repeat_db[rptRefName] rmap = repeat_map[rptRefName] #print len(rmap) #print rmap i = find_leftmost_pos(rmap, left) if i >= len(rmap): print >> sys.stderr, 'Value Error' return if right > rmap[i][0]: print >> sys.stderr, 'Not repeat' return repeat = repeats[rmap[i][1]] #print 'Allele Size:', len(repeat.allele) #print repeat.allele for allele in repeat.allele: rpos = allele.repeat_pos rlen = allele.repeat_length if (left >= rpos) and (right <= rpos + rlen): offset = left - rpos for genome_pos in allele.positions: print genome_pos[0], genome_pos[1] + offset + 1, genome_pos[2], genome_pos[1] """ """ def read_genome(genome_filename): chr_dic = {} genome_file = open(genome_filename, "r") chr_name, sequence = "", "" for line in genome_file: if line[0] == ">": if chr_name and sequence: chr_dic[chr_name] = sequence chr_name = line[1:-1].split()[0] sequence = "" else: sequence += line[:-1] if chr_name and sequence: chr_dic[chr_name] = sequence genome_file.close() print >> sys.stderr, "genome is loaded" return chr_dic """ """ def extract_splice_sites(gtf_fname): trans = {} gtf_file = open(gtf_fname) # Parse valid exon lines from the GTF file into a dict by transcript_id for line in gtf_file: line = line.strip() if not line or line.startswith('#'): continue if '#' in line: line = line.split('#')[0].strip() try: chrom, source, feature, left, right, score, \ strand, frame, values = line.split('\t') except ValueError: continue left, right = int(left), int(right) if feature != 'exon' or left >= right: continue values_dict = {} for attr in values.split(';')[:-1]: attr, _, val = attr.strip().partition(' ') values_dict[attr] = val.strip('"') if 'gene_id' not in values_dict or \ 'transcript_id' not in values_dict: continue transcript_id = values_dict['transcript_id'] if transcript_id not in trans: trans[transcript_id] = [chrom, strand, [[left, right]]] else: trans[transcript_id][2].append([left, right]) gtf_file.close() # Sort exons and merge where separating introns are <=5 bps for tran, [chrom, strand, exons] in trans.items(): exons.sort() tmp_exons = [exons[0]] for i in range(1, len(exons)): if exons[i][0] - tmp_exons[-1][1] <= 5: tmp_exons[-1][1] = exons[i][1] else: tmp_exons.append(exons[i]) trans[tran] = [chrom, strand, tmp_exons] # Calculate and print the unique junctions junctions = set() for chrom, strand, exons in trans.values(): for i in range(1, len(exons)): junctions.add(to_junction_str([chrom, exons[i-1][1], exons[i][0]])) return junctions """ """ def read_repeat_info(repeat_filename): repeat_info, repeat_dic = {}, {} repeat_pos = {} if os.path.exists(repeat_filename): for line in open(repeat_filename): if line[0] == ">": line = line.strip()[1:] allele, rep, pos, rep_len, _, _ = line.split()[:6] pos, rep_len = int(pos), int(rep_len) common_allele = allele.split('*')[0] if rep not in repeat_info: repeat_info[rep] = [] repeat_dic[rep] = {} repeat_pos[rep] = {} repeat_info[rep].append([allele, pos, rep_len]) if allele not in repeat_dic[rep]: repeat_dic[rep][allele] = [] repeat_pos[rep][allele] = set() else: coords = line.split() for coord in coords: chr, pos, strand = coord.split(':') pos = int(pos) if pos in repeat_pos[rep][allele]: continue repeat_dic[rep][allele].append([chr, pos, strand]) repeat_pos[rep][allele].add(pos) for rep, repeats in repeat_info.items(): def my_cmp(a, b): if a[1] < b[1]: return -1 elif a[1] == b[1]: return a[2] - b[2] else: return 1 repeat_info[rep] = sorted(repeat_info[rep], cmp=my_cmp) return repeat_info, repeat_dic """ """ def find_repeat(repeat_info, pos): if len(repeat_info) <= 0: return -1 l, r = 0, len(repeat_info) while l < r: m = (l + r) / 2 _, rep_pos, rep_len, _ = repeat_info[m] if rep_pos <= pos and pos < rep_pos + rep_len: return m elif pos < rep_pos: r = m else: l = m + 1 return -1 def reverse_cigar(cigar_str): cigars = cigar_re.findall(cigar_str) cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars] cigars[::-1] read_len = 0 cigar_str = "" for cigar in cigars: cigar_op, length = cigar cigar_str += ("%d%s" % (length, cigar_op)) if cigar_op in "MISH": read_len += int(length) return read_len, cigar_str def to_junction_str(junction): return "%s-%d-%d" % (junction[0], junction[1], junction[2]) def to_junction(junction_str): fields = junction_str.split("-") if len(fields) > 3: chr, left, right = "-".join(fields[:-2]), fields[-2], fields[-1] else: assert len(fields) == 3 chr, left, right = fields return [chr, int(left), int(right)] """ """ def junction_cmp(a, b): if a[0] != b[0]: if a[0] < b[0]: return -1 else: return 1 if a[1] != b[1]: if a[1] < b[1]: return -1 else: return 1 if a[2] != b[2]: if a[2] < b[2]: return -1 else: return 1 return 0 """ # chr and pos are assumed to be integers """ def get_junctions(chr, pos, cigar_str, min_anchor_len = 0, read_len = 100): junctions = [] right_pos = pos cigars = cigar_re.findall(cigar_str) cigars = [[int(cigars[i][:-1]), cigars[i][-1]] for i in range(len(cigars))] left_anchor_lens = [] cur_left_anchor_len = 0 for i in range(len(cigars)): length, cigar_op = cigars[i] if cigar_op in "MI": cur_left_anchor_len += length elif cigar_op == "N": assert cur_left_anchor_len > 0 left_anchor_lens.append(cur_left_anchor_len) cur_left_anchor_len = 0 for i in range(len(cigars)): length, cigar_op = cigars[i] if cigar_op == "N": left, right = right_pos - 1, right_pos + length if i > 0 and cigars[i-1][1] in "ID": if cigars[i-1][1] == "I": left += cigars[i-1][0] else: left -= cigars[i-1][0] if i + 1 < len(cigars) and cigars[i+1][1] in "ID": if cigars[i+1][1] == "I": right -= cigars[i+1][0] else: right += cigars[i+1][0] junction_idx = len(junctions) assert junction_idx < len(left_anchor_lens) left_anchor_len = left_anchor_lens[junction_idx] assert left_anchor_len > 0 and left_anchor_len < read_len right_anchor_len = read_len - left_anchor_len if left_anchor_len >= min_anchor_len and right_anchor_len >= min_anchor_len: junctions.append([chr, left, right]) if cigar_op in "MND": right_pos += length return junctions def get_right(pos, cigars): right_pos = pos cigars = cigar_re.findall(cigars) for cigar in cigars: length = int(cigar[:-1]) cigar_op = cigar[-1] if cigar_op in "MDN": right_pos += length return right_pos def get_cigar_chars(cigars): cigars = cigar_re.findall(cigars) cigar_chars = "" for cigar in cigars: cigar_op = cigar[-1] cigar_chars += cigar_op return cigar_chars """ """ def get_cigar_chars_MN(cigars): cigars = cigar_re.findall(cigars) cigar_chars = "" for cigar in cigars: cigar_op = cigar[-1] if cigar_op in "MN": if cigar_chars == "" or cigar_chars[-1] != cigar_op: cigar_chars += cigar_op return cigar_chars """ """ def is_small_anchor_junction_read(cigars): cigar_list = [] for cigar in cigar_re.findall(cigars): cigar_op = cigar[-1] cigar_len = int(cigar[:-1]) cigar_list.append([cigar_op, cigar_len]) if len(cigar_list) < 3: return False if cigar_list[0][0] != 'M' or cigar_list[-1][0] != 'M': return False if cigar_list[0][1] > 10 and cigar_list[-1][1] > 10: return False if cigar_list[1][0] != 'N' or cigar_list[-2][0] != 'N': return False return True """ """ def is_canonical_junction(chr_dic, junction): chr, left, right = junction donor = chr_dic[chr][left:left+2] acceptor = chr_dic[chr][right-3:right-1] rev_donor = reverse_complement(acceptor) rev_acceptor = reverse_complement(donor) if (donor == "GT" and acceptor == "AG") or \ (rev_donor == "GT" and rev_acceptor == "AG"): return True return False """ """ def is_small_exon_junction_read(cigars, min_exon_len = 23): cigars = cigar_re.findall(cigars) for i in range(1, len(cigars) - 1): cigar = cigars[i] cigar_op = cigar[-1] cigar_len = int(cigar[:-1]) prev_op = cigars[i-1][-1] next_op = cigars[i+1][-1] if prev_op == 'N' and cigar_op == 'M' and next_op == 'N': if cigar_len <= min_exon_len: return True return False """ """ """ def repeat_to_genome_alignment(repeat_info, repeat_dic, rep, pos, cigar_str = ""): assert rep in repeat_info left = pos - 1 # convert 1-based offset to zero-based repeats = repeat_info[rep] l, r = 0, len(repeats) while l < r: m = (l + r) / 2 rep_allele, rpos, rlen = repeats[m] if left >= rpos and left < rpos + rlen: while m > 0: rep_allele, rpos, rlen = repeats[m-1] if left < rpos: break m -= 1 break if left < rpos: r = m else: l = m + 1 alignments = [] while m < len(repeats): rep_allele, rpos, rlen = repeats[m] if left >= rpos + rlen: m += 1 continue if left < rpos: break assert rep_allele in repeat_dic[rep] coords = repeat_dic[rep][rep_allele] assert len(coords) > 0 for coord in coords: cchr, cpos, cstrand = coord adj_left = left - rpos if cstrand == '+': rep_left = cpos + adj_left rep_cigar_str = cigar_str else: if cigar_str: read_len, rep_cigar_str = reverse_cigar(cigar_str) else: read_len, rep_cigar_str = 0, "" rc_adj_left = rlen - adj_left - read_len; rep_left = cpos + rc_adj_left alignments.append([cchr, rep_left + 1, rep_cigar_str]) m += 1 return alignments """ def repeat_to_genome_alignment(repeat_db, repeat_map, rptRefName, pos, cigar_str = ''): assert rptRefName in repeat_db readlen = read_len_cigar(cigar_str) #readlen = 101 # pos in sam-result. pos is 1-based left = pos - 1 right = left + readlen repeats = repeat_db[rptRefName] rmap = repeat_map[rptRefName] #print len(rmap) #print rmap alignments = [] i = find_leftmost_pos(rmap, left) if i >= len(rmap): print >> sys.stderr, 'Value Error' return alignments if right > rmap[i][0]: print >> sys.stderr, 'Not repeat' return alignments repeat = repeats[rmap[i][1]] #print 'Allele Size:', len(repeat.allele) #print repeat.allele for allele in repeat.allele: rpos = allele.repeat_pos rlen = allele.repeat_length if (left >= rpos) and (right <= rpos + rlen): offset = left - rpos for coord in allele.positions: cchr, cpos, cstrand = coord if cstrand == '+': rep_left = cpos + offset rep_cigar_str = cigar_str else: if cigar_str: rep_read_len, rep_cigar_str = reverse_cigar(cigar_str) else: rep_read_len, rep_cigar_str = 0, "" rc_offset = rlen - offset - rep_read_len rep_left = cpos + rc_offset alignments.append([cchr, rep_left + 1, rep_cigar_str]) #print genome_pos[0], genome_pos[1] + offset + 1, genome_pos[2], genome_pos[1] return alignments """ """ def extract_single(infilename, outfilename, chr_dic, aligner, version, repeat_db, repeat_map, debug_dic, hash_idx): infile = open(infilename) if hash_idx == -1: outfile = open(outfilename, "w") else: outfile = open(outfilename + "." + str(hash_idx), "w") prev_read_id = "" num_reads, num_aligned_reads, num_ualigned_reads = 0, 0, 0 prev_NM, prev_NH, NH_real = 0, 0, 0 for line in infile: if line[0] == '@': continue cols = line[:-1].split() read_id, flag, chr, pos, mapQ, cigar_str = cols[:6] read_seq = cols[9] if len(read_id) >= 3 and read_id[-2] == "/": read_id = read_id[:-2] if read_id.find("seq.") == 0: read_id = read_id[4:] if aligner == "gsnap": chr = chr.replace("_", ":") if hash_idx != -1: hashval = hash(read_id) if hashval % mp_num != hash_idx: continue if read_id != prev_read_id: num_reads += 1 flag, pos, mapQ = int(flag), int(pos), int(mapQ) if flag & 0x4 != 0 or \ 'H' in cigar_str: prev_read_id = read_id continue NH, NM, XA = "", sys.maxint, [] for i in range(11, len(cols)): col = cols[i] # "nM" from STAR if col.startswith("NM") or col.startswith("nM"): NM = int(col[5:]) elif col.startswith("NH"): NH = col elif col.startswith("XA"): XA = col[5:].split(';')[:-1] if NH != "": NH = int(NH[5:]) if aligner == "hisat2": if prev_read_id == read_id: assert prev_NH == NH if NH == 1 or mapQ == 60: assert NH == 1 and mapQ == 60 if read_id != prev_read_id: num_aligned_reads += 1 if aligner == "hisat2" and \ NH == 1: num_ualigned_reads += 1 else: # In case of Bowtie2, only consier the best alignments if aligner in ["bowtie2", "bwa", "vg"]: if NM > prev_NM: continue def adjust_alignment(chr, pos, cigar_str): NM_real = 0 read_pos, right_pos = 0, pos - 1 cigars = cigar_re.findall(cigar_str) cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars] for i in range(len(cigars)): cigar_op, length = cigars[i] if cigar_op == "S": assert i == 0 or i == len(cigars) - 1 if i == 0: assert cigars[i+1][0] == "M" ref_seq = chr_dic[chr][right_pos-length:right_pos] else: assert cigars[i-1][0] == "M" ref_seq = chr_dic[chr][right_pos:right_pos+length] ref_seq = ref_seq.upper() if length == len(ref_seq): for j in range(length): if ref_seq[j] != "N" and read_seq[read_pos+j] != ref_seq[j]: NM_real += 1 else: NM_real += length if cigar_op in "MND": right_pos += length if cigar_op in "MIS": read_pos += length if cigars[0][0] == "S": assert cigars[1][0] == "M" pos -= cigars[0][1] cigars[1][1] += cigars[0][1] cigars = cigars[1:] if cigars[-1][0] == "S": assert cigars[-2][0] == "M" cigars[-2][1] += cigars[-1][1] cigars = cigars[:-1] cigar_str = "" for cigar in cigars: cigar_op, length = cigar cigar_str += ("%d%s" % (length, cigar_op)) return pos, cigar_str, NM_real alignments = [[chr, pos, cigar_str]] if aligner == "bwa" and len(XA) > 0: for alignment in XA: alt_chr, alt_pos, alt_cigar_str, alt_NM = alignment.split(',') alt_pos, alt_NM = abs(int(alt_pos)), int(alt_NM) if alt_NM > NM: break alignments.append([alt_chr, alt_pos, alt_cigar_str]) # Convert repeat alignments to genome alignments if aligner == "hisat2" and chr.startswith("rep") and len(repeat_map) > 0: alignments = repeat_to_genome_alignment(repeat_db, repeat_map, chr, pos, cigar_str) for i, alignment in enumerate(alignments): chr, pos, cigar_str = alignment pos, cigar_str, NM_real = adjust_alignment(chr, pos, cigar_str) p_str = "%s\t%s\t%d\t%s\tNM:i:%d" % (read_id, chr, pos, cigar_str, NM_real) print >> outfile, p_str if aligner == "hisat2": if prev_read_id != read_id: if prev_read_id != "": assert prev_NH == NH_real NH_real = 1 else: NH_real += 1 prev_NH = NH prev_NM = NM prev_read_id = read_id if aligner == "hisat2": if prev_read_id != "": assert prev_NH == NH_real outfile.close() infile.close() # Sanity check for HISAT2's alignment summary if aligner == "hisat2" and os.path.exists(infilename + ".summary") and (not mp_mode): hisat2_reads, hisat2_0aligned_reads, hisat2_ualigned_reads, hisat2_maligned_reads = 0, 0, 0, 0 for line in open(infilename + ".summary"): line = line.strip() if line.startswith("HISAT2 summary") or \ line.startswith("Overall"): continue category, num = line.split(':') num = num.strip() num = int(num.split(' ')[0]) if category.startswith("Total reads"): hisat2_reads = num elif category.startswith("Aligned 0 time"): hisat2_0aligned_reads = num elif category.startswith("Aligned 1 time"): hisat2_ualigned_reads = num else: assert category.startswith("Aligned >1 time") assert hisat2_reads == hisat2_0aligned_reads + hisat2_ualigned_reads + num hisat2_aligned_reads = hisat2_reads - hisat2_0aligned_reads assert hisat2_reads == num_reads assert hisat2_aligned_reads == num_aligned_reads assert hisat2_ualigned_reads == num_ualigned_reads """ """ def extract_pair(infilename, outfilename, chr_dic, RNA, aligner, version, repeat_db, repeat_map, debug_dic, hash_idx): read_dic = {} pair_reported = set() infile = open(infilename) if hash_idx == -1: outfile = open(outfilename, "w") else: outfile = open(outfilename + "." + str(hash_idx), "w") num_pairs, num_conc_aligned_pairs, num_conc_ualigned_pairs, num_disc_aligned_pairs = 0, 0, 0, 0 num_aligned_reads, num_ualigned_reads = 0, 0 prev_read_id, pair_list = "", set() prev_NM = sys.maxint prev_NH1, prev_NH2 = 0, 0 NH1_real, NH2_real = 0, 0 for line in infile: if line[0] == '@': continue cols = line[:-1].split() read_id, flag, chr1, pos1, mapQ, cigar1_str, chr2, pos2 = cols[:8] read_seq = cols[9] if len(read_id) >= 3 and read_id[-2] == "/": read_id = read_id[:-2] if read_id.find("seq.") == 0: read_id = read_id[4:] if hash_idx != -1: hashval = hash(read_id) if hashval % mp_num != hash_idx: continue if aligner == "gsnap": chr1 = chr1.replace("_", ":") chr2 = chr2.replace("_", ":") if read_id != prev_read_id: num_pairs += 1 pair_list = set() prev_NM = sys.maxint flag = int(flag) canonical_pos1, canonical_pos2 = int(pos1), int(pos2) left_read = (flag & 0x40 != 0) pos1 = canonical_pos1 mapQ = int(mapQ) if flag & 0x4 != 0 or \ 'H' in cigar1_str: prev_read_id, is_prev_read_left = read_id, left_read continue concordant = (flag & 0x2 != 0) NH, NM1, YT, XA = sys.maxint, sys.maxint, "", [] for i in range(11, len(cols)): col = cols[i] # "nM" from STAR if col.startswith("NM") or col.startswith("nM"): NM1 = int(col[5:]) elif col.startswith("NH"): NH = int(col[5:]) elif col.startswith("YT"): YT = col[5:] elif col.startswith("XA"): XA = col[5:].split(';')[:-1] if aligner == "hisat2": if prev_read_id == read_id: if left_read: assert prev_NH1 == 0 or prev_NH1 == NH else: assert prev_NH2 == 0 or prev_NH2 == NH if NH == 1 or mapQ == 60: assert NH == 1 and mapQ == 60 unpaired = (flag & 0x8 != 0) or (YT in ["UU", "UP"]) if unpaired: if left_read not in pair_list: pair_list.add(left_read) num_aligned_reads += 1 if aligner == "hisat2" and NH == 1: num_ualigned_reads += 1 assert mapQ == 60 else: if read_id != prev_read_id: if concordant: num_conc_aligned_pairs += 1 if aligner == "hisat2" and NH == 1: num_conc_ualigned_pairs += 1 else: if aligner == "hisat2": assert YT == "DP" num_disc_aligned_pairs += 1 if chr2 == '*': continue def adjust_alignment(chr, pos, cigar_str): NM_real = 0 read_pos, right_pos = 0, pos - 1 cigars = cigar_re.findall(cigar_str) cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars] for i in range(len(cigars)): cigar_op, length = cigars[i] if cigar_op == "S": assert i == 0 or i == len(cigars) - 1 if i == 0: assert cigars[i+1][0] == "M" ref_seq = chr_dic[chr1][right_pos-length:right_pos] else: assert cigars[i-1][0] == "M" ref_seq = chr_dic[chr1][right_pos:right_pos+length] ref_seq = ref_seq.upper() if length == len(ref_seq): for j in range(length): if ref_seq[j] != "N" and read_seq[read_pos+j] != ref_seq[j]: NM_real += 1 else: NM_real += length if cigar_op in "MND": right_pos += length if cigar_op in "MIS": read_pos += length if cigars[0][0] == "S": assert cigars[1][0] == "M" pos -= cigars[0][1] cigars[1][1] += cigars[0][1] cigars = cigars[1:] if cigars[-1][0] == "S": assert cigars[-2][0] == "M" cigars[-2][1] += cigars[-1][1] cigars = cigars[:-1] cigar_str = "" for cigar in cigars: cigar_op, length = cigar cigar_str += ("%d%s" % (length, cigar_op)) return pos, cigar_str, NM_real alignments = [[chr1, pos1, cigar1_str]] if aligner == "bwa" and len(XA) > 0: for alignment in XA: alt_chr, alt_pos, alt_cigar_str, alt_NM = alignment.split(',') alt_pos, alt_NM = abs(int(alt_pos)), int(alt_NM) if alt_NM > NM1: break alignments.append([alt_chr, alt_pos, alt_cigar_str]) # Convert repeat alignments to genome alignments if aligner == "hisat2" and (chr1.startswith("rep") or chr2.startswith("rep")) and len(repeat_map) > 0: if chr1.startswith("rep"): alignments = repeat_to_genome_alignment(repeat_db, repeat_map, chr1, pos1, cigar1_str) if chr2.startswith("rep") or (chr1.startswith("rep") and chr2 == "="): chr_tmp = chr1 if chr2 == "=" else chr2 alignments2 = repeat_to_genome_alignment(repeat_db, repeat_map, chr_tmp, int(pos2)) else: alignments2 = [[chr2, int(pos2)]] selected_alignments = [] for alignment in alignments: _chr1, _pos1 = alignment[:2] add = False for alignment2 in alignments2: _chr2, _pos2 = alignment2[:2] if _chr1 == _chr2 and abs(_pos1 - _pos2) <= 1000: add = True break if add: selected_alignments.append(alignment) alignments = selected_alignments for i, alignment in enumerate(alignments): chr1, pos1, cigar1_str = alignment pos1, cigar_str, NM_real = adjust_alignment(chr1, pos1, cigar1_str) chr2 = chr1 if aligner == "bwa": me = "%s\t%s" % (read_id, chr1) partner = "%s\t%s" % (read_id, chr2) else: me = "%s\t%s\t%d" % (read_id, chr1, canonical_pos1) partner = "%s\t%s\t%d" % (read_id, chr2, canonical_pos2) if partner in read_dic: maps = read_dic[partner] for map in maps: if (map[0] == me) or len(maps) == 1: cigar2_str, NM2, pos2 = map[1:4] if aligner == "bwa": if abs(pos1 - pos2) >= 1000: continue if aligner in ["bowtie2", "bwa"]: if NM1 + NM2 > prev_NM: continue else: prev_NM = NM1 + NM2 if chr1 != chr2: continue # DK - debugging purposes if RNA: if aligner in ["bowtie2", "bwa"] and abs(pos1 - pos2) > 1000: continue else: if abs(pos1 - pos2) > 1000: continue if int(pos2) > int(pos1): p_str = "%s\t%s\t%d\t%s\t%s\t%d\t%s\tNM:i:%d\tNM:i:%d" % \ (read_id, chr1, pos1, cigar_str, chr2, pos2, cigar2_str, NM1, NM2) else: p_str = "%s\t%s\t%d\t%s\t%s\t%d\t%s\tNM:i:%d\tNM:i:%d" % \ (read_id, chr2, pos2, cigar2_str, chr1, pos1, cigar_str, NM2, NM1) if p_str not in pair_reported: pair_reported.add(p_str) print >> outfile, p_str if not me in read_dic: read_dic[me] = [] read_dic[me].append([partner, cigar_str, NM1, pos1]) if aligner == "hisat2": if prev_read_id != read_id: if prev_read_id != "": assert prev_NH1 == NH1_real assert prev_NH2 == NH2_real prev_NH1, prev_NH2 = 0, 0 if left_read: NH1_real, NH2_real = 1, 0 else: NH1_real, NH2_real = 0, 1 else: if left_read: NH1_real += 1 else: NH2_real += 1 if left_read: prev_NH1 = NH else: prev_NH2 = NH prev_read_id = read_id if aligner == "hisat2": if prev_read_id != "": assert prev_NH1 == NH1_real assert prev_NH2 == NH2_real outfile.close() infile.close() # Sanity check for HISAT2's alignment summary if aligner == "hisat2" and os.path.exists(infilename + ".summary") and (not mp_mode): hisat2_pairs, hisat2_0aligned_pairs, hisat2_conc_ualigned_pairs, hisat2_conc_maligned_pairs, hisat2_disc_aligned_pairs = 0, 0, 0, 0, 0 hisat2_reads, hisat2_0aligned_reads, hisat2_ualigned_reads, hisat2_maligned_reads = 0, 0, 0, 0 for line in open(infilename + ".summary"): line = line.strip() if line.startswith("HISAT2 summary") or \ line.startswith("Overall"): continue category, num = line.split(':') num = num.strip() num = int(num.split(' ')[0]) if category.startswith("Total pairs"): hisat2_pairs = num elif category.startswith("Aligned concordantly or discordantly 0 time"): hisat2_0aligned_pairs = num elif category.startswith("Aligned concordantly 1 time"): hisat2_conc_ualigned_pairs = num elif category.startswith("Aligned concordantly >1 time"): hisat2_conc_maligned_pairs = num elif category.startswith("Aligned discordantly"): hisat2_disc_aligned_pairs = num assert hisat2_pairs == hisat2_0aligned_pairs + hisat2_conc_ualigned_pairs + hisat2_conc_maligned_pairs + hisat2_disc_aligned_pairs elif category.startswith("Total unpaired reads"): hisat2_reads = num assert hisat2_reads == hisat2_0aligned_pairs * 2 elif category.startswith("Aligned 0 time"): hisat2_0aligned_reads = num elif category.startswith("Aligned 1 time"): hisat2_ualigned_reads = num else: assert category.startswith("Aligned >1 times") hisat2_maligned_reads = num assert hisat2_reads == hisat2_0aligned_reads + hisat2_ualigned_reads + hisat2_maligned_reads assert hisat2_pairs == num_pairs assert hisat2_conc_ualigned_pairs == num_conc_ualigned_pairs assert hisat2_conc_maligned_pairs == num_conc_aligned_pairs - num_conc_ualigned_pairs assert hisat2_disc_aligned_pairs == num_disc_aligned_pairs assert hisat2_ualigned_reads == num_ualigned_reads assert hisat2_maligned_reads == num_aligned_reads - num_ualigned_reads """ """ def is_junction_read(junctions_dic, chr, pos, cigar_str): result_junctions = [] junctions = get_junctions(chr, pos, cigar_str) for junction in junctions: junction_str = to_junction_str(junction) result_junctions.append([junction_str, junction_str in junctions_dic]) return result_junctions """ """ def is_junction_pair(junctions_dic, chr, pos, cigar_str, mate_chr, mate_pos, mate_cigar_str): junctions = is_junction_read(junctions_dic, chr, pos, cigar_str) mate_junctions = is_junction_read(junctions_dic, mate_chr, mate_pos, mate_cigar_str) junctions += mate_junctions return junctions """ """ def find_in_gtf_junctions(chr_dic, gtf_junctions, junction, relax_dist = 5): def find_in_gtf_junctions(gtf_junctions, junction): l, u = 0, len(gtf_junctions) while l < u: m = (l + u) / 2 assert m >= 0 and m < len(gtf_junctions) cmp_result = junction_cmp(junction, gtf_junctions[m]) if cmp_result == 0: return m elif cmp_result < 0: u = m else: l = m + 1 return l chr, left, right = junction gtf_index = find_in_gtf_junctions(gtf_junctions, [chr, left - relax_dist, right - relax_dist]) if gtf_index >= 0: i = gtf_index while i < len(gtf_junctions): chr2, left2, right2 = gtf_junctions[i] if chr2 > chr or \ left2 - left > relax_dist or \ right2 - right > relax_dist: break if abs(left - left2) <= relax_dist and left - left2 == right - right2: test_small = ":" in chr if is_canonical_junction(chr_dic, gtf_junctions[i]): if left == left2: return i else: return -1 else: return i i += 1 return -1 def singleInMaps(chr, pos, pos2, cigar, NM, maps): for i in range(len(maps)): if chr != maps[i][0]: continue if (pos == maps[i][1] or pos2 == maps[i][2]): return True, i return False, -1 """ """ def compare_single_sam(RNA, reference_sam, query_sam, mapped_fname, chr_dic, gtf_junctions, gtf_junctions_set, ex_gtf_junctions, aligner): aligned, multi_aligned = 0, 0 db_dic, db_junction_dic = {}, {} mapped_file = open(mapped_fname, "w") first_mapped_file = open(mapped_fname + ".first", "w") file = open(reference_sam, "r") junction_read_dic = {} for line in file: if line.startswith('@'): continue read_name, chr, pos, cigar, NM = line[:-1].split() pos, NM = int(pos), int(NM[5:]) if read_name.find("seq.") == 0: read_name = read_name[4:] if len(read_name) > 2 and read_name[-2] == '/': read_name = read_name[:-2] multi_aligned += 1 if read_name not in db_dic: db_dic[read_name] = [] aligned += 1 pos2 = get_right(pos, cigar) db_dic[read_name].append([chr, pos, pos2, cigar, NM]) read_junctions = is_junction_read(gtf_junctions_set, chr, pos, cigar) if len(read_junctions) > 0: if read_name not in db_junction_dic: db_junction_dic[read_name] = [] for junction_str, is_gtf_junction in read_junctions: db_junction_dic[read_name].append([junction_str, is_gtf_junction]) if junction_str not in junction_read_dic: junction_read_dic[junction_str] = [] junction_read_dic[junction_str].append(line[:-1]) file.close() temp_junctions, temp_gtf_junctions = set(), set() for read_name, can_junctions in db_junction_dic.items(): if len(can_junctions) <= 0: continue # DK - for debugging purposes # 1. select the best candidate among spliced alignments if multiple def pickup_junction(can_junctions): junctions = [can_junctions[0]] for i in range(1, len(can_junctions)): def get_intron_len(can_junction): chr, left, right = to_junction(can_junction) return right - left - 1 intron, intron_cmp = get_intron_len(junctions[0][0]), get_intron_len(can_junctions[i][0]) if intron > intron_cmp: junctions = [can_junctions[i]] elif intron == intron_cmp: junctions.append(can_junctions[i]) return junctions # can_junctions = pickup_junction(can_junctions) for can_junction in can_junctions: found_junction_str = None junction_str, is_gtf_junction = can_junction if is_gtf_junction: found_junction_str = junction_str if not found_junction_str: junction = to_junction(junction_str) gtf_index = find_in_gtf_junctions(chr_dic, gtf_junctions, junction) if gtf_index >= 0: is_gtf_junction = True found_junction_str = to_junction_str(gtf_junctions[gtf_index]) if found_junction_str: temp_gtf_junctions.add(found_junction_str) temp_junctions.add(found_junction_str) else: if junction_str not in temp_junctions: None # assert junction_str in junction_read_dic # DK - for debugging purposes """ if len(junction_read_dic[junction_str]) <= 2: canonical = is_canonical_junction(chr_dic, to_junction(junction_str)) if not canonical: print >> sys.stdout, read_name, junction_str, len(junction_read_dic[junction_str]), can_junctions for line in junction_read_dic[junction_str]: print >> sys.stdout, "\t", line """ temp_junctions.add(junction_str) temp2_junctions = [] for junction in temp_junctions: temp2_junctions.append(to_junction(junction)) temp_junctions = sorted(list(temp2_junctions), cmp=junction_cmp) temp2_junctions = [] for can_junction in temp_junctions: if len(temp2_junctions) <= 0: temp2_junctions.append(can_junction) else: chr, left, right = temp2_junctions[-1] chr2, left2, right2 = can_junction if chr == chr2 and \ abs(left - left2) == abs(right - right2) and \ abs(left - left2) <= 10 and \ not to_junction_str(can_junction) in temp_gtf_junctions: continue temp2_junctions.append(can_junction) temp_junctions = set() for junction in temp2_junctions: temp_junctions.add(to_junction_str(junction)) file = open(query_sam) mapped, unmapped, unique_mapped, first_mapped, mapping_point = 0, 0, 0, 0, 0.0 snp_mapped, snp_unmapped, snp_unique_mapped, snp_first_mapped = 0, 0, 0, 0 for line in file: if line.startswith('@'): continue fields = line[:-1].split() read_name, chr, pos, cigar = fields[:4] trans_id, NM, Zs = None, None, None for field in fields[4:]: if field.startswith("TI"): trans_id = field[5:] elif field.startswith("NM"): NM = int(field[5:]) elif field.startswith("Zs"): Zs = field[5:] snp_included = (Zs != None) pos = int(pos) pos2 = get_right(pos, cigar) if read_name not in db_dic: unmapped += 1 if snp_included: snp_unmapped += 1 continue maps = db_dic[read_name] found = False found_at_first = False if (aligner == "hisat-TLA" and RNA): found, index = singleInMaps(chr, pos, pos2, cigar, NM, maps) if found: if index == 0: found_at_first = True else: if [chr, pos, pos2, cigar, NM] in maps: found = True if maps.index([chr, pos, pos2, cigar, NM]) == 0: found_at_first = True # DK - debugging purposes if False and len(maps) > 0 and maps[0][-1] < NM: found = True if not found: for idx, map in enumerate(maps): if chr == map[0] and \ pos == map[1] and \ pos2 == map[2] and \ get_cigar_chars(cigar) == get_cigar_chars(map[3]): read_junctions = is_junction_read(gtf_junctions_set, map[0], map[1], map[3]) if True: found_list = [False for i in range(len(read_junctions))] for j in range(len(read_junctions)): junction_str, is_gtf_junction = read_junctions[j] junction = to_junction(junction_str) gtf_index = find_in_gtf_junctions(chr_dic, gtf_junctions, junction) if gtf_index >= 0: found_list[j] = True found = not (False in found_list) else: found = False if found: if idx == 0: found_at_first = True break if found: print >> mapped_file, read_name mapped += 1 if snp_included: snp_mapped += 1 if len(maps) == 1: unique_mapped += 1 if snp_included: snp_unique_mapped += 1 if found_at_first: print >> first_mapped_file, read_name first_mapped += 1 if snp_included: snp_first_mapped += 1 mapping_point += (1.0 / len(maps)) else: unmapped += 1 if snp_included: snp_unmapped += 1 file.close() mapped_file.close() first_mapped_file.close() # DK - for debugging purposes false_can_junctions, false_noncan_junctions = 0, 0 for junction_str in temp_junctions: if junction_str in temp_gtf_junctions: continue if junction_str in ex_gtf_junctions: continue if is_canonical_junction(chr_dic, to_junction(junction_str)): false_can_junctions += 1 else: false_noncan_junctions += 1 print >> sys.stderr, "\t\t\tfalse junctions: %d (canonical), %d (non-canonical)" % (false_can_junctions, false_noncan_junctions) return mapped, unique_mapped, first_mapped, unmapped, aligned, multi_aligned, \ snp_mapped, snp_unique_mapped, snp_first_mapped, snp_unmapped, \ len(temp_junctions), len(temp_gtf_junctions), mapping_point def pairedInMaps(chr, pos, pos_right, cigar, pos2, pos2_right, cigar2, NM, NM2, maps): for i in range(len(maps)): if chr != maps[i][0]: continue; if (pos == maps[i][1] or pos_right == maps[i][2]) and (pos2 == maps[i][4] or pos2_right == maps[i][5]): return True, i return False, -1 """ """ def compare_paired_sam(RNA, reference_sam, query_sam, mapped_fname, chr_dic, gtf_junctions, gtf_junctions_set, ex_gtf_junctions, aligner): aligned, multi_aligned = 0, 0 db_dic, db_junction_dic, junction_pair_dic = {}, {}, {} mapped_file = open(mapped_fname, "w") uniq_mapped_file = open(mapped_fname + '.uniq', "w") first_mapped_file = open(mapped_fname + '.first', "w") file = open(reference_sam, "r") for line in file: if line[0] == '@': continue read_name, chr, pos, cigar, chr2, pos2, cigar2, NM, NM2 = line[:-1].split() pos, pos2 = int(pos), int(pos2) NM, NM2 = int(NM[5:]), int(NM2[5:]) if read_name.find("seq.") == 0: read_name = read_name[4:] if len(read_name) > 2 and read_name[-2] == '/': read_name = read_name[:-2] multi_aligned += 1 if read_name not in db_dic: db_dic[read_name] = [] aligned += 1 pos_right, pos2_right = get_right(pos, cigar), get_right(pos2, cigar2) db_dic[read_name].append([chr, pos, pos_right, cigar, pos2, pos2_right, cigar2, NM, NM2]) pair_junctions = is_junction_pair(gtf_junctions_set, chr, pos, cigar, chr2, pos2, cigar2) if len(pair_junctions) > 0: if read_name not in db_junction_dic: db_junction_dic[read_name] = [] for junction_str, is_gtf_junction in pair_junctions: db_junction_dic[read_name].append([junction_str, is_gtf_junction]) # DK - for debugging purposes if junction_str not in junction_pair_dic: junction_pair_dic[junction_str] = [] junction_pair_dic[junction_str].append(line[:-1]) file.close() temp_junctions, temp_gtf_junctions = set(), set() for read_name, can_junctions in db_junction_dic.items(): if len(can_junctions) <= 0: continue # DK - for debugging purposes # 1. select the best candidate among spliced alignments if multiple def pickup_junction(can_junctions): junctions = [can_junctions[0]] for i in range(1, len(can_junctions)): def get_intron_len(can_junction): chr, left, right = to_junction(can_junction) return right - left - 1 intron, intron_cmp = get_intron_len(junctions[0][0]), get_intron_len(can_junctions[i][0]) if intron > intron_cmp: junctions = [can_junctions[i]] elif intron == intron_cmp: junctions.append(can_junctions[i]) return junctions # can_junctions = pickup_junction(can_junctions) for can_junction in can_junctions: found_junction_str = None junction_str, is_gtf_junction = can_junction # DK - for debugging purposes assert junction_str in junction_pair_dic if len(junction_pair_dic[junction_str]) <= 5: continue if is_gtf_junction: found_junction_str = junction_str if not found_junction_str: junction = to_junction(junction_str) gtf_index = find_in_gtf_junctions(chr_dic, gtf_junctions, junction) if gtf_index >= 0: is_gtf_junction = True found_junction_str = to_junction_str(gtf_junctions[gtf_index]) if found_junction_str: temp_gtf_junctions.add(found_junction_str) temp_junctions.add(found_junction_str) else: if junction_str not in temp_junctions: None # assert junction_str in junction_read_dic # print >> sys.stdout, read_name, junction_str, len(junction_read_dic[junction_str]) # for line in junction_read_dic[junction_str]: # print >> sys.stdout, "\t", line temp_junctions.add(junction_str) # DK - for debugging purposes filter_junction_db = {} temp2_junctions = [] for junction in temp_junctions: temp2_junctions.append(to_junction(junction)) temp_junctions = sorted(list(temp2_junctions), cmp=junction_cmp) temp2_junctions = [] for can_junction in temp_junctions: if len(temp2_junctions) <= 0: temp2_junctions.append(can_junction) # DK - for debugging purposes # assert to_junction_str(can_junction) in junction_pair_dic # filter_junction_db[to_junction_str(can_junction)] = len(junction_pair_dic[to_junction_str(can_junction)]) else: chr, left, right = temp2_junctions[-1] chr2, left2, right2 = can_junction if chr == chr2 and \ abs(left - left2) == abs(right - right2) and \ abs(left - left2) <= 10 and \ not to_junction_str(can_junction) in temp_gtf_junctions: # DK - for debugging purposes # assert to_junction_str(temp2_junctions[-1]) in junction_pair_dic # assert to_junction_str(temp2_junctions[-1]) in filter_junction_dic # filter_junction_db[to_junction_str(temp2_junctions[-1])] += len(junction_pair_dic[to_junction_str(temp2_junctions[-1])]) continue temp2_junctions.append(can_junction) # DK - for debugging purposes # assert to_junction_str(can_junction) in junction_pair_dic # filter_junction_db[to_junction_str(can_junction)] = len(junction_pair_dic[to_junction_str(can_junction)]) temp_junctions = set() for junction in temp2_junctions: # DK - for debugging purposes # assert to_junction_str(junction) in filter_junction_dic # if filter_junction_dic[to_junction_str(junction)] <= 5: # continue temp_junctions.add(to_junction_str(junction)) file = open(query_sam) mapped, unique_mapped, first_mapped, unmapped, mapping_point = 0, 0, 0, 0, 0.0 snp_mapped, snp_unique_mapped, snp_first_mapped, snp_unmapped = 0, 0, 0, 0 for line in file: if line.startswith('@'): continue fields = line[:-1].split() read_name, chr, pos, cigar, chr2, pos2, cigar2 = fields[:7] trains_id, NM, NM2, Zs, Zs2 = None, None, None, None, None for field in fields[7:]: if field.startswith("TI"): trans_id = field[5:] elif field.startswith("NM"): if NM == None: NM = int(field[5:]) else: NM2 = int(field[5:]) elif field.startswith("Zs"): if Zs == None: Zs = field[5:] else: Zs2 = field[5:] snp_included = (Zs != None or Zs2 != None) pos, pos2 = int(pos), int(pos2) pos_right, pos2_right = get_right(pos, cigar), get_right(pos2, cigar2) if read_name not in db_dic: unmapped += 1 if snp_included: snp_unmapped += 1 continue maps = db_dic[read_name] found = False found_at_first = False if (aligner == "hisat-TLA" and RNA) : found, index = pairedInMaps(chr, pos, pos_right, cigar, pos2, pos2_right, cigar2, NM, NM2, maps) if (found): if (index == 0): found_at_first = True else: if [chr, pos, pos_right, cigar, pos2, pos2_right, cigar2, NM, NM2] in maps: found = True if maps.index([chr, pos, pos_right, cigar, pos2, pos2_right, cigar2, NM, NM2]) == 0: found_at_first = True # DK - debugging purposes if False and len(maps) > 0 and maps[0][-1] + maps[0][-2] < NM + NM2: found = True if not found: for idx, map in enumerate(maps): if chr == map[0] and \ pos == map[1] and \ pos_right == map[2] and \ get_cigar_chars(cigar) == get_cigar_chars(map[3]) and \ pos2 == map[4] and \ pos2_right == map[5] and \ get_cigar_chars(cigar2) == get_cigar_chars(map[6]): pair_junctions = is_junction_pair(gtf_junctions_set, map[0], map[1], map[3], map[0], map[4], map[6]) if True: found_list = [False for i in range(len(pair_junctions))] for j in range(len(pair_junctions)): junction_str, is_gtf_junction = pair_junctions[j] junction = to_junction(junction_str) gtf_index = find_in_gtf_junctions(chr_dic, gtf_junctions, junction) if gtf_index >= 0: found_list[j] = True found = not (False in found_list) else: found = False if found: if idx == 0: found_at_first = True break if found: print >> mapped_file, read_name mapped += 1 if snp_included: snp_mapped += 1 if len(maps) == 1: unique_mapped += 1 print >> uniq_mapped_file, read_name if snp_included: snp_unique_mapped += 1 if found_at_first: print >> first_mapped_file, read_name first_mapped += 1 if snp_included: snp_first_mapped += 1 mapping_point += (1.0 / len(maps)) else: unmapped += 1 if snp_included: snp_unmapped += 1 file.close() mapped_file.close() uniq_mapped_file.close() first_mapped_file.close() # DK - for debugging purposes false_can_junctions, false_noncan_junctions = 0, 0 for junction_str in temp_junctions: if junction_str in temp_gtf_junctions: continue if is_canonical_junction(chr_dic, to_junction(junction_str)): false_can_junctions += 1 else: false_noncan_junctions += 1 print >> sys.stderr, "\t\t\tfalse junctions: %d (canonical), %d (non-canonical)" % (false_can_junctions, false_noncan_junctions) return mapped, unique_mapped, first_mapped, unmapped, aligned, multi_aligned, \ snp_mapped, snp_unique_mapped, snp_first_mapped, snp_unmapped, \ len(temp_junctions), len(temp_gtf_junctions), mapping_point """ """ def extract_mapped_unmapped(read_fname, mapped_id_fname, mapped_fname, unmapped_fname, read2_fname = "", mapped2_fname = "", unmapped2_fname = ""): mapped_ids = set() mapped_id_file = open(mapped_id_fname) for line in mapped_id_file: read_id = int(line[:-1]) mapped_ids.add(read_id) mapped_id_file.close() def write_reads(read_fname, mapped_fname, unmapped_fname): mapped_file = open(mapped_fname, "w") unmapped_file = open(unmapped_fname, "w") read_file = open(read_fname) write = False for line in read_file: if line[0] == "@": read_id = int(line[1:-1]) write = read_id in mapped_ids if write: print >> mapped_file, line[:-1] else: print >> unmapped_file, line[:-1] read_file.close() mapped_file.close() unmapped_file.close() write_reads(read_fname, mapped_fname, unmapped_fname) if read2_fname != "": assert mapped2_fname != "" assert unmapped2_fname != "" write_reads(read2_fname, mapped2_fname, unmapped2_fname) """ """ def sql_execute(sql_db, sql_query): sql_cmd = [ "sqlite3", sql_db, "-separator", "\t", "%s;" % sql_query ] # print >> sys.stderr, sql_cmd sql_process = subprocess.Popen(sql_cmd, stdout=subprocess.PIPE) output = sql_process.communicate()[0][:-1] return output """ """ def create_sql_db(sql_db): if os.path.exists(sql_db): print >> sys.stderr, sql_db, "already exists!" return columns = [ ["id", "integer primary key autoincrement"], ["genome", "text"], ["head", "text"], ["end_type", "text"], ["type", "text"], ["aligner", "text"], ["version", "text"], ["num_reads", "integer"], ["mapped_reads", "integer"], ["unique_mapped_reads", "integer"], ["unmapped_reads", "integer"], ["mapping_point", "real"], ["snp_mapped_reads", "integer"], ["snp_unique_mapped_reads", "integer"], ["snp_unmapped_reads", "integer"], ["time", "real"], ["mem", "integer"], ["true_gtf_junctions", "integer"], ["temp_junctions", "integer"], ["temp_gtf_junctions", "integer"], ["host", "text"], ["created", "text"], ["cmd", "text"] ] sql_create_table = "CREATE TABLE ReadCosts (" for i in range(len(columns)): name, type = columns[i] if i != 0: sql_create_table += ", " sql_create_table += ("%s %s" % (name, type)) sql_create_table += ");" sql_execute(sql_db, sql_create_table) """ """ def write_analysis_data(sql_db, genome_name, database_name): if not os.path.exists(sql_db): return aligners = [] sql_aligners = "SELECT aligner FROM ReadCosts GROUP BY aligner" output = sql_execute(sql_db, sql_aligners) aligners = output.split() can_read_types = ["all", "M", "2M_gt_15", "2M_8_15", "2M_1_7", "gt_2M"] tmp_read_types = [] sql_types = "SELECT type FROM ReadCosts GROUP BY type" output = sql_execute(sql_db, sql_types) tmp_read_types = output.split() read_types = [] for read_type in can_read_types: if read_type in tmp_read_types: read_types.append(read_type) for paired in [False, True]: database_fname = genome_name + "_" + database_name if paired: end_type = "paired" database_fname += "_paired" else: end_type = "single" database_fname += "_single" database_fname += ".analysis" database_file = open(database_fname, "w") print >> database_file, "end_type\ttype\taligner\tnum_reads\ttime\tmem\tmapped_reads\tunique_mapped_reads\tunmapped_reads\tmapping_point\ttrue_gtf_junctions\ttemp_junctions\ttemp_gtf_junctions" for aligner in aligners: for read_type in read_types: sql_row = "SELECT end_type, type, aligner, num_reads, time, mem, mapped_reads, unique_mapped_reads, unmapped_reads, mapping_point, snp_mapped_reads, snp_unique_mapped_reads, snp_unmapped_reads, true_gtf_junctions, temp_junctions, temp_gtf_junctions FROM ReadCosts" sql_row += " WHERE genome = '%s' and head = '%s' and aligner = '%s' and type = '%s' and end_type = '%s' ORDER BY created DESC LIMIT 1" % (genome_name, database_name, aligner, read_type, end_type) output = sql_execute(sql_db, sql_row) if output: print >> database_file, output database_file.close() """ """ def calculate_read_cost(single_end, paired_end, test_aligners, fresh, runtime_only, baseChange, verbose): sql_db_name = "analysis.db" if not os.path.exists(sql_db_name): create_sql_db(sql_db_name) num_cpus = multiprocessing.cpu_count() if num_cpus > 10: num_threads = min(10, num_cpus) desktop = False else: #num_threads = min(4, num_cpus) num_threads = num_cpus - 1 desktop = True data_base = "sim" test_large_index = False verbose = False sql_write = True readtypes = ["all", "M", "2M_gt_15", "2M_8_15", "2M_1_7", "gt_2M"] aligners = [ # [aligner, two_step, index_type, aligner_version, addition_options] # ["hisat", "", "", "", ""], # ["hisat2", "", "", "204", ""], # ["hisat2", "", "", "210", ""], # ["hisat2", "", "snp", "203b", ""], # ["hisat2", "", "snp", "210", ""], # ["hisat2", "", "tran", "210", ""], # ["hisat2", "", "snp_tran", "210", ""], # ["hisat2", "", "", "210", ""], ["hisat2", "", "", "", ""], #["hisat2", "", "rep", "", ""], # ["hisat2", "", "rep-100-300", "", ""], # ["hisat2", "", "rep_mm", "", ""], # ["hisat2", "", "", "", "--sensitive"], # ["hisat2", "", "rep", "", "--sensitive"], # ["hisat2", "", "", "", "--very-sensitive"], # ["hisat2", "", "snp", "", ""], # ["hisat2", "", "snp", "", "--sensitive"], # ["hisat2", "", "snp_noht", "", ""], # ["hisat2", "x2", "", "", ""], # ["hisat2", "x1", "tran", "", ""], # ["hisat2", "", "tran", "", ""], # ["hisat2", "", "snp_tran", "", ""], # ["hisat2", "x1", "snp_tran", "", ""], # ["hisat2", "x1", "snp_tran_ercc", "", ""], # ["tophat2", "gtfonly", "", "", ""], # ["tophat2", "gtf", "", "", ""], #["star", "", "", "", ""], # ["star", "x2", "", "", ""], # ["star", "gtf", "", "", ""], # ["bowtie", "", "", "", ""], #["bowtie2", "", "", "", ""], # ["bowtie2", "", "", "", "-k 10"], # ["bowtie2", "", "", "", "-k 1000 --extends 2000"], # ["gsnap", "", "", "", ""], # ["bwa", "mem", "", "", ""], #["bwa", "mem", "", "", "-a"], # ["hisat2", "", "snp", "", ""], # ["hisat2", "", "tran", "", ""], # ["hisat2", "", "snp_tran", "", ""], # ["vg", "", "", "", ""], # ["vg", "", "", "", "-M 10"], # ["vg", "", "snp", "", ""], # ["vg", "", "snp", "", "-M 10"], # ["minimap2", "", "", "", ""], ["hisat-TLA", "", "", "", ""], ] readtypes = ["all"] verbose = True debug = False # sql_write = False cwd = os.getcwd() if len(cwd.split("reads_")) > 1: genome = cwd.split("reads_")[1].split("_")[0] else: genome = "genome" RNA = (cwd.find("RNA") != -1) test_small = (genome != "genome") if runtime_only: verbose = True chr_dic = read_genome("../../data/%s.fa" % genome) gtf_junctions = extract_splice_sites("../../data/%s.gtf" % genome) repeat_db, repeat_map = read_repeatdb("../../data/%s_rep.rep.info" % genome) align_stat = [] for paired in [False, True]: if not paired and not single_end: continue if paired and not paired_end: continue for readtype in readtypes: if paired: base_fname = data_base + "_paired" type_sam_fname = base_fname + "_" + readtype + ".sam" type_read1_fname = base_fname + "_1_" + readtype + ".fa" type_read2_fname = base_fname + "_2_" + readtype + ".fa" type_junction_fname = base_fname + "_" + readtype + ".junc" else: base_fname = data_base + "_single" type_sam_fname = base_fname + "_" + readtype + ".sam" type_read1_fname = base_fname + "_" + readtype + ".fa" type_read2_fname = "" type_junction_fname = base_fname + "_" + readtype + ".junc" assert os.path.exists(type_sam_fname) and os.path.exists(type_junction_fname) numreads = 0 type_sam_file = open(type_sam_fname) for line in type_sam_file: numreads += 1 type_sam_file.close() if numreads <= 0: continue print >> sys.stderr, "%s\t%d" % (readtype, numreads) junctions, junctions_set = [], set() type_junction_file = open(type_junction_fname) for line in type_junction_file: chr, left, right = line[:-1].split() left, right = int(left), int(right) junctions.append([chr, left, right]) junctions_set.add(to_junction_str([chr, left, right])) type_junction_file.close() aligner_bin_base = "../../../aligners/bin" def get_aligner_version(aligner, version): version = "" if aligner == "hisat2" or \ aligner == "hisat" or \ aligner == "bowtie" or \ aligner == "bowtie2": if version: cmd = ["%s/%s_%s/%s" % (aligner_bin_base, aligner, version, aligner)] else: cmd = ["%s/%s" % (aligner_bin_base, aligner)] cmd += ["--version"] cmd_process = subprocess.Popen(cmd, stdout=subprocess.PIPE) version = cmd_process.communicate()[0][:-1].split("\n")[0] version = version.split()[-1] elif aligner == "tophat2": cmd = ["%s/tophat" % (aligner_bin_base)] cmd += ["--version"] cmd_process = subprocess.Popen(cmd, stdout=subprocess.PIPE) version = cmd_process.communicate()[0][:-1].split()[-1] elif aligner in ["star", "starx2"]: version = "2.4.2a" elif aligner == "gsnap": cmd = ["%s/gsnap" % (aligner_bin_base)] cmd_process = subprocess.Popen(cmd, stderr=subprocess.PIPE) version = cmd_process.communicate()[1][:-1].split("\n")[0] version = version.split()[2] elif aligner == "bwa": if version: cmd = ["%s/bwa_%s/bwa" % (aligner_bin_base, version)] else: cmd = ["%s/bwa" % (aligner_bin_base)] cmd_process = subprocess.Popen(cmd, stderr=subprocess.PIPE) version = cmd_process.communicate()[1][:-1].split("\n")[2] version = version.split()[1] elif aligner == "vg": cmd = ["%s/vg" % (aligner_bin_base)] cmd_process = subprocess.Popen(cmd, stderr=subprocess.PIPE) version = cmd_process.communicate()[1][:-1].split("\n")[0] version = version.split()[5] elif aligner == "minimap2": cmd = ["%s/minimap2" % (aligner_bin_base)] cmd += ["--version"] cmd_process = subprocess.Popen(cmd, stdout=subprocess.PIPE) version = cmd_process.communicate()[0][:-1].split("\n")[0] return version index_base = "../../../indexes" index_add = "" if genome != "genome": index_add = "_" + genome def get_aligner_cmd(RNA, aligner, type, index_type, version, options, read1_fname, read2_fname, out_fname, cmd_idx = 0): cmd = ["/usr/bin/time"] if osx_mode: cmd += ['-l'] if aligner == "hisat2": if version: cmd += ["%s/hisat2_%s/hisat2" % (aligner_bin_base, version)] else: cmd += ["%s/hisat2" % (aligner_bin_base)] if num_threads > 1: cmd += ["-p", str(num_threads)] cmd += ["-f"] # cmd += ["-k", "100"] # cmd += ["--max-seeds", "100"] # cmd += ["--score-min", "C,-100"] # cmd += ["--pen-cansplice", "0"] # cmd += ["--pen-noncansplice", "12"] # cmd += ["--pen-intronlen", "G,-8,1"] # cmd += ["--metrics", "1", # "--metrics-file", "metrics.out"] if not RNA: cmd += ["--no-spliced-alignment"] if type in ["x1", "x2"]: cmd += ["--no-temp-splicesite"] # cmd += ["--no-anchorstop"] if version == "" or \ (version != "" and int(version) >= 210): cmd += ["--new-summary", "--summary-file", out_fname + ".summary"] if version == "" or int(version) >= 220: cmd += ["--repeat"] # cmd += ["--dta"] # cmd += ["--dta-cufflinks"] if options != "": cmd += options.split(' ') if type == "x2": if cmd_idx == 0: cmd += ["--novel-splicesite-outfile"] else: cmd += ["--novel-splicesite-infile"] cmd += ["splicesites.txt"] # "--novel-splicesite-infile", # "../splicesites.txt", # "--rna-strandness", # "FR", if version: index_cmd = "%s/HISAT2_%s%s/" % (index_base, version, index_add) + genome else: index_cmd = "%s/HISAT2%s/" % (index_base, index_add) + genome if index_type: index_cmd += ("_" + index_type) cmd += [index_cmd] if paired: cmd += ["-1", read1_fname, "-2", read2_fname] else: cmd += [read1_fname] elif aligner == "hisat-TLA": if version: cmd += ["%s/hisat2_%s/hisat2" % (aligner_bin_base, version)] else: cmd += ["%s/hisat2" % (aligner_bin_base)] if num_threads > 1: cmd += ["-p", str(num_threads)] cmd += ["-f"] if not RNA: cmd += ["--no-spliced-alignment"] if type in ["x1", "x2"]: cmd += ["--no-temp-splicesite"] if version == "" or \ (version != "" and int(version) >= 210): cmd += ["--new-summary", "--summary-file", out_fname + ".summary"] if version == "" or int(version) >= 220: cmd += ["--repeat"] if paired: cmd += ["-1", read1_fname, "-2", read2_fname] else: cmd += ["-U", read1_fname] cmd += ["--TLA"] cmd += ["--base-change"] cmd += [baseChange] cmd += ["--expand-repeat"] index1_cmd = "%s/HISAT2%s/" % (index_base, index_add) + genome + '_' + baseChange if index_type: index1_cmd += ("_" + index_type) cmd += ["--index1"] cmd += [index1_cmd] index2_cmd = "%s/HISAT2%s/" % (index_base, index_add) + genome + '_' + reverse_complement(baseChange[0]) + reverse_complement(baseChange[1]) if index_type: index2_cmd += ("_" + index_type) cmd += ["--index2"] cmd += [index2_cmd] cmd += ["--reference"] cmd += ["../../../data/%s.fa" % genome] elif aligner == "hisat": cmd += ["%s/hisat" % (aligner_bin_base)] if num_threads > 1: cmd += ["-p", str(num_threads)] cmd += ["-f"] # cmd += ["-k", "5"] # cmd += ["--score-min", "C,-18"] if version != "": version = int(version) else: version = sys.maxint if not RNA: cmd += ["--no-spliced-alignment"] if type in ["x1", "x2"]: cmd += ["--no-temp-splicesite"] """ cmd += ["--rdg", "100,100", "--rfg", "100,100"] """ if type == "x2": if cmd_idx == 0: cmd += ["--novel-splicesite-outfile"] else: cmd += ["--novel-splicesite-infile"] cmd += ["splicesites.txt"] # "--novel-splicesite-infile", # "../splicesites.txt", # "--rna-strandness", # "FR", cmd += ["%s/HISAT%s/" % (index_base, index_add) + genome] if paired: cmd += ["-1", read1_fname, "-2", read2_fname] else: cmd += [read1_fname] elif aligner == "tophat2": cmd += ["%s/tophat" % (aligner_bin_base)] if num_threads > 1: cmd += ["-p", str(num_threads)] if type.find("gtf") != -1: cmd += ["--transcriptome-index=%s/HISAT%s/gtf/%s" % (index_base, index_add, genome)] if type == "gtfonly": cmd += ["--transcriptome-only"] cmd += ["--read-edit-dist", "3"] cmd += ["--no-sort-bam"] cmd += ["--read-realign-edit-dist", "0"] cmd += ["--keep-tmp", "%s/HISAT%s/" % (index_base, index_add) + genome, read1_fname] if paired: cmd += [read2_fname] elif aligner == "star": cmd += ["%s/STAR" % (aligner_bin_base)] if num_threads > 1: cmd += ["--runThreadN", str(num_threads)] cmd += ["--genomeDir"] if cmd_idx == 0: if type == "gtf": cmd += ["%s/STAR%s/gtf" % (index_base, index_add)] else: cmd += ["%s/STAR%s" % (index_base, index_add)] else: assert cmd_idx == 1 cmd += ["."] if desktop: cmd += ["--genomeLoad", "NoSharedMemory"] else: cmd += ["--genomeLoad", "LoadAndKeep"] if type == "x2": if cmd_idx == 1: cmd += ["--alignSJDBoverhangMin", "1"] cmd += ["--readFilesIn", read1_fname] if paired: cmd += [read2_fname] if paired: cmd += ["--outFilterMismatchNmax", "6"] else: cmd += ["--outFilterMismatchNmax", "3"] elif aligner == "bowtie": cmd += ["%s/bowtie" % (aligner_bin_base)] if num_threads > 1: cmd += ["-p", str(num_threads)] cmd += ["-f", "--sam", "-k", "10"] cmd += ["-n", "3"] if paired: cmd += ["-X", "500"] cmd += ["%s/Bowtie%s/" % (index_base, index_add) + genome] if paired: cmd += ["-1", read1_fname, "-2", read2_fname] else: cmd += [read1_fname] elif aligner == "bowtie2": if version: cmd += ["%s/bowtie2_%s/bowtie2" % (aligner_bin_base, version)] else: cmd += ["%s/bowtie2" % (aligner_bin_base)] if num_threads > 1: cmd += ["-p", str(num_threads)] cmd += ["-f"] if options != "": cmd += options.split(' ') if version: cmd += ["-x %s/Bowtie2_%s%s/" % (index_base, version, index_add) + genome] else: cmd += ["-x %s/Bowtie2%s/" % (index_base, index_add) + genome] if paired: cmd += ["-1", read1_fname, "-2", read2_fname] else: cmd += [read1_fname] elif aligner == "gsnap": cmd += ["%s/gsnap" % (aligner_bin_base), "-A", "sam"] if num_threads > 1: cmd += ["-t", str(num_threads)] cmd += ["--max-mismatches=3", "-D", "%s/GSNAP%s" % (index_base, index_add), "-N", "1", "-d", genome, read1_fname] if paired: cmd += [read2_fname] elif aligner == "bwa": if version: cmd += ["%s/bwa_%s/bwa" % (aligner_bin_base, version)] else: cmd += ["%s/bwa" % (aligner_bin_base)] if type in ["mem", "aln"]: cmd += [type] elif type == "sw": cmd += ["bwa" + type] if num_threads > 1: cmd += ["-t", str(num_threads)] if options != "": cmd += options.split(' ') # if paired: # cmd += ["-T", "60"] if version: cmd += ["%s/BWA_%s%s/%s.fa" % (index_base, version, index_add, genome)] else: cmd += ["%s/BWA%s/%s.fa" % (index_base, index_add, genome)] cmd += [read1_fname] if paired: cmd += [read2_fname] elif aligner == "vg": # vg map -d 22 -t 6 -M 10 -f ../sim-1.fa -f ../sim-2.fa > result.sam.gam cmd += ["%s/vg" % (aligner_bin_base)] cmd += ["map"] cmd += ["-t", str(num_threads)] cmd += ["--surject-to", "sam"] index_cmd = "%s/VG%s/" % (index_base, index_add) + genome if index_type: index_cmd += ("_" + index_type) if options != "": cmd += options.split(' ') cmd += ["-d", index_cmd] cmd += ["-f", read1_fname] if paired: cmd += ["-f", read2_fname] elif aligner == "minimap2": # minimap2 -a -x sr 22.mmi sim_1.fa sim_2.fa > result.sam cmd += ["%s/minimap2" % (aligner_bin_base)] cmd += ["-a"] cmd += ["-x", "sr"] index_cmd = "%s/minimap2%s/" % (index_base, index_add) + genome if index_type: index_cmd += ("_" + index_type) index_cmd += ".mmi" cmd += [index_cmd] cmd += [read1_fname] if paired: cmd += [read2_fname] else: assert False return cmd for aligner, type, index_type, version, options in aligners: skip = False if len(test_aligners) > 0: skip = True for test_aligner in test_aligners: if aligner == test_aligner: skip = False if skip: continue aligner_name = aligner + type if version != "": aligner_name += ("_%s" % version) if aligner == "hisat2" and index_type != "": aligner_name += ("_" + index_type) if aligner == "vg" and index_type != "": aligner_name += ("_" + index_type) two_step = (aligner == "tophat2" or type == "x2" or (aligner in ["hisat2", "hisat"] and type == "")) if RNA and readtype != "M": if aligner in ["bowtie", "bowtie2", "bwa"]: continue if readtype != "all": if two_step: continue if not RNA and readtype != "all": continue print >> sys.stderr, "\t%s\t%s" % (aligner_name, str(datetime.now())) if options != "": option_name = options.replace(' ', '').replace('-', '').replace(',', '') aligner_name = aligner_name + '_' + option_name if paired: aligner_dir = aligner_name + "_paired" else: aligner_dir = aligner_name + "_single" if fresh and os.path.exists(aligner_dir): os.system("rm -rf %s" % aligner_dir) if not os.path.exists(aligner_dir): os.mkdir(aligner_dir) os.chdir(aligner_dir) out_fname = base_fname + "_" + readtype + ".sam" out_fname2 = out_fname + "2" duration = -1.0 mem_usage = '0' if not os.path.exists(out_fname): if not os.path.exists("../one.fa") or not os.path.exists("../two.fa"): os.system("head -400 ../%s_1.fa > ../one.fa" % (data_base)) os.system("head -400 ../%s_2.fa > ../two.fa" % (data_base)) if runtime_only: out_fname = "/dev/null" out_fname2 = "/dev/null" if not two_step: align_stat.append([readtype, aligner_name]) # dummy commands for caching index and simulated reads loading_time = 0 if aligner != "tophat2": for i in range(3): dummy_cmd = get_aligner_cmd(RNA, aligner, type, index_type, version, options, "../one.fa", "../two.fa", "/dev/null") start_time = datetime.now() if verbose: print >> sys.stderr, start_time, "\t", " ".join(dummy_cmd) if aligner in ["hisat2", "hisat", "bowtie", "bowtie2", "gsnap", "bwa"]: proc = subprocess.Popen(dummy_cmd, stdout=open("/dev/null", "w"), stderr=subprocess.PIPE) else: proc = subprocess.Popen(dummy_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) proc.communicate() finish_time = datetime.now() duration = finish_time - start_time duration = duration.total_seconds() if verbose: print >> sys.stderr, finish_time, "duration:", duration loading_time = duration # Align all reads aligner_cmd = get_aligner_cmd(RNA, aligner, type, index_type, version, options, "../" + type_read1_fname, "../" + type_read2_fname, out_fname) start_time = datetime.now() if verbose: print >> sys.stderr, "\t", start_time, " ".join(aligner_cmd) if aligner in ["hisat2", "hisat", "bowtie", "bowtie2", "gsnap", "bwa", "vg", "minimap2", "hisat-TLA"]: proc = subprocess.Popen(aligner_cmd, stdout=open(out_fname, "w"), stderr=subprocess.PIPE) else: proc = subprocess.Popen(aligner_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) _, mem_usage = proc.communicate() mem_usage = parse_mem_usage(mem_usage) finish_time = datetime.now() duration = finish_time - start_time duration = duration.total_seconds() - loading_time if duration < 0.1: duration = 0.1 if verbose: print >> sys.stderr, "\t", finish_time, "finished:", duration if debug and aligner == "hisat2": os.system("cat metrics.out") print >> sys.stderr, "\ttime: %.4f" % (duration) if aligner == "star" and type in ["", "gtf"]: os.system("mv Aligned.out.sam %s" % out_fname) elif aligner in ["hisat2", "hisat"] and type == "x2": aligner_cmd = get_aligner_cmd(RNA, aligner, type, index_type, version, options, "../" + type_read1_fname, "../" + type_read2_fname, out_fname, 1) start_time = datetime.now() if verbose: print >> sys.stderr, "\t", start_time, " ".join(aligner_cmd) proc = subprocess.Popen(aligner_cmd, stdout=open(out_fname, "w"), stderr=subprocess.PIPE) proc.communicate() finish_time = datetime.now() duration += (finish_time - start_time).total_seconds() duration -= loading_time if duration < 0.1: duration = 0.1 if verbose: print >> sys.stderr, "\t", finish_time, "finished:", duration elif aligner == "star" and type == "x2": assert os.path.exists("SJ.out.tab") os.system("awk 'BEGIN {OFS=\"\t\"; strChar[0]=\".\"; strChar[1]=\"+\"; strChar[2]=\"-\";} {if($5>0){print $1,$2,$3,strChar[$4]}}' SJ.out.tab > SJ.out.tab.Pass1.sjdb") for file in os.listdir("."): if file in ["SJ.out.tab.Pass1.sjdb", "genome.fa"]: continue os.remove(file) star_index_cmd = "%s/STAR --genomeDir ./ --runMode genomeGenerate --genomeFastaFiles ../../../data/%s.fa --sjdbFileChrStartEnd SJ.out.tab.Pass1.sjdb --sjdbOverhang 99 --runThreadN %d" % (aligner_bin_base, genome, num_threads) if verbose: print >> sys.stderr, "\t", datetime.now(), star_index_cmd os.system(star_index_cmd) if verbose: print >> sys.stderr, "\t", datetime.now(), " ".join(dummy_cmd) proc = subprocess.Popen(dummy_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) proc.communicate() if verbose: print >> sys.stderr, "\t", datetime.now(), "finished" aligner_cmd = get_aligner_cmd(RNA, aligner, type, index_type, version, options, "../" + type_read1_fname, "../" + type_read2_fname, out_fname, 1) start_time = datetime.now() if verbose: print >> sys.stderr, "\t", start_time, " ".join(aligner_cmd) proc = subprocess.Popen(aligner_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) proc.communicate() finish_time = datetime.now() duration += (finish_time - start_time).total_seconds() duration -= loading_time if duration < 0.1: duration = 0.1 if verbose: print >> sys.stderr, "\t", finish_time, "finished:", duration os.system("mv Aligned.out.sam %s" % out_fname) elif aligner == "tophat2": os.system("samtools sort -n tophat_out/accepted_hits.bam accepted_hits; samtools view -h accepted_hits.bam > %s" % out_fname) elif aligner == "vg": index_name = "%s/VG%s/" % (index_base, index_add) + genome if index_type: index_name += ("_" + index_type) if aligner in ["gsnap", "tophat2"]: os.system("tar cvzf %s.tar.gz %s &> /dev/null" % (out_fname, out_fname)) if runtime_only: print >> sys.stderr, "\t\t\tMemory Usage: %dMB" % (int(mem_usage) / 1024) os.chdir("..") continue if not os.path.exists(out_fname2): debug_dic = {} pid_list = [] if paired: if mp_mode: for i in xrange(mp_num): p = Process(target=extract_pair, args=(out_fname, out_fname2, chr_dic, RNA, aligner, version, repeat_db, repeat_map, debug_dic, i)) pid_list.append(p) p.start() for p in pid_list: p.join() # merge os.system("mv %s %s" % (out_fname2 + ".0", out_fname2)) for i in xrange(1, mp_num): os.system("cat %s >> %s" % (out_fname2 + "." + str(i), out_fname2)) os.system("rm %s" % (out_fname2 + "." + str(i))) else: extract_pair(out_fname, out_fname2, chr_dic, RNA, aligner, version, repeat_db, repeat_map, debug_dic, -1) else: if mp_mode: # Prepare queues for i in xrange(mp_num): p = Process(target=extract_single, args=(out_fname, out_fname2, chr_dic, aligner, version, repeat_db, repeat_map, debug_dic, i)) pid_list.append(p) p.start() # wait for p in pid_list: p.join() # merge os.system("mv %s %s" % (out_fname2 + ".0", out_fname2)) for i in xrange(1, mp_num): os.system("cat %s >> %s" % (out_fname2 + "." + str(i), out_fname2)) os.system("rm %s" % (out_fname2 + "." + str(i))) else: extract_single(out_fname, out_fname2, chr_dic, aligner, version, repeat_db, repeat_map, debug_dic, -1) for readtype2 in readtypes: if not two_step and readtype != readtype2: continue type_sam_fname2 = base_fname + "_" + readtype2 + ".sam" if os.path.exists(type_sam_fname2 + ".done"): continue if paired: type_read_fname2 = base_fname + "_1_" + readtype2 + ".fa" else: type_read_fname2 = base_fname + "_" + readtype2 + ".fa" mapped_id_fname = base_fname + "_" + readtype2 + ".read_id" if paired: mapped, unique_mapped, first_mapped, unmapped, aligned, multi_aligned, \ snp_mapped, snp_unique_mapped, snp_first_mapped, snp_unmapped, \ temp_junctions, temp_gtf_junctions, mapping_point \ = compare_paired_sam(RNA, out_fname2, "../" + type_sam_fname2, mapped_id_fname, chr_dic, junctions, junctions_set, gtf_junctions, aligner) else: mapped, unique_mapped, first_mapped, unmapped, aligned, multi_aligned, \ snp_mapped, snp_unique_mapped, snp_first_mapped, snp_unmapped, \ temp_junctions, temp_gtf_junctions, mapping_point \ = compare_single_sam(RNA, out_fname2, "../" + type_sam_fname2, mapped_id_fname, chr_dic, junctions, junctions_set, gtf_junctions, aligner) proc = subprocess.Popen(["wc", "-l", "../" + type_read_fname2], stdout=subprocess.PIPE) out = proc.communicate()[0] numreads = int(out.split()[0]) / 2 assert mapped + unmapped == numreads if two_step: print >> sys.stderr, "\t\t%s" % readtype2 print >> sys.stderr, "\t\taligned: %d, multi aligned: %d" % (aligned, multi_aligned) print >> sys.stderr, "\t\tcorrectly mapped: %d (%.2f%%) mapping_point: %.2f" % (mapped, float(mapped) * 100.0 / numreads, mapping_point * 100.0 / numreads) print >> sys.stderr, "\t\tcorrectly mapped at first: %d (%.2f%%)" % (first_mapped, float(first_mapped) * 100.0 / numreads) print >> sys.stderr, "\t\tuniquely and correctly mapped: %d (%.2f%%)" % (unique_mapped, float(unique_mapped) * 100.0 / numreads) snp_numreads = snp_mapped + snp_unmapped if snp_numreads > 0: print >> sys.stderr, "\t\t\t\tSNP: reads: %d" % (snp_numreads) print >> sys.stderr, "\t\t\t\tSNP: correctly mapped: %d (%.2f%%)" % (snp_mapped, float(snp_mapped) * 100.0 / snp_numreads) print >> sys.stderr, "\t\t\t\tSNP: correctly mapped at first: %d (%.2f%%)" % (snp_first_mapped, float(snp_first_mapped) * 100.0 / snp_numreads) print >> sys.stderr, "\t\t\t\tSNP: uniquely and correctly mapped: %d (%.2f%%)" % (snp_unique_mapped, float(snp_unique_mapped) * 100.0 / snp_numreads) if readtype == readtype2: print >> sys.stderr, "\t\t\t%d reads per sec (all)" % (numreads / max(1.0, duration)) if RNA: print >> sys.stderr, "\t\tjunc. sensitivity %d / %d (%.2f%%), junc. accuracy: %d / %d (%.2f%%)" % \ (temp_gtf_junctions, len(junctions), float(temp_gtf_junctions) * 100.0 / max(1, len(junctions)), \ temp_gtf_junctions, temp_junctions, float(temp_gtf_junctions) * 100.0 / max(1, temp_junctions)) print >> sys.stderr, "\t\t\tMemory Usage: %dMB" % (int(mem_usage) / 1024) if duration > 0.0: if sql_write and os.path.exists("../" + sql_db_name): if paired: end_type = "paired" else: end_type = "single" mem_used = int(mem_usage) / 1024 sql_insert = "INSERT INTO \"ReadCosts\" VALUES(NULL, '%s', '%s', '%s', '%s', '%s', '%s', %d, %d, %d, %d, %f, %f, %d, %d, %d, %d, %d, %d, %d, '%s', datetime('now', 'localtime'), '%s');" % \ (genome, data_base, end_type, readtype2, aligner_name, get_aligner_version(aligner, version), numreads, mapped, unique_mapped, unmapped, mapping_point, snp_mapped, snp_unique_mapped, snp_unmapped, duration, mem_used, len(junctions), temp_junctions, temp_gtf_junctions, platform.node(), " ".join(aligner_cmd)) sql_execute("../" + sql_db_name, sql_insert) if two_step: align_stat.append([readtype2, aligner_name, numreads, duration, mem_used, mapped, unique_mapped, unmapped, mapping_point, snp_mapped, snp_unique_mapped, snp_unmapped, len(junctions), temp_junctions, temp_gtf_junctions]) else: align_stat[-1].extend([numreads, duration, mem_used, mapped, unique_mapped, unmapped, mapping_point, snp_mapped, snp_unique_mapped, snp_unmapped, len(junctions), temp_junctions, temp_gtf_junctions]) os.system("touch %s.done" % type_sam_fname2) os.chdir("..") print >> sys.stdout, "\t".join(["type", "aligner", "all", "all_time", "mem", "mapped", "unique_mapped", "unmapped", "mapping point", "snp_mapped", "snp_unique_mapped", "snp_unmapped", "true_gtf_junctions", "temp_junctions", "temp_gtf_junctions"]) for line in align_stat: outstr = "" for item in line: if outstr != "": outstr += "\t" outstr += str(item) print >> sys.stdout, outstr if os.path.exists(sql_db_name): write_analysis_data(sql_db_name, genome, data_base) """ """ if __name__ == "__main__": parser = ArgumentParser( description='test HISAT2, and compare HISAT2 with other popular aligners such as TopHat2, STAR, Bowtie1/2, GSNAP, BWA-mem, etc.') parser.add_argument('--single-end', dest='paired_end', action='store_false', help='run single-end only') parser.add_argument('--paired-end', dest='single_end', action='store_false', help='run paired_end only') parser.add_argument('--aligner-list', dest='aligner_list', type=str, default="", help='comma-separated list of aligners (e.g. hisat2,bowtie2,bwa') parser.add_argument('--fresh', dest='fresh', action='store_true', help='delete existing alignment related directories (e.g. hisat2_single)') parser.add_argument('--runtime-only', '--runtime', dest='runtime_only', action='store_true', help='run programs without evaluation') parser.add_argument('-p', '--multi-process', dest='mp_num', action='store', type=int, default=1, help='Use multiple process mode') parser.add_argument('--base-change', dest='baseChange', type=str, default="", help='base change for Hisat-TLA') parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', help='also print some statistics to stderr') args = parser.parse_args() aligners = [] for aligner in args.aligner_list.split(','): if aligner == "": continue aligners.append(aligner) mp_num = args.mp_num mp_mode = (mp_num > 1) calculate_read_cost(args.single_end, args.paired_end, aligners, args.fresh, args.runtime_only, args.baseChange, args.verbose)