#!/usr/bin/env python
"""Alignment-evaluation helpers for comparing read aligners (HISAT2, Bowtie2,
BWA, STAR, vg, ...) on real data.

The script loads a reference genome, a SNP list and GTF-derived splice
junctions, re-computes per-alignment edit distances from SAM records (optionally
forgiving mismatches/indels that coincide with known SNPs), classifies
junction-spanning reads/pairs, and records summary statistics in a sqlite3
database.

NOTE(review): this is Python 2 code (`print >>`, `xrange`, `sys.maxint`,
integer `/`, `sorted(..., cmp=...)`); it will not run under Python 3 as-is.
"""

import sys, os, subprocess, signal
import multiprocessing
import platform
import string
import re
from datetime import datetime, date, time
from collections import defaultdict
from argparse import ArgumentParser, FileType

# On macOS, /usr/bin/time and its resource report use a different format
# (see parse_mem_usage below).
osx_mode = False
if sys.platform == 'darwin':
    osx_mode = True

# Statistics are bucketed by edit distance 0 .. MAX_EDIT-1.
MAX_EDIT = 21

# Restore default SIGPIPE handling so piping output (e.g. into `head`)
# terminates the script instead of raising IOError.
signal.signal(signal.SIGPIPE, signal.SIG_DFL)

# Matches one CIGAR element: a run length followed by an operation letter.
cigar_re = re.compile('\d+\w')


""" """
def parse_mem_usage(resource):
    """Extract peak memory usage (in KB, as a string/number) from the output of
    /usr/bin/time; handles both macOS (`-l`, bytes) and GNU (`maxresident`)
    formats.  Returns '0' when no usage line is found."""
    if osx_mode:
        resource = resource.strip().split('\n')
        for line in resource:
            if line.find('maximum resident set size') != -1:
                # macOS reports bytes; convert to KB.
                return int(line.split()[0]) / 1024
    else:
        resource = resource.split(' ')
        for line in resource:
            idx = line.find('maxresident')
            if idx != -1:
                # GNU time prints e.g. "123456maxresident"; keep the number.
                return line[:idx]
    return '0'


""" """
def reverse_complement(seq):
    """Return the reverse complement of a DNA string, preserving case.
    Characters outside ACGTacgt (e.g. N) are kept unchanged."""
    result = ""
    for nt in seq:
        base = nt
        if nt == 'A':
            base = 'T'
        elif nt == 'a':
            base = 't'
        elif nt == 'C':
            base = 'G'
        elif nt == 'c':
            base = 'g'
        elif nt == 'G':
            base = 'C'
        elif nt == 'g':
            base = 'c'
        elif nt == 'T':
            base = 'A'
        elif nt == 't':
            base = 'a'
        # prepend to reverse while complementing
        result = base + result
    return result


""" """
def read_genome(genome_filename):
    """Read a multi-FASTA genome file into a dict {chromosome name: sequence}.
    The chromosome name is the first whitespace-delimited token after '>'."""
    chr_dic = {}
    genome_file = open(genome_filename, "r")
    chr_name, sequence = "", ""
    for line in genome_file:
        if line[0] == ">":
            # flush the previous contig before starting a new one
            if chr_name and sequence:
                chr_dic[chr_name] = sequence
            chr_name = line[1:-1].split()[0]
            sequence = ""
        else:
            sequence += line[:-1]
    if chr_name and sequence:
        chr_dic[chr_name] = sequence
    genome_file.close()
    print >> sys.stderr, "genome is loaded"
    return chr_dic


""" """
def read_snp(snp_filename):
    """Read a HISAT2-style SNP file (tab-separated: id, type, chr, pos, data)
    into a dict {chr: [[snpID, type, pos, data], ...]}.

    type is one of "single" (data = alt base), "insertion" (data = inserted
    sequence) or "deletion" (data = deleted length, converted to int).
    Malformed lines and comments are skipped."""
    snps = defaultdict(list)
    snp_file = open(snp_filename, 'r')
    for line in snp_file:
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        try:
            snpID, type, chr, pos, data = line.split('\t')
        except ValueError:
            continue
        assert type in ["single", "deletion", "insertion"]
        if type == "deletion":
            data = int(data)
        snps[chr].append([snpID, type, int(pos), data])
    print >> sys.stderr, "snp is loaded"
    return snps


""" """
def extract_splice_sites(gtf_fname):
    """Parse a GTF file and return the set of splice junctions (as strings
    "chr-left-right", see to_junction_str) implied by its exon lines."""
    trans = {}
    gtf_file = open(gtf_fname)
    # Parse valid exon lines from the GTF file into a dict by transcript_id
    for line in gtf_file:
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        if '#' in line:
            # strip trailing comments
            line = line.split('#')[0].strip()
        try:
            chrom, source, feature, left, right, score, \
                strand, frame, values = line.split('\t')
        except ValueError:
            continue
        left, right = int(left), int(right)
        if feature != 'exon' or left >= right:
            continue
        # attribute field: 'key "value"; key "value"; ...'
        values_dict = {}
        for attr in values.split(';')[:-1]:
            attr, _, val = attr.strip().partition(' ')
            values_dict[attr] = val.strip('"')
        if 'gene_id' not in values_dict or \
                'transcript_id' not in values_dict:
            continue
        transcript_id = values_dict['transcript_id']
        if transcript_id not in trans:
            trans[transcript_id] = [chrom, strand, [[left, right]]]
        else:
            trans[transcript_id][2].append([left, right])
    gtf_file.close()
    # Sort exons and merge where separating introns are <=5 bps
    for tran, [chrom, strand, exons] in trans.items():
        exons.sort()
        tmp_exons = [exons[0]]
        for i in range(1, len(exons)):
            if exons[i][0] - tmp_exons[-1][1] <= 5:
                tmp_exons[-1][1] = exons[i][1]
            else:
                tmp_exons.append(exons[i])
        trans[tran] = [chrom, strand, tmp_exons]
    # Calculate and print the unique junctions
    junctions = set()
    for chrom, strand, exons in trans.values():
        for i in range(1, len(exons)):
            # junction = (end of previous exon, start of next exon)
            junctions.add(to_junction_str([chrom, exons[i-1][1], exons[i][0]]))
    return junctions


def to_junction_str(junction):
    """Serialize [chr, left, right] as "chr-left-right"."""
    return "%s-%d-%d" % (junction[0], junction[1], junction[2])


def to_junction(junction_str):
    """Inverse of to_junction_str: parse "chr-left-right" into [chr, int, int].
    NOTE(review): breaks if the chromosome name itself contains '-'."""
    chr, left, right = junction_str.split("-")
    return [chr, int(left), int(right)]


def junction_cmp(a, b):
    """Python 2 cmp-style comparator ordering junctions lexicographically by
    (chr, left, right); used with sorted(..., cmp=junction_cmp)."""
    if a[0] != b[0]:
        if a[0] < b[0]:
            return -1
        else:
            return 1
    if a[1] != b[1]:
        if a[1] < b[1]:
            return -1
        else:
            return 1
    if a[2] != b[2]:
        if a[2] < b[2]:
            return -1
        else:
            return 1
    return 0


# chr and pos are assumed to be integers
def get_junctions(chr, pos, cigar_str, min_anchor_len = 0, read_len = 100):
    """Return the list of junctions [chr, left, right] spanned by an alignment
    starting at 1-based position `pos` with the given CIGAR string.

    A junction is reported only if both anchors (aligned bases flanking the
    intron) are at least `min_anchor_len` long; anchor lengths are derived
    from the M/I runs between N operations, assuming total read length
    `read_len`.  Adjacent I/D operations shift the reported junction
    coordinates accordingly."""
    junctions = []
    right_pos = pos
    cigars = cigar_re.findall(cigar_str)
    cigars = [[int(cigars[i][:-1]), cigars[i][-1]] for i in range(len(cigars))]
    # First pass: accumulate the read bases (M/I) preceding each N to get the
    # left-anchor length of each junction.
    left_anchor_lens = []
    cur_left_anchor_len = 0
    for i in range(len(cigars)):
        length, cigar_op = cigars[i]
        if cigar_op in "MI":
            cur_left_anchor_len += length
        elif cigar_op == "N":
            assert cur_left_anchor_len > 0
            left_anchor_lens.append(cur_left_anchor_len)
            cur_left_anchor_len = 0
    # Second pass: emit junction coordinates.
    for i in range(len(cigars)):
        length, cigar_op = cigars[i]
        if cigar_op == "N":
            left, right = right_pos - 1, right_pos + length
            # Adjust for an insertion/deletion immediately before the intron.
            if i > 0 and cigars[i-1][1] in "ID":
                if cigars[i-1][1] == "I":
                    left += cigars[i-1][0]
                else:
                    left -= cigars[i-1][0]
            # ... and immediately after it.
            if i + 1 < len(cigars) and cigars[i+1][1] in "ID":
                if cigars[i+1][1] == "I":
                    right -= cigars[i+1][0]
                else:
                    right += cigars[i+1][0]
            junction_idx = len(junctions)
            assert junction_idx < len(left_anchor_lens)
            left_anchor_len = left_anchor_lens[junction_idx]
            assert left_anchor_len > 0 and left_anchor_len < read_len
            right_anchor_len = read_len - left_anchor_len
            if left_anchor_len >= min_anchor_len and right_anchor_len >= min_anchor_len:
                junctions.append([chr, left, right])
        if cigar_op in "MND":
            # operations that consume reference bases
            right_pos += length
    return junctions


def get_right(pos, cigars):
    """Return the rightmost reference coordinate reached by an alignment that
    starts at `pos` with CIGAR string `cigars` (M/D/N consume reference)."""
    right_pos = pos
    cigars = cigar_re.findall(cigars)
    for cigar in cigars:
        length = int(cigar[:-1])
        cigar_op = cigar[-1]
        if cigar_op in "MDN":
            right_pos += length
    return right_pos


def get_cigar_chars(cigars):
    """Return just the operation letters of a CIGAR string, e.g.
    "10M100N90M" -> "MNM"."""
    cigars = cigar_re.findall(cigars)
    cigar_chars = ""
    for cigar in cigars:
        cigar_op = cigar[-1]
        cigar_chars += cigar_op
    return cigar_chars


def get_cigar_chars_MN(cigars):
    """Like get_cigar_chars but keeps only M and N operations and collapses
    consecutive repeats, e.g. "5M3I5M100N90M" -> "MNM"."""
    cigars = cigar_re.findall(cigars)
    cigar_chars = ""
    for cigar in cigars:
        cigar_op = cigar[-1]
        if cigar_op in "MN":
            if cigar_chars == "" or cigar_chars[-1] != cigar_op:
                cigar_chars += cigar_op
    return cigar_chars


def is_non_canonical_junction_read(chr_dic, chr, left, cigars, canonical_junctions = [["GT", "AG"], ["GC", "AG"], ["AT", "AC"]]):
    """Return True if any intron (N) in the alignment has dinucleotide motifs
    that are canonical on neither strand.

    NOTE(review): `canonical_junctions` is a mutable default argument -- safe
    only as long as no caller mutates it."""
    pos = left
    for cigar in cigar_re.findall(cigars):
        cigar_op = cigar[-1]
        cigar_len = int(cigar[:-1])
        if cigar_op in 'MD':
            pos += cigar_len
        elif cigar_op == 'N':
            right = pos + cigar_len
            # 2-bp motifs at the intron boundaries (0-based slicing of the
            # 1-based coordinates used throughout).
            donor = chr_dic[chr][pos-1:pos+1]
            acceptor = chr_dic[chr][right-3:right-1]
            rev_donor = reverse_complement(acceptor)
            rev_acceptor = reverse_complement(donor)
            # print donor, acceptor
            # print rev_donor, rev_acceptor
            if [donor, acceptor] not in canonical_junctions and [rev_donor, rev_acceptor] not in canonical_junctions:
                return True
            pos = right
    return False


def is_canonical_junction(chr_dic, junction):
    """Return True if the junction [chr, left, right] has GT..AG boundary
    motifs on either strand (only GT/AG is checked here, unlike
    is_non_canonical_junction_read which also accepts GC/AG and AT/AC)."""
    chr, left, right = junction
    donor = chr_dic[chr][left:left+2]
    acceptor = chr_dic[chr][right-3:right-1]
    rev_donor = reverse_complement(acceptor)
    rev_acceptor = reverse_complement(donor)
    if (donor == "GT" and acceptor == "AG") or \
            (rev_donor == "GT" and rev_acceptor == "AG"):
        return True
    return False


def is_junction_read(chr_dic, gtf_junctions, chr, pos, cigar_str):
    """Classify the junctions spanned by one alignment against the sorted list
    `gtf_junctions`.

    Returns (result_junctions, is_junction_read, is_gtf_junction_read) where
    result_junctions is a list of [junction_str, is_gtf_junction]; a junction
    counts as "gtf" if a known junction matches exactly, or within 5 bp with
    the same shift on both ends when the known junction is non-canonical."""
    result_junctions = []
    # read length hard-coded to 101 here -- TODO confirm against the data sets
    junctions = get_junctions(chr, pos, cigar_str, 0, 101)
    for junction in junctions:
        junction_str = to_junction_str(junction)
        is_gtf_junction = False
        def find_in_gtf_junctions(gtf_junctions, junction):
            # binary search: first index whose junction is >= the query
            l, u = 0, len(gtf_junctions)
            while l < u:
                m = (l + u) / 2
                assert m >= 0 and m < len(gtf_junctions)
                cmp_result = junction_cmp(junction, gtf_junctions[m])
                if cmp_result == 0:
                    return m
                elif cmp_result < 0:
                    u = m
                else:
                    l = m + 1
            return l
        # allow small (<= 5bp) discrepancy for non-canonical splice sites.
        relaxed_junction_dist = 5
        chr, left, right = junction
        gtf_index = find_in_gtf_junctions(gtf_junctions, [chr, left - relaxed_junction_dist, right - relaxed_junction_dist])
        if gtf_index >= 0:
            i = gtf_index
            while i < len(gtf_junctions):
                chr2, left2, right2 = gtf_junctions[i]
                if chr2 > chr or \
                        left2 - left > relaxed_junction_dist or \
                        right2 - right > relaxed_junction_dist:
                    break
                # same shift on both ends => same intron length
                if abs(left - left2) <= relaxed_junction_dist and left - left2 == right - right2:
                    canonical = is_canonical_junction(chr_dic, gtf_junctions[i])
                    if left == left2 or not canonical:
                        is_gtf_junction = True
                        break
                i += 1
        result_junctions.append([junction_str, is_gtf_junction])
    # The read is a "gtf junction read" only if every junction it spans is
    # a known (gtf) junction.
    is_gtf_junction_read = False
    if len(result_junctions) > 0:
        is_gtf_junction_read = True
        for junction_str, is_gtf_junction in result_junctions:
            if not is_gtf_junction:
                is_gtf_junction_read = False
                break
    return result_junctions, len(result_junctions) > 0, is_gtf_junction_read


def is_junction_pair(chr_dic, gtf_junctions, chr, pos, cigar_str, mate_chr, mate_pos, mate_cigar_str):
    """Pair-level counterpart of is_junction_read: combine the junctions of
    both mates.  The pair is a "gtf junction pair" only when every mate that
    spans junctions spans only known ones."""
    junctions, junction_read, gtf_junction_read = is_junction_read(chr_dic, gtf_junctions, chr, pos, cigar_str)
    mate_junctions, mate_junction_read, mate_gtf_junction_read = is_junction_read(chr_dic, gtf_junctions, mate_chr, mate_pos, mate_cigar_str)
    junctions += mate_junctions
    junction_pair = len(junctions) > 0
    if junction_pair:
        gtf_junction_pair = True
        if junction_read and not gtf_junction_read:
            gtf_junction_pair = False
        if mate_junction_read and not mate_gtf_junction_read:
            gtf_junction_pair = False
    else:
        gtf_junction_pair = False
    return junctions, junction_pair, gtf_junction_pair


""" """
def getSNPs(chr_snps, left, right):
    """Return the SNPs from the (position-sorted) per-chromosome list that
    fall in [left, right), skipping SNPs that overlap the span of a previously
    accepted deletion.

    NOTE(review): the binary search uses `high = mid - 1` where the usual
    half-open invariant would use `high = mid`; the subsequent linear scan
    from `low` appears to compensate -- confirm on boundary cases."""
    low, high = 0, len(chr_snps)
    while low < high:
        mid = (low + high) / 2
        snpID, type, pos, data = chr_snps[mid]
        if pos < left:
            low = mid + 1
        else:
            high = mid - 1
    snps = []
    for i in xrange(low, len(chr_snps)):
        snp = chr_snps[i]
        snpID, type, pos, data = snp
        pos2 = pos
        if type == "deletion":
            # a deletion occupies [pos, pos + length]
            pos2 += data
        if pos2 >= right:
            break
        if pos >= left:
            if len(snps) > 0:
                # skip SNPs that start within the previous deletion's span
                _, prev_type, prev_pos, prev_data = snps[-1]
                assert prev_pos <= pos
                prev_pos2 = prev_pos
                if prev_type == "deletion":
                    prev_pos2 += prev_data
                if pos <= prev_pos2:
                    continue
            snps.append(snp)
    return snps


""" """
def check_snps(snps, check_type, ref_pos, read_seq):
    """Return True if `snps` (as produced by getSNPs) contains a SNP of
    `check_type` at reference position `ref_pos` whose data matches the read:
    - 'single':    data's base equals the read base
    - 'insertion': data equals the inserted read sequence
    - 'deletion':  data equals the deleted length (read_seq is the length)"""
    found = False
    for snp in snps:
        snp_type, snp_pos, snp_data = snp[1:4]
        if snp_type == check_type:
            if snp_type == 'single':
                if snp_pos == ref_pos and snp_data[0] == read_seq[0]:
                    found = True
                    break
            elif snp_type == 'insertion':
                if snp_pos == ref_pos and snp_data == read_seq:
                    found = True
                    break
            elif snp_type == 'deletion':
                # snp_data and read_seq are length of sequence deleted
                if snp_pos == ref_pos and int(snp_data) == int(read_seq):
                    found = True
                    break
    return found


def extract_reads_and_pairs(chr_dic, sam_filename, read_filename, pair_filename, unmapped_read_1_fq_name, unmapped_read_2_fq_name, snps_dict = None):
    """Scan a SAM file, recompute each alignment's mismatches (XM) and edit
    distance (NM) against the reference -- optionally forgiving differences
    that coincide with known SNPs -- and write per-read and per-pair records
    to `read_filename` and `pair_filename`.

    For aligners named 'hisat2' the XM/NM optional fields from the SAM record
    are used instead (adjusted for soft-clips); for 'vg' the outputs are
    sorted and de-duplicated.  Unmapped-read FASTQ output is currently
    disabled (commented out below)."""
    temp_read_filename, temp_pair_filename = read_filename + ".temp", pair_filename + ".temp"
    temp_read_file, temp_pair_file = open(temp_read_filename, "w"), open(temp_pair_filename, "w")
    unmapped_read_1_fq, unmapped_read_2_fq = open(unmapped_read_1_fq_name, "w"), open(unmapped_read_2_fq_name, "w")
    # aligner-specific behavior is keyed off the output file names
    hisat2 = read_filename.find("hisat2") != -1 or pair_filename.find("hisat2") != -1
    vg = read_filename.find("vg") != -1 or pair_filename.find("vg") != -1
    read_dic = {}
    prev_read_id, prev_read_seq = "", ""
    sam_file = open(sam_filename, "r")
    for line in sam_file:
        if line[0] == "@":
            continue
        fields = line[:-1].split()
        read_id, flag, chr, pos, mapQ, cigar_str, mate_chr, mate_pos, template_len, read_seq, read_qual = fields[:11]
        # skip hard-clipped alignments
        if 'H' in cigar_str:
            continue
        flag, pos, mate_pos = int(flag), int(pos), int(mate_pos)
        # secondary alignments may omit SEQ ('*'); reuse the previous record's
        if read_seq == "*" and prev_read_id == read_id:
            read_seq = prev_read_seq
        read_seq = read_seq.upper()
        if read_seq == "" or read_seq == "*":
            continue
        if flag & 0x04 != 0 or \
                chr == "*" or \
                cigar_str == "*":
            # unmapped; FASTQ dump is disabled (kept for reference)
            """
            if flag & 0x80 != 0:
                print >> unmapped_read_2_fq, "@%s\n%s\n+%s\n%s" % (read_id, read_seq, read_id, read_qual)
            else:
                print >> unmapped_read_1_fq, "@%s\n%s\n+%s\n%s" % (read_id, read_seq, read_id, read_qual)
            """
            continue
        if mate_chr == '=':
            mate_chr = chr
        # normalize read ids: strip "/1"-style suffixes and "seq." prefixes
        if len(read_id) >= 3 and read_id[-2] == "/":
            read_id = read_id[:-2]
        if read_id.find("seq.") == 0:
            read_id = read_id[4:]
        if read_id != prev_read_id:
            # pairing state is only kept within one read id (input assumed
            # grouped by read id)
            read_dic = {}
        # XM/NM as reported by HISAT2 itself (if applicable)
        HISAT2_XM, HISAT2_NM = 0, 0
        if hisat2:
            for field in fields[11:]:
                if field[:5] == "XM:i:":
                    HISAT2_XM = int(field[5:])
                elif field[:5] == "NM:i:":
                    HISAT2_NM = int(field[5:])
        prev_read_id = read_id
        prev_read_seq = read_seq
        if snps_dict != None and chr in snps_dict:
            chr_snps = snps_dict[chr]
        else:
            chr_snps = []
        snps = None  # lazily fetched on first mismatch/indel
        XM, gap = 0, 0
        read_pos, right_pos = 0, pos - 1,
        junction_read = False
        # Walk the CIGAR, comparing read to reference.
        cigars = cigar_re.findall(cigar_str)
        for i in range(len(cigars)):
            cigar = cigars[i]
            length = int(cigar[:-1])
            cigar_op = cigar[-1]
            if cigar_op == "S":
                if i != 0 and i != len(cigars) - 1:
                    print >> sys.stderr, "S is located at %dth out of %d %s" % (i+1, len(cigars), cigar_str)
            if cigar_op in "MS":
                # soft-clipped prefixes are compared against the reference
                # immediately before the alignment start
                ref_pos = right_pos
                if cigar_op == "S" and i == 0:
                    ref_seq = chr_dic[chr][right_pos-length:right_pos]
                    ref_pos = right_pos - length
                else:
                    ref_seq = chr_dic[chr][right_pos:right_pos+length]
                ref_seq = ref_seq.upper()
                if length == len(ref_seq):
                    for j in range(length):
                        if ref_seq[j] != "N" and read_seq[read_pos+j] != ref_seq[j]:
                            if snps_dict == None:
                                XM += 1
                            else:
                                if snps == None:
                                    snps = getSNPs(chr_snps, pos - 1, pos + len(read_seq))
                                found_snp = check_snps(snps, 'single', ref_pos + j, read_seq[read_pos + j])
                                if not found_snp:
                                    XM += 1
                            # HISAT2's own XM/NM exclude soft-clipped bases;
                            # add them back for a fair comparison
                            if hisat2 and cigar_op == "S":
                                HISAT2_XM += 1
                                HISAT2_NM += 1
                else:
                    # ran off the contig end: count the whole run as mismatches
                    XM += length
            if cigar_op in "I":
                if snps == None:
                    snps = getSNPs(chr_snps, pos - 1, pos + len(read_seq))
                found_snp = check_snps(snps, 'insertion', right_pos, read_seq[read_pos:read_pos + length])
                if not found_snp:
                    gap += length
            if cigar_op in "D":
                if snps == None:
                    snps = getSNPs(chr_snps, pos - 1, pos + len(read_seq))
                found_snp = check_snps(snps, 'deletion', right_pos, length)
                if not found_snp:
                    gap += length
            if cigar_op in "MND":
                right_pos += length
            if cigar_op in "MIS":
                read_pos += length
            if cigar_op == "N":
                junction_read = True
        NM = XM + gap
        if hisat2:
            XM, NM = HISAT2_XM, HISAT2_NM
        if NM < MAX_EDIT:
            print >> temp_read_file, "%s\t%d\t%s\t%s\t%s\tXM:i:%d\tNM:i:%d" % \
                (read_id, flag, chr, pos, cigar_str, XM, NM)
            # Try to pair this alignment with its mate seen earlier: the mate
            # was stored under its own "me" key; look it up via our partner key.
            found = False
            me = "%s\t%s\t%d" % (read_id, chr, pos)
            partner = "%s\t%s\t%d" % (read_id, mate_chr, mate_pos)
            if partner in read_dic:
                maps = read_dic[partner]
                for map in maps:
                    if map[0] == me:
                        mate_flag, mate_cigar_str, mate_XM, mate_NM = map[1:]
                        if mate_pos > pos:
                            # swap so the leftmost mate is printed first
                            flag, chr, pos, cigar_str, XM, NM, mate_flag, mate_chr_str, mate_pos, mate_cigar_str, mate_XM, mate_NM = \
                                mate_flag, mate_chr, mate_pos, mate_cigar_str, mate_XM, mate_NM, flag, chr, pos, cigar_str, XM, NM
                        print >> temp_pair_file, "%s\t%d\t%s\t%d\t%s\tXM:i:%d\tNM:i:%d\t%d\t%s\t%d\t%s\tXM:i:%d\tNM:i:%d" % \
                            (read_id, mate_flag, mate_chr, mate_pos, mate_cigar_str, mate_XM, mate_NM, flag, chr, pos, cigar_str, XM, NM)
                        found = True
                        break
            if not found:
                if not me in read_dic:
                    read_dic[me] = []
                read_dic[me].append([partner, flag, cigar_str, XM, NM])
    sam_file.close()
    temp_read_file.close()
    temp_pair_file.close()
    unmapped_read_1_fq.close()
    unmapped_read_2_fq.close()
    # vg may emit duplicate records; sort -u them away
    sort = False
    if vg:
        sort = True
    if sort:
        command = "sort %s | uniq > %s; rm %s" % (temp_read_filename, read_filename, temp_read_filename)
        os.system(command)
        command = "sort %s | uniq > %s; rm %s" % (temp_pair_filename, pair_filename, temp_pair_filename)
        os.system(command)
    else:
        command = "mv %s %s; mv %s %s" % (temp_read_filename, read_filename, temp_pair_filename, pair_filename)
        os.system(command)


def remove_redundant_junctions(junctions):
    """Given an iterable of junction strings, drop junctions that are merely
    small shifts (same chr, equal left/right shift, <= 10 bp) of the previous
    junction in sorted order; return the surviving set of junction strings."""
    temp_junctions = []
    for junction in junctions:
        temp_junctions.append(to_junction(junction))
    junctions = sorted(list(temp_junctions), cmp=junction_cmp)
    temp_junctions = []
    for can_junction in junctions:
        if len(temp_junctions) <= 0:
            temp_junctions.append(can_junction)
        else:
            chr, left, right = temp_junctions[-1]
            chr2, left2, right2 = can_junction
            if chr == chr2 and \
                    abs(left - left2) == abs(right - right2) and \
                    abs(left - left2) <= 10:
                # same intron, slightly shifted -- redundant
                continue
            temp_junctions.append(can_junction)
    junctions = set()
    for junction in temp_junctions:
        junctions.add(to_junction_str(junction))
    return junctions


def read_stat(read_filename, gtf_junctions, chr_dic = None, debug = False):
    """Aggregate single-end alignment statistics from a file written by
    extract_reads_and_pairs (input assumed grouped by read id).

    For each read only its best alignment (lowest NM; ties broken in favor of
    a junction alignment when the current best is not a gtf-junction one) is
    counted.  Returns a list indexed by edit distance, each entry
    [reads, junction reads, gtf junction reads, junctions, gtf junctions]."""
    read_stat = [[0, 0, 0] for i in range(MAX_EDIT)]
    temp_junctions = [set() for i in range(MAX_EDIT)]
    temp_gtf_junctions = [set() for i in range(MAX_EDIT)]
    alignment = []
    prev_read_id = ""
    read_file = open(read_filename, "r")
    for line in read_file:
        read_id, flag, chr, pos, cigar_str, XM, NM = line[:-1].split()
        flag, pos = int(flag), int(pos)
        # strip "XM:i:" / "NM:i:" prefixes
        XM, NM = int(XM[5:]), int(NM[5:])
        read_junctions, junction_read, gtf_junction_read = \
            is_junction_read(chr_dic, gtf_junctions, chr, pos, cigar_str)
        if junction_read:
            for junction_str, is_gtf_junction in read_junctions:
                if NM < len(temp_junctions):
                    temp_junctions[NM].add(junction_str)
                    if is_gtf_junction:
                        temp_gtf_junctions[NM].add(junction_str)
        if read_id != prev_read_id:
            # flush the best alignment of the previous read
            if prev_read_id != "":
                NM2, junction_read2, gtf_junction_read2 = alignment
                if NM2 < len(read_stat):
                    read_stat[NM2][0] += 1
                    if junction_read2:
                        read_stat[NM2][1] += 1
                    if gtf_junction_read2:
                        read_stat[NM2][2] += 1
                alignment = []
            prev_read_id = read_id
        if not alignment:
            alignment = [NM, junction_read, gtf_junction_read]
        elif alignment[0] > NM or \
                (alignment[0] == NM and not alignment[2] and junction_read):
            alignment = [NM, junction_read, gtf_junction_read]
    read_file.close()
    for i in range(len(read_stat)):
        temp_junctions[i] = remove_redundant_junctions(temp_junctions[i])
        temp_gtf_junctions[i] = remove_redundant_junctions(temp_gtf_junctions[i])
    for i in range(len(read_stat)):
        read_stat[i].append(len(temp_junctions[i]))
        read_stat[i].append(len(temp_gtf_junctions[i]))
    # flush the last read's best alignment
    if alignment:
        NM2, junction_read2, gtf_junction_read2 = alignment
        if NM2 < len(read_stat):
            read_stat[NM2][0] += 1
            if junction_read2:
                read_stat[NM2][1] += 1
            if gtf_junction_read2:
                read_stat[NM2][2] += 1
    return read_stat


def cal_read_len(cigar_str):
    """Return (read length, leftmost soft-clip length, rightmost soft-clip
    length) computed from a CIGAR string (M/I/S consume read bases)."""
    length = 0
    leftmost_softclip = 0
    rightmost_softclip = 0
    cigars = cigar_re.findall(cigar_str)
    for i in range(len(cigars)):
        cigar = cigars[i]
        cigar_length = int(cigar[:-1])
        cigar_op = cigar[-1]
        if cigar_op in "MIS":
            length += cigar_length
        if (i == 0) and (cigar_op == "S"):
            leftmost_softclip = cigar_length
        if (i == (len(cigars) - 1)) and (cigar_op == "S"):
            rightmost_softclip = cigar_length
    return length, leftmost_softclip, rightmost_softclip


def is_concordantly(read_id, flag, chr, pos, cigar_str, XM, NM, mate_flag, mate_chr, mate_pos, mate_cigar_str, mate_XM, mate_NM):
    """Return (is_concordant, segment_length) for a pair: same chromosome,
    forward/reverse orientation (left mate forward, right mate reverse), and
    fragment length (excluding the right mate's trailing soft-clip) at most
    1000 bp.  Assumes pos <= mate_pos (pairs are printed leftmost-first)."""
    concord_length = 1000
    segment_length = sys.maxint
    pairs = {}
    pairs[0] = [flag, chr, pos, cigar_str, XM, NM]
    pairs[1] = [mate_flag, mate_chr, mate_pos, mate_cigar_str, mate_XM, mate_NM]
    if chr != mate_chr:
        return False, segment_length
    # left mate must be forward (0x10 clear), right mate reverse (0x10 set)
    if (flag & 0x10 == 0x10) or (mate_flag & 0x10 == 0):
        return False, segment_length
    assert pos <= mate_pos
    left = pairs[0]
    right = pairs[1]
    left_start = left[2]
    left_len, _, _ = cal_read_len(left[3]) # cigar
    right_start = right[2]
    right_len, _, right_soft = cal_read_len(right[3])
    segment_length = (right_start + right_len) - left_start - right_soft
    assert segment_length >= 0
    if segment_length > concord_length:
        return False, segment_length
    return True, segment_length


def pair_stat(pair_filename, gtf_junctions, chr_dic):
    """Aggregate paired-end statistics from a pair file written by
    extract_reads_and_pairs, also splitting the input into `.con` /
    `.discon` files by concordance.

    Returns (pair_stat, dis_pair_stat): pair_stat is indexed by combined edit
    distance with entries [pairs, junction pairs, gtf junction pairs,
    concordant pairs, junctions, gtf junctions]; dis_pair_stat counts
    discordant pairs per edit distance."""
    # pair_stat = NM, junction_pair, gtf_junction, concordant_alignment]
    pair_stat = [[0, 0, 0, 0] for i in range(MAX_EDIT)]
    dis_pair_stat = [0 for i in range(MAX_EDIT)]
    temp_junctions = [set() for i in range(MAX_EDIT)]
    temp_gtf_junctions = [set() for i in range(MAX_EDIT)]
    alignment, dis_alignments = [], []
    prev_read_id = ""
    con_file = open(pair_filename + ".con", "w")
    discon_file = open(pair_filename + ".discon", "w")
    pair_file = open(pair_filename, "r")
    for line in pair_file:
        read_id, flag, chr, pos, cigar_str, XM, NM, mate_flag, mate_chr, mate_pos, mate_cigar_str, mate_XM, mate_NM = line[:-1].split()
        flag, pos, XM, NM, mate_flag, mate_pos, mate_XM, mate_NM = \
            int(flag), int(pos), int(XM[5:]), int(NM[5:]), int(mate_flag), int(mate_pos), int(mate_XM[5:]), int(mate_NM[5:])
        pair_XM = XM + mate_XM
        pair_NM = NM + mate_NM
        pair_junctions, junction_pair, gtf_junction_pair = \
            is_junction_pair(chr_dic, gtf_junctions, chr, pos, cigar_str, mate_chr, mate_pos, mate_cigar_str)
        # check concordantly
        concord_align, segment_len = is_concordantly(read_id, flag, chr, pos, cigar_str, XM, NM, mate_flag, mate_chr, mate_pos, mate_cigar_str, mate_XM, mate_NM)
        print >> (con_file if concord_align else discon_file), line.strip(), ('none', 'first')[(flag & 0x40 == 0x40)], ('none', 'last')[(mate_flag & 0x80 == 0x80)], segment_len
        if junction_pair:
            for junction_str, is_gtf_junction in pair_junctions:
                if pair_NM < len(temp_junctions):
                    temp_junctions[pair_NM].add(junction_str)
                    if is_gtf_junction:
                        temp_gtf_junctions[pair_NM].add(junction_str)
        if read_id != prev_read_id:
            # flush the best pair of the previous read id
            if prev_read_id != "":
                NM2, junction_read2, gtf_junction_read2, concord_align2 = alignment
                if NM2 < len(pair_stat):
                    pair_stat[NM2][0] += 1
                    if junction_read2:
                        pair_stat[NM2][1] += 1
                    if gtf_junction_read2:
                        pair_stat[NM2][2] += 1
                    if concord_align2:
                        pair_stat[NM2][3] += 1
                for NM2 in dis_alignments:
                    if NM2 < len(dis_pair_stat):
                        dis_pair_stat[NM2] += 1
                alignment = []
                # NOTE(review): likely a typo -- this assigns `dis_alignment`
                # (singular), so `dis_alignments` is never reset between read
                # ids; confirm intended behavior.
                dis_alignment = []
            prev_read_id = read_id
        if not alignment:
            alignment = [pair_NM, junction_pair, gtf_junction_pair, concord_align]
        elif alignment[0] > pair_NM or \
                (alignment[0] == pair_NM and not alignment[2] and junction_pair):
            alignment = [pair_NM, junction_pair, gtf_junction_pair, concord_align]
        # track the best (lowest-NM) discordant candidate for this read id
        if mate_chr != chr or ((flag & 0x10) != 0 or (mate_flag & 0x10) == 0):
            if len(dis_alignments) == 0:
                dis_alignments = [pair_NM]
            elif dis_alignments[0] > pair_NM:
                dis_alignments = [pair_NM]
    pair_file.close()
    con_file.close()
    discon_file.close()
    # process last line
    if alignment:
        NM2, junction_read2, gtf_junction_read2, concord_align2 = alignment
        if NM2 < len(pair_stat):
            pair_stat[NM2][0] += 1
            if junction_read2:
                pair_stat[NM2][1] += 1
            if gtf_junction_read2:
                pair_stat[NM2][2] += 1
            if concord_align2:
                pair_stat[NM2][3] += 1
    assert len(dis_alignments) <= 1
    for NM2 in dis_alignments:
        if NM2 < len(dis_pair_stat):
            dis_pair_stat[NM2] += 1
    for i in range(len(pair_stat)):
        temp_junctions[i] = remove_redundant_junctions(temp_junctions[i])
        temp_gtf_junctions[i] = remove_redundant_junctions(temp_gtf_junctions[i])
    for i in range(len(pair_stat)):
        pair_stat[i].append(len(temp_junctions[i]))
        pair_stat[i].append(len(temp_gtf_junctions[i]))
    return pair_stat, dis_pair_stat


def sql_execute(sql_db, sql_query):
    """Run one SQL statement against `sql_db` via the sqlite3 command-line
    shell (tab-separated output) and return its stdout minus the trailing
    newline."""
    sql_cmd = [
        "sqlite3", sql_db,
        "-separator", "\t",
        "%s;" % sql_query
        ]
    # print >> sys.stderr, sql_cmd
    sql_process = subprocess.Popen(sql_cmd, stdout=subprocess.PIPE)
    output = sql_process.communicate()[0][:-1]
    return output


def create_sql_db(sql_db):
    """Create the `Mappings` results table in a fresh sqlite3 database; do
    nothing (with a warning) if the database file already exists."""
    if os.path.exists(sql_db):
        print >> sys.stderr, sql_db, "already exists!"
        return
    columns = [
        ["id", "integer primary key autoincrement"],
        ["reads", "text"],
        ["genome", "text"],
        ["end_type", "text"],
        ["aligner", "text"],
        # NOTE(review): "test" is almost certainly a typo for "text"
        # (harmless in sqlite, which accepts any type name).
        ["version", "test"],
        ["use_annotation", "text"],
        ["edit_distance", "integer"],
        ["mapped_reads", "integer"],
        ["junction_reads", "integer"],
        ["gtf_junction_reads", "integer"],
        ["junctions", "integer"],
        ["gtf_junctions", "integer"],
        ["runtime", "real"],
        ["host", "text"],
        ["created", "text"],
        ["cmd", "text"]
        ]
    sql_create_table = "CREATE TABLE Mappings ("
    for i in range(len(columns)):
        name, type = columns[i]
        if i != 0:
            sql_create_table += ", "
        sql_create_table += ("%s %s" % (name, type))
    sql_create_table += ");"
    sql_execute(sql_db, sql_create_table)


def write_analysis_data(sql_db, database_name, paired):
    """Dump the most recent Mappings row per (aligner, edit distance) for the
    given end type into '<database_name>_<single|paired>.analysis'."""
    if not os.path.exists(sql_db):
        return
    if paired:
        paired = "paired"
    else:
        paired = "single"
    aligners = []
    sql_aligners = "SELECT aligner FROM Mappings WHERE end_type = '%s' GROUP BY aligner" % (paired)
    output = sql_execute(sql_db, sql_aligners)
    aligners = output.split()
    database_fname = database_name + "_" + paired + ".analysis"
    database_file = open(database_fname, "w")
    print >> database_file, "aligner\tuse_annotation\tend_type\tedit_distance\tmapped_reads\tjunction_reads\tgtf_junction_reads\tjunctions\tgtf_junctions\truntime"
    for aligner in aligners:
        for edit_distance in range(MAX_EDIT):
            # most recent row wins (ORDER BY created DESC LIMIT 1)
            sql_row = "SELECT aligner, use_annotation, end_type, edit_distance, mapped_reads, junction_reads, gtf_junction_reads, junctions, gtf_junctions, runtime FROM Mappings"
            sql_row += " WHERE reads = '%s' and aligner = '%s' and edit_distance = %d and end_type = '%s' ORDER BY created DESC LIMIT 1" % (database_name, aligner, edit_distance, paired)
            output = sql_execute(sql_db, sql_row)
            if output:
                print >> database_file, output
    database_file.close()


def calculate_read_cost(single_end, paired_end, test_aligners, fresh, runtime_only, verbose):
    """Top-level driver: run each configured aligner on the local 1.fq/2.fq
    data set, compute statistics and record them in analysis.db.
    (Continues below -- the remainder of this function is unchanged.)"""
    sql_db_name = "analysis.db"
    if not os.path.exists(sql_db_name):
        create_sql_db(sql_db_name)
    full_workdir = 
os.getcwd() workdir = full_workdir.split("/")[-1] num_cpus = multiprocessing.cpu_count() if num_cpus > 8: num_threads = min(8, num_cpus) desktop = False else: num_threads = min(3, num_cpus) desktop = True verbose = False sql_write = True is_large_file = False gz_file = False if os.path.exists("1.fq.gz"): gz_file = True if os.path.getsize("1.fq.gz") > (1024 * 1024 * 1024): is_large_file = True elif os.path.exists("1.fq"): gz_file = False if os.path.getsize("1.fq") > (2 * 1024 * 1024 * 1024): is_large_file = True else: assert(False) aligners = [ # ["hisat2", "", "", "", ""], # ["hisat2", "", "", "", "--sensitive"], # ["hisat2", "", "", "", "--very-sensitive"], # ["hisat2", "", "", "", "-k 50 --score-min C,-50,0"], # ["hisat2", "", "snp", "", ""], # ["hisat2", "", "snp", "", "--sensitive"], # ["hisat2", "", "snp", "", "-k 50"], # ["hisat2", "", "", "205", ""], # ["hisat2", "", "snp", "205", ""], # ["hisat2", "", "snp_tran", "205", ""], # ["hisat2", "", "tran", "", ""], # ["hisat2", "x1", "snp", "", ""], # ["hisat2", "x1", "", "", ""], # ["hisat2", "x2", "", "", ""], # ["hisat2", "", "tran", "", ""], # ["hisat2", "", "snp_tran", "204", ""], # ["hisat2", "", "snp_tran", "", ""], # ["hisat2", "", "", "210", ""], ["hisat2", "", "rep", "", ""], # ["hisat2", "", "rep", "", "--read-lengths 101"], # ["hisat2", "", "rep", "", "--sensitive"], # ["hisat2", "", "rep-100-300", "", ""], # ["hisat2", "", "rep-101-300", "", "--sensitive"], # ["hisat2", "", "rep-101-300", "", "-k 10 --score-min C,-50,0"], # ["hisat2", "", "rep-150-300", "", ""], # ["tophat2", "", "", "", ""], # ["bowtie", "", "", "", ""], ["bowtie2", "", "", "", ""], # ["bowtie2", "", "", "", "-k 10"], ["bwa", "mem", "", "", ""], # ["bwa", "mem", "", "", "-a"], # ["bwa", "sw", "", "", ""], # ["star", "", "", "", ""], # ["star", "x2", "", "", ""], # ["vg", "", "", "", ""], # ["vg", "", "", "", "-M 10"], # ["vg", "", "snp", "", ""], # ["vg", "", "snp", "", "-M 10"], # ["minimap2", "", "", "", ""], ] # sql_write = False 
verbose = True debug = False genome = "genome" cwd = os.getcwd() RNA = (cwd.find("RNA") != -1) chr_dic = read_genome("../../../data/" + genome + ".fa") snp_dic = read_snp("../../../data/" + genome + ".snp") gtf_junction_strs = extract_splice_sites("../../../data/" + genome + ".gtf") gene = "no" gtf_junctions = [] for junction_str in gtf_junction_strs: junction = to_junction(junction_str) gtf_junctions.append(junction) gtf_junctions = sorted(gtf_junctions, cmp=junction_cmp) print >> sys.stderr, "aligner\tuse_annotation\tend_type\tedit_distance\tmapped_reads\tjunction_reads\tgtf_junction_reads\tjunctions\tgtf_junctions\truntime" for paired in [False, True]: if not paired and not single_end: continue if paired and not paired_end: continue type_read1_fname = "1.fq" if gz_file: type_read1_fname += ".gz" if paired: type_read2_fname = "2.fq" if gz_file: type_read2_fname += ".gz" else: type_read2_fname = "" aligner_bin_base = "../../../../aligners/bin" def get_aligner_version(aligner): version = "" if aligner == "hisat2" or \ aligner == "hisat" or \ aligner == "bowtie" or \ aligner == "bowtie2": if version: cmd = ["%s/%s_%s/%s" % (aligner_bin_base, aligner, version, aligner)] else: cmd = ["%s/%s" % (aligner_bin_base, aligner)] cmd += ["--version"] cmd_process = subprocess.Popen(cmd, stdout=subprocess.PIPE) version = cmd_process.communicate()[0][:-1].split("\n")[0] version = version.split()[-1] elif aligner == "tophat2": cmd = ["%s/tophat" % (aligner_bin_base)] cmd += ["--version"] cmd_process = subprocess.Popen(cmd, stdout=subprocess.PIPE) version = cmd_process.communicate()[0][:-1].split()[-1] elif aligner in ["star", "starx2"]: version = "2.4.2a" elif aligner == "gsnap": cmd = ["%s/gsnap" % (aligner_bin_base)] cmd_process = subprocess.Popen(cmd, stderr=subprocess.PIPE) version = cmd_process.communicate()[1][:-1].split("\n")[0] version = version.split()[2] elif aligner == "bwa": if version: cmd = ["%s/bwa_%s/bwa" % (aligner_bin_base, version)] else: cmd = ["%s/bwa" % 
    # NOTE(review): this excerpt begins inside get_aligner_version(), whose
    # "def" line is above this chunk; the fragment below completes a
    # cmd = ["%s/<tool>" % ...] assignment for one aligner branch and then
    # parses the tool's version string out of its stderr/stdout banner.
                                       (aligner_bin_base)]
            cmd_process = subprocess.Popen(cmd, stderr=subprocess.PIPE)
            # Version string is on the 3rd stderr line, 2nd whitespace field.
            version = cmd_process.communicate()[1][:-1].split("\n")[2]
            version = version.split()[1]
        elif aligner == "vg":
            cmd = ["%s/vg" % (aligner_bin_base)]
            cmd_process = subprocess.Popen(cmd, stderr=subprocess.PIPE)
            # vg prints its banner on the 1st stderr line; field 6 is the version.
            version = cmd_process.communicate()[1][:-1].split("\n")[0]
            version = version.split()[5]
        elif aligner == "minimap2":
            cmd = ["%s/minimap2" % (aligner_bin_base)]
            cmd += ["--version"]
            # minimap2 --version writes the bare version to stdout.
            cmd_process = subprocess.Popen(cmd, stdout=subprocess.PIPE)
            version = cmd_process.communicate()[0][:-1].split("\n")[0]
        return version

    # Index directories live four levels up; a non-default genome name gets a
    # "_<genome>" suffix appended to every per-aligner index directory name.
    index_base = "../../../../indexes"
    index_add = ""
    if genome != "genome":
        index_add = "_" + genome

    def get_aligner_cmd(RNA, aligner, type, index_type, version, options, read1_fname, read2_fname, out_fname, cmd_idx = 0):
        """Build the full argv (as a list) to run one aligner under /usr/bin/time.

        RNA          -- True for spliced (RNA-seq) alignment, False for DNA.
        aligner      -- tool name: hisat2, hisat, tophat2, star, bowtie,
                        bowtie2, gsnap, bwa, vg, or minimap2.
        type         -- aligner-specific mode (e.g. "x1"/"x2" two-pass for
                        HISAT/HISAT2/STAR, "mem"/"aln"/"sw" for bwa).
        index_type   -- optional index flavor suffix (e.g. "tran", "snp").
        version      -- optional versioned binary/index suffix; "" for default.
        options      -- extra command-line options as one space-separated string.
        read1_fname, read2_fname -- input read files (read2 used when paired).
        out_fname    -- output SAM path (only some aligners take it implicitly).
        cmd_idx      -- pass number for two-pass ("x2") modes: 0 = first pass,
                        1 = second pass.

        Returns the command as a list of strings; asserts on unknown aligners.
        Relies on enclosing-scope variables: aligner_bin_base, osx_mode,
        num_threads, paired, desktop, index_base, index_add, genome.
        """
        # /usr/bin/time wraps every command so peak memory can be parsed later;
        # -l is the BSD (macOS) flag for resource usage.
        cmd = ["/usr/bin/time"]
        if osx_mode:
            cmd += ['-l']
        if aligner == "hisat2":
            if version:
                cmd += ["%s/hisat2_%s/hisat2" % (aligner_bin_base, version)]
            else:
                cmd += ["%s/hisat2" % (aligner_bin_base)]
            if num_threads > 1:
                cmd += ["-p", str(num_threads)]
            # cmd += ["-k", "5"]
            # cmd += ["--score-min", "C,-18"]

            # daehwan - for debugging purposes
            # cmd += ["--score-min", "C,-50"]
            # cmd += ["--pen-cansplice", "0"]
            # cmd += ["--pen-noncansplice", "12"]
            # cmd += ["--pen-intronlen", "G,-8,1"]
            # cmd += ["--metrics", "1",
            #         "--metrics-file", "metrics.out"]

            if version == "204":
                cmd += ["--sp", "2,1"]

            if not RNA:
                cmd += ["--no-spliced-alignment"]

            if type in ["x1", "x2"]:
                cmd += ["--no-temp-splicesite"]

            # DK - for debugging purposes
            # cmd += ["--dta"]
            """
            if index_type == "tran":
                cmd += ["--no-anchorstop"]
                cmd += ["-k", "100"]
            """

            if options != "":
                cmd += options.split(' ')

            # Two-pass mode: pass 0 writes novel splice sites, pass 1 reads them.
            if type == "x2":
                if cmd_idx == 0:
                    cmd += ["--novel-splicesite-outfile"]
                else:
                    cmd += ["--novel-splicesite-infile"]
                cmd += ["splicesites.txt"]

            # "--novel-splicesite-infile",
            # "../splicesites.txt",
            # "--rna-strandness",
            # "FR",

            if version:
                index_cmd = "%s/HISAT2_%s%s/" % (index_base, version, index_add) + genome
            else:
                index_cmd = "%s/HISAT2%s/" % (index_base, index_add) + genome
            if index_type:
                index_cmd += ("_" + index_type)
            cmd += [index_cmd]
            if paired:
                cmd += ["-1", read1_fname,
                        "-2", read2_fname]
            else:
                cmd += ["-U", read1_fname]
        elif aligner == "hisat":
            cmd += ["%s/hisat" % (aligner_bin_base)]
            if num_threads > 1:
                cmd += ["-p", str(num_threads)]
            # cmd += ["-k", "5"]
            # cmd += ["--score-min", "C,-18"]
            # NOTE(review): version is re-bound from str to int here and not
            # used again below in this branch — verify against the full file.
            if version != "":
                version = int(version)
            else:
                version = sys.maxint
            if not RNA:
                cmd += ["--no-spliced-alignment"]
            if type in ["x1", "x2"] or not RNA:
                cmd += ["--no-temp-splicesite"]
            """
            cmd += ["--rdg", "100,100",
                    "--rfg", "100,100"]
            """
            if type == "x2":
                if cmd_idx == 0:
                    cmd += ["--novel-splicesite-outfile"]
                else:
                    cmd += ["--novel-splicesite-infile"]
                cmd += ["splicesites.txt"]

            # "--novel-splicesite-infile",
            # "../splicesites.txt",
            # "--rna-strandness",
            # "FR",

            cmd += ["%s/HISAT%s/" % (index_base, index_add) + genome]
            if paired:
                cmd += ["-1", read1_fname,
                        "-2", read2_fname]
            else:
                cmd += [read1_fname]
        elif aligner == "tophat2":
            cmd += ["%s/tophat" % (aligner_bin_base)]
            if num_threads > 1:
                cmd += ["-p", str(num_threads)]
            cmd += ["--read-edit-dist", "3"]
            cmd += ["--no-sort-bam"]
            cmd += ["--read-realign-edit-dist", "0"]
            # NOTE(review): tophat2 reuses the HISAT index directory here.
            cmd += ["--keep-tmp",
                    "%s/HISAT%s/" % (index_base, index_add) + genome,
                    read1_fname]
            if paired:
                cmd += [read2_fname]
        elif aligner == "star":
            cmd += ["%s/STAR" % (aligner_bin_base)]
            if num_threads > 1:
                cmd += ["--runThreadN", str(num_threads)]
            # Second pass of two-pass STAR uses the freshly generated index
            # in the current directory (see the x2 handling in the main loop).
            if type == "x2" and cmd_idx == 1:
                cmd += ["--genomeDir", "."]
            else:
                cmd += ["--genomeDir", "%s/STAR%s" % (index_base, index_add)]
            if desktop:
                cmd += ["--genomeLoad", "NoSharedMemory"]
            else:
                cmd += ["--genomeLoad", "LoadAndKeep"]
            if type == "x2":
                if cmd_idx == 1:
                    cmd += ["--alignSJDBoverhangMin", "1"]
            cmd += ["--readFilesIn",
                    read1_fname]
            if paired:
                cmd += [read2_fname]
            if paired:
                cmd += ["--outFilterMismatchNmax", "6"]
            else:
                cmd += ["--outFilterMismatchNmax", "3"]
        elif aligner == "bowtie":
            cmd += ["%s/bowtie" % (aligner_bin_base)]
            if num_threads > 1:
                cmd += ["-p", str(num_threads)]
            cmd += ["--sam",
                    "-k", "10"]
            cmd += ["-n", "3"]
            if paired:
                cmd += ["-X", "500"]
            cmd += ["%s/Bowtie%s/" % (index_base, index_add) + genome]
            if paired:
                cmd += ["-1", read1_fname,
                        "-2", read2_fname]
            else:
                cmd += [read1_fname]
        elif aligner == "bowtie2":
            if version:
                cmd += ["%s/bowtie2_%s/bowtie2" % (aligner_bin_base, version)]
            else:
                cmd += ["%s/bowtie2" % (aligner_bin_base)]
            if num_threads > 1:
                cmd += ["-p", str(num_threads)]
            # cmd += ["-k", "10"]
            # cmd += ["--score-min", "C,-18"]
            cmd += ["-X", "1000"]
            if options:
                cmd += options.split(' ')
            # NOTE(review): "-x <path>" is passed as a single argv element
            # here (flag and value in one string) — confirm bowtie2 accepts
            # this when not run through a shell.
            if version:
                cmd += ["-x %s/Bowtie2_%s%s/" % (index_base, version, index_add) + genome]
            else:
                cmd += ["-x %s/Bowtie2%s/" % (index_base, index_add) + genome]
            if paired:
                cmd += ["-1", read1_fname,
                        "-2", read2_fname]
            else:
                cmd += [read1_fname]
        elif aligner == "gsnap":
            cmd += ["%s/gsnap" % (aligner_bin_base),
                    "-A",
                    "sam"]
            if num_threads > 1:
                cmd += ["-t", str(num_threads)]
            cmd += ["--max-mismatches=3",
                    "-D", "%s/GSNAP%s" % (index_base, index_add),
                    "-N", "1",
                    "-d", genome,
                    read1_fname]
            if paired:
                cmd += [read2_fname]
        elif aligner == "bwa":
            if version:
                cmd += ["%s/bwa_%s/bwa" % (aligner_bin_base, version)]
            else:
                cmd += ["%s/bwa" % (aligner_bin_base)]
            # type selects the bwa subcommand: "mem"/"aln" directly,
            # "sw" becomes the "bwasw" subcommand.
            if type in ["mem", "aln"]:
                cmd += [type]
            elif type == "sw":
                cmd += ["bwa" + type]
            if num_threads > 1:
                cmd += ["-t", str(num_threads)]
            if options:
                cmd += options.split(' ')
            if version:
                cmd += ["%s/BWA_%s%s/%s.fa" % (index_base, version, index_add, genome)]
            else:
                cmd += ["%s/BWA%s/%s.fa" % (index_base, index_add, genome)]
            cmd += [read1_fname]
            if paired:
                cmd += [read2_fname]
        elif aligner == "vg":
            # vg map -d 22 -t 6 -M 10 -f ../sim-1.fa -f ../sim-2.fa --surject-to sam > result.sam
            cmd += ["%s/vg" % (aligner_bin_base)]
            cmd += ["map"]
            cmd += ["-t", str(num_threads)]
            cmd += ["--surject-to", "sam"]
            index_cmd = "%s/VG%s/" % (index_base, index_add) + genome
            if index_type:
                index_cmd += ("_" + index_type)
            if options:
                cmd += options.split(' ')
            cmd += ["-d", index_cmd]
            cmd += ["-f", read1_fname]
            if paired:
                cmd += ["-f", read2_fname]
        elif aligner == "minimap2":
            # minimap2 -a -x sr 22.mmi sim_1.fa sim_2.fa > result.sam
            cmd += ["%s/minimap2" % (aligner_bin_base)]
            cmd += ["-a"]
            cmd += ["-x", "sr"]
            index_cmd = "%s/minimap2%s/" % (index_base, index_add) + genome
            if index_type:
                index_cmd += ("_" + index_type)
            index_cmd += ".mmi"
            cmd += [index_cmd]
            cmd += [read1_fname]
            if paired:
                cmd += [read2_fname]
        else:
            assert False
        return cmd

    # ------------------------------------------------------------------
    # Main benchmarking loop: for each configured aligner, run it in its
    # own working directory, time the run, parse memory usage, and (unless
    # runtime_only) evaluate the resulting SAM output.
    # ------------------------------------------------------------------
    for aligner, type, index_type, version, options in aligners:
        # If an explicit test list was given, skip aligners not on it.
        skip = False
        if len(test_aligners) > 0:
            skip = True
            for test_aligner in test_aligners:
                if aligner == test_aligner:
                    skip = False
        if skip:
            continue

        # Directory/report name encodes aligner, mode, version, index type,
        # and any extra options (with separators stripped).
        aligner_name = aligner + type + version
        if (aligner == "hisat2" or aligner == "vg") and index_type != "":
            aligner_name += ("_" + index_type)
        if options != "":
            option_name = options.replace(' ', '').replace('-', '').replace(',', '')
            aligner_name = aligner_name + '_' + option_name
        if paired:
            aligner_dir = aligner_name + "_paired"
        else:
            aligner_dir = aligner_name + "_single"
        if fresh and os.path.exists(aligner_dir):
            os.system("rm -rf %s" % aligner_dir)
        if not os.path.exists(aligner_dir):
            os.mkdir(aligner_dir)
        os.chdir(aligner_dir)

        out_fname = "accepted.sam"
        aligner_cmd = get_aligner_cmd(RNA, aligner, type, index_type, version, options, "../" + type_read1_fname, "../" + type_read2_fname, out_fname)
        duration = 0.1
        mem_usage = ''
        if not os.path.exists(out_fname):
            # Prepare small 100-read (400-line) FASTQ subsets used only to
            # warm up the aligner / OS cache before the timed run.
            if not os.path.exists("../one.fq") or not os.path.exists("../two.fq"):
                if gz_file:
                    os.system("gzip -cd ../1.fq.gz | head -400 > ../one.fq")
                    os.system("gzip -cd ../2.fq.gz | head -400 > ../two.fq")
                else:
                    os.system("head -400 ../1.fq > ../one.fq")
                    os.system("head -400 ../2.fq > ../two.fq")

            # dummy commands for caching index
            # NOTE(review): loading_time keeps only the LAST of the three
            # warm-up durations; it is later subtracted from the timed run.
            loading_time = 0
            if aligner not in ["tophat2"]:
                for i in range(3):
                    dummy_cmd = get_aligner_cmd(RNA, aligner, type, index_type, version, options, "../one.fq", "../two.fq", "/dev/null")
                    start_time = datetime.now()
                    if verbose:
                        print >> sys.stderr, start_time, "\t", " ".join(dummy_cmd)
                    if aligner in ["hisat2", "hisat", "bowtie", "bowtie2", "gsnap", "bwa"]:
                        proc = subprocess.Popen(dummy_cmd, stdout=open("/dev/null", "w"), stderr=subprocess.PIPE)
                    else:
                        proc = subprocess.Popen(dummy_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                    proc.communicate()
                    finish_time = datetime.now()
                    duration = finish_time - start_time
                    duration = duration.total_seconds()
                    if verbose:
                        print >> sys.stderr, finish_time, "duration:", duration
                    loading_time = duration

            # align all reads
            # Sweep the read files through the page cache before timing.
            if paired:
                sweep_read_cmd = "cat ../%s ../%s > /dev/null" % (type_read1_fname, type_read2_fname)
            else:
                sweep_read_cmd = "cat ../%s > /dev/null" % (type_read1_fname)
            print >> sys.stderr, datetime.now(), "\t", sweep_read_cmd
            os.system(sweep_read_cmd)
            skip_alignment = False
            if paired and aligner == "olego" and os.path.exists(out_fname + "1"):
                skip_alignment = True
            if not skip_alignment:
                aligner_cmd = get_aligner_cmd(RNA, aligner, type, index_type, version, options, "../" + type_read1_fname, "../" + type_read2_fname, out_fname)
                start_time = datetime.now()
                if verbose:
                    print >> sys.stderr, start_time, "\t", " ".join(aligner_cmd)
                # Aligners that write SAM to stdout are redirected into
                # out_fname; the rest write their own output files.
                if aligner in ["hisat2", "hisat", "bowtie", "bowtie2", "gsnap", "bwa", "vg", "minimap2"]:
                    proc = subprocess.Popen(aligner_cmd, stdout=open(out_fname, "w"), stderr=subprocess.PIPE)
                else:
                    proc = subprocess.Popen(aligner_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                # /usr/bin/time reports peak RSS on stderr; parse it out.
                _, mem_usage = proc.communicate()
                mem_usage = parse_mem_usage(mem_usage)
                finish_time = datetime.now()
                duration = finish_time - start_time
                # Subtract index-loading overhead measured by the warm-up run.
                duration = duration.total_seconds() - loading_time
                if duration < 0.1:
                    duration = 0.1
                if verbose:
                    print >> sys.stderr, finish_time, "duration:", duration
                    print >> sys.stderr, finish_time, "Memory Usage: %dMB" % (int(mem_usage) / 1024)
                if debug and aligner == "hisat" and type == "x1":
                    os.system("cat metrics.out")
                print >> sys.stderr, "\ttime: %.4f" % (duration)
                # break

            # Aligner-specific post-processing of the raw output.
            if aligner == "star" and type in ["", "gtf"]:
                os.system("mv Aligned.out.sam %s" % out_fname)
            elif aligner in ["hisat2", "hisat"] and type == "x2":
                # Second pass: feed the splice sites discovered in pass 1.
                aligner_cmd = get_aligner_cmd(RNA, aligner, type, index_type, version, options, "../" + type_read1_fname, "../" + type_read2_fname, out_fname, 1)
                if verbose:
                    print >> sys.stderr, start_time, "\t", " ".join(aligner_cmd)
                start_time = datetime.now()
                proc = subprocess.Popen(aligner_cmd, stdout=open(out_fname, "w"), stderr=subprocess.PIPE)
                proc.communicate()
                finish_time = datetime.now()
                duration += (finish_time - start_time).total_seconds()
                duration -= loading_time
                if duration < 0.1:
                    duration = 0.1
                if verbose:
                    print >> sys.stderr, finish_time, "duration:", duration
            elif aligner == "star" and type == "x2":
                # STAR two-pass: extract pass-1 junctions, rebuild a local
                # index including them, warm it up, then realign.
                assert os.path.exists("SJ.out.tab")
                os.system("awk 'BEGIN {OFS=\"\t\"; strChar[0]=\".\"; strChar[1]=\"+\"; strChar[2]=\"-\";} {if($5>0){print $1,$2,$3,strChar[$4]}}' SJ.out.tab > SJ.out.tab.Pass1.sjdb")
                for file in os.listdir("."):
                    if file in ["SJ.out.tab.Pass1.sjdb", "genome.fa"]:
                        continue
                    os.remove(file)
                star_index_cmd = "STAR --genomeDir ./ --runMode genomeGenerate --genomeFastaFiles ../../../../data/genome.fa --sjdbFileChrStartEnd SJ.out.tab.Pass1.sjdb --sjdbOverhang 100 --runThreadN %d" % (num_threads)
                print >> sys.stderr, "\t", datetime.now(), star_index_cmd
                os.system(star_index_cmd)
                # NOTE(review): dummy_cmd here is the last warm-up command
                # from the loop above; it still targets the shared index,
                # not the fresh local one — confirm this is intended.
                if verbose:
                    print >> sys.stderr, "\t", datetime.now(), " ".join(dummy_cmd)
                proc = subprocess.Popen(dummy_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                proc.communicate()
                if verbose:
                    print >> sys.stderr, "\t", datetime.now(), "finished"
                aligner_cmd = get_aligner_cmd(RNA, aligner, type, index_type, version, options, "../" + type_read1_fname, "../" + type_read2_fname, out_fname, 1)
                start_time = datetime.now()
                if verbose:
                    print >> sys.stderr, "\t", start_time, " ".join(aligner_cmd)
                proc = subprocess.Popen(aligner_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                proc.communicate()
                finish_time = datetime.now()
                duration += (finish_time - start_time).total_seconds()
                duration -= loading_time
                if duration < 0.1:
                    duration = 0.1
                if verbose:
                    print >> sys.stderr, "\t", finish_time, "finished:", duration
                os.system("mv Aligned.out.sam %s" % out_fname)
            elif aligner == "tophat2":
                os.system("samtools sort -n tophat_out/accepted_hits.bam accepted_hits; samtools view -h accepted_hits.bam > %s" % out_fname)
            elif aligner == "vg":
                # NOTE(review): index_name is computed here but never used.
                index_name = '%s/VG%s/' % (index_base, index_add) + genome
                if index_type:
                    index_name += ('_' + index_type)
            # Append this run's wall-clock result to a per-directory log.
            os.system("echo %s %s %f >> runtime" % (str(datetime.now()), aligner, duration))
            if aligner in ["star", "tophat2", "gsnap"]:
                os.system("tar cvzf %s.tar.gz %s &> /dev/null" % (out_fname, out_fname))

        if runtime_only:
            os.chdir("..")
            continue

        # ----------------------- evaluation phase -----------------------
        suffix = aligner
        read_sam, pair_sam = suffix + ".read.sam", suffix + ".pair.sam"
        unmapped_read_1_fq, unmapped_read_2_fq = suffix + ".unmapped.1.fq", suffix + ".unmapped.2.fq"
        if not os.path.exists(read_sam) or not os.path.exists(pair_sam):
            if index_type == 'snp':
                extract_reads_and_pairs(chr_dic, out_fname, read_sam, pair_sam, unmapped_read_1_fq, unmapped_read_2_fq, snp_dic)
            else:
                extract_reads_and_pairs(chr_dic, out_fname, read_sam, pair_sam, unmapped_read_1_fq, unmapped_read_2_fq)

        # FASTQ has 4 lines per read, so read count = line count / 4.
        out = ''
        if gz_file:
            out = subprocess.check_output("gzip -cd ../%s | wc -l" % type_read1_fname, shell=True)
        else:
            out = subprocess.check_output("wc -l ../%s" % type_read1_fname, shell=True)
        numreads = int(out.split()[0]) / 4

        # A ".done" file both marks completion and stores the summary line.
        done_filename = suffix + ".done"
        if not os.path.exists(done_filename):
            done_file = open(done_filename, "w")
            if paired:
                # NOTE(review): "sum" shadows the builtin; the loop variable
                # "i" is read after the loop (last index) in the format call.
                sum = [0, 0, 0, 0, 0, 0] # mappep_read, junction_read, gtf_junction_reads, concord_mapped_read, num_junctions, num_gtf_junctions
                dis_sum = 0
                stat, dis_stat = pair_stat(pair_sam, gtf_junctions, chr_dic)
                output = ""
                for i in range(len(stat)):
                    for j in range(len(sum)):
                        sum[j] += stat[i][j]
                    dis_sum += dis_stat[i]
                mapped_reads, junction_reads, gtf_junction_reads, concord_mapped_read, num_junctions, num_gtf_junctions = sum
                output += "%s\t%s\tpaired\t%d\t%d\t%.2f%%\t%d\t%d\t%d\t%d\t%f\t%d\t%d\t%.2f%%\n" % \
                    (aligner_name, gene, i, mapped_reads, float(mapped_reads) * 100.0 / numreads, junction_reads, gtf_junction_reads, num_junctions, num_gtf_junctions, duration, (numreads / max(1.0, duration)), concord_mapped_read, float(concord_mapped_read) * 100.0 / numreads)
                if sql_write and os.path.exists("../" + sql_db_name):
                    sql_insert = "INSERT INTO \"Mappings\" VALUES(NULL, '%s', '%s', '%s', '%s', '%s', '%s', %d, %d, %d, %d, %d, %d, %f, '%s', datetime('now', 'localtime'), '%s');" % \
                        (workdir, genome, "paired", aligner_name, get_aligner_version(aligner), "no", i, mapped_reads, junction_reads, gtf_junction_reads, num_junctions, num_gtf_junctions, duration, platform.node(), " ".join(aligner_cmd))
                    sql_execute("../" + sql_db_name, sql_insert)
                print >> sys.stderr, output,
                print >> done_file, output
            else:
                sum = [0, 0, 0, 0, 0]
                stat = read_stat(read_sam, gtf_junctions, chr_dic)
                output = ""
                for i in range(len(stat)):
                    for j in range(len(sum)):
                        sum[j] += stat[i][j]
                mapped_reads, junction_reads, gtf_junction_reads, num_junctions, num_gtf_junctions = sum
                output += "%s\t%s\tsingle\t%d\t%d\t%.2f%%\t%d\t%d\t%d\t%d\t%f\t%d\n" % \
                    (aligner_name, gene, i, mapped_reads, float(mapped_reads) * 100.0 / numreads, junction_reads, gtf_junction_reads, num_junctions, num_gtf_junctions, duration, (numreads / max(1.0, duration)))
                if sql_write and os.path.exists("../" + sql_db_name):
                    sql_insert = "INSERT INTO \"Mappings\" VALUES(NULL, '%s', '%s', '%s', '%s', '%s', '%s', %d, %d, %d, %d, %d, %d, %f, '%s', datetime('now', 'localtime'), '%s');" % \
                        (workdir, genome, "single", aligner_name, get_aligner_version(aligner), "no", i, mapped_reads, junction_reads, gtf_junction_reads, num_junctions, num_gtf_junctions, duration, platform.node(), " ".join(aligner_cmd))
                    sql_execute("../" + sql_db_name, sql_insert)
                print >> sys.stderr, output,
                print >> done_file, output
            done_file.close()

        os.chdir("..")

    # After all aligners have run, dump analysis data from the SQLite DB.
    if os.path.exists(sql_db_name):
        write_analysis_data(sql_db_name, workdir, paired)


if __name__ == "__main__":
    # Command-line driver: parse options, then run the benchmark.
    parser = ArgumentParser(
        description='test HISAT2, and compare HISAT2 with other popular aligners such as TopHat2, STAR, Bowtie1/2, GSNAP, BWA-mem, etc.')
    # NOTE(review): the dest names are crossed relative to the flag names
    # (--single-end clears 'paired_end', --paired-end clears 'single_end');
    # confirm against calculate_read_cost's parameter order below.
    parser.add_argument('--single-end',
                        dest='paired_end',
                        action='store_false',
                        help='run single-end only')
    parser.add_argument('--paired-end',
                        dest='single_end',
                        action='store_false',
                        help='run paired_end only')
    parser.add_argument('--aligner-list',
                        dest='aligner_list',
                        type=str,
                        default="",
                        help='comma-separated list of aligners (e.g. hisat2,bowtie2,bwa')
    parser.add_argument('--fresh',
                        dest='fresh',
                        action='store_true',
                        help='delete existing alignment related directories (e.g. hisat2_single)')
    parser.add_argument('--runtime-only',
                        dest='runtime_only',
                        action='store_true',
                        help='run programs without evaluation')
    parser.add_argument('-v', '--verbose',
                        dest='verbose',
                        action='store_true',
                        help='also print some statistics to stderr')

    args = parser.parse_args()
    # Drop empty entries so a default "" aligner list yields no aligners.
    aligners = []
    for aligner in args.aligner_list.split(','):
        if aligner == "":
            continue
        aligners.append(aligner)
    calculate_read_cost(args.single_end,
                        args.paired_end,
                        aligners,
                        args.fresh,
                        args.runtime_only,
                        args.verbose)