hisat-3n/evaluation/real/calculate_read_cost.py

1676 lines
62 KiB
Python
Raw Normal View History

2025-01-18 13:09:52 +00:00
#!/usr/bin/env python
import sys, os, subprocess, signal
import multiprocessing
import platform
import string
import re
from datetime import datetime, date, time
from collections import defaultdict
from argparse import ArgumentParser, FileType
osx_mode = False
if sys.platform == 'darwin':
osx_mode = True
MAX_EDIT = 21
signal.signal(signal.SIGPIPE, signal.SIG_DFL)
cigar_re = re.compile('\d+\w')
"""
"""
def parse_mem_usage(resource):
if osx_mode:
resource = resource.strip().split('\n')
for line in resource:
if line.find('maximum resident set size') != -1:
return int(line.split()[0]) / 1024
else:
resource = resource.split(' ')
for line in resource:
idx = line.find('maxresident')
if idx != -1:
return line[:idx]
return '0'
"""
"""
def reverse_complement(seq):
result = ""
for nt in seq:
base = nt
if nt == 'A':
base = 'T'
elif nt == 'a':
base = 't'
elif nt == 'C':
base = 'G'
elif nt == 'c':
base = 'g'
elif nt == 'G':
base = 'C'
elif nt == 'g':
base = 'c'
elif nt == 'T':
base = 'A'
elif nt == 't':
base = 'a'
result = base + result
return result
"""
"""
def read_genome(genome_filename):
chr_dic = {}
genome_file = open(genome_filename, "r")
chr_name, sequence = "", ""
for line in genome_file:
if line[0] == ">":
if chr_name and sequence:
chr_dic[chr_name] = sequence
chr_name = line[1:-1].split()[0]
sequence = ""
else:
sequence += line[:-1]
if chr_name and sequence:
chr_dic[chr_name] = sequence
genome_file.close()
print >> sys.stderr, "genome is loaded"
return chr_dic
"""
"""
def read_snp(snp_filename):
snps = defaultdict(list)
snp_file = open(snp_filename, 'r')
for line in snp_file:
line = line.strip()
if not line or line.startswith('#'):
continue
try:
snpID, type, chr, pos, data = line.split('\t')
except ValueError:
continue
assert type in ["single", "deletion", "insertion"]
if type == "deletion":
data = int(data)
snps[chr].append([snpID, type, int(pos), data])
print >> sys.stderr, "snp is loaded"
return snps
"""
"""
def extract_splice_sites(gtf_fname):
trans = {}
gtf_file = open(gtf_fname)
# Parse valid exon lines from the GTF file into a dict by transcript_id
for line in gtf_file:
line = line.strip()
if not line or line.startswith('#'):
continue
if '#' in line:
line = line.split('#')[0].strip()
try:
chrom, source, feature, left, right, score, \
strand, frame, values = line.split('\t')
except ValueError:
continue
left, right = int(left), int(right)
if feature != 'exon' or left >= right:
continue
values_dict = {}
for attr in values.split(';')[:-1]:
attr, _, val = attr.strip().partition(' ')
values_dict[attr] = val.strip('"')
if 'gene_id' not in values_dict or \
'transcript_id' not in values_dict:
continue
transcript_id = values_dict['transcript_id']
if transcript_id not in trans:
trans[transcript_id] = [chrom, strand, [[left, right]]]
else:
trans[transcript_id][2].append([left, right])
gtf_file.close()
# Sort exons and merge where separating introns are <=5 bps
for tran, [chrom, strand, exons] in trans.items():
exons.sort()
tmp_exons = [exons[0]]
for i in range(1, len(exons)):
if exons[i][0] - tmp_exons[-1][1] <= 5:
tmp_exons[-1][1] = exons[i][1]
else:
tmp_exons.append(exons[i])
trans[tran] = [chrom, strand, tmp_exons]
# Calculate and print the unique junctions
junctions = set()
for chrom, strand, exons in trans.values():
for i in range(1, len(exons)):
junctions.add(to_junction_str([chrom, exons[i-1][1], exons[i][0]]))
return junctions
def to_junction_str(junction):
return "%s-%d-%d" % (junction[0], junction[1], junction[2])
def to_junction(junction_str):
chr, left, right = junction_str.split("-")
return [chr, int(left), int(right)]
def junction_cmp(a, b):
if a[0] != b[0]:
if a[0] < b[0]:
return -1
else:
return 1
if a[1] != b[1]:
if a[1] < b[1]:
return -1
else:
return 1
if a[2] != b[2]:
if a[2] < b[2]:
return -1
else:
return 1
return 0
# chr and pos are assumed to be integers
def get_junctions(chr, pos, cigar_str, min_anchor_len = 0, read_len = 100):
junctions = []
right_pos = pos
cigars = cigar_re.findall(cigar_str)
cigars = [[int(cigars[i][:-1]), cigars[i][-1]] for i in range(len(cigars))]
left_anchor_lens = []
cur_left_anchor_len = 0
for i in range(len(cigars)):
length, cigar_op = cigars[i]
if cigar_op in "MI":
cur_left_anchor_len += length
elif cigar_op == "N":
assert cur_left_anchor_len > 0
left_anchor_lens.append(cur_left_anchor_len)
cur_left_anchor_len = 0
for i in range(len(cigars)):
length, cigar_op = cigars[i]
if cigar_op == "N":
left, right = right_pos - 1, right_pos + length
if i > 0 and cigars[i-1][1] in "ID":
if cigars[i-1][1] == "I":
left += cigars[i-1][0]
else:
left -= cigars[i-1][0]
if i + 1 < len(cigars) and cigars[i+1][1] in "ID":
if cigars[i+1][1] == "I":
right -= cigars[i+1][0]
else:
right += cigars[i+1][0]
junction_idx = len(junctions)
assert junction_idx < len(left_anchor_lens)
left_anchor_len = left_anchor_lens[junction_idx]
assert left_anchor_len > 0 and left_anchor_len < read_len
right_anchor_len = read_len - left_anchor_len
if left_anchor_len >= min_anchor_len and right_anchor_len >= min_anchor_len:
junctions.append([chr, left, right])
if cigar_op in "MND":
right_pos += length
return junctions
def get_right(pos, cigars):
right_pos = pos
cigars = cigar_re.findall(cigars)
for cigar in cigars:
length = int(cigar[:-1])
cigar_op = cigar[-1]
if cigar_op in "MDN":
right_pos += length
return right_pos
def get_cigar_chars(cigars):
cigars = cigar_re.findall(cigars)
cigar_chars = ""
for cigar in cigars:
cigar_op = cigar[-1]
cigar_chars += cigar_op
return cigar_chars
def get_cigar_chars_MN(cigars):
cigars = cigar_re.findall(cigars)
cigar_chars = ""
for cigar in cigars:
cigar_op = cigar[-1]
if cigar_op in "MN":
if cigar_chars == "" or cigar_chars[-1] != cigar_op:
cigar_chars += cigar_op
return cigar_chars
def is_non_canonical_junction_read(chr_dic, chr, left, cigars, canonical_junctions = [["GT", "AG"], ["GC", "AG"], ["AT", "AC"]]):
pos = left
for cigar in cigar_re.findall(cigars):
cigar_op = cigar[-1]
cigar_len = int(cigar[:-1])
if cigar_op in 'MD':
pos += cigar_len
elif cigar_op == 'N':
right = pos + cigar_len
donor = chr_dic[chr][pos-1:pos+1]
acceptor = chr_dic[chr][right-3:right-1]
rev_donor = reverse_complement(acceptor)
rev_acceptor = reverse_complement(donor)
# print donor, acceptor
# print rev_donor, rev_acceptor
if [donor, acceptor] not in canonical_junctions and [rev_donor, rev_acceptor] not in canonical_junctions:
return True
pos = right
return False
def is_canonical_junction(chr_dic, junction):
chr, left, right = junction
donor = chr_dic[chr][left:left+2]
acceptor = chr_dic[chr][right-3:right-1]
rev_donor = reverse_complement(acceptor)
rev_acceptor = reverse_complement(donor)
if (donor == "GT" and acceptor == "AG") or \
(rev_donor == "GT" and rev_acceptor == "AG"):
return True
return False
def is_junction_read(chr_dic, gtf_junctions, chr, pos, cigar_str):
result_junctions = []
junctions = get_junctions(chr, pos, cigar_str, 0, 101)
for junction in junctions:
junction_str = to_junction_str(junction)
is_gtf_junction = False
def find_in_gtf_junctions(gtf_junctions, junction):
l, u = 0, len(gtf_junctions)
while l < u:
m = (l + u) / 2
assert m >= 0 and m < len(gtf_junctions)
cmp_result = junction_cmp(junction, gtf_junctions[m])
if cmp_result == 0:
return m
elif cmp_result < 0:
u = m
else:
l = m + 1
return l
# allow small (<= 5bp) discrepancy for non-canonical splice sites.
relaxed_junction_dist = 5
chr, left, right = junction
gtf_index = find_in_gtf_junctions(gtf_junctions, [chr, left - relaxed_junction_dist, right - relaxed_junction_dist])
if gtf_index >= 0:
i = gtf_index
while i < len(gtf_junctions):
chr2, left2, right2 = gtf_junctions[i]
if chr2 > chr or \
left2 - left > relaxed_junction_dist or \
right2 - right > relaxed_junction_dist:
break
if abs(left - left2) <= relaxed_junction_dist and left - left2 == right - right2:
canonical = is_canonical_junction(chr_dic, gtf_junctions[i])
if left == left2 or not canonical:
is_gtf_junction = True
break
i += 1
result_junctions.append([junction_str, is_gtf_junction])
is_gtf_junction_read = False
if len(result_junctions) > 0:
is_gtf_junction_read = True
for junction_str, is_gtf_junction in result_junctions:
if not is_gtf_junction:
is_gtf_junction_read = False
break
return result_junctions, len(result_junctions) > 0, is_gtf_junction_read
def is_junction_pair(chr_dic, gtf_junctions, chr, pos, cigar_str, mate_chr, mate_pos, mate_cigar_str):
junctions, junction_read, gtf_junction_read = is_junction_read(chr_dic, gtf_junctions, chr, pos, cigar_str)
mate_junctions, mate_junction_read, mate_gtf_junction_read = is_junction_read(chr_dic, gtf_junctions, mate_chr, mate_pos, mate_cigar_str)
junctions += mate_junctions
junction_pair = len(junctions) > 0
if junction_pair:
gtf_junction_pair = True
if junction_read and not gtf_junction_read:
gtf_junction_pair = False
if mate_junction_read and not mate_gtf_junction_read:
gtf_junction_pair = False
else:
gtf_junction_pair = False
return junctions, junction_pair, gtf_junction_pair
"""
"""
def getSNPs(chr_snps, left, right):
low, high = 0, len(chr_snps)
while low < high:
mid = (low + high) / 2
snpID, type, pos, data = chr_snps[mid]
if pos < left:
low = mid + 1
else:
high = mid - 1
snps = []
for i in xrange(low, len(chr_snps)):
snp = chr_snps[i]
snpID, type, pos, data = snp
pos2 = pos
if type == "deletion":
pos2 += data
if pos2 >= right:
break
if pos >= left:
if len(snps) > 0:
_, prev_type, prev_pos, prev_data = snps[-1]
assert prev_pos <= pos
prev_pos2 = prev_pos
if prev_type == "deletion":
prev_pos2 += prev_data
if pos <= prev_pos2:
continue
snps.append(snp)
return snps
"""
"""
def check_snps(snps, check_type, ref_pos, read_seq):
found = False
for snp in snps:
snp_type, snp_pos, snp_data = snp[1:4]
if snp_type == check_type:
if snp_type == 'single':
if snp_pos == ref_pos and snp_data[0] == read_seq[0]:
found = True
break
elif snp_type == 'insertion':
if snp_pos == ref_pos and snp_data == read_seq:
found = True
break
elif snp_type == 'deletion':
# snp_data and read_seq are length of sequence deleted
if snp_pos == ref_pos and int(snp_data) == int(read_seq):
found = True
break
return found
def extract_reads_and_pairs(chr_dic, sam_filename, read_filename, pair_filename, unmapped_read_1_fq_name, unmapped_read_2_fq_name, snps_dict = None):
temp_read_filename, temp_pair_filename = read_filename + ".temp", pair_filename + ".temp"
temp_read_file, temp_pair_file = open(temp_read_filename, "w"), open(temp_pair_filename, "w")
unmapped_read_1_fq, unmapped_read_2_fq = open(unmapped_read_1_fq_name, "w"), open(unmapped_read_2_fq_name, "w")
hisat2 = read_filename.find("hisat2") != -1 or pair_filename.find("hisat2") != -1
vg = read_filename.find("vg") != -1 or pair_filename.find("vg") != -1
read_dic = {}
prev_read_id, prev_read_seq = "", ""
sam_file = open(sam_filename, "r")
for line in sam_file:
if line[0] == "@":
continue
fields = line[:-1].split()
read_id, flag, chr, pos, mapQ, cigar_str, mate_chr, mate_pos, template_len, read_seq, read_qual = fields[:11]
if 'H' in cigar_str:
continue
flag, pos, mate_pos = int(flag), int(pos), int(mate_pos)
if read_seq == "*" and prev_read_id == read_id:
read_seq = prev_read_seq
read_seq = read_seq.upper()
if read_seq == "" or read_seq == "*":
continue
if flag & 0x04 != 0 or \
chr == "*" or \
cigar_str == "*":
"""
if flag & 0x80 != 0:
print >> unmapped_read_2_fq, "@%s\n%s\n+%s\n%s" % (read_id, read_seq, read_id, read_qual)
else:
print >> unmapped_read_1_fq, "@%s\n%s\n+%s\n%s" % (read_id, read_seq, read_id, read_qual)
"""
continue
if mate_chr == '=':
mate_chr = chr
if len(read_id) >= 3 and read_id[-2] == "/":
read_id = read_id[:-2]
if read_id.find("seq.") == 0:
read_id = read_id[4:]
if read_id != prev_read_id:
read_dic = {}
HISAT2_XM, HISAT2_NM = 0, 0
if hisat2:
for field in fields[11:]:
if field[:5] == "XM:i:":
HISAT2_XM = int(field[5:])
elif field[:5] == "NM:i:":
HISAT2_NM = int(field[5:])
prev_read_id = read_id
prev_read_seq = read_seq
if snps_dict != None and chr in snps_dict:
chr_snps = snps_dict[chr]
else:
chr_snps = []
snps = None
XM, gap = 0, 0
read_pos, right_pos = 0, pos - 1,
junction_read = False
cigars = cigar_re.findall(cigar_str)
for i in range(len(cigars)):
cigar = cigars[i]
length = int(cigar[:-1])
cigar_op = cigar[-1]
if cigar_op == "S":
if i != 0 and i != len(cigars) - 1:
print >> sys.stderr, "S is located at %dth out of %d %s" % (i+1, len(cigars), cigar_str)
if cigar_op in "MS":
ref_pos = right_pos
if cigar_op == "S" and i == 0:
ref_seq = chr_dic[chr][right_pos-length:right_pos]
ref_pos = right_pos - length
else:
ref_seq = chr_dic[chr][right_pos:right_pos+length]
ref_seq = ref_seq.upper()
if length == len(ref_seq):
for j in range(length):
if ref_seq[j] != "N" and read_seq[read_pos+j] != ref_seq[j]:
if snps_dict == None:
XM += 1
else:
if snps == None:
snps = getSNPs(chr_snps, pos - 1, pos + len(read_seq))
found_snp = check_snps(snps, 'single', ref_pos + j, read_seq[read_pos + j])
if not found_snp:
XM += 1
if hisat2 and cigar_op == "S":
HISAT2_XM += 1
HISAT2_NM += 1
else:
XM += length
if cigar_op in "I":
if snps == None:
snps = getSNPs(chr_snps, pos - 1, pos + len(read_seq))
found_snp = check_snps(snps, 'insertion', right_pos, read_seq[read_pos:read_pos + length])
if not found_snp:
gap += length
if cigar_op in "D":
if snps == None:
snps = getSNPs(chr_snps, pos - 1, pos + len(read_seq))
found_snp = check_snps(snps, 'deletion', right_pos, length)
if not found_snp:
gap += length
if cigar_op in "MND":
right_pos += length
if cigar_op in "MIS":
read_pos += length
if cigar_op == "N":
junction_read = True
NM = XM + gap
if hisat2:
XM, NM = HISAT2_XM, HISAT2_NM
if NM < MAX_EDIT:
print >> temp_read_file, "%s\t%d\t%s\t%s\t%s\tXM:i:%d\tNM:i:%d" % \
(read_id, flag, chr, pos, cigar_str, XM, NM)
found = False
me = "%s\t%s\t%d" % (read_id, chr, pos)
partner = "%s\t%s\t%d" % (read_id, mate_chr, mate_pos)
if partner in read_dic:
maps = read_dic[partner]
for map in maps:
if map[0] == me:
mate_flag, mate_cigar_str, mate_XM, mate_NM = map[1:]
if mate_pos > pos:
flag, chr, pos, cigar_str, XM, NM, mate_flag, mate_chr_str, mate_pos, mate_cigar_str, mate_XM, mate_NM = \
mate_flag, mate_chr, mate_pos, mate_cigar_str, mate_XM, mate_NM, flag, chr, pos, cigar_str, XM, NM
print >> temp_pair_file, "%s\t%d\t%s\t%d\t%s\tXM:i:%d\tNM:i:%d\t%d\t%s\t%d\t%s\tXM:i:%d\tNM:i:%d" % \
(read_id, mate_flag, mate_chr, mate_pos, mate_cigar_str, mate_XM, mate_NM, flag, chr, pos, cigar_str, XM, NM)
found = True
break
if not found:
if not me in read_dic:
read_dic[me] = []
read_dic[me].append([partner, flag, cigar_str, XM, NM])
sam_file.close()
temp_read_file.close()
temp_pair_file.close()
unmapped_read_1_fq.close()
unmapped_read_2_fq.close()
sort = False
if vg:
sort = True
if sort:
command = "sort %s | uniq > %s; rm %s" % (temp_read_filename, read_filename, temp_read_filename)
os.system(command)
command = "sort %s | uniq > %s; rm %s" % (temp_pair_filename, pair_filename, temp_pair_filename)
os.system(command)
else:
command = "mv %s %s; mv %s %s" % (temp_read_filename, read_filename, temp_pair_filename, pair_filename)
os.system(command)
def remove_redundant_junctions(junctions):
temp_junctions = []
for junction in junctions:
temp_junctions.append(to_junction(junction))
junctions = sorted(list(temp_junctions), cmp=junction_cmp)
temp_junctions = []
for can_junction in junctions:
if len(temp_junctions) <= 0:
temp_junctions.append(can_junction)
else:
chr, left, right = temp_junctions[-1]
chr2, left2, right2 = can_junction
if chr == chr2 and \
abs(left - left2) == abs(right - right2) and \
abs(left - left2) <= 10:
continue
temp_junctions.append(can_junction)
junctions = set()
for junction in temp_junctions:
junctions.add(to_junction_str(junction))
return junctions
def read_stat(read_filename, gtf_junctions, chr_dic = None, debug = False):
read_stat = [[0, 0, 0] for i in range(MAX_EDIT)]
temp_junctions = [set() for i in range(MAX_EDIT)]
temp_gtf_junctions = [set() for i in range(MAX_EDIT)]
alignment = []
prev_read_id = ""
read_file = open(read_filename, "r")
for line in read_file:
read_id, flag, chr, pos, cigar_str, XM, NM = line[:-1].split()
flag, pos = int(flag), int(pos)
XM, NM = int(XM[5:]), int(NM[5:])
read_junctions, junction_read, gtf_junction_read = \
is_junction_read(chr_dic, gtf_junctions, chr, pos, cigar_str)
if junction_read:
for junction_str, is_gtf_junction in read_junctions:
if NM < len(temp_junctions):
temp_junctions[NM].add(junction_str)
if is_gtf_junction:
temp_gtf_junctions[NM].add(junction_str)
if read_id != prev_read_id:
if prev_read_id != "":
NM2, junction_read2, gtf_junction_read2 = alignment
if NM2 < len(read_stat):
read_stat[NM2][0] += 1
if junction_read2:
read_stat[NM2][1] += 1
if gtf_junction_read2:
read_stat[NM2][2] += 1
alignment = []
prev_read_id = read_id
if not alignment:
alignment = [NM, junction_read, gtf_junction_read]
elif alignment[0] > NM or \
(alignment[0] == NM and not alignment[2] and junction_read):
alignment = [NM, junction_read, gtf_junction_read]
read_file.close()
for i in range(len(read_stat)):
temp_junctions[i] = remove_redundant_junctions(temp_junctions[i])
temp_gtf_junctions[i] = remove_redundant_junctions(temp_gtf_junctions[i])
for i in range(len(read_stat)):
read_stat[i].append(len(temp_junctions[i]))
read_stat[i].append(len(temp_gtf_junctions[i]))
if alignment:
NM2, junction_read2, gtf_junction_read2 = alignment
if NM2 < len(read_stat):
read_stat[NM2][0] += 1
if junction_read2:
read_stat[NM2][1] += 1
if gtf_junction_read2:
read_stat[NM2][2] += 1
return read_stat
def cal_read_len(cigar_str):
length = 0
leftmost_softclip = 0
rightmost_softclip = 0
cigars = cigar_re.findall(cigar_str)
for i in range(len(cigars)):
cigar = cigars[i]
cigar_length = int(cigar[:-1])
cigar_op = cigar[-1]
if cigar_op in "MIS":
length += cigar_length
if (i == 0) and (cigar_op == "S"):
leftmost_softclip = cigar_length
if (i == (len(cigars) - 1)) and (cigar_op == "S"):
rightmost_softclip = cigar_length
return length, leftmost_softclip, rightmost_softclip
def is_concordantly(read_id, flag, chr, pos, cigar_str, XM, NM, mate_flag, mate_chr, mate_pos, mate_cigar_str, mate_XM, mate_NM):
concord_length = 1000
segment_length = sys.maxint
pairs = {}
pairs[0] = [flag, chr, pos, cigar_str, XM, NM]
pairs[1] = [mate_flag, mate_chr, mate_pos, mate_cigar_str, mate_XM, mate_NM]
if chr != mate_chr:
return False, segment_length
if (flag & 0x10 == 0x10) or (mate_flag & 0x10 == 0):
return False, segment_length
assert pos <= mate_pos
left = pairs[0]
right = pairs[1]
left_start = left[2]
left_len, _, _ = cal_read_len(left[3]) # cigar
right_start = right[2]
right_len, _, right_soft = cal_read_len(right[3])
segment_length = (right_start + right_len) - left_start - right_soft
assert segment_length >= 0
if segment_length > concord_length:
return False, segment_length
return True, segment_length
def pair_stat(pair_filename, gtf_junctions, chr_dic):
# pair_stat = NM, junction_pair, gtf_junction, concordant_alignment]
pair_stat = [[0, 0, 0, 0] for i in range(MAX_EDIT)]
dis_pair_stat = [0 for i in range(MAX_EDIT)]
temp_junctions = [set() for i in range(MAX_EDIT)]
temp_gtf_junctions = [set() for i in range(MAX_EDIT)]
alignment, dis_alignments = [], []
prev_read_id = ""
con_file = open(pair_filename + ".con", "w")
discon_file = open(pair_filename + ".discon", "w")
pair_file = open(pair_filename, "r")
for line in pair_file:
read_id, flag, chr, pos, cigar_str, XM, NM, mate_flag, mate_chr, mate_pos, mate_cigar_str, mate_XM, mate_NM = line[:-1].split()
flag, pos, XM, NM, mate_flag, mate_pos, mate_XM, mate_NM = \
int(flag), int(pos), int(XM[5:]), int(NM[5:]), int(mate_flag), int(mate_pos), int(mate_XM[5:]), int(mate_NM[5:])
pair_XM = XM + mate_XM
pair_NM = NM + mate_NM
pair_junctions, junction_pair, gtf_junction_pair = \
is_junction_pair(chr_dic, gtf_junctions, chr, pos, cigar_str, mate_chr, mate_pos, mate_cigar_str)
# check concordantly
concord_align, segment_len = is_concordantly(read_id, flag, chr, pos, cigar_str, XM, NM, mate_flag, mate_chr, mate_pos, mate_cigar_str, mate_XM, mate_NM)
print >> (con_file if concord_align else discon_file), line.strip(), ('none', 'first')[(flag & 0x40 == 0x40)], ('none', 'last')[(mate_flag & 0x80 == 0x80)], segment_len
if junction_pair:
for junction_str, is_gtf_junction in pair_junctions:
if pair_NM < len(temp_junctions):
temp_junctions[pair_NM].add(junction_str)
if is_gtf_junction:
temp_gtf_junctions[pair_NM].add(junction_str)
if read_id != prev_read_id:
if prev_read_id != "":
NM2, junction_read2, gtf_junction_read2, concord_align2 = alignment
if NM2 < len(pair_stat):
pair_stat[NM2][0] += 1
if junction_read2:
pair_stat[NM2][1] += 1
if gtf_junction_read2:
pair_stat[NM2][2] += 1
if concord_align2:
pair_stat[NM2][3] += 1
for NM2 in dis_alignments:
if NM2 < len(dis_pair_stat):
dis_pair_stat[NM2] += 1
alignment = []
dis_alignment = []
prev_read_id = read_id
if not alignment:
alignment = [pair_NM, junction_pair, gtf_junction_pair, concord_align]
elif alignment[0] > pair_NM or \
(alignment[0] == pair_NM and not alignment[2] and junction_pair):
alignment = [pair_NM, junction_pair, gtf_junction_pair, concord_align]
if mate_chr != chr or ((flag & 0x10) != 0 or (mate_flag & 0x10) == 0):
if len(dis_alignments) == 0:
dis_alignments = [pair_NM]
elif dis_alignments[0] > pair_NM:
dis_alignments = [pair_NM]
pair_file.close()
con_file.close()
discon_file.close()
# process last line
if alignment:
NM2, junction_read2, gtf_junction_read2, concord_align2 = alignment
if NM2 < len(pair_stat):
pair_stat[NM2][0] += 1
if junction_read2:
pair_stat[NM2][1] += 1
if gtf_junction_read2:
pair_stat[NM2][2] += 1
if concord_align2:
pair_stat[NM2][3] += 1
assert len(dis_alignments) <= 1
for NM2 in dis_alignments:
if NM2 < len(dis_pair_stat):
dis_pair_stat[NM2] += 1
for i in range(len(pair_stat)):
temp_junctions[i] = remove_redundant_junctions(temp_junctions[i])
temp_gtf_junctions[i] = remove_redundant_junctions(temp_gtf_junctions[i])
for i in range(len(pair_stat)):
pair_stat[i].append(len(temp_junctions[i]))
pair_stat[i].append(len(temp_gtf_junctions[i]))
return pair_stat, dis_pair_stat
def sql_execute(sql_db, sql_query):
sql_cmd = [
"sqlite3", sql_db,
"-separator", "\t",
"%s;" % sql_query
]
# print >> sys.stderr, sql_cmd
sql_process = subprocess.Popen(sql_cmd, stdout=subprocess.PIPE)
output = sql_process.communicate()[0][:-1]
return output
def create_sql_db(sql_db):
if os.path.exists(sql_db):
print >> sys.stderr, sql_db, "already exists!"
return
columns = [
["id", "integer primary key autoincrement"],
["reads", "text"],
["genome", "text"],
["end_type", "text"],
["aligner", "text"],
["version", "test"],
["use_annotation", "text"],
["edit_distance", "integer"],
["mapped_reads", "integer"],
["junction_reads", "integer"],
["gtf_junction_reads", "integer"],
["junctions", "integer"],
["gtf_junctions", "integer"],
["runtime", "real"],
["host", "text"],
["created", "text"],
["cmd", "text"]
]
sql_create_table = "CREATE TABLE Mappings ("
for i in range(len(columns)):
name, type = columns[i]
if i != 0:
sql_create_table += ", "
sql_create_table += ("%s %s" % (name, type))
sql_create_table += ");"
sql_execute(sql_db, sql_create_table)
def write_analysis_data(sql_db, database_name, paired):
if not os.path.exists(sql_db):
return
if paired:
paired = "paired"
else:
paired = "single"
aligners = []
sql_aligners = "SELECT aligner FROM Mappings WHERE end_type = '%s' GROUP BY aligner" % (paired)
output = sql_execute(sql_db, sql_aligners)
aligners = output.split()
database_fname = database_name + "_" + paired + ".analysis"
database_file = open(database_fname, "w")
print >> database_file, "aligner\tuse_annotation\tend_type\tedit_distance\tmapped_reads\tjunction_reads\tgtf_junction_reads\tjunctions\tgtf_junctions\truntime"
for aligner in aligners:
for edit_distance in range(MAX_EDIT):
sql_row = "SELECT aligner, use_annotation, end_type, edit_distance, mapped_reads, junction_reads, gtf_junction_reads, junctions, gtf_junctions, runtime FROM Mappings"
sql_row += " WHERE reads = '%s' and aligner = '%s' and edit_distance = %d and end_type = '%s' ORDER BY created DESC LIMIT 1" % (database_name, aligner, edit_distance, paired)
output = sql_execute(sql_db, sql_row)
if output:
print >> database_file, output
database_file.close()
def calculate_read_cost(single_end,
paired_end,
test_aligners,
fresh,
runtime_only,
verbose):
sql_db_name = "analysis.db"
if not os.path.exists(sql_db_name):
create_sql_db(sql_db_name)
full_workdir = os.getcwd()
workdir = full_workdir.split("/")[-1]
num_cpus = multiprocessing.cpu_count()
if num_cpus > 8:
num_threads = min(8, num_cpus)
desktop = False
else:
num_threads = min(3, num_cpus)
desktop = True
verbose = False
sql_write = True
is_large_file = False
gz_file = False
if os.path.exists("1.fq.gz"):
gz_file = True
if os.path.getsize("1.fq.gz") > (1024 * 1024 * 1024):
is_large_file = True
elif os.path.exists("1.fq"):
gz_file = False
if os.path.getsize("1.fq") > (2 * 1024 * 1024 * 1024):
is_large_file = True
else:
assert(False)
aligners = [
# ["hisat2", "", "", "", ""],
# ["hisat2", "", "", "", "--sensitive"],
# ["hisat2", "", "", "", "--very-sensitive"],
# ["hisat2", "", "", "", "-k 50 --score-min C,-50,0"],
# ["hisat2", "", "snp", "", ""],
# ["hisat2", "", "snp", "", "--sensitive"],
# ["hisat2", "", "snp", "", "-k 50"],
# ["hisat2", "", "", "205", ""],
# ["hisat2", "", "snp", "205", ""],
# ["hisat2", "", "snp_tran", "205", ""],
# ["hisat2", "", "tran", "", ""],
# ["hisat2", "x1", "snp", "", ""],
# ["hisat2", "x1", "", "", ""],
# ["hisat2", "x2", "", "", ""],
# ["hisat2", "", "tran", "", ""],
# ["hisat2", "", "snp_tran", "204", ""],
# ["hisat2", "", "snp_tran", "", ""],
# ["hisat2", "", "", "210", ""],
["hisat2", "", "rep", "", ""],
# ["hisat2", "", "rep", "", "--read-lengths 101"],
# ["hisat2", "", "rep", "", "--sensitive"],
# ["hisat2", "", "rep-100-300", "", ""],
# ["hisat2", "", "rep-101-300", "", "--sensitive"],
# ["hisat2", "", "rep-101-300", "", "-k 10 --score-min C,-50,0"],
# ["hisat2", "", "rep-150-300", "", ""],
# ["tophat2", "", "", "", ""],
# ["bowtie", "", "", "", ""],
["bowtie2", "", "", "", ""],
# ["bowtie2", "", "", "", "-k 10"],
["bwa", "mem", "", "", ""],
# ["bwa", "mem", "", "", "-a"],
# ["bwa", "sw", "", "", ""],
# ["star", "", "", "", ""],
# ["star", "x2", "", "", ""],
# ["vg", "", "", "", ""],
# ["vg", "", "", "", "-M 10"],
# ["vg", "", "snp", "", ""],
# ["vg", "", "snp", "", "-M 10"],
# ["minimap2", "", "", "", ""],
]
# sql_write = False
verbose = True
debug = False
genome = "genome"
cwd = os.getcwd()
RNA = (cwd.find("RNA") != -1)
chr_dic = read_genome("../../../data/" + genome + ".fa")
snp_dic = read_snp("../../../data/" + genome + ".snp")
gtf_junction_strs = extract_splice_sites("../../../data/" + genome + ".gtf")
gene = "no"
gtf_junctions = []
for junction_str in gtf_junction_strs:
junction = to_junction(junction_str)
gtf_junctions.append(junction)
gtf_junctions = sorted(gtf_junctions, cmp=junction_cmp)
print >> sys.stderr, "aligner\tuse_annotation\tend_type\tedit_distance\tmapped_reads\tjunction_reads\tgtf_junction_reads\tjunctions\tgtf_junctions\truntime"
for paired in [False, True]:
if not paired and not single_end:
continue
if paired and not paired_end:
continue
type_read1_fname = "1.fq"
if gz_file:
type_read1_fname += ".gz"
if paired:
type_read2_fname = "2.fq"
if gz_file:
type_read2_fname += ".gz"
else:
type_read2_fname = ""
aligner_bin_base = "../../../../aligners/bin"
def get_aligner_version(aligner):
version = ""
if aligner == "hisat2" or \
aligner == "hisat" or \
aligner == "bowtie" or \
aligner == "bowtie2":
if version:
cmd = ["%s/%s_%s/%s" % (aligner_bin_base, aligner, version, aligner)]
else:
cmd = ["%s/%s" % (aligner_bin_base, aligner)]
cmd += ["--version"]
cmd_process = subprocess.Popen(cmd, stdout=subprocess.PIPE)
version = cmd_process.communicate()[0][:-1].split("\n")[0]
version = version.split()[-1]
elif aligner == "tophat2":
cmd = ["%s/tophat" % (aligner_bin_base)]
cmd += ["--version"]
cmd_process = subprocess.Popen(cmd, stdout=subprocess.PIPE)
version = cmd_process.communicate()[0][:-1].split()[-1]
elif aligner in ["star", "starx2"]:
version = "2.4.2a"
elif aligner == "gsnap":
cmd = ["%s/gsnap" % (aligner_bin_base)]
cmd_process = subprocess.Popen(cmd, stderr=subprocess.PIPE)
version = cmd_process.communicate()[1][:-1].split("\n")[0]
version = version.split()[2]
elif aligner == "bwa":
if version:
cmd = ["%s/bwa_%s/bwa" % (aligner_bin_base, version)]
else:
cmd = ["%s/bwa" % (aligner_bin_base)]
cmd_process = subprocess.Popen(cmd, stderr=subprocess.PIPE)
version = cmd_process.communicate()[1][:-1].split("\n")[2]
version = version.split()[1]
elif aligner == "vg":
cmd = ["%s/vg" % (aligner_bin_base)]
cmd_process = subprocess.Popen(cmd, stderr=subprocess.PIPE)
version = cmd_process.communicate()[1][:-1].split("\n")[0]
version = version.split()[5]
elif aligner == "minimap2":
cmd = ["%s/minimap2" % (aligner_bin_base)]
cmd += ["--version"]
cmd_process = subprocess.Popen(cmd, stdout=subprocess.PIPE)
version = cmd_process.communicate()[0][:-1].split("\n")[0]
return version
index_base = "../../../../indexes"
index_add = ""
if genome != "genome":
index_add = "_" + genome
def get_aligner_cmd(RNA, aligner, type, index_type, version, options, read1_fname, read2_fname, out_fname, cmd_idx = 0):
cmd = ["/usr/bin/time"]
if osx_mode:
cmd += ['-l']
if aligner == "hisat2":
if version:
cmd += ["%s/hisat2_%s/hisat2" % (aligner_bin_base, version)]
else:
cmd += ["%s/hisat2" % (aligner_bin_base)]
if num_threads > 1:
cmd += ["-p", str(num_threads)]
# cmd += ["-k", "5"]
# cmd += ["--score-min", "C,-18"]
# daehwan - for debugging purposes
# cmd += ["--score-min", "C,-50"]
# cmd += ["--pen-cansplice", "0"]
# cmd += ["--pen-noncansplice", "12"]
# cmd += ["--pen-intronlen", "G,-8,1"]
# cmd += ["--metrics", "1",
# "--metrics-file", "metrics.out"]
if version == "204":
cmd += ["--sp", "2,1"]
if not RNA:
cmd += ["--no-spliced-alignment"]
if type in ["x1", "x2"]:
cmd += ["--no-temp-splicesite"]
# DK - for debugging purposes
# cmd += ["--dta"]
"""
if index_type == "tran":
cmd += ["--no-anchorstop"]
cmd += ["-k", "100"]
"""
if options != "":
cmd += options.split(' ')
if type == "x2":
if cmd_idx == 0:
cmd += ["--novel-splicesite-outfile"]
else:
cmd += ["--novel-splicesite-infile"]
cmd += ["splicesites.txt"]
# "--novel-splicesite-infile",
# "../splicesites.txt",
# "--rna-strandness",
# "FR",
if version:
index_cmd = "%s/HISAT2_%s%s/" % (index_base, version, index_add) + genome
else:
index_cmd = "%s/HISAT2%s/" % (index_base, index_add) + genome
if index_type:
index_cmd += ("_" + index_type)
cmd += [index_cmd]
if paired:
cmd += ["-1", read1_fname,
"-2", read2_fname]
else:
cmd += ["-U", read1_fname]
elif aligner == "hisat":
cmd += ["%s/hisat" % (aligner_bin_base)]
if num_threads > 1:
cmd += ["-p", str(num_threads)]
# cmd += ["-k", "5"]
# cmd += ["--score-min", "C,-18"]
if version != "":
version = int(version)
else:
version = sys.maxint
if not RNA:
cmd += ["--no-spliced-alignment"]
if type in ["x1", "x2"] or not RNA:
cmd += ["--no-temp-splicesite"]
"""
cmd += ["--rdg", "100,100",
"--rfg", "100,100"]
"""
if type == "x2":
if cmd_idx == 0:
cmd += ["--novel-splicesite-outfile"]
else:
cmd += ["--novel-splicesite-infile"]
cmd += ["splicesites.txt"]
# "--novel-splicesite-infile",
# "../splicesites.txt",
# "--rna-strandness",
# "FR",
cmd += ["%s/HISAT%s/" % (index_base, index_add) + genome]
if paired:
cmd += ["-1", read1_fname,
"-2", read2_fname]
else:
cmd += [read1_fname]
elif aligner == "tophat2":
cmd += ["%s/tophat" % (aligner_bin_base)]
if num_threads > 1:
cmd += ["-p", str(num_threads)]
cmd += ["--read-edit-dist", "3"]
cmd += ["--no-sort-bam"]
cmd += ["--read-realign-edit-dist", "0"]
cmd += ["--keep-tmp",
"%s/HISAT%s/" % (index_base, index_add) + genome,
read1_fname]
if paired:
cmd += [read2_fname]
elif aligner == "star":
cmd += ["%s/STAR" % (aligner_bin_base)]
if num_threads > 1:
cmd += ["--runThreadN", str(num_threads)]
if type == "x2" and cmd_idx == 1:
cmd += ["--genomeDir", "."]
else:
cmd += ["--genomeDir", "%s/STAR%s" % (index_base, index_add)]
if desktop:
cmd += ["--genomeLoad", "NoSharedMemory"]
else:
cmd += ["--genomeLoad", "LoadAndKeep"]
if type == "x2":
if cmd_idx == 1:
cmd += ["--alignSJDBoverhangMin", "1"]
cmd += ["--readFilesIn",
read1_fname]
if paired:
cmd += [read2_fname]
if paired:
cmd += ["--outFilterMismatchNmax", "6"]
else:
cmd += ["--outFilterMismatchNmax", "3"]
elif aligner == "bowtie":
cmd += ["%s/bowtie" % (aligner_bin_base)]
if num_threads > 1:
cmd += ["-p", str(num_threads)]
cmd += ["--sam",
"-k", "10"]
cmd += ["-n", "3"]
if paired:
cmd += ["-X", "500"]
cmd += ["%s/Bowtie%s/" % (index_base, index_add) + genome]
if paired:
cmd += ["-1", read1_fname,
"-2", read2_fname]
else:
cmd += [read1_fname]
elif aligner == "bowtie2":
if version:
cmd += ["%s/bowtie2_%s/bowtie2" % (aligner_bin_base, version)]
else:
cmd += ["%s/bowtie2" % (aligner_bin_base)]
if num_threads > 1:
cmd += ["-p", str(num_threads)]
#cmd += ["-k", "10"]
#cmd += ["--score-min", "C,-18"]
cmd += ["-X", "1000"]
if options:
cmd += options.split(' ')
if version:
cmd += ["-x %s/Bowtie2_%s%s/" % (index_base, version, index_add) + genome]
else:
cmd += ["-x %s/Bowtie2%s/" % (index_base, index_add) + genome]
if paired:
cmd += ["-1", read1_fname,
"-2", read2_fname]
else:
cmd += [read1_fname]
elif aligner == "gsnap":
cmd += ["%s/gsnap" % (aligner_bin_base),
"-A",
"sam"]
if num_threads > 1:
cmd += ["-t", str(num_threads)]
cmd += ["--max-mismatches=3",
"-D", "%s/GSNAP%s" % (index_base, index_add),
"-N", "1",
"-d", genome,
read1_fname]
if paired:
cmd += [read2_fname]
elif aligner == "bwa":
if version:
cmd += ["%s/bwa_%s/bwa" % (aligner_bin_base, version)]
else:
cmd += ["%s/bwa" % (aligner_bin_base)]
if type in ["mem", "aln"]:
cmd += [type]
elif type == "sw":
cmd += ["bwa" + type]
if num_threads > 1:
cmd += ["-t", str(num_threads)]
if options:
cmd += options.split(' ')
if version:
cmd += ["%s/BWA_%s%s/%s.fa" % (index_base, version, index_add, genome)]
else:
cmd += ["%s/BWA%s/%s.fa" % (index_base, index_add, genome)]
cmd += [read1_fname]
if paired:
cmd += [read2_fname]
elif aligner == "vg":
# vg map -d 22 -t 6 -M 10 -f ../sim-1.fa -f ../sim-2.fa --surject-to sam > result.sam
cmd += ["%s/vg" % (aligner_bin_base)]
cmd += ["map"]
cmd += ["-t", str(num_threads)]
cmd += ["--surject-to", "sam"]
index_cmd = "%s/VG%s/" % (index_base, index_add) + genome
if index_type:
index_cmd += ("_" + index_type)
if options:
cmd += options.split(' ')
cmd += ["-d", index_cmd]
cmd += ["-f", read1_fname]
if paired:
cmd += ["-f", read2_fname]
elif aligner == "minimap2":
# minimap2 -a -x sr 22.mmi sim_1.fa sim_2.fa > result.sam
cmd += ["%s/minimap2" % (aligner_bin_base)]
cmd += ["-a"]
cmd += ["-x", "sr"]
index_cmd = "%s/minimap2%s/" % (index_base, index_add) + genome
if index_type:
index_cmd += ("_" + index_type)
index_cmd += ".mmi"
cmd += [index_cmd]
cmd += [read1_fname]
if paired:
cmd += [read2_fname]
else:
assert False
return cmd
for aligner, type, index_type, version, options in aligners:
skip = False
if len(test_aligners) > 0:
skip = True
for test_aligner in test_aligners:
if aligner == test_aligner:
skip = False
if skip:
continue
aligner_name = aligner + type + version
if (aligner == "hisat2" or aligner == "vg") and index_type != "":
aligner_name += ("_" + index_type)
if options != "":
option_name = options.replace(' ', '').replace('-', '').replace(',', '')
aligner_name = aligner_name + '_' + option_name
if paired:
aligner_dir = aligner_name + "_paired"
else:
aligner_dir = aligner_name + "_single"
if fresh and os.path.exists(aligner_dir):
os.system("rm -rf %s" % aligner_dir)
if not os.path.exists(aligner_dir):
os.mkdir(aligner_dir)
os.chdir(aligner_dir)
out_fname = "accepted.sam"
aligner_cmd = get_aligner_cmd(RNA, aligner, type, index_type, version, options, "../" + type_read1_fname, "../" + type_read2_fname, out_fname)
duration = 0.1
mem_usage = ''
if not os.path.exists(out_fname):
if not os.path.exists("../one.fq") or not os.path.exists("../two.fq"):
if gz_file:
os.system("gzip -cd ../1.fq.gz | head -400 > ../one.fq")
os.system("gzip -cd ../2.fq.gz | head -400 > ../two.fq")
else:
os.system("head -400 ../1.fq > ../one.fq")
os.system("head -400 ../2.fq > ../two.fq")
# dummy commands for caching index
loading_time = 0
if aligner not in ["tophat2"]:
for i in range(3):
dummy_cmd = get_aligner_cmd(RNA, aligner, type, index_type, version, options, "../one.fq", "../two.fq", "/dev/null")
start_time = datetime.now()
if verbose:
print >> sys.stderr, start_time, "\t", " ".join(dummy_cmd)
if aligner in ["hisat2", "hisat", "bowtie", "bowtie2", "gsnap", "bwa"]:
proc = subprocess.Popen(dummy_cmd, stdout=open("/dev/null", "w"), stderr=subprocess.PIPE)
else:
proc = subprocess.Popen(dummy_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
proc.communicate()
finish_time = datetime.now()
duration = finish_time - start_time
duration = duration.total_seconds()
if verbose:
print >> sys.stderr, finish_time, "duration:", duration
loading_time = duration
# align all reads
if paired:
sweep_read_cmd = "cat ../%s ../%s > /dev/null" % (type_read1_fname, type_read2_fname)
else:
sweep_read_cmd = "cat ../%s > /dev/null" % (type_read1_fname)
print >> sys.stderr, datetime.now(), "\t", sweep_read_cmd
os.system(sweep_read_cmd)
skip_alignment = False
if paired and aligner == "olego" and os.path.exists(out_fname + "1"):
skip_alignment = True
if not skip_alignment:
aligner_cmd = get_aligner_cmd(RNA, aligner, type, index_type, version, options, "../" + type_read1_fname, "../" + type_read2_fname, out_fname)
start_time = datetime.now()
if verbose:
print >> sys.stderr, start_time, "\t", " ".join(aligner_cmd)
if aligner in ["hisat2", "hisat", "bowtie", "bowtie2", "gsnap", "bwa", "vg", "minimap2"]:
proc = subprocess.Popen(aligner_cmd, stdout=open(out_fname, "w"), stderr=subprocess.PIPE)
else:
proc = subprocess.Popen(aligner_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
_, mem_usage = proc.communicate()
mem_usage = parse_mem_usage(mem_usage)
finish_time = datetime.now()
duration = finish_time - start_time
duration = duration.total_seconds() - loading_time
if duration < 0.1:
duration = 0.1
if verbose:
print >> sys.stderr, finish_time, "duration:", duration
if verbose:
print >> sys.stderr, finish_time, "Memory Usage: %dMB" % (int(mem_usage) / 1024)
if debug and aligner == "hisat" and type == "x1":
os.system("cat metrics.out")
print >> sys.stderr, "\ttime: %.4f" % (duration)
# break
if aligner == "star" and type in ["", "gtf"]:
os.system("mv Aligned.out.sam %s" % out_fname)
elif aligner in ["hisat2", "hisat"] and type == "x2":
aligner_cmd = get_aligner_cmd(RNA, aligner, type, index_type, version, options, "../" + type_read1_fname, "../" + type_read2_fname, out_fname, 1)
if verbose:
print >> sys.stderr, start_time, "\t", " ".join(aligner_cmd)
start_time = datetime.now()
proc = subprocess.Popen(aligner_cmd, stdout=open(out_fname, "w"), stderr=subprocess.PIPE)
proc.communicate()
finish_time = datetime.now()
duration += (finish_time - start_time).total_seconds()
duration -= loading_time
if duration < 0.1:
duration = 0.1
if verbose:
print >> sys.stderr, finish_time, "duration:", duration
elif aligner == "star" and type == "x2":
assert os.path.exists("SJ.out.tab")
os.system("awk 'BEGIN {OFS=\"\t\"; strChar[0]=\".\"; strChar[1]=\"+\"; strChar[2]=\"-\";} {if($5>0){print $1,$2,$3,strChar[$4]}}' SJ.out.tab > SJ.out.tab.Pass1.sjdb")
for file in os.listdir("."):
if file in ["SJ.out.tab.Pass1.sjdb", "genome.fa"]:
continue
os.remove(file)
star_index_cmd = "STAR --genomeDir ./ --runMode genomeGenerate --genomeFastaFiles ../../../../data/genome.fa --sjdbFileChrStartEnd SJ.out.tab.Pass1.sjdb --sjdbOverhang 100 --runThreadN %d" % (num_threads)
print >> sys.stderr, "\t", datetime.now(), star_index_cmd
os.system(star_index_cmd)
if verbose:
print >> sys.stderr, "\t", datetime.now(), " ".join(dummy_cmd)
proc = subprocess.Popen(dummy_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
proc.communicate()
if verbose:
print >> sys.stderr, "\t", datetime.now(), "finished"
aligner_cmd = get_aligner_cmd(RNA, aligner, type, index_type, version, options, "../" + type_read1_fname, "../" + type_read2_fname, out_fname, 1)
start_time = datetime.now()
if verbose:
print >> sys.stderr, "\t", start_time, " ".join(aligner_cmd)
proc = subprocess.Popen(aligner_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
proc.communicate()
finish_time = datetime.now()
duration += (finish_time - start_time).total_seconds()
duration -= loading_time
if duration < 0.1:
duration = 0.1
if verbose:
print >> sys.stderr, "\t", finish_time, "finished:", duration
os.system("mv Aligned.out.sam %s" % out_fname)
elif aligner == "tophat2":
os.system("samtools sort -n tophat_out/accepted_hits.bam accepted_hits; samtools view -h accepted_hits.bam > %s" % out_fname)
elif aligner == "vg":
index_name = '%s/VG%s/' % (index_base, index_add) + genome
if index_type:
index_name += ('_' + index_type)
os.system("echo %s %s %f >> runtime" % (str(datetime.now()), aligner, duration))
if aligner in ["star", "tophat2", "gsnap"]:
os.system("tar cvzf %s.tar.gz %s &> /dev/null" % (out_fname, out_fname))
if runtime_only:
os.chdir("..")
continue
suffix = aligner
read_sam, pair_sam = suffix + ".read.sam", suffix + ".pair.sam"
unmapped_read_1_fq, unmapped_read_2_fq = suffix + ".unmapped.1.fq", suffix + ".unmapped.2.fq"
if not os.path.exists(read_sam) or not os.path.exists(pair_sam):
if index_type == 'snp':
extract_reads_and_pairs(chr_dic, out_fname, read_sam, pair_sam, unmapped_read_1_fq, unmapped_read_2_fq, snp_dic)
else:
extract_reads_and_pairs(chr_dic, out_fname, read_sam, pair_sam, unmapped_read_1_fq, unmapped_read_2_fq)
out = ''
if gz_file:
out = subprocess.check_output("gzip -cd ../%s | wc -l" % type_read1_fname, shell=True)
else:
out = subprocess.check_output("wc -l ../%s" % type_read1_fname, shell=True)
numreads = int(out.split()[0]) / 4
done_filename = suffix + ".done"
if not os.path.exists(done_filename):
done_file = open(done_filename, "w")
if paired:
sum = [0, 0, 0, 0, 0, 0] # mappep_read, junction_read, gtf_junction_reads, concord_mapped_read, num_junctions, num_gtf_junctions
dis_sum = 0
stat, dis_stat = pair_stat(pair_sam, gtf_junctions, chr_dic)
output = ""
for i in range(len(stat)):
for j in range(len(sum)):
sum[j] += stat[i][j]
dis_sum += dis_stat[i]
mapped_reads, junction_reads, gtf_junction_reads, concord_mapped_read, num_junctions, num_gtf_junctions = sum
output += "%s\t%s\tpaired\t%d\t%d\t%.2f%%\t%d\t%d\t%d\t%d\t%f\t%d\t%d\t%.2f%%\n" % \
(aligner_name, gene, i, mapped_reads, float(mapped_reads) * 100.0 / numreads, junction_reads, gtf_junction_reads, num_junctions, num_gtf_junctions, duration, (numreads / max(1.0, duration)), concord_mapped_read, float(concord_mapped_read) * 100.0 / numreads)
if sql_write and os.path.exists("../" + sql_db_name):
sql_insert = "INSERT INTO \"Mappings\" VALUES(NULL, '%s', '%s', '%s', '%s', '%s', '%s', %d, %d, %d, %d, %d, %d, %f, '%s', datetime('now', 'localtime'), '%s');" % \
(workdir, genome, "paired", aligner_name, get_aligner_version(aligner), "no", i, mapped_reads, junction_reads, gtf_junction_reads, num_junctions, num_gtf_junctions, duration, platform.node(), " ".join(aligner_cmd))
sql_execute("../" + sql_db_name, sql_insert)
print >> sys.stderr, output,
print >> done_file, output
else:
sum = [0, 0, 0, 0, 0]
stat = read_stat(read_sam, gtf_junctions, chr_dic)
output = ""
for i in range(len(stat)):
for j in range(len(sum)):
sum[j] += stat[i][j]
mapped_reads, junction_reads, gtf_junction_reads, num_junctions, num_gtf_junctions = sum
output += "%s\t%s\tsingle\t%d\t%d\t%.2f%%\t%d\t%d\t%d\t%d\t%f\t%d\n" % \
(aligner_name, gene, i, mapped_reads, float(mapped_reads) * 100.0 / numreads, junction_reads, gtf_junction_reads, num_junctions, num_gtf_junctions, duration, (numreads / max(1.0, duration)))
if sql_write and os.path.exists("../" + sql_db_name):
sql_insert = "INSERT INTO \"Mappings\" VALUES(NULL, '%s', '%s', '%s', '%s', '%s', '%s', %d, %d, %d, %d, %d, %d, %f, '%s', datetime('now', 'localtime'), '%s');" % \
(workdir, genome, "single", aligner_name, get_aligner_version(aligner), "no", i, mapped_reads, junction_reads, gtf_junction_reads, num_junctions, num_gtf_junctions, duration, platform.node(), " ".join(aligner_cmd))
sql_execute("../" + sql_db_name, sql_insert)
print >> sys.stderr, output,
print >> done_file, output
done_file.close()
os.chdir("..")
if os.path.exists(sql_db_name):
write_analysis_data(sql_db_name, workdir, paired)
if __name__ == "__main__":
parser = ArgumentParser(
description='test HISAT2, and compare HISAT2 with other popular aligners such as TopHat2, STAR, Bowtie1/2, GSNAP, BWA-mem, etc.')
parser.add_argument('--single-end',
dest='paired_end',
action='store_false',
help='run single-end only')
parser.add_argument('--paired-end',
dest='single_end',
action='store_false',
help='run paired_end only')
parser.add_argument('--aligner-list',
dest='aligner_list',
type=str,
default="",
help='comma-separated list of aligners (e.g. hisat2,bowtie2,bwa')
parser.add_argument('--fresh',
dest='fresh',
action='store_true',
help='delete existing alignment related directories (e.g. hisat2_single)')
parser.add_argument('--runtime-only',
dest='runtime_only',
action='store_true',
help='run programs without evaluation')
parser.add_argument('-v', '--verbose',
dest='verbose',
action='store_true',
help='also print some statistics to stderr')
args = parser.parse_args()
aligners = []
for aligner in args.aligner_list.split(','):
if aligner == "":
continue
aligners.append(aligner)
calculate_read_cost(args.single_end,
args.paired_end,
aligners,
args.fresh,
args.runtime_only,
args.verbose)