#!/usr/bin/env python

"""
Benchmarking driver: runs HISAT2 and other popular aligners (TopHat2, STAR,
Bowtie1/2, GSNAP, BWA-MEM, vg, minimap2, ...) on the read sets in the current
directory and collects mapping, junction, and runtime statistics.
"""

import sys, os, subprocess, signal
import multiprocessing
import platform
import string
import re
from datetime import datetime, date, time
from collections import defaultdict
from argparse import ArgumentParser, FileType

osx_mode = False
if sys.platform == 'darwin':
    osx_mode = True

MAX_EDIT = 21

signal.signal(signal.SIGPIPE, signal.SIG_DFL)

cigar_re = re.compile(r'\d+\w')


"""
|
|
"""
|
|
def parse_mem_usage(resource):
|
|
if osx_mode:
|
|
resource = resource.strip().split('\n')
|
|
for line in resource:
|
|
if line.find('maximum resident set size') != -1:
|
|
return int(line.split()[0]) / 1024
|
|
else:
|
|
resource = resource.split(' ')
|
|
for line in resource:
|
|
idx = line.find('maxresident')
|
|
if idx != -1:
|
|
return line[:idx]
|
|
|
|
return '0'
|
|
|
|
|
|
"""
|
|
"""
|
|
def reverse_complement(seq):
|
|
result = ""
|
|
for nt in seq:
|
|
base = nt
|
|
if nt == 'A':
|
|
base = 'T'
|
|
elif nt == 'a':
|
|
base = 't'
|
|
elif nt == 'C':
|
|
base = 'G'
|
|
elif nt == 'c':
|
|
base = 'g'
|
|
elif nt == 'G':
|
|
base = 'C'
|
|
elif nt == 'g':
|
|
base = 'c'
|
|
elif nt == 'T':
|
|
base = 'A'
|
|
elif nt == 't':
|
|
base = 'a'
|
|
|
|
result = base + result
|
|
|
|
return result
|
|
|
|
|
|
"""
|
|
"""
|
|
def read_genome(genome_filename):
|
|
chr_dic = {}
|
|
genome_file = open(genome_filename, "r")
|
|
|
|
chr_name, sequence = "", ""
|
|
for line in genome_file:
|
|
if line[0] == ">":
|
|
if chr_name and sequence:
|
|
chr_dic[chr_name] = sequence
|
|
|
|
chr_name = line[1:-1].split()[0]
|
|
sequence = ""
|
|
else:
|
|
sequence += line[:-1]
|
|
|
|
if chr_name and sequence:
|
|
chr_dic[chr_name] = sequence
|
|
|
|
genome_file.close()
|
|
|
|
print >> sys.stderr, "genome is loaded"
|
|
|
|
return chr_dic
|
|
|
|
|
|
"""
|
|
"""
|
|
def read_snp(snp_filename):
|
|
snps = defaultdict(list)
|
|
snp_file = open(snp_filename, 'r')
|
|
|
|
for line in snp_file:
|
|
line = line.strip()
|
|
if not line or line.startswith('#'):
|
|
continue
|
|
try:
|
|
snpID, type, chr, pos, data = line.split('\t')
|
|
except ValueError:
|
|
continue
|
|
|
|
assert type in ["single", "deletion", "insertion"]
|
|
if type == "deletion":
|
|
data = int(data)
|
|
snps[chr].append([snpID, type, int(pos), data])
|
|
|
|
print >> sys.stderr, "snp is loaded"
|
|
|
|
return snps
|
|
|
|
|
|
"""
|
|
"""
|
|
def extract_splice_sites(gtf_fname):
|
|
trans = {}
|
|
|
|
gtf_file = open(gtf_fname)
|
|
# Parse valid exon lines from the GTF file into a dict by transcript_id
|
|
for line in gtf_file:
|
|
line = line.strip()
|
|
if not line or line.startswith('#'):
|
|
continue
|
|
if '#' in line:
|
|
line = line.split('#')[0].strip()
|
|
|
|
try:
|
|
chrom, source, feature, left, right, score, \
|
|
strand, frame, values = line.split('\t')
|
|
except ValueError:
|
|
continue
|
|
left, right = int(left), int(right)
|
|
|
|
if feature != 'exon' or left >= right:
|
|
continue
|
|
|
|
values_dict = {}
|
|
for attr in values.split(';')[:-1]:
|
|
attr, _, val = attr.strip().partition(' ')
|
|
values_dict[attr] = val.strip('"')
|
|
|
|
if 'gene_id' not in values_dict or \
|
|
'transcript_id' not in values_dict:
|
|
continue
|
|
|
|
transcript_id = values_dict['transcript_id']
|
|
if transcript_id not in trans:
|
|
trans[transcript_id] = [chrom, strand, [[left, right]]]
|
|
else:
|
|
trans[transcript_id][2].append([left, right])
|
|
|
|
gtf_file.close()
|
|
|
|
# Sort exons and merge where separating introns are <=5 bps
|
|
for tran, [chrom, strand, exons] in trans.items():
|
|
exons.sort()
|
|
tmp_exons = [exons[0]]
|
|
for i in range(1, len(exons)):
|
|
if exons[i][0] - tmp_exons[-1][1] <= 5:
|
|
tmp_exons[-1][1] = exons[i][1]
|
|
else:
|
|
tmp_exons.append(exons[i])
|
|
trans[tran] = [chrom, strand, tmp_exons]
|
|
|
|
# Calculate and print the unique junctions
|
|
junctions = set()
|
|
for chrom, strand, exons in trans.values():
|
|
for i in range(1, len(exons)):
|
|
junctions.add(to_junction_str([chrom, exons[i-1][1], exons[i][0]]))
|
|
|
|
return junctions
|
|
|
|
|
|
def to_junction_str(junction):
    return "%s-%d-%d" % (junction[0], junction[1], junction[2])


def to_junction(junction_str):
    chr, left, right = junction_str.split("-")
    return [chr, int(left), int(right)]


def junction_cmp(a, b):
    if a[0] != b[0]:
        if a[0] < b[0]:
            return -1
        else:
            return 1

    if a[1] != b[1]:
        if a[1] < b[1]:
            return -1
        else:
            return 1

    if a[2] != b[2]:
        if a[2] < b[2]:
            return -1
        else:
            return 1

    return 0


# chr is a chromosome name; pos is assumed to be an integer (1-based position)
def get_junctions(chr, pos, cigar_str, min_anchor_len = 0, read_len = 100):
    junctions = []
    right_pos = pos
    cigars = cigar_re.findall(cigar_str)
    cigars = [[int(cigars[i][:-1]), cigars[i][-1]] for i in range(len(cigars))]

    left_anchor_lens = []
    cur_left_anchor_len = 0
    for i in range(len(cigars)):
        length, cigar_op = cigars[i]
        if cigar_op in "MI":
            cur_left_anchor_len += length
        elif cigar_op == "N":
            assert cur_left_anchor_len > 0
            left_anchor_lens.append(cur_left_anchor_len)
            cur_left_anchor_len = 0

    for i in range(len(cigars)):
        length, cigar_op = cigars[i]
        if cigar_op == "N":
            left, right = right_pos - 1, right_pos + length

            if i > 0 and cigars[i-1][1] in "ID":
                if cigars[i-1][1] == "I":
                    left += cigars[i-1][0]
                else:
                    left -= cigars[i-1][0]
            if i + 1 < len(cigars) and cigars[i+1][1] in "ID":
                if cigars[i+1][1] == "I":
                    right -= cigars[i+1][0]
                else:
                    right += cigars[i+1][0]

            junction_idx = len(junctions)
            assert junction_idx < len(left_anchor_lens)
            left_anchor_len = left_anchor_lens[junction_idx]
            assert left_anchor_len > 0 and left_anchor_len < read_len
            right_anchor_len = read_len - left_anchor_len
            if left_anchor_len >= min_anchor_len and right_anchor_len >= min_anchor_len:
                junctions.append([chr, left, right])

        if cigar_op in "MND":
            right_pos += length

    return junctions


def get_right(pos, cigars):
    right_pos = pos
    cigars = cigar_re.findall(cigars)
    for cigar in cigars:
        length = int(cigar[:-1])
        cigar_op = cigar[-1]
        if cigar_op in "MDN":
            right_pos += length

    return right_pos


def get_cigar_chars(cigars):
    cigars = cigar_re.findall(cigars)
    cigar_chars = ""
    for cigar in cigars:
        cigar_op = cigar[-1]
        cigar_chars += cigar_op

    return cigar_chars


def get_cigar_chars_MN(cigars):
    cigars = cigar_re.findall(cigars)
    cigar_chars = ""
    for cigar in cigars:
        cigar_op = cigar[-1]
        if cigar_op in "MN":
            if cigar_chars == "" or cigar_chars[-1] != cigar_op:
                cigar_chars += cigar_op

    return cigar_chars


def is_non_canonical_junction_read(chr_dic, chr, left, cigars, canonical_junctions = [["GT", "AG"], ["GC", "AG"], ["AT", "AC"]]):
    pos = left
    for cigar in cigar_re.findall(cigars):
        cigar_op = cigar[-1]
        cigar_len = int(cigar[:-1])

        if cigar_op in 'MD':
            pos += cigar_len
        elif cigar_op == 'N':
            right = pos + cigar_len

            donor = chr_dic[chr][pos-1:pos+1]
            acceptor = chr_dic[chr][right-3:right-1]

            rev_donor = reverse_complement(acceptor)
            rev_acceptor = reverse_complement(donor)

            # print donor, acceptor
            # print rev_donor, rev_acceptor

            if [donor, acceptor] not in canonical_junctions and [rev_donor, rev_acceptor] not in canonical_junctions:
                return True

            pos = right

    return False


def is_canonical_junction(chr_dic, junction):
    chr, left, right = junction
    donor = chr_dic[chr][left:left+2]
    acceptor = chr_dic[chr][right-3:right-1]
    rev_donor = reverse_complement(acceptor)
    rev_acceptor = reverse_complement(donor)

    if (donor == "GT" and acceptor == "AG") or \
       (rev_donor == "GT" and rev_acceptor == "AG"):
        return True

    return False


def is_junction_read(chr_dic, gtf_junctions, chr, pos, cigar_str):
    result_junctions = []
    junctions = get_junctions(chr, pos, cigar_str, 0, 101)
    for junction in junctions:
        junction_str = to_junction_str(junction)
        is_gtf_junction = False
        def find_in_gtf_junctions(gtf_junctions, junction):
            l, u = 0, len(gtf_junctions)
            while l < u:
                m = (l + u) / 2
                assert m >= 0 and m < len(gtf_junctions)
                cmp_result = junction_cmp(junction, gtf_junctions[m])
                if cmp_result == 0:
                    return m
                elif cmp_result < 0:
                    u = m
                else:
                    l = m + 1

            return l

        # allow small (<= 5bp) discrepancy for non-canonical splice sites.
        relaxed_junction_dist = 5
        chr, left, right = junction
        gtf_index = find_in_gtf_junctions(gtf_junctions, [chr, left - relaxed_junction_dist, right - relaxed_junction_dist])
        if gtf_index >= 0:
            i = gtf_index
            while i < len(gtf_junctions):
                chr2, left2, right2 = gtf_junctions[i]
                if chr2 > chr or \
                        left2 - left > relaxed_junction_dist or \
                        right2 - right > relaxed_junction_dist:
                    break

                if abs(left - left2) <= relaxed_junction_dist and left - left2 == right - right2:
                    canonical = is_canonical_junction(chr_dic, gtf_junctions[i])
                    if left == left2 or not canonical:
                        is_gtf_junction = True
                        break

                i += 1

        result_junctions.append([junction_str, is_gtf_junction])

    is_gtf_junction_read = False
    if len(result_junctions) > 0:
        is_gtf_junction_read = True
        for junction_str, is_gtf_junction in result_junctions:
            if not is_gtf_junction:
                is_gtf_junction_read = False
                break

    return result_junctions, len(result_junctions) > 0, is_gtf_junction_read


def is_junction_pair(chr_dic, gtf_junctions, chr, pos, cigar_str, mate_chr, mate_pos, mate_cigar_str):
    junctions, junction_read, gtf_junction_read = is_junction_read(chr_dic, gtf_junctions, chr, pos, cigar_str)
    mate_junctions, mate_junction_read, mate_gtf_junction_read = is_junction_read(chr_dic, gtf_junctions, mate_chr, mate_pos, mate_cigar_str)
    junctions += mate_junctions
    junction_pair = len(junctions) > 0
    if junction_pair:
        gtf_junction_pair = True
        if junction_read and not gtf_junction_read:
            gtf_junction_pair = False
        if mate_junction_read and not mate_gtf_junction_read:
            gtf_junction_pair = False
    else:
        gtf_junction_pair = False

    return junctions, junction_pair, gtf_junction_pair


"""
|
|
"""
|
|
def getSNPs(chr_snps, left, right):
|
|
low, high = 0, len(chr_snps)
|
|
while low < high:
|
|
mid = (low + high) / 2
|
|
snpID, type, pos, data = chr_snps[mid]
|
|
if pos < left:
|
|
low = mid + 1
|
|
else:
|
|
high = mid - 1
|
|
|
|
snps = []
|
|
for i in xrange(low, len(chr_snps)):
|
|
snp = chr_snps[i]
|
|
snpID, type, pos, data = snp
|
|
pos2 = pos
|
|
if type == "deletion":
|
|
pos2 += data
|
|
if pos2 >= right:
|
|
break
|
|
if pos >= left:
|
|
if len(snps) > 0:
|
|
_, prev_type, prev_pos, prev_data = snps[-1]
|
|
assert prev_pos <= pos
|
|
prev_pos2 = prev_pos
|
|
if prev_type == "deletion":
|
|
prev_pos2 += prev_data
|
|
if pos <= prev_pos2:
|
|
continue
|
|
snps.append(snp)
|
|
|
|
return snps
|
|
|
|
"""
|
|
"""
|
|
def check_snps(snps, check_type, ref_pos, read_seq):
|
|
found = False
|
|
|
|
for snp in snps:
|
|
snp_type, snp_pos, snp_data = snp[1:4]
|
|
|
|
if snp_type == check_type:
|
|
if snp_type == 'single':
|
|
if snp_pos == ref_pos and snp_data[0] == read_seq[0]:
|
|
found = True
|
|
break
|
|
elif snp_type == 'insertion':
|
|
if snp_pos == ref_pos and snp_data == read_seq:
|
|
found = True
|
|
break
|
|
|
|
elif snp_type == 'deletion':
|
|
# snp_data and read_seq are length of sequence deleted
|
|
if snp_pos == ref_pos and int(snp_data) == int(read_seq):
|
|
found = True
|
|
break
|
|
|
|
return found
|
|
|
|
|
|
def extract_reads_and_pairs(chr_dic, sam_filename, read_filename, pair_filename, unmapped_read_1_fq_name, unmapped_read_2_fq_name, snps_dict = None):
    temp_read_filename, temp_pair_filename = read_filename + ".temp", pair_filename + ".temp"
    temp_read_file, temp_pair_file = open(temp_read_filename, "w"), open(temp_pair_filename, "w")

    unmapped_read_1_fq, unmapped_read_2_fq = open(unmapped_read_1_fq_name, "w"), open(unmapped_read_2_fq_name, "w")
    hisat2 = read_filename.find("hisat2") != -1 or pair_filename.find("hisat2") != -1
    vg = read_filename.find("vg") != -1 or pair_filename.find("vg") != -1

    read_dic = {}
    prev_read_id, prev_read_seq = "", ""
    sam_file = open(sam_filename, "r")
    for line in sam_file:
        if line[0] == "@":
            continue

        fields = line[:-1].split()
        read_id, flag, chr, pos, mapQ, cigar_str, mate_chr, mate_pos, template_len, read_seq, read_qual = fields[:11]
        if 'H' in cigar_str:
            continue

        flag, pos, mate_pos = int(flag), int(pos), int(mate_pos)
        if read_seq == "*" and prev_read_id == read_id:
            read_seq = prev_read_seq
        read_seq = read_seq.upper()
        if read_seq == "" or read_seq == "*":
            continue

        if flag & 0x04 != 0 or \
                chr == "*" or \
                cigar_str == "*":
            """
            if flag & 0x80 != 0:
                print >> unmapped_read_2_fq, "@%s\n%s\n+%s\n%s" % (read_id, read_seq, read_id, read_qual)
            else:
                print >> unmapped_read_1_fq, "@%s\n%s\n+%s\n%s" % (read_id, read_seq, read_id, read_qual)
            """
            continue

        if mate_chr == '=':
            mate_chr = chr

        if len(read_id) >= 3 and read_id[-2] == "/":
            read_id = read_id[:-2]

        if read_id.find("seq.") == 0:
            read_id = read_id[4:]

        if read_id != prev_read_id:
            read_dic = {}

        HISAT2_XM, HISAT2_NM = 0, 0
        if hisat2:
            for field in fields[11:]:
                if field[:5] == "XM:i:":
                    HISAT2_XM = int(field[5:])
                elif field[:5] == "NM:i:":
                    HISAT2_NM = int(field[5:])

        prev_read_id = read_id
        prev_read_seq = read_seq

        if snps_dict != None and chr in snps_dict:
            chr_snps = snps_dict[chr]
        else:
            chr_snps = []

        snps = None

        XM, gap = 0, 0
        read_pos, right_pos = 0, pos - 1
        junction_read = False

        cigars = cigar_re.findall(cigar_str)
        for i in range(len(cigars)):
            cigar = cigars[i]
            length = int(cigar[:-1])
            cigar_op = cigar[-1]

            if cigar_op == "S":
                if i != 0 and i != len(cigars) - 1:
                    print >> sys.stderr, "S is located at %dth out of %d %s" % (i+1, len(cigars), cigar_str)

            if cigar_op in "MS":
                ref_pos = right_pos
                if cigar_op == "S" and i == 0:
                    ref_seq = chr_dic[chr][right_pos-length:right_pos]
                    ref_pos = right_pos - length
                else:
                    ref_seq = chr_dic[chr][right_pos:right_pos+length]

                ref_seq = ref_seq.upper()
                if length == len(ref_seq):
                    for j in range(length):
                        if ref_seq[j] != "N" and read_seq[read_pos+j] != ref_seq[j]:
                            if snps_dict == None:
                                XM += 1
                            else:
                                if snps == None:
                                    snps = getSNPs(chr_snps, pos - 1, pos + len(read_seq))

                                found_snp = check_snps(snps, 'single', ref_pos + j, read_seq[read_pos + j])
                                if not found_snp:
                                    XM += 1

                            if hisat2 and cigar_op == "S":
                                HISAT2_XM += 1
                                HISAT2_NM += 1
                else:
                    XM += length

            if cigar_op in "I":
                if snps == None:
                    snps = getSNPs(chr_snps, pos - 1, pos + len(read_seq))
                found_snp = check_snps(snps, 'insertion', right_pos, read_seq[read_pos:read_pos + length])
                if not found_snp:
                    gap += length

            if cigar_op in "D":
                if snps == None:
                    snps = getSNPs(chr_snps, pos - 1, pos + len(read_seq))
                found_snp = check_snps(snps, 'deletion', right_pos, length)
                if not found_snp:
                    gap += length

            if cigar_op in "MND":
                right_pos += length

            if cigar_op in "MIS":
                read_pos += length

            if cigar_op == "N":
                junction_read = True

        NM = XM + gap
        if hisat2:
            XM, NM = HISAT2_XM, HISAT2_NM
        if NM < MAX_EDIT:
            print >> temp_read_file, "%s\t%d\t%s\t%s\t%s\tXM:i:%d\tNM:i:%d" % \
                (read_id, flag, chr, pos, cigar_str, XM, NM)

            found = False
            me = "%s\t%s\t%d" % (read_id, chr, pos)
            partner = "%s\t%s\t%d" % (read_id, mate_chr, mate_pos)
            if partner in read_dic:
                maps = read_dic[partner]
                for map in maps:
                    if map[0] == me:
                        mate_flag, mate_cigar_str, mate_XM, mate_NM = map[1:]
                        if mate_pos > pos:
                            flag, chr, pos, cigar_str, XM, NM, mate_flag, mate_chr, mate_pos, mate_cigar_str, mate_XM, mate_NM = \
                                mate_flag, mate_chr, mate_pos, mate_cigar_str, mate_XM, mate_NM, flag, chr, pos, cigar_str, XM, NM

                        print >> temp_pair_file, "%s\t%d\t%s\t%d\t%s\tXM:i:%d\tNM:i:%d\t%d\t%s\t%d\t%s\tXM:i:%d\tNM:i:%d" % \
                            (read_id, mate_flag, mate_chr, mate_pos, mate_cigar_str, mate_XM, mate_NM, flag, chr, pos, cigar_str, XM, NM)
                        found = True
                        break

            if not found:
                if not me in read_dic:
                    read_dic[me] = []

                read_dic[me].append([partner, flag, cigar_str, XM, NM])

    sam_file.close()

    temp_read_file.close()
    temp_pair_file.close()

    unmapped_read_1_fq.close()
    unmapped_read_2_fq.close()

    sort = False
    if vg:
        sort = True

    if sort:
        command = "sort %s | uniq > %s; rm %s" % (temp_read_filename, read_filename, temp_read_filename)
        os.system(command)

        command = "sort %s | uniq > %s; rm %s" % (temp_pair_filename, pair_filename, temp_pair_filename)
        os.system(command)
    else:
        command = "mv %s %s; mv %s %s" % (temp_read_filename, read_filename, temp_pair_filename, pair_filename)
        os.system(command)


def remove_redundant_junctions(junctions):
    temp_junctions = []
    for junction in junctions:
        temp_junctions.append(to_junction(junction))
    junctions = sorted(list(temp_junctions), cmp=junction_cmp)
    temp_junctions = []
    for can_junction in junctions:
        if len(temp_junctions) <= 0:
            temp_junctions.append(can_junction)
        else:
            chr, left, right = temp_junctions[-1]
            chr2, left2, right2 = can_junction
            if chr == chr2 and \
                    abs(left - left2) == abs(right - right2) and \
                    abs(left - left2) <= 10:
                continue
            temp_junctions.append(can_junction)
    junctions = set()
    for junction in temp_junctions:
        junctions.add(to_junction_str(junction))

    return junctions


def read_stat(read_filename, gtf_junctions, chr_dic = None, debug = False):
    read_stat = [[0, 0, 0] for i in range(MAX_EDIT)]
    temp_junctions = [set() for i in range(MAX_EDIT)]
    temp_gtf_junctions = [set() for i in range(MAX_EDIT)]

    alignment = []
    prev_read_id = ""
    read_file = open(read_filename, "r")
    for line in read_file:
        read_id, flag, chr, pos, cigar_str, XM, NM = line[:-1].split()
        flag, pos = int(flag), int(pos)
        XM, NM = int(XM[5:]), int(NM[5:])

        read_junctions, junction_read, gtf_junction_read = \
            is_junction_read(chr_dic, gtf_junctions, chr, pos, cigar_str)

        if junction_read:
            for junction_str, is_gtf_junction in read_junctions:
                if NM < len(temp_junctions):
                    temp_junctions[NM].add(junction_str)

                    if is_gtf_junction:
                        temp_gtf_junctions[NM].add(junction_str)

        if read_id != prev_read_id:
            if prev_read_id != "":
                NM2, junction_read2, gtf_junction_read2 = alignment
                if NM2 < len(read_stat):
                    read_stat[NM2][0] += 1

                    if junction_read2:
                        read_stat[NM2][1] += 1

                    if gtf_junction_read2:
                        read_stat[NM2][2] += 1

                alignment = []

            prev_read_id = read_id

        if not alignment:
            alignment = [NM, junction_read, gtf_junction_read]
        elif alignment[0] > NM or \
                (alignment[0] == NM and not alignment[2] and junction_read):
            alignment = [NM, junction_read, gtf_junction_read]

    read_file.close()

    for i in range(len(read_stat)):
        temp_junctions[i] = remove_redundant_junctions(temp_junctions[i])
        temp_gtf_junctions[i] = remove_redundant_junctions(temp_gtf_junctions[i])

    for i in range(len(read_stat)):
        read_stat[i].append(len(temp_junctions[i]))
        read_stat[i].append(len(temp_gtf_junctions[i]))

    if alignment:
        NM2, junction_read2, gtf_junction_read2 = alignment
        if NM2 < len(read_stat):
            read_stat[NM2][0] += 1

            if junction_read2:
                read_stat[NM2][1] += 1

            if gtf_junction_read2:
                read_stat[NM2][2] += 1

    return read_stat


def cal_read_len(cigar_str):
    length = 0
    leftmost_softclip = 0
    rightmost_softclip = 0
    cigars = cigar_re.findall(cigar_str)

    for i in range(len(cigars)):
        cigar = cigars[i]
        cigar_length = int(cigar[:-1])
        cigar_op = cigar[-1]

        if cigar_op in "MIS":
            length += cigar_length

        if (i == 0) and (cigar_op == "S"):
            leftmost_softclip = cigar_length
        if (i == (len(cigars) - 1)) and (cigar_op == "S"):
            rightmost_softclip = cigar_length

    return length, leftmost_softclip, rightmost_softclip


def is_concordantly(read_id, flag, chr, pos, cigar_str, XM, NM, mate_flag, mate_chr, mate_pos, mate_cigar_str, mate_XM, mate_NM):
    concord_length = 1000
    segment_length = sys.maxint

    pairs = {}
    pairs[0] = [flag, chr, pos, cigar_str, XM, NM]
    pairs[1] = [mate_flag, mate_chr, mate_pos, mate_cigar_str, mate_XM, mate_NM]

    if chr != mate_chr:
        return False, segment_length
    if (flag & 0x10 == 0x10) or (mate_flag & 0x10 == 0):
        return False, segment_length

    assert pos <= mate_pos

    left = pairs[0]
    right = pairs[1]

    left_start = left[2]
    left_len, _, _ = cal_read_len(left[3]) # cigar

    right_start = right[2]
    right_len, _, right_soft = cal_read_len(right[3])

    segment_length = (right_start + right_len) - left_start - right_soft
    assert segment_length >= 0

    if segment_length > concord_length:
        return False, segment_length

    return True, segment_length


def pair_stat(pair_filename, gtf_junctions, chr_dic):
    # pair_stat = [NM, junction_pair, gtf_junction, concordant_alignment]
    pair_stat = [[0, 0, 0, 0] for i in range(MAX_EDIT)]
    dis_pair_stat = [0 for i in range(MAX_EDIT)]
    temp_junctions = [set() for i in range(MAX_EDIT)]
    temp_gtf_junctions = [set() for i in range(MAX_EDIT)]

    alignment, dis_alignments = [], []
    prev_read_id = ""
    con_file = open(pair_filename + ".con", "w")
    discon_file = open(pair_filename + ".discon", "w")
    pair_file = open(pair_filename, "r")
    for line in pair_file:
        read_id, flag, chr, pos, cigar_str, XM, NM, mate_flag, mate_chr, mate_pos, mate_cigar_str, mate_XM, mate_NM = line[:-1].split()
        flag, pos, XM, NM, mate_flag, mate_pos, mate_XM, mate_NM = \
            int(flag), int(pos), int(XM[5:]), int(NM[5:]), int(mate_flag), int(mate_pos), int(mate_XM[5:]), int(mate_NM[5:])

        pair_XM = XM + mate_XM
        pair_NM = NM + mate_NM

        pair_junctions, junction_pair, gtf_junction_pair = \
            is_junction_pair(chr_dic, gtf_junctions, chr, pos, cigar_str, mate_chr, mate_pos, mate_cigar_str)

        # check concordance
        concord_align, segment_len = is_concordantly(read_id, flag, chr, pos, cigar_str, XM, NM, mate_flag, mate_chr, mate_pos, mate_cigar_str, mate_XM, mate_NM)
        print >> (con_file if concord_align else discon_file), line.strip(), ('none', 'first')[(flag & 0x40 == 0x40)], ('none', 'last')[(mate_flag & 0x80 == 0x80)], segment_len

        if junction_pair:
            for junction_str, is_gtf_junction in pair_junctions:
                if pair_NM < len(temp_junctions):
                    temp_junctions[pair_NM].add(junction_str)

                    if is_gtf_junction:
                        temp_gtf_junctions[pair_NM].add(junction_str)

        if read_id != prev_read_id:
            if prev_read_id != "":
                NM2, junction_read2, gtf_junction_read2, concord_align2 = alignment
                if NM2 < len(pair_stat):
                    pair_stat[NM2][0] += 1

                    if junction_read2:
                        pair_stat[NM2][1] += 1
                    if gtf_junction_read2:
                        pair_stat[NM2][2] += 1
                    if concord_align2:
                        pair_stat[NM2][3] += 1

                for NM2 in dis_alignments:
                    if NM2 < len(dis_pair_stat):
                        dis_pair_stat[NM2] += 1

                alignment = []
                dis_alignments = []

            prev_read_id = read_id

        if not alignment:
            alignment = [pair_NM, junction_pair, gtf_junction_pair, concord_align]
        elif alignment[0] > pair_NM or \
                (alignment[0] == pair_NM and not alignment[2] and junction_pair):
            alignment = [pair_NM, junction_pair, gtf_junction_pair, concord_align]

        if mate_chr != chr or ((flag & 0x10) != 0 or (mate_flag & 0x10) == 0):
            if len(dis_alignments) == 0:
                dis_alignments = [pair_NM]
            elif dis_alignments[0] > pair_NM:
                dis_alignments = [pair_NM]

    pair_file.close()
    con_file.close()
    discon_file.close()

    # process last line
    if alignment:
        NM2, junction_read2, gtf_junction_read2, concord_align2 = alignment
        if NM2 < len(pair_stat):
            pair_stat[NM2][0] += 1

            if junction_read2:
                pair_stat[NM2][1] += 1
            if gtf_junction_read2:
                pair_stat[NM2][2] += 1

            if concord_align2:
                pair_stat[NM2][3] += 1

    assert len(dis_alignments) <= 1
    for NM2 in dis_alignments:
        if NM2 < len(dis_pair_stat):
            dis_pair_stat[NM2] += 1

    for i in range(len(pair_stat)):
        temp_junctions[i] = remove_redundant_junctions(temp_junctions[i])
        temp_gtf_junctions[i] = remove_redundant_junctions(temp_gtf_junctions[i])

    for i in range(len(pair_stat)):
        pair_stat[i].append(len(temp_junctions[i]))
        pair_stat[i].append(len(temp_gtf_junctions[i]))

    return pair_stat, dis_pair_stat


def sql_execute(sql_db, sql_query):
    sql_cmd = [
        "sqlite3", sql_db,
        "-separator", "\t",
        "%s;" % sql_query
        ]
    # print >> sys.stderr, sql_cmd
    sql_process = subprocess.Popen(sql_cmd, stdout=subprocess.PIPE)
    output = sql_process.communicate()[0][:-1]
    return output


def create_sql_db(sql_db):
    if os.path.exists(sql_db):
        print >> sys.stderr, sql_db, "already exists!"
        return

    columns = [
        ["id", "integer primary key autoincrement"],
        ["reads", "text"],
        ["genome", "text"],
        ["end_type", "text"],
        ["aligner", "text"],
        ["version", "text"],
        ["use_annotation", "text"],
        ["edit_distance", "integer"],
        ["mapped_reads", "integer"],
        ["junction_reads", "integer"],
        ["gtf_junction_reads", "integer"],
        ["junctions", "integer"],
        ["gtf_junctions", "integer"],
        ["runtime", "real"],
        ["host", "text"],
        ["created", "text"],
        ["cmd", "text"]
        ]

    sql_create_table = "CREATE TABLE Mappings ("
    for i in range(len(columns)):
        name, type = columns[i]
        if i != 0:
            sql_create_table += ", "
        sql_create_table += ("%s %s" % (name, type))
    sql_create_table += ");"
    sql_execute(sql_db, sql_create_table)


def write_analysis_data(sql_db, database_name, paired):
    if not os.path.exists(sql_db):
        return

    if paired:
        paired = "paired"
    else:
        paired = "single"

    aligners = []
    sql_aligners = "SELECT aligner FROM Mappings WHERE end_type = '%s' GROUP BY aligner" % (paired)
    output = sql_execute(sql_db, sql_aligners)
    aligners = output.split()

    database_fname = database_name + "_" + paired + ".analysis"
    database_file = open(database_fname, "w")

    print >> database_file, "aligner\tuse_annotation\tend_type\tedit_distance\tmapped_reads\tjunction_reads\tgtf_junction_reads\tjunctions\tgtf_junctions\truntime"
    for aligner in aligners:
        for edit_distance in range(MAX_EDIT):
            sql_row = "SELECT aligner, use_annotation, end_type, edit_distance, mapped_reads, junction_reads, gtf_junction_reads, junctions, gtf_junctions, runtime FROM Mappings"
            sql_row += " WHERE reads = '%s' and aligner = '%s' and edit_distance = %d and end_type = '%s' ORDER BY created DESC LIMIT 1" % (database_name, aligner, edit_distance, paired)
            output = sql_execute(sql_db, sql_row)
            if output:
                print >> database_file, output

    database_file.close()


def calculate_read_cost(single_end,
                        paired_end,
                        test_aligners,
                        fresh,
                        runtime_only,
                        verbose):
    sql_db_name = "analysis.db"
    if not os.path.exists(sql_db_name):
        create_sql_db(sql_db_name)

    full_workdir = os.getcwd()
    workdir = full_workdir.split("/")[-1]

    num_cpus = multiprocessing.cpu_count()
    if num_cpus > 8:
        num_threads = min(8, num_cpus)
        desktop = False
    else:
        num_threads = min(3, num_cpus)
        desktop = True

    verbose = False
    sql_write = True
    is_large_file = False
    gz_file = False
    if os.path.exists("1.fq.gz"):
        gz_file = True
        if os.path.getsize("1.fq.gz") > (1024 * 1024 * 1024):
            is_large_file = True

    elif os.path.exists("1.fq"):
        gz_file = False
        if os.path.getsize("1.fq") > (2 * 1024 * 1024 * 1024):
            is_large_file = True

    else:
        assert False

    aligners = [
        # ["hisat2", "", "", "", ""],
        # ["hisat2", "", "", "", "--sensitive"],
        # ["hisat2", "", "", "", "--very-sensitive"],
        # ["hisat2", "", "", "", "-k 50 --score-min C,-50,0"],
        # ["hisat2", "", "snp", "", ""],
        # ["hisat2", "", "snp", "", "--sensitive"],
        # ["hisat2", "", "snp", "", "-k 50"],
        # ["hisat2", "", "", "205", ""],
        # ["hisat2", "", "snp", "205", ""],
        # ["hisat2", "", "snp_tran", "205", ""],
        # ["hisat2", "", "tran", "", ""],
        # ["hisat2", "x1", "snp", "", ""],
        # ["hisat2", "x1", "", "", ""],
        # ["hisat2", "x2", "", "", ""],
        # ["hisat2", "", "tran", "", ""],
        # ["hisat2", "", "snp_tran", "204", ""],
        # ["hisat2", "", "snp_tran", "", ""],
        # ["hisat2", "", "", "210", ""],
        ["hisat2", "", "rep", "", ""],
        # ["hisat2", "", "rep", "", "--read-lengths 101"],
        # ["hisat2", "", "rep", "", "--sensitive"],
        # ["hisat2", "", "rep-100-300", "", ""],
        # ["hisat2", "", "rep-101-300", "", "--sensitive"],
        # ["hisat2", "", "rep-101-300", "", "-k 10 --score-min C,-50,0"],
        # ["hisat2", "", "rep-150-300", "", ""],
        # ["tophat2", "", "", "", ""],
        # ["bowtie", "", "", "", ""],
        ["bowtie2", "", "", "", ""],
        # ["bowtie2", "", "", "", "-k 10"],
        ["bwa", "mem", "", "", ""],
        # ["bwa", "mem", "", "", "-a"],
        # ["bwa", "sw", "", "", ""],
        # ["star", "", "", "", ""],
        # ["star", "x2", "", "", ""],
        # ["vg", "", "", "", ""],
        # ["vg", "", "", "", "-M 10"],
        # ["vg", "", "snp", "", ""],
        # ["vg", "", "snp", "", "-M 10"],
        # ["minimap2", "", "", "", ""],
        ]

    # sql_write = False
    verbose = True
    debug = False

    genome = "genome"
    cwd = os.getcwd()
    RNA = (cwd.find("RNA") != -1)

    chr_dic = read_genome("../../../data/" + genome + ".fa")
    snp_dic = read_snp("../../../data/" + genome + ".snp")
    gtf_junction_strs = extract_splice_sites("../../../data/" + genome + ".gtf")
    gene = "no"
    gtf_junctions = []
    for junction_str in gtf_junction_strs:
        junction = to_junction(junction_str)
        gtf_junctions.append(junction)
    gtf_junctions = sorted(gtf_junctions, cmp=junction_cmp)

    print >> sys.stderr, "aligner\tuse_annotation\tend_type\tedit_distance\tmapped_reads\tjunction_reads\tgtf_junction_reads\tjunctions\tgtf_junctions\truntime"

    for paired in [False, True]:
        if not paired and not single_end:
            continue
        if paired and not paired_end:
            continue

        type_read1_fname = "1.fq"
        if gz_file:
            type_read1_fname += ".gz"

        if paired:
            type_read2_fname = "2.fq"
            if gz_file:
                type_read2_fname += ".gz"

        else:
            type_read2_fname = ""

        aligner_bin_base = "../../../../aligners/bin"
        def get_aligner_version(aligner):
            version = ""
            if aligner == "hisat2" or \
                    aligner == "hisat" or \
                    aligner == "bowtie" or \
                    aligner == "bowtie2":
                if version:
                    cmd = ["%s/%s_%s/%s" % (aligner_bin_base, aligner, version, aligner)]
                else:
                    cmd = ["%s/%s" % (aligner_bin_base, aligner)]
                cmd += ["--version"]
                cmd_process = subprocess.Popen(cmd, stdout=subprocess.PIPE)
                version = cmd_process.communicate()[0][:-1].split("\n")[0]
                version = version.split()[-1]
            elif aligner == "tophat2":
                cmd = ["%s/tophat" % (aligner_bin_base)]
                cmd += ["--version"]
                cmd_process = subprocess.Popen(cmd, stdout=subprocess.PIPE)
                version = cmd_process.communicate()[0][:-1].split()[-1]
            elif aligner in ["star", "starx2"]:
                version = "2.4.2a"
            elif aligner == "gsnap":
                cmd = ["%s/gsnap" % (aligner_bin_base)]
                cmd_process = subprocess.Popen(cmd, stderr=subprocess.PIPE)
                version = cmd_process.communicate()[1][:-1].split("\n")[0]
                version = version.split()[2]
            elif aligner == "bwa":
                if version:
                    cmd = ["%s/bwa_%s/bwa" % (aligner_bin_base, version)]
                else:
                    cmd = ["%s/bwa" % (aligner_bin_base)]
                cmd_process = subprocess.Popen(cmd, stderr=subprocess.PIPE)
                version = cmd_process.communicate()[1][:-1].split("\n")[2]
                version = version.split()[1]
            elif aligner == "vg":
                cmd = ["%s/vg" % (aligner_bin_base)]
                cmd_process = subprocess.Popen(cmd, stderr=subprocess.PIPE)
                version = cmd_process.communicate()[1][:-1].split("\n")[0]
                version = version.split()[5]
            elif aligner == "minimap2":
                cmd = ["%s/minimap2" % (aligner_bin_base)]
                cmd += ["--version"]
                cmd_process = subprocess.Popen(cmd, stdout=subprocess.PIPE)
                version = cmd_process.communicate()[0][:-1].split("\n")[0]

            return version

        index_base = "../../../../indexes"
        index_add = ""
        if genome != "genome":
            index_add = "_" + genome
        def get_aligner_cmd(RNA, aligner, type, index_type, version, options, read1_fname, read2_fname, out_fname, cmd_idx = 0):
            cmd = ["/usr/bin/time"]
            if osx_mode:
                cmd += ['-l']
            if aligner == "hisat2":
                if version:
                    cmd += ["%s/hisat2_%s/hisat2" % (aligner_bin_base, version)]
                else:
                    cmd += ["%s/hisat2" % (aligner_bin_base)]
                if num_threads > 1:
                    cmd += ["-p", str(num_threads)]

                # cmd += ["-k", "5"]
                # cmd += ["--score-min", "C,-18"]

                # daehwan - for debugging purposes
                # cmd += ["--score-min", "C,-50"]
                # cmd += ["--pen-cansplice", "0"]
                # cmd += ["--pen-noncansplice", "12"]
                # cmd += ["--pen-intronlen", "G,-8,1"]
                # cmd += ["--metrics", "1",
                #         "--metrics-file", "metrics.out"]

                if version == "204":
                    cmd += ["--sp", "2,1"]

                if not RNA:
                    cmd += ["--no-spliced-alignment"]

                if type in ["x1", "x2"]:
                    cmd += ["--no-temp-splicesite"]

                # DK - for debugging purposes
                # cmd += ["--dta"]
                """
                if index_type == "tran":
                    cmd += ["--no-anchorstop"]
                    cmd += ["-k", "100"]
                """

                if options != "":
                    cmd += options.split(' ')

                if type == "x2":
                    if cmd_idx == 0:
                        cmd += ["--novel-splicesite-outfile"]
                    else:
                        cmd += ["--novel-splicesite-infile"]
                    cmd += ["splicesites.txt"]

                # "--novel-splicesite-infile",
                # "../splicesites.txt",
                # "--rna-strandness",
                # "FR",
                if version:
                    index_cmd = "%s/HISAT2_%s%s/" % (index_base, version, index_add) + genome
                else:
                    index_cmd = "%s/HISAT2%s/" % (index_base, index_add) + genome
                if index_type:
                    index_cmd += ("_" + index_type)
                cmd += [index_cmd]
                if paired:
                    cmd += ["-1", read1_fname,
                            "-2", read2_fname]
                else:
                    cmd += ["-U", read1_fname]
            elif aligner == "hisat":
                cmd += ["%s/hisat" % (aligner_bin_base)]
                if num_threads > 1:
                    cmd += ["-p", str(num_threads)]
                # cmd += ["-k", "5"]
                # cmd += ["--score-min", "C,-18"]
                if version != "":
                    version = int(version)
                else:
                    version = sys.maxint

                if not RNA:
                    cmd += ["--no-spliced-alignment"]

                if type in ["x1", "x2"] or not RNA:
                    cmd += ["--no-temp-splicesite"]

                """
                cmd += ["--rdg", "100,100",
                        "--rfg", "100,100"]
                """

                if type == "x2":
                    if cmd_idx == 0:
                        cmd += ["--novel-splicesite-outfile"]
                    else:
                        cmd += ["--novel-splicesite-infile"]
                    cmd += ["splicesites.txt"]

                # "--novel-splicesite-infile",
                # "../splicesites.txt",
                # "--rna-strandness",
                # "FR",
                cmd += ["%s/HISAT%s/" % (index_base, index_add) + genome]
                if paired:
                    cmd += ["-1", read1_fname,
                            "-2", read2_fname]
                else:
                    cmd += [read1_fname]
            elif aligner == "tophat2":
                cmd += ["%s/tophat" % (aligner_bin_base)]
                if num_threads > 1:
                    cmd += ["-p", str(num_threads)]
                cmd += ["--read-edit-dist", "3"]
                cmd += ["--no-sort-bam"]
                cmd += ["--read-realign-edit-dist", "0"]
                cmd += ["--keep-tmp",
                        "%s/HISAT%s/" % (index_base, index_add) + genome,
                        read1_fname]
                if paired:
                    cmd += [read2_fname]
            elif aligner == "star":
                cmd += ["%s/STAR" % (aligner_bin_base)]
                if num_threads > 1:
                    cmd += ["--runThreadN", str(num_threads)]
                if type == "x2" and cmd_idx == 1:
                    cmd += ["--genomeDir", "."]
                else:
                    cmd += ["--genomeDir", "%s/STAR%s" % (index_base, index_add)]
                if desktop:
                    cmd += ["--genomeLoad", "NoSharedMemory"]
                else:
                    cmd += ["--genomeLoad", "LoadAndKeep"]
                if type == "x2":
                    if cmd_idx == 1:
                        cmd += ["--alignSJDBoverhangMin", "1"]
                cmd += ["--readFilesIn",
                        read1_fname]
                if paired:
                    cmd += [read2_fname]
                if paired:
                    cmd += ["--outFilterMismatchNmax", "6"]
                else:
                    cmd += ["--outFilterMismatchNmax", "3"]
            elif aligner == "bowtie":
                cmd += ["%s/bowtie" % (aligner_bin_base)]
                if num_threads > 1:
                    cmd += ["-p", str(num_threads)]
                cmd += ["--sam",
                        "-k", "10"]
                cmd += ["-n", "3"]
                if paired:
                    cmd += ["-X", "500"]
                cmd += ["%s/Bowtie%s/" % (index_base, index_add) + genome]
                if paired:
                    cmd += ["-1", read1_fname,
                            "-2", read2_fname]
                else:
                    cmd += [read1_fname]
            elif aligner == "bowtie2":
                if version:
                    cmd += ["%s/bowtie2_%s/bowtie2" % (aligner_bin_base, version)]
                else:
                    cmd += ["%s/bowtie2" % (aligner_bin_base)]
                if num_threads > 1:
                    cmd += ["-p", str(num_threads)]
                # cmd += ["-k", "10"]
                # cmd += ["--score-min", "C,-18"]
                cmd += ["-X", "1000"]

                if options:
                    cmd += options.split(' ')

                if version:
                    cmd += ["-x %s/Bowtie2_%s%s/" % (index_base, version, index_add) + genome]
                else:
                    cmd += ["-x %s/Bowtie2%s/" % (index_base, index_add) + genome]
                if paired:
                    cmd += ["-1", read1_fname,
                            "-2", read2_fname]
                else:
                    cmd += [read1_fname]
            elif aligner == "gsnap":
                cmd += ["%s/gsnap" % (aligner_bin_base),
                        "-A",
                        "sam"]
                if num_threads > 1:
                    cmd += ["-t", str(num_threads)]
                cmd += ["--max-mismatches=3",
                        "-D", "%s/GSNAP%s" % (index_base, index_add),
                        "-N", "1",
                        "-d", genome,
                        read1_fname]
                if paired:
                    cmd += [read2_fname]
            elif aligner == "bwa":
                if version:
                    cmd += ["%s/bwa_%s/bwa" % (aligner_bin_base, version)]
                else:
                    cmd += ["%s/bwa" % (aligner_bin_base)]
                if type in ["mem", "aln"]:
                    cmd += [type]
                elif type == "sw":
                    cmd += ["bwa" + type]
                if num_threads > 1:
                    cmd += ["-t", str(num_threads)]
                if options:
                    cmd += options.split(' ')
                if version:
                    cmd += ["%s/BWA_%s%s/%s.fa" % (index_base, version, index_add, genome)]
                else:
                    cmd += ["%s/BWA%s/%s.fa" % (index_base, index_add, genome)]
                cmd += [read1_fname]
                if paired:
                    cmd += [read2_fname]
            elif aligner == "vg":
                # vg map -d 22 -t 6 -M 10 -f ../sim-1.fa -f ../sim-2.fa --surject-to sam > result.sam
                cmd += ["%s/vg" % (aligner_bin_base)]
                cmd += ["map"]
                cmd += ["-t", str(num_threads)]
                cmd += ["--surject-to", "sam"]
                index_cmd = "%s/VG%s/" % (index_base, index_add) + genome
                if index_type:
                    index_cmd += ("_" + index_type)

                if options:
                    cmd += options.split(' ')

                cmd += ["-d", index_cmd]

                cmd += ["-f", read1_fname]
                if paired:
                    cmd += ["-f", read2_fname]

            elif aligner == "minimap2":
                # minimap2 -a -x sr 22.mmi sim_1.fa sim_2.fa > result.sam
                cmd += ["%s/minimap2" % (aligner_bin_base)]
                cmd += ["-a"]
                cmd += ["-x", "sr"]
                index_cmd = "%s/minimap2%s/" % (index_base, index_add) + genome
                if index_type:
                    index_cmd += ("_" + index_type)
                index_cmd += ".mmi"
                cmd += [index_cmd]
                cmd += [read1_fname]
                if paired:
                    cmd += [read2_fname]
            else:
                assert False

            return cmd
        for aligner, type, index_type, version, options in aligners:
            skip = False
            if len(test_aligners) > 0:
                skip = True
                for test_aligner in test_aligners:
                    if aligner == test_aligner:
                        skip = False
            if skip:
                continue

            aligner_name = aligner + type + version
            if (aligner == "hisat2" or aligner == "vg") and index_type != "":
                aligner_name += ("_" + index_type)

            if options != "":
                option_name = options.replace(' ', '').replace('-', '').replace(',', '')
                aligner_name = aligner_name + '_' + option_name
            if paired:
                aligner_dir = aligner_name + "_paired"
            else:
                aligner_dir = aligner_name + "_single"

            if fresh and os.path.exists(aligner_dir):
                os.system("rm -rf %s" % aligner_dir)

            if not os.path.exists(aligner_dir):
                os.mkdir(aligner_dir)
            os.chdir(aligner_dir)

            out_fname = "accepted.sam"
            aligner_cmd = get_aligner_cmd(RNA, aligner, type, index_type, version, options, "../" + type_read1_fname, "../" + type_read2_fname, out_fname)
            duration = 0.1
            mem_usage = ''
            if not os.path.exists(out_fname):
                if not os.path.exists("../one.fq") or not os.path.exists("../two.fq"):
                    if gz_file:
                        os.system("gzip -cd ../1.fq.gz | head -400 > ../one.fq")
                        os.system("gzip -cd ../2.fq.gz | head -400 > ../two.fq")
                    else:
                        os.system("head -400 ../1.fq > ../one.fq")
                        os.system("head -400 ../2.fq > ../two.fq")

                # dummy commands for caching index
                loading_time = 0
                if aligner not in ["tophat2"]:
                    for i in range(3):
                        dummy_cmd = get_aligner_cmd(RNA, aligner, type, index_type, version, options, "../one.fq", "../two.fq", "/dev/null")
                        start_time = datetime.now()
                        if verbose:
                            print >> sys.stderr, start_time, "\t", " ".join(dummy_cmd)
                        if aligner in ["hisat2", "hisat", "bowtie", "bowtie2", "gsnap", "bwa"]:
                            proc = subprocess.Popen(dummy_cmd, stdout=open("/dev/null", "w"), stderr=subprocess.PIPE)
                        else:
                            proc = subprocess.Popen(dummy_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                        proc.communicate()
                        finish_time = datetime.now()
                        duration = finish_time - start_time
                        duration = duration.total_seconds()
                        if verbose:
                            print >> sys.stderr, finish_time, "duration:", duration
                        loading_time = duration

                # align all reads
                if paired:
                    sweep_read_cmd = "cat ../%s ../%s > /dev/null" % (type_read1_fname, type_read2_fname)
                else:
                    sweep_read_cmd = "cat ../%s > /dev/null" % (type_read1_fname)
                print >> sys.stderr, datetime.now(), "\t", sweep_read_cmd
                os.system(sweep_read_cmd)

                skip_alignment = False
                if paired and aligner == "olego" and os.path.exists(out_fname + "1"):
                    skip_alignment = True

                if not skip_alignment:
                    aligner_cmd = get_aligner_cmd(RNA, aligner, type, index_type, version, options, "../" + type_read1_fname, "../" + type_read2_fname, out_fname)
                    start_time = datetime.now()
                    if verbose:
                        print >> sys.stderr, start_time, "\t", " ".join(aligner_cmd)
                    if aligner in ["hisat2", "hisat", "bowtie", "bowtie2", "gsnap", "bwa", "vg", "minimap2"]:
                        proc = subprocess.Popen(aligner_cmd, stdout=open(out_fname, "w"), stderr=subprocess.PIPE)
                    else:
                        proc = subprocess.Popen(aligner_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                    _, mem_usage = proc.communicate()
                    mem_usage = parse_mem_usage(mem_usage)
                    finish_time = datetime.now()
                    duration = finish_time - start_time
                    duration = duration.total_seconds() - loading_time
                    if duration < 0.1:
                        duration = 0.1
                    if verbose:
                        print >> sys.stderr, finish_time, "duration:", duration

                    if verbose:
                        print >> sys.stderr, finish_time, "Memory Usage: %dMB" % (int(mem_usage) / 1024)

                if debug and aligner == "hisat" and type == "x1":
                    os.system("cat metrics.out")
                    print >> sys.stderr, "\ttime: %.4f" % (duration)
                # break

                if aligner == "star" and type in ["", "gtf"]:
                    os.system("mv Aligned.out.sam %s" % out_fname)
                elif aligner in ["hisat2", "hisat"] and type == "x2":
                    aligner_cmd = get_aligner_cmd(RNA, aligner, type, index_type, version, options, "../" + type_read1_fname, "../" + type_read2_fname, out_fname, 1)
                    if verbose:
                        print >> sys.stderr, start_time, "\t", " ".join(aligner_cmd)
                    start_time = datetime.now()
                    proc = subprocess.Popen(aligner_cmd, stdout=open(out_fname, "w"), stderr=subprocess.PIPE)
                    proc.communicate()
                    finish_time = datetime.now()
                    duration += (finish_time - start_time).total_seconds()
                    duration -= loading_time
                    if duration < 0.1:
                        duration = 0.1
                    if verbose:
                        print >> sys.stderr, finish_time, "duration:", duration
                elif aligner == "star" and type == "x2":
                    assert os.path.exists("SJ.out.tab")
                    os.system("awk 'BEGIN {OFS=\"\t\"; strChar[0]=\".\"; strChar[1]=\"+\"; strChar[2]=\"-\";} {if($5>0){print $1,$2,$3,strChar[$4]}}' SJ.out.tab > SJ.out.tab.Pass1.sjdb")
                    for file in os.listdir("."):
                        if file in ["SJ.out.tab.Pass1.sjdb", "genome.fa"]:
                            continue
                        os.remove(file)
                    star_index_cmd = "STAR --genomeDir ./ --runMode genomeGenerate --genomeFastaFiles ../../../../data/genome.fa --sjdbFileChrStartEnd SJ.out.tab.Pass1.sjdb --sjdbOverhang 100 --runThreadN %d" % (num_threads)
                    print >> sys.stderr, "\t", datetime.now(), star_index_cmd
                    os.system(star_index_cmd)
                    if verbose:
                        print >> sys.stderr, "\t", datetime.now(), " ".join(dummy_cmd)
                    proc = subprocess.Popen(dummy_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                    proc.communicate()
                    if verbose:
                        print >> sys.stderr, "\t", datetime.now(), "finished"
                    aligner_cmd = get_aligner_cmd(RNA, aligner, type, index_type, version, options, "../" + type_read1_fname, "../" + type_read2_fname, out_fname, 1)
                    start_time = datetime.now()
                    if verbose:
                        print >> sys.stderr, "\t", start_time, " ".join(aligner_cmd)
                    proc = subprocess.Popen(aligner_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                    proc.communicate()
                    finish_time = datetime.now()
                    duration += (finish_time - start_time).total_seconds()
                    duration -= loading_time
                    if duration < 0.1:
                        duration = 0.1
                    if verbose:
                        print >> sys.stderr, "\t", finish_time, "finished:", duration
                    os.system("mv Aligned.out.sam %s" % out_fname)
                elif aligner == "tophat2":
                    os.system("samtools sort -n tophat_out/accepted_hits.bam accepted_hits; samtools view -h accepted_hits.bam > %s" % out_fname)
                elif aligner == "vg":
                    index_name = '%s/VG%s/' % (index_base, index_add) + genome
                    if index_type:
                        index_name += ('_' + index_type)

                os.system("echo %s %s %f >> runtime" % (str(datetime.now()), aligner, duration))

                if aligner in ["star", "tophat2", "gsnap"]:
                    os.system("tar cvzf %s.tar.gz %s &> /dev/null" % (out_fname, out_fname))

            if runtime_only:
                os.chdir("..")
                continue

            suffix = aligner
            read_sam, pair_sam = suffix + ".read.sam", suffix + ".pair.sam"
            unmapped_read_1_fq, unmapped_read_2_fq = suffix + ".unmapped.1.fq", suffix + ".unmapped.2.fq"
            if not os.path.exists(read_sam) or not os.path.exists(pair_sam):
                if index_type == 'snp':
                    extract_reads_and_pairs(chr_dic, out_fname, read_sam, pair_sam, unmapped_read_1_fq, unmapped_read_2_fq, snp_dic)
                else:
                    extract_reads_and_pairs(chr_dic, out_fname, read_sam, pair_sam, unmapped_read_1_fq, unmapped_read_2_fq)

            out = ''
            if gz_file:
                out = subprocess.check_output("gzip -cd ../%s | wc -l" % type_read1_fname, shell=True)
            else:
                out = subprocess.check_output("wc -l ../%s" % type_read1_fname, shell=True)

            numreads = int(out.split()[0]) / 4

            done_filename = suffix + ".done"
            if not os.path.exists(done_filename):
                done_file = open(done_filename, "w")
                if paired:
                    sum = [0, 0, 0, 0, 0, 0] # mapped_reads, junction_reads, gtf_junction_reads, concord_mapped_reads, num_junctions, num_gtf_junctions
                    dis_sum = 0
                    stat, dis_stat = pair_stat(pair_sam, gtf_junctions, chr_dic)
                    output = ""
                    for i in range(len(stat)):
                        for j in range(len(sum)):
                            sum[j] += stat[i][j]

                        dis_sum += dis_stat[i]
                        mapped_reads, junction_reads, gtf_junction_reads, concord_mapped_read, num_junctions, num_gtf_junctions = sum
                        output += "%s\t%s\tpaired\t%d\t%d\t%.2f%%\t%d\t%d\t%d\t%d\t%f\t%d\t%d\t%.2f%%\n" % \
                            (aligner_name, gene, i, mapped_reads, float(mapped_reads) * 100.0 / numreads, junction_reads, gtf_junction_reads, num_junctions, num_gtf_junctions, duration, (numreads / max(1.0, duration)), concord_mapped_read, float(concord_mapped_read) * 100.0 / numreads)

                        if sql_write and os.path.exists("../" + sql_db_name):
                            sql_insert = "INSERT INTO \"Mappings\" VALUES(NULL, '%s', '%s', '%s', '%s', '%s', '%s', %d, %d, %d, %d, %d, %d, %f, '%s', datetime('now', 'localtime'), '%s');" % \
                                (workdir, genome, "paired", aligner_name, get_aligner_version(aligner), "no", i, mapped_reads, junction_reads, gtf_junction_reads, num_junctions, num_gtf_junctions, duration, platform.node(), " ".join(aligner_cmd))
                            sql_execute("../" + sql_db_name, sql_insert)

                    print >> sys.stderr, output,
                    print >> done_file, output
                else:
                    sum = [0, 0, 0, 0, 0]
                    stat = read_stat(read_sam, gtf_junctions, chr_dic)
                    output = ""
                    for i in range(len(stat)):
                        for j in range(len(sum)):
                            sum[j] += stat[i][j]

                        mapped_reads, junction_reads, gtf_junction_reads, num_junctions, num_gtf_junctions = sum
                        output += "%s\t%s\tsingle\t%d\t%d\t%.2f%%\t%d\t%d\t%d\t%d\t%f\t%d\n" % \
                            (aligner_name, gene, i, mapped_reads, float(mapped_reads) * 100.0 / numreads, junction_reads, gtf_junction_reads, num_junctions, num_gtf_junctions, duration, (numreads / max(1.0, duration)))

                        if sql_write and os.path.exists("../" + sql_db_name):
                            sql_insert = "INSERT INTO \"Mappings\" VALUES(NULL, '%s', '%s', '%s', '%s', '%s', '%s', %d, %d, %d, %d, %d, %d, %f, '%s', datetime('now', 'localtime'), '%s');" % \
                                (workdir, genome, "single", aligner_name, get_aligner_version(aligner), "no", i, mapped_reads, junction_reads, gtf_junction_reads, num_junctions, num_gtf_junctions, duration, platform.node(), " ".join(aligner_cmd))
                            sql_execute("../" + sql_db_name, sql_insert)

                    print >> sys.stderr, output,
                    print >> done_file, output

                done_file.close()

            os.chdir("..")

        if os.path.exists(sql_db_name):
            write_analysis_data(sql_db_name, workdir, paired)


if __name__ == "__main__":
    parser = ArgumentParser(
        description='test HISAT2, and compare HISAT2 with other popular aligners such as TopHat2, STAR, Bowtie1/2, GSNAP, BWA-mem, etc.')
    parser.add_argument('--single-end',
                        dest='paired_end',
                        action='store_false',
                        help='run single-end only')
    parser.add_argument('--paired-end',
                        dest='single_end',
                        action='store_false',
                        help='run paired-end only')
    parser.add_argument('--aligner-list',
                        dest='aligner_list',
                        type=str,
                        default="",
                        help='comma-separated list of aligners (e.g. hisat2,bowtie2,bwa)')
    parser.add_argument('--fresh',
                        dest='fresh',
                        action='store_true',
                        help='delete existing alignment related directories (e.g. hisat2_single)')
    parser.add_argument('--runtime-only',
                        dest='runtime_only',
                        action='store_true',
                        help='run programs without evaluation')
    parser.add_argument('-v', '--verbose',
                        dest='verbose',
                        action='store_true',
                        help='also print some statistics to stderr')

    args = parser.parse_args()

    aligners = []
    for aligner in args.aligner_list.split(','):
        if aligner == "":
            continue
        aligners.append(aligner)

    calculate_read_cost(args.single_end,
                        args.paired_end,
                        aligners,
                        args.fresh,
                        args.runtime_only,
                        args.verbose)
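
# Example invocation (a sketch only; the script file name and directory layout
# below are assumptions, not confirmed by this file). The flags themselves are
# the ones defined by the ArgumentParser above. The script expects to be run
# from a read-set directory containing 1.fq/2.fq (or 1.fq.gz/2.fq.gz), with
# ../../../data/genome.{fa,snp,gtf}, ../../../../aligners/bin, and
# ../../../../indexes populated as referenced in calculate_read_cost():
#
#   python calculate_read_cost.py --aligner-list hisat2,bowtie2,bwa --paired-end --verbose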