2835 lines
105 KiB
Python
2835 lines
105 KiB
Python
#!/usr/bin/env python
|
|
|
|
import sys, os, subprocess
|
|
import multiprocessing
|
|
import string, re
|
|
import platform
|
|
from datetime import datetime, date, time
|
|
import copy
|
|
from argparse import ArgumentParser, FileType
|
|
from multiprocessing import Process
|
|
import bisect
|
|
|
|
mp_mode = False
|
|
mp_num = 1
|
|
|
|
cigar_re = re.compile('\d+\w')
|
|
|
|
osx_mode = False
|
|
if sys.platform == 'darwin':
|
|
osx_mode = True
|
|
|
|
"""
|
|
"""
|
|
def parse_mem_usage(resource):
|
|
if osx_mode:
|
|
resource = resource.strip().split('\n')
|
|
for line in resource:
|
|
if line.find('maximum resident set size') != -1:
|
|
return int(line.split()[0]) / 1024
|
|
else:
|
|
resource = resource.split(' ')
|
|
for line in resource:
|
|
idx = line.find('maxresident')
|
|
if idx != -1:
|
|
return line[:idx]
|
|
|
|
return '0'
|
|
|
|
|
|
"""
|
|
"""
|
|
def reverse_complement(seq):
|
|
result = ""
|
|
for nt in seq:
|
|
base = nt
|
|
if nt == 'A':
|
|
base = 'T'
|
|
elif nt == 'a':
|
|
base = 't'
|
|
elif nt == 'C':
|
|
base = 'G'
|
|
elif nt == 'c':
|
|
base = 'g'
|
|
elif nt == 'G':
|
|
base = 'C'
|
|
elif nt == 'g':
|
|
base = 'c'
|
|
elif nt == 'T':
|
|
base = 'A'
|
|
elif nt == 't':
|
|
base = 'a'
|
|
|
|
result = base + result
|
|
|
|
return result
|
|
|
|
|
|
"""
|
|
RepeatDB
|
|
"""
|
|
|
|
class RepeatAllele:
|
|
def __init__(self):
|
|
self.repeat_name = ''
|
|
self.allele_idx = 0
|
|
self.repeat_pos = 0
|
|
self.repeat_length = 0
|
|
self.positions = []
|
|
return
|
|
|
|
def __repr__(self):
|
|
return '[' + ','.join([str(self.repeat_name), str(self.allele_idx), str(self.repeat_pos), str(self.repeat_length), str(len(self.positions))]) + ']'
|
|
|
|
def add_position(self, chr, pos, strand):
|
|
self.positions.append([chr, pos, strand])
|
|
|
|
|
|
def __lt__(self, other):
|
|
if self.repeat_pos < other.repeat_pos:
|
|
return True
|
|
elif self.repeat_pos == other.repeat_pos:
|
|
return self.repeat_length < other.repeat_length
|
|
else:
|
|
return False
|
|
|
|
class Repeat:
|
|
def __init__(self):
|
|
self.repeat_name = ''
|
|
self.repeat_length = 0
|
|
self.repeat_pos = 0
|
|
self.allele = []
|
|
return
|
|
|
|
def add_allele(self, allele_idx, repeatAllele):
|
|
#self.allele[allele_idx] = repeatAllele
|
|
self.allele.append(repeatAllele)
|
|
|
|
def allele_sort(self):
|
|
self.allele = sorted(self.allele)
|
|
|
|
def cmp_repeatmap(a, b):
|
|
if a[0] < b[0]:
|
|
return -1
|
|
elif a[0] == b[0]:
|
|
return 0
|
|
else:
|
|
return 1
|
|
|
|
def read_len_cigar(cigar_str):
|
|
cigars = cigar_re.findall(cigar_str)
|
|
cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars]
|
|
|
|
read_len = 0
|
|
for cigar in cigars:
|
|
cigar_op, length = cigar
|
|
if cigar_op in "MISH":
|
|
read_len += int(length)
|
|
|
|
return read_len
|
|
|
|
def read_repeatdb(repeat_filename):
|
|
repeat_db = {}
|
|
|
|
if os.path.exists(repeat_filename):
|
|
|
|
for line in open(repeat_filename, 'r'):
|
|
|
|
if line[0] == '>':
|
|
line = line.strip()[1:]
|
|
name, rptRefName, pos, rep_len, _, _ = line.split()[:6]
|
|
pos = int(pos)
|
|
rep_len = int(rep_len)
|
|
rptName, allele_idx = name.split('*')[0:2]
|
|
allele_idx = int(allele_idx)
|
|
|
|
repeatAllele = RepeatAllele()
|
|
repeatAllele.repeat_name = rptName
|
|
repeatAllele.allele_idx = allele_idx
|
|
repeatAllele.repeat_pos = pos
|
|
repeatAllele.repeat_length = rep_len
|
|
|
|
if rptRefName not in repeat_db:
|
|
# new rptRefName
|
|
repeat_db[rptRefName] = {}
|
|
|
|
if rptName not in repeat_db[rptRefName]:
|
|
# new rptName
|
|
assert allele_idx == 0
|
|
repeat_db[rptRefName][rptName] = Repeat()
|
|
repeat_db[rptRefName][rptName].repeat_name = rptName
|
|
|
|
repeat_db[rptRefName][rptName].add_allele(allele_idx, repeatAllele)
|
|
|
|
else:
|
|
coords = line.split()
|
|
for coord in coords:
|
|
chr, pos, strand = coord.split(':')
|
|
pos = int(pos)
|
|
|
|
repeat_db[rptRefName][rptName].allele[allele_idx].add_position(chr, pos, strand)
|
|
|
|
else:
|
|
print >> sys.stderr, 'Cannot open file', repeat_filename
|
|
|
|
|
|
print >> sys.stderr, 'Build repeatMap'
|
|
repeat_map = {}
|
|
for rptRefName, repeats in repeat_db.items():
|
|
#print 'Processing', rptRefName
|
|
repeat_pos_list = []
|
|
|
|
for repeatName, repeat in repeats.items():
|
|
#print 'Common Allele:', repeatName, repeat.repeat_name
|
|
repeat_left = sys.maxint
|
|
repeat_right = 0
|
|
|
|
#for allele_id, repeatAllele in repeat.allele.items():
|
|
for repeatAllele in repeat.allele:
|
|
left = repeatAllele.repeat_pos
|
|
right = left + repeatAllele.repeat_length
|
|
if left < repeat_left:
|
|
repeat_left = left
|
|
if right > repeat_right:
|
|
repeat_right = right
|
|
|
|
repeat.repeat_pos = repeat_left
|
|
repeat.repeat_length = repeat_right - repeat_left
|
|
|
|
#print repeat.allele
|
|
#repeat.allele_sort()
|
|
#print repeat.allele
|
|
|
|
#print repeat_left, repeat_right
|
|
|
|
repeat_pos_list.append((repeat_right, repeatName))
|
|
|
|
repeat_map[rptRefName] = sorted(repeat_pos_list, cmp=cmp_repeatmap)
|
|
#print repeat_map[rptRefName]
|
|
|
|
return repeat_db, repeat_map
|
|
|
|
|
|
def find_leftmost_pos(rmap, left):
|
|
pos = bisect.bisect_left(rmap, (left, None))
|
|
#print pos
|
|
|
|
if pos == len(rmap):
|
|
return pos
|
|
|
|
if rmap[pos][0] == left:
|
|
while pos < len(rmap):
|
|
if rmap[pos][0] != left:
|
|
break
|
|
pos += 1
|
|
|
|
return pos
|
|
|
|
def repeat_to_genome_pos(repeat_db, repeat_map, rptRefName, pos, cigar_str = ''):
|
|
assert rptRefName in repeat_db
|
|
readlen = read_len_cigar(cigar_str)
|
|
#readlen = 101
|
|
|
|
# pos in sam-result. pos is 1-based
|
|
left = pos - 1
|
|
right = left + readlen
|
|
|
|
repeats = repeat_db[rptRefName]
|
|
rmap = repeat_map[rptRefName]
|
|
|
|
#print len(rmap)
|
|
#print rmap
|
|
|
|
i = find_leftmost_pos(rmap, left)
|
|
if i >= len(rmap):
|
|
print >> sys.stderr, 'Value Error'
|
|
return
|
|
|
|
if right > rmap[i][0]:
|
|
print >> sys.stderr, 'Not repeat'
|
|
return
|
|
|
|
repeat = repeats[rmap[i][1]]
|
|
|
|
#print 'Allele Size:', len(repeat.allele)
|
|
#print repeat.allele
|
|
for allele in repeat.allele:
|
|
rpos = allele.repeat_pos
|
|
rlen = allele.repeat_length
|
|
|
|
if (left >= rpos) and (right <= rpos + rlen):
|
|
offset = left - rpos
|
|
for genome_pos in allele.positions:
|
|
print genome_pos[0], genome_pos[1] + offset + 1, genome_pos[2], genome_pos[1]
|
|
|
|
"""
|
|
"""
|
|
def read_genome(genome_filename):
|
|
chr_dic = {}
|
|
genome_file = open(genome_filename, "r")
|
|
|
|
chr_name, sequence = "", ""
|
|
for line in genome_file:
|
|
if line[0] == ">":
|
|
if chr_name and sequence:
|
|
chr_dic[chr_name] = sequence
|
|
|
|
chr_name = line[1:-1].split()[0]
|
|
sequence = ""
|
|
else:
|
|
sequence += line[:-1]
|
|
|
|
if chr_name and sequence:
|
|
chr_dic[chr_name] = sequence
|
|
|
|
genome_file.close()
|
|
|
|
print >> sys.stderr, "genome is loaded"
|
|
|
|
return chr_dic
|
|
|
|
|
|
"""
|
|
"""
|
|
def extract_splice_sites(gtf_fname):
|
|
trans = {}
|
|
|
|
gtf_file = open(gtf_fname)
|
|
# Parse valid exon lines from the GTF file into a dict by transcript_id
|
|
for line in gtf_file:
|
|
line = line.strip()
|
|
if not line or line.startswith('#'):
|
|
continue
|
|
if '#' in line:
|
|
line = line.split('#')[0].strip()
|
|
|
|
try:
|
|
chrom, source, feature, left, right, score, \
|
|
strand, frame, values = line.split('\t')
|
|
except ValueError:
|
|
continue
|
|
left, right = int(left), int(right)
|
|
|
|
if feature != 'exon' or left >= right:
|
|
continue
|
|
|
|
values_dict = {}
|
|
for attr in values.split(';')[:-1]:
|
|
attr, _, val = attr.strip().partition(' ')
|
|
values_dict[attr] = val.strip('"')
|
|
|
|
if 'gene_id' not in values_dict or \
|
|
'transcript_id' not in values_dict:
|
|
continue
|
|
|
|
transcript_id = values_dict['transcript_id']
|
|
if transcript_id not in trans:
|
|
trans[transcript_id] = [chrom, strand, [[left, right]]]
|
|
else:
|
|
trans[transcript_id][2].append([left, right])
|
|
|
|
gtf_file.close()
|
|
|
|
# Sort exons and merge where separating introns are <=5 bps
|
|
for tran, [chrom, strand, exons] in trans.items():
|
|
exons.sort()
|
|
tmp_exons = [exons[0]]
|
|
for i in range(1, len(exons)):
|
|
if exons[i][0] - tmp_exons[-1][1] <= 5:
|
|
tmp_exons[-1][1] = exons[i][1]
|
|
else:
|
|
tmp_exons.append(exons[i])
|
|
trans[tran] = [chrom, strand, tmp_exons]
|
|
|
|
# Calculate and print the unique junctions
|
|
junctions = set()
|
|
for chrom, strand, exons in trans.values():
|
|
for i in range(1, len(exons)):
|
|
junctions.add(to_junction_str([chrom, exons[i-1][1], exons[i][0]]))
|
|
|
|
return junctions
|
|
|
|
"""
|
|
"""
|
|
def read_repeat_info(repeat_filename):
|
|
repeat_info, repeat_dic = {}, {}
|
|
repeat_pos = {}
|
|
if os.path.exists(repeat_filename):
|
|
for line in open(repeat_filename):
|
|
if line[0] == ">":
|
|
line = line.strip()[1:]
|
|
allele, rep, pos, rep_len, _, _ = line.split()[:6]
|
|
pos, rep_len = int(pos), int(rep_len)
|
|
common_allele = allele.split('*')[0]
|
|
|
|
if rep not in repeat_info:
|
|
repeat_info[rep] = []
|
|
repeat_dic[rep] = {}
|
|
repeat_pos[rep] = {}
|
|
|
|
repeat_info[rep].append([allele, pos, rep_len])
|
|
if allele not in repeat_dic[rep]:
|
|
repeat_dic[rep][allele] = []
|
|
repeat_pos[rep][allele] = set()
|
|
else:
|
|
coords = line.split()
|
|
for coord in coords:
|
|
chr, pos, strand = coord.split(':')
|
|
pos = int(pos)
|
|
if pos in repeat_pos[rep][allele]:
|
|
continue
|
|
repeat_dic[rep][allele].append([chr, pos, strand])
|
|
repeat_pos[rep][allele].add(pos)
|
|
|
|
for rep, repeats in repeat_info.items():
|
|
def my_cmp(a, b):
|
|
if a[1] < b[1]:
|
|
return -1
|
|
elif a[1] == b[1]:
|
|
return a[2] - b[2]
|
|
else:
|
|
return 1
|
|
repeat_info[rep] = sorted(repeat_info[rep], cmp=my_cmp)
|
|
|
|
return repeat_info, repeat_dic
|
|
|
|
|
|
"""
|
|
"""
|
|
def find_repeat(repeat_info, pos):
|
|
if len(repeat_info) <= 0:
|
|
return -1
|
|
|
|
l, r = 0, len(repeat_info)
|
|
while l < r:
|
|
m = (l + r) / 2
|
|
_, rep_pos, rep_len, _ = repeat_info[m]
|
|
if rep_pos <= pos and pos < rep_pos + rep_len:
|
|
return m
|
|
elif pos < rep_pos:
|
|
r = m
|
|
else:
|
|
l = m + 1
|
|
|
|
return -1
|
|
|
|
def reverse_cigar(cigar_str):
|
|
cigars = cigar_re.findall(cigar_str)
|
|
cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars]
|
|
cigars[::-1]
|
|
|
|
read_len = 0
|
|
cigar_str = ""
|
|
for cigar in cigars:
|
|
cigar_op, length = cigar
|
|
cigar_str += ("%d%s" % (length, cigar_op))
|
|
if cigar_op in "MISH":
|
|
read_len += int(length)
|
|
|
|
return read_len, cigar_str
|
|
|
|
|
|
|
|
def to_junction_str(junction):
|
|
return "%s-%d-%d" % (junction[0], junction[1], junction[2])
|
|
|
|
|
|
def to_junction(junction_str):
|
|
fields = junction_str.split("-")
|
|
if len(fields) > 3:
|
|
chr, left, right = "-".join(fields[:-2]), fields[-2], fields[-1]
|
|
else:
|
|
assert len(fields) == 3
|
|
chr, left, right = fields
|
|
|
|
return [chr, int(left), int(right)]
|
|
|
|
|
|
"""
|
|
"""
|
|
def junction_cmp(a, b):
|
|
if a[0] != b[0]:
|
|
if a[0] < b[0]:
|
|
return -1
|
|
else:
|
|
return 1
|
|
|
|
if a[1] != b[1]:
|
|
if a[1] < b[1]:
|
|
return -1
|
|
else:
|
|
return 1
|
|
|
|
if a[2] != b[2]:
|
|
if a[2] < b[2]:
|
|
return -1
|
|
else:
|
|
return 1
|
|
|
|
return 0
|
|
|
|
|
|
"""
|
|
# chr and pos are assumed to be integers
|
|
"""
|
|
def get_junctions(chr, pos, cigar_str, min_anchor_len = 0, read_len = 100):
|
|
junctions = []
|
|
right_pos = pos
|
|
cigars = cigar_re.findall(cigar_str)
|
|
cigars = [[int(cigars[i][:-1]), cigars[i][-1]] for i in range(len(cigars))]
|
|
|
|
left_anchor_lens = []
|
|
cur_left_anchor_len = 0
|
|
for i in range(len(cigars)):
|
|
length, cigar_op = cigars[i]
|
|
if cigar_op in "MI":
|
|
cur_left_anchor_len += length
|
|
elif cigar_op == "N":
|
|
assert cur_left_anchor_len > 0
|
|
left_anchor_lens.append(cur_left_anchor_len)
|
|
cur_left_anchor_len = 0
|
|
|
|
for i in range(len(cigars)):
|
|
length, cigar_op = cigars[i]
|
|
if cigar_op == "N":
|
|
left, right = right_pos - 1, right_pos + length
|
|
|
|
if i > 0 and cigars[i-1][1] in "ID":
|
|
if cigars[i-1][1] == "I":
|
|
left += cigars[i-1][0]
|
|
else:
|
|
left -= cigars[i-1][0]
|
|
if i + 1 < len(cigars) and cigars[i+1][1] in "ID":
|
|
if cigars[i+1][1] == "I":
|
|
right -= cigars[i+1][0]
|
|
else:
|
|
right += cigars[i+1][0]
|
|
|
|
junction_idx = len(junctions)
|
|
assert junction_idx < len(left_anchor_lens)
|
|
left_anchor_len = left_anchor_lens[junction_idx]
|
|
assert left_anchor_len > 0 and left_anchor_len < read_len
|
|
right_anchor_len = read_len - left_anchor_len
|
|
if left_anchor_len >= min_anchor_len and right_anchor_len >= min_anchor_len:
|
|
junctions.append([chr, left, right])
|
|
|
|
if cigar_op in "MND":
|
|
right_pos += length
|
|
|
|
return junctions
|
|
|
|
def get_right(pos, cigars):
|
|
right_pos = pos
|
|
cigars = cigar_re.findall(cigars)
|
|
for cigar in cigars:
|
|
length = int(cigar[:-1])
|
|
cigar_op = cigar[-1]
|
|
if cigar_op in "MDN":
|
|
right_pos += length
|
|
|
|
return right_pos
|
|
|
|
def get_cigar_chars(cigars):
|
|
cigars = cigar_re.findall(cigars)
|
|
cigar_chars = ""
|
|
for cigar in cigars:
|
|
cigar_op = cigar[-1]
|
|
cigar_chars += cigar_op
|
|
|
|
return cigar_chars
|
|
|
|
|
|
"""
|
|
"""
|
|
def get_cigar_chars_MN(cigars):
|
|
cigars = cigar_re.findall(cigars)
|
|
cigar_chars = ""
|
|
for cigar in cigars:
|
|
cigar_op = cigar[-1]
|
|
if cigar_op in "MN":
|
|
if cigar_chars == "" or cigar_chars[-1] != cigar_op:
|
|
cigar_chars += cigar_op
|
|
|
|
return cigar_chars
|
|
|
|
|
|
"""
|
|
"""
|
|
def is_small_anchor_junction_read(cigars):
|
|
cigar_list = []
|
|
for cigar in cigar_re.findall(cigars):
|
|
cigar_op = cigar[-1]
|
|
cigar_len = int(cigar[:-1])
|
|
cigar_list.append([cigar_op, cigar_len])
|
|
|
|
if len(cigar_list) < 3:
|
|
return False
|
|
|
|
if cigar_list[0][0] != 'M' or cigar_list[-1][0] != 'M':
|
|
return False
|
|
|
|
if cigar_list[0][1] > 10 and cigar_list[-1][1] > 10:
|
|
return False
|
|
|
|
if cigar_list[1][0] != 'N' or cigar_list[-2][0] != 'N':
|
|
return False
|
|
|
|
return True
|
|
|
|
|
|
"""
|
|
"""
|
|
def is_canonical_junction(chr_dic, junction):
|
|
chr, left, right = junction
|
|
donor = chr_dic[chr][left:left+2]
|
|
acceptor = chr_dic[chr][right-3:right-1]
|
|
rev_donor = reverse_complement(acceptor)
|
|
rev_acceptor = reverse_complement(donor)
|
|
|
|
if (donor == "GT" and acceptor == "AG") or \
|
|
(rev_donor == "GT" and rev_acceptor == "AG"):
|
|
return True
|
|
|
|
return False
|
|
|
|
|
|
"""
|
|
"""
|
|
def is_small_exon_junction_read(cigars, min_exon_len = 23):
|
|
cigars = cigar_re.findall(cigars)
|
|
for i in range(1, len(cigars) - 1):
|
|
cigar = cigars[i]
|
|
cigar_op = cigar[-1]
|
|
cigar_len = int(cigar[:-1])
|
|
|
|
prev_op = cigars[i-1][-1]
|
|
next_op = cigars[i+1][-1]
|
|
|
|
if prev_op == 'N' and cigar_op == 'M' and next_op == 'N':
|
|
if cigar_len <= min_exon_len:
|
|
return True
|
|
|
|
return False
|
|
|
|
|
|
"""
|
|
"""
|
|
"""
|
|
def repeat_to_genome_alignment(repeat_info, repeat_dic, rep, pos, cigar_str = ""):
|
|
assert rep in repeat_info
|
|
left = pos - 1 # convert 1-based offset to zero-based
|
|
|
|
repeats = repeat_info[rep]
|
|
l, r = 0, len(repeats)
|
|
while l < r:
|
|
m = (l + r) / 2
|
|
rep_allele, rpos, rlen = repeats[m]
|
|
if left >= rpos and left < rpos + rlen:
|
|
while m > 0:
|
|
rep_allele, rpos, rlen = repeats[m-1]
|
|
if left < rpos:
|
|
break
|
|
m -= 1
|
|
break
|
|
if left < rpos:
|
|
r = m
|
|
else:
|
|
l = m + 1
|
|
|
|
alignments = []
|
|
while m < len(repeats):
|
|
rep_allele, rpos, rlen = repeats[m]
|
|
if left >= rpos + rlen:
|
|
m += 1
|
|
continue
|
|
|
|
if left < rpos:
|
|
break
|
|
|
|
assert rep_allele in repeat_dic[rep]
|
|
coords = repeat_dic[rep][rep_allele]
|
|
assert len(coords) > 0
|
|
for coord in coords:
|
|
cchr, cpos, cstrand = coord
|
|
adj_left = left - rpos
|
|
if cstrand == '+':
|
|
rep_left = cpos + adj_left
|
|
rep_cigar_str = cigar_str
|
|
else:
|
|
if cigar_str:
|
|
read_len, rep_cigar_str = reverse_cigar(cigar_str)
|
|
else:
|
|
read_len, rep_cigar_str = 0, ""
|
|
rc_adj_left = rlen - adj_left - read_len;
|
|
rep_left = cpos + rc_adj_left
|
|
|
|
alignments.append([cchr, rep_left + 1, rep_cigar_str])
|
|
m += 1
|
|
|
|
return alignments
|
|
"""
|
|
def repeat_to_genome_alignment(repeat_db, repeat_map, rptRefName, pos, cigar_str = ''):
|
|
assert rptRefName in repeat_db
|
|
readlen = read_len_cigar(cigar_str)
|
|
#readlen = 101
|
|
|
|
# pos in sam-result. pos is 1-based
|
|
left = pos - 1
|
|
right = left + readlen
|
|
|
|
repeats = repeat_db[rptRefName]
|
|
rmap = repeat_map[rptRefName]
|
|
|
|
#print len(rmap)
|
|
#print rmap
|
|
|
|
alignments = []
|
|
|
|
i = find_leftmost_pos(rmap, left)
|
|
if i >= len(rmap):
|
|
print >> sys.stderr, 'Value Error'
|
|
return alignments
|
|
|
|
if right > rmap[i][0]:
|
|
print >> sys.stderr, 'Not repeat'
|
|
return alignments
|
|
|
|
repeat = repeats[rmap[i][1]]
|
|
|
|
#print 'Allele Size:', len(repeat.allele)
|
|
#print repeat.allele
|
|
for allele in repeat.allele:
|
|
rpos = allele.repeat_pos
|
|
rlen = allele.repeat_length
|
|
|
|
if (left >= rpos) and (right <= rpos + rlen):
|
|
offset = left - rpos
|
|
for coord in allele.positions:
|
|
cchr, cpos, cstrand = coord
|
|
if cstrand == '+':
|
|
rep_left = cpos + offset
|
|
rep_cigar_str = cigar_str
|
|
else:
|
|
if cigar_str:
|
|
rep_read_len, rep_cigar_str = reverse_cigar(cigar_str)
|
|
else:
|
|
rep_read_len, rep_cigar_str = 0, ""
|
|
|
|
rc_offset = rlen - offset - rep_read_len
|
|
rep_left = cpos + rc_offset
|
|
|
|
alignments.append([cchr, rep_left + 1, rep_cigar_str])
|
|
#print genome_pos[0], genome_pos[1] + offset + 1, genome_pos[2], genome_pos[1]
|
|
|
|
return alignments
|
|
|
|
|
|
"""
|
|
"""
|
|
def extract_single(infilename,
|
|
outfilename,
|
|
chr_dic,
|
|
aligner,
|
|
version,
|
|
repeat_db,
|
|
repeat_map,
|
|
debug_dic,
|
|
hash_idx):
|
|
infile = open(infilename)
|
|
if hash_idx == -1:
|
|
outfile = open(outfilename, "w")
|
|
else:
|
|
outfile = open(outfilename + "." + str(hash_idx), "w")
|
|
|
|
prev_read_id = ""
|
|
num_reads, num_aligned_reads, num_ualigned_reads = 0, 0, 0
|
|
prev_NM, prev_NH, NH_real = 0, 0, 0
|
|
|
|
for line in infile:
|
|
if line[0] == '@':
|
|
continue
|
|
|
|
cols = line[:-1].split()
|
|
read_id, flag, chr, pos, mapQ, cigar_str = cols[:6]
|
|
read_seq = cols[9]
|
|
if len(read_id) >= 3 and read_id[-2] == "/":
|
|
read_id = read_id[:-2]
|
|
|
|
if read_id.find("seq.") == 0:
|
|
read_id = read_id[4:]
|
|
|
|
if aligner == "gsnap":
|
|
chr = chr.replace("_", ":")
|
|
|
|
if hash_idx != -1:
|
|
hashval = hash(read_id)
|
|
if hashval % mp_num != hash_idx:
|
|
continue
|
|
|
|
if read_id != prev_read_id:
|
|
num_reads += 1
|
|
|
|
flag, pos, mapQ = int(flag), int(pos), int(mapQ)
|
|
if flag & 0x4 != 0 or \
|
|
'H' in cigar_str:
|
|
prev_read_id = read_id
|
|
continue
|
|
|
|
NH, NM, XA = "", sys.maxint, []
|
|
for i in range(11, len(cols)):
|
|
col = cols[i]
|
|
# "nM" from STAR
|
|
if col.startswith("NM") or col.startswith("nM"):
|
|
NM = int(col[5:])
|
|
elif col.startswith("NH"):
|
|
NH = col
|
|
elif col.startswith("XA"):
|
|
XA = col[5:].split(';')[:-1]
|
|
if NH != "":
|
|
NH = int(NH[5:])
|
|
if aligner == "hisat2":
|
|
if prev_read_id == read_id:
|
|
assert prev_NH == NH
|
|
if NH == 1 or mapQ == 60:
|
|
assert NH == 1 and mapQ == 60
|
|
|
|
if read_id != prev_read_id:
|
|
num_aligned_reads += 1
|
|
if aligner == "hisat2" and \
|
|
NH == 1:
|
|
num_ualigned_reads += 1
|
|
else:
|
|
# In case of Bowtie2, only consier the best alignments
|
|
if aligner in ["bowtie2", "bwa", "vg"]:
|
|
if NM > prev_NM:
|
|
continue
|
|
|
|
def adjust_alignment(chr, pos, cigar_str):
|
|
NM_real = 0
|
|
read_pos, right_pos = 0, pos - 1
|
|
cigars = cigar_re.findall(cigar_str)
|
|
cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars]
|
|
for i in range(len(cigars)):
|
|
cigar_op, length = cigars[i]
|
|
if cigar_op == "S":
|
|
assert i == 0 or i == len(cigars) - 1
|
|
if i == 0:
|
|
assert cigars[i+1][0] == "M"
|
|
ref_seq = chr_dic[chr][right_pos-length:right_pos]
|
|
else:
|
|
assert cigars[i-1][0] == "M"
|
|
ref_seq = chr_dic[chr][right_pos:right_pos+length]
|
|
|
|
ref_seq = ref_seq.upper()
|
|
if length == len(ref_seq):
|
|
for j in range(length):
|
|
if ref_seq[j] != "N" and read_seq[read_pos+j] != ref_seq[j]:
|
|
NM_real += 1
|
|
else:
|
|
NM_real += length
|
|
if cigar_op in "MND":
|
|
right_pos += length
|
|
if cigar_op in "MIS":
|
|
read_pos += length
|
|
|
|
if cigars[0][0] == "S":
|
|
assert cigars[1][0] == "M"
|
|
pos -= cigars[0][1]
|
|
cigars[1][1] += cigars[0][1]
|
|
cigars = cigars[1:]
|
|
if cigars[-1][0] == "S":
|
|
assert cigars[-2][0] == "M"
|
|
cigars[-2][1] += cigars[-1][1]
|
|
cigars = cigars[:-1]
|
|
|
|
cigar_str = ""
|
|
for cigar in cigars:
|
|
cigar_op, length = cigar
|
|
cigar_str += ("%d%s" % (length, cigar_op))
|
|
|
|
return pos, cigar_str, NM_real
|
|
|
|
|
|
alignments = [[chr, pos, cigar_str]]
|
|
if aligner == "bwa" and len(XA) > 0:
|
|
for alignment in XA:
|
|
alt_chr, alt_pos, alt_cigar_str, alt_NM = alignment.split(',')
|
|
alt_pos, alt_NM = abs(int(alt_pos)), int(alt_NM)
|
|
if alt_NM > NM:
|
|
break
|
|
alignments.append([alt_chr, alt_pos, alt_cigar_str])
|
|
|
|
# Convert repeat alignments to genome alignments
|
|
if aligner == "hisat2" and chr.startswith("rep") and len(repeat_map) > 0:
|
|
alignments = repeat_to_genome_alignment(repeat_db, repeat_map, chr, pos, cigar_str)
|
|
|
|
for i, alignment in enumerate(alignments):
|
|
chr, pos, cigar_str = alignment
|
|
pos, cigar_str, NM_real = adjust_alignment(chr, pos, cigar_str)
|
|
p_str = "%s\t%s\t%d\t%s\tNM:i:%d" % (read_id, chr, pos, cigar_str, NM_real)
|
|
print >> outfile, p_str
|
|
|
|
if aligner == "hisat2":
|
|
if prev_read_id != read_id:
|
|
if prev_read_id != "":
|
|
assert prev_NH == NH_real
|
|
NH_real = 1
|
|
else:
|
|
NH_real += 1
|
|
prev_NH = NH
|
|
prev_NM = NM
|
|
prev_read_id = read_id
|
|
|
|
if aligner == "hisat2":
|
|
if prev_read_id != "":
|
|
assert prev_NH == NH_real
|
|
|
|
outfile.close()
|
|
infile.close()
|
|
|
|
# Sanity check for HISAT2's alignment summary
|
|
if aligner == "hisat2" and os.path.exists(infilename + ".summary") and (not mp_mode):
|
|
hisat2_reads, hisat2_0aligned_reads, hisat2_ualigned_reads, hisat2_maligned_reads = 0, 0, 0, 0
|
|
for line in open(infilename + ".summary"):
|
|
line = line.strip()
|
|
if line.startswith("HISAT2 summary") or \
|
|
line.startswith("Overall"):
|
|
continue
|
|
category, num = line.split(':')
|
|
num = num.strip()
|
|
num = int(num.split(' ')[0])
|
|
if category.startswith("Total reads"):
|
|
hisat2_reads = num
|
|
elif category.startswith("Aligned 0 time"):
|
|
hisat2_0aligned_reads = num
|
|
elif category.startswith("Aligned 1 time"):
|
|
hisat2_ualigned_reads = num
|
|
else:
|
|
assert category.startswith("Aligned >1 time")
|
|
assert hisat2_reads == hisat2_0aligned_reads + hisat2_ualigned_reads + num
|
|
|
|
hisat2_aligned_reads = hisat2_reads - hisat2_0aligned_reads
|
|
|
|
assert hisat2_reads == num_reads
|
|
assert hisat2_aligned_reads == num_aligned_reads
|
|
assert hisat2_ualigned_reads == num_ualigned_reads
|
|
|
|
|
|
"""
|
|
"""
|
|
def extract_pair(infilename,
|
|
outfilename,
|
|
chr_dic,
|
|
RNA,
|
|
aligner,
|
|
version,
|
|
repeat_db,
|
|
repeat_map,
|
|
debug_dic,
|
|
hash_idx):
|
|
read_dic = {}
|
|
pair_reported = set()
|
|
|
|
infile = open(infilename)
|
|
if hash_idx == -1:
|
|
outfile = open(outfilename, "w")
|
|
else:
|
|
outfile = open(outfilename + "." + str(hash_idx), "w")
|
|
|
|
num_pairs, num_conc_aligned_pairs, num_conc_ualigned_pairs, num_disc_aligned_pairs = 0, 0, 0, 0
|
|
num_aligned_reads, num_ualigned_reads = 0, 0
|
|
|
|
prev_read_id, pair_list = "", set()
|
|
prev_NM = sys.maxint
|
|
prev_NH1, prev_NH2 = 0, 0
|
|
NH1_real, NH2_real = 0, 0
|
|
|
|
for line in infile:
|
|
if line[0] == '@':
|
|
continue
|
|
|
|
cols = line[:-1].split()
|
|
read_id, flag, chr1, pos1, mapQ, cigar1_str, chr2, pos2 = cols[:8]
|
|
read_seq = cols[9]
|
|
if len(read_id) >= 3 and read_id[-2] == "/":
|
|
read_id = read_id[:-2]
|
|
|
|
if read_id.find("seq.") == 0:
|
|
read_id = read_id[4:]
|
|
|
|
if hash_idx != -1:
|
|
hashval = hash(read_id)
|
|
if hashval % mp_num != hash_idx:
|
|
continue
|
|
|
|
if aligner == "gsnap":
|
|
chr1 = chr1.replace("_", ":")
|
|
chr2 = chr2.replace("_", ":")
|
|
|
|
if read_id != prev_read_id:
|
|
num_pairs += 1
|
|
pair_list = set()
|
|
prev_NM = sys.maxint
|
|
|
|
flag = int(flag)
|
|
canonical_pos1, canonical_pos2 = int(pos1), int(pos2)
|
|
left_read = (flag & 0x40 != 0)
|
|
pos1 = canonical_pos1
|
|
mapQ = int(mapQ)
|
|
if flag & 0x4 != 0 or \
|
|
'H' in cigar1_str:
|
|
prev_read_id, is_prev_read_left = read_id, left_read
|
|
continue
|
|
|
|
concordant = (flag & 0x2 != 0)
|
|
NH, NM1, YT, XA = sys.maxint, sys.maxint, "", []
|
|
for i in range(11, len(cols)):
|
|
col = cols[i]
|
|
# "nM" from STAR
|
|
if col.startswith("NM") or col.startswith("nM"):
|
|
NM1 = int(col[5:])
|
|
elif col.startswith("NH"):
|
|
NH = int(col[5:])
|
|
elif col.startswith("YT"):
|
|
YT = col[5:]
|
|
elif col.startswith("XA"):
|
|
XA = col[5:].split(';')[:-1]
|
|
|
|
if aligner == "hisat2":
|
|
if prev_read_id == read_id:
|
|
if left_read:
|
|
assert prev_NH1 == 0 or prev_NH1 == NH
|
|
else:
|
|
assert prev_NH2 == 0 or prev_NH2 == NH
|
|
if NH == 1 or mapQ == 60:
|
|
assert NH == 1 and mapQ == 60
|
|
|
|
unpaired = (flag & 0x8 != 0) or (YT in ["UU", "UP"])
|
|
if unpaired:
|
|
if left_read not in pair_list:
|
|
pair_list.add(left_read)
|
|
num_aligned_reads += 1
|
|
if aligner == "hisat2" and NH == 1:
|
|
num_ualigned_reads += 1
|
|
assert mapQ == 60
|
|
else:
|
|
if read_id != prev_read_id:
|
|
if concordant:
|
|
num_conc_aligned_pairs += 1
|
|
if aligner == "hisat2" and NH == 1:
|
|
num_conc_ualigned_pairs += 1
|
|
else:
|
|
if aligner == "hisat2":
|
|
assert YT == "DP"
|
|
num_disc_aligned_pairs += 1
|
|
|
|
if chr2 == '*':
|
|
continue
|
|
|
|
def adjust_alignment(chr, pos, cigar_str):
|
|
NM_real = 0
|
|
read_pos, right_pos = 0, pos - 1
|
|
cigars = cigar_re.findall(cigar_str)
|
|
cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars]
|
|
for i in range(len(cigars)):
|
|
cigar_op, length = cigars[i]
|
|
if cigar_op == "S":
|
|
assert i == 0 or i == len(cigars) - 1
|
|
if i == 0:
|
|
assert cigars[i+1][0] == "M"
|
|
ref_seq = chr_dic[chr1][right_pos-length:right_pos]
|
|
else:
|
|
assert cigars[i-1][0] == "M"
|
|
ref_seq = chr_dic[chr1][right_pos:right_pos+length]
|
|
|
|
ref_seq = ref_seq.upper()
|
|
if length == len(ref_seq):
|
|
for j in range(length):
|
|
if ref_seq[j] != "N" and read_seq[read_pos+j] != ref_seq[j]:
|
|
NM_real += 1
|
|
else:
|
|
NM_real += length
|
|
if cigar_op in "MND":
|
|
right_pos += length
|
|
if cigar_op in "MIS":
|
|
read_pos += length
|
|
|
|
if cigars[0][0] == "S":
|
|
assert cigars[1][0] == "M"
|
|
pos -= cigars[0][1]
|
|
cigars[1][1] += cigars[0][1]
|
|
cigars = cigars[1:]
|
|
if cigars[-1][0] == "S":
|
|
assert cigars[-2][0] == "M"
|
|
cigars[-2][1] += cigars[-1][1]
|
|
cigars = cigars[:-1]
|
|
|
|
cigar_str = ""
|
|
for cigar in cigars:
|
|
cigar_op, length = cigar
|
|
cigar_str += ("%d%s" % (length, cigar_op))
|
|
|
|
return pos, cigar_str, NM_real
|
|
|
|
alignments = [[chr1, pos1, cigar1_str]]
|
|
if aligner == "bwa" and len(XA) > 0:
|
|
for alignment in XA:
|
|
alt_chr, alt_pos, alt_cigar_str, alt_NM = alignment.split(',')
|
|
alt_pos, alt_NM = abs(int(alt_pos)), int(alt_NM)
|
|
if alt_NM > NM1:
|
|
break
|
|
alignments.append([alt_chr, alt_pos, alt_cigar_str])
|
|
|
|
# Convert repeat alignments to genome alignments
|
|
if aligner == "hisat2" and (chr1.startswith("rep") or chr2.startswith("rep")) and len(repeat_map) > 0:
|
|
if chr1.startswith("rep"):
|
|
alignments = repeat_to_genome_alignment(repeat_db, repeat_map, chr1, pos1, cigar1_str)
|
|
if chr2.startswith("rep") or (chr1.startswith("rep") and chr2 == "="):
|
|
chr_tmp = chr1 if chr2 == "=" else chr2
|
|
alignments2 = repeat_to_genome_alignment(repeat_db, repeat_map, chr_tmp, int(pos2))
|
|
else:
|
|
alignments2 = [[chr2, int(pos2)]]
|
|
|
|
selected_alignments = []
|
|
for alignment in alignments:
|
|
_chr1, _pos1 = alignment[:2]
|
|
add = False
|
|
for alignment2 in alignments2:
|
|
_chr2, _pos2 = alignment2[:2]
|
|
if _chr1 == _chr2 and abs(_pos1 - _pos2) <= 1000:
|
|
add = True
|
|
break
|
|
if add:
|
|
selected_alignments.append(alignment)
|
|
|
|
alignments = selected_alignments
|
|
|
|
for i, alignment in enumerate(alignments):
|
|
chr1, pos1, cigar1_str = alignment
|
|
pos1, cigar_str, NM_real = adjust_alignment(chr1, pos1, cigar1_str)
|
|
chr2 = chr1
|
|
|
|
if aligner == "bwa":
|
|
me = "%s\t%s" % (read_id, chr1)
|
|
partner = "%s\t%s" % (read_id, chr2)
|
|
else:
|
|
me = "%s\t%s\t%d" % (read_id, chr1, canonical_pos1)
|
|
partner = "%s\t%s\t%d" % (read_id, chr2, canonical_pos2)
|
|
if partner in read_dic:
|
|
maps = read_dic[partner]
|
|
|
|
for map in maps:
|
|
if (map[0] == me) or len(maps) == 1:
|
|
cigar2_str, NM2, pos2 = map[1:4]
|
|
|
|
if aligner == "bwa":
|
|
if abs(pos1 - pos2) >= 1000:
|
|
continue
|
|
|
|
if aligner in ["bowtie2", "bwa"]:
|
|
if NM1 + NM2 > prev_NM:
|
|
continue
|
|
else:
|
|
prev_NM = NM1 + NM2
|
|
|
|
if chr1 != chr2:
|
|
continue
|
|
|
|
# DK - debugging purposes
|
|
if RNA:
|
|
if aligner in ["bowtie2", "bwa"] and abs(pos1 - pos2) > 1000:
|
|
continue
|
|
else:
|
|
if abs(pos1 - pos2) > 1000:
|
|
continue
|
|
|
|
if int(pos2) > int(pos1):
|
|
p_str = "%s\t%s\t%d\t%s\t%s\t%d\t%s\tNM:i:%d\tNM:i:%d" % \
|
|
(read_id, chr1, pos1, cigar_str, chr2, pos2, cigar2_str, NM1, NM2)
|
|
else:
|
|
p_str = "%s\t%s\t%d\t%s\t%s\t%d\t%s\tNM:i:%d\tNM:i:%d" % \
|
|
(read_id, chr2, pos2, cigar2_str, chr1, pos1, cigar_str, NM2, NM1)
|
|
|
|
if p_str not in pair_reported:
|
|
pair_reported.add(p_str)
|
|
print >> outfile, p_str
|
|
|
|
|
|
|
|
if not me in read_dic:
|
|
read_dic[me] = []
|
|
|
|
read_dic[me].append([partner, cigar_str, NM1, pos1])
|
|
|
|
if aligner == "hisat2":
|
|
if prev_read_id != read_id:
|
|
if prev_read_id != "":
|
|
assert prev_NH1 == NH1_real
|
|
assert prev_NH2 == NH2_real
|
|
prev_NH1, prev_NH2 = 0, 0
|
|
if left_read:
|
|
NH1_real, NH2_real = 1, 0
|
|
else:
|
|
NH1_real, NH2_real = 0, 1
|
|
else:
|
|
if left_read:
|
|
NH1_real += 1
|
|
else:
|
|
NH2_real += 1
|
|
if left_read:
|
|
prev_NH1 = NH
|
|
else:
|
|
prev_NH2 = NH
|
|
prev_read_id = read_id
|
|
|
|
if aligner == "hisat2":
|
|
if prev_read_id != "":
|
|
assert prev_NH1 == NH1_real
|
|
assert prev_NH2 == NH2_real
|
|
|
|
outfile.close()
|
|
infile.close()
|
|
|
|
# Sanity check for HISAT2's alignment summary
|
|
if aligner == "hisat2" and os.path.exists(infilename + ".summary") and (not mp_mode):
|
|
hisat2_pairs, hisat2_0aligned_pairs, hisat2_conc_ualigned_pairs, hisat2_conc_maligned_pairs, hisat2_disc_aligned_pairs = 0, 0, 0, 0, 0
|
|
hisat2_reads, hisat2_0aligned_reads, hisat2_ualigned_reads, hisat2_maligned_reads = 0, 0, 0, 0
|
|
|
|
for line in open(infilename + ".summary"):
|
|
line = line.strip()
|
|
if line.startswith("HISAT2 summary") or \
|
|
line.startswith("Overall"):
|
|
continue
|
|
category, num = line.split(':')
|
|
num = num.strip()
|
|
num = int(num.split(' ')[0])
|
|
if category.startswith("Total pairs"):
|
|
hisat2_pairs = num
|
|
elif category.startswith("Aligned concordantly or discordantly 0 time"):
|
|
hisat2_0aligned_pairs = num
|
|
elif category.startswith("Aligned concordantly 1 time"):
|
|
hisat2_conc_ualigned_pairs = num
|
|
elif category.startswith("Aligned concordantly >1 time"):
|
|
hisat2_conc_maligned_pairs = num
|
|
elif category.startswith("Aligned discordantly"):
|
|
hisat2_disc_aligned_pairs = num
|
|
assert hisat2_pairs == hisat2_0aligned_pairs + hisat2_conc_ualigned_pairs + hisat2_conc_maligned_pairs + hisat2_disc_aligned_pairs
|
|
elif category.startswith("Total unpaired reads"):
|
|
hisat2_reads = num
|
|
assert hisat2_reads == hisat2_0aligned_pairs * 2
|
|
elif category.startswith("Aligned 0 time"):
|
|
hisat2_0aligned_reads = num
|
|
elif category.startswith("Aligned 1 time"):
|
|
hisat2_ualigned_reads = num
|
|
else:
|
|
assert category.startswith("Aligned >1 times")
|
|
hisat2_maligned_reads = num
|
|
assert hisat2_reads == hisat2_0aligned_reads + hisat2_ualigned_reads + hisat2_maligned_reads
|
|
|
|
assert hisat2_pairs == num_pairs
|
|
assert hisat2_conc_ualigned_pairs == num_conc_ualigned_pairs
|
|
assert hisat2_conc_maligned_pairs == num_conc_aligned_pairs - num_conc_ualigned_pairs
|
|
assert hisat2_disc_aligned_pairs == num_disc_aligned_pairs
|
|
assert hisat2_ualigned_reads == num_ualigned_reads
|
|
assert hisat2_maligned_reads == num_aligned_reads - num_ualigned_reads
|
|
|
|
|
|
"""
|
|
"""
|
|
def is_junction_read(junctions_dic, chr, pos, cigar_str):
|
|
result_junctions = []
|
|
junctions = get_junctions(chr, pos, cigar_str)
|
|
for junction in junctions:
|
|
junction_str = to_junction_str(junction)
|
|
result_junctions.append([junction_str, junction_str in junctions_dic])
|
|
|
|
return result_junctions
|
|
|
|
|
|
"""
|
|
"""
|
|
def is_junction_pair(junctions_dic, chr, pos, cigar_str, mate_chr, mate_pos, mate_cigar_str):
|
|
junctions = is_junction_read(junctions_dic, chr, pos, cigar_str)
|
|
mate_junctions = is_junction_read(junctions_dic, mate_chr, mate_pos, mate_cigar_str)
|
|
junctions += mate_junctions
|
|
return junctions
|
|
|
|
|
|
"""
|
|
"""
|
|
def find_in_gtf_junctions(chr_dic, gtf_junctions, junction, relax_dist = 5):
|
|
def find_in_gtf_junctions(gtf_junctions, junction):
|
|
l, u = 0, len(gtf_junctions)
|
|
while l < u:
|
|
m = (l + u) / 2
|
|
assert m >= 0 and m < len(gtf_junctions)
|
|
cmp_result = junction_cmp(junction, gtf_junctions[m])
|
|
if cmp_result == 0:
|
|
return m
|
|
elif cmp_result < 0:
|
|
u = m
|
|
else:
|
|
l = m + 1
|
|
|
|
return l
|
|
|
|
chr, left, right = junction
|
|
gtf_index = find_in_gtf_junctions(gtf_junctions, [chr, left - relax_dist, right - relax_dist])
|
|
|
|
if gtf_index >= 0:
|
|
i = gtf_index
|
|
while i < len(gtf_junctions):
|
|
chr2, left2, right2 = gtf_junctions[i]
|
|
if chr2 > chr or \
|
|
left2 - left > relax_dist or \
|
|
right2 - right > relax_dist:
|
|
break
|
|
|
|
if abs(left - left2) <= relax_dist and left - left2 == right - right2:
|
|
test_small = ":" in chr
|
|
if is_canonical_junction(chr_dic, gtf_junctions[i]):
|
|
if left == left2:
|
|
return i
|
|
else:
|
|
return -1
|
|
else:
|
|
return i
|
|
i += 1
|
|
|
|
return -1
|
|
|
|
|
|
def singleInMaps(chr, pos, pos2, cigar, NM, maps):
|
|
for i in range(len(maps)):
|
|
if chr != maps[i][0]:
|
|
continue
|
|
if (pos == maps[i][1] or pos2 == maps[i][2]):
|
|
return True, i
|
|
return False, -1
|
|
|
|
|
|
|
|
"""
|
|
"""
|
|
def compare_single_sam(RNA,
|
|
reference_sam,
|
|
query_sam,
|
|
mapped_fname,
|
|
chr_dic,
|
|
gtf_junctions,
|
|
gtf_junctions_set,
|
|
ex_gtf_junctions,
|
|
aligner):
|
|
aligned, multi_aligned = 0, 0
|
|
db_dic, db_junction_dic = {}, {}
|
|
mapped_file = open(mapped_fname, "w")
|
|
first_mapped_file = open(mapped_fname + ".first", "w")
|
|
file = open(reference_sam, "r")
|
|
junction_read_dic = {}
|
|
for line in file:
|
|
if line.startswith('@'):
|
|
continue
|
|
|
|
read_name, chr, pos, cigar, NM = line[:-1].split()
|
|
pos, NM = int(pos), int(NM[5:])
|
|
|
|
if read_name.find("seq.") == 0:
|
|
read_name = read_name[4:]
|
|
|
|
if len(read_name) > 2 and read_name[-2] == '/':
|
|
read_name = read_name[:-2]
|
|
|
|
multi_aligned += 1
|
|
if read_name not in db_dic:
|
|
db_dic[read_name] = []
|
|
aligned += 1
|
|
|
|
pos2 = get_right(pos, cigar)
|
|
db_dic[read_name].append([chr, pos, pos2, cigar, NM])
|
|
|
|
read_junctions = is_junction_read(gtf_junctions_set, chr, pos, cigar)
|
|
if len(read_junctions) > 0:
|
|
if read_name not in db_junction_dic:
|
|
db_junction_dic[read_name] = []
|
|
|
|
for junction_str, is_gtf_junction in read_junctions:
|
|
db_junction_dic[read_name].append([junction_str, is_gtf_junction])
|
|
|
|
if junction_str not in junction_read_dic:
|
|
junction_read_dic[junction_str] = []
|
|
junction_read_dic[junction_str].append(line[:-1])
|
|
|
|
file.close()
|
|
|
|
temp_junctions, temp_gtf_junctions = set(), set()
|
|
for read_name, can_junctions in db_junction_dic.items():
|
|
if len(can_junctions) <= 0:
|
|
continue
|
|
|
|
# DK - for debugging purposes
|
|
# 1. select the best candidate among spliced alignments if multiple
|
|
|
|
def pickup_junction(can_junctions):
|
|
junctions = [can_junctions[0]]
|
|
for i in range(1, len(can_junctions)):
|
|
def get_intron_len(can_junction):
|
|
chr, left, right = to_junction(can_junction)
|
|
return right - left - 1
|
|
|
|
intron, intron_cmp = get_intron_len(junctions[0][0]), get_intron_len(can_junctions[i][0])
|
|
|
|
if intron > intron_cmp:
|
|
junctions = [can_junctions[i]]
|
|
elif intron == intron_cmp:
|
|
junctions.append(can_junctions[i])
|
|
|
|
return junctions
|
|
|
|
# can_junctions = pickup_junction(can_junctions)
|
|
|
|
for can_junction in can_junctions:
|
|
found_junction_str = None
|
|
junction_str, is_gtf_junction = can_junction
|
|
if is_gtf_junction:
|
|
found_junction_str = junction_str
|
|
|
|
if not found_junction_str:
|
|
junction = to_junction(junction_str)
|
|
gtf_index = find_in_gtf_junctions(chr_dic, gtf_junctions, junction)
|
|
|
|
if gtf_index >= 0:
|
|
is_gtf_junction = True
|
|
found_junction_str = to_junction_str(gtf_junctions[gtf_index])
|
|
|
|
if found_junction_str:
|
|
temp_gtf_junctions.add(found_junction_str)
|
|
temp_junctions.add(found_junction_str)
|
|
else:
|
|
if junction_str not in temp_junctions:
|
|
None
|
|
# assert junction_str in junction_read_dic
|
|
# DK - for debugging purposes
|
|
"""
|
|
if len(junction_read_dic[junction_str]) <= 2:
|
|
canonical = is_canonical_junction(chr_dic, to_junction(junction_str))
|
|
if not canonical:
|
|
print >> sys.stdout, read_name, junction_str, len(junction_read_dic[junction_str]), can_junctions
|
|
for line in junction_read_dic[junction_str]:
|
|
print >> sys.stdout, "\t", line
|
|
"""
|
|
temp_junctions.add(junction_str)
|
|
|
|
|
|
temp2_junctions = []
|
|
for junction in temp_junctions:
|
|
temp2_junctions.append(to_junction(junction))
|
|
temp_junctions = sorted(list(temp2_junctions), cmp=junction_cmp)
|
|
temp2_junctions = []
|
|
for can_junction in temp_junctions:
|
|
if len(temp2_junctions) <= 0:
|
|
temp2_junctions.append(can_junction)
|
|
else:
|
|
chr, left, right = temp2_junctions[-1]
|
|
chr2, left2, right2 = can_junction
|
|
if chr == chr2 and \
|
|
abs(left - left2) == abs(right - right2) and \
|
|
abs(left - left2) <= 10 and \
|
|
not to_junction_str(can_junction) in temp_gtf_junctions:
|
|
continue
|
|
temp2_junctions.append(can_junction)
|
|
|
|
temp_junctions = set()
|
|
for junction in temp2_junctions:
|
|
temp_junctions.add(to_junction_str(junction))
|
|
|
|
file = open(query_sam)
|
|
mapped, unmapped, unique_mapped, first_mapped, mapping_point = 0, 0, 0, 0, 0.0
|
|
snp_mapped, snp_unmapped, snp_unique_mapped, snp_first_mapped = 0, 0, 0, 0
|
|
for line in file:
|
|
if line.startswith('@'):
|
|
continue
|
|
fields = line[:-1].split()
|
|
read_name, chr, pos, cigar = fields[:4]
|
|
trans_id, NM, Zs = None, None, None
|
|
for field in fields[4:]:
|
|
if field.startswith("TI"):
|
|
trans_id = field[5:]
|
|
elif field.startswith("NM"):
|
|
NM = int(field[5:])
|
|
elif field.startswith("Zs"):
|
|
Zs = field[5:]
|
|
snp_included = (Zs != None)
|
|
|
|
pos = int(pos)
|
|
pos2 = get_right(pos, cigar)
|
|
if read_name not in db_dic:
|
|
unmapped += 1
|
|
if snp_included:
|
|
snp_unmapped += 1
|
|
continue
|
|
|
|
maps = db_dic[read_name]
|
|
found = False
|
|
found_at_first = False
|
|
|
|
if (aligner == "hisat-TLA" and RNA):
|
|
found, index = singleInMaps(chr, pos, pos2, cigar, NM, maps)
|
|
if found:
|
|
if index == 0:
|
|
found_at_first = True
|
|
|
|
else:
|
|
if [chr, pos, pos2, cigar, NM] in maps:
|
|
found = True
|
|
if maps.index([chr, pos, pos2, cigar, NM]) == 0:
|
|
found_at_first = True
|
|
|
|
# DK - debugging purposes
|
|
if False and len(maps) > 0 and maps[0][-1] < NM:
|
|
found = True
|
|
|
|
if not found:
|
|
for idx, map in enumerate(maps):
|
|
if chr == map[0] and \
|
|
pos == map[1] and \
|
|
pos2 == map[2] and \
|
|
get_cigar_chars(cigar) == get_cigar_chars(map[3]):
|
|
|
|
read_junctions = is_junction_read(gtf_junctions_set, map[0], map[1], map[3])
|
|
if True:
|
|
found_list = [False for i in range(len(read_junctions))]
|
|
for j in range(len(read_junctions)):
|
|
junction_str, is_gtf_junction = read_junctions[j]
|
|
junction = to_junction(junction_str)
|
|
gtf_index = find_in_gtf_junctions(chr_dic, gtf_junctions, junction)
|
|
if gtf_index >= 0:
|
|
found_list[j] = True
|
|
found = not (False in found_list)
|
|
else:
|
|
found = False
|
|
|
|
if found:
|
|
if idx == 0:
|
|
found_at_first = True
|
|
break
|
|
|
|
if found:
|
|
print >> mapped_file, read_name
|
|
mapped += 1
|
|
if snp_included:
|
|
snp_mapped += 1
|
|
if len(maps) == 1:
|
|
unique_mapped += 1
|
|
if snp_included:
|
|
snp_unique_mapped += 1
|
|
if found_at_first:
|
|
print >> first_mapped_file, read_name
|
|
first_mapped += 1
|
|
if snp_included:
|
|
snp_first_mapped += 1
|
|
mapping_point += (1.0 / len(maps))
|
|
else:
|
|
unmapped += 1
|
|
if snp_included:
|
|
snp_unmapped += 1
|
|
|
|
file.close()
|
|
mapped_file.close()
|
|
first_mapped_file.close()
|
|
|
|
# DK - for debugging purposes
|
|
false_can_junctions, false_noncan_junctions = 0, 0
|
|
for junction_str in temp_junctions:
|
|
if junction_str in temp_gtf_junctions:
|
|
continue
|
|
if junction_str in ex_gtf_junctions:
|
|
continue
|
|
if is_canonical_junction(chr_dic, to_junction(junction_str)):
|
|
false_can_junctions += 1
|
|
else:
|
|
false_noncan_junctions += 1
|
|
print >> sys.stderr, "\t\t\tfalse junctions: %d (canonical), %d (non-canonical)" % (false_can_junctions, false_noncan_junctions)
|
|
|
|
return mapped, unique_mapped, first_mapped, unmapped, aligned, multi_aligned, \
|
|
snp_mapped, snp_unique_mapped, snp_first_mapped, snp_unmapped, \
|
|
len(temp_junctions), len(temp_gtf_junctions), mapping_point
|
|
|
|
|
|
|
|
def pairedInMaps(chr, pos, pos_right, cigar, pos2, pos2_right, cigar2, NM, NM2, maps):
|
|
for i in range(len(maps)):
|
|
if chr != maps[i][0]:
|
|
continue;
|
|
if (pos == maps[i][1] or pos_right == maps[i][2]) and (pos2 == maps[i][4] or pos2_right == maps[i][5]):
|
|
return True, i
|
|
return False, -1
|
|
"""
|
|
"""
|
|
def compare_paired_sam(RNA,
|
|
reference_sam,
|
|
query_sam,
|
|
mapped_fname,
|
|
chr_dic,
|
|
gtf_junctions,
|
|
gtf_junctions_set,
|
|
ex_gtf_junctions,
|
|
aligner):
|
|
aligned, multi_aligned = 0, 0
|
|
db_dic, db_junction_dic, junction_pair_dic = {}, {}, {}
|
|
mapped_file = open(mapped_fname, "w")
|
|
uniq_mapped_file = open(mapped_fname + '.uniq', "w")
|
|
first_mapped_file = open(mapped_fname + '.first', "w")
|
|
file = open(reference_sam, "r")
|
|
for line in file:
|
|
if line[0] == '@':
|
|
continue
|
|
read_name, chr, pos, cigar, chr2, pos2, cigar2, NM, NM2 = line[:-1].split()
|
|
pos, pos2 = int(pos), int(pos2)
|
|
NM, NM2 = int(NM[5:]), int(NM2[5:])
|
|
|
|
if read_name.find("seq.") == 0:
|
|
read_name = read_name[4:]
|
|
|
|
if len(read_name) > 2 and read_name[-2] == '/':
|
|
read_name = read_name[:-2]
|
|
|
|
multi_aligned += 1
|
|
if read_name not in db_dic:
|
|
db_dic[read_name] = []
|
|
aligned += 1
|
|
|
|
pos_right, pos2_right = get_right(pos, cigar), get_right(pos2, cigar2)
|
|
db_dic[read_name].append([chr, pos, pos_right, cigar, pos2, pos2_right, cigar2, NM, NM2])
|
|
|
|
|
|
pair_junctions = is_junction_pair(gtf_junctions_set, chr, pos, cigar, chr2, pos2, cigar2)
|
|
if len(pair_junctions) > 0:
|
|
if read_name not in db_junction_dic:
|
|
db_junction_dic[read_name] = []
|
|
|
|
for junction_str, is_gtf_junction in pair_junctions:
|
|
db_junction_dic[read_name].append([junction_str, is_gtf_junction])
|
|
|
|
# DK - for debugging purposes
|
|
if junction_str not in junction_pair_dic:
|
|
junction_pair_dic[junction_str] = []
|
|
junction_pair_dic[junction_str].append(line[:-1])
|
|
|
|
file.close()
|
|
|
|
temp_junctions, temp_gtf_junctions = set(), set()
|
|
for read_name, can_junctions in db_junction_dic.items():
|
|
if len(can_junctions) <= 0:
|
|
continue
|
|
|
|
# DK - for debugging purposes
|
|
# 1. select the best candidate among spliced alignments if multiple
|
|
|
|
def pickup_junction(can_junctions):
|
|
junctions = [can_junctions[0]]
|
|
for i in range(1, len(can_junctions)):
|
|
def get_intron_len(can_junction):
|
|
chr, left, right = to_junction(can_junction)
|
|
return right - left - 1
|
|
|
|
intron, intron_cmp = get_intron_len(junctions[0][0]), get_intron_len(can_junctions[i][0])
|
|
|
|
if intron > intron_cmp:
|
|
junctions = [can_junctions[i]]
|
|
elif intron == intron_cmp:
|
|
junctions.append(can_junctions[i])
|
|
|
|
return junctions
|
|
|
|
# can_junctions = pickup_junction(can_junctions)
|
|
|
|
for can_junction in can_junctions:
|
|
found_junction_str = None
|
|
junction_str, is_gtf_junction = can_junction
|
|
|
|
# DK - for debugging purposes
|
|
assert junction_str in junction_pair_dic
|
|
if len(junction_pair_dic[junction_str]) <= 5:
|
|
continue
|
|
|
|
if is_gtf_junction:
|
|
found_junction_str = junction_str
|
|
|
|
if not found_junction_str:
|
|
junction = to_junction(junction_str)
|
|
gtf_index = find_in_gtf_junctions(chr_dic, gtf_junctions, junction)
|
|
|
|
if gtf_index >= 0:
|
|
is_gtf_junction = True
|
|
found_junction_str = to_junction_str(gtf_junctions[gtf_index])
|
|
|
|
if found_junction_str:
|
|
temp_gtf_junctions.add(found_junction_str)
|
|
temp_junctions.add(found_junction_str)
|
|
else:
|
|
if junction_str not in temp_junctions:
|
|
None
|
|
# assert junction_str in junction_read_dic
|
|
# print >> sys.stdout, read_name, junction_str, len(junction_read_dic[junction_str])
|
|
# for line in junction_read_dic[junction_str]:
|
|
# print >> sys.stdout, "\t", line
|
|
|
|
temp_junctions.add(junction_str)
|
|
|
|
# DK - for debugging purposes
|
|
filter_junction_db = {}
|
|
|
|
temp2_junctions = []
|
|
for junction in temp_junctions:
|
|
temp2_junctions.append(to_junction(junction))
|
|
temp_junctions = sorted(list(temp2_junctions), cmp=junction_cmp)
|
|
temp2_junctions = []
|
|
for can_junction in temp_junctions:
|
|
if len(temp2_junctions) <= 0:
|
|
temp2_junctions.append(can_junction)
|
|
|
|
# DK - for debugging purposes
|
|
# assert to_junction_str(can_junction) in junction_pair_dic
|
|
# filter_junction_db[to_junction_str(can_junction)] = len(junction_pair_dic[to_junction_str(can_junction)])
|
|
else:
|
|
chr, left, right = temp2_junctions[-1]
|
|
chr2, left2, right2 = can_junction
|
|
if chr == chr2 and \
|
|
abs(left - left2) == abs(right - right2) and \
|
|
abs(left - left2) <= 10 and \
|
|
not to_junction_str(can_junction) in temp_gtf_junctions:
|
|
|
|
# DK - for debugging purposes
|
|
# assert to_junction_str(temp2_junctions[-1]) in junction_pair_dic
|
|
# assert to_junction_str(temp2_junctions[-1]) in filter_junction_dic
|
|
# filter_junction_db[to_junction_str(temp2_junctions[-1])] += len(junction_pair_dic[to_junction_str(temp2_junctions[-1])])
|
|
|
|
continue
|
|
|
|
temp2_junctions.append(can_junction)
|
|
|
|
# DK - for debugging purposes
|
|
# assert to_junction_str(can_junction) in junction_pair_dic
|
|
# filter_junction_db[to_junction_str(can_junction)] = len(junction_pair_dic[to_junction_str(can_junction)])
|
|
|
|
temp_junctions = set()
|
|
for junction in temp2_junctions:
|
|
# DK - for debugging purposes
|
|
# assert to_junction_str(junction) in filter_junction_dic
|
|
# if filter_junction_dic[to_junction_str(junction)] <= 5:
|
|
# continue
|
|
|
|
temp_junctions.add(to_junction_str(junction))
|
|
|
|
file = open(query_sam)
|
|
mapped, unique_mapped, first_mapped, unmapped, mapping_point = 0, 0, 0, 0, 0.0
|
|
snp_mapped, snp_unique_mapped, snp_first_mapped, snp_unmapped = 0, 0, 0, 0
|
|
for line in file:
|
|
if line.startswith('@'):
|
|
continue
|
|
|
|
fields = line[:-1].split()
|
|
read_name, chr, pos, cigar, chr2, pos2, cigar2 = fields[:7]
|
|
trains_id, NM, NM2, Zs, Zs2 = None, None, None, None, None
|
|
for field in fields[7:]:
|
|
if field.startswith("TI"):
|
|
trans_id = field[5:]
|
|
elif field.startswith("NM"):
|
|
if NM == None:
|
|
NM = int(field[5:])
|
|
else:
|
|
NM2 = int(field[5:])
|
|
elif field.startswith("Zs"):
|
|
if Zs == None:
|
|
Zs = field[5:]
|
|
else:
|
|
Zs2 = field[5:]
|
|
snp_included = (Zs != None or Zs2 != None)
|
|
|
|
pos, pos2 = int(pos), int(pos2)
|
|
pos_right, pos2_right = get_right(pos, cigar), get_right(pos2, cigar2)
|
|
|
|
if read_name not in db_dic:
|
|
unmapped += 1
|
|
if snp_included:
|
|
snp_unmapped += 1
|
|
continue
|
|
|
|
maps = db_dic[read_name]
|
|
|
|
found = False
|
|
found_at_first = False
|
|
|
|
|
|
if (aligner == "hisat-TLA" and RNA) :
|
|
found, index = pairedInMaps(chr, pos, pos_right, cigar, pos2, pos2_right, cigar2, NM, NM2, maps)
|
|
if (found):
|
|
if (index == 0):
|
|
found_at_first = True
|
|
else:
|
|
if [chr, pos, pos_right, cigar, pos2, pos2_right, cigar2, NM, NM2] in maps:
|
|
found = True
|
|
if maps.index([chr, pos, pos_right, cigar, pos2, pos2_right, cigar2, NM, NM2]) == 0:
|
|
found_at_first = True
|
|
|
|
# DK - debugging purposes
|
|
if False and len(maps) > 0 and maps[0][-1] + maps[0][-2] < NM + NM2:
|
|
found = True
|
|
|
|
if not found:
|
|
for idx, map in enumerate(maps):
|
|
if chr == map[0] and \
|
|
pos == map[1] and \
|
|
pos_right == map[2] and \
|
|
get_cigar_chars(cigar) == get_cigar_chars(map[3]) and \
|
|
pos2 == map[4] and \
|
|
pos2_right == map[5] and \
|
|
get_cigar_chars(cigar2) == get_cigar_chars(map[6]):
|
|
|
|
pair_junctions = is_junction_pair(gtf_junctions_set, map[0], map[1], map[3], map[0], map[4], map[6])
|
|
if True:
|
|
found_list = [False for i in range(len(pair_junctions))]
|
|
for j in range(len(pair_junctions)):
|
|
junction_str, is_gtf_junction = pair_junctions[j]
|
|
junction = to_junction(junction_str)
|
|
gtf_index = find_in_gtf_junctions(chr_dic, gtf_junctions, junction)
|
|
if gtf_index >= 0:
|
|
found_list[j] = True
|
|
found = not (False in found_list)
|
|
else:
|
|
found = False
|
|
|
|
if found:
|
|
if idx == 0:
|
|
found_at_first = True
|
|
break
|
|
|
|
if found:
|
|
print >> mapped_file, read_name
|
|
mapped += 1
|
|
if snp_included:
|
|
snp_mapped += 1
|
|
if len(maps) == 1:
|
|
unique_mapped += 1
|
|
print >> uniq_mapped_file, read_name
|
|
if snp_included:
|
|
snp_unique_mapped += 1
|
|
if found_at_first:
|
|
print >> first_mapped_file, read_name
|
|
first_mapped += 1
|
|
if snp_included:
|
|
snp_first_mapped += 1
|
|
|
|
mapping_point += (1.0 / len(maps))
|
|
else:
|
|
unmapped += 1
|
|
if snp_included:
|
|
snp_unmapped += 1
|
|
|
|
file.close()
|
|
mapped_file.close()
|
|
uniq_mapped_file.close()
|
|
first_mapped_file.close()
|
|
|
|
# DK - for debugging purposes
|
|
false_can_junctions, false_noncan_junctions = 0, 0
|
|
for junction_str in temp_junctions:
|
|
if junction_str in temp_gtf_junctions:
|
|
continue
|
|
if is_canonical_junction(chr_dic, to_junction(junction_str)):
|
|
false_can_junctions += 1
|
|
else:
|
|
false_noncan_junctions += 1
|
|
print >> sys.stderr, "\t\t\tfalse junctions: %d (canonical), %d (non-canonical)" % (false_can_junctions, false_noncan_junctions)
|
|
|
|
|
|
return mapped, unique_mapped, first_mapped, unmapped, aligned, multi_aligned, \
|
|
snp_mapped, snp_unique_mapped, snp_first_mapped, snp_unmapped, \
|
|
len(temp_junctions), len(temp_gtf_junctions), mapping_point
|
|
|
|
|
|
"""
|
|
"""
|
|
def extract_mapped_unmapped(read_fname, mapped_id_fname, mapped_fname, unmapped_fname, read2_fname = "", mapped2_fname = "", unmapped2_fname = ""):
|
|
mapped_ids = set()
|
|
mapped_id_file = open(mapped_id_fname)
|
|
for line in mapped_id_file:
|
|
read_id = int(line[:-1])
|
|
mapped_ids.add(read_id)
|
|
|
|
mapped_id_file.close()
|
|
|
|
def write_reads(read_fname, mapped_fname, unmapped_fname):
|
|
mapped_file = open(mapped_fname, "w")
|
|
unmapped_file = open(unmapped_fname, "w")
|
|
read_file = open(read_fname)
|
|
write = False
|
|
for line in read_file:
|
|
if line[0] == "@":
|
|
read_id = int(line[1:-1])
|
|
write = read_id in mapped_ids
|
|
|
|
if write:
|
|
print >> mapped_file, line[:-1]
|
|
else:
|
|
print >> unmapped_file, line[:-1]
|
|
|
|
read_file.close()
|
|
mapped_file.close()
|
|
unmapped_file.close()
|
|
|
|
write_reads(read_fname, mapped_fname, unmapped_fname)
|
|
if read2_fname != "":
|
|
assert mapped2_fname != ""
|
|
assert unmapped2_fname != ""
|
|
write_reads(read2_fname, mapped2_fname, unmapped2_fname)
|
|
|
|
|
|
"""
|
|
"""
|
|
def sql_execute(sql_db, sql_query):
|
|
sql_cmd = [
|
|
"sqlite3", sql_db,
|
|
"-separator", "\t",
|
|
"%s;" % sql_query
|
|
]
|
|
# print >> sys.stderr, sql_cmd
|
|
sql_process = subprocess.Popen(sql_cmd, stdout=subprocess.PIPE)
|
|
output = sql_process.communicate()[0][:-1]
|
|
return output
|
|
|
|
|
|
"""
|
|
"""
|
|
def create_sql_db(sql_db):
|
|
if os.path.exists(sql_db):
|
|
print >> sys.stderr, sql_db, "already exists!"
|
|
return
|
|
|
|
columns = [
|
|
["id", "integer primary key autoincrement"],
|
|
["genome", "text"],
|
|
["head", "text"],
|
|
["end_type", "text"],
|
|
["type", "text"],
|
|
["aligner", "text"],
|
|
["version", "text"],
|
|
["num_reads", "integer"],
|
|
["mapped_reads", "integer"],
|
|
["unique_mapped_reads", "integer"],
|
|
["unmapped_reads", "integer"],
|
|
["mapping_point", "real"],
|
|
["snp_mapped_reads", "integer"],
|
|
["snp_unique_mapped_reads", "integer"],
|
|
["snp_unmapped_reads", "integer"],
|
|
["time", "real"],
|
|
["mem", "integer"],
|
|
["true_gtf_junctions", "integer"],
|
|
["temp_junctions", "integer"],
|
|
["temp_gtf_junctions", "integer"],
|
|
["host", "text"],
|
|
["created", "text"],
|
|
["cmd", "text"]
|
|
]
|
|
|
|
sql_create_table = "CREATE TABLE ReadCosts ("
|
|
for i in range(len(columns)):
|
|
name, type = columns[i]
|
|
if i != 0:
|
|
sql_create_table += ", "
|
|
sql_create_table += ("%s %s" % (name, type))
|
|
sql_create_table += ");"
|
|
sql_execute(sql_db, sql_create_table)
|
|
|
|
|
|
"""
|
|
"""
|
|
def write_analysis_data(sql_db, genome_name, database_name):
|
|
if not os.path.exists(sql_db):
|
|
return
|
|
|
|
aligners = []
|
|
sql_aligners = "SELECT aligner FROM ReadCosts GROUP BY aligner"
|
|
output = sql_execute(sql_db, sql_aligners)
|
|
aligners = output.split()
|
|
|
|
can_read_types = ["all", "M", "2M_gt_15", "2M_8_15", "2M_1_7", "gt_2M"]
|
|
tmp_read_types = []
|
|
sql_types = "SELECT type FROM ReadCosts GROUP BY type"
|
|
output = sql_execute(sql_db, sql_types)
|
|
tmp_read_types = output.split()
|
|
|
|
read_types = []
|
|
for read_type in can_read_types:
|
|
if read_type in tmp_read_types:
|
|
read_types.append(read_type)
|
|
|
|
for paired in [False, True]:
|
|
database_fname = genome_name + "_" + database_name
|
|
if paired:
|
|
end_type = "paired"
|
|
database_fname += "_paired"
|
|
else:
|
|
end_type = "single"
|
|
database_fname += "_single"
|
|
database_fname += ".analysis"
|
|
database_file = open(database_fname, "w")
|
|
print >> database_file, "end_type\ttype\taligner\tnum_reads\ttime\tmem\tmapped_reads\tunique_mapped_reads\tunmapped_reads\tmapping_point\ttrue_gtf_junctions\ttemp_junctions\ttemp_gtf_junctions"
|
|
for aligner in aligners:
|
|
for read_type in read_types:
|
|
sql_row = "SELECT end_type, type, aligner, num_reads, time, mem, mapped_reads, unique_mapped_reads, unmapped_reads, mapping_point, snp_mapped_reads, snp_unique_mapped_reads, snp_unmapped_reads, true_gtf_junctions, temp_junctions, temp_gtf_junctions FROM ReadCosts"
|
|
sql_row += " WHERE genome = '%s' and head = '%s' and aligner = '%s' and type = '%s' and end_type = '%s' ORDER BY created DESC LIMIT 1" % (genome_name, database_name, aligner, read_type, end_type)
|
|
output = sql_execute(sql_db, sql_row)
|
|
if output:
|
|
print >> database_file, output
|
|
|
|
database_file.close()
|
|
|
|
|
|
"""
|
|
"""
|
|
def calculate_read_cost(single_end,
|
|
paired_end,
|
|
test_aligners,
|
|
fresh,
|
|
runtime_only,
|
|
baseChange,
|
|
verbose):
|
|
sql_db_name = "analysis.db"
|
|
if not os.path.exists(sql_db_name):
|
|
create_sql_db(sql_db_name)
|
|
|
|
num_cpus = multiprocessing.cpu_count()
|
|
if num_cpus > 10:
|
|
num_threads = min(10, num_cpus)
|
|
desktop = False
|
|
else:
|
|
#num_threads = min(4, num_cpus)
|
|
num_threads = num_cpus - 1
|
|
desktop = True
|
|
|
|
data_base = "sim"
|
|
test_large_index = False
|
|
verbose = False
|
|
sql_write = True
|
|
readtypes = ["all", "M", "2M_gt_15", "2M_8_15", "2M_1_7", "gt_2M"]
|
|
|
|
aligners = [
|
|
# [aligner, two_step, index_type, aligner_version, addition_options]
|
|
# ["hisat", "", "", "", ""],
|
|
# ["hisat2", "", "", "204", ""],
|
|
# ["hisat2", "", "", "210", ""],
|
|
# ["hisat2", "", "snp", "203b", ""],
|
|
# ["hisat2", "", "snp", "210", ""],
|
|
# ["hisat2", "", "tran", "210", ""],
|
|
# ["hisat2", "", "snp_tran", "210", ""],
|
|
# ["hisat2", "", "", "210", ""],
|
|
["hisat2", "", "", "", ""],
|
|
#["hisat2", "", "rep", "", ""],
|
|
# ["hisat2", "", "rep-100-300", "", ""],
|
|
# ["hisat2", "", "rep_mm", "", ""],
|
|
# ["hisat2", "", "", "", "--sensitive"],
|
|
# ["hisat2", "", "rep", "", "--sensitive"],
|
|
# ["hisat2", "", "", "", "--very-sensitive"],
|
|
# ["hisat2", "", "snp", "", ""],
|
|
# ["hisat2", "", "snp", "", "--sensitive"],
|
|
# ["hisat2", "", "snp_noht", "", ""],
|
|
# ["hisat2", "x2", "", "", ""],
|
|
# ["hisat2", "x1", "tran", "", ""],
|
|
# ["hisat2", "", "tran", "", ""],
|
|
# ["hisat2", "", "snp_tran", "", ""],
|
|
# ["hisat2", "x1", "snp_tran", "", ""],
|
|
# ["hisat2", "x1", "snp_tran_ercc", "", ""],
|
|
# ["tophat2", "gtfonly", "", "", ""],
|
|
# ["tophat2", "gtf", "", "", ""],
|
|
#["star", "", "", "", ""],
|
|
# ["star", "x2", "", "", ""],
|
|
# ["star", "gtf", "", "", ""],
|
|
# ["bowtie", "", "", "", ""],
|
|
#["bowtie2", "", "", "", ""],
|
|
# ["bowtie2", "", "", "", "-k 10"],
|
|
# ["bowtie2", "", "", "", "-k 1000 --extends 2000"],
|
|
# ["gsnap", "", "", "", ""],
|
|
# ["bwa", "mem", "", "", ""],
|
|
#["bwa", "mem", "", "", "-a"],
|
|
# ["hisat2", "", "snp", "", ""],
|
|
# ["hisat2", "", "tran", "", ""],
|
|
# ["hisat2", "", "snp_tran", "", ""],
|
|
# ["vg", "", "", "", ""],
|
|
# ["vg", "", "", "", "-M 10"],
|
|
# ["vg", "", "snp", "", ""],
|
|
# ["vg", "", "snp", "", "-M 10"],
|
|
# ["minimap2", "", "", "", ""],
|
|
["hisat-TLA", "", "", "", ""],
|
|
]
|
|
readtypes = ["all"]
|
|
verbose = True
|
|
debug = False
|
|
# sql_write = False
|
|
|
|
cwd = os.getcwd()
|
|
if len(cwd.split("reads_")) > 1:
|
|
genome = cwd.split("reads_")[1].split("_")[0]
|
|
else:
|
|
genome = "genome"
|
|
RNA = (cwd.find("RNA") != -1)
|
|
|
|
test_small = (genome != "genome")
|
|
|
|
if runtime_only:
|
|
verbose = True
|
|
|
|
chr_dic = read_genome("../../data/%s.fa" % genome)
|
|
gtf_junctions = extract_splice_sites("../../data/%s.gtf" % genome)
|
|
repeat_db, repeat_map = read_repeatdb("../../data/%s_rep.rep.info" % genome)
|
|
align_stat = []
|
|
for paired in [False, True]:
|
|
if not paired and not single_end:
|
|
continue
|
|
if paired and not paired_end:
|
|
continue
|
|
for readtype in readtypes:
|
|
if paired:
|
|
base_fname = data_base + "_paired"
|
|
type_sam_fname = base_fname + "_" + readtype + ".sam"
|
|
type_read1_fname = base_fname + "_1_" + readtype + ".fa"
|
|
type_read2_fname = base_fname + "_2_" + readtype + ".fa"
|
|
type_junction_fname = base_fname + "_" + readtype + ".junc"
|
|
else:
|
|
base_fname = data_base + "_single"
|
|
type_sam_fname = base_fname + "_" + readtype + ".sam"
|
|
type_read1_fname = base_fname + "_" + readtype + ".fa"
|
|
type_read2_fname = ""
|
|
type_junction_fname = base_fname + "_" + readtype + ".junc"
|
|
|
|
assert os.path.exists(type_sam_fname) and os.path.exists(type_junction_fname)
|
|
numreads = 0
|
|
type_sam_file = open(type_sam_fname)
|
|
for line in type_sam_file:
|
|
numreads += 1
|
|
type_sam_file.close()
|
|
if numreads <= 0:
|
|
continue
|
|
print >> sys.stderr, "%s\t%d" % (readtype, numreads)
|
|
|
|
junctions, junctions_set = [], set()
|
|
type_junction_file = open(type_junction_fname)
|
|
for line in type_junction_file:
|
|
chr, left, right = line[:-1].split()
|
|
left, right = int(left), int(right)
|
|
junctions.append([chr, left, right])
|
|
junctions_set.add(to_junction_str([chr, left, right]))
|
|
|
|
type_junction_file.close()
|
|
|
|
aligner_bin_base = "../../../aligners/bin"
|
|
def get_aligner_version(aligner, version):
|
|
version = ""
|
|
if aligner == "hisat2" or \
|
|
aligner == "hisat" or \
|
|
aligner == "bowtie" or \
|
|
aligner == "bowtie2":
|
|
if version:
|
|
cmd = ["%s/%s_%s/%s" % (aligner_bin_base, aligner, version, aligner)]
|
|
else:
|
|
cmd = ["%s/%s" % (aligner_bin_base, aligner)]
|
|
cmd += ["--version"]
|
|
cmd_process = subprocess.Popen(cmd, stdout=subprocess.PIPE)
|
|
version = cmd_process.communicate()[0][:-1].split("\n")[0]
|
|
version = version.split()[-1]
|
|
elif aligner == "tophat2":
|
|
cmd = ["%s/tophat" % (aligner_bin_base)]
|
|
cmd += ["--version"]
|
|
cmd_process = subprocess.Popen(cmd, stdout=subprocess.PIPE)
|
|
version = cmd_process.communicate()[0][:-1].split()[-1]
|
|
elif aligner in ["star", "starx2"]:
|
|
version = "2.4.2a"
|
|
elif aligner == "gsnap":
|
|
cmd = ["%s/gsnap" % (aligner_bin_base)]
|
|
cmd_process = subprocess.Popen(cmd, stderr=subprocess.PIPE)
|
|
version = cmd_process.communicate()[1][:-1].split("\n")[0]
|
|
version = version.split()[2]
|
|
elif aligner == "bwa":
|
|
if version:
|
|
cmd = ["%s/bwa_%s/bwa" % (aligner_bin_base, version)]
|
|
else:
|
|
cmd = ["%s/bwa" % (aligner_bin_base)]
|
|
cmd_process = subprocess.Popen(cmd, stderr=subprocess.PIPE)
|
|
version = cmd_process.communicate()[1][:-1].split("\n")[2]
|
|
version = version.split()[1]
|
|
elif aligner == "vg":
|
|
cmd = ["%s/vg" % (aligner_bin_base)]
|
|
cmd_process = subprocess.Popen(cmd, stderr=subprocess.PIPE)
|
|
version = cmd_process.communicate()[1][:-1].split("\n")[0]
|
|
version = version.split()[5]
|
|
elif aligner == "minimap2":
|
|
cmd = ["%s/minimap2" % (aligner_bin_base)]
|
|
cmd += ["--version"]
|
|
cmd_process = subprocess.Popen(cmd, stdout=subprocess.PIPE)
|
|
version = cmd_process.communicate()[0][:-1].split("\n")[0]
|
|
|
|
return version
|
|
|
|
index_base = "../../../indexes"
|
|
index_add = ""
|
|
if genome != "genome":
|
|
index_add = "_" + genome
|
|
def get_aligner_cmd(RNA, aligner, type, index_type, version, options, read1_fname, read2_fname, out_fname, cmd_idx = 0):
|
|
cmd = ["/usr/bin/time"]
|
|
if osx_mode:
|
|
cmd += ['-l']
|
|
if aligner == "hisat2":
|
|
if version:
|
|
cmd += ["%s/hisat2_%s/hisat2" % (aligner_bin_base, version)]
|
|
else:
|
|
cmd += ["%s/hisat2" % (aligner_bin_base)]
|
|
if num_threads > 1:
|
|
cmd += ["-p", str(num_threads)]
|
|
cmd += ["-f"]
|
|
# cmd += ["-k", "100"]
|
|
# cmd += ["--max-seeds", "100"]
|
|
# cmd += ["--score-min", "C,-100"]
|
|
# cmd += ["--pen-cansplice", "0"]
|
|
# cmd += ["--pen-noncansplice", "12"]
|
|
# cmd += ["--pen-intronlen", "G,-8,1"]
|
|
# cmd += ["--metrics", "1",
|
|
# "--metrics-file", "metrics.out"]
|
|
|
|
if not RNA:
|
|
cmd += ["--no-spliced-alignment"]
|
|
|
|
if type in ["x1", "x2"]:
|
|
cmd += ["--no-temp-splicesite"]
|
|
|
|
# cmd += ["--no-anchorstop"]
|
|
if version == "" or \
|
|
(version != "" and int(version) >= 210):
|
|
cmd += ["--new-summary",
|
|
"--summary-file", out_fname + ".summary"]
|
|
|
|
if version == "" or int(version) >= 220:
|
|
cmd += ["--repeat"]
|
|
|
|
# cmd += ["--dta"]
|
|
# cmd += ["--dta-cufflinks"]
|
|
|
|
if options != "":
|
|
cmd += options.split(' ')
|
|
|
|
if type == "x2":
|
|
if cmd_idx == 0:
|
|
cmd += ["--novel-splicesite-outfile"]
|
|
else:
|
|
cmd += ["--novel-splicesite-infile"]
|
|
cmd += ["splicesites.txt"]
|
|
|
|
# "--novel-splicesite-infile",
|
|
# "../splicesites.txt",
|
|
# "--rna-strandness",
|
|
# "FR",
|
|
if version:
|
|
index_cmd = "%s/HISAT2_%s%s/" % (index_base, version, index_add) + genome
|
|
else:
|
|
index_cmd = "%s/HISAT2%s/" % (index_base, index_add) + genome
|
|
if index_type:
|
|
index_cmd += ("_" + index_type)
|
|
cmd += [index_cmd]
|
|
if paired:
|
|
cmd += ["-1", read1_fname,
|
|
"-2", read2_fname]
|
|
else:
|
|
cmd += [read1_fname]
|
|
|
|
elif aligner == "hisat-TLA":
|
|
|
|
if version:
|
|
cmd += ["%s/hisat2_%s/hisat2" % (aligner_bin_base, version)]
|
|
else:
|
|
cmd += ["%s/hisat2" % (aligner_bin_base)]
|
|
if num_threads > 1:
|
|
cmd += ["-p", str(num_threads)]
|
|
cmd += ["-f"]
|
|
if not RNA:
|
|
cmd += ["--no-spliced-alignment"]
|
|
if type in ["x1", "x2"]:
|
|
cmd += ["--no-temp-splicesite"]
|
|
if version == "" or \
|
|
(version != "" and int(version) >= 210):
|
|
cmd += ["--new-summary",
|
|
"--summary-file", out_fname + ".summary"]
|
|
if version == "" or int(version) >= 220:
|
|
cmd += ["--repeat"]
|
|
if paired:
|
|
cmd += ["-1", read1_fname,
|
|
"-2", read2_fname]
|
|
else:
|
|
cmd += ["-U", read1_fname]
|
|
|
|
cmd += ["--TLA"]
|
|
cmd += ["--base-change"]
|
|
cmd += [baseChange]
|
|
cmd += ["--expand-repeat"]
|
|
|
|
index1_cmd = "%s/HISAT2%s/" % (index_base, index_add) + genome + '_' + baseChange
|
|
if index_type:
|
|
index1_cmd += ("_" + index_type)
|
|
cmd += ["--index1"]
|
|
cmd += [index1_cmd]
|
|
|
|
index2_cmd = "%s/HISAT2%s/" % (index_base, index_add) + genome + '_' + reverse_complement(baseChange[0]) + reverse_complement(baseChange[1])
|
|
if index_type:
|
|
index2_cmd += ("_" + index_type)
|
|
cmd += ["--index2"]
|
|
cmd += [index2_cmd]
|
|
|
|
cmd += ["--reference"]
|
|
cmd += ["../../../data/%s.fa" % genome]
|
|
|
|
|
|
|
|
elif aligner == "hisat":
|
|
cmd += ["%s/hisat" % (aligner_bin_base)]
|
|
if num_threads > 1:
|
|
cmd += ["-p", str(num_threads)]
|
|
cmd += ["-f"]
|
|
# cmd += ["-k", "5"]
|
|
# cmd += ["--score-min", "C,-18"]
|
|
if version != "":
|
|
version = int(version)
|
|
else:
|
|
version = sys.maxint
|
|
|
|
if not RNA:
|
|
cmd += ["--no-spliced-alignment"]
|
|
|
|
if type in ["x1", "x2"]:
|
|
cmd += ["--no-temp-splicesite"]
|
|
|
|
"""
|
|
cmd += ["--rdg", "100,100",
|
|
"--rfg", "100,100"]
|
|
"""
|
|
|
|
if type == "x2":
|
|
if cmd_idx == 0:
|
|
cmd += ["--novel-splicesite-outfile"]
|
|
else:
|
|
cmd += ["--novel-splicesite-infile"]
|
|
cmd += ["splicesites.txt"]
|
|
|
|
# "--novel-splicesite-infile",
|
|
# "../splicesites.txt",
|
|
# "--rna-strandness",
|
|
# "FR",
|
|
cmd += ["%s/HISAT%s/" % (index_base, index_add) + genome]
|
|
if paired:
|
|
cmd += ["-1", read1_fname,
|
|
"-2", read2_fname]
|
|
else:
|
|
cmd += [read1_fname]
|
|
elif aligner == "tophat2":
|
|
cmd += ["%s/tophat" % (aligner_bin_base)]
|
|
if num_threads > 1:
|
|
cmd += ["-p", str(num_threads)]
|
|
if type.find("gtf") != -1:
|
|
cmd += ["--transcriptome-index=%s/HISAT%s/gtf/%s" % (index_base, index_add, genome)]
|
|
if type == "gtfonly":
|
|
cmd += ["--transcriptome-only"]
|
|
cmd += ["--read-edit-dist", "3"]
|
|
cmd += ["--no-sort-bam"]
|
|
cmd += ["--read-realign-edit-dist", "0"]
|
|
cmd += ["--keep-tmp",
|
|
"%s/HISAT%s/" % (index_base, index_add) + genome,
|
|
read1_fname]
|
|
if paired:
|
|
cmd += [read2_fname]
|
|
elif aligner == "star":
|
|
cmd += ["%s/STAR" % (aligner_bin_base)]
|
|
if num_threads > 1:
|
|
cmd += ["--runThreadN", str(num_threads)]
|
|
cmd += ["--genomeDir"]
|
|
if cmd_idx == 0:
|
|
if type == "gtf":
|
|
cmd += ["%s/STAR%s/gtf" % (index_base, index_add)]
|
|
else:
|
|
cmd += ["%s/STAR%s" % (index_base, index_add)]
|
|
else:
|
|
assert cmd_idx == 1
|
|
cmd += ["."]
|
|
|
|
if desktop:
|
|
cmd += ["--genomeLoad", "NoSharedMemory"]
|
|
else:
|
|
cmd += ["--genomeLoad", "LoadAndKeep"]
|
|
if type == "x2":
|
|
if cmd_idx == 1:
|
|
cmd += ["--alignSJDBoverhangMin", "1"]
|
|
cmd += ["--readFilesIn",
|
|
read1_fname]
|
|
if paired:
|
|
cmd += [read2_fname]
|
|
if paired:
|
|
cmd += ["--outFilterMismatchNmax", "6"]
|
|
else:
|
|
cmd += ["--outFilterMismatchNmax", "3"]
|
|
elif aligner == "bowtie":
|
|
cmd += ["%s/bowtie" % (aligner_bin_base)]
|
|
if num_threads > 1:
|
|
cmd += ["-p", str(num_threads)]
|
|
cmd += ["-f",
|
|
"--sam",
|
|
"-k", "10"]
|
|
cmd += ["-n", "3"]
|
|
if paired:
|
|
cmd += ["-X", "500"]
|
|
cmd += ["%s/Bowtie%s/" % (index_base, index_add) + genome]
|
|
if paired:
|
|
cmd += ["-1", read1_fname,
|
|
"-2", read2_fname]
|
|
else:
|
|
cmd += [read1_fname]
|
|
elif aligner == "bowtie2":
|
|
if version:
|
|
cmd += ["%s/bowtie2_%s/bowtie2" % (aligner_bin_base, version)]
|
|
else:
|
|
cmd += ["%s/bowtie2" % (aligner_bin_base)]
|
|
if num_threads > 1:
|
|
cmd += ["-p", str(num_threads)]
|
|
cmd += ["-f"]
|
|
if options != "":
|
|
cmd += options.split(' ')
|
|
if version:
|
|
cmd += ["-x %s/Bowtie2_%s%s/" % (index_base, version, index_add) + genome]
|
|
else:
|
|
cmd += ["-x %s/Bowtie2%s/" % (index_base, index_add) + genome]
|
|
if paired:
|
|
cmd += ["-1", read1_fname,
|
|
"-2", read2_fname]
|
|
else:
|
|
cmd += [read1_fname]
|
|
elif aligner == "gsnap":
|
|
cmd += ["%s/gsnap" % (aligner_bin_base),
|
|
"-A",
|
|
"sam"]
|
|
if num_threads > 1:
|
|
cmd += ["-t", str(num_threads)]
|
|
cmd += ["--max-mismatches=3",
|
|
"-D", "%s/GSNAP%s" % (index_base, index_add),
|
|
"-N", "1",
|
|
"-d", genome,
|
|
read1_fname]
|
|
if paired:
|
|
cmd += [read2_fname]
|
|
elif aligner == "bwa":
|
|
if version:
|
|
cmd += ["%s/bwa_%s/bwa" % (aligner_bin_base, version)]
|
|
else:
|
|
cmd += ["%s/bwa" % (aligner_bin_base)]
|
|
if type in ["mem", "aln"]:
|
|
cmd += [type]
|
|
elif type == "sw":
|
|
cmd += ["bwa" + type]
|
|
if num_threads > 1:
|
|
cmd += ["-t", str(num_threads)]
|
|
if options != "":
|
|
cmd += options.split(' ')
|
|
# if paired:
|
|
# cmd += ["-T", "60"]
|
|
if version:
|
|
cmd += ["%s/BWA_%s%s/%s.fa" % (index_base, version, index_add, genome)]
|
|
else:
|
|
cmd += ["%s/BWA%s/%s.fa" % (index_base, index_add, genome)]
|
|
cmd += [read1_fname]
|
|
if paired:
|
|
cmd += [read2_fname]
|
|
elif aligner == "vg":
|
|
# vg map -d 22 -t 6 -M 10 -f ../sim-1.fa -f ../sim-2.fa > result.sam.gam
|
|
cmd += ["%s/vg" % (aligner_bin_base)]
|
|
cmd += ["map"]
|
|
cmd += ["-t", str(num_threads)]
|
|
cmd += ["--surject-to", "sam"]
|
|
index_cmd = "%s/VG%s/" % (index_base, index_add) + genome
|
|
if index_type:
|
|
index_cmd += ("_" + index_type)
|
|
|
|
if options != "":
|
|
cmd += options.split(' ')
|
|
|
|
cmd += ["-d", index_cmd]
|
|
|
|
cmd += ["-f", read1_fname]
|
|
if paired:
|
|
cmd += ["-f", read2_fname]
|
|
elif aligner == "minimap2":
|
|
# minimap2 -a -x sr 22.mmi sim_1.fa sim_2.fa > result.sam
|
|
cmd += ["%s/minimap2" % (aligner_bin_base)]
|
|
cmd += ["-a"]
|
|
cmd += ["-x", "sr"]
|
|
index_cmd = "%s/minimap2%s/" % (index_base, index_add) + genome
|
|
if index_type:
|
|
index_cmd += ("_" + index_type)
|
|
index_cmd += ".mmi"
|
|
cmd += [index_cmd]
|
|
cmd += [read1_fname]
|
|
if paired:
|
|
cmd += [read2_fname]
|
|
else:
|
|
assert False
|
|
|
|
return cmd
|
|
|
|
for aligner, type, index_type, version, options in aligners:
|
|
skip = False
|
|
if len(test_aligners) > 0:
|
|
skip = True
|
|
for test_aligner in test_aligners:
|
|
if aligner == test_aligner:
|
|
skip = False
|
|
if skip:
|
|
continue
|
|
|
|
aligner_name = aligner + type
|
|
if version != "":
|
|
aligner_name += ("_%s" % version)
|
|
if aligner == "hisat2" and index_type != "":
|
|
aligner_name += ("_" + index_type)
|
|
if aligner == "vg" and index_type != "":
|
|
aligner_name += ("_" + index_type)
|
|
|
|
two_step = (aligner == "tophat2" or type == "x2" or (aligner in ["hisat2", "hisat"] and type == ""))
|
|
if RNA and readtype != "M":
|
|
if aligner in ["bowtie", "bowtie2", "bwa"]:
|
|
continue
|
|
if readtype != "all":
|
|
if two_step:
|
|
continue
|
|
if not RNA and readtype != "all":
|
|
continue
|
|
|
|
print >> sys.stderr, "\t%s\t%s" % (aligner_name, str(datetime.now()))
|
|
if options != "":
|
|
option_name = options.replace(' ', '').replace('-', '').replace(',', '')
|
|
aligner_name = aligner_name + '_' + option_name
|
|
if paired:
|
|
aligner_dir = aligner_name + "_paired"
|
|
else:
|
|
aligner_dir = aligner_name + "_single"
|
|
|
|
if fresh and os.path.exists(aligner_dir):
|
|
os.system("rm -rf %s" % aligner_dir)
|
|
|
|
if not os.path.exists(aligner_dir):
|
|
os.mkdir(aligner_dir)
|
|
|
|
os.chdir(aligner_dir)
|
|
|
|
out_fname = base_fname + "_" + readtype + ".sam"
|
|
out_fname2 = out_fname + "2"
|
|
duration = -1.0
|
|
mem_usage = '0'
|
|
if not os.path.exists(out_fname):
|
|
if not os.path.exists("../one.fa") or not os.path.exists("../two.fa"):
|
|
os.system("head -400 ../%s_1.fa > ../one.fa" % (data_base))
|
|
os.system("head -400 ../%s_2.fa > ../two.fa" % (data_base))
|
|
|
|
if runtime_only:
|
|
out_fname = "/dev/null"
|
|
out_fname2 = "/dev/null"
|
|
|
|
if not two_step:
|
|
align_stat.append([readtype, aligner_name])
|
|
|
|
# dummy commands for caching index and simulated reads
|
|
loading_time = 0
|
|
if aligner != "tophat2":
|
|
for i in range(3):
|
|
dummy_cmd = get_aligner_cmd(RNA, aligner, type, index_type, version, options, "../one.fa", "../two.fa", "/dev/null")
|
|
start_time = datetime.now()
|
|
if verbose:
|
|
print >> sys.stderr, start_time, "\t", " ".join(dummy_cmd)
|
|
if aligner in ["hisat2", "hisat", "bowtie", "bowtie2", "gsnap", "bwa"]:
|
|
proc = subprocess.Popen(dummy_cmd, stdout=open("/dev/null", "w"), stderr=subprocess.PIPE)
|
|
else:
|
|
proc = subprocess.Popen(dummy_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
|
proc.communicate()
|
|
finish_time = datetime.now()
|
|
duration = finish_time - start_time
|
|
duration = duration.total_seconds()
|
|
if verbose:
|
|
print >> sys.stderr, finish_time, "duration:", duration
|
|
loading_time = duration
|
|
|
|
# Align all reads
|
|
aligner_cmd = get_aligner_cmd(RNA, aligner, type, index_type, version, options, "../" + type_read1_fname, "../" + type_read2_fname, out_fname)
|
|
start_time = datetime.now()
|
|
if verbose:
|
|
print >> sys.stderr, "\t", start_time, " ".join(aligner_cmd)
|
|
if aligner in ["hisat2", "hisat", "bowtie", "bowtie2", "gsnap", "bwa", "vg", "minimap2", "hisat-TLA"]:
|
|
proc = subprocess.Popen(aligner_cmd, stdout=open(out_fname, "w"), stderr=subprocess.PIPE)
|
|
else:
|
|
proc = subprocess.Popen(aligner_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
|
_, mem_usage = proc.communicate()
|
|
mem_usage = parse_mem_usage(mem_usage)
|
|
finish_time = datetime.now()
|
|
duration = finish_time - start_time
|
|
duration = duration.total_seconds() - loading_time
|
|
if duration < 0.1:
|
|
duration = 0.1
|
|
if verbose:
|
|
print >> sys.stderr, "\t", finish_time, "finished:", duration
|
|
|
|
if debug and aligner == "hisat2":
|
|
os.system("cat metrics.out")
|
|
print >> sys.stderr, "\ttime: %.4f" % (duration)
|
|
|
|
if aligner == "star" and type in ["", "gtf"]:
|
|
os.system("mv Aligned.out.sam %s" % out_fname)
|
|
elif aligner in ["hisat2", "hisat"] and type == "x2":
|
|
aligner_cmd = get_aligner_cmd(RNA, aligner, type, index_type, version, options, "../" + type_read1_fname, "../" + type_read2_fname, out_fname, 1)
|
|
start_time = datetime.now()
|
|
if verbose:
|
|
print >> sys.stderr, "\t", start_time, " ".join(aligner_cmd)
|
|
proc = subprocess.Popen(aligner_cmd, stdout=open(out_fname, "w"), stderr=subprocess.PIPE)
|
|
proc.communicate()
|
|
finish_time = datetime.now()
|
|
duration += (finish_time - start_time).total_seconds()
|
|
duration -= loading_time
|
|
if duration < 0.1:
|
|
duration = 0.1
|
|
if verbose:
|
|
print >> sys.stderr, "\t", finish_time, "finished:", duration
|
|
elif aligner == "star" and type == "x2":
|
|
assert os.path.exists("SJ.out.tab")
|
|
os.system("awk 'BEGIN {OFS=\"\t\"; strChar[0]=\".\"; strChar[1]=\"+\"; strChar[2]=\"-\";} {if($5>0){print $1,$2,$3,strChar[$4]}}' SJ.out.tab > SJ.out.tab.Pass1.sjdb")
|
|
for file in os.listdir("."):
|
|
if file in ["SJ.out.tab.Pass1.sjdb", "genome.fa"]:
|
|
continue
|
|
os.remove(file)
|
|
star_index_cmd = "%s/STAR --genomeDir ./ --runMode genomeGenerate --genomeFastaFiles ../../../data/%s.fa --sjdbFileChrStartEnd SJ.out.tab.Pass1.sjdb --sjdbOverhang 99 --runThreadN %d" % (aligner_bin_base, genome, num_threads)
|
|
if verbose:
|
|
print >> sys.stderr, "\t", datetime.now(), star_index_cmd
|
|
os.system(star_index_cmd)
|
|
if verbose:
|
|
print >> sys.stderr, "\t", datetime.now(), " ".join(dummy_cmd)
|
|
proc = subprocess.Popen(dummy_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
|
proc.communicate()
|
|
if verbose:
|
|
print >> sys.stderr, "\t", datetime.now(), "finished"
|
|
aligner_cmd = get_aligner_cmd(RNA, aligner, type, index_type, version, options, "../" + type_read1_fname, "../" + type_read2_fname, out_fname, 1)
|
|
start_time = datetime.now()
|
|
if verbose:
|
|
print >> sys.stderr, "\t", start_time, " ".join(aligner_cmd)
|
|
proc = subprocess.Popen(aligner_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
|
proc.communicate()
|
|
finish_time = datetime.now()
|
|
duration += (finish_time - start_time).total_seconds()
|
|
duration -= loading_time
|
|
if duration < 0.1:
|
|
duration = 0.1
|
|
if verbose:
|
|
print >> sys.stderr, "\t", finish_time, "finished:", duration
|
|
os.system("mv Aligned.out.sam %s" % out_fname)
|
|
elif aligner == "tophat2":
|
|
os.system("samtools sort -n tophat_out/accepted_hits.bam accepted_hits; samtools view -h accepted_hits.bam > %s" % out_fname)
|
|
elif aligner == "vg":
|
|
index_name = "%s/VG%s/" % (index_base, index_add) + genome
|
|
if index_type:
|
|
index_name += ("_" + index_type)
|
|
|
|
if aligner in ["gsnap", "tophat2"]:
|
|
os.system("tar cvzf %s.tar.gz %s &> /dev/null" % (out_fname, out_fname))
|
|
|
|
if runtime_only:
|
|
print >> sys.stderr, "\t\t\tMemory Usage: %dMB" % (int(mem_usage) / 1024)
|
|
os.chdir("..")
|
|
continue
|
|
|
|
if not os.path.exists(out_fname2):
|
|
debug_dic = {}
|
|
pid_list = []
|
|
if paired:
|
|
if mp_mode:
|
|
for i in xrange(mp_num):
|
|
p = Process(target=extract_pair, args=(out_fname, out_fname2, chr_dic, RNA, aligner, version, repeat_db, repeat_map, debug_dic, i))
|
|
pid_list.append(p)
|
|
p.start()
|
|
|
|
for p in pid_list:
|
|
p.join()
|
|
|
|
# merge
|
|
os.system("mv %s %s" % (out_fname2 + ".0", out_fname2))
|
|
for i in xrange(1, mp_num):
|
|
os.system("cat %s >> %s" % (out_fname2 + "." + str(i), out_fname2))
|
|
os.system("rm %s" % (out_fname2 + "." + str(i)))
|
|
|
|
else:
|
|
extract_pair(out_fname, out_fname2, chr_dic, RNA, aligner, version, repeat_db, repeat_map, debug_dic, -1)
|
|
|
|
|
|
else:
|
|
if mp_mode:
|
|
# Prepare queues
|
|
for i in xrange(mp_num):
|
|
p = Process(target=extract_single, args=(out_fname, out_fname2, chr_dic, aligner, version, repeat_db, repeat_map, debug_dic, i))
|
|
pid_list.append(p)
|
|
p.start()
|
|
|
|
# wait
|
|
for p in pid_list:
|
|
p.join()
|
|
|
|
# merge
|
|
os.system("mv %s %s" % (out_fname2 + ".0", out_fname2))
|
|
for i in xrange(1, mp_num):
|
|
os.system("cat %s >> %s" % (out_fname2 + "." + str(i), out_fname2))
|
|
os.system("rm %s" % (out_fname2 + "." + str(i)))
|
|
|
|
else:
|
|
extract_single(out_fname, out_fname2, chr_dic, aligner, version, repeat_db, repeat_map, debug_dic, -1)
|
|
|
|
for readtype2 in readtypes:
|
|
if not two_step and readtype != readtype2:
|
|
continue
|
|
|
|
type_sam_fname2 = base_fname + "_" + readtype2 + ".sam"
|
|
if os.path.exists(type_sam_fname2 + ".done"):
|
|
continue
|
|
|
|
if paired:
|
|
type_read_fname2 = base_fname + "_1_" + readtype2 + ".fa"
|
|
else:
|
|
type_read_fname2 = base_fname + "_" + readtype2 + ".fa"
|
|
mapped_id_fname = base_fname + "_" + readtype2 + ".read_id"
|
|
if paired:
|
|
mapped, unique_mapped, first_mapped, unmapped, aligned, multi_aligned, \
|
|
snp_mapped, snp_unique_mapped, snp_first_mapped, snp_unmapped, \
|
|
temp_junctions, temp_gtf_junctions, mapping_point \
|
|
= compare_paired_sam(RNA, out_fname2, "../" + type_sam_fname2, mapped_id_fname, chr_dic, junctions, junctions_set, gtf_junctions, aligner)
|
|
else:
|
|
mapped, unique_mapped, first_mapped, unmapped, aligned, multi_aligned, \
|
|
snp_mapped, snp_unique_mapped, snp_first_mapped, snp_unmapped, \
|
|
temp_junctions, temp_gtf_junctions, mapping_point \
|
|
= compare_single_sam(RNA, out_fname2, "../" + type_sam_fname2, mapped_id_fname, chr_dic, junctions, junctions_set, gtf_junctions, aligner)
|
|
proc = subprocess.Popen(["wc", "-l", "../" + type_read_fname2], stdout=subprocess.PIPE)
|
|
out = proc.communicate()[0]
|
|
numreads = int(out.split()[0]) / 2
|
|
assert mapped + unmapped == numreads
|
|
|
|
if two_step:
|
|
print >> sys.stderr, "\t\t%s" % readtype2
|
|
print >> sys.stderr, "\t\taligned: %d, multi aligned: %d" % (aligned, multi_aligned)
|
|
print >> sys.stderr, "\t\tcorrectly mapped: %d (%.2f%%) mapping_point: %.2f" % (mapped, float(mapped) * 100.0 / numreads, mapping_point * 100.0 / numreads)
|
|
print >> sys.stderr, "\t\tcorrectly mapped at first: %d (%.2f%%)" % (first_mapped, float(first_mapped) * 100.0 / numreads)
|
|
print >> sys.stderr, "\t\tuniquely and correctly mapped: %d (%.2f%%)" % (unique_mapped, float(unique_mapped) * 100.0 / numreads)
|
|
snp_numreads = snp_mapped + snp_unmapped
|
|
if snp_numreads > 0:
|
|
print >> sys.stderr, "\t\t\t\tSNP: reads: %d" % (snp_numreads)
|
|
print >> sys.stderr, "\t\t\t\tSNP: correctly mapped: %d (%.2f%%)" % (snp_mapped, float(snp_mapped) * 100.0 / snp_numreads)
|
|
print >> sys.stderr, "\t\t\t\tSNP: correctly mapped at first: %d (%.2f%%)" % (snp_first_mapped, float(snp_first_mapped) * 100.0 / snp_numreads)
|
|
print >> sys.stderr, "\t\t\t\tSNP: uniquely and correctly mapped: %d (%.2f%%)" % (snp_unique_mapped, float(snp_unique_mapped) * 100.0 / snp_numreads)
|
|
if readtype == readtype2:
|
|
print >> sys.stderr, "\t\t\t%d reads per sec (all)" % (numreads / max(1.0, duration))
|
|
if RNA:
|
|
print >> sys.stderr, "\t\tjunc. sensitivity %d / %d (%.2f%%), junc. accuracy: %d / %d (%.2f%%)" % \
|
|
(temp_gtf_junctions, len(junctions), float(temp_gtf_junctions) * 100.0 / max(1, len(junctions)), \
|
|
temp_gtf_junctions, temp_junctions, float(temp_gtf_junctions) * 100.0 / max(1, temp_junctions))
|
|
|
|
print >> sys.stderr, "\t\t\tMemory Usage: %dMB" % (int(mem_usage) / 1024)
|
|
|
|
if duration > 0.0:
|
|
if sql_write and os.path.exists("../" + sql_db_name):
|
|
if paired:
|
|
end_type = "paired"
|
|
else:
|
|
end_type = "single"
|
|
|
|
mem_used = int(mem_usage) / 1024
|
|
sql_insert = "INSERT INTO \"ReadCosts\" VALUES(NULL, '%s', '%s', '%s', '%s', '%s', '%s', %d, %d, %d, %d, %f, %f, %d, %d, %d, %d, %d, %d, %d, '%s', datetime('now', 'localtime'), '%s');" % \
|
|
(genome, data_base, end_type, readtype2, aligner_name, get_aligner_version(aligner, version), numreads, mapped, unique_mapped, unmapped, mapping_point, snp_mapped, snp_unique_mapped, snp_unmapped, duration, mem_used, len(junctions), temp_junctions, temp_gtf_junctions, platform.node(), " ".join(aligner_cmd))
|
|
sql_execute("../" + sql_db_name, sql_insert)
|
|
|
|
if two_step:
|
|
align_stat.append([readtype2, aligner_name, numreads, duration, mem_used, mapped, unique_mapped, unmapped, mapping_point, snp_mapped, snp_unique_mapped, snp_unmapped, len(junctions), temp_junctions, temp_gtf_junctions])
|
|
else:
|
|
align_stat[-1].extend([numreads, duration, mem_used, mapped, unique_mapped, unmapped, mapping_point, snp_mapped, snp_unique_mapped, snp_unmapped, len(junctions), temp_junctions, temp_gtf_junctions])
|
|
|
|
os.system("touch %s.done" % type_sam_fname2)
|
|
|
|
os.chdir("..")
|
|
|
|
print >> sys.stdout, "\t".join(["type", "aligner", "all", "all_time", "mem", "mapped", "unique_mapped", "unmapped", "mapping point", "snp_mapped", "snp_unique_mapped", "snp_unmapped", "true_gtf_junctions", "temp_junctions", "temp_gtf_junctions"])
|
|
for line in align_stat:
|
|
outstr = ""
|
|
for item in line:
|
|
if outstr != "":
|
|
outstr += "\t"
|
|
outstr += str(item)
|
|
print >> sys.stdout, outstr
|
|
|
|
if os.path.exists(sql_db_name):
|
|
write_analysis_data(sql_db_name, genome, data_base)
|
|
|
|
|
|
|
|
"""
|
|
"""
|
|
if __name__ == "__main__":
|
|
parser = ArgumentParser(
|
|
description='test HISAT2, and compare HISAT2 with other popular aligners such as TopHat2, STAR, Bowtie1/2, GSNAP, BWA-mem, etc.')
|
|
parser.add_argument('--single-end',
|
|
dest='paired_end',
|
|
action='store_false',
|
|
help='run single-end only')
|
|
parser.add_argument('--paired-end',
|
|
dest='single_end',
|
|
action='store_false',
|
|
help='run paired_end only')
|
|
parser.add_argument('--aligner-list',
|
|
dest='aligner_list',
|
|
type=str,
|
|
default="",
|
|
help='comma-separated list of aligners (e.g. hisat2,bowtie2,bwa')
|
|
parser.add_argument('--fresh',
|
|
dest='fresh',
|
|
action='store_true',
|
|
help='delete existing alignment related directories (e.g. hisat2_single)')
|
|
parser.add_argument('--runtime-only', '--runtime',
|
|
dest='runtime_only',
|
|
action='store_true',
|
|
help='run programs without evaluation')
|
|
parser.add_argument('-p', '--multi-process',
|
|
dest='mp_num',
|
|
action='store',
|
|
type=int,
|
|
default=1,
|
|
help='Use multiple process mode')
|
|
parser.add_argument('--base-change',
|
|
dest='baseChange',
|
|
type=str,
|
|
default="",
|
|
help='base change for Hisat-TLA')
|
|
parser.add_argument('-v', '--verbose',
|
|
dest='verbose',
|
|
action='store_true',
|
|
help='also print some statistics to stderr')
|
|
|
|
args = parser.parse_args()
|
|
|
|
aligners = []
|
|
for aligner in args.aligner_list.split(','):
|
|
if aligner == "":
|
|
continue
|
|
aligners.append(aligner)
|
|
|
|
mp_num = args.mp_num
|
|
mp_mode = (mp_num > 1)
|
|
|
|
calculate_read_cost(args.single_end,
|
|
args.paired_end,
|
|
aligners,
|
|
args.fresh,
|
|
args.runtime_only,
|
|
args.baseChange,
|
|
args.verbose)
|