hisat-3n/hisat2_simulate_reads.py

972 lines
34 KiB
Python
Raw Normal View History

2025-01-18 13:09:52 +00:00
#!/usr/bin/env python3
#
# Copyright 2015, Daehwan Kim <infphilo@gmail.com>
#
# This file is part of HISAT 2.
#
# HISAT 2 is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# HISAT 2 is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with HISAT 2. If not, see <http://www.gnu.org/licenses/>.
#
import os, sys, math, random, re
from collections import defaultdict, Counter
from argparse import ArgumentParser, FileType
"""
"""
def reverse_complement(seq):
result = ""
for nt in seq:
base = nt
if nt == 'A':
base = 'T'
elif nt == 'a':
base = 't'
elif nt == 'C':
base = 'G'
elif nt == 'c':
base = 'g'
elif nt == 'G':
base = 'C'
elif nt == 'g':
base = 'c'
elif nt == 'T':
base = 'A'
elif nt == 't':
base = 'a'
result = base + result
return result
"""
python2 style randint
"""
def myrandint(m, x):
s = x - m + 1
return m + int(random.random() * s)
"""
Random source for sequencing errors
"""
class ErrRandomSource:
def __init__(self, prob = 0.0, size = 1 << 20):
self.size = size
self.rands = []
for i in range(self.size):
if random.random() < prob:
self.rands.append(1)
else:
self.rands.append(0)
self.cur = 0
def getRand(self):
assert self.cur < len(self.rands)
rand = self.rands[self.cur]
self.cur = (self.cur + 1) % len(self.rands)
return rand
"""
"""
def read_genome(genome_file):
chr_dic = {}
chr_name, sequence = "", ""
for line in genome_file:
if line[0] == ">":
if chr_name and sequence:
chr_dic[chr_name] = sequence
chr_name = line.strip().split()[0][1:]
sequence = ""
else:
sequence += line[:-1]
if chr_name and sequence:
chr_dic[chr_name] = sequence
chr_filter = [str(x) for x in list(range(1, 23)) + ['X', 'Y']]
#chr_filter = None
if chr_filter:
for chr_id, chr_seq in chr_dic.items():
if not chr_id in chr_filter:
chr_dic.pop(chr_id, None)
return chr_dic
"""
"""
def read_transcript(genome_seq, gtf_file, frag_len):
genes = defaultdict(list)
transcripts = {}
# Parse valid exon lines from the GTF file into a dict by transcript_id
for line in gtf_file:
line = line.strip()
if not line or line.startswith('#'):
continue
if '#' in line:
line = line.split('#')[0].strip()
try:
chrom, source, feature, left, right, score, \
strand, frame, values = line.split('\t')
except ValueError:
continue
if chrom not in genome_seq:
continue
# Zero-based offset
left, right = int(left) - 1, int(right) - 1
if feature != 'exon' or left >= right:
continue
values_dict = {}
for attr in values.split(';')[:-1]:
attr, _, val = attr.strip().partition(' ')
values_dict[attr] = val.strip('"')
if 'gene_id' not in values_dict or \
'transcript_id' not in values_dict:
continue
transcript_id = values_dict['transcript_id']
if transcript_id not in transcripts:
transcripts[transcript_id] = [chrom, strand, [[left, right]]]
genes[values_dict['gene_id']].append(transcript_id)
else:
transcripts[transcript_id][2].append([left, right])
# Sort exons and merge where separating introns are <=5 bps
for tran, [chr, strand, exons] in transcripts.items():
exons.sort()
tmp_exons = [exons[0]]
for i in range(1, len(exons)):
if exons[i][0] - tmp_exons[-1][1] <= 5:
tmp_exons[-1][1] = exons[i][1]
else:
tmp_exons.append(exons[i])
transcripts[tran] = [chr, strand, tmp_exons]
tmp_transcripts = {}
for tran, [chr, strand, exons] in transcripts.items():
exon_lens = [e[1] - e[0] + 1 for e in exons]
transcript_len = sum(exon_lens)
if transcript_len >= frag_len:
tmp_transcripts[tran] = [chr, strand, transcript_len, exons]
transcripts = tmp_transcripts
return genes, transcripts
"""
"""
def read_snp(snp_file):
snps = defaultdict(list)
for line in snp_file:
line = line.strip()
if not line or line.startswith('#'):
continue
try:
snpID, type, chr, pos, data = line.split('\t')
except ValueError:
continue
assert type in ["single", "deletion", "insertion"]
if type == "deletion":
data = int(data)
snps[chr].append([snpID, type, int(pos), data])
return snps
"""
"""
def sanity_check_input(genome_seq, genes, transcripts, snps, frag_len):
num_canon_ss, num_ss = 0, 0
for transcript, [chr, strand, transcript_len, exons] in transcripts.items():
assert transcript_len >= frag_len
if len(exons) <= 1:
continue
if chr not in genome_seq:
continue
chr_seq = genome_seq[chr]
for i in range(len(exons) - 1):
left1, right1 = exons[i]
assert left1 < right1
left2, right2 = exons[i+1]
assert left2 < right2
assert left1 < left2 and right1 < right2
donor = chr_seq[right1+1:right1+3]
acceptor = chr_seq[left2-2:left2]
if strand == "-":
donor, acceptor = reverse_complement(acceptor), reverse_complement(donor)
if donor == "GT" and acceptor == "AG":
num_canon_ss += 1
num_ss += 1
if num_ss > 0:
print("GT/AG splice sites: {}/{} ({:.2%})".format(num_canon_ss, num_ss, (float(num_canon_ss) / num_ss)), file=sys.stderr)
num_alt_single, num_single = 0, 0
for chr, chr_snps in snps.items():
if chr not in genome_seq:
continue
chr_seq = genome_seq[chr]
prev_snp = None
for snp in chr_snps:
snpID, type, pos, data = snp
if prev_snp:
assert prev_snp[2] <= pos
prev_snp = snp
if type != "single":
continue
assert pos < len(chr_seq)
if chr_seq[pos] != data:
num_alt_single += 1
num_single += 1
if num_single > 0:
print("Alternative bases: {}/{} ({:.2%})".format(num_alt_single, num_single, (float(num_alt_single) / num_single)), file=sys.stderr)
"""
"""
def generate_rna_expr_profile(expr_profile_type, num_transcripts = 10000):
# Modelling and simulating generic RNA-Seq experiments with the flux simulator
# http://nar.oxfordjournals.org/content/suppl/2012/06/29/gks666.DC1/nar-02667-n-2011-File002.pdf
def calc_expr(x, a):
x, a, b = float(x), 9500.0, 9500.0
k = -0.6
return (x**k) * math.exp(x/a * (x/b)**2)
expr_profile = [0.0] * num_transcripts
for i in range(len(expr_profile)):
if expr_profile_type == "flux":
expr_profile[i] = calc_expr(i + 1, num_transcripts)
elif expr_profile_type == "constant":
expr_profile[i] = 1.0
else:
assert False
expr_sum = sum(expr_profile)
expr_profile = [expr_profile[i] / expr_sum for i in range(len(expr_profile))]
assert abs(sum(expr_profile) - 1.0) < 0.001
return expr_profile
"""
"""
def generate_dna_expr_profile(genome_seq):
expr_profile = []
for chr_id, chr_seq in genome_seq.items():
expr_profile.append(len(chr_seq))
expr_sum = float(sum(expr_profile))
expr_profile = [expr_profile[i] / expr_sum for i in range(len(expr_profile))]
assert abs(sum(expr_profile) - 1.0) < 0.001
return expr_profile
"""
"""
def getSNPs(chr_snps, left, right):
low, high = 0, len(chr_snps)
while low < high:
mid = (low + high) // 2
snpID, type, pos, data = chr_snps[mid]
if pos < left:
low = mid + 1
else:
high = mid - 1
snps = []
for snp in chr_snps[low:]:
snpID, type, pos, data = snp
pos2 = pos
if type == "deletion":
pos2 += data
if pos2 >= right:
break
if pos >= left:
if len(snps) > 0:
_, prev_type, prev_pos, prev_data = snps[-1]
assert prev_pos <= pos
prev_pos2 = prev_pos
if prev_type == "deletion":
prev_pos2 += prev_data
if pos <= prev_pos2:
continue
snps.append(snp)
return snps
"""
"""
def getSamAlignment(rna, exons, chr_seq, trans_seq, frag_pos, read_len, chr_snps, snp_prob, err_rand_src, max_mismatch):
# Find the genomic position for frag_pos and exon number
tmp_frag_pos, tmp_read_len = frag_pos, read_len
pos, cigars, cigar_descs = exons[0][0], [], []
e_pos = 0
prev_e = None
for e_i in range(len(exons)):
e = exons[e_i]
if prev_e:
i_len = e[0] - prev_e[1] - 1
pos += i_len
e_len = e[1] - e[0] + 1
if e_len <= tmp_frag_pos:
tmp_frag_pos -= e_len
pos += e_len
else:
pos += tmp_frag_pos
e_pos = tmp_frag_pos
break
prev_e = e
# Define Cigar and its descriptions
assert e_i < len(exons)
e_len = exons[e_i][1] - exons[e_i][0] + 1
assert e_pos < e_len
cur_pos = pos
match_len = 0
prev_e = None
mismatch, remain_trans_len = 0, len(trans_seq) - (frag_pos + read_len)
assert remain_trans_len >= 0
for e_i in range(e_i, len(exons)):
e = exons[e_i]
if prev_e:
i_len = e[0] - prev_e[1] - 1
cur_pos += i_len
cigars.append(("{}N".format(i_len)))
cigar_descs.append([])
tmp_e_left = e_left = e[0] + e_pos
e_pos = 0
# Retreive SNPs
if rna:
snps = getSNPs(chr_snps, e_left, e[1])
else:
snps = getSNPs(chr_snps, frag_pos, frag_pos + read_len)
if snp_prob < 1.0 and len(snps) > 0:
snps_ = []
for snp in snps:
if random.random() <= snp_prob:
snps_.append(snp)
snps = snps_
# Simulate mismatches due to sequencing errors
mms = []
for i in range(e_left, min(e[1], e_left + tmp_read_len - 1)):
if err_rand_src.getRand() == 1:
assert i < len(chr_seq)
err_base = "A"
#rand = random.randint(0, 2)
rand = myrandint(0, 2)
if chr_seq[i] == "A":
err_base = "GCT"[rand]
elif chr_seq[i] == "C":
err_base = "AGT"[rand]
elif chr_seq[i] == "G":
err_base = "ACT"[rand]
else:
err_base = "ACG"[rand]
mms.append(["", "single", i, err_base])
tmp_diffs = snps + mms
# def diff_sort(a , b):
# return a[2] - b[2]
tmp_diffs = sorted(tmp_diffs, key=lambda t: t[2])
diffs = []
if len(tmp_diffs) > 0:
diffs = tmp_diffs[:1]
for diff in tmp_diffs[1:]:
_, tmp_type, tmp_pos, tmp_data = diff
_, prev_type, prev_pos, prev_data = diffs[-1]
if prev_type == "deletion":
prev_pos += prev_data
if tmp_pos <= prev_pos:
continue
diffs.append(diff)
cigar_descs.append([])
prev_diff = None
for diff in diffs:
diff_id, diff_type, diff_pos, diff_data = diff
if prev_diff:
prev_diff_id, prev_diff_type, prev_diff_pos, prev_diff_data = prev_diff
if prev_diff_type == "deletion":
prev_diff_pos += prev_diff_data
assert prev_diff_pos < diff_pos
diff_pos2 = diff_pos
if diff_type == "deletion":
diff_pos2 += diff_data
if e_left + tmp_read_len - 1 < diff_pos2 or e[1] < diff_pos2:
break
if diff_type == "single":
if mismatch + 1 > max_mismatch:
continue
cigar_descs[-1].append([diff_pos - tmp_e_left, diff_data, diff_id])
tmp_e_left = diff_pos + 1
mismatch += 1
elif diff_type == "deletion":
del_len = diff_data
if mismatch + del_len > max_mismatch:
continue
if len(cigars) <= 0 and diff_pos - e_left <= 0:
continue
if remain_trans_len < del_len:
continue
remain_trans_len -= del_len
if diff_pos - e_left > 0:
cigars.append("{}M".format(diff_pos - e_left))
cigar_descs[-1].append([diff_pos - tmp_e_left, "", ""])
cigar_descs.append([])
cigars.append("{}D".format(del_len))
cigar_descs[-1].append([0, del_len, diff_id])
cigar_descs.append([])
tmp_read_len -= (diff_pos - e_left)
e_left = tmp_e_left = diff_pos + del_len
elif diff_type == "insertion":
ins_len = len(diff_data)
if mismatch + ins_len > max_mismatch:
continue
if len(cigars) <= 0 and diff_pos - e_left <= 0:
continue
if e_left + tmp_read_len - 1 < diff_pos + ins_len:
break
if diff_pos - e_left > 0:
cigars.append("{}M".format(diff_pos - e_left))
cigar_descs[-1].append([diff_pos - tmp_e_left, "", ""])
cigar_descs.append([])
cigars.append("{}I".format(ins_len))
cigar_descs[-1].append([0, diff_data, diff_id])
cigar_descs.append([])
tmp_read_len -= (diff_pos - e_left)
tmp_read_len -= ins_len
e_left = tmp_e_left = diff_pos
else:
assert False
prev_diff = diff
e_right = min(e[1], e_left + tmp_read_len - 1)
e_len = e_right - e_left + 1
remain_e_len = e_right - tmp_e_left + 1
if remain_e_len > 0:
cigar_descs[-1].append([remain_e_len, "", ""])
if e_len < tmp_read_len:
tmp_read_len -= e_len
cigars.append(("{}M".format(e_len)))
else:
assert e_len == tmp_read_len
cigars.append(("{}M".format(tmp_read_len)))
tmp_read_len = 0
break
prev_e = e
# Define MD, XM, NM, Zs, read_seq
MD, XM, NM, Zs, read_seq = "", 0, 0, "", ""
assert len(cigars) == len(cigar_descs)
MD_match_len, Zs_match_len = 0, 0
cur_trans_pos = frag_pos
for c in range(len(cigars)):
cigar = cigars[c]
cigar_len, cigar_op = int(cigar[:-1]), cigar[-1]
cigar_desc = cigar_descs[c]
if cigar_op == 'N':
continue
if cigar_op == 'M':
for add_match_len, alt_base, snp_id in cigar_desc:
MD_match_len += add_match_len
Zs_match_len += add_match_len
assert cur_trans_pos + add_match_len <= len(trans_seq)
read_seq += trans_seq[cur_trans_pos:cur_trans_pos+add_match_len]
cur_trans_pos += add_match_len
if alt_base != "":
if MD_match_len > 0:
MD += ("{}".format(MD_match_len))
MD_match_len = 0
MD += trans_seq[cur_trans_pos]
if snp_id != "":
if Zs != "":
Zs += ","
Zs += ("{}|S|{}".format(Zs_match_len, snp_id))
Zs_match_len = 0
else:
Zs_match_len += 1
if snp_id == "":
XM += 1
NM += 1
read_seq += alt_base
cur_trans_pos += 1
elif cigar_op == 'D':
assert len(cigar_desc) == 1
add_match_len, del_len, snp_id = cigar_desc[0]
MD_match_len += add_match_len
Zs_match_len += add_match_len
if MD_match_len > 0:
MD += ("{}".format(MD_match_len))
MD_match_len = 0
MD += ("^{}".format(trans_seq[cur_trans_pos:cur_trans_pos+cigar_len]))
read_seq += trans_seq[cur_trans_pos:cur_trans_pos+add_match_len]
if Zs != "":
Zs += ","
Zs += ("{}|D|{}".format(Zs_match_len, cigar_desc[0][-1]))
Zs_match_len = 0
cur_trans_pos += cigar_len
elif cigar_op == 'I':
assert len(cigar_desc) == 1
add_match_len, ins_seq, snp_id = cigar_desc[0]
ins_len = len(ins_seq)
MD_match_len += add_match_len
Zs_match_len += add_match_len
read_seq += trans_seq[cur_trans_pos:cur_trans_pos+add_match_len]
read_seq += ins_seq
if Zs != "":
Zs += ","
Zs += ("{}|I|{}".format(Zs_match_len, cigar_desc[0][-1]))
Zs_match_len = 0
else:
assert False
if MD_match_len > 0:
MD += ("{}".format(MD_match_len))
if len(read_seq) != read_len:
print("read length differs:", len(read_seq), "vs.", read_len, file=sys.stderr)
print(pos, "".join(cigars), cigar_descs, MD, XM, NM, Zs, file=sys.stderr)
assert False
return pos, cigars, cigar_descs, MD, XM, NM, Zs, read_seq
"""
"""
cigar_re = re.compile('\d+\w')
def samRepOk(genome_seq, read_seq, chr, pos, cigar, XM, NM, MD, Zs, max_mismatch):
assert chr in genome_seq
chr_seq = genome_seq[chr]
assert pos < len(chr_seq)
# Calculate XM and NM based on Cigar and Zs
cigars = cigar_re.findall(cigar)
cigars = [[int(cigars[i][:-1]), cigars[i][-1]] for i in range(len(cigars))]
ref_pos, read_pos = pos, 0
ann_ref_seq, ann_ref_rel, ann_read_seq, ann_read_rel = [], [], [], []
for i in range(len(cigars)):
cigar_len, cigar_op = cigars[i]
if cigar_op == "M":
partial_ref_seq = chr_seq[ref_pos:ref_pos+cigar_len]
partial_read_seq = read_seq[read_pos:read_pos+cigar_len]
assert len(partial_ref_seq) == len(partial_read_seq)
ann_ref_seq += list(partial_ref_seq)
ann_read_seq += list(partial_read_seq)
for j in range(len(partial_ref_seq)):
if partial_ref_seq[j] == partial_read_seq[j]:
ann_ref_rel.append("=")
ann_read_rel.append("=")
else:
ann_ref_rel.append("X")
ann_read_rel.append("X")
ref_pos += cigar_len
read_pos += cigar_len
elif cigar_op == "D":
partial_ref_seq = chr_seq[ref_pos:ref_pos+cigar_len]
ann_ref_rel += list(partial_ref_seq)
ann_ref_seq += list(partial_ref_seq)
ann_read_rel += (["-"] * cigar_len)
ann_read_seq += (["-"] * cigar_len)
ref_pos += cigar_len
elif cigar_op == "I":
partial_read_seq = read_seq[read_pos:read_pos+cigar_len]
ann_ref_rel += (["-"] * cigar_len)
ann_ref_seq += (["-"] * cigar_len)
ann_read_rel += list(partial_read_seq)
ann_read_seq += list(partial_read_seq)
read_pos += cigar_len
elif cigar_op == "N":
ref_pos += cigar_len
else:
assert False
assert len(ann_ref_seq) == len(ann_read_seq)
assert len(ann_ref_seq) == len(ann_ref_rel)
assert len(ann_ref_seq) == len(ann_read_rel)
ann_Zs_seq = ["0" for i in range(len(ann_ref_seq))]
Zss, Zs_i, snp_pos_add = [], 0, 0
if Zs != "":
Zss = Zs.split(',')
Zss = [zs.split('|') for zs in Zss]
ann_read_pos = 0
for zs in Zss:
zs_pos, zs_type, zs_id = zs
zs_pos = int(zs_pos)
for i in range(zs_pos):
while ann_read_rel[ann_read_pos] == '-':
ann_read_pos += 1
ann_read_pos += 1
if zs_type == "S":
ann_Zs_seq[ann_read_pos] = "1"
ann_read_pos += 1
elif zs_type == "D":
while ann_read_rel[ann_read_pos] == '-':
ann_Zs_seq[ann_read_pos] = "1"
ann_read_pos += 1
elif zs_type == "I":
while ann_ref_rel[ann_read_pos] == '-':
ann_Zs_seq[ann_read_pos] = "1"
ann_read_pos += 1
else:
assert False
tMD, tXM, tNM = "", 0, 0
match_len = 0
i = 0
while i < len(ann_ref_seq):
if ann_ref_rel[i] == "=":
assert ann_read_rel[i] == "="
match_len += 1
i += 1
continue
assert ann_read_rel[i] != "="
if ann_ref_rel[i] == "X" and ann_read_rel[i] == "X":
if match_len > 0:
tMD += ("{}".format(match_len))
match_len = 0
tMD += ann_ref_seq[i]
if ann_Zs_seq[i] == "0":
tXM += 1
tNM += 1
i += 1
else:
assert ann_ref_rel[i] == "-" or ann_read_rel[i] == "-"
if ann_ref_rel[i] == '-':
while ann_ref_rel[i] == '-':
if ann_Zs_seq[i] == "0":
tNM += 1
i += 1
else:
assert ann_read_rel[i] == '-'
del_seq = ""
while ann_read_rel[i] == '-':
del_seq += ann_ref_seq[i]
if ann_Zs_seq[i] == "0":
tNM += 1
i += 1
if match_len > 0:
tMD += ("{}".format(match_len))
match_len = 0
tMD += ("^{}".format(del_seq))
if match_len > 0:
tMD += ("{}".format(match_len))
if tMD != MD or tXM != XM or tNM != NM or XM > max_mismatch or XM != NM:
print(chr, pos, cigar, MD, XM, NM, Zs, file=sys.stderr)
print(tMD, tXM, tNM, file=sys.stderr)
assert False
"""
"""
def simulate_reads(genome_file, gtf_file, snp_file, base_fname,
rna, paired_end, read_len, frag_len,
num_frag, expr_profile_type, repeat_fname,
error_rate, max_mismatch,
random_seed, snp_prob, sanity_check, verbose):
random.seed(random_seed, version=1)
err_rand_src = ErrRandomSource(error_rate / 100.0)
if read_len > frag_len:
frag_len = read_len
genome_seq = read_genome(genome_file)
if rna:
genes, transcripts = read_transcript(genome_seq, gtf_file, frag_len)
else:
genes, transcripts = {}, {}
snps = read_snp(snp_file)
if sanity_check:
sanity_check_input(genome_seq, genes, transcripts, snps, frag_len)
if rna:
num_transcripts = min(len(transcripts), 10000)
expr_profile = generate_rna_expr_profile(expr_profile_type, num_transcripts)
else:
expr_profile = generate_dna_expr_profile(genome_seq)
expr_profile = [int(expr_profile[i] * num_frag) for i in range(len(expr_profile))]
assert num_frag >= sum(expr_profile)
while sum(expr_profile) < num_frag:
for i in range(min(num_frag - sum(expr_profile), len(expr_profile))):
expr_profile[i] += 1
assert num_frag == sum(expr_profile)
repeat_loci = {}
if repeat_fname != "" and os.path.exists(repeat_fname):
for line in open(repeat_fname):
if line.startswith('>'):
continue
coords = line.strip().split()
for coord in coords:
chr, pos, strand = coord.split(':')
if chr not in repeat_loci:
repeat_loci[chr] = []
repeat_loci[chr].append([int(pos), strand])
if rna:
transcript_ids = sorted(list(transcripts.keys()))
random.shuffle(transcript_ids, random=random.random)
assert len(transcript_ids) >= len(expr_profile)
else:
chr_ids = list(genome_seq.keys())
sam_file = open(base_fname + ".sam", "w")
# Write SAM header
print("@HD\tVN:1.0\tSO:unsorted", file=sam_file)
for chr in genome_seq.keys():
print("@SQ\tSN:%s\tLN:%d" % (chr, len(genome_seq[chr])), file=sam_file)
read_file = open(base_fname + "_1.fa", "w")
if paired_end:
read2_file = open(base_fname + "_2.fa", "w")
cur_read_id = 1
for t in range(len(expr_profile)):
t_num_frags = expr_profile[t]
if rna:
transcript_id = transcript_ids[t]
chr, strand, transcript_len, exons = transcripts[transcript_id]
print(transcript_id, t_num_frags, file=sys.stderr)
else:
chr = chr_ids[t]
print(chr, t_num_frags, file=sys.stderr)
assert chr in genome_seq
chr_seq = genome_seq[chr]
chr_len = len(chr_seq)
if chr in repeat_loci:
chr_repeat_loci = repeat_loci[chr]
else:
chr_repeat_loci = []
if rna:
t_seq = ""
for e in exons:
assert e[0] < e[1]
t_seq += chr_seq[e[0]:e[1]+1]
assert len(t_seq) == transcript_len
else:
t_seq = chr_seq
exons = [[0, chr_len - 1]]
if chr in snps:
chr_snps = snps[chr]
else:
chr_snps = []
for f in range(t_num_frags):
if rna:
#frag_pos = random.randint(0, transcript_len - frag_len)
frag_pos = myrandint(0, transcript_len - frag_len)
else:
while True:
if len(chr_repeat_loci):
#locus_id = random.randint(0, len(chr_repeat_loci) - 1)
locus_id = myrandint(0, len(chr_repeat_loci) - 1)
frag_pos = chr_repeat_loci[locus_id][0]
else:
#frag_pos = random.randint(0, chr_len - frag_len)
frag_pos = myrandint(0, chr_len - frag_len)
if 'N' not in chr_seq[frag_pos:frag_pos + frag_len]:
break
# SAM specification (v1.4)
# http://samtools.sourceforge.net/
flag, flag2 = 99, 163 # 83, 147
pos, cigars, cigar_descs, MD, XM, NM, Zs, read_seq = getSamAlignment(rna, exons, chr_seq, t_seq, frag_pos, read_len, chr_snps, snp_prob, err_rand_src, max_mismatch)
pos2, cigars2, cigar2_descs, MD2, XM2, NM2, Zs2, read2_seq = getSamAlignment(rna, exons, chr_seq, t_seq, frag_pos+frag_len-read_len, read_len, chr_snps, snp_prob, err_rand_src, max_mismatch)
swapped = False
if paired_end:
#if random.randint(0, 1) == 1:
if myrandint(0, 1) == 1:
swapped = True
if swapped:
flag, flag2 = flag - 16, flag2 - 16
pos, pos2 = pos2, pos
cigars, cigars2 = cigars2, cigars
cigar_descs, cigar2_descs = cigar2_descs, cigar_descs
read_seq, read2_seq = read2_seq, read_seq
XM, XM2 = XM2, XM
NM, NM2 = NM2, NM
MD, MD2 = MD2, MD
Zs, Zs2 = Zs2, Zs
cigar_str, cigar2_str = "".join(cigars), "".join(cigars2)
if sanity_check:
samRepOk(genome_seq, read_seq, chr, pos, cigar_str, XM, NM, MD, Zs, max_mismatch)
samRepOk(genome_seq, read2_seq, chr, pos2, cigar2_str, XM2, NM2, MD2, Zs2, max_mismatch)
if Zs != "":
Zs = ("\tZs:Z:{}".format(Zs))
if Zs2 != "":
Zs2 = ("\tZs:Z:{}".format(Zs2))
if rna:
XS = "\tXS:A:{}".format(strand)
TI = "\tTI:Z:{}".format(transcript_id)
else:
XS, TI = "", ""
print(">{}".format(cur_read_id), file=read_file)
if swapped:
print(reverse_complement(read_seq), file=read_file)
else:
print(read_seq, file=read_file)
print("{}\t{}\t{}\t{}\t255\t{}\t{}\t{}\t0\t{}\t*\tXM:i:{}\tNM:i:{}\tMD:Z:{}{}{}{}".format(cur_read_id, flag, chr, pos + 1, cigar_str, chr, pos2 + 1, read_seq, XM, NM, MD, Zs, XS, TI), file=sam_file)
if paired_end:
print(">{}".format(cur_read_id), file=read2_file)
if swapped:
print(read2_seq, file=read2_file)
else:
print(reverse_complement(read2_seq), file=read2_file)
print("{}\t{}\t{}\t{}\t255\t{}\t{}\t{}\t0\t{}\t*\tXM:i:{}\tNM:i:{}\tMD:Z:{}{}{}{}".format(cur_read_id, flag2, chr, pos2 + 1, cigar2_str, chr, pos + 1, read2_seq, XM2, NM2, MD2, Zs2, XS, TI), file=sam_file)
cur_read_id += 1
sam_file.close()
read_file.close()
if paired_end:
read2_file.close()
if __name__ == '__main__':
parser = ArgumentParser(
description='Simulate reads from GENOME (fasta) and GTF files')
parser.add_argument('genome_file',
nargs='?',
type=FileType('r'),
help='input GENOME file')
parser.add_argument('gtf_file',
nargs='?',
type=FileType('r'),
help='input GTF file')
parser.add_argument('snp_file',
nargs='?',
type=FileType('r'),
help='input SNP file')
parser.add_argument('base_fname',
nargs='?',
type=str,
help='output base filename')
parser.add_argument('-d', '--dna',
dest='rna',
action='store_false',
default=True,
help='DNA-seq reads (default: RNA-seq reads)')
parser.add_argument('--single-end',
dest='paired_end',
action='store_false',
default=True,
help='single-end reads (default: paired-end reads)')
parser.add_argument('-r', '--read-length',
dest='read_len',
action='store',
type=int,
default=100,
help='read length (default: 100)')
parser.add_argument('-f', '--fragment-length',
dest='frag_len',
action='store',
type=int,
default=250,
help='fragment length (default: 250)')
parser.add_argument('-n', '--num-fragment',
dest='num_frag',
action='store',
type=int,
default=1000000,
help='number of fragments (default: 1000000)')
parser.add_argument('-e', '--expr-profile',
dest='expr_profile',
action='store',
type=str,
default='flux',
help='expression profile: flux or constant (default: flux)')
parser.add_argument('--repeat-info',
dest='repeat_fname',
action='store',
type=str,
default='',
help='repeat information filename')
parser.add_argument('--error-rate',
dest='error_rate',
action='store',
type=float,
default=0.0,
help='per-base sequencing error rate (%%) (default: 0.0)')
parser.add_argument('--max-mismatch',
dest='max_mismatch',
action='store',
type=int,
default=3,
help='max mismatches due to sequencing errors (default: 3)')
parser.add_argument('--random-seed',
dest='random_seed',
action='store',
type=int,
default=0,
help='random seeding value (default: 0)')
parser.add_argument('--snp-prob',
dest='snp_prob',
action='store',
type=float,
default=1.0,
help='probability of a read including a snp when the read spans the snp ranging from 0.0 to 1.0 (default: 1.0)')
parser.add_argument('--sanity-check',
dest='sanity_check',
action='store_true',
help='sanity check')
parser.add_argument('-v', '--verbose',
dest='verbose',
action='store_true',
help='also print some statistics to stderr')
parser.add_argument('--version',
action='version',
version='%(prog)s 2.0.0-alpha')
args = parser.parse_args()
if not args.genome_file or not args.gtf_file or not args.snp_file:
parser.print_help()
exit(1)
if not args.rna:
args.expr_profile = "constant"
simulate_reads(args.genome_file, args.gtf_file, args.snp_file, args.base_fname,
args.rna, args.paired_end, args.read_len, args.frag_len,
args.num_frag, args.expr_profile, args.repeat_fname,
args.error_rate, args.max_mismatch,
args.random_seed, args.snp_prob, args.sanity_check, args.verbose)