163 lines
5.2 KiB
Python
163 lines
5.2 KiB
Python
#!/usr/bin/env python
|
|
|
|
import sys, os, random
|
|
from argparse import ArgumentParser, FileType
|
|
from multiprocessing import Process
|
|
|
|
def shuffle_reads(read_fname, random_list):
    """Write the FASTA records of read_fname, reordered, to read_fname + ".shuffle".

    A record is a header line starting with ">" together with every line
    that follows it up to the next header.  Output record k is input record
    random_list[k], so passing the same list for both mate files keeps the
    read pairs in step.

    Args:
        read_fname: path of the FASTA file to read.
        random_list: record indices giving the output order; it must hold
            exactly one entry per record found in the file.

    Raises:
        ValueError: if data precedes the first ">" header, or if
            len(random_list) differs from the number of records.
    """
    reads = []
    with open(read_fname) as read_file:
        for line in read_file:
            if line.startswith(">"):
                reads.append([])
            elif not reads:
                raise ValueError(
                    "%s: data found before the first '>' header" % read_fname)
            # Strip only the line terminator: slicing off the last character
            # would eat a base when the final line has no trailing newline.
            reads[-1].append(line.rstrip("\n"))

    # Validate before the output file is created so that a mismatch does not
    # leave an empty ".shuffle" file behind.
    if len(random_list) != len(reads):
        raise ValueError(
            "%s: expected %d reads but found %d"
            % (read_fname, len(random_list), len(reads)))

    read_fname_out = read_fname + ".shuffle"
    with open(read_fname_out, "w") as read_file_out:
        # Each entry is itself the index of the record to emit; looking it up
        # in random_list a second time would apply the permutation twice.
        for read_index in random_list:
            read_file_out.write("\n".join(reads[read_index]) + "\n")
|
|
|
|
|
|
def shuffle_pairs(read1_fname, read2_fname):
    """Shuffle two paired FASTA files using one shared random order.

    The number of records is taken from the ">" header lines of read1_fname.
    A random permutation of that many record indices is drawn from the
    module-level ``random`` state (seed it beforehand for reproducible
    output) and handed to shuffle_reads for both files, so that mates stay
    at matching positions.  The results are written by shuffle_reads to
    read1_fname + ".shuffle" and read2_fname + ".shuffle".

    Args:
        read1_fname: path of the first-mate FASTA file.
        read2_fname: path of the second-mate FASTA file.
    """
    with open(read1_fname) as read1_file:
        num_reads = sum(1 for line in read1_file if line.startswith(">"))

    random_list = list(range(num_reads))
    random.shuffle(random_list)

    shuffle_reads(read1_fname, random_list)
    shuffle_reads(read2_fname, random_list)
|
|
|
|
|
|
def _generate_reads(cmd, workdir):
    """Run one simulation command inside workdir, then shuffle its read pairs.

    Kept at module level (rather than nested in simulate_reads) so that
    multiprocessing can pickle it as a Process target under the "spawn" and
    "forkserver" start methods.

    Args:
        cmd: shell command line to run; it is expected to leave sim_1.fa and
            sim_2.fa in the working directory.
        workdir: directory to change into before running the command.
    """
    os.chdir(workdir)
    sys.stderr.write(cmd + "\n")
    status = os.system(cmd)
    if status != 0:
        # Without usable simulator output there is nothing to shuffle.
        sys.stderr.write(
            "Error: command failed with status %d: %s\n" % (status, cmd))
        return

    # Fixed seed: every dataset is shuffled reproducibly.
    random.seed(0)
    sys.stderr.write("shuffle reads sim_1.fa and sim_2.fa\n")
    shuffle_pairs("sim_1.fa", "sim_2.fa")
    os.rename("sim_1.fa.shuffle", "sim_1.fa")
    os.rename("sim_2.fa.shuffle", "sim_2.fa")


def _dataset_dirname(genome, numreads, rna, snp, mismatch, constant):
    """Return the directory name that encodes one dataset's settings."""
    molecule = "RNA" if rna else "DNA"
    # Floor division yields an integer count on both Python 2 and Python 3.
    if numreads >= 1000000:
        parts = ["%dM" % (numreads // 1000000), molecule]
    else:
        parts = ["%dk" % (numreads // 1000), molecule]
    if mismatch:
        parts.append("mismatch")
    if snp:
        parts.append("snp")
    # The "constant" flag only takes effect for RNA datasets.
    if rna and constant:
        parts.append("constant")
    parts.extend(["reads", genome])
    return "_".join(parts)


def _simulation_command(data_dir_base, genome, numreads, rna, snp, mismatch,
                        constant):
    """Build the hisat2_simulate_reads.py command line for one dataset."""
    genome_fname = data_dir_base + "/%s.fa" % (genome)
    # /dev/null stands in for an annotation or SNP file that is not used.
    if rna:
        gtf_fname = data_dir_base + "/%s.gtf" % (genome)
    else:
        gtf_fname = "/dev/null"
    if snp:
        snp_fname = data_dir_base + "/%s.snp" % (genome)
    else:
        snp_fname = "/dev/null"

    options = []
    if not rna:
        options.append("--dna ")
    if mismatch:
        options.append("--error-rate 0.2 ")
    if rna and constant:
        options.append("--expr-profile constant ")
    cmd_add = "".join(options)

    return "../../../aligners/bin/hisat2_simulate_reads.py --sanity-check %s --num-fragment %d %s %s %s sim" % \
        (cmd_add, numreads, genome_fname, gtf_fname, snp_fname)


def simulate_reads():
    """Simulate every configured dataset under reads/simulation/.

    For each dataset a directory named after its settings is created and a
    separate process runs the simulator there and shuffles the resulting
    sim_1.fa / sim_2.fa pair.  Datasets whose directory already exists are
    skipped.  All processes are started before any of them is waited for.

    Side effects: creates "reads/simulation" below the current directory if
    needed and changes the working directory; on return the process is left
    in the "reads" directory.
    """
    for subdir in ("reads", "simulation"):
        if not os.path.exists(subdir):
            os.mkdir(subdir)
        os.chdir(subdir)

    _rna, _mismatch, _snp, _constant = True, True, True, True
    _dna = not _rna
    # Columns: genome name, number of fragments, RNA?, SNPs?, mismatches?,
    # constant expression profile?
    datasets = [
        ["22", 1000000, _dna, not _snp, not _mismatch, _constant],
        ["22", 1000000, _dna, not _snp, _mismatch, _constant],
        ["22", 1000000, _dna, _snp, not _mismatch, _constant],
        ["22", 1000000, _dna, _snp, _mismatch, _constant],
        ["22", 1000000, _rna, not _snp, not _mismatch, not _constant],
        ["22", 1000000, _rna, not _snp, not _mismatch, _constant],
        ["22", 1000000, _rna, not _snp, _mismatch, not _constant],
        ["22", 1000000, _rna, not _snp, _mismatch, _constant],
        ["22", 1000000, _rna, _snp, not _mismatch, not _constant],
        ["22", 1000000, _rna, _snp, not _mismatch, _constant],
        ["22", 1000000, _rna, _snp, _mismatch, not _constant],
        ["22", 1000000, _rna, _snp, _mismatch, _constant],
        # ["22_20-21M", 1000000, _rna, not _snp, not _mismatch, not _constant],
        # ["22_20-21M", 1000000, _rna, _snp, not _mismatch, _constant],
        ["genome", 10000000, _dna, not _snp, not _mismatch, _constant],
        ["genome", 10000000, _dna, _snp, not _mismatch, _constant],
        ["genome", 10000000, _dna, _snp, _mismatch, _constant],
        ["genome", 10000000, _rna, not _snp, not _mismatch, not _constant],
        ["genome", 10000000, _rna, _snp, not _mismatch, not _constant],
        ["genome", 10000000, _rna, _snp, _mismatch, not _constant],
    ]

    # Resolved from inside each dataset directory (reads/simulation/<name>).
    data_dir_base = "../../../data"

    pid_list = []

    for genome, numreads, rna, snp, mismatch, constant in datasets:
        dirname = _dataset_dirname(genome, numreads, rna, snp, mismatch,
                                   constant)
        if os.path.exists(dirname):
            continue
        os.mkdir(dirname)

        cmd = _simulation_command(data_dir_base, genome, numreads, rna, snp,
                                  mismatch, constant)
        # The worker changes into the dataset directory itself, so this
        # process can stay in reads/simulation while launching the rest.
        p = Process(target=_generate_reads,
                    args=(cmd, os.path.abspath(dirname)))
        p.start()
        pid_list.append(p)

    os.chdir("..")

    # Wait for every simulation to finish.
    for p in pid_list:
        p.join()
|
|
|
|
|
|
if __name__ == "__main__":
    # No script-specific options are declared; parsing still provides
    # -h/--help and rejects unexpected command-line arguments before any
    # work starts.
    parser = ArgumentParser(description='Generate reads using simulate_reads.py in HISAT2')
    args = parser.parse_args()
    simulate_reads()
|