hisat-3n/evaluation/tests/one_snp_test/simulate_one_snp_reads.py
2025-01-18 21:09:52 +08:00

393 lines
14 KiB
Python

#!/usr/bin/env python
#
# Copyright 2015, Daehwan Kim <infphilo@gmail.com>
#
# This file is part of HISAT 2.
#
# HISAT 2 is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# HISAT 2 is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with HISAT 2. If not, see <http://www.gnu.org/licenses/>.
#
import sys, math, random, re
from collections import defaultdict, Counter
from argparse import ArgumentParser, FileType
"""
"""
def reverse_complement(seq):
result = ""
for nt in seq:
base = nt
if nt == 'A':
base = 'T'
elif nt == 'a':
base = 't'
elif nt == 'C':
base = 'G'
elif nt == 'c':
base = 'g'
elif nt == 'G':
base = 'C'
elif nt == 'g':
base = 'c'
elif nt == 'T':
base = 'A'
elif nt == 't':
base = 'a'
result = base + result
return result
"""
"""
def read_genome(genome_file):
chr_dic = {}
chr_name, sequence = "", ""
for line in genome_file:
if line[0] == ">":
if chr_name and sequence:
chr_dic[chr_name] = sequence
chr_name = line.strip().split()[0][1:]
sequence = ""
else:
sequence += line[:-1]
if chr_name and sequence:
chr_dic[chr_name] = sequence
return chr_dic
"""
"""
def read_snp(snp_file):
snps = defaultdict(list)
for line in snp_file:
line = line.strip()
if not line or line.startswith('#'):
continue
try:
snpID, type, chr, pos, data = line.split('\t')
chr = chr.split()[0]
except ValueError:
continue
assert type in ["single", "deletion", "insertion"]
if type == "deletion":
data = int(data)
snps[chr].append([snpID, type, int(pos), data])
return snps
"""
"""
def getSamAlignment(chr_seq, read_len, snp):
snp_id, snp_type, snp_pos, snp_data = snp
# Define MD, XM, NM, Zs, read_seq
read_seq, MD, Zs = "", "", ""
left_read_len = read_len / 2
pos = snp_pos - left_read_len
assert pos >= 0
if snp_type == "single":
cigar_str = "100M"
read_seq = chr_seq[pos:pos+left_read_len] + snp_data + chr_seq[pos+left_read_len+1:pos+read_len]
MD = "%d%s%d" % (left_read_len, chr_seq[pos+left_read_len], read_len - left_read_len - 1)
Zs = "%d|S|%s" % (left_read_len, snp_id)
elif snp_type == "deletion":
del_len = int(snp_data)
cigar_str = "%dM%dD%dM" % (left_read_len, del_len, read_len - left_read_len)
read_seq = chr_seq[pos:pos+left_read_len] + chr_seq[pos+left_read_len+del_len:pos+read_len+del_len]
MD = "%d^%s%d" % (left_read_len, chr_seq[pos+left_read_len:pos+left_read_len+del_len], read_len - left_read_len)
Zs = "%d|D|%s" % (left_read_len, snp_id)
else:
assert snp_type == "insertion"
ins_len = len(snp_data)
assert ins_len < read_len
cigar_str = "%dM%dI%dM" % (left_read_len, ins_len, read_len - left_read_len - ins_len)
read_seq = chr_seq[pos:pos+left_read_len] + snp_data + chr_seq[pos+left_read_len:pos+read_len-ins_len]
MD = "%d" % (read_len - ins_len)
Zs = "%d|I|%s" % (left_read_len, snp_id)
if len(read_seq) != read_len:
print >> sys.stderr, "read length differs:", len(read_seq), "vs.", read_len
print >> sys.stderr, pos, cigar_str, MD, Zs
assert False
ref_read_seq = chr_seq[pos:pos+read_len]
return pos, cigar_str, MD, Zs, read_seq, ref_read_seq
"""
"""
cigar_re = re.compile('\d+\w')
def samRepOk(genome_seq, read_seq, chr, pos, cigar, MD, Zs):
assert chr in genome_seq
chr_seq = genome_seq[chr]
assert pos < len(chr_seq)
# Calculate XM and NM based on Cigar and Zs
cigars = cigar_re.findall(cigar)
cigars = [[int(cigars[i][:-1]), cigars[i][-1]] for i in range(len(cigars))]
ref_pos, read_pos = pos, 0
ann_ref_seq, ann_ref_rel, ann_read_seq, ann_read_rel = [], [], [], []
for i in range(len(cigars)):
cigar_len, cigar_op = cigars[i]
if cigar_op == "M":
partial_ref_seq = chr_seq[ref_pos:ref_pos+cigar_len]
partial_read_seq = read_seq[read_pos:read_pos+cigar_len]
assert len(partial_ref_seq) == len(partial_read_seq)
ann_ref_seq += list(partial_ref_seq)
ann_read_seq += list(partial_read_seq)
for j in range(len(partial_ref_seq)):
if partial_ref_seq[j] == partial_read_seq[j]:
ann_ref_rel.append("=")
ann_read_rel.append("=")
else:
ann_ref_rel.append("X")
ann_read_rel.append("X")
ref_pos += cigar_len
read_pos += cigar_len
elif cigar_op == "D":
partial_ref_seq = chr_seq[ref_pos:ref_pos+cigar_len]
ann_ref_rel += list(partial_ref_seq)
ann_ref_seq += list(partial_ref_seq)
ann_read_rel += (["-"] * cigar_len)
ann_read_seq += (["-"] * cigar_len)
ref_pos += cigar_len
elif cigar_op == "I":
partial_read_seq = read_seq[read_pos:read_pos+cigar_len]
ann_ref_rel += (["-"] * cigar_len)
ann_ref_seq += (["-"] * cigar_len)
ann_read_rel += list(partial_read_seq)
ann_read_seq += list(partial_read_seq)
read_pos += cigar_len
elif cigar_op == "N":
ref_pos += cigar_len
else:
assert False
assert len(ann_ref_seq) == len(ann_read_seq)
assert len(ann_ref_seq) == len(ann_ref_rel)
assert len(ann_ref_seq) == len(ann_read_rel)
ann_Zs_seq = ["0" for i in range(len(ann_ref_seq))]
Zss, Zs_i, snp_pos_add = [], 0, 0
if Zs != "":
Zss = Zs.split(',')
Zss = [zs.split('|') for zs in Zss]
ann_read_pos = 0
for zs in Zss:
zs_pos, zs_type, zs_id = zs
zs_pos = int(zs_pos)
for i in range(zs_pos):
while ann_read_rel[ann_read_pos] == '-':
ann_read_pos += 1
ann_read_pos += 1
if zs_type == "S":
ann_Zs_seq[ann_read_pos] = "1"
ann_read_pos += 1
elif zs_type == "D":
while ann_read_rel[ann_read_pos] == '-':
ann_Zs_seq[ann_read_pos] = "1"
ann_read_pos += 1
elif zs_type == "I":
while ann_ref_rel[ann_read_pos] == '-':
ann_Zs_seq[ann_read_pos] = "1"
ann_read_pos += 1
else:
assert False
tMD, tXM, tNM = "", 0, 0
match_len = 0
i = 0
while i < len(ann_ref_seq):
if ann_ref_rel[i] == "=":
assert ann_read_rel[i] == "="
match_len += 1
i += 1
continue
assert ann_read_rel[i] != "="
if ann_ref_rel[i] == "X" and ann_read_rel[i] == "X":
if match_len > 0:
tMD += ("{}".format(match_len))
match_len = 0
tMD += ann_ref_seq[i]
if ann_Zs_seq[i] == "0":
tXM += 1
tNM += 1
i += 1
else:
assert ann_ref_rel[i] == "-" or ann_read_rel[i] == "-"
if ann_ref_rel[i] == '-':
while ann_ref_rel[i] == '-':
if ann_Zs_seq[i] == "0":
tNM += 1
i += 1
else:
assert ann_read_rel[i] == '-'
del_seq = ""
while ann_read_rel[i] == '-':
del_seq += ann_ref_seq[i]
if ann_Zs_seq[i] == "0":
tNM += 1
i += 1
if match_len > 0:
tMD += ("{}".format(match_len))
match_len = 0
tMD += ("^{}".format(del_seq))
if match_len > 0:
tMD += ("{}".format(match_len))
if tMD != MD:
print >> sys.stderr, chr, pos, cigar, MD, Zs
print >> sys.stderr, tMD
assert False
"""
"""
def simulate_reads(genome_file, snp_file, base_fname, \
paired_end, read_len, frag_len, \
num_frag, sanity_check, verbose):
if read_len > frag_len:
frag_len = read_len
genome_seq = read_genome(genome_file)
snps = read_snp(snp_file)
chr_ids = genome_seq.keys()
sam_file = open(base_fname + ".sam", "w")
# Write SAM header
print >> sam_file, "@HD\tVN:1.0\tSO:unsorted"
for chr in genome_seq.keys():
print >> sam_file, "@SQ\tSN:%s\tLN:%d" % (chr, len(genome_seq[chr]))
read_file = open(base_fname + "_snp_1.fa", "w")
ref_read_file = open(base_fname + "_ref_1.fa", "w")
if paired_end:
read2_file = open(base_fname + "_snp_2.fa", "w")
ref_read2_file = open(base_fname + "_ref_2.fa", "w")
cur_read_id = 1
for chr in snps:
assert chr in genome_seq
chr_seq = genome_seq[chr]
chr_len = len(chr_seq)
if chr in snps:
chr_snps = snps[chr]
else:
chr_snps = []
for snp in chr_snps:
# SAM specification (v1.4)
# http://samtools.sourceforge.net/
flag, flag2 = 99, 163 # 83, 147
pos, cigar_str, MD, Zs, read_seq, ref_read_seq = getSamAlignment(chr_seq, read_len, snp)
# pos2, cigar2_str, MD2, Zs2, read2_seq = getSamAlignment(chr_seq, frag_pos+frag_len-read_len, read_len, snp)
if sanity_check:
samRepOk(genome_seq, read_seq, chr, pos, cigar_str, MD, Zs)
# samRepOk(genome_seq, read2_seq, chr, pos2, cigar2_str, MD2, Zs2)
if Zs != "":
Zs = ("\tZs:Z:{}".format(Zs))
# if Zs2 != "":
# Zs2 = ("\tZs:Z:{}".format(Zs2))
read_id_str = "{}_{}_{}_{}".format(cur_read_id, chr, pos, cigar_str)
print >> read_file, ">{}".format(read_id_str)
print >> read_file, read_seq
print >> sam_file, "{}\t{}\t{}\t{}\t255\t{}\t{}\t{}\t0\t{}\t*\tXM:i:0\tNM:i:0\tMD:Z:{}{}".format(read_id_str, flag, chr, pos + 1, cigar_str, chr, pos + 1, read_seq, MD, Zs)
print >> ref_read_file, ">{}_{}_{}_100M".format(cur_read_id, chr, pos)
print >> ref_read_file, ref_read_seq
"""
if paired_end:
print >> read2_file, ">{}".format(cur_read_id)
print >> read2_file, reverse_complement(read2_seq)
print >> sam_file, "{}\t{}\t{}\t{}\t255\t{}\t{}\t{}\t0\t{}\t*\tXM:i:0\tNM:i:0\tMD:Z:{}{}".format(cur_read_id, flag2, chr, pos2 + 1, cigar2_str, chr, pos + 1, read2_seq, MD2, Zs2)
"""
cur_read_id += 1
sam_file.close()
read_file.close()
ref_read_file.close()
if paired_end:
read2_file.close()
ref_read2_file.close()
if __name__ == '__main__':
parser = ArgumentParser(
description='Simulate reads from GENOME (fasta) and GTF files')
parser.add_argument('genome_file',
nargs='?',
type=FileType('r'),
help='input GENOME file')
parser.add_argument('snp_file',
nargs='?',
type=FileType('r'),
help='input SNP file')
parser.add_argument('base_fname',
nargs='?',
type=str,
help='output base filename')
parser.add_argument('--paired-end',
dest='paired_end',
action='store_true',
help='single-end reads (default: paired-end reads)')
parser.add_argument('-r', '--read-length',
dest='read_len',
action='store',
type=int,
default=100,
help='read length (default: 100)')
parser.add_argument('-f', '--fragment-length',
dest='frag_len',
action='store',
type=int,
default=250,
help='fragment length (default: 250)')
parser.add_argument('-n', '--num-fragment',
dest='num_frag',
action='store',
type=int,
default=1000000,
help='number of fragments (default: 1000000)')
parser.add_argument('--sanity-check',
dest='sanity_check',
action='store_true',
help='sanity check')
parser.add_argument('-v', '--verbose',
dest='verbose',
action='store_true',
help='also print some statistics to stderr')
parser.add_argument('--version',
action='version',
version='%(prog)s 2.1.0')
args = parser.parse_args()
if not args.genome_file or not args.snp_file:
parser.print_help()
exit(1)
simulate_reads(args.genome_file, args.snp_file, args.base_fname, \
args.paired_end, args.read_len, args.frag_len, \
args.num_frag, args.sanity_check, args.verbose)