hisat-3n/evaluation/tests/repeat/generate_repeats.py

213 lines
5.6 KiB
Python
Raw Normal View History

2025-01-18 13:09:52 +00:00
#!/usr/bin/env python
import sys
import struct
chr_name = "20"
chr_seq = ""
for line in open("%s.fa" % chr_name):
if line.startswith('>'):
continue
line = line.strip()
line = line.replace('N', '')
chr_seq += line
chr_seq += '$'
chr_sa = []
f = open("%s.sa" % chr_name, "rb")
while True:
fourbytes = f.read(4)
if fourbytes == "":
break
num = struct.unpack('I', fourbytes)[0]
chr_sa.append(num)
if len(chr_sa) % 5000000 == 0:
print len(chr_sa)
f.close()
assert chr_sa[0] + 1 == len(chr_sa)
chr_sa = chr_sa[1:]
assert len(chr_seq) == len(chr_sa)
# for i, num in enumerate(chr_sa):
# print "%10d\t%s\t%d" % (i, chr_seq[num:num+100], num)
seq_len = 100
i = 0
repeats = []
while i < len(chr_sa) - 1:
pos = chr_sa[i]
base_seq = chr_seq[pos:pos+seq_len]
for j in xrange(i+1, len(chr_sa)):
pos2 = chr_sa[j]
cmp_seq = chr_seq[pos2:pos2+seq_len]
if base_seq != cmp_seq:
break
if j - i >= 200:
repeats.append([base_seq, sorted(chr_sa[i:j])])
i = j
if i % 5000000 == 0:
print i
found = False
print len(repeats), "repeats"
deleted = set()
for i in xrange(len(repeats) - 1):
for j in xrange(i + 1, len(repeats)):
if j in deleted:
continue
num_close, num_close2 = 0, 0
pos_seq, pos_set = repeats[i]
pos_seq2, pos_set2 = repeats[j]
k1, k2 = 0, 0
while k1 < len(pos_set) and k2 < len(pos_set2):
_pos, _pos2 = pos_set[k1], pos_set2[k2]
if abs(_pos - _pos2) < 300:
num_close += 1
if abs(_pos - _pos2) < 1100:
num_close2 += 1
if _pos <= _pos2:
k1 += 1
else:
k2 += 1
if num_close > min(len(pos_set), len(pos_set2)) * 0.95:
deleted.add(j)
if num_close == 1 and num_close2 < 5:
found = True
print pos_set
print pos_set2
print pos_seq
print pos_seq2
file1 = open("1.fa", "w")
file2 = open("2.fa", "w")
pos_seq2_rc = list(pos_seq2)
pos_seq2_rc = pos_seq2_rc[::-1]
for k in xrange(seq_len):
nt = pos_seq2_rc[k]
if nt == 'A':
nt = 'T'
elif nt == 'C':
nt = 'G'
elif nt == 'G':
nt = 'C'
else:
assert nt == 'T'
nt = 'A'
pos_seq2_rc[k] = nt
pos_seq2_rc = ''.join(pos_seq2_rc)
for k in xrange(1000000):
print >> file1, ">%d" % k
print >> file2, ">%d" % k
print >> file1, pos_seq
print >> file2, pos_seq2_rc
file1.close()
file2.close()
break
if found:
break
print i
chr_seq = ""
for line in open("%s.fa" % chr_name):
if line.startswith('>'):
continue
line = line.strip()
chr_seq += line
N_ranges = []
prev_nt = None
for i in xrange(len(chr_seq)):
nt = chr_seq[i]
if nt == 'N':
if prev_nt != 'N':
N_ranges.append([i, i]) # inclusive
else:
assert len(N_ranges) > 0
N_ranges[-1][1] = i
prev_nt = nt
to_joined_list = []
for N_start, N_end in N_ranges:
if len(to_joined_list) == 0:
if N_start > 0:
to_joined_list.append([0, 0])
else:
to_joined_list.append([N_end + 1, 0])
else:
N_size = N_end - N_start + 1
to = N_end + 1 - to_joined_list[-1][0] + to_joined_list[-1][1] - N_size
assert to > to_joined_list[-1][1]
to_joined_list.append([N_end + 1, to])
to_genome_list = [[y, x] for x, y in to_joined_list]
N_ranges_tmp = []
for i in xrange(len(to_genome_list)):
to_genome = to_genome_list[i]
if i == 0:
if to_genome[1] > 0:
N_ranges_tmp.append([0, to_genome[1] - 1])
else:
to_genome_before = to_genome_list[i-1]
N_ranges_tmp.append([to_genome_before[1] + to_genome[0] - to_genome_before[0], to_genome[1] - 1])
assert N_ranges == N_ranges_tmp
file = open("%s_rep.info" % chr_name, "w")
def print_rep_info(rep_name, rep_pos, rep_len, pos_set, pos_seq):
print >> file, ">%s*0\trep\t%d\t%d\t%d\t0" % (rep_name, rep_pos, rep_len, len(pos_set))
for i in xrange(0, len(pos_set), 10):
output = ""
for j in range(i, i + 10):
if j >= len(pos_set):
break
if j > i:
output += " "
def convert(pos):
for i in xrange(len(to_genome_list)):
if i + 1 == len(to_genome_list) or (pos >= to_genome_list[i][0] and pos < to_genome_list[i+1][0]):
return pos - to_genome_list[i][0] + to_genome_list[i][1]
assert False
pos = convert(pos_set[j])
assert chr_seq[pos:pos+seq_len] == pos_seq
output += ("%s:%d:+" % (chr_name, pos))
print >> file, output
print_rep_info("rep1", 0, seq_len, pos_set, pos_seq)
print_rep_info("rep2", seq_len, seq_len, pos_set2, pos_seq2)
file.close()
chr_seq = chr_seq.replace(pos_seq, 'N' * seq_len)
chr_seq = chr_seq.replace(pos_seq2, 'N' * seq_len)
file = open("%s_mask.fa" % chr_name, "w")
print >> file, ">%s_mask" % chr_name
for i in xrange(0, len(chr_seq), 60):
print >> file, chr_seq[i:i+60]
file.close()
file = open("%s_rep.fa" % chr_name, "w")
rep_seq = pos_seq + pos_seq2
print >> file, ">rep"
for i in xrange(0, len(rep_seq), 60):
print >> file, rep_seq[i:i+60]
file.close()