213 lines
5.6 KiB
Python
213 lines
5.6 KiB
Python
|
#!/usr/bin/env python
|
||
|
|
||
|
import sys
|
||
|
import struct
|
||
|
|
||
|
chr_name = "20"
|
||
|
|
||
|
chr_seq = ""
|
||
|
for line in open("%s.fa" % chr_name):
|
||
|
if line.startswith('>'):
|
||
|
continue
|
||
|
line = line.strip()
|
||
|
line = line.replace('N', '')
|
||
|
chr_seq += line
|
||
|
chr_seq += '$'
|
||
|
|
||
|
chr_sa = []
|
||
|
f = open("%s.sa" % chr_name, "rb")
|
||
|
while True:
|
||
|
fourbytes = f.read(4)
|
||
|
if fourbytes == "":
|
||
|
break
|
||
|
|
||
|
num = struct.unpack('I', fourbytes)[0]
|
||
|
chr_sa.append(num)
|
||
|
|
||
|
if len(chr_sa) % 5000000 == 0:
|
||
|
print len(chr_sa)
|
||
|
f.close()
|
||
|
|
||
|
assert chr_sa[0] + 1 == len(chr_sa)
|
||
|
chr_sa = chr_sa[1:]
|
||
|
assert len(chr_seq) == len(chr_sa)
|
||
|
|
||
|
# for i, num in enumerate(chr_sa):
|
||
|
# print "%10d\t%s\t%d" % (i, chr_seq[num:num+100], num)
|
||
|
|
||
|
seq_len = 100
|
||
|
i = 0
|
||
|
repeats = []
|
||
|
while i < len(chr_sa) - 1:
|
||
|
pos = chr_sa[i]
|
||
|
base_seq = chr_seq[pos:pos+seq_len]
|
||
|
for j in xrange(i+1, len(chr_sa)):
|
||
|
pos2 = chr_sa[j]
|
||
|
cmp_seq = chr_seq[pos2:pos2+seq_len]
|
||
|
if base_seq != cmp_seq:
|
||
|
break
|
||
|
|
||
|
if j - i >= 200:
|
||
|
repeats.append([base_seq, sorted(chr_sa[i:j])])
|
||
|
|
||
|
i = j
|
||
|
|
||
|
if i % 5000000 == 0:
|
||
|
print i
|
||
|
|
||
|
found = False
|
||
|
print len(repeats), "repeats"
|
||
|
deleted = set()
|
||
|
for i in xrange(len(repeats) - 1):
|
||
|
for j in xrange(i + 1, len(repeats)):
|
||
|
if j in deleted:
|
||
|
continue
|
||
|
|
||
|
num_close, num_close2 = 0, 0
|
||
|
pos_seq, pos_set = repeats[i]
|
||
|
pos_seq2, pos_set2 = repeats[j]
|
||
|
|
||
|
k1, k2 = 0, 0
|
||
|
while k1 < len(pos_set) and k2 < len(pos_set2):
|
||
|
_pos, _pos2 = pos_set[k1], pos_set2[k2]
|
||
|
if abs(_pos - _pos2) < 300:
|
||
|
num_close += 1
|
||
|
if abs(_pos - _pos2) < 1100:
|
||
|
num_close2 += 1
|
||
|
if _pos <= _pos2:
|
||
|
k1 += 1
|
||
|
else:
|
||
|
k2 += 1
|
||
|
|
||
|
if num_close > min(len(pos_set), len(pos_set2)) * 0.95:
|
||
|
deleted.add(j)
|
||
|
|
||
|
if num_close == 1 and num_close2 < 5:
|
||
|
found = True
|
||
|
print pos_set
|
||
|
print pos_set2
|
||
|
print pos_seq
|
||
|
print pos_seq2
|
||
|
|
||
|
file1 = open("1.fa", "w")
|
||
|
file2 = open("2.fa", "w")
|
||
|
|
||
|
pos_seq2_rc = list(pos_seq2)
|
||
|
pos_seq2_rc = pos_seq2_rc[::-1]
|
||
|
for k in xrange(seq_len):
|
||
|
nt = pos_seq2_rc[k]
|
||
|
if nt == 'A':
|
||
|
nt = 'T'
|
||
|
elif nt == 'C':
|
||
|
nt = 'G'
|
||
|
elif nt == 'G':
|
||
|
nt = 'C'
|
||
|
else:
|
||
|
assert nt == 'T'
|
||
|
nt = 'A'
|
||
|
pos_seq2_rc[k] = nt
|
||
|
pos_seq2_rc = ''.join(pos_seq2_rc)
|
||
|
|
||
|
for k in xrange(1000000):
|
||
|
print >> file1, ">%d" % k
|
||
|
print >> file2, ">%d" % k
|
||
|
print >> file1, pos_seq
|
||
|
print >> file2, pos_seq2_rc
|
||
|
|
||
|
file1.close()
|
||
|
file2.close()
|
||
|
|
||
|
break
|
||
|
if found:
|
||
|
break
|
||
|
|
||
|
print i
|
||
|
|
||
|
chr_seq = ""
|
||
|
for line in open("%s.fa" % chr_name):
|
||
|
if line.startswith('>'):
|
||
|
continue
|
||
|
line = line.strip()
|
||
|
chr_seq += line
|
||
|
|
||
|
N_ranges = []
|
||
|
prev_nt = None
|
||
|
for i in xrange(len(chr_seq)):
|
||
|
nt = chr_seq[i]
|
||
|
if nt == 'N':
|
||
|
if prev_nt != 'N':
|
||
|
N_ranges.append([i, i]) # inclusive
|
||
|
else:
|
||
|
assert len(N_ranges) > 0
|
||
|
N_ranges[-1][1] = i
|
||
|
prev_nt = nt
|
||
|
|
||
|
to_joined_list = []
|
||
|
for N_start, N_end in N_ranges:
|
||
|
if len(to_joined_list) == 0:
|
||
|
if N_start > 0:
|
||
|
to_joined_list.append([0, 0])
|
||
|
else:
|
||
|
to_joined_list.append([N_end + 1, 0])
|
||
|
else:
|
||
|
N_size = N_end - N_start + 1
|
||
|
to = N_end + 1 - to_joined_list[-1][0] + to_joined_list[-1][1] - N_size
|
||
|
assert to > to_joined_list[-1][1]
|
||
|
to_joined_list.append([N_end + 1, to])
|
||
|
|
||
|
to_genome_list = [[y, x] for x, y in to_joined_list]
|
||
|
|
||
|
N_ranges_tmp = []
|
||
|
for i in xrange(len(to_genome_list)):
|
||
|
to_genome = to_genome_list[i]
|
||
|
if i == 0:
|
||
|
if to_genome[1] > 0:
|
||
|
N_ranges_tmp.append([0, to_genome[1] - 1])
|
||
|
else:
|
||
|
to_genome_before = to_genome_list[i-1]
|
||
|
N_ranges_tmp.append([to_genome_before[1] + to_genome[0] - to_genome_before[0], to_genome[1] - 1])
|
||
|
|
||
|
assert N_ranges == N_ranges_tmp
|
||
|
|
||
|
file = open("%s_rep.info" % chr_name, "w")
|
||
|
def print_rep_info(rep_name, rep_pos, rep_len, pos_set, pos_seq):
|
||
|
print >> file, ">%s*0\trep\t%d\t%d\t%d\t0" % (rep_name, rep_pos, rep_len, len(pos_set))
|
||
|
for i in xrange(0, len(pos_set), 10):
|
||
|
output = ""
|
||
|
for j in range(i, i + 10):
|
||
|
if j >= len(pos_set):
|
||
|
break
|
||
|
if j > i:
|
||
|
output += " "
|
||
|
|
||
|
def convert(pos):
|
||
|
for i in xrange(len(to_genome_list)):
|
||
|
if i + 1 == len(to_genome_list) or (pos >= to_genome_list[i][0] and pos < to_genome_list[i+1][0]):
|
||
|
return pos - to_genome_list[i][0] + to_genome_list[i][1]
|
||
|
|
||
|
assert False
|
||
|
|
||
|
pos = convert(pos_set[j])
|
||
|
assert chr_seq[pos:pos+seq_len] == pos_seq
|
||
|
output += ("%s:%d:+" % (chr_name, pos))
|
||
|
print >> file, output
|
||
|
print_rep_info("rep1", 0, seq_len, pos_set, pos_seq)
|
||
|
print_rep_info("rep2", seq_len, seq_len, pos_set2, pos_seq2)
|
||
|
file.close()
|
||
|
|
||
|
chr_seq = chr_seq.replace(pos_seq, 'N' * seq_len)
|
||
|
chr_seq = chr_seq.replace(pos_seq2, 'N' * seq_len)
|
||
|
file = open("%s_mask.fa" % chr_name, "w")
|
||
|
print >> file, ">%s_mask" % chr_name
|
||
|
for i in xrange(0, len(chr_seq), 60):
|
||
|
print >> file, chr_seq[i:i+60]
|
||
|
file.close()
|
||
|
|
||
|
file = open("%s_rep.fa" % chr_name, "w")
|
||
|
rep_seq = pos_seq + pos_seq2
|
||
|
print >> file, ">rep"
|
||
|
for i in xrange(0, len(rep_seq), 60):
|
||
|
print >> file, rep_seq[i:i+60]
|
||
|
file.close()
|
||
|
|