hisat-3n/hisat2_simulate_reads.py

#!/usr/bin/env python3
#
# Copyright 2015, Daehwan Kim <infphilo@gmail.com>
#
# This file is part of HISAT 2.
#
# HISAT 2 is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# HISAT 2 is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with HISAT 2.  If not, see <http://www.gnu.org/licenses/>.
#

import os, sys, math, random, re
from collections import defaultdict, Counter
from argparse import ArgumentParser, FileType


"""
"""
def reverse_complement(seq):
    result = ""
    for nt in seq:
        base = nt
        if nt == 'A':
            base = 'T'
        elif nt == 'a':
            base = 't'
        elif nt == 'C':
            base = 'G'
        elif nt == 'c':
            base = 'g'
        elif nt == 'G':
            base = 'C'
        elif nt == 'g':
            base = 'c'
        elif nt == 'T':
            base = 'A'
        elif nt == 't':
            base = 'a'
        
        result = base + result
    
    return result


"""
python2 style randint
"""
def myrandint(m, x):
    s = x - m + 1
    return m + int(random.random() * s)

"""
Random source for sequencing errors
"""
class ErrRandomSource:
    def __init__(self, prob = 0.0, size = 1 << 20):
        self.size = size
        self.rands = []
        for i in range(self.size):
            if random.random() < prob:
                self.rands.append(1)
            else:
                self.rands.append(0)
        self.cur = 0
        
    def getRand(self):
        assert self.cur < len(self.rands)
        rand = self.rands[self.cur]
        self.cur = (self.cur + 1) % len(self.rands)
        return rand


"""
"""
def read_genome(genome_file):
    chr_dic = {}
    
    chr_name, sequence = "", ""
    for line in genome_file:
        if line[0] == ">":
            if chr_name and sequence:
                chr_dic[chr_name] = sequence
            chr_name = line.strip().split()[0][1:]
            sequence = ""
        else:
            sequence += line[:-1]

    if chr_name and sequence:
        chr_dic[chr_name] = sequence


    chr_filter = [str(x) for x in list(range(1, 23)) + ['X', 'Y']]
    #chr_filter = None

    if chr_filter:
        for chr_id, chr_seq in chr_dic.items():
            if not chr_id in chr_filter: 
                chr_dic.pop(chr_id, None)
    
    return chr_dic


"""
"""
def read_transcript(genome_seq, gtf_file, frag_len):
    genes = defaultdict(list)
    transcripts = {}

    # Parse valid exon lines from the GTF file into a dict by transcript_id
    for line in gtf_file:
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        if '#' in line:
            line = line.split('#')[0].strip()
        try:
            chrom, source, feature, left, right, score, \
                strand, frame, values = line.split('\t')
        except ValueError:
            continue
        if chrom not in genome_seq:
            continue
        
        # Zero-based offset
        left, right = int(left) - 1, int(right) - 1
        if feature != 'exon' or left >= right:
            continue

        values_dict = {}
        for attr in values.split(';')[:-1]:
            attr, _, val = attr.strip().partition(' ')
            values_dict[attr] = val.strip('"')

        if 'gene_id' not in values_dict or \
                'transcript_id' not in values_dict:
            continue

        transcript_id = values_dict['transcript_id']
        if transcript_id not in transcripts:
            transcripts[transcript_id] = [chrom, strand, [[left, right]]]
            genes[values_dict['gene_id']].append(transcript_id)
        else:
            transcripts[transcript_id][2].append([left, right])

    # Sort exons and merge where separating introns are <=5 bps
    for tran, [chr, strand, exons] in transcripts.items():
            exons.sort()
            tmp_exons = [exons[0]]
            for i in range(1, len(exons)):
                if exons[i][0] - tmp_exons[-1][1] <= 5:
                    tmp_exons[-1][1] = exons[i][1]
                else:
                    tmp_exons.append(exons[i])
            transcripts[tran] = [chr, strand, tmp_exons]

    tmp_transcripts = {}
    for tran, [chr, strand, exons] in transcripts.items():
        exon_lens = [e[1] - e[0] + 1 for e in exons]
        transcript_len = sum(exon_lens)
        if transcript_len >= frag_len:
            tmp_transcripts[tran] = [chr, strand, transcript_len, exons]

    transcripts = tmp_transcripts

    return genes, transcripts
    

"""
"""
def read_snp(snp_file):
    snps = defaultdict(list)
    for line in snp_file:
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        try:
            snpID, type, chr, pos, data = line.split('\t')
        except ValueError:
            continue

        assert type in ["single", "deletion", "insertion"]
        if type == "deletion":
            data = int(data)
        snps[chr].append([snpID, type, int(pos), data])

    return snps


"""
"""
def sanity_check_input(genome_seq, genes, transcripts, snps, frag_len):
    num_canon_ss, num_ss = 0, 0
    for transcript, [chr, strand, transcript_len, exons] in transcripts.items():
        assert transcript_len >= frag_len
        if len(exons) <= 1:
            continue
        if chr not in genome_seq:
            continue
        chr_seq = genome_seq[chr]
        for i in range(len(exons) - 1):
            left1, right1 = exons[i]
            assert left1 < right1
            left2, right2 = exons[i+1]
            assert left2 < right2
            assert left1 < left2 and right1 < right2
            donor = chr_seq[right1+1:right1+3]
            acceptor = chr_seq[left2-2:left2]
            if strand == "-":
                donor, acceptor = reverse_complement(acceptor), reverse_complement(donor)
            if donor == "GT" and acceptor == "AG":
                num_canon_ss += 1
            num_ss += 1

    if num_ss > 0:
        print("GT/AG splice sites: {}/{} ({:.2%})".format(num_canon_ss, num_ss, (float(num_canon_ss) / num_ss)), file=sys.stderr)

    num_alt_single, num_single = 0, 0
    for chr, chr_snps in snps.items():
        if chr not in genome_seq:
            continue
        chr_seq = genome_seq[chr]
        prev_snp = None
        for snp in chr_snps:
            snpID, type, pos, data = snp
            if prev_snp:
                assert prev_snp[2] <= pos
            prev_snp = snp
            if type != "single":
                continue
            assert pos < len(chr_seq)
            if chr_seq[pos] != data:
                num_alt_single += 1
            num_single += 1

    if num_single > 0:
        print("Alternative bases: {}/{} ({:.2%})".format(num_alt_single, num_single, (float(num_alt_single) / num_single)), file=sys.stderr)


"""
"""
def generate_rna_expr_profile(expr_profile_type, num_transcripts = 10000):
    # Modelling and simulating generic RNA-Seq experiments with the flux simulator
    # http://nar.oxfordjournals.org/content/suppl/2012/06/29/gks666.DC1/nar-02667-n-2011-File002.pdf
    def calc_expr(x, a):
        x, a, b = float(x), 9500.0, 9500.0
        k = -0.6
        return (x**k) * math.exp(x/a * (x/b)**2)
    
    expr_profile = [0.0] * num_transcripts
    for i in range(len(expr_profile)):
        if expr_profile_type == "flux":
            expr_profile[i] = calc_expr(i + 1, num_transcripts)
        elif expr_profile_type == "constant":
            expr_profile[i] = 1.0
        else:
            assert False

    expr_sum = sum(expr_profile)
    expr_profile = [expr_profile[i] / expr_sum for i in range(len(expr_profile))]
    assert abs(sum(expr_profile) - 1.0) < 0.001
    return expr_profile


"""
"""
def generate_dna_expr_profile(genome_seq):
    expr_profile = []
    for chr_id, chr_seq in genome_seq.items():
        expr_profile.append(len(chr_seq))
    expr_sum = float(sum(expr_profile))
    expr_profile = [expr_profile[i] / expr_sum for i in range(len(expr_profile))]
    assert abs(sum(expr_profile) - 1.0) < 0.001
    return expr_profile


"""
"""
def getSNPs(chr_snps, left, right):
    low, high = 0, len(chr_snps)
    while low < high:
        mid = (low + high) // 2
        snpID, type, pos, data = chr_snps[mid]
        if pos < left:
            low = mid + 1
        else:
            high = mid - 1

    snps = []
    for snp in chr_snps[low:]:
        snpID, type, pos, data = snp
        pos2 = pos
        if type == "deletion":
            pos2 += data
        if pos2 >= right:
            break
        if pos >= left:
            if len(snps) > 0:
                _, prev_type, prev_pos, prev_data = snps[-1]
                assert prev_pos <= pos
                prev_pos2 = prev_pos
                if prev_type == "deletion":
                    prev_pos2 += prev_data
                if pos <= prev_pos2:
                    continue
            snps.append(snp)

    return snps


"""
"""
def getSamAlignment(rna, exons, chr_seq, trans_seq, frag_pos, read_len, chr_snps, snp_prob, err_rand_src, max_mismatch):
    # Find the genomic position for frag_pos and exon number
    tmp_frag_pos, tmp_read_len = frag_pos, read_len
    pos, cigars, cigar_descs = exons[0][0], [], []
    e_pos = 0
    prev_e = None
    for e_i in range(len(exons)):
        e = exons[e_i]
        if prev_e:
            i_len = e[0] - prev_e[1] - 1
            pos += i_len
        e_len = e[1] - e[0] + 1
        if e_len <= tmp_frag_pos:
            tmp_frag_pos -= e_len
            pos += e_len
        else:
            pos += tmp_frag_pos
            e_pos = tmp_frag_pos
            break                        
        prev_e = e

    # Define Cigar and its descriptions
    assert e_i < len(exons)
    e_len = exons[e_i][1] - exons[e_i][0] + 1
    assert e_pos < e_len
    cur_pos = pos
    match_len = 0
    prev_e = None
    mismatch, remain_trans_len = 0, len(trans_seq) - (frag_pos + read_len)
    assert remain_trans_len >= 0
    for e_i in range(e_i, len(exons)):
        e = exons[e_i]
        if prev_e:
            i_len = e[0] - prev_e[1] - 1
            cur_pos += i_len
            cigars.append(("{}N".format(i_len)))
            cigar_descs.append([])
        tmp_e_left = e_left = e[0] + e_pos
        e_pos = 0

        # Retreive SNPs
        if rna:
            snps = getSNPs(chr_snps, e_left, e[1])
        else:
            snps = getSNPs(chr_snps, frag_pos, frag_pos + read_len)

        if snp_prob < 1.0 and len(snps) > 0:
            snps_ = []
            for snp in snps:
                if random.random() <= snp_prob:
                    snps_.append(snp)
            snps = snps_
            
        # Simulate mismatches due to sequencing errors
        mms = []
        for i in range(e_left, min(e[1], e_left + tmp_read_len - 1)):
            if err_rand_src.getRand() == 1:
                assert i < len(chr_seq)
                err_base = "A"
                #rand = random.randint(0, 2)
                rand = myrandint(0, 2)
                if chr_seq[i] == "A":
                    err_base = "GCT"[rand]
                elif chr_seq[i] == "C":
                    err_base = "AGT"[rand]
                elif chr_seq[i] == "G":
                    err_base = "ACT"[rand]
                else:
                    err_base = "ACG"[rand]                    
                mms.append(["", "single", i, err_base])

        tmp_diffs = snps + mms
#        def diff_sort(a , b):
#            return a[2] - b[2]

        tmp_diffs = sorted(tmp_diffs, key=lambda t: t[2])
        diffs = []
        if len(tmp_diffs) > 0:
            diffs = tmp_diffs[:1]
            for diff in tmp_diffs[1:]:
                _, tmp_type, tmp_pos, tmp_data = diff
                _, prev_type, prev_pos, prev_data = diffs[-1]
                if prev_type == "deletion":
                    prev_pos += prev_data
                if tmp_pos <= prev_pos:
                    continue
                diffs.append(diff)

        cigar_descs.append([])
        prev_diff = None
        for diff in diffs:
            diff_id, diff_type, diff_pos, diff_data = diff
            if prev_diff:
                prev_diff_id, prev_diff_type, prev_diff_pos, prev_diff_data = prev_diff
                if prev_diff_type == "deletion":
                    prev_diff_pos += prev_diff_data
                assert prev_diff_pos < diff_pos
            diff_pos2 = diff_pos
            if diff_type == "deletion":
                diff_pos2 += diff_data
            if e_left + tmp_read_len - 1 < diff_pos2 or e[1] < diff_pos2:
                break
            
            if diff_type == "single":
                if mismatch + 1 > max_mismatch:
                    continue
                cigar_descs[-1].append([diff_pos - tmp_e_left, diff_data, diff_id])
                tmp_e_left = diff_pos + 1
                mismatch += 1
            elif diff_type == "deletion":
                del_len = diff_data
                if mismatch + del_len > max_mismatch:
                    continue
                if len(cigars) <= 0 and diff_pos - e_left <= 0:
                    continue                
                if remain_trans_len < del_len:
                    continue
                remain_trans_len -= del_len
                if diff_pos - e_left > 0:
                    cigars.append("{}M".format(diff_pos - e_left))
                    cigar_descs[-1].append([diff_pos - tmp_e_left, "", ""])
                    cigar_descs.append([])
                cigars.append("{}D".format(del_len))
                cigar_descs[-1].append([0, del_len, diff_id])
                cigar_descs.append([])
                tmp_read_len -= (diff_pos - e_left)
                e_left = tmp_e_left = diff_pos + del_len

            elif diff_type == "insertion":
                ins_len = len(diff_data)
                if mismatch + ins_len > max_mismatch:
                    continue
                if len(cigars) <= 0 and diff_pos - e_left <= 0:
                    continue
                if e_left + tmp_read_len - 1 < diff_pos + ins_len:
                    break
                if diff_pos - e_left > 0:
                    cigars.append("{}M".format(diff_pos - e_left))
                    cigar_descs[-1].append([diff_pos - tmp_e_left, "", ""])
                    cigar_descs.append([])
                cigars.append("{}I".format(ins_len))
                cigar_descs[-1].append([0, diff_data, diff_id])
                cigar_descs.append([])
                tmp_read_len -= (diff_pos - e_left)
                tmp_read_len -= ins_len
                e_left = tmp_e_left = diff_pos

            else:
                assert False
            prev_diff = diff

        e_right = min(e[1], e_left + tmp_read_len - 1)
        e_len = e_right - e_left + 1
        remain_e_len = e_right - tmp_e_left + 1
        if remain_e_len > 0:
            cigar_descs[-1].append([remain_e_len, "", ""])
        if e_len < tmp_read_len:
            tmp_read_len -= e_len
            cigars.append(("{}M".format(e_len)))
        else:
            assert e_len == tmp_read_len
            cigars.append(("{}M".format(tmp_read_len)))
            tmp_read_len = 0
            break
        prev_e = e

    # Define MD, XM, NM, Zs, read_seq
    MD, XM, NM, Zs, read_seq = "", 0, 0, "", ""
    assert len(cigars) == len(cigar_descs)
    MD_match_len, Zs_match_len = 0, 0
    cur_trans_pos = frag_pos
    for c in range(len(cigars)):
        cigar = cigars[c]
        cigar_len, cigar_op = int(cigar[:-1]), cigar[-1]
        cigar_desc = cigar_descs[c]
        if cigar_op == 'N':
            continue
        if cigar_op == 'M':
            for add_match_len, alt_base, snp_id in cigar_desc:
                MD_match_len += add_match_len
                Zs_match_len += add_match_len
                assert cur_trans_pos + add_match_len <= len(trans_seq)
                read_seq += trans_seq[cur_trans_pos:cur_trans_pos+add_match_len]
                cur_trans_pos += add_match_len
                if alt_base != "":
                    if MD_match_len > 0:
                        MD += ("{}".format(MD_match_len))
                        MD_match_len = 0
                    MD += trans_seq[cur_trans_pos]
                    if snp_id != "":
                        if Zs != "":
                            Zs += ","
                        Zs += ("{}|S|{}".format(Zs_match_len, snp_id))
                        Zs_match_len = 0
                    else:
                        Zs_match_len += 1
                    if snp_id == "":
                        XM += 1
                        NM += 1
                    read_seq += alt_base
                    cur_trans_pos += 1
        elif cigar_op == 'D':
            assert len(cigar_desc) == 1
            add_match_len, del_len, snp_id = cigar_desc[0]
            MD_match_len += add_match_len
            Zs_match_len += add_match_len
            if MD_match_len > 0:
                MD += ("{}".format(MD_match_len))
                MD_match_len = 0
            MD += ("^{}".format(trans_seq[cur_trans_pos:cur_trans_pos+cigar_len]))
            read_seq += trans_seq[cur_trans_pos:cur_trans_pos+add_match_len]
            if Zs != "":
                Zs += ","
            Zs += ("{}|D|{}".format(Zs_match_len, cigar_desc[0][-1]))
            Zs_match_len = 0
            cur_trans_pos += cigar_len
        elif cigar_op == 'I':
            assert len(cigar_desc) == 1
            add_match_len, ins_seq, snp_id = cigar_desc[0]
            ins_len = len(ins_seq)
            MD_match_len += add_match_len
            Zs_match_len += add_match_len
            read_seq += trans_seq[cur_trans_pos:cur_trans_pos+add_match_len]
            read_seq += ins_seq
            if Zs != "":
                Zs += ","
            Zs += ("{}|I|{}".format(Zs_match_len, cigar_desc[0][-1]))
            Zs_match_len = 0
        else:
            assert False

    if MD_match_len > 0:
        MD += ("{}".format(MD_match_len))

    if len(read_seq) != read_len:
        print("read length differs:", len(read_seq), "vs.", read_len, file=sys.stderr)
        print(pos, "".join(cigars), cigar_descs, MD, XM, NM, Zs, file=sys.stderr)
        assert False

    return pos, cigars, cigar_descs, MD, XM, NM, Zs, read_seq


"""
"""
cigar_re = re.compile('\d+\w')
def samRepOk(genome_seq, read_seq, chr, pos, cigar, XM, NM, MD, Zs, max_mismatch):
    assert chr in genome_seq
    chr_seq = genome_seq[chr]
    assert pos < len(chr_seq)

    # Calculate XM and NM based on Cigar and Zs
    cigars = cigar_re.findall(cigar)
    cigars = [[int(cigars[i][:-1]), cigars[i][-1]] for i in range(len(cigars))]
    ref_pos, read_pos = pos, 0
    ann_ref_seq, ann_ref_rel, ann_read_seq, ann_read_rel = [], [], [], []
    for i in range(len(cigars)):
        cigar_len, cigar_op = cigars[i]
        if cigar_op == "M":
            partial_ref_seq = chr_seq[ref_pos:ref_pos+cigar_len]
            partial_read_seq = read_seq[read_pos:read_pos+cigar_len]
            assert len(partial_ref_seq) == len(partial_read_seq)
            ann_ref_seq += list(partial_ref_seq)
            ann_read_seq += list(partial_read_seq)
            for j in range(len(partial_ref_seq)):
                if partial_ref_seq[j] == partial_read_seq[j]:
                    ann_ref_rel.append("=")
                    ann_read_rel.append("=")
                else:
                    ann_ref_rel.append("X")
                    ann_read_rel.append("X")
            ref_pos += cigar_len
            read_pos += cigar_len
        elif cigar_op == "D":
            partial_ref_seq = chr_seq[ref_pos:ref_pos+cigar_len]
            ann_ref_rel += list(partial_ref_seq)
            ann_ref_seq += list(partial_ref_seq)
            ann_read_rel += (["-"] * cigar_len)
            ann_read_seq += (["-"] * cigar_len)
            ref_pos += cigar_len
        elif cigar_op == "I":
            partial_read_seq = read_seq[read_pos:read_pos+cigar_len]
            ann_ref_rel += (["-"] * cigar_len)
            ann_ref_seq += (["-"] * cigar_len)
            ann_read_rel += list(partial_read_seq)
            ann_read_seq += list(partial_read_seq) 
            read_pos += cigar_len
        elif cigar_op == "N":
            ref_pos += cigar_len
        else:
            assert False
    
    assert len(ann_ref_seq) == len(ann_read_seq)
    assert len(ann_ref_seq) == len(ann_ref_rel)
    assert len(ann_ref_seq) == len(ann_read_rel)
    ann_Zs_seq = ["0" for i in range(len(ann_ref_seq))]

    Zss, Zs_i, snp_pos_add = [], 0, 0
    if Zs != "":
        Zss = Zs.split(',')
        Zss = [zs.split('|') for zs in Zss]

    ann_read_pos = 0
    for zs in Zss:
        zs_pos, zs_type, zs_id = zs
        zs_pos = int(zs_pos)
        for i in range(zs_pos):
            while ann_read_rel[ann_read_pos] == '-':
                ann_read_pos += 1
            ann_read_pos += 1
        if zs_type == "S":
            ann_Zs_seq[ann_read_pos] = "1"
            ann_read_pos += 1
        elif zs_type == "D":
            while ann_read_rel[ann_read_pos] == '-':
                ann_Zs_seq[ann_read_pos] = "1"
                ann_read_pos += 1
        elif zs_type == "I":
            while ann_ref_rel[ann_read_pos] == '-':
                ann_Zs_seq[ann_read_pos] = "1"
                ann_read_pos += 1
        else:
            assert False

    tMD, tXM, tNM = "", 0, 0
    match_len = 0
    i = 0
    while i < len(ann_ref_seq):
        if ann_ref_rel[i] == "=":
            assert ann_read_rel[i] == "="
            match_len += 1
            i += 1
            continue
        assert ann_read_rel[i] != "="
        if ann_ref_rel[i] == "X" and ann_read_rel[i] == "X":
            if match_len > 0:
                tMD += ("{}".format(match_len))
                match_len = 0
            tMD += ann_ref_seq[i]
            if ann_Zs_seq[i] == "0":
                tXM += 1
                tNM += 1
            i += 1
        else:
            assert ann_ref_rel[i] == "-" or ann_read_rel[i] == "-"
            if ann_ref_rel[i] == '-':
                while ann_ref_rel[i] == '-':
                    if ann_Zs_seq[i] == "0":
                        tNM += 1
                    i += 1
            else:
                assert ann_read_rel[i] == '-'
                del_seq = ""
                while  ann_read_rel[i] == '-':
                    del_seq += ann_ref_seq[i]
                    if ann_Zs_seq[i] == "0":
                        tNM += 1
                    i += 1
                if match_len > 0:
                    tMD += ("{}".format(match_len))
                    match_len = 0
                tMD += ("^{}".format(del_seq))

    if match_len > 0:
        tMD += ("{}".format(match_len))

    if tMD != MD or tXM != XM or tNM != NM or XM > max_mismatch or XM != NM:
        print(chr, pos, cigar, MD, XM, NM, Zs, file=sys.stderr)
        print(tMD, tXM, tNM, file=sys.stderr)
        assert False
        
        
"""
"""
def simulate_reads(genome_file, gtf_file, snp_file, base_fname,
                   rna, paired_end, read_len, frag_len,
                   num_frag, expr_profile_type, repeat_fname,
                   error_rate, max_mismatch,
                   random_seed, snp_prob, sanity_check, verbose):
    random.seed(random_seed, version=1)
    err_rand_src = ErrRandomSource(error_rate / 100.0)
    
    if read_len > frag_len:
        frag_len = read_len

    genome_seq = read_genome(genome_file)
    if rna:
        genes, transcripts = read_transcript(genome_seq, gtf_file, frag_len)
    else:
        genes, transcripts = {}, {}
    snps = read_snp(snp_file)

    if sanity_check:
        sanity_check_input(genome_seq, genes, transcripts, snps, frag_len)

    if rna:
        num_transcripts = min(len(transcripts), 10000)
        expr_profile = generate_rna_expr_profile(expr_profile_type, num_transcripts)
    else:
        expr_profile = generate_dna_expr_profile(genome_seq)

    expr_profile = [int(expr_profile[i] * num_frag) for i in range(len(expr_profile))]
    assert num_frag >= sum(expr_profile)
    while sum(expr_profile) < num_frag:
        for i in range(min(num_frag - sum(expr_profile), len(expr_profile))):
            expr_profile[i] += 1
    assert num_frag == sum(expr_profile)
    
    repeat_loci = {}
    if repeat_fname != "" and os.path.exists(repeat_fname):
        for line in open(repeat_fname):
            if line.startswith('>'):
                continue
            coords = line.strip().split()
            for coord in coords:
                chr, pos, strand = coord.split(':')
                if chr not in repeat_loci:
                    repeat_loci[chr] = []
                repeat_loci[chr].append([int(pos), strand])

    if rna:
        transcript_ids = sorted(list(transcripts.keys()))
        random.shuffle(transcript_ids, random=random.random)
        assert len(transcript_ids) >= len(expr_profile)
    else:
        chr_ids = list(genome_seq.keys())

    sam_file = open(base_fname + ".sam", "w")

    # Write SAM header
    print("@HD\tVN:1.0\tSO:unsorted", file=sam_file)
    for chr in genome_seq.keys():
        print("@SQ\tSN:%s\tLN:%d" % (chr, len(genome_seq[chr])), file=sam_file)
    
    read_file = open(base_fname + "_1.fa", "w")
    if paired_end:
        read2_file = open(base_fname + "_2.fa", "w")

    cur_read_id = 1
    for t in range(len(expr_profile)):
        t_num_frags = expr_profile[t]
        if rna:
            transcript_id = transcript_ids[t]
            chr, strand, transcript_len, exons = transcripts[transcript_id]
            print(transcript_id, t_num_frags, file=sys.stderr)
        else:
            chr = chr_ids[t]
            print(chr, t_num_frags, file=sys.stderr)

        assert chr in genome_seq
        chr_seq = genome_seq[chr]
        chr_len = len(chr_seq)
        if chr in repeat_loci:
            chr_repeat_loci = repeat_loci[chr]
        else:
            chr_repeat_loci = []
            
        if rna:
            t_seq = ""
            for e in exons:
                assert e[0] < e[1]
                t_seq += chr_seq[e[0]:e[1]+1]
            assert len(t_seq) == transcript_len
        else:
            t_seq = chr_seq
            exons = [[0, chr_len - 1]]

        if chr in snps:
            chr_snps = snps[chr]
        else:
            chr_snps = []

        for f in range(t_num_frags):
            if rna:
                #frag_pos = random.randint(0, transcript_len - frag_len)
                frag_pos = myrandint(0, transcript_len - frag_len)
            else:
                while True:
                    if len(chr_repeat_loci):
                        #locus_id = random.randint(0, len(chr_repeat_loci) - 1)
                        locus_id = myrandint(0, len(chr_repeat_loci) - 1)
                        frag_pos = chr_repeat_loci[locus_id][0]
                    else:
                        #frag_pos = random.randint(0, chr_len - frag_len)
                        frag_pos = myrandint(0, chr_len - frag_len)
                    if 'N' not in chr_seq[frag_pos:frag_pos + frag_len]:
                        break

            # SAM specification (v1.4)
            # http://samtools.sourceforge.net/
            flag, flag2 = 99, 163  # 83, 147
            pos, cigars, cigar_descs, MD, XM, NM, Zs, read_seq = getSamAlignment(rna, exons, chr_seq, t_seq, frag_pos, read_len, chr_snps, snp_prob, err_rand_src, max_mismatch)
            pos2, cigars2, cigar2_descs, MD2, XM2, NM2, Zs2, read2_seq = getSamAlignment(rna, exons, chr_seq, t_seq, frag_pos+frag_len-read_len, read_len, chr_snps, snp_prob, err_rand_src, max_mismatch)
            swapped = False
            if paired_end:
                #if random.randint(0, 1) == 1:
                if myrandint(0, 1) == 1:
                    swapped = True
                if swapped:
                    flag, flag2 = flag - 16, flag2 - 16
                    pos, pos2 = pos2, pos
                    cigars, cigars2 = cigars2, cigars
                    cigar_descs, cigar2_descs = cigar2_descs, cigar_descs
                    read_seq, read2_seq = read2_seq, read_seq
                    XM, XM2 = XM2, XM
                    NM, NM2 = NM2, NM
                    MD, MD2 = MD2, MD
                    Zs, Zs2 = Zs2, Zs

            cigar_str, cigar2_str = "".join(cigars), "".join(cigars2)
            if sanity_check:
                samRepOk(genome_seq, read_seq, chr, pos, cigar_str, XM, NM, MD, Zs, max_mismatch)
                samRepOk(genome_seq, read2_seq, chr, pos2, cigar2_str, XM2, NM2, MD2, Zs2, max_mismatch)

            if Zs != "":
                Zs = ("\tZs:Z:{}".format(Zs))
            if Zs2 != "":
                Zs2 = ("\tZs:Z:{}".format(Zs2))

            if rna:
                XS = "\tXS:A:{}".format(strand)
                TI = "\tTI:Z:{}".format(transcript_id)
            else:
                XS, TI = "", ""                

            print(">{}".format(cur_read_id), file=read_file)
            if swapped:
                print(reverse_complement(read_seq), file=read_file)
            else:
                print(read_seq, file=read_file)
            print("{}\t{}\t{}\t{}\t255\t{}\t{}\t{}\t0\t{}\t*\tXM:i:{}\tNM:i:{}\tMD:Z:{}{}{}{}".format(cur_read_id, flag, chr, pos + 1, cigar_str, chr, pos2 + 1, read_seq, XM, NM, MD, Zs, XS, TI), file=sam_file)
            if paired_end:
                print(">{}".format(cur_read_id), file=read2_file)
                if swapped:
                    print(read2_seq, file=read2_file)
                else:
                    print(reverse_complement(read2_seq), file=read2_file)
                print("{}\t{}\t{}\t{}\t255\t{}\t{}\t{}\t0\t{}\t*\tXM:i:{}\tNM:i:{}\tMD:Z:{}{}{}{}".format(cur_read_id, flag2, chr, pos2 + 1, cigar2_str, chr, pos + 1, read2_seq, XM2, NM2, MD2, Zs2, XS, TI), file=sam_file)

            cur_read_id += 1
            
    sam_file.close()
    read_file.close()
    if paired_end:
        read2_file.close()


if __name__ == '__main__':
    parser = ArgumentParser(
        description='Simulate reads from GENOME (fasta) and GTF files')
    parser.add_argument('genome_file',
                        nargs='?',
                        type=FileType('r'),
                        help='input GENOME file')
    parser.add_argument('gtf_file',
                        nargs='?',
                        type=FileType('r'),
                        help='input GTF file')
    parser.add_argument('snp_file',
                        nargs='?',
                        type=FileType('r'),
                        help='input SNP file')
    parser.add_argument('base_fname',
                        nargs='?',
                        type=str,
                        help='output base filename')
    parser.add_argument('-d', '--dna',
                        dest='rna',
                        action='store_false',
                        default=True,
                        help='DNA-seq reads (default: RNA-seq reads)')
    parser.add_argument('--single-end',
                        dest='paired_end',
                        action='store_false',
                        default=True,
                        help='single-end reads (default: paired-end reads)')
    parser.add_argument('-r', '--read-length',
                        dest='read_len',
                        action='store',
                        type=int,
                        default=100,
                        help='read length (default: 100)')
    parser.add_argument('-f', '--fragment-length',
                        dest='frag_len',
                        action='store',
                        type=int,
                        default=250,
                        help='fragment length (default: 250)')
    parser.add_argument('-n', '--num-fragment',
                        dest='num_frag',
                        action='store',
                        type=int,
                        default=1000000,
                        help='number of fragments (default: 1000000)')
    parser.add_argument('-e', '--expr-profile',
                        dest='expr_profile',
                        action='store',
                        type=str,
                        default='flux',
                        help='expression profile: flux or constant (default: flux)')
    parser.add_argument('--repeat-info',
                        dest='repeat_fname',
                        action='store',
                        type=str,
                        default='',
                        help='repeat information filename')
    parser.add_argument('--error-rate',
                        dest='error_rate',
                        action='store',
                        type=float,
                        default=0.0,
                        help='per-base sequencing error rate (%%) (default: 0.0)')
    parser.add_argument('--max-mismatch',
                        dest='max_mismatch',
                        action='store',
                        type=int,
                        default=3,
                        help='max mismatches due to sequencing errors (default: 3)')
    parser.add_argument('--random-seed',
                        dest='random_seed',
                        action='store',
                        type=int,
                        default=0,
                        help='random seeding value (default: 0)')
    parser.add_argument('--snp-prob',
                        dest='snp_prob',
                        action='store',
                        type=float,
                        default=1.0,
                        help='probability of a read including a snp when the read spans the snp ranging from 0.0 to 1.0 (default: 1.0)')
    parser.add_argument('--sanity-check',
                        dest='sanity_check',
                        action='store_true',
                        help='sanity check')
    parser.add_argument('-v', '--verbose',
                        dest='verbose',
                        action='store_true',
                        help='also print some statistics to stderr')
    parser.add_argument('--version', 
                        action='version',
                        version='%(prog)s 2.0.0-alpha')
    args = parser.parse_args()
    if not args.genome_file or not args.gtf_file or not args.snp_file:
        parser.print_help()
        exit(1)
    if not args.rna:
        args.expr_profile = "constant"
    simulate_reads(args.genome_file, args.gtf_file, args.snp_file, args.base_fname,
                   args.rna, args.paired_end, args.read_len, args.frag_len,
                   args.num_frag, args.expr_profile, args.repeat_fname,
                   args.error_rate, args.max_mismatch,
                   args.random_seed, args.snp_prob, args.sanity_check, args.verbose)