hisat-3n/evaluation/simulation/calculate_read_cost.py

#!/usr/bin/env python

import sys, os, subprocess
import multiprocessing
import string, re
import platform
from datetime import datetime, date, time
import copy
from argparse import ArgumentParser, FileType
from multiprocessing import Process
import bisect

mp_mode = False
mp_num = 1

cigar_re = re.compile('\d+\w')

osx_mode = False
if sys.platform == 'darwin':
    osx_mode = True

"""
"""
def parse_mem_usage(resource):
    if osx_mode:
        resource = resource.strip().split('\n')
        for line in resource:
            if line.find('maximum resident set size') != -1:
                return int(line.split()[0]) / 1024
    else:
        resource = resource.split(' ')
        for line in resource:
            idx = line.find('maxresident')
            if idx != -1:
                return line[:idx]

    return '0'


"""
"""
def reverse_complement(seq):
    result = ""
    for nt in seq:
        base = nt
        if nt == 'A':
            base = 'T'
        elif nt == 'a':
            base = 't'
        elif nt == 'C':
            base = 'G'
        elif nt == 'c':
            base = 'g'
        elif nt == 'G':
            base = 'C'
        elif nt == 'g':
            base = 'c'
        elif nt == 'T':
            base = 'A'
        elif nt == 't':
            base = 'a'

        result = base + result

    return result


"""
RepeatDB
"""

class RepeatAllele:
    def __init__(self):
        self.repeat_name = ''
        self.allele_idx = 0
        self.repeat_pos = 0
        self.repeat_length = 0
        self.positions = []
        return

    def __repr__(self):
        return '[' + ','.join([str(self.repeat_name), str(self.allele_idx), str(self.repeat_pos), str(self.repeat_length), str(len(self.positions))]) + ']'

    def add_position(self, chr, pos, strand):
        self.positions.append([chr, pos, strand])


    def __lt__(self, other):
        if self.repeat_pos < other.repeat_pos:
            return True
        elif self.repeat_pos == other.repeat_pos:
            return self.repeat_length < other.repeat_length
        else:
            return False

class Repeat:
    def __init__(self):
        self.repeat_name = ''
        self.repeat_length = 0
        self.repeat_pos = 0
        self.allele = []
        return

    def add_allele(self, allele_idx, repeatAllele):
        #self.allele[allele_idx] = repeatAllele
        self.allele.append(repeatAllele)

    def allele_sort(self):
        self.allele = sorted(self.allele)

def cmp_repeatmap(a, b):
    if a[0] < b[0]:
        return -1
    elif a[0] == b[0]:
        return 0
    else:
        return 1

def read_len_cigar(cigar_str):
    cigars = cigar_re.findall(cigar_str)
    cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars]

    read_len = 0
    for cigar in cigars:
        cigar_op, length = cigar
        if cigar_op in "MISH":
            read_len += int(length)

    return read_len

def read_repeatdb(repeat_filename):
    repeat_db = {}

    if os.path.exists(repeat_filename):

        for line in open(repeat_filename, 'r'):

            if line[0] == '>':
                line = line.strip()[1:]
                name, rptRefName, pos, rep_len, _, _ = line.split()[:6]
                pos = int(pos)
                rep_len = int(rep_len)
                rptName, allele_idx = name.split('*')[0:2]
                allele_idx = int(allele_idx)

                repeatAllele = RepeatAllele()
                repeatAllele.repeat_name = rptName
                repeatAllele.allele_idx = allele_idx
                repeatAllele.repeat_pos = pos
                repeatAllele.repeat_length = rep_len

                if rptRefName not in repeat_db:
                    # new rptRefName
                    repeat_db[rptRefName] = {}

                if rptName not in repeat_db[rptRefName]:
                    # new rptName
                    assert allele_idx == 0
                    repeat_db[rptRefName][rptName] = Repeat()
                    repeat_db[rptRefName][rptName].repeat_name = rptName

                repeat_db[rptRefName][rptName].add_allele(allele_idx, repeatAllele)

            else:
                coords = line.split()
                for coord in coords:
                    chr, pos, strand = coord.split(':')
                    pos = int(pos)

                    repeat_db[rptRefName][rptName].allele[allele_idx].add_position(chr, pos, strand)

    else:
        print >> sys.stderr, 'Cannot open file', repeat_filename


    print >> sys.stderr, 'Build repeatMap'
    repeat_map = {}
    for rptRefName, repeats in repeat_db.items():
        #print 'Processing', rptRefName
        repeat_pos_list = []

        for repeatName, repeat in repeats.items():
            #print 'Common Allele:', repeatName, repeat.repeat_name
            repeat_left = sys.maxint
            repeat_right = 0

            #for allele_id, repeatAllele in repeat.allele.items():
            for repeatAllele in repeat.allele:
                left = repeatAllele.repeat_pos
                right = left + repeatAllele.repeat_length
                if left < repeat_left:
                    repeat_left = left
                if right > repeat_right:
                    repeat_right = right

            repeat.repeat_pos = repeat_left
            repeat.repeat_length = repeat_right - repeat_left

            #print repeat.allele
            #repeat.allele_sort()
            #print repeat.allele

            #print repeat_left, repeat_right

            repeat_pos_list.append((repeat_right, repeatName))

        repeat_map[rptRefName] = sorted(repeat_pos_list, cmp=cmp_repeatmap)
        #print repeat_map[rptRefName]

    return repeat_db, repeat_map


def find_leftmost_pos(rmap, left):
    pos = bisect.bisect_left(rmap, (left, None))
    #print pos

    if pos == len(rmap):
        return pos

    if rmap[pos][0] == left:
        while pos < len(rmap):
            if rmap[pos][0] != left:
                break
            pos += 1

    return pos

def repeat_to_genome_pos(repeat_db, repeat_map, rptRefName, pos, cigar_str = ''):
    assert rptRefName in repeat_db
    readlen = read_len_cigar(cigar_str)
    #readlen = 101

    # pos in sam-result. pos is 1-based
    left = pos - 1
    right = left + readlen

    repeats = repeat_db[rptRefName]
    rmap = repeat_map[rptRefName]

    #print len(rmap)
    #print rmap

    i = find_leftmost_pos(rmap, left)
    if i >= len(rmap):
        print >> sys.stderr, 'Value Error'
        return

    if right > rmap[i][0]:
        print >> sys.stderr, 'Not repeat'
        return

    repeat = repeats[rmap[i][1]]

    #print 'Allele Size:', len(repeat.allele)
    #print repeat.allele
    for allele in repeat.allele:
        rpos = allele.repeat_pos
        rlen = allele.repeat_length

        if (left >= rpos) and (right <= rpos + rlen):
            offset = left - rpos
            for genome_pos in allele.positions:
                print genome_pos[0], genome_pos[1] + offset + 1, genome_pos[2], genome_pos[1]

"""
"""
def read_genome(genome_filename):
    chr_dic = {}
    genome_file = open(genome_filename, "r")

    chr_name, sequence = "", ""
    for line in genome_file:
        if line[0] == ">":
            if chr_name and sequence:
                chr_dic[chr_name] = sequence

            chr_name = line[1:-1].split()[0]
            sequence = ""
        else:
            sequence += line[:-1]

    if chr_name and sequence:
        chr_dic[chr_name] = sequence

    genome_file.close()

    print >> sys.stderr, "genome is loaded"

    return chr_dic


"""
"""
def extract_splice_sites(gtf_fname):
    trans = {}

    gtf_file = open(gtf_fname)
    # Parse valid exon lines from the GTF file into a dict by transcript_id
    for line in gtf_file:
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        if '#' in line:
            line = line.split('#')[0].strip()

        try:
            chrom, source, feature, left, right, score, \
            strand, frame, values = line.split('\t')
        except ValueError:
            continue
        left, right = int(left), int(right)

        if feature != 'exon' or left >= right:
            continue

        values_dict = {}
        for attr in values.split(';')[:-1]:
            attr, _, val = attr.strip().partition(' ')
            values_dict[attr] = val.strip('"')

        if 'gene_id' not in values_dict or \
                'transcript_id' not in values_dict:
            continue

        transcript_id = values_dict['transcript_id']
        if transcript_id not in trans:
            trans[transcript_id] = [chrom, strand, [[left, right]]]
        else:
            trans[transcript_id][2].append([left, right])

    gtf_file.close()

    # Sort exons and merge where separating introns are <=5 bps
    for tran, [chrom, strand, exons] in trans.items():
            exons.sort()
            tmp_exons = [exons[0]]
            for i in range(1, len(exons)):
                if exons[i][0] - tmp_exons[-1][1] <= 5:
                    tmp_exons[-1][1] = exons[i][1]
                else:
                    tmp_exons.append(exons[i])
            trans[tran] = [chrom, strand, tmp_exons]

    # Calculate and print the unique junctions
    junctions = set()
    for chrom, strand, exons in trans.values():
        for i in range(1, len(exons)):
            junctions.add(to_junction_str([chrom, exons[i-1][1], exons[i][0]]))

    return junctions

"""
"""
def read_repeat_info(repeat_filename):
    repeat_info, repeat_dic = {}, {}
    repeat_pos = {}
    if os.path.exists(repeat_filename):
        for line in open(repeat_filename):
            if line[0] == ">":
                line = line.strip()[1:]
                allele, rep, pos, rep_len, _, _ = line.split()[:6]
                pos, rep_len = int(pos), int(rep_len)
                common_allele = allele.split('*')[0]

                if rep not in repeat_info:
                    repeat_info[rep] = []
                    repeat_dic[rep] = {}
                    repeat_pos[rep] = {}

                repeat_info[rep].append([allele, pos, rep_len])
                if allele not in repeat_dic[rep]:
                    repeat_dic[rep][allele] = []
                    repeat_pos[rep][allele] = set()
            else:
                coords = line.split()
                for coord in coords:
                    chr, pos, strand = coord.split(':')
                    pos = int(pos)
                    if pos in repeat_pos[rep][allele]:
                        continue
                    repeat_dic[rep][allele].append([chr, pos, strand])
                    repeat_pos[rep][allele].add(pos)

    for rep, repeats in repeat_info.items():
        def my_cmp(a, b):
            if a[1] < b[1]:
                return -1
            elif a[1] == b[1]:
                return a[2] - b[2]
            else:
                return 1
        repeat_info[rep] = sorted(repeat_info[rep], cmp=my_cmp)

    return repeat_info, repeat_dic


"""
"""
def find_repeat(repeat_info, pos):
    if len(repeat_info) <= 0:
        return -1

    l, r = 0, len(repeat_info)
    while l < r:
        m = (l + r) / 2
        _, rep_pos, rep_len, _ = repeat_info[m]
        if rep_pos <= pos and pos < rep_pos + rep_len:
            return m
        elif pos < rep_pos:
            r = m
        else:
            l = m + 1

    return -1

def reverse_cigar(cigar_str):
    cigars = cigar_re.findall(cigar_str)
    cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars]
    cigars[::-1]

    read_len = 0
    cigar_str = ""
    for cigar in cigars:
        cigar_op, length = cigar
        cigar_str += ("%d%s" % (length, cigar_op))
        if cigar_op in "MISH":
            read_len += int(length)

    return read_len, cigar_str


def to_junction_str(junction):
    return "%s-%d-%d" % (junction[0], junction[1], junction[2])


def to_junction(junction_str):
    fields = junction_str.split("-")
    if len(fields) > 3:
        chr, left, right = "-".join(fields[:-2]), fields[-2], fields[-1]
    else:
        assert len(fields) == 3
        chr, left, right = fields

    return [chr, int(left), int(right)]


"""
"""
def junction_cmp(a, b):
    if a[0] != b[0]:
        if a[0] < b[0]:
            return -1
        else:
            return 1

    if a[1] != b[1]:
        if a[1] < b[1]:
            return -1
        else:
            return 1

    if a[2] != b[2]:
        if a[2] < b[2]:
            return -1
        else:
            return 1

    return 0


"""
# chr and pos are assumed to be integers
"""
def get_junctions(chr, pos, cigar_str, min_anchor_len = 0, read_len = 100):
    junctions = []
    right_pos = pos
    cigars = cigar_re.findall(cigar_str)
    cigars = [[int(cigars[i][:-1]), cigars[i][-1]] for i in range(len(cigars))]

    left_anchor_lens = []
    cur_left_anchor_len = 0
    for i in range(len(cigars)):
        length, cigar_op = cigars[i]
        if cigar_op in "MI":
            cur_left_anchor_len += length
        elif cigar_op == "N":
            assert cur_left_anchor_len > 0
            left_anchor_lens.append(cur_left_anchor_len)
            cur_left_anchor_len = 0

    for i in range(len(cigars)):
        length, cigar_op = cigars[i]
        if cigar_op == "N":
            left, right = right_pos - 1, right_pos + length

            if i > 0 and cigars[i-1][1] in "ID":
                if cigars[i-1][1] == "I":
                    left += cigars[i-1][0]
                else:
                    left -= cigars[i-1][0]
            if i + 1 < len(cigars) and cigars[i+1][1] in "ID":
                if cigars[i+1][1] == "I":
                    right -= cigars[i+1][0]
                else:
                    right += cigars[i+1][0]

            junction_idx = len(junctions)
            assert junction_idx < len(left_anchor_lens)
            left_anchor_len = left_anchor_lens[junction_idx]
            assert left_anchor_len > 0 and left_anchor_len < read_len
            right_anchor_len = read_len - left_anchor_len
            if left_anchor_len >= min_anchor_len and right_anchor_len >= min_anchor_len:
                junctions.append([chr, left, right])

        if cigar_op in "MND":
            right_pos += length

    return junctions

def get_right(pos, cigars):
    right_pos = pos
    cigars = cigar_re.findall(cigars)
    for cigar in cigars:
        length = int(cigar[:-1])
        cigar_op = cigar[-1]
        if cigar_op in "MDN":
            right_pos += length

    return right_pos

def get_cigar_chars(cigars):
    cigars = cigar_re.findall(cigars)
    cigar_chars = ""
    for cigar in cigars:
        cigar_op = cigar[-1]
        cigar_chars += cigar_op

    return cigar_chars


"""
"""
def get_cigar_chars_MN(cigars):
    cigars = cigar_re.findall(cigars)
    cigar_chars = ""
    for cigar in cigars:
        cigar_op = cigar[-1]
        if cigar_op in "MN":
            if cigar_chars == "" or cigar_chars[-1] != cigar_op:
                cigar_chars += cigar_op

    return cigar_chars


"""
"""
def is_small_anchor_junction_read(cigars):
    cigar_list = []
    for cigar in cigar_re.findall(cigars):
        cigar_op = cigar[-1]
        cigar_len = int(cigar[:-1])
        cigar_list.append([cigar_op, cigar_len])

    if len(cigar_list) < 3:
        return False

    if cigar_list[0][0] != 'M' or cigar_list[-1][0] != 'M':
        return False

    if cigar_list[0][1] > 10 and cigar_list[-1][1] > 10:
        return False

    if cigar_list[1][0] != 'N' or cigar_list[-2][0] != 'N':
        return False

    return True


"""
"""
def is_canonical_junction(chr_dic, junction):
    chr, left, right = junction
    donor = chr_dic[chr][left:left+2]
    acceptor = chr_dic[chr][right-3:right-1]
    rev_donor = reverse_complement(acceptor)
    rev_acceptor = reverse_complement(donor)

    if (donor == "GT" and acceptor == "AG") or \
            (rev_donor == "GT" and rev_acceptor == "AG"):
        return True

    return False


"""
"""
def is_small_exon_junction_read(cigars, min_exon_len = 23):
    cigars = cigar_re.findall(cigars)
    for i in range(1, len(cigars) - 1):
        cigar = cigars[i]
        cigar_op = cigar[-1]
        cigar_len = int(cigar[:-1])

        prev_op = cigars[i-1][-1]
        next_op = cigars[i+1][-1]

        if prev_op == 'N' and cigar_op == 'M' and next_op == 'N':
            if cigar_len <= min_exon_len:
                return True

    return False


"""
"""
"""
def repeat_to_genome_alignment(repeat_info, repeat_dic, rep, pos, cigar_str = ""):
    assert rep in repeat_info
    left = pos - 1 # convert 1-based offset to zero-based

    repeats = repeat_info[rep]
    l, r = 0, len(repeats)
    while l < r:
        m = (l + r) / 2
        rep_allele, rpos, rlen = repeats[m]
        if left >= rpos and left < rpos + rlen:
            while m > 0:
                rep_allele, rpos, rlen = repeats[m-1]
                if left < rpos:
                    break
                m -= 1
            break
        if left < rpos:
            r = m
        else:
            l = m + 1

    alignments = []
    while m < len(repeats):
        rep_allele, rpos, rlen = repeats[m]
        if left >= rpos + rlen:
            m += 1
            continue

        if left < rpos:
            break

        assert rep_allele in repeat_dic[rep]
        coords = repeat_dic[rep][rep_allele]
        assert len(coords) > 0
        for coord in coords:
            cchr, cpos, cstrand = coord
            adj_left = left - rpos
            if cstrand == '+':
                rep_left = cpos + adj_left
                rep_cigar_str = cigar_str
            else:
                if cigar_str:
                    read_len, rep_cigar_str = reverse_cigar(cigar_str)
                else:
                    read_len, rep_cigar_str = 0, ""
                rc_adj_left = rlen - adj_left - read_len;
                rep_left = cpos + rc_adj_left

            alignments.append([cchr, rep_left + 1, rep_cigar_str])
        m += 1

    return alignments
"""
def repeat_to_genome_alignment(repeat_db, repeat_map, rptRefName, pos, cigar_str = ''):
    assert rptRefName in repeat_db
    readlen = read_len_cigar(cigar_str)
    #readlen = 101

    # pos in sam-result. pos is 1-based
    left = pos - 1
    right = left + readlen

    repeats = repeat_db[rptRefName]
    rmap = repeat_map[rptRefName]

    #print len(rmap)
    #print rmap

    alignments = []

    i = find_leftmost_pos(rmap, left)
    if i >= len(rmap):
        print >> sys.stderr, 'Value Error'
        return alignments

    if right > rmap[i][0]:
        print >> sys.stderr, 'Not repeat'
        return alignments

    repeat = repeats[rmap[i][1]]

    #print 'Allele Size:', len(repeat.allele)
    #print repeat.allele
    for allele in repeat.allele:
        rpos = allele.repeat_pos
        rlen = allele.repeat_length

        if (left >= rpos) and (right <= rpos + rlen):
            offset = left - rpos
            for coord in allele.positions:
                cchr, cpos, cstrand = coord
                if cstrand == '+':
                    rep_left = cpos + offset
                    rep_cigar_str = cigar_str
                else:
                    if cigar_str:
                        rep_read_len, rep_cigar_str = reverse_cigar(cigar_str)
                    else:
                        rep_read_len, rep_cigar_str = 0, ""

                    rc_offset = rlen - offset - rep_read_len
                    rep_left = cpos + rc_offset

                alignments.append([cchr, rep_left + 1, rep_cigar_str])
                #print genome_pos[0], genome_pos[1] + offset + 1, genome_pos[2], genome_pos[1]

    return alignments


"""
"""
def extract_single(infilename,
                   outfilename,
                   chr_dic,
                   aligner,
                   version,
                   repeat_db,
                   repeat_map,
                   debug_dic,
                   hash_idx):
    infile = open(infilename)
    if hash_idx == -1:
        outfile = open(outfilename, "w")
    else:
        outfile = open(outfilename + "." + str(hash_idx), "w")

    prev_read_id = ""
    num_reads, num_aligned_reads, num_ualigned_reads = 0, 0, 0
    prev_NM, prev_NH, NH_real = 0, 0, 0

    for line in infile:
        if line[0] == '@':
            continue

        cols = line[:-1].split()
        read_id, flag, chr, pos, mapQ, cigar_str = cols[:6]
        read_seq = cols[9]
        if len(read_id) >= 3 and read_id[-2] == "/":
            read_id = read_id[:-2]

        if read_id.find("seq.") == 0:
            read_id = read_id[4:]

        if aligner == "gsnap":
            chr = chr.replace("_", ":")

        if hash_idx != -1:
            hashval = hash(read_id)
            if hashval % mp_num != hash_idx:
                continue

        if read_id != prev_read_id:
            num_reads += 1

        flag, pos, mapQ = int(flag), int(pos), int(mapQ)
        if flag & 0x4 != 0 or \
                'H' in cigar_str:
            prev_read_id = read_id
            continue

        NH, NM, XA = "", sys.maxint, []
        for i in range(11, len(cols)):
            col = cols[i]
            # "nM" from STAR
            if col.startswith("NM") or col.startswith("nM"):
                NM = int(col[5:])
            elif col.startswith("NH"):
                NH = col
            elif col.startswith("XA"):
                XA = col[5:].split(';')[:-1]
        if NH != "":
            NH = int(NH[5:])
            if aligner == "hisat2":
                if prev_read_id == read_id:
                    assert prev_NH == NH
                if NH == 1 or mapQ == 60:
                    assert NH == 1 and mapQ == 60

        if read_id != prev_read_id:
            num_aligned_reads += 1
            if aligner == "hisat2" and \
                    NH == 1:
                num_ualigned_reads += 1
        else:
            # In case of Bowtie2, only consier the best alignments
            if aligner in ["bowtie2", "bwa", "vg"]:
                if NM > prev_NM:
                    continue

        def adjust_alignment(chr, pos, cigar_str):
            NM_real = 0
            read_pos, right_pos = 0, pos - 1
            cigars = cigar_re.findall(cigar_str)
            cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars]
            for i in range(len(cigars)):
                cigar_op, length = cigars[i]
                if cigar_op == "S":
                    assert i == 0 or i == len(cigars) - 1
                    if i == 0:
                        assert cigars[i+1][0] == "M"
                        ref_seq = chr_dic[chr][right_pos-length:right_pos]
                    else:
                        assert cigars[i-1][0] == "M"
                        ref_seq = chr_dic[chr][right_pos:right_pos+length]

                    ref_seq = ref_seq.upper()
                    if length == len(ref_seq):
                        for j in range(length):
                            if ref_seq[j] != "N" and read_seq[read_pos+j] != ref_seq[j]:
                                NM_real += 1
                    else:
                        NM_real += length
                if cigar_op in "MND":
                    right_pos += length
                if cigar_op in "MIS":
                    read_pos += length

            if cigars[0][0] == "S":
                assert cigars[1][0] == "M"
                pos -= cigars[0][1]
                cigars[1][1] += cigars[0][1]
                cigars = cigars[1:]
            if cigars[-1][0] == "S":
                assert cigars[-2][0] == "M"
                cigars[-2][1] += cigars[-1][1]
                cigars = cigars[:-1]

            cigar_str = ""
            for cigar in cigars:
                cigar_op, length = cigar
                cigar_str += ("%d%s" % (length, cigar_op))

            return pos, cigar_str, NM_real


        alignments = [[chr, pos, cigar_str]]
        if aligner == "bwa" and len(XA) > 0:
            for alignment in XA:
                alt_chr, alt_pos, alt_cigar_str, alt_NM = alignment.split(',')
                alt_pos, alt_NM = abs(int(alt_pos)), int(alt_NM)
                if alt_NM > NM:
                    break
                alignments.append([alt_chr, alt_pos, alt_cigar_str])

        # Convert repeat alignments to genome alignments
        if aligner == "hisat2" and chr.startswith("rep") and len(repeat_map) > 0:
            alignments = repeat_to_genome_alignment(repeat_db, repeat_map, chr, pos, cigar_str)

        for i, alignment in enumerate(alignments):
            chr, pos, cigar_str = alignment
            pos, cigar_str, NM_real = adjust_alignment(chr, pos, cigar_str)
            p_str = "%s\t%s\t%d\t%s\tNM:i:%d" % (read_id, chr, pos, cigar_str, NM_real)
            print >> outfile, p_str

        if aligner == "hisat2":
            if prev_read_id != read_id:
                if prev_read_id != "":
                    assert prev_NH == NH_real
                NH_real = 1
            else:
                NH_real += 1
            prev_NH = NH
        prev_NM = NM
        prev_read_id = read_id

    if aligner == "hisat2":
        if prev_read_id != "":
            assert prev_NH == NH_real

    outfile.close()
    infile.close()

    # Sanity check for HISAT2's alignment summary
    if aligner == "hisat2" and os.path.exists(infilename + ".summary") and (not mp_mode):
        hisat2_reads, hisat2_0aligned_reads, hisat2_ualigned_reads, hisat2_maligned_reads = 0, 0, 0, 0
        for line in open(infilename + ".summary"):
            line = line.strip()
            if line.startswith("HISAT2 summary") or \
                    line.startswith("Overall"):
                continue
            category, num = line.split(':')
            num = num.strip()
            num = int(num.split(' ')[0])
            if category.startswith("Total reads"):
                hisat2_reads = num
            elif category.startswith("Aligned 0 time"):
                hisat2_0aligned_reads = num
            elif category.startswith("Aligned 1 time"):
                hisat2_ualigned_reads = num
            else:
                assert category.startswith("Aligned >1 time")
                assert hisat2_reads == hisat2_0aligned_reads + hisat2_ualigned_reads + num

        hisat2_aligned_reads = hisat2_reads - hisat2_0aligned_reads

        assert hisat2_reads == num_reads
        assert hisat2_aligned_reads == num_aligned_reads
        assert hisat2_ualigned_reads == num_ualigned_reads


"""
"""
def extract_pair(infilename,
                 outfilename,
                 chr_dic,
                 RNA,
                 aligner,
                 version,
                 repeat_db,
                 repeat_map,
                 debug_dic,
                 hash_idx):
    read_dic = {}
    pair_reported = set()

    infile = open(infilename)
    if hash_idx == -1:
        outfile = open(outfilename, "w")
    else:
        outfile = open(outfilename + "." + str(hash_idx), "w")

    num_pairs, num_conc_aligned_pairs, num_conc_ualigned_pairs, num_disc_aligned_pairs = 0, 0, 0, 0
    num_aligned_reads, num_ualigned_reads = 0, 0

    prev_read_id, pair_list = "", set()
    prev_NM = sys.maxint
    prev_NH1, prev_NH2 = 0, 0
    NH1_real, NH2_real = 0, 0

    for line in infile:
        if line[0] == '@':
            continue

        cols = line[:-1].split()
        read_id, flag, chr1, pos1, mapQ, cigar1_str, chr2, pos2 = cols[:8]
        read_seq = cols[9]
        if len(read_id) >= 3 and read_id[-2] == "/":
            read_id = read_id[:-2]

        if read_id.find("seq.") == 0:
            read_id = read_id[4:]

        if hash_idx != -1:
            hashval = hash(read_id)
            if hashval % mp_num != hash_idx:
                continue

        if aligner == "gsnap":
            chr1 = chr1.replace("_", ":")
            chr2 = chr2.replace("_", ":")

        if read_id != prev_read_id:
            num_pairs += 1
            pair_list = set()
            prev_NM = sys.maxint

        flag = int(flag)
        canonical_pos1, canonical_pos2 = int(pos1), int(pos2)
        left_read = (flag & 0x40 != 0)
        pos1 = canonical_pos1
        mapQ = int(mapQ)
        if flag & 0x4 != 0 or \
                'H' in cigar1_str:
            prev_read_id, is_prev_read_left = read_id, left_read
            continue

        concordant = (flag & 0x2 != 0)
        NH, NM1, YT, XA = sys.maxint, sys.maxint, "", []
        for i in range(11, len(cols)):
            col = cols[i]
            # "nM" from STAR
            if col.startswith("NM") or col.startswith("nM"):
                NM1 = int(col[5:])
            elif col.startswith("NH"):
                NH = int(col[5:])
            elif col.startswith("YT"):
                YT = col[5:]
            elif col.startswith("XA"):
                XA = col[5:].split(';')[:-1]

        if aligner == "hisat2":
            if prev_read_id == read_id:
                if left_read:
                    assert prev_NH1 == 0 or prev_NH1 == NH
                else:
                    assert prev_NH2 == 0 or prev_NH2 == NH
            if NH == 1 or mapQ == 60:
                assert NH == 1 and mapQ == 60

        unpaired = (flag & 0x8 != 0) or (YT in ["UU", "UP"])
        if unpaired:
            if left_read not in pair_list:
                pair_list.add(left_read)
                num_aligned_reads += 1
                if aligner == "hisat2" and NH == 1:
                    num_ualigned_reads += 1
                    assert mapQ == 60
        else:
            if read_id != prev_read_id:
                if concordant:
                    num_conc_aligned_pairs += 1
                    if aligner == "hisat2" and NH == 1:
                        num_conc_ualigned_pairs += 1
                else:
                    if aligner == "hisat2":
                        assert YT == "DP"
                    num_disc_aligned_pairs += 1

        if chr2 == '*':
            continue

        def adjust_alignment(chr, pos, cigar_str):
            NM_real = 0
            read_pos, right_pos = 0, pos - 1
            cigars = cigar_re.findall(cigar_str)
            cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars]
            for i in range(len(cigars)):
                cigar_op, length = cigars[i]
                if cigar_op == "S":
                    assert i == 0 or i == len(cigars) - 1
                    if i == 0:
                        assert cigars[i+1][0] == "M"
                        ref_seq = chr_dic[chr1][right_pos-length:right_pos]
                    else:
                        assert cigars[i-1][0] == "M"
                        ref_seq = chr_dic[chr1][right_pos:right_pos+length]

                    ref_seq = ref_seq.upper()
                    if length == len(ref_seq):
                        for j in range(length):
                            if ref_seq[j] != "N" and read_seq[read_pos+j] != ref_seq[j]:
                                NM_real += 1
                    else:
                        NM_real += length
                if cigar_op in "MND":
                    right_pos += length
                if cigar_op in "MIS":
                    read_pos += length

            if cigars[0][0] == "S":
                assert cigars[1][0] == "M"
                pos -= cigars[0][1]
                cigars[1][1] += cigars[0][1]
                cigars = cigars[1:]
            if cigars[-1][0] == "S":
                assert cigars[-2][0] == "M"
                cigars[-2][1] += cigars[-1][1]
                cigars = cigars[:-1]

            cigar_str = ""
            for cigar in cigars:
                cigar_op, length = cigar
                cigar_str += ("%d%s" % (length, cigar_op))

            return pos, cigar_str, NM_real

        alignments = [[chr1, pos1, cigar1_str]]
        if aligner == "bwa" and len(XA) > 0:
            for alignment in XA:
                alt_chr, alt_pos, alt_cigar_str, alt_NM = alignment.split(',')
                alt_pos, alt_NM = abs(int(alt_pos)), int(alt_NM)
                if alt_NM > NM1:
                    break
                alignments.append([alt_chr, alt_pos, alt_cigar_str])

        # Convert repeat alignments to genome alignments
        if aligner == "hisat2" and (chr1.startswith("rep") or chr2.startswith("rep")) and len(repeat_map) > 0:
            if chr1.startswith("rep"):
                alignments = repeat_to_genome_alignment(repeat_db, repeat_map, chr1, pos1, cigar1_str)
            if chr2.startswith("rep") or (chr1.startswith("rep") and chr2 == "="):
                chr_tmp = chr1 if chr2 == "=" else chr2
                alignments2 = repeat_to_genome_alignment(repeat_db, repeat_map, chr_tmp, int(pos2))
            else:
                alignments2 = [[chr2, int(pos2)]]

            selected_alignments = []
            for alignment in alignments:
                _chr1, _pos1 = alignment[:2]
                add = False
                for alignment2 in alignments2:
                    _chr2, _pos2 = alignment2[:2]
                    if _chr1 == _chr2 and abs(_pos1 - _pos2) <= 1000:
                        add = True
                        break
                if add:
                    selected_alignments.append(alignment)

            alignments = selected_alignments

        for i, alignment in enumerate(alignments):
            chr1, pos1, cigar1_str = alignment
            pos1, cigar_str, NM_real = adjust_alignment(chr1, pos1, cigar1_str)
            chr2 = chr1

            if aligner == "bwa":
                me = "%s\t%s" % (read_id, chr1)
                partner = "%s\t%s" % (read_id, chr2)
            else:
                me = "%s\t%s\t%d" % (read_id, chr1, canonical_pos1)
                partner = "%s\t%s\t%d" % (read_id, chr2, canonical_pos2)
            if partner in read_dic:
                maps = read_dic[partner]

                for map in maps:
                    if (map[0] == me) or len(maps) == 1:
                        cigar2_str, NM2, pos2 = map[1:4]

                        if aligner == "bwa":
                            if abs(pos1 - pos2) >= 1000:
                                continue

                        if aligner in ["bowtie2", "bwa"]:
                            if NM1 + NM2 > prev_NM:
                                continue
                            else:
                                prev_NM = NM1 + NM2

                        if chr1 != chr2:
                            continue

                        # DK - debugging purposes
                        if RNA:
                            if aligner in ["bowtie2", "bwa"] and abs(pos1 - pos2) > 1000:
                                continue
                        else:
                            if abs(pos1 - pos2) > 1000:
                                continue

                        if int(pos2) > int(pos1):
                            p_str = "%s\t%s\t%d\t%s\t%s\t%d\t%s\tNM:i:%d\tNM:i:%d" % \
                                    (read_id, chr1, pos1, cigar_str, chr2, pos2, cigar2_str, NM1, NM2)
                        else:
                            p_str = "%s\t%s\t%d\t%s\t%s\t%d\t%s\tNM:i:%d\tNM:i:%d" % \
                                    (read_id, chr2, pos2, cigar2_str, chr1, pos1, cigar_str, NM2, NM1)

                        if p_str not in pair_reported:
                            pair_reported.add(p_str)
                            print >> outfile, p_str


            if not me in read_dic:
                read_dic[me] = []

            read_dic[me].append([partner, cigar_str, NM1, pos1])

        if aligner == "hisat2":
            if prev_read_id != read_id:
                if prev_read_id != "":
                    assert prev_NH1 == NH1_real
                    assert prev_NH2 == NH2_real
                    prev_NH1, prev_NH2 = 0, 0
                if left_read:
                    NH1_real, NH2_real = 1, 0
                else:
                    NH1_real, NH2_real = 0, 1
            else:
                if left_read:
                    NH1_real += 1
                else:
                    NH2_real += 1
            if left_read:
                prev_NH1 = NH
            else:
                prev_NH2 = NH
        prev_read_id = read_id

    if aligner == "hisat2":
        if prev_read_id != "":
            assert prev_NH1 == NH1_real
            assert prev_NH2 == NH2_real

    outfile.close()
    infile.close()

    # Sanity check for HISAT2's alignment summary
    if aligner == "hisat2" and os.path.exists(infilename + ".summary") and (not mp_mode):
        hisat2_pairs, hisat2_0aligned_pairs, hisat2_conc_ualigned_pairs, hisat2_conc_maligned_pairs, hisat2_disc_aligned_pairs = 0, 0, 0, 0, 0
        hisat2_reads, hisat2_0aligned_reads, hisat2_ualigned_reads, hisat2_maligned_reads = 0, 0, 0, 0

        for line in open(infilename + ".summary"):
            line = line.strip()
            if line.startswith("HISAT2 summary") or \
                    line.startswith("Overall"):
                continue
            category, num = line.split(':')
            num = num.strip()
            num = int(num.split(' ')[0])
            if category.startswith("Total pairs"):
                hisat2_pairs = num
            elif category.startswith("Aligned concordantly or discordantly 0 time"):
                hisat2_0aligned_pairs = num
            elif category.startswith("Aligned concordantly 1 time"):
                hisat2_conc_ualigned_pairs = num
            elif category.startswith("Aligned concordantly >1 time"):
                hisat2_conc_maligned_pairs = num
            elif category.startswith("Aligned discordantly"):
                hisat2_disc_aligned_pairs = num
                assert hisat2_pairs == hisat2_0aligned_pairs + hisat2_conc_ualigned_pairs + hisat2_conc_maligned_pairs + hisat2_disc_aligned_pairs
            elif category.startswith("Total unpaired reads"):
                hisat2_reads = num
                assert hisat2_reads == hisat2_0aligned_pairs * 2
            elif category.startswith("Aligned 0 time"):
                hisat2_0aligned_reads = num
            elif category.startswith("Aligned 1 time"):
                hisat2_ualigned_reads = num
            else:
                assert category.startswith("Aligned >1 times")
                hisat2_maligned_reads = num
                assert hisat2_reads == hisat2_0aligned_reads + hisat2_ualigned_reads + hisat2_maligned_reads

        assert hisat2_pairs == num_pairs
        assert hisat2_conc_ualigned_pairs == num_conc_ualigned_pairs
        assert hisat2_conc_maligned_pairs == num_conc_aligned_pairs - num_conc_ualigned_pairs
        assert hisat2_disc_aligned_pairs == num_disc_aligned_pairs
        assert hisat2_ualigned_reads == num_ualigned_reads
        assert hisat2_maligned_reads == num_aligned_reads - num_ualigned_reads


"""
"""
def is_junction_read(junctions_dic, chr, pos, cigar_str):
    result_junctions = []
    junctions = get_junctions(chr, pos, cigar_str)
    for junction in junctions:
        junction_str = to_junction_str(junction)
        result_junctions.append([junction_str, junction_str in junctions_dic])

    return result_junctions


"""
"""
def is_junction_pair(junctions_dic, chr, pos, cigar_str, mate_chr, mate_pos, mate_cigar_str):
    junctions = is_junction_read(junctions_dic, chr, pos, cigar_str)
    mate_junctions = is_junction_read(junctions_dic, mate_chr, mate_pos, mate_cigar_str)
    junctions += mate_junctions
    return junctions


"""
"""
def find_in_gtf_junctions(chr_dic, gtf_junctions, junction, relax_dist = 5):
    def find_in_gtf_junctions(gtf_junctions, junction):
        l, u = 0, len(gtf_junctions)
        while l < u:
            m = (l + u) / 2
            assert m >= 0 and m < len(gtf_junctions)
            cmp_result = junction_cmp(junction, gtf_junctions[m])
            if cmp_result == 0:
                return m
            elif cmp_result < 0:
                u = m
            else:
                l = m + 1

        return l

    chr, left, right = junction
    gtf_index = find_in_gtf_junctions(gtf_junctions, [chr, left - relax_dist, right - relax_dist])

    if gtf_index >= 0:
        i = gtf_index
        while i < len(gtf_junctions):
            chr2, left2, right2 = gtf_junctions[i]
            if chr2 > chr or \
                    left2 - left > relax_dist or \
                    right2 - right > relax_dist:
                break

            if abs(left - left2) <= relax_dist and left - left2 == right - right2:
                test_small = ":" in chr
                if is_canonical_junction(chr_dic, gtf_junctions[i]):
                    if left == left2:
                        return i
                    else:
                        return -1
                else:
                    return i
            i += 1

    return -1


def singleInMaps(chr, pos, pos2, cigar, NM, maps):
    for i in range(len(maps)):
        if chr != maps[i][0]:
            continue
        if (pos == maps[i][1] or pos2 == maps[i][2]):
            return True, i
    return False, -1


"""
"""
def compare_single_sam(RNA,
                       reference_sam,
                       query_sam,
                       mapped_fname,
                       chr_dic,
                       gtf_junctions,
                       gtf_junctions_set,
                       ex_gtf_junctions,
                       aligner):
    aligned, multi_aligned = 0, 0
    db_dic, db_junction_dic = {}, {}
    mapped_file = open(mapped_fname, "w")
    first_mapped_file = open(mapped_fname + ".first", "w")
    file = open(reference_sam, "r")
    junction_read_dic = {}
    for line in file:
        if line.startswith('@'):
            continue

        read_name, chr, pos, cigar, NM = line[:-1].split()
        pos, NM = int(pos), int(NM[5:])

        if read_name.find("seq.") == 0:
            read_name = read_name[4:]

        if len(read_name) > 2 and read_name[-2] == '/':
            read_name = read_name[:-2]

        multi_aligned += 1
        if read_name not in db_dic:
            db_dic[read_name] = []
            aligned += 1

        pos2 = get_right(pos, cigar)
        db_dic[read_name].append([chr, pos, pos2, cigar, NM])

        read_junctions = is_junction_read(gtf_junctions_set, chr, pos, cigar)
        if len(read_junctions) > 0:
            if read_name not in db_junction_dic:
                db_junction_dic[read_name] = []

            for junction_str, is_gtf_junction in read_junctions:
                db_junction_dic[read_name].append([junction_str, is_gtf_junction])

                if junction_str not in junction_read_dic:
                    junction_read_dic[junction_str] = []
                junction_read_dic[junction_str].append(line[:-1])

    file.close()

    temp_junctions, temp_gtf_junctions = set(), set()
    for read_name, can_junctions in db_junction_dic.items():
        if len(can_junctions) <= 0:
            continue

        # DK - for debugging purposes
        # 1. select the best candidate among spliced alignments if multiple

        def pickup_junction(can_junctions):
            junctions = [can_junctions[0]]
            for i in range(1, len(can_junctions)):
                def get_intron_len(can_junction):
                    chr, left, right = to_junction(can_junction)
                    return right - left - 1

                intron, intron_cmp = get_intron_len(junctions[0][0]), get_intron_len(can_junctions[i][0])

                if intron > intron_cmp:
                    junctions = [can_junctions[i]]
                elif intron == intron_cmp:
                    junctions.append(can_junctions[i])

            return junctions

        # can_junctions = pickup_junction(can_junctions)

        for can_junction in can_junctions:
            found_junction_str = None
            junction_str, is_gtf_junction = can_junction
            if is_gtf_junction:
                found_junction_str = junction_str

            if not found_junction_str:
                junction = to_junction(junction_str)
                gtf_index = find_in_gtf_junctions(chr_dic, gtf_junctions, junction)

                if gtf_index >= 0:
                    is_gtf_junction = True
                    found_junction_str = to_junction_str(gtf_junctions[gtf_index])

            if found_junction_str:
                temp_gtf_junctions.add(found_junction_str)
                temp_junctions.add(found_junction_str)
            else:
                if junction_str not in temp_junctions:
                    None
                    # assert junction_str in junction_read_dic
                    # DK - for debugging purposes
                    """
                    if len(junction_read_dic[junction_str]) <= 2:
                        canonical = is_canonical_junction(chr_dic, to_junction(junction_str))
                        if not canonical:
                            print >> sys.stdout, read_name, junction_str, len(junction_read_dic[junction_str]), can_junctions
                            for line in junction_read_dic[junction_str]:
                                print >> sys.stdout, "\t", line
                    """
                temp_junctions.add(junction_str)


    temp2_junctions = []
    for junction in temp_junctions:
        temp2_junctions.append(to_junction(junction))
    temp_junctions = sorted(list(temp2_junctions), cmp=junction_cmp)
    temp2_junctions = []
    for can_junction in temp_junctions:
        if len(temp2_junctions) <= 0:
            temp2_junctions.append(can_junction)
        else:
            chr, left, right = temp2_junctions[-1]
            chr2, left2, right2 = can_junction
            if chr == chr2 and \
                    abs(left - left2) == abs(right - right2) and \
                    abs(left - left2) <= 10 and \
                    not to_junction_str(can_junction) in temp_gtf_junctions:
                continue
            temp2_junctions.append(can_junction)

    temp_junctions = set()
    for junction in temp2_junctions:
        temp_junctions.add(to_junction_str(junction))

    file = open(query_sam)
    mapped, unmapped, unique_mapped, first_mapped, mapping_point = 0, 0, 0, 0, 0.0
    snp_mapped, snp_unmapped, snp_unique_mapped, snp_first_mapped = 0, 0, 0, 0
    for line in file:
        if line.startswith('@'):
            continue
        fields = line[:-1].split()
        read_name, chr, pos, cigar = fields[:4]
        trans_id, NM, Zs = None, None, None
        for field in fields[4:]:
            if field.startswith("TI"):
                trans_id = field[5:]
            elif field.startswith("NM"):
                NM = int(field[5:])
            elif field.startswith("Zs"):
                Zs = field[5:]
        snp_included = (Zs != None)

        pos = int(pos)
        pos2 = get_right(pos, cigar)
        if read_name not in db_dic:
            unmapped += 1
            if snp_included:
                snp_unmapped += 1
            continue

        maps = db_dic[read_name]
        found = False
        found_at_first = False

        if (aligner == "hisat-TLA" and RNA):
            found, index = singleInMaps(chr, pos, pos2, cigar, NM, maps)
            if found:
                if index == 0:
                    found_at_first = True

        else:
            if [chr, pos, pos2, cigar, NM] in maps:
                found = True
                if maps.index([chr, pos, pos2, cigar, NM]) == 0:
                    found_at_first = True

        # DK - debugging purposes
        if False and len(maps) > 0 and maps[0][-1] < NM:
            found = True

        if not found:
            for idx, map in enumerate(maps):
                if chr == map[0] and \
                        pos == map[1] and \
                        pos2 == map[2] and \
                        get_cigar_chars(cigar) == get_cigar_chars(map[3]):

                    read_junctions = is_junction_read(gtf_junctions_set, map[0], map[1], map[3])
                    if True:
                        found_list = [False for i in range(len(read_junctions))]
                        for j in  range(len(read_junctions)):
                            junction_str, is_gtf_junction = read_junctions[j]
                            junction = to_junction(junction_str)
                            gtf_index = find_in_gtf_junctions(chr_dic, gtf_junctions, junction)
                            if gtf_index >= 0:
                                found_list[j] = True
                        found = not (False in found_list)
                    else:
                        found = False

                    if found:
                        if idx == 0:
                            found_at_first = True
                    break

        if found:
            print >> mapped_file, read_name
            mapped += 1
            if snp_included:
                snp_mapped += 1
            if len(maps) == 1:
                unique_mapped += 1
                if snp_included:
                    snp_unique_mapped += 1
            if found_at_first:
                print >> first_mapped_file, read_name
                first_mapped += 1
                if snp_included:
                    snp_first_mapped += 1
            mapping_point += (1.0 / len(maps))
        else:
            unmapped += 1
            if snp_included:
                snp_unmapped += 1

    file.close()
    mapped_file.close()
    first_mapped_file.close()

    # DK - for debugging purposes
    false_can_junctions, false_noncan_junctions = 0, 0
    for junction_str in temp_junctions:
        if junction_str in temp_gtf_junctions:
            continue
        if junction_str in ex_gtf_junctions:
            continue
        if is_canonical_junction(chr_dic, to_junction(junction_str)):
            false_can_junctions += 1
        else:
            false_noncan_junctions += 1
    print >> sys.stderr, "\t\t\tfalse junctions: %d (canonical), %d (non-canonical)" % (false_can_junctions, false_noncan_junctions)

    return mapped, unique_mapped, first_mapped, unmapped, aligned, multi_aligned, \
           snp_mapped, snp_unique_mapped, snp_first_mapped, snp_unmapped, \
           len(temp_junctions), len(temp_gtf_junctions), mapping_point


def pairedInMaps(chr, pos, pos_right, cigar, pos2, pos2_right, cigar2, NM, NM2, maps):
    for i in range(len(maps)):
        if chr != maps[i][0]:
            continue;
        if (pos == maps[i][1] or pos_right == maps[i][2]) and (pos2 == maps[i][4] or pos2_right == maps[i][5]):
            return True, i
    return False, -1
"""
"""
def compare_paired_sam(RNA,
                       reference_sam,
                       query_sam,
                       mapped_fname,
                       chr_dic,
                       gtf_junctions,
                       gtf_junctions_set,
                       ex_gtf_junctions,
                       aligner):
    aligned, multi_aligned = 0, 0
    db_dic, db_junction_dic, junction_pair_dic = {}, {}, {}
    mapped_file = open(mapped_fname, "w")
    uniq_mapped_file = open(mapped_fname + '.uniq', "w")
    first_mapped_file = open(mapped_fname + '.first', "w")
    file = open(reference_sam, "r")
    for line in file:
        if line[0] == '@':
            continue
        read_name, chr, pos, cigar, chr2, pos2, cigar2, NM, NM2 = line[:-1].split()
        pos, pos2 = int(pos), int(pos2)
        NM, NM2 = int(NM[5:]), int(NM2[5:])

        if read_name.find("seq.") == 0:
            read_name = read_name[4:]

        if len(read_name) > 2 and read_name[-2] == '/':
            read_name = read_name[:-2]

        multi_aligned += 1
        if read_name not in db_dic:
            db_dic[read_name] = []
            aligned += 1

        pos_right, pos2_right = get_right(pos, cigar), get_right(pos2, cigar2)
        db_dic[read_name].append([chr, pos, pos_right, cigar, pos2, pos2_right, cigar2, NM, NM2])


        pair_junctions = is_junction_pair(gtf_junctions_set, chr, pos, cigar, chr2, pos2, cigar2)
        if len(pair_junctions) > 0:
            if read_name not in db_junction_dic:
                db_junction_dic[read_name] = []

            for junction_str, is_gtf_junction in pair_junctions:
                db_junction_dic[read_name].append([junction_str, is_gtf_junction])

                # DK - for debugging purposes
                if junction_str not in junction_pair_dic:
                    junction_pair_dic[junction_str] = []
                junction_pair_dic[junction_str].append(line[:-1])

    file.close()

    temp_junctions, temp_gtf_junctions = set(), set()
    for read_name, can_junctions in db_junction_dic.items():
        if len(can_junctions) <= 0:
            continue

        # DK - for debugging purposes
        # 1. select the best candidate among spliced alignments if multiple

        def pickup_junction(can_junctions):
            junctions = [can_junctions[0]]
            for i in range(1, len(can_junctions)):
                def get_intron_len(can_junction):
                    chr, left, right = to_junction(can_junction)
                    return right - left - 1

                intron, intron_cmp = get_intron_len(junctions[0][0]), get_intron_len(can_junctions[i][0])

                if intron > intron_cmp:
                    junctions = [can_junctions[i]]
                elif intron == intron_cmp:
                    junctions.append(can_junctions[i])

            return junctions

        # can_junctions = pickup_junction(can_junctions)

        for can_junction in can_junctions:
            found_junction_str = None
            junction_str, is_gtf_junction = can_junction

            # DK - for debugging purposes
            assert junction_str in junction_pair_dic
            if len(junction_pair_dic[junction_str]) <= 5:
                continue

            if is_gtf_junction:
                found_junction_str = junction_str

            if not found_junction_str:
                junction = to_junction(junction_str)
                gtf_index = find_in_gtf_junctions(chr_dic, gtf_junctions, junction)

                if gtf_index >= 0:
                    is_gtf_junction = True
                    found_junction_str = to_junction_str(gtf_junctions[gtf_index])

            if found_junction_str:
                temp_gtf_junctions.add(found_junction_str)
                temp_junctions.add(found_junction_str)
            else:
                if junction_str not in temp_junctions:
                    None
                    # assert junction_str in junction_read_dic
                    # print >> sys.stdout, read_name, junction_str, len(junction_read_dic[junction_str])
                    # for line in junction_read_dic[junction_str]:
                    #     print >> sys.stdout, "\t", line

                temp_junctions.add(junction_str)

    # DK - for debugging purposes
    filter_junction_db = {}

    temp2_junctions = []
    for junction in temp_junctions:
        temp2_junctions.append(to_junction(junction))
    temp_junctions = sorted(list(temp2_junctions), cmp=junction_cmp)
    temp2_junctions = []
    for can_junction in temp_junctions:
        if len(temp2_junctions) <= 0:
            temp2_junctions.append(can_junction)

            # DK - for debugging purposes
            # assert to_junction_str(can_junction) in junction_pair_dic
            # filter_junction_db[to_junction_str(can_junction)] = len(junction_pair_dic[to_junction_str(can_junction)])
        else:
            chr, left, right = temp2_junctions[-1]
            chr2, left2, right2 = can_junction
            if chr == chr2 and \
                    abs(left - left2) == abs(right - right2) and \
                    abs(left - left2) <= 10 and \
                    not to_junction_str(can_junction) in temp_gtf_junctions:

                # DK - for debugging purposes
                # assert to_junction_str(temp2_junctions[-1]) in junction_pair_dic
                # assert to_junction_str(temp2_junctions[-1]) in filter_junction_dic
                # filter_junction_db[to_junction_str(temp2_junctions[-1])] += len(junction_pair_dic[to_junction_str(temp2_junctions[-1])])

                continue

            temp2_junctions.append(can_junction)

            # DK - for debugging purposes
            # assert to_junction_str(can_junction) in junction_pair_dic
            # filter_junction_db[to_junction_str(can_junction)] = len(junction_pair_dic[to_junction_str(can_junction)])

    temp_junctions = set()
    for junction in temp2_junctions:
        # DK - for debugging purposes
        # assert to_junction_str(junction) in filter_junction_dic
        # if filter_junction_dic[to_junction_str(junction)] <= 5:
        #    continue

        temp_junctions.add(to_junction_str(junction))

    file = open(query_sam)
    mapped, unique_mapped, first_mapped, unmapped, mapping_point = 0, 0, 0, 0, 0.0
    snp_mapped, snp_unique_mapped, snp_first_mapped, snp_unmapped = 0, 0, 0, 0
    for line in file:
        if line.startswith('@'):
            continue

        fields = line[:-1].split()
        read_name, chr, pos, cigar, chr2, pos2, cigar2 = fields[:7]
        trains_id, NM, NM2, Zs, Zs2 = None, None, None, None, None
        for field in fields[7:]:
            if field.startswith("TI"):
                trans_id = field[5:]
            elif field.startswith("NM"):
                if NM == None:
                    NM = int(field[5:])
                else:
                    NM2 = int(field[5:])
            elif field.startswith("Zs"):
                if Zs == None:
                    Zs = field[5:]
                else:
                    Zs2 = field[5:]
        snp_included = (Zs != None or Zs2 != None)

        pos, pos2 = int(pos), int(pos2)
        pos_right, pos2_right = get_right(pos, cigar), get_right(pos2, cigar2)

        if read_name not in db_dic:
            unmapped += 1
            if snp_included:
                snp_unmapped += 1
            continue

        maps = db_dic[read_name]

        found = False
        found_at_first = False


        if (aligner == "hisat-TLA" and RNA) :
            found, index = pairedInMaps(chr, pos, pos_right, cigar, pos2, pos2_right, cigar2, NM, NM2, maps)
            if (found):
                if (index == 0):
                    found_at_first = True
        else:
            if [chr, pos, pos_right, cigar, pos2, pos2_right, cigar2, NM, NM2] in maps:
                found = True
                if maps.index([chr, pos, pos_right, cigar, pos2, pos2_right, cigar2, NM, NM2]) == 0:
                    found_at_first = True

        # DK - debugging purposes
        if False and len(maps) > 0 and maps[0][-1] + maps[0][-2] < NM + NM2:
            found = True

        if not found:
            for idx, map in enumerate(maps):
                if chr == map[0] and \
                        pos == map[1] and \
                        pos_right == map[2] and \
                        get_cigar_chars(cigar) == get_cigar_chars(map[3]) and \
                        pos2 == map[4] and \
                        pos2_right == map[5] and \
                        get_cigar_chars(cigar2) == get_cigar_chars(map[6]):

                    pair_junctions = is_junction_pair(gtf_junctions_set, map[0], map[1], map[3], map[0], map[4], map[6])
                    if True:
                        found_list = [False for i in range(len(pair_junctions))]
                        for j in  range(len(pair_junctions)):
                            junction_str, is_gtf_junction = pair_junctions[j]
                            junction = to_junction(junction_str)
                            gtf_index = find_in_gtf_junctions(chr_dic, gtf_junctions, junction)
                            if gtf_index >= 0:
                                found_list[j] = True
                        found = not (False in found_list)
                    else:
                        found = False

                    if found:
                        if idx == 0:
                            found_at_first = True
                    break

        if found:
            print >> mapped_file, read_name
            mapped += 1
            if snp_included:
                snp_mapped += 1
            if len(maps) == 1:
                unique_mapped += 1
                print >> uniq_mapped_file, read_name
                if snp_included:
                    snp_unique_mapped += 1
            if found_at_first:
                print >> first_mapped_file, read_name
                first_mapped += 1
                if snp_included:
                    snp_first_mapped += 1

            mapping_point += (1.0 / len(maps))
        else:
            unmapped += 1
            if snp_included:
                snp_unmapped += 1

    file.close()
    mapped_file.close()
    uniq_mapped_file.close()
    first_mapped_file.close()

    # DK - for debugging purposes
    false_can_junctions, false_noncan_junctions = 0, 0
    for junction_str in temp_junctions:
        if junction_str in temp_gtf_junctions:
            continue
        if is_canonical_junction(chr_dic, to_junction(junction_str)):
            false_can_junctions += 1
        else:
            false_noncan_junctions += 1
    print >> sys.stderr, "\t\t\tfalse junctions: %d (canonical), %d (non-canonical)" % (false_can_junctions, false_noncan_junctions)


    return mapped, unique_mapped, first_mapped, unmapped, aligned, multi_aligned, \
           snp_mapped, snp_unique_mapped, snp_first_mapped, snp_unmapped, \
           len(temp_junctions), len(temp_gtf_junctions), mapping_point


"""
"""
def extract_mapped_unmapped(read_fname, mapped_id_fname, mapped_fname, unmapped_fname, read2_fname = "", mapped2_fname = "", unmapped2_fname = ""):
    mapped_ids = set()
    mapped_id_file = open(mapped_id_fname)
    for line in mapped_id_file:
        read_id = int(line[:-1])
        mapped_ids.add(read_id)

    mapped_id_file.close()

    def write_reads(read_fname, mapped_fname, unmapped_fname):
        mapped_file = open(mapped_fname, "w")
        unmapped_file = open(unmapped_fname, "w")
        read_file = open(read_fname)
        write = False
        for line in read_file:
            if line[0] == "@":
                read_id = int(line[1:-1])
                write = read_id in mapped_ids

            if write:
                print >> mapped_file, line[:-1]
            else:
                print >> unmapped_file, line[:-1]

        read_file.close()
        mapped_file.close()
        unmapped_file.close()

    write_reads(read_fname, mapped_fname, unmapped_fname)
    if read2_fname != "":
        assert mapped2_fname != ""
        assert unmapped2_fname != ""
        write_reads(read2_fname, mapped2_fname, unmapped2_fname)


"""
"""
def sql_execute(sql_db, sql_query):
    sql_cmd = [
        "sqlite3", sql_db,
        "-separator", "\t",
        "%s;" % sql_query
    ]
    # print >> sys.stderr, sql_cmd
    sql_process = subprocess.Popen(sql_cmd, stdout=subprocess.PIPE)
    output = sql_process.communicate()[0][:-1]
    return output


"""
"""
def create_sql_db(sql_db):
    if os.path.exists(sql_db):
        print >> sys.stderr, sql_db, "already exists!"
        return

    columns = [
        ["id", "integer primary key autoincrement"],
        ["genome", "text"],
        ["head", "text"],
        ["end_type", "text"],
        ["type", "text"],
        ["aligner", "text"],
        ["version", "text"],
        ["num_reads", "integer"],
        ["mapped_reads", "integer"],
        ["unique_mapped_reads", "integer"],
        ["unmapped_reads", "integer"],
        ["mapping_point", "real"],
        ["snp_mapped_reads", "integer"],
        ["snp_unique_mapped_reads", "integer"],
        ["snp_unmapped_reads", "integer"],
        ["time", "real"],
        ["mem", "integer"],
        ["true_gtf_junctions", "integer"],
        ["temp_junctions", "integer"],
        ["temp_gtf_junctions", "integer"],
        ["host", "text"],
        ["created", "text"],
        ["cmd", "text"]
    ]

    sql_create_table = "CREATE TABLE ReadCosts ("
    for i in range(len(columns)):
        name, type = columns[i]
        if i != 0:
            sql_create_table += ", "
        sql_create_table += ("%s %s" % (name, type))
    sql_create_table += ");"
    sql_execute(sql_db, sql_create_table)


"""
"""
def write_analysis_data(sql_db, genome_name, database_name):
    if not os.path.exists(sql_db):
        return

    aligners = []
    sql_aligners = "SELECT aligner FROM ReadCosts GROUP BY aligner"
    output = sql_execute(sql_db, sql_aligners)
    aligners = output.split()

    can_read_types = ["all", "M", "2M_gt_15", "2M_8_15", "2M_1_7", "gt_2M"]
    tmp_read_types = []
    sql_types = "SELECT type FROM ReadCosts GROUP BY type"
    output = sql_execute(sql_db, sql_types)
    tmp_read_types = output.split()

    read_types = []
    for read_type in can_read_types:
        if read_type in tmp_read_types:
            read_types.append(read_type)

    for paired in [False, True]:
        database_fname = genome_name + "_" + database_name
        if paired:
            end_type = "paired"
            database_fname += "_paired"
        else:
            end_type = "single"
            database_fname += "_single"
        database_fname += ".analysis"
        database_file = open(database_fname, "w")
        print >> database_file, "end_type\ttype\taligner\tnum_reads\ttime\tmem\tmapped_reads\tunique_mapped_reads\tunmapped_reads\tmapping_point\ttrue_gtf_junctions\ttemp_junctions\ttemp_gtf_junctions"
        for aligner in aligners:
            for read_type in read_types:
                sql_row = "SELECT end_type, type, aligner, num_reads, time, mem, mapped_reads, unique_mapped_reads, unmapped_reads, mapping_point, snp_mapped_reads, snp_unique_mapped_reads, snp_unmapped_reads, true_gtf_junctions, temp_junctions, temp_gtf_junctions FROM ReadCosts"
                sql_row += " WHERE genome = '%s' and head = '%s' and aligner = '%s' and type = '%s' and end_type = '%s' ORDER BY created DESC LIMIT 1" % (genome_name, database_name, aligner, read_type, end_type)
                output = sql_execute(sql_db, sql_row)
                if output:
                    print >> database_file, output

        database_file.close()


"""
"""
def calculate_read_cost(single_end,
                        paired_end,
                        test_aligners,
                        fresh,
                        runtime_only,
                        baseChange,
                        verbose):
    sql_db_name = "analysis.db"
    if not os.path.exists(sql_db_name):
        create_sql_db(sql_db_name)

    num_cpus = multiprocessing.cpu_count()
    if num_cpus > 10:
        num_threads = min(10, num_cpus)
        desktop = False
    else:
        #num_threads = min(4, num_cpus)
        num_threads = num_cpus - 1
        desktop = True

    data_base = "sim"
    test_large_index = False
    verbose = False
    sql_write = True
    readtypes = ["all", "M", "2M_gt_15", "2M_8_15", "2M_1_7", "gt_2M"]

    aligners = [
        # [aligner, two_step, index_type, aligner_version, addition_options]
        # ["hisat", "", "", "", ""],
        # ["hisat2", "", "", "204", ""],
        # ["hisat2", "", "", "210", ""],
        # ["hisat2", "", "snp", "203b", ""],
        # ["hisat2", "", "snp", "210", ""],
        # ["hisat2", "", "tran", "210", ""],
        # ["hisat2", "", "snp_tran", "210", ""],
        # ["hisat2", "", "", "210", ""],
        ["hisat2", "", "", "", ""],
        #["hisat2", "", "rep", "", ""],
        # ["hisat2", "", "rep-100-300", "", ""],
        # ["hisat2", "", "rep_mm", "", ""],
        # ["hisat2", "", "", "", "--sensitive"],
        # ["hisat2", "", "rep", "", "--sensitive"],
        # ["hisat2", "", "", "", "--very-sensitive"],
        # ["hisat2", "", "snp", "", ""],
        # ["hisat2", "", "snp", "", "--sensitive"],
        # ["hisat2", "", "snp_noht", "", ""],
        # ["hisat2", "x2", "", "", ""],
        # ["hisat2", "x1", "tran", "", ""],
        # ["hisat2", "", "tran", "", ""],
        # ["hisat2", "", "snp_tran", "", ""],
        # ["hisat2", "x1", "snp_tran", "", ""],
        # ["hisat2", "x1", "snp_tran_ercc", "", ""],
        # ["tophat2", "gtfonly", "", "", ""],
        # ["tophat2", "gtf", "", "", ""],
        #["star", "", "", "", ""],
        # ["star", "x2", "", "", ""],
        # ["star", "gtf", "", "", ""],
        # ["bowtie", "", "", "", ""],
        #["bowtie2", "", "", "", ""],
        # ["bowtie2", "", "", "", "-k 10"],
        # ["bowtie2", "", "", "", "-k 1000 --extends 2000"],
        # ["gsnap", "", "", "", ""],
        # ["bwa", "mem", "", "", ""],
        #["bwa", "mem", "", "", "-a"],
        # ["hisat2", "", "snp", "", ""],
        # ["hisat2", "", "tran", "", ""],
        # ["hisat2", "", "snp_tran", "", ""],
        # ["vg", "", "", "", ""],
        # ["vg", "", "", "", "-M 10"],
        # ["vg", "", "snp", "", ""],
        # ["vg", "", "snp", "", "-M 10"],
        # ["minimap2", "", "", "", ""],
        ["hisat-TLA", "", "", "", ""],
    ]
    readtypes = ["all"]
    verbose = True
    debug = False
    # sql_write = False

    cwd = os.getcwd()
    if len(cwd.split("reads_")) > 1:
        genome = cwd.split("reads_")[1].split("_")[0]
    else:
        genome = "genome"
    RNA = (cwd.find("RNA") != -1)

    test_small = (genome != "genome")

    if runtime_only:
        verbose = True

    chr_dic = read_genome("../../data/%s.fa" % genome)
    gtf_junctions = extract_splice_sites("../../data/%s.gtf" % genome)
    repeat_db, repeat_map = read_repeatdb("../../data/%s_rep.rep.info" % genome)
    align_stat = []
    for paired in [False, True]:
        if not paired and not single_end:
            continue
        if paired and not paired_end:
            continue
        for readtype in readtypes:
            if paired:
                base_fname = data_base + "_paired"
                type_sam_fname = base_fname + "_" + readtype + ".sam"
                type_read1_fname = base_fname +  "_1_" + readtype + ".fa"
                type_read2_fname = base_fname +  "_2_" + readtype + ".fa"
                type_junction_fname = base_fname + "_" + readtype + ".junc"
            else:
                base_fname = data_base + "_single"
                type_sam_fname = base_fname + "_" + readtype + ".sam"
                type_read1_fname = base_fname + "_" + readtype + ".fa"
                type_read2_fname = ""
                type_junction_fname = base_fname + "_" + readtype + ".junc"

            assert os.path.exists(type_sam_fname) and os.path.exists(type_junction_fname)
            numreads = 0
            type_sam_file = open(type_sam_fname)
            for line in type_sam_file:
                numreads += 1
            type_sam_file.close()
            if numreads <= 0:
                continue
            print >> sys.stderr, "%s\t%d" % (readtype, numreads)

            junctions, junctions_set = [], set()
            type_junction_file = open(type_junction_fname)
            for line in type_junction_file:
                chr, left, right = line[:-1].split()
                left, right = int(left), int(right)
                junctions.append([chr, left, right])
                junctions_set.add(to_junction_str([chr, left, right]))

            type_junction_file.close()

            aligner_bin_base = "../../../aligners/bin"
            def get_aligner_version(aligner, version):
                version = ""
                if aligner == "hisat2" or \
                        aligner == "hisat" or \
                        aligner == "bowtie" or \
                        aligner == "bowtie2":
                    if version:
                        cmd = ["%s/%s_%s/%s" % (aligner_bin_base, aligner, version, aligner)]
                    else:
                        cmd = ["%s/%s" % (aligner_bin_base, aligner)]
                    cmd += ["--version"]
                    cmd_process = subprocess.Popen(cmd, stdout=subprocess.PIPE)
                    version = cmd_process.communicate()[0][:-1].split("\n")[0]
                    version = version.split()[-1]
                elif aligner == "tophat2":
                    cmd = ["%s/tophat" % (aligner_bin_base)]
                    cmd += ["--version"]
                    cmd_process = subprocess.Popen(cmd, stdout=subprocess.PIPE)
                    version = cmd_process.communicate()[0][:-1].split()[-1]
                elif aligner in ["star", "starx2"]:
                    version = "2.4.2a"
                elif aligner == "gsnap":
                    cmd = ["%s/gsnap" % (aligner_bin_base)]
                    cmd_process = subprocess.Popen(cmd, stderr=subprocess.PIPE)
                    version = cmd_process.communicate()[1][:-1].split("\n")[0]
                    version = version.split()[2]
                elif aligner == "bwa":
                    if version:
                        cmd = ["%s/bwa_%s/bwa" % (aligner_bin_base, version)]
                    else:
                        cmd = ["%s/bwa" % (aligner_bin_base)]
                    cmd_process = subprocess.Popen(cmd, stderr=subprocess.PIPE)
                    version = cmd_process.communicate()[1][:-1].split("\n")[2]
                    version = version.split()[1]
                elif aligner == "vg":
                    cmd = ["%s/vg" % (aligner_bin_base)]
                    cmd_process = subprocess.Popen(cmd, stderr=subprocess.PIPE)
                    version = cmd_process.communicate()[1][:-1].split("\n")[0]
                    version = version.split()[5]
                elif aligner == "minimap2":
                    cmd = ["%s/minimap2" % (aligner_bin_base)]
                    cmd += ["--version"]
                    cmd_process = subprocess.Popen(cmd, stdout=subprocess.PIPE)
                    version = cmd_process.communicate()[0][:-1].split("\n")[0]

                return version

            index_base = "../../../indexes"
            index_add = ""
            if genome != "genome":
                index_add = "_" + genome
            def get_aligner_cmd(RNA, aligner, type, index_type, version, options, read1_fname, read2_fname, out_fname, cmd_idx = 0):
                cmd = ["/usr/bin/time"]
                if osx_mode:
                    cmd += ['-l']
                if aligner == "hisat2":
                    if version:
                        cmd += ["%s/hisat2_%s/hisat2" % (aligner_bin_base, version)]
                    else:
                        cmd += ["%s/hisat2" % (aligner_bin_base)]
                    if num_threads > 1:
                        cmd += ["-p", str(num_threads)]
                    cmd += ["-f"]
                    # cmd += ["-k", "100"]
                    # cmd += ["--max-seeds", "100"]
                    # cmd += ["--score-min", "C,-100"]
                    # cmd += ["--pen-cansplice", "0"]
                    # cmd += ["--pen-noncansplice", "12"]
                    # cmd += ["--pen-intronlen", "G,-8,1"]
                    # cmd += ["--metrics", "1",
                    #        "--metrics-file", "metrics.out"]

                    if not RNA:
                        cmd += ["--no-spliced-alignment"]

                    if type in ["x1", "x2"]:
                        cmd += ["--no-temp-splicesite"]

                    # cmd += ["--no-anchorstop"]
                    if version == "" or \
                            (version != "" and int(version) >= 210):
                        cmd += ["--new-summary",
                                "--summary-file", out_fname + ".summary"]

                    if version == "" or int(version) >= 220:
                        cmd += ["--repeat"]

                    # cmd += ["--dta"]
                    # cmd += ["--dta-cufflinks"]

                    if options != "":
                        cmd += options.split(' ')

                    if type == "x2":
                        if cmd_idx == 0:
                            cmd += ["--novel-splicesite-outfile"]
                        else:
                            cmd += ["--novel-splicesite-infile"]
                        cmd += ["splicesites.txt"]

                    # "--novel-splicesite-infile",
                    # "../splicesites.txt",
                    # "--rna-strandness",
                    # "FR",
                    if version:
                        index_cmd = "%s/HISAT2_%s%s/" % (index_base, version, index_add) + genome
                    else:
                        index_cmd = "%s/HISAT2%s/" % (index_base, index_add) + genome
                    if index_type:
                        index_cmd += ("_" + index_type)
                    cmd += [index_cmd]
                    if paired:
                        cmd += ["-1", read1_fname,
                                "-2", read2_fname]
                    else:
                        cmd += [read1_fname]

                elif aligner == "hisat-TLA":

                    if version:
                        cmd += ["%s/hisat2_%s/hisat2" % (aligner_bin_base, version)]
                    else:
                        cmd += ["%s/hisat2" % (aligner_bin_base)]
                    if num_threads > 1:
                        cmd += ["-p", str(num_threads)]
                    cmd += ["-f"]
                    if not RNA:
                        cmd += ["--no-spliced-alignment"]
                    if type in ["x1", "x2"]:
                        cmd += ["--no-temp-splicesite"]
                    if version == "" or \
                            (version != "" and int(version) >= 210):
                        cmd += ["--new-summary",
                                "--summary-file", out_fname + ".summary"]
                    if version == "" or int(version) >= 220:
                        cmd += ["--repeat"]
                    if paired:
                        cmd += ["-1", read1_fname,
                                "-2", read2_fname]
                    else:
                        cmd += ["-U", read1_fname]

                    cmd += ["--TLA"]
                    cmd += ["--base-change"]
                    cmd += [baseChange]
                    cmd += ["--expand-repeat"]

                    index1_cmd = "%s/HISAT2%s/" % (index_base, index_add) + genome + '_' + baseChange
                    if index_type:
                        index1_cmd += ("_" + index_type)
                    cmd += ["--index1"]
                    cmd += [index1_cmd]

                    index2_cmd = "%s/HISAT2%s/" % (index_base, index_add) + genome + '_' + reverse_complement(baseChange[0]) + reverse_complement(baseChange[1])
                    if index_type:
                        index2_cmd += ("_" + index_type)
                    cmd += ["--index2"]
                    cmd += [index2_cmd]

                    cmd += ["--reference"]
                    cmd += ["../../../data/%s.fa" % genome]


                elif aligner == "hisat":
                    cmd += ["%s/hisat" % (aligner_bin_base)]
                    if num_threads > 1:
                        cmd += ["-p", str(num_threads)]
                    cmd += ["-f"]
                    # cmd += ["-k", "5"]
                    # cmd += ["--score-min", "C,-18"]
                    if version != "":
                        version = int(version)
                    else:
                        version = sys.maxint

                    if not RNA:
                        cmd += ["--no-spliced-alignment"]

                    if type in ["x1", "x2"]:
                        cmd += ["--no-temp-splicesite"]

                    """
                    cmd += ["--rdg", "100,100",
                            "--rfg", "100,100"]
                    """

                    if type == "x2":
                        if cmd_idx == 0:
                            cmd += ["--novel-splicesite-outfile"]
                        else:
                            cmd += ["--novel-splicesite-infile"]
                        cmd += ["splicesites.txt"]

                    # "--novel-splicesite-infile",
                    # "../splicesites.txt",
                    # "--rna-strandness",
                    # "FR",
                    cmd += ["%s/HISAT%s/" % (index_base, index_add) + genome]
                    if paired:
                        cmd += ["-1", read1_fname,
                                "-2", read2_fname]
                    else:
                        cmd += [read1_fname]
                elif aligner == "tophat2":
                    cmd += ["%s/tophat" % (aligner_bin_base)]
                    if num_threads > 1:
                        cmd += ["-p", str(num_threads)]
                    if type.find("gtf") != -1:
                        cmd += ["--transcriptome-index=%s/HISAT%s/gtf/%s" % (index_base, index_add, genome)]
                    if type == "gtfonly":
                        cmd += ["--transcriptome-only"]
                    cmd += ["--read-edit-dist", "3"]
                    cmd += ["--no-sort-bam"]
                    cmd += ["--read-realign-edit-dist", "0"]
                    cmd += ["--keep-tmp",
                            "%s/HISAT%s/" % (index_base, index_add) + genome,
                            read1_fname]
                    if paired:
                        cmd += [read2_fname]
                elif aligner == "star":
                    cmd += ["%s/STAR" % (aligner_bin_base)]
                    if num_threads > 1:
                        cmd += ["--runThreadN", str(num_threads)]
                    cmd += ["--genomeDir"]
                    if cmd_idx == 0:
                        if type == "gtf":
                            cmd += ["%s/STAR%s/gtf" % (index_base, index_add)]
                        else:
                            cmd += ["%s/STAR%s" % (index_base, index_add)]
                    else:
                        assert cmd_idx == 1
                        cmd += ["."]

                    if desktop:
                        cmd += ["--genomeLoad", "NoSharedMemory"]
                    else:
                        cmd += ["--genomeLoad", "LoadAndKeep"]
                    if type == "x2":
                        if cmd_idx == 1:
                            cmd += ["--alignSJDBoverhangMin", "1"]
                    cmd += ["--readFilesIn",
                            read1_fname]
                    if paired:
                        cmd += [read2_fname]
                    if paired:
                        cmd += ["--outFilterMismatchNmax", "6"]
                    else:
                        cmd += ["--outFilterMismatchNmax", "3"]
                elif aligner == "bowtie":
                    cmd += ["%s/bowtie" % (aligner_bin_base)]
                    if num_threads > 1:
                        cmd += ["-p", str(num_threads)]
                    cmd += ["-f",
                            "--sam",
                            "-k", "10"]
                    cmd += ["-n", "3"]
                    if paired:
                        cmd += ["-X", "500"]
                    cmd += ["%s/Bowtie%s/" % (index_base, index_add) + genome]
                    if paired:
                        cmd += ["-1", read1_fname,
                                "-2", read2_fname]
                    else:
                        cmd += [read1_fname]
                elif aligner == "bowtie2":
                    if version:
                        cmd += ["%s/bowtie2_%s/bowtie2" % (aligner_bin_base, version)]
                    else:
                        cmd += ["%s/bowtie2" % (aligner_bin_base)]
                    if num_threads > 1:
                        cmd += ["-p", str(num_threads)]
                    cmd += ["-f"]
                    if options != "":
                        cmd += options.split(' ')
                    if version:
                        cmd += ["-x %s/Bowtie2_%s%s/" % (index_base, version, index_add) + genome]
                    else:
                        cmd += ["-x %s/Bowtie2%s/" % (index_base, index_add) + genome]
                    if paired:
                        cmd += ["-1", read1_fname,
                                "-2", read2_fname]
                    else:
                        cmd += [read1_fname]
                elif aligner == "gsnap":
                    cmd += ["%s/gsnap" % (aligner_bin_base),
                            "-A",
                            "sam"]
                    if num_threads > 1:
                        cmd += ["-t", str(num_threads)]
                    cmd += ["--max-mismatches=3",
                            "-D", "%s/GSNAP%s" % (index_base, index_add),
                            "-N", "1",
                            "-d", genome,
                            read1_fname]
                    if paired:
                        cmd += [read2_fname]
                elif aligner == "bwa":
                    if version:
                        cmd += ["%s/bwa_%s/bwa" % (aligner_bin_base, version)]
                    else:
                        cmd += ["%s/bwa" % (aligner_bin_base)]
                    if type in ["mem", "aln"]:
                        cmd += [type]
                    elif type == "sw":
                        cmd += ["bwa" + type]
                    if num_threads > 1:
                        cmd += ["-t", str(num_threads)]
                    if options != "":
                        cmd += options.split(' ')
                    # if paired:
                    #    cmd += ["-T", "60"]
                    if version:
                        cmd += ["%s/BWA_%s%s/%s.fa" % (index_base, version, index_add, genome)]
                    else:
                        cmd += ["%s/BWA%s/%s.fa" % (index_base, index_add, genome)]
                    cmd += [read1_fname]
                    if paired:
                        cmd += [read2_fname]
                elif aligner == "vg":
                    # vg map -d 22 -t 6 -M 10 -f ../sim-1.fa -f ../sim-2.fa > result.sam.gam
                    cmd += ["%s/vg" % (aligner_bin_base)]
                    cmd += ["map"]
                    cmd += ["-t", str(num_threads)]
                    cmd += ["--surject-to", "sam"]
                    index_cmd = "%s/VG%s/" % (index_base, index_add) + genome
                    if index_type:
                        index_cmd += ("_" + index_type)

                    if options != "":
                        cmd += options.split(' ')

                    cmd += ["-d", index_cmd]

                    cmd += ["-f", read1_fname]
                    if paired:
                        cmd += ["-f", read2_fname]
                elif aligner == "minimap2":
                    # minimap2 -a -x sr 22.mmi sim_1.fa sim_2.fa > result.sam
                    cmd += ["%s/minimap2" % (aligner_bin_base)]
                    cmd += ["-a"]
                    cmd += ["-x", "sr"]
                    index_cmd = "%s/minimap2%s/" % (index_base, index_add) + genome
                    if index_type:
                        index_cmd += ("_" + index_type)
                    index_cmd += ".mmi"
                    cmd += [index_cmd]
                    cmd += [read1_fname]
                    if paired:
                        cmd += [read2_fname]
                else:
                    assert False

                return cmd

            for aligner, type, index_type, version, options in aligners:
                skip = False
                if len(test_aligners) > 0:
                    skip = True
                    for test_aligner in test_aligners:
                        if aligner == test_aligner:
                            skip = False
                if skip:
                    continue

                aligner_name = aligner + type
                if version != "":
                    aligner_name += ("_%s" % version)
                if aligner == "hisat2" and index_type != "":
                    aligner_name += ("_" + index_type)
                if aligner == "vg" and index_type != "":
                    aligner_name += ("_" + index_type)

                two_step = (aligner == "tophat2" or type == "x2" or (aligner in ["hisat2", "hisat"] and type == ""))
                if RNA and readtype != "M":
                    if aligner in ["bowtie", "bowtie2", "bwa"]:
                        continue
                if readtype != "all":
                    if two_step:
                        continue
                if not RNA and readtype != "all":
                    continue

                print >> sys.stderr, "\t%s\t%s" % (aligner_name, str(datetime.now()))
                if options != "":
                    option_name = options.replace(' ', '').replace('-', '').replace(',', '')
                    aligner_name = aligner_name + '_' + option_name
                if paired:
                    aligner_dir = aligner_name + "_paired"
                else:
                    aligner_dir = aligner_name + "_single"

                if fresh and os.path.exists(aligner_dir):
                    os.system("rm -rf %s" % aligner_dir)

                if not os.path.exists(aligner_dir):
                    os.mkdir(aligner_dir)

                os.chdir(aligner_dir)

                out_fname = base_fname + "_" + readtype + ".sam"
                out_fname2 = out_fname + "2"
                duration = -1.0
                mem_usage = '0'
                if not os.path.exists(out_fname):
                    if not os.path.exists("../one.fa") or not os.path.exists("../two.fa"):
                        os.system("head -400 ../%s_1.fa > ../one.fa" % (data_base))
                        os.system("head -400 ../%s_2.fa > ../two.fa" % (data_base))

                    if runtime_only:
                        out_fname = "/dev/null"
                        out_fname2 = "/dev/null"

                    if not two_step:
                        align_stat.append([readtype, aligner_name])

                    # dummy commands for caching index and simulated reads
                    loading_time = 0
                    if aligner != "tophat2":
                        for i in range(3):
                            dummy_cmd = get_aligner_cmd(RNA, aligner, type, index_type, version, options, "../one.fa", "../two.fa", "/dev/null")
                            start_time = datetime.now()
                            if verbose:
                                print >> sys.stderr, start_time, "\t", " ".join(dummy_cmd)
                            if aligner in ["hisat2", "hisat", "bowtie", "bowtie2", "gsnap", "bwa"]:
                                proc = subprocess.Popen(dummy_cmd, stdout=open("/dev/null", "w"), stderr=subprocess.PIPE)
                            else:
                                proc = subprocess.Popen(dummy_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                            proc.communicate()
                            finish_time = datetime.now()
                            duration = finish_time - start_time
                            duration = duration.total_seconds()
                            if verbose:
                                print >> sys.stderr, finish_time, "duration:", duration
                            loading_time = duration

                    # Align all reads
                    aligner_cmd = get_aligner_cmd(RNA, aligner, type, index_type, version, options, "../" + type_read1_fname, "../" + type_read2_fname, out_fname)
                    start_time = datetime.now()
                    if verbose:
                        print >> sys.stderr, "\t", start_time, " ".join(aligner_cmd)
                    if aligner in ["hisat2", "hisat", "bowtie", "bowtie2", "gsnap", "bwa", "vg", "minimap2", "hisat-TLA"]:
                        proc = subprocess.Popen(aligner_cmd, stdout=open(out_fname, "w"), stderr=subprocess.PIPE)
                    else:
                        proc = subprocess.Popen(aligner_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                    _, mem_usage = proc.communicate()
                    mem_usage = parse_mem_usage(mem_usage)
                    finish_time = datetime.now()
                    duration = finish_time - start_time
                    duration = duration.total_seconds() - loading_time
                    if duration < 0.1:
                        duration = 0.1
                    if verbose:
                        print >> sys.stderr, "\t", finish_time, "finished:", duration

                    if debug and aligner == "hisat2":
                        os.system("cat metrics.out")
                        print >> sys.stderr, "\ttime: %.4f" % (duration)

                    if aligner == "star" and type in ["", "gtf"]:
                        os.system("mv Aligned.out.sam %s" % out_fname)
                    elif aligner in ["hisat2", "hisat"] and type == "x2":
                        aligner_cmd = get_aligner_cmd(RNA, aligner, type, index_type, version, options, "../" + type_read1_fname, "../" + type_read2_fname, out_fname, 1)
                        start_time = datetime.now()
                        if verbose:
                            print >> sys.stderr, "\t", start_time, " ".join(aligner_cmd)
                        proc = subprocess.Popen(aligner_cmd, stdout=open(out_fname, "w"), stderr=subprocess.PIPE)
                        proc.communicate()
                        finish_time = datetime.now()
                        duration += (finish_time - start_time).total_seconds()
                        duration -= loading_time
                        if duration < 0.1:
                            duration = 0.1
                        if verbose:
                            print >> sys.stderr, "\t", finish_time, "finished:", duration
                    elif aligner == "star" and type == "x2":
                        assert os.path.exists("SJ.out.tab")
                        os.system("awk 'BEGIN {OFS=\"\t\"; strChar[0]=\".\"; strChar[1]=\"+\"; strChar[2]=\"-\";} {if($5>0){print $1,$2,$3,strChar[$4]}}' SJ.out.tab > SJ.out.tab.Pass1.sjdb")
                        for file in os.listdir("."):
                            if file in ["SJ.out.tab.Pass1.sjdb", "genome.fa"]:
                                continue
                            os.remove(file)
                        star_index_cmd = "%s/STAR --genomeDir ./ --runMode genomeGenerate --genomeFastaFiles ../../../data/%s.fa --sjdbFileChrStartEnd SJ.out.tab.Pass1.sjdb --sjdbOverhang 99 --runThreadN %d" % (aligner_bin_base, genome, num_threads)
                        if verbose:
                            print >> sys.stderr, "\t", datetime.now(), star_index_cmd
                        os.system(star_index_cmd)
                        if verbose:
                            print >> sys.stderr, "\t", datetime.now(), " ".join(dummy_cmd)
                        proc = subprocess.Popen(dummy_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                        proc.communicate()
                        if verbose:
                            print >> sys.stderr, "\t", datetime.now(), "finished"
                        aligner_cmd = get_aligner_cmd(RNA, aligner, type, index_type, version, options, "../" + type_read1_fname, "../" + type_read2_fname, out_fname, 1)
                        start_time = datetime.now()
                        if verbose:
                            print >> sys.stderr, "\t", start_time, " ".join(aligner_cmd)
                        proc = subprocess.Popen(aligner_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                        proc.communicate()
                        finish_time = datetime.now()
                        duration += (finish_time - start_time).total_seconds()
                        duration -= loading_time
                        if duration < 0.1:
                            duration = 0.1
                        if verbose:
                            print >> sys.stderr, "\t", finish_time, "finished:", duration
                        os.system("mv Aligned.out.sam %s" % out_fname)
                    elif aligner == "tophat2":
                        os.system("samtools sort -n tophat_out/accepted_hits.bam accepted_hits; samtools view -h accepted_hits.bam > %s" % out_fname)
                    elif aligner == "vg":
                        index_name = "%s/VG%s/" % (index_base, index_add) + genome
                        if index_type:
                            index_name += ("_" + index_type)

                    if aligner in ["gsnap", "tophat2"]:
                        os.system("tar cvzf %s.tar.gz %s &> /dev/null" % (out_fname, out_fname))

                if runtime_only:
                    print >> sys.stderr, "\t\t\tMemory Usage: %dMB" % (int(mem_usage) / 1024)
                    os.chdir("..")
                    continue

                if not os.path.exists(out_fname2):
                    debug_dic = {}
                    pid_list = []
                    if paired:
                        if mp_mode:
                            for i in xrange(mp_num):
                                p = Process(target=extract_pair, args=(out_fname, out_fname2, chr_dic, RNA, aligner, version, repeat_db, repeat_map, debug_dic, i))
                                pid_list.append(p)
                                p.start()

                            for p in pid_list:
                                p.join()

                            # merge
                            os.system("mv %s %s" % (out_fname2 + ".0", out_fname2))
                            for i in xrange(1, mp_num):
                                os.system("cat %s >> %s" % (out_fname2 + "." + str(i), out_fname2))
                                os.system("rm %s" % (out_fname2 + "." + str(i)))

                        else:
                            extract_pair(out_fname, out_fname2, chr_dic, RNA, aligner, version, repeat_db, repeat_map, debug_dic, -1)


                    else:
                        if mp_mode:
                            # Prepare queues
                            for i in xrange(mp_num):
                                p = Process(target=extract_single, args=(out_fname, out_fname2, chr_dic, aligner, version, repeat_db, repeat_map, debug_dic, i))
                                pid_list.append(p)
                                p.start()

                            # wait
                            for p in pid_list:
                                p.join()

                            # merge
                            os.system("mv %s %s" % (out_fname2 + ".0", out_fname2))
                            for i in xrange(1, mp_num):
                                os.system("cat %s >> %s" % (out_fname2 + "." + str(i), out_fname2))
                                os.system("rm %s" % (out_fname2 + "." + str(i)))

                        else:
                            extract_single(out_fname, out_fname2, chr_dic, aligner, version, repeat_db, repeat_map, debug_dic, -1)

                for readtype2 in readtypes:
                    if not two_step and readtype != readtype2:
                        continue

                    type_sam_fname2 = base_fname + "_" + readtype2 + ".sam"
                    if os.path.exists(type_sam_fname2 + ".done"):
                        continue

                    if paired:
                        type_read_fname2 = base_fname + "_1_" + readtype2 + ".fa"
                    else:
                        type_read_fname2 = base_fname + "_" + readtype2 + ".fa"
                    mapped_id_fname = base_fname + "_" + readtype2 + ".read_id"
                    if paired:
                        mapped, unique_mapped, first_mapped, unmapped, aligned, multi_aligned, \
                        snp_mapped, snp_unique_mapped, snp_first_mapped, snp_unmapped, \
                        temp_junctions, temp_gtf_junctions, mapping_point \
                            = compare_paired_sam(RNA, out_fname2, "../" + type_sam_fname2, mapped_id_fname, chr_dic, junctions, junctions_set, gtf_junctions, aligner)
                    else:
                        mapped, unique_mapped, first_mapped, unmapped, aligned, multi_aligned, \
                        snp_mapped, snp_unique_mapped, snp_first_mapped, snp_unmapped, \
                        temp_junctions, temp_gtf_junctions, mapping_point \
                            = compare_single_sam(RNA, out_fname2, "../" + type_sam_fname2, mapped_id_fname, chr_dic, junctions, junctions_set, gtf_junctions, aligner)
                    proc = subprocess.Popen(["wc", "-l", "../" + type_read_fname2], stdout=subprocess.PIPE)
                    out = proc.communicate()[0]
                    numreads = int(out.split()[0]) / 2
                    assert mapped + unmapped == numreads

                    if two_step:
                        print >> sys.stderr, "\t\t%s" % readtype2
                    print >> sys.stderr, "\t\taligned: %d, multi aligned: %d" % (aligned, multi_aligned)
                    print >> sys.stderr, "\t\tcorrectly mapped: %d (%.2f%%) mapping_point: %.2f" % (mapped, float(mapped) * 100.0 / numreads, mapping_point * 100.0 / numreads)
                    print >> sys.stderr, "\t\tcorrectly mapped at first: %d (%.2f%%)" % (first_mapped, float(first_mapped) * 100.0 / numreads)
                    print >> sys.stderr, "\t\tuniquely and correctly mapped: %d (%.2f%%)" % (unique_mapped, float(unique_mapped) * 100.0 / numreads)
                    snp_numreads = snp_mapped + snp_unmapped
                    if snp_numreads > 0:
                        print >> sys.stderr, "\t\t\t\tSNP: reads: %d" % (snp_numreads)
                        print >> sys.stderr, "\t\t\t\tSNP: correctly mapped: %d (%.2f%%)" % (snp_mapped, float(snp_mapped) * 100.0 / snp_numreads)
                        print >> sys.stderr, "\t\t\t\tSNP: correctly mapped at first: %d (%.2f%%)" % (snp_first_mapped, float(snp_first_mapped) * 100.0 / snp_numreads)
                        print >> sys.stderr, "\t\t\t\tSNP: uniquely and correctly mapped: %d (%.2f%%)" % (snp_unique_mapped, float(snp_unique_mapped) * 100.0 / snp_numreads)
                    if readtype == readtype2:
                        print >> sys.stderr, "\t\t\t%d reads per sec (all)" % (numreads / max(1.0, duration))
                    if RNA:
                        print >> sys.stderr, "\t\tjunc. sensitivity %d / %d (%.2f%%), junc. accuracy: %d / %d (%.2f%%)" % \
                                             (temp_gtf_junctions, len(junctions), float(temp_gtf_junctions) * 100.0 / max(1, len(junctions)), \
                                              temp_gtf_junctions, temp_junctions, float(temp_gtf_junctions) * 100.0 / max(1, temp_junctions))

                    print >> sys.stderr, "\t\t\tMemory Usage: %dMB" % (int(mem_usage) / 1024)

                    if duration > 0.0:
                        if sql_write and os.path.exists("../" + sql_db_name):
                            if paired:
                                end_type = "paired"
                            else:
                                end_type = "single"

                            mem_used = int(mem_usage) / 1024
                            sql_insert = "INSERT INTO \"ReadCosts\" VALUES(NULL, '%s', '%s', '%s', '%s', '%s', '%s', %d, %d, %d, %d, %f, %f, %d, %d, %d, %d, %d, %d, %d, '%s', datetime('now', 'localtime'), '%s');" % \
                                         (genome, data_base, end_type, readtype2, aligner_name, get_aligner_version(aligner, version), numreads, mapped, unique_mapped, unmapped, mapping_point, snp_mapped, snp_unique_mapped, snp_unmapped, duration, mem_used, len(junctions), temp_junctions, temp_gtf_junctions, platform.node(), " ".join(aligner_cmd))
                            sql_execute("../" + sql_db_name, sql_insert)

                        if two_step:
                            align_stat.append([readtype2, aligner_name, numreads, duration, mem_used, mapped, unique_mapped, unmapped, mapping_point, snp_mapped, snp_unique_mapped, snp_unmapped, len(junctions), temp_junctions, temp_gtf_junctions])
                        else:
                            align_stat[-1].extend([numreads, duration, mem_used, mapped, unique_mapped, unmapped, mapping_point, snp_mapped, snp_unique_mapped, snp_unmapped, len(junctions), temp_junctions, temp_gtf_junctions])

                    os.system("touch %s.done" % type_sam_fname2)

                os.chdir("..")

    print >> sys.stdout, "\t".join(["type", "aligner", "all", "all_time", "mem", "mapped", "unique_mapped", "unmapped", "mapping point", "snp_mapped", "snp_unique_mapped", "snp_unmapped", "true_gtf_junctions", "temp_junctions", "temp_gtf_junctions"])
    for line in align_stat:
        outstr = ""
        for item in line:
            if outstr != "":
                outstr += "\t"
            outstr += str(item)
        print >> sys.stdout, outstr

    if os.path.exists(sql_db_name):
        write_analysis_data(sql_db_name, genome, data_base)


"""
"""
if __name__ == "__main__":
    parser = ArgumentParser(
        description='test HISAT2, and compare HISAT2 with other popular aligners such as TopHat2, STAR, Bowtie1/2, GSNAP, BWA-mem, etc.')
    parser.add_argument('--single-end',
                        dest='paired_end',
                        action='store_false',
                        help='run single-end only')
    parser.add_argument('--paired-end',
                        dest='single_end',
                        action='store_false',
                        help='run paired_end only')
    parser.add_argument('--aligner-list',
                        dest='aligner_list',
                        type=str,
                        default="",
                        help='comma-separated list of aligners (e.g. hisat2,bowtie2,bwa')
    parser.add_argument('--fresh',
                        dest='fresh',
                        action='store_true',
                        help='delete existing alignment related directories (e.g. hisat2_single)')
    parser.add_argument('--runtime-only', '--runtime',
                        dest='runtime_only',
                        action='store_true',
                        help='run programs without evaluation')
    parser.add_argument('-p', '--multi-process',
                        dest='mp_num',
                        action='store',
                        type=int,
                        default=1,
                        help='Use multiple process mode')
    parser.add_argument('--base-change',
                        dest='baseChange',
                        type=str,
                        default="",
                        help='base change for Hisat-TLA')
    parser.add_argument('-v', '--verbose',
                        dest='verbose',
                        action='store_true',
                        help='also print some statistics to stderr')

    args = parser.parse_args()

    aligners = []
    for aligner in args.aligner_list.split(','):
        if aligner == "":
            continue
        aligners.append(aligner)

    mp_num = args.mp_num
    mp_mode = (mp_num > 1)

    calculate_read_cost(args.single_end,
                        args.paired_end,
                        aligners,
                        args.fresh,
                        args.runtime_only,
                        args.baseChange,
                        args.verbose)