hisat-3n/evaluation/tests/one_snp_test/evaluate_one_snp_reads.py

#!/usr/bin/env python

import sys, os, subprocess
import multiprocessing
import string, re
import platform
from datetime import datetime, date, time
import copy
from argparse import ArgumentParser, FileType


"""
"""
def evaluate(read_fname,
             verbose):
    aligners = [
        ["hisat2", "", "", ""],
        ["hisat2", "", "snp", ""],
        ["bowtie2", "", "", ""],
        ]
    num_threads = 3

    cwd = os.getcwd()
    genome = "genome"
    align_stat = []
    # for paired in [False, True]:
    for paired in [False]:
        base_fname = "common_snp_reads"
        type_sam_fname = base_fname + ".sam"
        type_read1_fname = base_fname +  "_1.fa"
        type_read2_fname = base_fname +  "_2.fa"

        type_read1_fname = read_fname

        def get_aligner_version(aligner, version):
            version = ""
            if aligner == "hisat2" or \
                    aligner == "bowtie2":
                if version:
                    cmd = ["%s_%s/%s" % (aligner, version, aligner)]
                else:
                    cmd = ["%s/%s" % (aligner_bin_base, aligner)]
                cmd += ["--version"]
                cmd_process = subprocess.Popen(cmd, stdout=subprocess.PIPE)
                version = cmd_process.communicate()[0][:-1].split("\n")[0]
                version = version.split()[-1]
            elif aligner == "star":
                version = "2.4.2a"
            elif aligner == "gsnap":
                cmd = ["%s/gsnap" % (aligner_bin_base)]
                cmd_process = subprocess.Popen(cmd, stderr=subprocess.PIPE)
                version = cmd_process.communicate()[1][:-1].split("\n")[0]
                version = version.split()[2]
            elif aligner == "bwa":
                cmd = ["%s/bwa" % (aligner_bin_base)]
                cmd_process = subprocess.Popen(cmd, stderr=subprocess.PIPE)
                version = cmd_process.communicate()[1][:-1].split("\n")[2]
                version = version.split()[1]

            return version

        def get_aligner_cmd(aligner, type, index_type, version, read1_fname, read2_fname, out_fname, cmd_idx = 0):
            cmd = []
            if aligner == "hisat2":
                cmd = ["hisat2"]
                if num_threads > 1:
                    cmd += ["-p", str(num_threads)]
                cmd += ["-f"]
                cmd += ["--no-spliced-alignment"]
                if index_type:
                    index_cmd = "../grch38_snp_hisat2/genome_snp"
                else:
                    index_cmd = "../grch38_hisat2/genome"
                cmd += [index_cmd]
                if paired:
                    cmd += ["-1", read1_fname,
                            "-2", read2_fname]
                else:
                    cmd += [read1_fname]
            elif aligner == "star":
                cmd = ["%s/STAR" % (aligner_bin_base)]
                if num_threads > 1:
                    cmd += ["--runThreadN", str(num_threads)]
                cmd += ["--genomeDir"]
                if cmd_idx == 0:
                    if type == "gtf":
                        cmd += ["%s/STAR%s/gtf" % (index_base, index_add)]
                    else:
                        cmd += ["%s/STAR%s" % (index_base, index_add)]
                else:
                    assert cmd_idx == 1
                    cmd += ["."]

                if desktop:
                    cmd += ["--genomeLoad", "NoSharedMemory"]
                else:
                    cmd += ["--genomeLoad", "LoadAndKeep"]
                if type == "x2":
                    if cmd_idx == 1:
                        cmd += ["--alignSJDBoverhangMin", "1"]
                cmd += ["--readFilesIn",
                        read1_fname]
                if paired:
                    cmd += [read2_fname]
                if paired:
                    cmd += ["--outFilterMismatchNmax", "6"]
                else:
                    cmd += ["--outFilterMismatchNmax", "3"]
            elif aligner == "bowtie2":
                cmd = ["bowtie2"]
                if num_threads > 1:
                    cmd += ["-p", str(num_threads)]
                cmd += ["-f"]
                cmd += ["-x ../grch38_bowtie2/genome"]
                if paired:
                    cmd += ["-1", read1_fname,
                            "-2", read2_fname]
                else:
                    cmd += [read1_fname]
            elif aligner == "gsnap":
                cmd = ["%s/gsnap" % (aligner_bin_base),
                       "-A",
                       "sam"]
                if num_threads > 1:
                    cmd += ["-t", str(num_threads)]
                cmd += ["--max-mismatches=3",
                        "-D", "%s/GSNAP%s" % (index_base, index_add),
                        "-N", "1",
                        "-d", genome,
                        read1_fname]
                if paired:
                    cmd += [read2_fname]
            elif aligner == "bwa":
                cmd = ["%s/bwa" % (aligner_bin_base)]
                if type in ["mem", "aln"]:
                    cmd += [type]
                elif type == "sw":
                    cmd += ["bwa" + type]
                if num_threads > 1:
                    cmd += ["-t", str(num_threads)]
                cmd += ["%s/BWA%s/%s.fa" % (index_base, index_add, genome)]
                cmd += [read1_fname]
                if paired:
                    cmd += [read2_fname]
            else:
                assert False

            return cmd

        for aligner, type, index_type, version in aligners:
            aligner_name = aligner + type
            if version != "":
                aligner_name += ("_%s" % version)
            if aligner == "hisat2" and index_type != "":
                aligner_name += ("_" + index_type)
            two_step = (aligner == "tophat2" or type == "x2" or (aligner in ["hisat2", "hisat"] and type == ""))
            print >> sys.stderr, "\t%s\t%s" % (aligner_name, str(datetime.now()))
            if paired:
                aligner_dir = aligner_name + "_paired"
            else:
                aligner_dir = aligner_name + "_single"
            if not os.path.exists(aligner_dir):
                os.mkdir(aligner_dir)
            os.chdir(aligner_dir)

            out_fname = base_fname + ".sam"
            duration = -1.0

            # Align all reads
            aligner_cmd = get_aligner_cmd(aligner, type, index_type, version, "../" + type_read1_fname, "../" + type_read2_fname, out_fname)
            start_time = datetime.now()
            if verbose:
                print >> sys.stderr, "\t", start_time, " ".join(aligner_cmd)
            if aligner in ["hisat2", "hisat", "bowtie", "bowtie2", "gsnap", "bwa"]:
                proc = subprocess.Popen(aligner_cmd, stdout=open(out_fname, "w"), stderr=subprocess.PIPE)
            else:
                proc = subprocess.Popen(aligner_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            proc.communicate()
            finish_time = datetime.now()
            duration = finish_time - start_time
            duration = duration.total_seconds()
            if verbose:
                print >> sys.stderr, "\t", finish_time, "finished:", duration

            assert os.path.exists(out_fname)
            correct_reads, correct_multi_reads, num_reads = 0, 0, 0
            prev_read_id = None
            for line in open(out_fname):
                if line.startswith('@'):
                    continue
                read_id, flag, chr, pos, mapQ, cigar = line.split()[:6]
                if chr.startswith("chr"):
                    chr = chr[3:]
                pos = int(pos) - 1
                true_chr, true_pos, true_cigar = read_id.split('_')[1:4]
                true_pos = int(true_pos)

                if read_id != prev_read_id:
                    num_reads += 1

                if true_chr == chr and pos == true_pos and cigar == true_cigar:
                    correct_multi_reads += 1
                    if prev_read_id != read_id:
                        correct_reads += 1

                prev_read_id = read_id

            print >> sys.stderr, "\tfirst: %d / %d (%.2f%%)" % (correct_reads, num_reads, float(correct_reads)/num_reads*100)
            print >> sys.stderr, "\tall: %d / %d (%.2f%%)" % (correct_multi_reads, num_reads, float(correct_multi_reads)/num_reads*100)

            os.chdir("..")


"""
"""
if __name__ == "__main__":
    parser = ArgumentParser(
        description='test HISAT2, and compare HISAT2 with other popular aligners such as TopHat2, STAR, Bowtie1/2, GSNAP, BWA-mem, etc.')
    parser.add_argument('read_fname',
                        nargs='?',
                        type=str,
                        help='input read file')
    parser.add_argument('-v', '--verbose',
                        dest='verbose',
                        action='store_true',
                        help='also print some statistics to stderr')

    args = parser.parse_args()
    evaluate(args.read_fname,
             args.verbose)