hisat-3n/evaluation/tests/one_snp_test/evaluate_one_snp_reads.py

#!/usr/bin/env python

import sys, os, subprocess
import multiprocessing
import string, re
import platform
from datetime import datetime, date, time
import copy
from argparse import ArgumentParser, FileType


"""
"""
def evaluate(read_fname,
             verbose):
    """Align one read file with each configured aligner and report accuracy.

    For every entry of the built-in aligner table (HISAT2 on the plain
    index, HISAT2 on the SNP index, Bowtie2) the reads are aligned inside a
    per-aligner sub-directory of the current working directory
    (``<aligner>[_<index_type>]_single``), the SAM output is stored there as
    ``common_snp_reads.sam``, and every alignment record is compared with the
    true location taken from the read name.

    The read name is split on ``_`` and fields 1-3 are used as true
    chromosome, true position and true CIGAR.  A record is counted as correct
    when the chromosome (with any leading ``chr`` removed), the SAM position
    minus one, and the CIGAR string all equal those values.  Two numbers are
    written to stderr per aligner: ``first`` (reads whose first record is
    correct) and ``all`` (correct records, including secondary ones).

    Args:
        read_fname: Path of the read file.  A relative path is interpreted
            relative to the current working directory; the aligners are
            invoked with ``-f``.
        verbose: When true, also write the command line and timing of each
            aligner run to stderr.

    Raises:
        TypeError: If ``read_fname`` is None.
        OSError: If an aligner executable cannot be started.
        ValueError: If a read name does not carry the expected
            ``_``-separated truth fields.
    """
    # Fail before any directory is created instead of deep inside the loop.
    if read_fname is None:
        raise TypeError("read_fname must be a path to a read file, not None")

    # Each entry: [aligner, type, index_type, version]
    aligners = [
        ["hisat2", "", "", ""],
        ["hisat2", "", "snp", ""],
        ["bowtie2", "", "", ""],
        ]
    num_threads = 3
    genome = "genome"

    def in_run_dir(fname):
        """Return *fname* as seen from inside a per-aligner sub-directory."""
        # Absolute paths stay valid after chdir; only relative ones need "../".
        if os.path.isabs(fname):
            return fname
        return "../" + fname

    # for paired in [False, True]:
    for paired in [False]:
        base_fname = "common_snp_reads"
        type_read1_fname = read_fname
        type_read2_fname = base_fname + "_2.fa"

        def get_aligner_version(aligner, version):
            """Ask *aligner* for its version string (currently unused).

            NOTE(review): ``aligner_bin_base`` is not defined in the visible
            part of this file; calling this helper without it raises
            NameError.
            """
            reported = ""
            if aligner == "hisat2" or \
                    aligner == "bowtie2":
                # The original reset ``version`` to "" first, which made the
                # versioned-directory branch unreachable.
                if version:
                    cmd = ["%s_%s/%s" % (aligner, version, aligner)]
                else:
                    cmd = ["%s/%s" % (aligner_bin_base, aligner)]
                cmd += ["--version"]
                cmd_process = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                               universal_newlines=True)
                reported = cmd_process.communicate()[0][:-1].split("\n")[0]
                reported = reported.split()[-1]
            elif aligner == "star":
                reported = "2.4.2a"
            elif aligner == "gsnap":
                cmd = ["%s/gsnap" % (aligner_bin_base)]
                cmd_process = subprocess.Popen(cmd, stderr=subprocess.PIPE,
                                               universal_newlines=True)
                reported = cmd_process.communicate()[1][:-1].split("\n")[0]
                reported = reported.split()[2]
            elif aligner == "bwa":
                cmd = ["%s/bwa" % (aligner_bin_base)]
                cmd_process = subprocess.Popen(cmd, stderr=subprocess.PIPE,
                                               universal_newlines=True)
                reported = cmd_process.communicate()[1][:-1].split("\n")[2]
                reported = reported.split()[1]

            return reported

        def get_aligner_cmd(aligner, aln_type, index_type, version, read1_fname, read2_fname, out_fname, cmd_idx = 0):
            """Build the argv list used to run *aligner* on the given reads.

            Only the ``hisat2`` and ``bowtie2`` branches are reachable from
            the aligner table above.
            NOTE(review): the ``star``, ``gsnap`` and ``bwa`` branches use
            ``aligner_bin_base``, ``index_base``, ``index_add`` and
            ``desktop``, none of which is defined in the visible file.
            """
            cmd = []
            if aligner == "hisat2":
                cmd = ["hisat2"]
                if num_threads > 1:
                    cmd += ["-p", str(num_threads)]
                cmd += ["-f"]
                cmd += ["--no-spliced-alignment"]
                if index_type:
                    index_cmd = "../grch38_snp_hisat2/genome_snp"
                else:
                    index_cmd = "../grch38_hisat2/genome"
                cmd += [index_cmd]
                if paired:
                    cmd += ["-1", read1_fname,
                            "-2", read2_fname]
                else:
                    cmd += [read1_fname]
            elif aligner == "star":
                cmd = ["%s/STAR" % (aligner_bin_base)]
                if num_threads > 1:
                    cmd += ["--runThreadN", str(num_threads)]
                cmd += ["--genomeDir"]
                if cmd_idx == 0:
                    if aln_type == "gtf":
                        cmd += ["%s/STAR%s/gtf" % (index_base, index_add)]
                    else:
                        cmd += ["%s/STAR%s" % (index_base, index_add)]
                else:
                    assert cmd_idx == 1
                    cmd += ["."]

                if desktop:
                    cmd += ["--genomeLoad", "NoSharedMemory"]
                else:
                    cmd += ["--genomeLoad", "LoadAndKeep"]
                if aln_type == "x2":
                    if cmd_idx == 1:
                        cmd += ["--alignSJDBoverhangMin", "1"]
                cmd += ["--readFilesIn",
                        read1_fname]
                if paired:
                    cmd += [read2_fname]
                if paired:
                    cmd += ["--outFilterMismatchNmax", "6"]
                else:
                    cmd += ["--outFilterMismatchNmax", "3"]
            elif aligner == "bowtie2":
                cmd = ["bowtie2"]
                if num_threads > 1:
                    cmd += ["-p", str(num_threads)]
                cmd += ["-f"]
                # Option and value must be separate argv elements; a single
                # "-x path" string hands the program a value with a leading
                # space.
                cmd += ["-x", "../grch38_bowtie2/genome"]
                if paired:
                    cmd += ["-1", read1_fname,
                            "-2", read2_fname]
                else:
                    cmd += [read1_fname]
            elif aligner == "gsnap":
                cmd = ["%s/gsnap" % (aligner_bin_base),
                       "-A",
                       "sam"]
                if num_threads > 1:
                    cmd += ["-t", str(num_threads)]
                cmd += ["--max-mismatches=3",
                        "-D", "%s/GSNAP%s" % (index_base, index_add),
                        "-N", "1",
                        "-d", genome,
                        read1_fname]
                if paired:
                    cmd += [read2_fname]
            elif aligner == "bwa":
                cmd = ["%s/bwa" % (aligner_bin_base)]
                if aln_type in ["mem", "aln"]:
                    cmd += [aln_type]
                elif aln_type == "sw":
                    cmd += ["bwa" + aln_type]
                if num_threads > 1:
                    cmd += ["-t", str(num_threads)]
                cmd += ["%s/BWA%s/%s.fa" % (index_base, index_add, genome)]
                cmd += [read1_fname]
                if paired:
                    cmd += [read2_fname]
            else:
                # Raised explicitly so the check survives ``python -O``.
                raise AssertionError("unsupported aligner: %s" % aligner)

            return cmd

        for aligner, aligner_type, index_type, version in aligners:
            aligner_name = aligner + aligner_type
            if version != "":
                aligner_name += ("_%s" % version)
            if aligner == "hisat2" and index_type != "":
                aligner_name += ("_" + index_type)
            sys.stderr.write("\t%s\t%s\n" % (aligner_name, str(datetime.now())))
            if paired:
                aligner_dir = aligner_name + "_paired"
            else:
                aligner_dir = aligner_name + "_single"
            if not os.path.exists(aligner_dir):
                os.mkdir(aligner_dir)

            # Remember where we came from so that a failing aligner cannot
            # leave the process stranded inside its sub-directory.
            parent_dir = os.getcwd()
            os.chdir(aligner_dir)
            try:
                out_fname = base_fname + ".sam"

                # Align all reads
                aligner_cmd = get_aligner_cmd(aligner, aligner_type, index_type, version,
                                              in_run_dir(type_read1_fname),
                                              in_run_dir(type_read2_fname),
                                              out_fname)
                start_time = datetime.now()
                if verbose:
                    sys.stderr.write("\t%s %s\n" % (start_time, " ".join(aligner_cmd)))
                if aligner in ["hisat2", "hisat", "bowtie", "bowtie2", "gsnap", "bwa"]:
                    # These aligners write SAM to stdout; capture it in the
                    # output file and close the handle when the run is over.
                    with open(out_fname, "w") as out_file:
                        proc = subprocess.Popen(aligner_cmd, stdout=out_file, stderr=subprocess.PIPE)
                        proc.communicate()
                else:
                    proc = subprocess.Popen(aligner_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                    proc.communicate()
                finish_time = datetime.now()
                duration = (finish_time - start_time).total_seconds()
                if verbose:
                    sys.stderr.write("\t%s finished: %s\n" % (finish_time, duration))

                assert os.path.exists(out_fname)
                correct_reads, correct_multi_reads, num_reads = 0, 0, 0
                prev_read_id = None
                with open(out_fname) as sam_file:
                    for line in sam_file:
                        # Skip SAM header lines.
                        if line.startswith('@'):
                            continue
                        fields = line.split()
                        # Blank or truncated lines carry no alignment.
                        if len(fields) < 6:
                            continue
                        read_id, _flag, chrom, pos, _mapq, cigar = fields[:6]
                        if chrom.startswith("chr"):
                            chrom = chrom[3:]
                        # Shift the SAM position down by one before comparing
                        # it with the position stored in the read name.
                        pos = int(pos) - 1
                        true_chr, true_pos, true_cigar = read_id.split('_')[1:4]
                        true_pos = int(true_pos)

                        # Records of the same read are adjacent, so a change
                        # of read ID marks the first record of a new read.
                        is_first_record = (read_id != prev_read_id)
                        if is_first_record:
                            num_reads += 1

                        if true_chr == chrom and pos == true_pos and cigar == true_cigar:
                            correct_multi_reads += 1
                            if is_first_record:
                                correct_reads += 1

                        prev_read_id = read_id

                # An aligner that produced no records must not abort the
                # whole comparison with a ZeroDivisionError.
                if num_reads > 0:
                    first_pct = float(correct_reads) / num_reads * 100
                    all_pct = float(correct_multi_reads) / num_reads * 100
                else:
                    first_pct = all_pct = 0.0

                sys.stderr.write("\tfirst: %d / %d (%.2f%%)\n" % (correct_reads, num_reads, first_pct))
                sys.stderr.write("\tall: %d / %d (%.2f%%)\n" % (correct_multi_reads, num_reads, all_pct))
            finally:
                os.chdir(parent_dir)


"""
"""
# Command-line entry point: parse the read file name and the verbosity flag,
# then run the aligner comparison on that file.
if __name__ == "__main__":
    parser = ArgumentParser(
        description='test HISAT2, and compare HISAT2 with other popular aligners such as TopHat2, STAR, Bowtie1/2, GSNAP, BWA-mem, etc.')
    parser.add_argument('read_fname',
                        nargs='?',
                        type=str,
                        help='input read file')
    parser.add_argument('-v', '--verbose',
                        dest='verbose',
                        action='store_true',
                        help='also print some statistics to stderr')

    args = parser.parse_args()
    # nargs='?' allows the positional to be omitted, which yields None;
    # evaluate() cannot work without a read file, so report a usage error
    # instead of failing later with a traceback.
    if args.read_fname is None:
        parser.error("an input read file (read_fname) is required")
    evaluate(args.read_fname,
             args.verbose)
initial commit 2025-01-18 13:09:52 +00:00			`#!/usr/bin/env python`

			`import sys, os, subprocess`
			`import multiprocessing`
			`import string, re`
			`import platform`
			`from datetime import datetime, date, time`
			`import copy`
			`from argparse import ArgumentParser, FileType`


			`"""`
			`"""`
			`def evaluate(read_fname,`
			`verbose):`
			`aligners = [`
			`["hisat2", "", "", ""],`
			`["hisat2", "", "snp", ""],`
			`["bowtie2", "", "", ""],`
			`]`
			`num_threads = 3`

			`cwd = os.getcwd()`
			`genome = "genome"`
			`align_stat = []`
			`# for paired in [False, True]:`
			`for paired in [False]:`
			`base_fname = "common_snp_reads"`
			`type_sam_fname = base_fname + ".sam"`
			`type_read1_fname = base_fname + "_1.fa"`
			`type_read2_fname = base_fname + "_2.fa"`

			`type_read1_fname = read_fname`

			`def get_aligner_version(aligner, version):`
			`version = ""`
			`if aligner == "hisat2" or \`
			`aligner == "bowtie2":`
			`if version:`
			`cmd = ["%s_%s/%s" % (aligner, version, aligner)]`
			`else:`
			`cmd = ["%s/%s" % (aligner_bin_base, aligner)]`
			`cmd += ["--version"]`
			`cmd_process = subprocess.Popen(cmd, stdout=subprocess.PIPE)`
			`version = cmd_process.communicate()[0][:-1].split("\n")[0]`
			`version = version.split()[-1]`
			`elif aligner == "star":`
			`version = "2.4.2a"`
			`elif aligner == "gsnap":`
			`cmd = ["%s/gsnap" % (aligner_bin_base)]`
			`cmd_process = subprocess.Popen(cmd, stderr=subprocess.PIPE)`
			`version = cmd_process.communicate()[1][:-1].split("\n")[0]`
			`version = version.split()[2]`
			`elif aligner == "bwa":`
			`cmd = ["%s/bwa" % (aligner_bin_base)]`
			`cmd_process = subprocess.Popen(cmd, stderr=subprocess.PIPE)`
			`version = cmd_process.communicate()[1][:-1].split("\n")[2]`
			`version = version.split()[1]`

			`return version`

			`def get_aligner_cmd(aligner, type, index_type, version, read1_fname, read2_fname, out_fname, cmd_idx = 0):`
			`cmd = []`
			`if aligner == "hisat2":`
			`cmd = ["hisat2"]`
			`if num_threads > 1:`
			`cmd += ["-p", str(num_threads)]`
			`cmd += ["-f"]`
			`cmd += ["--no-spliced-alignment"]`
			`if index_type:`
			`index_cmd = "../grch38_snp_hisat2/genome_snp"`
			`else:`
			`index_cmd = "../grch38_hisat2/genome"`
			`cmd += [index_cmd]`
			`if paired:`
			`cmd += ["-1", read1_fname,`
			`"-2", read2_fname]`
			`else:`
			`cmd += [read1_fname]`
			`elif aligner == "star":`
			`cmd = ["%s/STAR" % (aligner_bin_base)]`
			`if num_threads > 1:`
			`cmd += ["--runThreadN", str(num_threads)]`
			`cmd += ["--genomeDir"]`
			`if cmd_idx == 0:`
			`if type == "gtf":`
			`cmd += ["%s/STAR%s/gtf" % (index_base, index_add)]`
			`else:`
			`cmd += ["%s/STAR%s" % (index_base, index_add)]`
			`else:`
			`assert cmd_idx == 1`
			`cmd += ["."]`

			`if desktop:`
			`cmd += ["--genomeLoad", "NoSharedMemory"]`
			`else:`
			`cmd += ["--genomeLoad", "LoadAndKeep"]`
			`if type == "x2":`
			`if cmd_idx == 1:`
			`cmd += ["--alignSJDBoverhangMin", "1"]`
			`cmd += ["--readFilesIn",`
			`read1_fname]`
			`if paired:`
			`cmd += [read2_fname]`
			`if paired:`
			`cmd += ["--outFilterMismatchNmax", "6"]`
			`else:`
			`cmd += ["--outFilterMismatchNmax", "3"]`
			`elif aligner == "bowtie2":`
			`cmd = ["bowtie2"]`
			`if num_threads > 1:`
			`cmd += ["-p", str(num_threads)]`
			`cmd += ["-f"]`
			`cmd += ["-x ../grch38_bowtie2/genome"]`
			`if paired:`
			`cmd += ["-1", read1_fname,`
			`"-2", read2_fname]`
			`else:`
			`cmd += [read1_fname]`
			`elif aligner == "gsnap":`
			`cmd = ["%s/gsnap" % (aligner_bin_base),`
			`"-A",`
			`"sam"]`
			`if num_threads > 1:`
			`cmd += ["-t", str(num_threads)]`
			`cmd += ["--max-mismatches=3",`
			`"-D", "%s/GSNAP%s" % (index_base, index_add),`
			`"-N", "1",`
			`"-d", genome,`
			`read1_fname]`
			`if paired:`
			`cmd += [read2_fname]`
			`elif aligner == "bwa":`
			`cmd = ["%s/bwa" % (aligner_bin_base)]`
			`if type in ["mem", "aln"]:`
			`cmd += [type]`
			`elif type == "sw":`
			`cmd += ["bwa" + type]`
			`if num_threads > 1:`
			`cmd += ["-t", str(num_threads)]`
			`cmd += ["%s/BWA%s/%s.fa" % (index_base, index_add, genome)]`
			`cmd += [read1_fname]`
			`if paired:`
			`cmd += [read2_fname]`
			`else:`
			`assert False`

			`return cmd`

			`for aligner, type, index_type, version in aligners:`
			`aligner_name = aligner + type`
			`if version != "":`
			`aligner_name += ("_%s" % version)`
			`if aligner == "hisat2" and index_type != "":`
			`aligner_name += ("_" + index_type)`
			`two_step = (aligner == "tophat2" or type == "x2" or (aligner in ["hisat2", "hisat"] and type == ""))`
			`print >> sys.stderr, "\t%s\t%s" % (aligner_name, str(datetime.now()))`
			`if paired:`
			`aligner_dir = aligner_name + "_paired"`
			`else:`
			`aligner_dir = aligner_name + "_single"`
			`if not os.path.exists(aligner_dir):`
			`os.mkdir(aligner_dir)`
			`os.chdir(aligner_dir)`

			`out_fname = base_fname + ".sam"`
			`duration = -1.0`

			`# Align all reads`
			`aligner_cmd = get_aligner_cmd(aligner, type, index_type, version, "../" + type_read1_fname, "../" + type_read2_fname, out_fname)`
			`start_time = datetime.now()`
			`if verbose:`
			`print >> sys.stderr, "\t", start_time, " ".join(aligner_cmd)`
			`if aligner in ["hisat2", "hisat", "bowtie", "bowtie2", "gsnap", "bwa"]:`
			`proc = subprocess.Popen(aligner_cmd, stdout=open(out_fname, "w"), stderr=subprocess.PIPE)`
			`else:`
			`proc = subprocess.Popen(aligner_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)`
			`proc.communicate()`
			`finish_time = datetime.now()`
			`duration = finish_time - start_time`
			`duration = duration.total_seconds()`
			`if verbose:`
			`print >> sys.stderr, "\t", finish_time, "finished:", duration`

			`assert os.path.exists(out_fname)`
			`correct_reads, correct_multi_reads, num_reads = 0, 0, 0`
			`prev_read_id = None`
			`for line in open(out_fname):`
			`if line.startswith('@'):`
			`continue`
			`read_id, flag, chr, pos, mapQ, cigar = line.split()[:6]`
			`if chr.startswith("chr"):`
			`chr = chr[3:]`
			`pos = int(pos) - 1`
			`true_chr, true_pos, true_cigar = read_id.split('_')[1:4]`
			`true_pos = int(true_pos)`

			`if read_id != prev_read_id:`
			`num_reads += 1`

			`if true_chr == chr and pos == true_pos and cigar == true_cigar:`
			`correct_multi_reads += 1`
			`if prev_read_id != read_id:`
			`correct_reads += 1`

			`prev_read_id = read_id`

			`print >> sys.stderr, "\tfirst: %d / %d (%.2f%%)" % (correct_reads, num_reads, float(correct_reads)/num_reads*100)`
			`print >> sys.stderr, "\tall: %d / %d (%.2f%%)" % (correct_multi_reads, num_reads, float(correct_multi_reads)/num_reads*100)`

			`os.chdir("..")`


			`"""`
			`"""`
			`if __name__ == "__main__":`
			`parser = ArgumentParser(`
			`description='test HISAT2, and compare HISAT2 with other popular aligners such as TopHat2, STAR, Bowtie1/2, GSNAP, BWA-mem, etc.')`
			`parser.add_argument('read_fname',`
			`nargs='?',`
			`type=str,`
			`help='input read file')`
			`parser.add_argument('-v', '--verbose',`
			`dest='verbose',`
			`action='store_true',`
			`help='also print some statistics to stderr')`

			`args = parser.parse_args()`
			`evaluate(args.read_fname,`
			`args.verbose)`