# hisat-3n/evaluation/get_data.py

#!/usr/bin/env python

import sys, os
from argparse import ArgumentParser, FileType

def _run_logged(cmd):
    """Echo shell command *cmd* to stderr, run it, and warn on failure.

    Returns the raw status value reported by os.system().  The status is
    only reported, never raised, so one failed download does not abort the
    remaining ones.
    """
    # sys.stderr.write works on both Python 2 and Python 3, unlike the
    # Python-2-only "print >> sys.stderr" statement.
    sys.stderr.write(cmd + "\n")
    status = os.system(cmd)
    if status != 0:
        # NOTE: for "a; b; c" command chains the shell reports the status
        # of the last step only, so this is a best-effort warning.
        sys.stderr.write("Warning: command returned status %d: %s\n" % (status, cmd))
    return status


def _enter_dir(path):
    """Create directory *path* if it does not exist, then chdir into it."""
    if not os.path.exists(path):
        os.mkdir(path)
    os.chdir(path)


def get_data(small = False):
    """Download the evaluation data set into the current working directory.

    Three sub-directories are created on demand and populated with wget/tar:

    * ``data``    -- reference genome, SNP, and annotation files
    * ``indexes`` -- one unpacked index directory per aligner and genome
    * ``reads``   -- ``simulation`` and ``real`` read sets

    Anything that already exists on disk is skipped, so the function can be
    re-run to resume an interrupted download.

    When *small* is true, the whole-genome files and indexes, the "real"
    read sets, and every read set whose name contains "20M" are skipped.

    The working directory is restored to its starting value on return, even
    if an error occurs part-way through.
    """
    data_root = "http://www.ccb.jhu.edu/software/hisat2/downloads/evaluation"

    # Navigate with absolute paths rather than chdir(".."): if one of the
    # sub-directories is a symlink, ".." would resolve to the parent of the
    # link target and the following directories would land in the wrong place.
    root_dir = os.getcwd()
    try:
        # Download the reference human genome, SNPs, and gene annotations
        _enter_dir(os.path.join(root_dir, "data"))
        genome_files = ["genome.fa", "genome.fa.fai", "genome.gtf", "snpCommon.txt",
                        "genome.snp", "genome.ss", "genome.exon"]
        small_genome_files = ["22.fa", "22.fa.fai", "22.gtf", "22.snp", "22.ss", "22.exon",
                              "22_20-21M.fa", "22_20-21M.fa.fai", "22_20-21M.gtf",
                              "22_20-21M.snp", "22_20-21M.ss", "22_20-21M.exon"]
        wanted = []
        if not small:
            wanted += genome_files
        wanted += small_genome_files
        for fname in wanted:
            if os.path.exists(fname):
                continue
            _run_logged("wget %s/data/%s" % (data_root, fname))

        # Download indexes
        _enter_dir(os.path.join(root_dir, "indexes"))
        aligners = ["HISAT2", "HISAT", "Bowtie", "STAR", "GSNAP"]
        for genome in ["genome", "22", "22_20-21M"]:
            if small and genome == "genome":
                continue
            for aligner in aligners:
                # Whole-genome indexes are named after the aligner alone;
                # the others carry a "_<genome>" suffix.
                if genome == "genome":
                    aligner_dir = aligner
                else:
                    aligner_dir = aligner + "_" + genome
                if os.path.exists(aligner_dir):
                    continue
                _run_logged("wget %s/indexes/%s.tar.gz; tar xvzf %s.tar.gz; rm %s.tar.gz" %
                            (data_root, aligner_dir, aligner_dir, aligner_dir))

        # Download simulated and real reads
        reads_dir = os.path.join(root_dir, "reads")
        _enter_dir(reads_dir)
        for read_type in ["simulation", "real"]:
            if small and read_type == "real":
                continue
            _enter_dir(os.path.join(reads_dir, read_type))
            if read_type == "simulation":
                read_sets = ["1M_DNA_reads_22",
                             "1M_DNA_mismatch_reads_22",
                             "1M_DNA_snp_reads_22",
                             "1M_DNA_mismatch_snp_reads_22",
                             "1M_RNA_reads_22",
                             "1M_RNA_constant_reads_22",
                             "1M_RNA_mismatch_reads_22",
                             "1M_RNA_snp_reads_22",
                             "1M_RNA_mismatch_snp_reads_22",
                             "1M_RNA_reads_22_20-21M",
                             "20M_DNA_reads_genome",
                             "20M_DNA_snp_reads_genome",
                             "20M_RNA_reads_genome",
                             "20M_RNA_snp_reads_genome"]
            else:
                read_sets = ["108M_RNA_wgEncodeCshlLongRnaSeq",
                             "62M_RNA_SRR353653",
                             "80M_DNA_SRR345300",
                             "5M_DNA_NA12878D"]
            for read_set in read_sets:
                if small and "20M" in read_set:
                    continue
                if os.path.exists(read_set):
                    continue
                _run_logged("wget %s/reads/%s/%s.tar.gz; tar xvzf %s.tar.gz; rm %s.tar.gz" %
                            (data_root, read_type, read_set, read_set, read_set))
    finally:
        # Always hand the caller back the directory they started in.
        os.chdir(root_dir)
    
    
if __name__ == "__main__":
    # Command-line entry point: the only option is -s/--small, a boolean
    # flag (off by default) that is handed straight to get_data().
    cli = ArgumentParser(description='Get reference genome, annotations, and indexes')
    cli.add_argument(
        '-s', '--small',
        dest='small', action='store_true', default=False,
        help='small testset',
    )
    options = cli.parse_args()
    get_data(options.small)
initial commit 2025-01-18 13:09:52 +00:00			`#!/usr/bin/env python`

			`import sys, os`
			`from argparse import ArgumentParser, FileType`

			`def get_data(small = False):`
			`data_root = "http://www.ccb.jhu.edu/software/hisat2/downloads/evaluation"`

			`# Download the reference human genome, SNPs, and gene annotations`
			`if not os.path.exists("data"):`
			`os.mkdir("data")`
			`os.chdir("data")`
			`genome_files = ["genome.fa", "genome.fa.fai", "genome.gtf", "snpCommon.txt", "genome.snp", "genome.ss", "genome.exon"]`
			`small_genome_files = ["22.fa", "22.fa.fai", "22.gtf", "22.snp", "22.ss", "22.exon", \`
			`"22_20-21M.fa", "22_20-21M.fa.fai", "22_20-21M.gtf", "22_20-21M.snp", "22_20-21M.ss", "22_20-21M.exon"]`
			`files = []`
			`if not small:`
			`files += genome_files`
			`files += small_genome_files`
			`for file in files:`
			`if os.path.exists(file):`
			`continue`
			`wget_cmd = "wget %s/data/%s" % (data_root, file)`
			`print >> sys.stderr, wget_cmd`
			`os.system(wget_cmd)`
			`os.chdir("..")`

			`# Download indexes`
			`if not os.path.exists("indexes"):`
			`os.mkdir("indexes")`
			`os.chdir("indexes")`
			`aligners = ["HISAT2", "HISAT", "Bowtie", "STAR", "GSNAP"]`
			`for genome in ["genome", "22", "22_20-21M"]:`
			`if small and genome == "genome":`
			`continue`
			`for aligner in aligners:`
			`if genome == "genome":`
			`aligner_dir = aligner`
			`else:`
			`aligner_dir = aligner + "_" + genome`
			`if os.path.exists(aligner_dir):`
			`continue`
			`cmd = "wget %s/indexes/%s.tar.gz; tar xvzf %s.tar.gz; rm %s.tar.gz" % \`
			`(data_root, aligner_dir, aligner_dir, aligner_dir)`
			`print >> sys.stderr, cmd`
			`os.system(cmd)`
			`os.chdir("..")`

			`# Download simulated and real reads`
			`if not os.path.exists("reads"):`
			`os.mkdir("reads")`
			`os.chdir("reads")`
			`for type in ["simulation", "real"]:`
			`if small and type == "real":`
			`continue`
			`if not os.path.exists(type):`
			`os.mkdir(type)`
			`os.chdir(type)`
			`if type == "simulation":`
			`files = ["1M_DNA_reads_22",`
			`"1M_DNA_mismatch_reads_22",`
			`"1M_DNA_snp_reads_22",`
			`"1M_DNA_mismatch_snp_reads_22",`
			`"1M_RNA_reads_22",`
			`"1M_RNA_constant_reads_22",`
			`"1M_RNA_mismatch_reads_22",`
			`"1M_RNA_snp_reads_22",`
			`"1M_RNA_mismatch_snp_reads_22",`
			`"1M_RNA_reads_22_20-21M",`
			`"20M_DNA_reads_genome",`
			`"20M_DNA_snp_reads_genome",`
			`"20M_RNA_reads_genome",`
			`"20M_RNA_snp_reads_genome"]`
			`else:`
			`files = ["108M_RNA_wgEncodeCshlLongRnaSeq",`
			`"62M_RNA_SRR353653",`
			`"80M_DNA_SRR345300",`
			`"5M_DNA_NA12878D"]`
			`for file in files:`
			`if small and file.find("20M") != -1:`
			`continue`
			`if os.path.exists(file):`
			`continue`
			`cmd = "wget %s/reads/%s/%s.tar.gz; tar xvzf %s.tar.gz; rm %s.tar.gz" % \`
			`(data_root, type, file, file, file)`
			`print >> sys.stderr, cmd`
			`os.system(cmd)`
			`os.chdir("..")`

			`os.chdir("..")`


			`if __name__ == "__main__":`
			`parser = ArgumentParser(`
			`description='Get reference genome, annotations, and indexes')`
			`parser.add_argument('-s', '--small',`
			`dest='small',`
			`action='store_true',`
			`default=False,`
			`help='small testset')`
			`args = parser.parse_args()`
			`get_data(args.small)`