hisat-3n/evaluation/get_data.py

#!/usr/bin/env python

import sys, os
from argparse import ArgumentParser, FileType

def get_data(small = False):
    data_root = "http://www.ccb.jhu.edu/software/hisat2/downloads/evaluation"

    # Download the reference human genome, SNPs, and gene annotations
    if not os.path.exists("data"):
        os.mkdir("data")
    os.chdir("data")
    genome_files = ["genome.fa", "genome.fa.fai", "genome.gtf", "snpCommon.txt", "genome.snp", "genome.ss", "genome.exon"]
    small_genome_files = ["22.fa", "22.fa.fai", "22.gtf", "22.snp", "22.ss", "22.exon", \
                              "22_20-21M.fa", "22_20-21M.fa.fai", "22_20-21M.gtf", "22_20-21M.snp", "22_20-21M.ss", "22_20-21M.exon"]
    files = []
    if not small:
        files += genome_files
    files += small_genome_files
    for file in files:
        if os.path.exists(file):
            continue
        wget_cmd = "wget %s/data/%s" % (data_root, file)
        print >> sys.stderr, wget_cmd
        os.system(wget_cmd)
    os.chdir("..")

    # Download indexes
    if not os.path.exists("indexes"):
        os.mkdir("indexes")
    os.chdir("indexes")
    aligners = ["HISAT2", "HISAT", "Bowtie", "STAR", "GSNAP"]
    for genome in ["genome", "22", "22_20-21M"]:
        if small and genome == "genome":
            continue
        for aligner in aligners:
            if genome == "genome":
                aligner_dir = aligner
            else:
                aligner_dir = aligner + "_" + genome
            if os.path.exists(aligner_dir):
                continue
            cmd = "wget %s/indexes/%s.tar.gz; tar xvzf %s.tar.gz; rm %s.tar.gz" % \
                (data_root, aligner_dir, aligner_dir, aligner_dir)
            print >> sys.stderr, cmd
            os.system(cmd)
    os.chdir("..")

    # Download simulated and real reads
    if not os.path.exists("reads"):
        os.mkdir("reads")
    os.chdir("reads")
    for type in ["simulation", "real"]:
        if small and type == "real":
            continue
        if not os.path.exists(type):
            os.mkdir(type)
        os.chdir(type)
        if type == "simulation":
            files = ["1M_DNA_reads_22",
                     "1M_DNA_mismatch_reads_22",
                     "1M_DNA_snp_reads_22",
                     "1M_DNA_mismatch_snp_reads_22",
                     "1M_RNA_reads_22",
                     "1M_RNA_constant_reads_22",
                     "1M_RNA_mismatch_reads_22",
                     "1M_RNA_snp_reads_22",
                     "1M_RNA_mismatch_snp_reads_22",
                     "1M_RNA_reads_22_20-21M",
                     "20M_DNA_reads_genome",
                     "20M_DNA_snp_reads_genome",
                     "20M_RNA_reads_genome",
                     "20M_RNA_snp_reads_genome"]
        else:
            files = ["108M_RNA_wgEncodeCshlLongRnaSeq",
                     "62M_RNA_SRR353653",
                     "80M_DNA_SRR345300",
                     "5M_DNA_NA12878D"]
        for file in files:
            if small and file.find("20M") != -1:
                continue
            if os.path.exists(file):
                continue
            cmd = "wget %s/reads/%s/%s.tar.gz; tar xvzf %s.tar.gz; rm %s.tar.gz" % \
                (data_root, type, file, file, file)
            print >> sys.stderr, cmd
            os.system(cmd)
        os.chdir("..")

    os.chdir("..")


if __name__ == "__main__":
    parser = ArgumentParser(
        description='Get reference genome, annotations, and indexes')
    parser.add_argument('-s', '--small',
                        dest='small',
                        action='store_true',
                        default=False,
                        help='small testset')
    args = parser.parse_args()
    get_data(args.small)