hisat-3n/evaluation/get_data.py
2025-01-18 21:09:52 +08:00

103 lines
3.7 KiB
Python

#!/usr/bin/env python
import sys, os
from argparse import ArgumentParser, FileType
def _run_logged(cmd):
    """Echo the shell command *cmd* to stderr, then run it via os.system."""
    # sys.stderr.write works identically on Python 2 and Python 3, whereas
    # the Python 2 statement ``print >> sys.stderr, cmd`` raises TypeError
    # when executed by a Python 3 interpreter.
    sys.stderr.write(cmd + "\n")
    os.system(cmd)


def _enter_dir(name):
    """Create directory *name* if it does not exist, then chdir into it."""
    if not os.path.exists(name):
        os.mkdir(name)
    os.chdir(name)


def _fetch_archive(url_dir, name):
    """Download <url_dir>/<name>.tar.gz, unpack it, and delete the tarball."""
    cmd = "wget %s/%s.tar.gz; tar xvzf %s.tar.gz; rm %s.tar.gz" % \
        (url_dir, name, name, name)
    _run_logged(cmd)


def get_data(small = False):
    """Download evaluation data, indexes, and reads into the current directory.

    Three sub-directories of the current working directory are populated
    (and created when missing): ``data``, ``indexes`` and ``reads``.
    Anything whose target path already exists is skipped, so the function
    can be re-run to fetch only what is missing.  Every shell command is
    echoed to stderr before it is executed.  The working directory is
    changed while downloading and is back at its starting point on return.

    Args:
        small: When true, skip the whole-genome data files, the
            whole-genome ("genome") indexes, all real read sets, and every
            read set whose name contains "20M".
    """
    data_root = "http://www.ccb.jhu.edu/software/hisat2/downloads/evaluation"

    # Download the reference human genome, SNPs, and gene annotations
    _enter_dir("data")
    genome_files = ["genome.fa", "genome.fa.fai", "genome.gtf", "snpCommon.txt", "genome.snp", "genome.ss", "genome.exon"]
    small_genome_files = ["22.fa", "22.fa.fai", "22.gtf", "22.snp", "22.ss", "22.exon",
                          "22_20-21M.fa", "22_20-21M.fa.fai", "22_20-21M.gtf", "22_20-21M.snp", "22_20-21M.ss", "22_20-21M.exon"]
    data_files = []
    if not small:
        data_files += genome_files
    # The "22" and "22_20-21M" files are fetched in both modes.
    data_files += small_genome_files
    for fname in data_files:
        if os.path.exists(fname):
            continue
        _run_logged("wget %s/data/%s" % (data_root, fname))
    os.chdir("..")

    # Download indexes
    _enter_dir("indexes")
    aligners = ["HISAT2", "HISAT", "Bowtie", "STAR", "GSNAP"]
    for genome in ["genome", "22", "22_20-21M"]:
        if small and genome == "genome":
            continue
        for aligner in aligners:
            # Whole-genome indexes are named after the aligner alone; the
            # others carry the genome name as a suffix.
            if genome == "genome":
                aligner_dir = aligner
            else:
                aligner_dir = aligner + "_" + genome
            if os.path.exists(aligner_dir):
                continue
            _fetch_archive("%s/indexes" % data_root, aligner_dir)
    os.chdir("..")

    # Download simulated and real reads
    _enter_dir("reads")
    simulated_sets = ["1M_DNA_reads_22",
                      "1M_DNA_mismatch_reads_22",
                      "1M_DNA_snp_reads_22",
                      "1M_DNA_mismatch_snp_reads_22",
                      "1M_RNA_reads_22",
                      "1M_RNA_constant_reads_22",
                      "1M_RNA_mismatch_reads_22",
                      "1M_RNA_snp_reads_22",
                      "1M_RNA_mismatch_snp_reads_22",
                      "1M_RNA_reads_22_20-21M",
                      "20M_DNA_reads_genome",
                      "20M_DNA_snp_reads_genome",
                      "20M_RNA_reads_genome",
                      "20M_RNA_snp_reads_genome"]
    real_sets = ["108M_RNA_wgEncodeCshlLongRnaSeq",
                 "62M_RNA_SRR353653",
                 "80M_DNA_SRR345300",
                 "5M_DNA_NA12878D"]
    for read_type in ["simulation", "real"]:
        if small and read_type == "real":
            continue
        _enter_dir(read_type)
        if read_type == "simulation":
            read_sets = simulated_sets
        else:
            read_sets = real_sets
        for read_set in read_sets:
            # In small mode, skip the read sets with "20M" in their name.
            if small and "20M" in read_set:
                continue
            if os.path.exists(read_set):
                continue
            _fetch_archive("%s/reads/%s" % (data_root, read_type), read_set)
        os.chdir("..")
    os.chdir("..")


if __name__ == "__main__":
    # Script entry point: read the command line, then start the downloads.
    cli = ArgumentParser(description='Get reference genome, annotations, and indexes')
    # -s/--small is a boolean switch that stays False unless given.
    small_switch = dict(dest='small',
                        action='store_true',
                        default=False,
                        help='small testset')
    cli.add_argument('-s', '--small', **small_switch)
    options = cli.parse_args()
    get_data(options.small)