hisat-3n/hisat2_extract_exons.py
2025-01-18 21:09:52 +08:00

160 lines
5.4 KiB
Python

#!/usr/bin/env python3
#
# Copyright 2015, Daehwan Kim <infphilo@gmail.com>
#
# This file is part of HISAT 2.
#
# HISAT 2 is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# HISAT 2 is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with HISAT 2. If not, see <http://www.gnu.org/licenses/>.
#
from sys import stderr, exit
from collections import defaultdict as dd, Counter
from argparse import ArgumentParser, FileType
def _parse_exon_records(gtf_file):
    """Read the exon records of a GTF stream, grouped by transcript.

    Args:
        gtf_file: iterable of text lines in GTF format (nine tab-separated
            columns, the last one holding ``key "value";`` attributes).

    Returns:
        A ``(genes, trans)`` pair. ``genes`` maps each gene_id to the list
        of its transcript_ids in order of first appearance. ``trans`` maps
        each transcript_id to ``[chrom, strand, [[left, right], ...]]``
        with the coordinates exactly as written in the file; chrom and
        strand are taken from the first exon seen for that transcript.
    """
    genes = dd(list)
    trans = {}
    for line in gtf_file:
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        if '#' in line:
            # Cut the line at the first '#' (trailing comment)
            line = line.split('#')[0].strip()

        try:
            chrom, _source, feature, left, right, _score, \
                strand, _frame, values = line.split('\t')
            left, right = int(left), int(right)
        except ValueError:
            # Wrong number of columns or non-integer coordinates:
            # treat the line as malformed and skip it
            continue

        # Only exons whose start is strictly below their end are kept
        # (this also drops records where start == end)
        if feature != 'exon' or left >= right:
            continue

        values_dict = {}
        # Parse every non-empty attribute, so that a missing trailing
        # ';' does not silently drop the last attribute
        for attr in values.split(';'):
            attr = attr.strip()
            if not attr:
                continue
            key, _, val = attr.partition(' ')
            values_dict[key] = val.strip('"')

        if 'gene_id' not in values_dict or \
                'transcript_id' not in values_dict:
            continue

        transcript_id = values_dict['transcript_id']
        if transcript_id not in trans:
            trans[transcript_id] = [chrom, strand, [[left, right]]]
            genes[values_dict['gene_id']].append(transcript_id)
        else:
            trans[transcript_id][2].append([left, right])
    return genes, trans


def _merge_close_exons(exons, max_gap=5):
    """Return the sorted exons of one transcript with close ones merged.

    Two consecutive exons are merged when the start of the later one
    minus the end of the merged one so far is at most ``max_gap``.
    ``exons`` must be a non-empty list of ``[left, right]`` pairs.
    """
    ordered = sorted(exons)
    merged = [list(ordered[0])]
    for left, right in ordered[1:]:
        last = merged[-1]
        if left - last[1] <= max_gap:
            # max() keeps the interval from shrinking when the later
            # exon ends before the one it is merged into
            last[1] = max(last[1], right)
        else:
            merged.append([left, right])
    return merged


def _merge_overlapping(sorted_exons):
    """Merge overlapping exons in a sorted, non-empty list of tuples.

    Each item is ``(chrom, left, right, strand)``. Exons on the same
    chrom are merged when the later one starts at or before the end of
    the previous one. The merged exon keeps the previous strand unless
    that strand is neither '+' nor '-'.
    """
    merged = [sorted_exons[0]]
    for exon in sorted_exons[1:]:
        chrom, left, right, strand = exon
        prev_chrom, prev_left, prev_right, prev_strand = merged[-1]
        # Sorting guarantees prev_left <= left within the same chrom
        if chrom != prev_chrom or prev_right < left:
            merged.append(exon)
        elif prev_right < right:
            if prev_strand not in ('+', '-'):
                prev_strand = strand
            merged[-1] = (prev_chrom, prev_left, right, prev_strand)
    return merged


def _print_stats(genes, trans):
    """Print gene/transcript/exon/intron summary statistics to stderr.

    Averages use integer (floor) division so they can be printed with
    the ``{:d}`` format. ``trans`` must be non-empty and each transcript
    must hold at least one exon.
    """
    num_exons = num_introns = 0
    exon_bases = intron_bases = 0
    for _chrom, _strand, exons in trans.values():
        num_exons += len(exons)
        for i, (left, right) in enumerate(exons):
            # Coordinates are inclusive on both ends
            exon_bases += right - left + 1
            if i > 0:
                num_introns += 1
                intron_bases += left - exons[i-1][1] - 1

    # A transcript's length is the sum of its exon lengths, so the
    # total over all transcripts equals exon_bases
    print('genes: {}, genes with multiple isoforms: {}'.format(
            len(genes), sum(len(v) > 1 for v in genes.values())),
          file=stderr)
    print('transcripts: {}, transcript avg. length: {:d}'.format(
            len(trans), exon_bases // len(trans)),
          file=stderr)
    print('exons: {}, exon avg. length: {:d}'.format(
            num_exons, exon_bases // num_exons),
          file=stderr)
    print('introns: {}, intron avg. length: {:d}'.format(
            num_introns,
            intron_bases // num_introns if num_introns else 0),
          file=stderr)
    print('average number of exons per transcript: {:d}'.format(
            num_exons // len(trans)),
          file=stderr)


def extract_exons(gtf_file, verbose=False):
    """Print the merged, unique exons of a GTF file to stdout.

    Exon records are grouped by transcript_id; within a transcript,
    exons separated by a small gap are merged. The exons of all
    transcripts are then de-duplicated, sorted, and overlapping exons on
    the same chrom are merged. One line is printed per resulting exon:
    ``chrom<TAB>left-1<TAB>right-1<TAB>strand``.

    Args:
        gtf_file: iterable of GTF text lines (e.g. an open file).
        verbose: when true, also print summary statistics to stderr.

    Returns:
        None. Nothing is printed when the input holds no usable exon.
    """
    genes, trans = _parse_exon_records(gtf_file)

    # Sort exons and merge close ones within each transcript
    for record in trans.values():
        record[2] = _merge_close_exons(record[2])

    # Collect the unique exons across all transcripts
    unique_exons = sorted({(chrom, left, right, strand)
                           for chrom, strand, exons in trans.values()
                           for left, right in exons})
    if not unique_exons:
        return

    for chrom, left, right, strand in _merge_overlapping(unique_exons):
        # Zero-based offset
        print('{}\t{}\t{}\t{}'.format(chrom, left-1, right-1, strand))

    # Print some stats if asked
    if verbose:
        _print_stats(genes, trans)
if __name__ == '__main__':
    # Command-line entry point: read a GTF file (or stdin when given "-")
    # and print its exons; with no input argument, show the help text
    # and exit with status 1.
    cli = ArgumentParser(description='Extract exons from a GTF file')
    cli.add_argument('gtf_file', nargs='?', type=FileType('r'),
                     help='input GTF file (use "-" for stdin)')
    cli.add_argument('-v', '--verbose', dest='verbose',
                     action='store_true',
                     help='also print some statistics to stderr')
    options = cli.parse_args()

    if options.gtf_file:
        extract_exons(options.gtf_file, options.verbose)
    else:
        cli.print_help()
        exit(1)