783 lines
22 KiB
Perl
783 lines
22 KiB
Perl
#!/usr/bin/env perl
|
|
|
|
#
|
|
# Copyright 2015, Daehwan Kim <infphilo@gmail.com>
|
|
#
|
|
# This file is part of HISAT 2.
|
|
#
|
|
# HISAT 2 is free software: you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# HISAT 2 is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with HISAT 2. If not, see <http://www.gnu.org/licenses/>.
|
|
#
|
|
|
|
# hisat-3n:
|
|
#
|
|
# A wrapper script for hisat-3n. Provides various advantages over running
|
|
# hisat directly, including:
|
|
#
|
|
# 1. Handling compressed inputs
|
|
# 2. Redirecting output to various files
|
|
# 3. Output directly to bam (not currently supported)
|
|
|
|
use strict;
|
|
use warnings;
|
|
use Getopt::Long qw(GetOptions);
|
|
use File::Spec;
|
|
use POSIX;
|
|
|
|
|
|
my ($vol,$script_path,$prog);
|
|
$prog = File::Spec->rel2abs( __FILE__ );
|
|
|
|
while (-f $prog && -l $prog){
|
|
my (undef, $dir, undef) = File::Spec->splitpath($prog);
|
|
$prog = File::Spec->rel2abs(readlink($prog), $dir);
|
|
}
|
|
|
|
($vol,$script_path,$prog)
|
|
= File::Spec->splitpath($prog);
|
|
my $os_is_nix = ($^O eq "linux") || ($^O eq "darwin");
|
|
my $align_bin_s = $os_is_nix ? 'hisat2-align-s' : 'hisat2-align-s.exe';
|
|
my $build_bin = $os_is_nix ? 'hisat2-build' : 'hisat2-build.exe';
|
|
my $align_bin_l = $os_is_nix ? 'hisat2-align-l' : 'hisat2-align-l.exe';
|
|
my $align_prog_s= File::Spec->catpath($vol,$script_path,$align_bin_s);
|
|
my $align_prog_l= File::Spec->catpath($vol,$script_path,$align_bin_l);
|
|
my $align_prog = $align_prog_s;
|
|
my $read_stat_prog = File::Spec->catpath($vol,$script_path,"hisat2_read_statistics.py");
|
|
my $idx_ext_l = 'ht2l';
|
|
my $idx_ext_s = 'ht2';
|
|
my $idx_ext = $idx_ext_s;
|
|
my $seq_in_args = 0;
|
|
my $skip_read_stat = 0;
|
|
my %signo = ();
|
|
my @signame = ();
|
|
my $basechange_pair = 'CT';
|
|
|
|
# HISAT-3N basechange type
|
|
my %conversion_type = (
|
|
"AT" => 0, "AC" => 0, "CG" => 0, "CT" => 0, "GA" => 0, "TG" => 0,
|
|
# need to reverse From,To bases
|
|
"CA" => 1, "GC" => 1, "TA" => 1, "TC" => 1, "AG" => 1, "GT" => 1,
|
|
);
|
|
|
|
my %convertion_rc = (
|
|
"AT" => "TA", "AC" => "TG", "CG" => "GC",
|
|
"CT" => "GA", "GA" => "CT", "TG" => "AC",
|
|
);
|
|
|
|
sub getConversion {
|
|
my $from = uc $_[0];
|
|
my $to = uc $_[1];
|
|
my $conv_pair = $from.$to;
|
|
|
|
my $val = $conversion_type{$conv_pair};
|
|
|
|
if (defined $val) {
|
|
if ($val) {
|
|
$conv_pair = $to.$from;
|
|
}
|
|
} else {
|
|
return undef;
|
|
}
|
|
|
|
return $conv_pair;
|
|
}
|
|
|
|
{
|
|
# Get signal info
|
|
use Config;
|
|
my $i = 0;
|
|
for my $name (split(' ', $Config{sig_name})) {
|
|
$signo{$name} = $i;
|
|
$signame[$i] = $name;
|
|
$i++;
|
|
}
|
|
}
|
|
|
|
(-x "$align_prog") ||
|
|
Fail("Expected hisat2 to be in same directory with hisat2-align-s and hisat2-align-l:\n$script_path\n");
|
|
|
|
(-x "$read_stat_prog") || ($skip_read_stat = 1);
|
|
|
|
# Get description of arguments from HISAT so that we can distinguish HISAT
|
|
# args from wrapper args
|
|
sub getHt2Desc($) {
|
|
my $d = shift;
|
|
my $cmd = "'$align_prog' --wrapper basic-0 --arg-desc";
|
|
open(my $fh, "$cmd |") || Fail("Failed to run command '$cmd'\n");
|
|
while(readline $fh) {
|
|
chomp;
|
|
next if /^\s*$/;
|
|
my @ts = split(/\t/);
|
|
$d->{$ts[0]} = $ts[1];
|
|
}
|
|
close($fh);
|
|
$? == 0 || Fail("Description of arguments failed!\n");
|
|
}
|
|
|
|
my %desc = ();
|
|
my %wrapped = ("1" => 1, "2" => 1);
|
|
getHt2Desc(\%desc);
|
|
|
|
# Given an option like -1, determine whether it's wrapped (i.e. should be
|
|
# handled by this script rather than being passed along to HISAT)
|
|
sub isWrapped($) { return defined($wrapped{$_[0]}); }
|
|
|
|
my @orig_argv = @ARGV;
|
|
|
|
my @ht2w_args = (); # options for wrapper
|
|
my @ht2_args = (); # options for HISAT
|
|
my $saw_dd = 0;
|
|
for(0..$#ARGV) {
|
|
if($ARGV[$_] eq "--") {
|
|
$saw_dd = 1;
|
|
next;
|
|
}
|
|
push @ht2w_args, $ARGV[$_] if !$saw_dd;
|
|
push @ht2_args, $ARGV[$_] if $saw_dd;
|
|
}
|
|
if(!$saw_dd) {
|
|
@ht2_args = @ht2w_args;
|
|
@ht2w_args= ();
|
|
}
|
|
|
|
my $debug = 0;
|
|
my %read_fns = ();
|
|
my %read_compress = ();
|
|
my $cap_out = undef; # Filename for passthrough
|
|
my $no_unal = 0;
|
|
my $large_idx = 0;
|
|
# Remove whitespace
|
|
for my $i (0..$#ht2_args) {
|
|
$ht2_args[$i]=~ s/^\s+//; $ht2_args[$i] =~ s/\s+$//;
|
|
}
|
|
|
|
# We've handled arguments that the user has explicitly directed either to the
|
|
# wrapper or to hisat, now we capture some of the hisat arguments that
|
|
# ought to be handled in the wrapper
|
|
for(my $i = 0; $i < scalar(@ht2_args); $i++) {
|
|
next unless defined($ht2_args[$i]);
|
|
my $arg = $ht2_args[$i];
|
|
my @args = split(/=/, $arg);
|
|
if(scalar(@args) > 2) {
|
|
$args[1] = join("=", @args[1..$#args]);
|
|
}
|
|
$arg = $args[0];
|
|
if($arg eq "-U" || $arg eq "--unpaired") {
|
|
$ht2_args[$i] = undef;
|
|
$arg =~ s/^-U//; $arg =~ s/^--unpaired//;
|
|
if($arg ne "") {
|
|
# Argument was part of this token
|
|
my @args = split(/,/, $arg);
|
|
for my $a (@args) { push @ht2w_args, ("-U", $a); }
|
|
} else {
|
|
# Argument is in the next token
|
|
$i < scalar(@ht2_args)-1 || Fail("Argument expected in next token!\n");
|
|
$i++;
|
|
my @args = split(/,/, $ht2_args[$i]);
|
|
for my $a (@args) { push @ht2w_args, ("-U", $a); }
|
|
$ht2_args[$i] = undef;
|
|
}
|
|
}
|
|
if($arg =~ /^--?([12])/ && $arg !~ /^--?12/) {
|
|
my $mate = $1;
|
|
$ht2_args[$i] = undef;
|
|
$arg =~ s/^--?[12]//;
|
|
if($arg ne "") {
|
|
# Argument was part of this token
|
|
my @args = split(/,/, $arg);
|
|
for my $a (@args) { push @ht2w_args, ("-$mate", $a); }
|
|
} else {
|
|
# Argument is in the next token
|
|
$i < scalar(@ht2_args)-1 || Fail("Argument expected in next token!\n");
|
|
$i++;
|
|
my @args = split(/,/, $ht2_args[$i]);
|
|
for my $a (@args) { push @ht2w_args, ("-$mate", $a); }
|
|
$ht2_args[$i] = undef;
|
|
}
|
|
}
|
|
if($arg eq "--debug") {
|
|
$debug = 1;
|
|
$ht2_args[$i] = undef;
|
|
}
|
|
if($arg eq "--no-unal") {
|
|
$no_unal = 1;
|
|
$ht2_args[$i] = undef;
|
|
}
|
|
if($arg eq "--large-index") {
|
|
$large_idx = 1;
|
|
$ht2_args[$i] = undef;
|
|
}
|
|
if($arg eq "--skip-read-lengths") {
|
|
$skip_read_stat = 1;
|
|
$ht2_args[$i] = undef
|
|
}
|
|
if($arg eq "-c") {
|
|
$seq_in_args = 1;
|
|
}
|
|
if($arg eq "--base-change") {
|
|
my $base_args = "";
|
|
if (scalar @args > 1 && $args[1] ne "") {
|
|
$base_args = $args[1];
|
|
} else {
|
|
Fail("${arg} option takes an argument.\n") if ($i >= scalar(@ht2_args)-1);
|
|
$base_args = $ht2_args[$i+1];
|
|
$i++;
|
|
}
|
|
my @basechange_opt = split(/,/, $base_args);
|
|
|
|
Fail("Invalid --base-change option\n") if (scalar @basechange_opt != 2);
|
|
$basechange_pair = getConversion($basechange_opt[0], $basechange_opt[1]);
|
|
|
|
Fail("Invalid --base-change option\n") unless defined $basechange_pair;
|
|
}
|
|
|
|
for my $rarg ("un-conc", "al-conc", "al-conc-disc", "un", "al") {
|
|
if($arg =~ /^--${rarg}$/ || $arg =~ /^--${rarg}-gz$/ || $arg =~ /^--${rarg}-bz2$/) {
|
|
$ht2_args[$i] = undef;
|
|
if(scalar(@args) > 1 && $args[1] ne "") {
|
|
$read_fns{$rarg} = $args[1];
|
|
} else {
|
|
$i < scalar(@ht2_args)-1 || Fail("--${rarg}* option takes an argument.\n");
|
|
$read_fns{$rarg} = $ht2_args[$i+1];
|
|
$ht2_args[$i+1] = undef;
|
|
}
|
|
$read_compress{$rarg} = "";
|
|
$read_compress{$rarg} = "gzip" if $arg eq "--${rarg}-gz";
|
|
$read_compress{$rarg} = "bzip2" if $arg eq "--${rarg}-bz2";
|
|
last;
|
|
}
|
|
}
|
|
}
|
|
|
|
# If the user asked us to redirect some reads to files, or to suppress
|
|
# unaligned reads, then we need to capture the output from HISAT and pass it
|
|
# through this wrapper.
|
|
my $passthru = 0;
|
|
if(scalar(keys %read_fns) > 0 || $no_unal) {
|
|
$passthru = 1;
|
|
push @ht2_args, "--passthrough";
|
|
$cap_out = "-";
|
|
for(my $i = 0; $i < scalar(@ht2_args); $i++) {
|
|
next unless defined($ht2_args[$i]);
|
|
my $arg = $ht2_args[$i];
|
|
if($arg eq "-S" || $arg eq "--output") {
|
|
$i < scalar(@ht2_args)-1 || Fail("-S/--output takes an argument.\n");
|
|
$cap_out = $ht2_args[$i+1];
|
|
$ht2_args[$i] = undef;
|
|
$ht2_args[$i+1] = undef;
|
|
}
|
|
}
|
|
}
|
|
my @tmp = ();
|
|
for (@ht2_args) { push(@tmp, $_) if defined($_); }
|
|
@ht2_args = @tmp;
|
|
|
|
my @unps = ();
|
|
my @mate1s = ();
|
|
my @mate2s = ();
|
|
my @to_delete = ();
|
|
my @to_kills = ();
|
|
my $temp_dir = "/tmp";
|
|
my $bam_out = 0;
|
|
my $ref_str = undef;
|
|
my $no_pipes = 0;
|
|
my $keep = 0;
|
|
my $verbose = 0;
|
|
my $readpipe = undef;
|
|
my $log_fName = undef;
|
|
|
|
my @ht2w_args_cp = (@ht2w_args>0) ? @ht2w_args : @ht2_args;
|
|
Getopt::Long::Configure("pass_through","no_ignore_case");
|
|
|
|
my @old_ARGV = @ARGV;
|
|
@ARGV = @ht2w_args_cp;
|
|
|
|
GetOptions(
|
|
"1=s" => \@mate1s,
|
|
"2=s" => \@mate2s,
|
|
"reads|U=s" => \@unps,
|
|
"temp-directory=s" => \$temp_dir,
|
|
"bam" => \$bam_out,
|
|
"no-named-pipes" => \$no_pipes,
|
|
"ref-string|reference-string=s" => \$ref_str,
|
|
"keep" => \$keep,
|
|
"verbose" => \$verbose,
|
|
"log-file=s" => \$log_fName
|
|
);
|
|
|
|
@ARGV = @old_ARGV;
|
|
|
|
my $old_stderr;
|
|
|
|
if ($log_fName) {
|
|
open($old_stderr, ">&STDERR") or Fail("Cannot dup STDERR!\n");
|
|
open(STDERR, ">", $log_fName) or Fail("Cannot redirect to log file $log_fName.\n");
|
|
}
|
|
|
|
Info("Before arg handling:\n");
|
|
Info(" Wrapper args:\n[ @ht2w_args ]\n");
|
|
Info(" Binary args:\n[ @ht2_args ]\n");
|
|
|
|
# check read lengths
|
|
# if read_files have more than 1 files, use first one,
|
|
my @read_files = (scalar(@unps) > 0) ? @unps : @mate1s;
|
|
if ((scalar(@read_files) > 0)
|
|
&& ($seq_in_args == 0)
|
|
&& ($skip_read_stat == 0)) {
|
|
Info("Check read length: $read_files[0]\n");
|
|
my $cmd = "'$read_stat_prog' $read_files[0]";
|
|
my $read_len_str = "";
|
|
|
|
open(my $fh, "$cmd |") || Fail("Failed to run command '$cmd'\n");
|
|
while(readline $fh) {
|
|
chomp;
|
|
next if /^\s*$/;
|
|
my @ts = split(/ /);
|
|
if (scalar(@ts) > 4) {
|
|
$read_len_str = $ts[4];
|
|
} else {
|
|
$read_len_str = "";
|
|
}
|
|
}
|
|
close($fh);
|
|
|
|
if (($read_len_str ne "") && ($read_len_str ne "0")) {
|
|
Info("Read Length String: $read_len_str\n");
|
|
push @ht2_args, ("--read-lengths", $read_len_str);
|
|
}
|
|
}
|
|
|
|
sub check_file_exist($$$) {
|
|
my ($unps, $mate1s, $mate2s) = @_;
|
|
for my $fn (@$unps, @$mate1s, @$mate2s) {
|
|
if (not -f $fn) {
|
|
Fail("Read file '%s' doesn't exist\n", $fn);
|
|
return 1;
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
sub cat_file($$) {
|
|
my ($ifn, $ofh) = @_;
|
|
my $ifh = undef;
|
|
if($ifn =~ /\.gz$/) {
|
|
open($ifh, "gzip -dc $ifn |") ||
|
|
Fail("Could not open gzipped read file: $ifn \n");
|
|
} elsif($ifn =~ /\.bz2/) {
|
|
open($ifh, "bzip2 -dc $ifn |") ||
|
|
Fail("Could not open bzip2ed read file: $ifn \n");
|
|
} else {
|
|
open($ifh, $ifn) || Fail("Could not open read file: $ifn \n");
|
|
}
|
|
while(readline $ifh) { print {$ofh} $_; }
|
|
close($ifh);
|
|
}
|
|
|
|
# Return non-zero if and only if the input should be wrapped (i.e. because
|
|
# it's compressed).
|
|
sub wrapInput($$$) {
|
|
my ($unps, $mate1s, $mate2s) = @_;
|
|
for my $fn (@$unps, @$mate1s, @$mate2s) {
|
|
return 1 if $fn =~ /\.gz$/ || $fn =~ /\.bz2$/;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
sub Info {
|
|
if ($verbose) {
|
|
print STDERR "(INFO): " ,@_;
|
|
}
|
|
}
|
|
|
|
sub Error {
|
|
my @msg = @_;
|
|
$msg[0] = "(ERR): ".$msg[0];
|
|
printf STDERR @msg;
|
|
}
|
|
|
|
sub Fail {
|
|
Error(@_);
|
|
die("Exiting now ...\n");
|
|
}
|
|
|
|
sub Extract_IndexName_From {
|
|
my $index_opt = $ref_str ? '--index' : '-x';
|
|
for (my $i=0; $i<@_; $i++) {
|
|
if ($_[$i] eq $index_opt){
|
|
my $idx_basename = $_[$i+1];
|
|
my @idx_filenames = glob($idx_basename . "*.ht2{,l}");
|
|
unless(@idx_filenames) {
|
|
if(exists $ENV{"HISAT2_INDEXES"}) {
|
|
@idx_filenames = glob("$ENV{'HISAT2_INDEXES'}/$idx_basename" . "ht2{,l}");
|
|
}
|
|
|
|
if(!@idx_filenames) {
|
|
Fail("\"" . $idx_basename . "\" does not exist\n");
|
|
}
|
|
$idx_basename = "$ENV{'HISAT2_INDEXES'}/$idx_basename"
|
|
}
|
|
|
|
return $idx_basename;
|
|
}
|
|
}
|
|
Info("Cannot find any index option (--reference-string, --ref-string or -x) in the given command line.\n");
|
|
}
|
|
|
|
sub Get_IndexFileName {
|
|
my $isLargeIndex = $_[0];
|
|
my $IndexPrefix = $_[1];
|
|
my $IndexFileName = "${IndexPrefix}.3n.1.1.";
|
|
|
|
if ($isLargeIndex) {
|
|
$IndexFileName = $IndexFileName.$idx_ext_l;
|
|
} else {
|
|
$IndexFileName = $IndexFileName.$idx_ext_s;
|
|
}
|
|
|
|
return $IndexFileName;
|
|
}
|
|
|
|
if($seq_in_args == 0) {
|
|
check_file_exist(\@unps, \@mate1s, \@mate2s);
|
|
}
|
|
|
|
if(wrapInput(\@unps, \@mate1s, \@mate2s)) {
|
|
if(scalar(@mate2s) > 0) {
|
|
#
|
|
# Wrap paired-end inputs
|
|
#
|
|
# Put reads into temporary files or fork off processes to feed named pipes
|
|
scalar(@mate2s) == scalar(@mate1s) ||
|
|
Fail("Different number of files specified with --reads/-1 as with -2\n");
|
|
# Make a named pipe for delivering mate #1s
|
|
my $m1fn = "$temp_dir/$$.inpipe1";
|
|
push @to_delete, $m1fn;
|
|
push @ht2_args, "-1 $m1fn";
|
|
# Create named pipe 1 for writing
|
|
if(!$no_pipes) {
|
|
mkfifo($m1fn, 0700) || Fail("mkfifo($m1fn) failed.\n");
|
|
}
|
|
my $pid = 0;
|
|
$pid = fork() unless $no_pipes;
|
|
push @to_kills, $pid unless $no_pipes;
|
|
if($pid == 0) {
|
|
# Open named pipe 1 for writing
|
|
open(my $ofh, ">$m1fn") || Fail("Can't open '$m1fn' for writing\n");
|
|
for my $ifn (@mate1s) { cat_file($ifn, $ofh); }
|
|
close($ofh);
|
|
exit 0 unless $no_pipes;
|
|
}
|
|
# Make a named pipe for delivering mate #2s
|
|
my $m2fn = "$temp_dir/$$.inpipe2";
|
|
push @to_delete, $m2fn;
|
|
push @ht2_args, "-2 $m2fn";
|
|
# Create named pipe 2 for writing
|
|
if(!$no_pipes) {
|
|
mkfifo($m2fn, 0700) || Fail("mkfifo($m2fn) failed.\n");
|
|
}
|
|
$pid = 0;
|
|
$pid = fork() unless $no_pipes;
|
|
push @to_kills, $pid unless $no_pipes;
|
|
if($pid == 0) {
|
|
# Open named pipe 2 for writing
|
|
open(my $ofh, ">$m2fn") || Fail("Can't open '$m2fn' for writing.\n");
|
|
for my $ifn (@mate2s) { cat_file($ifn, $ofh); }
|
|
close($ofh);
|
|
exit 0 unless $no_pipes;
|
|
}
|
|
}
|
|
if(scalar(@unps) > 0) {
|
|
#
|
|
# Wrap unpaired inputs.
|
|
#
|
|
# Make a named pipe for delivering unpaired reads
|
|
my $ufn = "$temp_dir/$$.unp";
|
|
push @to_delete, $ufn;
|
|
push @ht2_args, "-U $ufn";
|
|
# Create named pipe 2 for writing
|
|
if(!$no_pipes) {
|
|
mkfifo($ufn, 0700) || Fail("mkfifo($ufn) failed.\n");
|
|
}
|
|
my $pid = 0;
|
|
$pid = fork() unless $no_pipes;
|
|
if($pid == 0) {
|
|
# Open named pipe 2 for writing
|
|
open(my $ofh, ">$ufn") || Fail("Can't open '$ufn' for writing.\n");
|
|
for my $ifn (@unps) { cat_file($ifn, $ofh); }
|
|
close($ofh);
|
|
exit 0 unless $no_pipes;
|
|
}
|
|
}
|
|
} else {
|
|
if(scalar(@mate2s) > 0) {
|
|
# Just pass all the mate arguments along to the binary
|
|
push @ht2_args, ("-1", join(",", @mate1s));
|
|
push @ht2_args, ("-2", join(",", @mate2s));
|
|
}
|
|
if(scalar(@unps) > 0) {
|
|
push @ht2_args, ("-U", join(",", @unps));
|
|
}
|
|
}
|
|
|
|
if(defined($ref_str)) {
|
|
my $ofn = "$temp_dir/$$.ref_str.fa";
|
|
open(my $ofh, ">$ofn") ||
|
|
Fail("could not open temporary fasta file '$ofn' for writing.\n");
|
|
print {$ofh} ">1\n$ref_str\n";
|
|
close($ofh);
|
|
push @to_delete, $ofn;
|
|
system("$build_bin $ofn $ofn") == 0 ||
|
|
Fail("hisat2-build returned non-0 exit level.\n");
|
|
push @ht2_args, ("--index", "$ofn");
|
|
push @to_delete, ("$ofn.1.".$idx_ext, "$ofn.2.".$idx_ext,
|
|
"$ofn.3.".$idx_ext, "$ofn.4.".$idx_ext,
|
|
"$ofn.5.".$idx_ext, "$ofn.6.".$idx_ext,
|
|
"$ofn.7.".$idx_ext, "$ofn.8.".$idx_ext);
|
|
}
|
|
|
|
Info("After arg handling:\n");
|
|
Info(" Binary args:\n[ @ht2_args ]\n");
|
|
|
|
sub find_hisat3n_index {
|
|
my $isLargeIndex = $_[0];
|
|
my $IndexName = $_[1];
|
|
|
|
my $basepair = $basechange_pair;
|
|
my $rc_basepair = $convertion_rc{$basechange_pair};
|
|
|
|
my @IndexNames = (
|
|
$IndexName.".3n.$basepair.1",
|
|
$IndexName.".3n.$rc_basepair.1"
|
|
);
|
|
|
|
my @IndexFullName = ();
|
|
if ($isLargeIndex) {
|
|
for (@IndexNames) {
|
|
push @IndexFullName, $_.".$idx_ext_l";
|
|
}
|
|
} else {
|
|
for (@IndexNames) {
|
|
push @IndexFullName, $_.".$idx_ext_s";
|
|
}
|
|
}
|
|
|
|
if (-f $IndexFullName[0] && -f $IndexFullName[1]) {
|
|
return @IndexFullName;
|
|
}
|
|
|
|
# find old-name-format index file
|
|
@IndexNames = (
|
|
$IndexName.".3n.1.1",
|
|
$IndexName.".3n.2.1"
|
|
);
|
|
@IndexFullName = ();
|
|
if ($isLargeIndex) {
|
|
for (@IndexNames) {
|
|
push @IndexFullName, $_.".$idx_ext_l";
|
|
}
|
|
} else {
|
|
for (@IndexNames) {
|
|
push @IndexFullName, $_.".$idx_ext_s";
|
|
}
|
|
}
|
|
|
|
if (-f $IndexFullName[0] && -f $IndexFullName[1]) {
|
|
return @IndexFullName;
|
|
}
|
|
|
|
# empty array
|
|
return ();
|
|
}
|
|
|
|
my $index_name = Extract_IndexName_From(@ht2_args);
|
|
my $def_index_filename_s = $index_name.".3n.$basechange_pair.1.$idx_ext_s";
|
|
my $def_index_filename_l = $index_name.".3n.$basechange_pair.1.$idx_ext_l";
|
|
|
|
if ($large_idx) {
|
|
Info("Using a large index enforced by user.\n");
|
|
$align_prog = $align_prog_l;
|
|
$idx_ext = $idx_ext_l;
|
|
my @IndexNames = find_hisat3n_index($large_idx, $index_name);
|
|
if (scalar @IndexNames == 0) {
|
|
Fail("Cannot find the large index $def_index_filename_l\n");
|
|
}
|
|
Info("Using large index ($IndexNames[0]).\n");
|
|
}
|
|
else {
|
|
my @LargeIndexNames = find_hisat3n_index(1, $index_name);
|
|
my @SmallIndexNames = find_hisat3n_index(0, $index_name);
|
|
|
|
if ((scalar @LargeIndexNames > 0) && (scalar @SmallIndexNames == 0)) {
|
|
Info("Cannot find a small index but a large one seems to be present.\n");
|
|
Info("Switching to using the large index ($LargeIndexNames[0]).\n");
|
|
$align_prog = $align_prog_l;
|
|
$idx_ext = $idx_ext_l;
|
|
}
|
|
else {
|
|
Info("Using the small index (${def_index_filename_s}).\n")
|
|
}
|
|
}
|
|
|
|
my $debug_str = ($debug ? "-debug" : "");
|
|
push @ht2_args, "--3N";
|
|
# Construct command invoking hisat2-align
|
|
my $cmd = "'$align_prog$debug_str' --wrapper basic-0 ".join(" ", @ht2_args);
|
|
|
|
# Possibly add read input on an anonymous pipe
|
|
$cmd = "$readpipe $cmd" if defined($readpipe);
|
|
|
|
Info("$cmd\n");
|
|
my $ret;
|
|
if(defined($cap_out)) {
|
|
# Open HISAT2 pipe
|
|
open(HT, "$cmd |") || Fail("Could not open HISAT2 pipe: '$cmd |'\n");
|
|
# Open output pipe
|
|
my $ofh = *STDOUT;
|
|
my @fhs_to_close = ();
|
|
if($cap_out ne "-") {
|
|
open($ofh, ">$cap_out") ||
|
|
Fail("Could not open output file '$cap_out' for writing.\n");
|
|
}
|
|
my %read_fhs = ();
|
|
for my $i ("al", "un", "al-conc", "al-conc-disc", "un-conc") {
|
|
if(defined($read_fns{$i})) {
|
|
my ($vol, $base_spec_dir, $base_fname) = File::Spec->splitpath($read_fns{$i});
|
|
if (-d $read_fns{$i}) {
|
|
$base_spec_dir = $read_fns{$i};
|
|
$base_fname = undef;
|
|
}
|
|
if($i =~ /-conc$/ || $i =~ /-conc-disc$/) {
|
|
# Open 2 output files, one for mate 1, one for mate 2
|
|
my ($fn1, $fn2);
|
|
if ($base_fname) {
|
|
($fn1, $fn2) = ($base_fname,$base_fname);
|
|
}
|
|
else {
|
|
($fn1, $fn2) = ($i.'-mate',$i.'-mate');
|
|
}
|
|
if($fn1 =~ /%/) {
|
|
$fn1 =~ s/%/1/g; $fn2 =~ s/%/2/g;
|
|
} elsif($fn1 =~ /\.[^.]*$/) {
|
|
$fn1 =~ s/\.([^.]*)$/.1.$1/;
|
|
$fn2 =~ s/\.([^.]*)$/.2.$1/;
|
|
} else {
|
|
$fn1 .= ".1";
|
|
$fn2 .= ".2";
|
|
}
|
|
$fn1 = File::Spec->catpath($vol,$base_spec_dir,$fn1);
|
|
$fn2 = File::Spec->catpath($vol,$base_spec_dir,$fn2);
|
|
$fn1 ne $fn2 || Fail("$fn1\n$fn2\n");
|
|
my ($redir1, $redir2) = (">$fn1", ">$fn2");
|
|
$redir1 = "| gzip -c $redir1" if $read_compress{$i} eq "gzip";
|
|
$redir1 = "| bzip2 -c $redir1" if $read_compress{$i} eq "bzip2";
|
|
$redir2 = "| gzip -c $redir2" if $read_compress{$i} eq "gzip";
|
|
$redir2 = "| bzip2 -c $redir2" if $read_compress{$i} eq "bzip2";
|
|
open($read_fhs{$i}{1}, $redir1) || Fail("Could not open --$i mate-1 output file '$fn1'\n");
|
|
open($read_fhs{$i}{2}, $redir2) || Fail("Could not open --$i mate-2 output file '$fn2'\n");
|
|
push @fhs_to_close, $read_fhs{$i}{1};
|
|
push @fhs_to_close, $read_fhs{$i}{2};
|
|
} else {
|
|
my $redir = ">".File::Spec->catpath($vol,$base_spec_dir,$i."-seqs");
|
|
if ($base_fname) {
|
|
$redir = ">$read_fns{$i}";
|
|
}
|
|
$redir = "| gzip -c $redir" if $read_compress{$i} eq "gzip";
|
|
$redir = "| bzip2 -c $redir" if $read_compress{$i} eq "bzip2";
|
|
open($read_fhs{$i}, $redir) || Fail("Could not open --$i output file '$read_fns{$i}'\n");
|
|
push @fhs_to_close, $read_fhs{$i};
|
|
}
|
|
}
|
|
}
|
|
while(<HT>) {
|
|
chomp;
|
|
my $filt = 0;
|
|
unless(substr($_, 0, 1) eq "@") {
|
|
# If we are supposed to output certain reads to files...
|
|
my $tab1_i = index($_, "\t") + 1;
|
|
my $tab2_i = index($_, "\t", $tab1_i);
|
|
my $fl = substr($_, $tab1_i, $tab2_i - $tab1_i);
|
|
my $unal = ($fl & 4) != 0;
|
|
my $secondary = ($fl & 256) != 0;
|
|
$filt = 1 if $no_unal && $unal;
|
|
if($passthru) {
|
|
if(scalar(keys %read_fhs) == 0) {
|
|
# Next line is read with some whitespace escaped
|
|
my $l = <HT>;
|
|
} else {
|
|
my $mate1 = (($fl & 64) != 0);
|
|
my $mate2 = (($fl & 128) != 0);
|
|
my $unp = !$mate1 && !$mate2;
|
|
my $pair = !$unp;
|
|
# Next line is read with some whitespace escaped
|
|
my $l = <HT>;
|
|
chomp($l);
|
|
$l =~ s/%(..)/chr(hex($1))/eg;
|
|
if((defined($read_fhs{un}) || defined($read_fhs{al})) && $unp && !$secondary) {
|
|
if($unal) {
|
|
# Failed to align
|
|
print {$read_fhs{un}} $l if defined($read_fhs{un});
|
|
} else {
|
|
# Aligned
|
|
print {$read_fhs{al}} $l if defined($read_fhs{al});
|
|
}
|
|
}
|
|
if((defined($read_fhs{"un-conc"}) || defined($read_fhs{"al-conc"}) || defined($read_fhs{"al-conc-disc"})) && $pair && !$secondary) {
|
|
my $conc = (($fl & 2) != 0);
|
|
my $conc_disc = ($fl & 4) == 0 || ($fl & 8) == 0;
|
|
if($conc && $mate1) {
|
|
print {$read_fhs{"al-conc"}{1}} $l if defined($read_fhs{"al-conc"});
|
|
} elsif($conc && $mate2) {
|
|
print {$read_fhs{"al-conc"}{2}} $l if defined($read_fhs{"al-conc"});
|
|
} elsif(!$conc && $mate1) {
|
|
print {$read_fhs{"un-conc"}{1}} $l if defined($read_fhs{"un-conc"});
|
|
} elsif(!$conc && $mate2) {
|
|
print {$read_fhs{"un-conc"}{2}} $l if defined($read_fhs{"un-conc"});
|
|
}
|
|
if($conc_disc && $mate1) {
|
|
print {$read_fhs{"al-conc-disc"}{1}} $l if defined($read_fhs{"al-conc-disc"});
|
|
} elsif($conc_disc && $mate2) {
|
|
print {$read_fhs{"al-conc-disc"}{2}} $l if defined($read_fhs{"al-conc-disc"});
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
print {$ofh} "$_\n" if !$filt;
|
|
}
|
|
for my $k (@fhs_to_close) { close($k); }
|
|
close($ofh);
|
|
close(HT);
|
|
$ret = $?;
|
|
} else {
|
|
$ret = system($cmd);
|
|
}
|
|
kill 'TERM', @to_kills;
|
|
|
|
if(!$keep) { for(@to_delete) { unlink($_); } }
|
|
|
|
if ($ret == -1) {
|
|
Error("Failed to execute hisat2-align: $!\n");
|
|
exit 1;
|
|
} elsif ($ret & 127) {
|
|
my $signm = "(unknown)";
|
|
$signm = $signame[$ret & 127] if defined($signame[$ret & 127]);
|
|
my $ad = "";
|
|
$ad = "(core dumped)" if (($ret & 128) != 0);
|
|
Error("hisat2-align died with signal %d (%s) $ad\n", ($ret & 127), $signm);
|
|
exit 1;
|
|
} elsif($ret != 0) {
|
|
Error("hisat2-align exited with value %d\n", ($ret >> 8));
|
|
}
|
|
exit ($ret >> 8);
|