hisat-3n/scripts/sim/AlignmentCheck.pm

860 lines
24 KiB
Perl
Raw Permalink Normal View History

2025-01-18 13:09:52 +00:00
#!/usr/bin/perl -w
#
# Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
#
# This file is part of Bowtie 2.
#
# Bowtie 2 is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Bowtie 2 is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
#
##
# AlignmentCheck.pm
#
# Read in fasta files containing reference sequences that might be
# aligned to, then read in alignment files, checking each alignment to
# be sure it's sane and consistent with the reference sequence it
# aligns to.
#
package AlignmentCheck;
use strict;
use warnings;
use FindBin qw($Bin);
use lib $Bin;
use DNA;
use Data::Dumper;
##
# Parse a fasta file into the %ref hash
#
sub parseFasta($$) {
my ($fa, $ref) = @_;
print STDERR "Parsing FASTA file $fa...\n";
my $fapipe = "$fa";
$fapipe = "gzip -dc $fa |" if $fa =~ /\.gz$/;
open(FA, $fapipe) || die "Could not open '$fapipe' for reading";
my $name = "";
my $bases = 0;
while(<FA>) {
chomp;
if(/^>/) {
$name = substr($_, 1);
$name =~ s/\s.*//;
print STDERR " parsing sequence $name...\n";
$ref->{std}{$name} = "";
} else {
$name ne "" || die "sequence before name";
$bases += length($_);
$ref->{std}{$name} .= $_;
}
}
print STDERR "Parsed $bases reference bases\n";
close(FA);
}
##
# Create a new alignment checker
#
sub new {
my (
$class,
$name, # name of checker
$fas, # list of fasta files containing reference sequences
# or hash of the references themselves (former if it's
# an array ref latter if it's a hash ref)
$format, # alignment format
$bisC, # whether alignment was w/r/t ref w/ all Cs converted to Ys
$bisCpG, # whether alignment was w/r/t ref w/ all CpGs converted to YpGs
) = @_;
(defined($fas)) || die "Must specify non-empty list of fasta files";
# Set defaults for format, bisC, bisCpG, name
$format = "bowtie" unless defined($format);
$bisC = 0 unless defined($bisC);
$bisCpG = 0 unless defined($bisCpG);
$name = "noname" unless defined($name);
# Parse all the fasta files into the ref hash
my %ref = ();
if(ref($fas) eq "HASH") {
for (keys %$fas) {
$ref{std}{$_} = $fas->{$_}
}
} else {
ref($fas) eq "ARRAY" || die;
foreach (@$fas) { parseFasta($_, \%ref); }
}
return bless {
_name => $name,
_fas => $fas,
_format => $format,
_bisC => $bisC,
_bisCpG => $bisCpG,
_refs => \%ref,
_nals => 0,
_nedits => 0
}, $class;
}
sub name { return $_[0]->{_name} }
sub fas { return $_[0]->{_fas} }
sub format { return $_[0]->{_format} }
sub bisC { return $_[0]->{_bisC} }
sub bisCpG { return $_[0]->{_bisCpG} }
sub refs { return $_[0]->{_refs} }
sub nals { return $_[0]->{_nals} }
sub nedits { return $_[0]->{_nedits} }
sub nrefs { return scalar(keys %{$_[0]->{_refs}}); }
##
# Given a sequence that represents the read oriented s.t. the Watson-
# upstream end is on the left, and given a set of edits to apply to the
# read, also oriented assuming that Watson-upstream is on the left,
# return the string corresponding to the read mutated to match the
# reference.
#
my $nedits = 0;
sub applyEdits($$$) {
my ($seq, $edits, $line) = @_;
my $rfseq = $seq;
my $lpos = length($seq)+1;
$nedits += scalar(@$edits);
foreach (reverse @$edits) {
next unless defined($_);
#print STDERR "Applying edit at $_->{pos}\n";
# Apply the edit
$_->{pos} <= $lpos || die "Edit position $_->{pos} was greater than previous $lpos";
if($_->{qchr} eq "-") {
# Insert
$_->{pos} <= length($rfseq) || die "Read gap pos $_->{pos} was not <= string len ".length($rfseq)."\n$line";
substr($rfseq, $_->{pos}, 0) = $_->{chr};
} elsif($_->{chr} eq "-") {
# Deletion
$_->{pos} < length($rfseq) || die "Ref gap pos $_->{pos} was not < string len ".length($rfseq)."\n$line";
my $dc = substr($rfseq, $_->{pos}, 1);
$dc eq $_->{qchr} ||
die "Edit: $_->{pos}:$_->{chr}>$_->{qchr} but ref char was $dc".
"\n$rfseq\n$line";
substr($rfseq, $_->{pos}, 1) = "";
} else {
# Mismatch
$_->{pos} < length($rfseq) || die "Mismatch pos $_->{pos} was not < string len ".length($rfseq)."\n$line";
substr($rfseq, $_->{pos}, 1) eq $_->{qchr} ||
die "Edit: $_->{pos}:$_->{chr}>$_->{qchr}\n$rfseq\n$line";
substr($rfseq, $_->{pos}, 1) = $_->{chr};
}
}
return $rfseq;
}
##
# Given a list of Bowtie edits, invert them by reversing the list and
# changing the poss to be with respect to the other end of the read.
#
sub invertEdits($$) {
my ($edits, $len) = @_;
@$edits = reverse @$edits;
for (@$edits) {
next unless defined($_);
defined($_->{qchr}) || die;
if($_->{qchr} eq "-") {
$_->{pos} = $len - $_->{pos};
length($_->{chr}) >= 1 || die;
$_->{chr} = reverse $_->{chr};
} else {
$_->{pos} = $len - $_->{pos} - 1;
length($_->{chr}) == 1 || die;
}
}
}
##
# Given an edit string, parses it into a list of hashes and returns
# the list.
#
sub parseEdits($) {
my $editstr = shift;
return undef if (!defined($editstr) || $editstr eq "-" || $editstr eq "");
my @edits = ();
# For each edit
for (split(/,/, $editstr)) {
# Parse pos
my ($pos, $ed) = split(/:/);
defined($ed) || die;
# Parse chr, qchr
my ($chr, $qchr) = split(/>/, $ed);
push @edits, { pos => $pos, chr => $chr, qchr => $qchr };
}
return \@edits;
}
##
# Given an edit string, possibly with 4 semicolon-delimited fields,
# parse it into a set of 4 lists of hashes and return the set as an
# array ref.
#
sub parseAllEdits($) {
my $editstr = shift;
return [ undef, undef, undef, undef ] if ($editstr eq "-" || $editstr eq "");
my @editls = split(/;/, $editstr, -1);
if(scalar(@editls) > 1) {
scalar(@editls) == 4 || die;
return [
parseEdits($editls[0]),
parseEdits($editls[1]),
parseEdits($editls[2]),
parseEdits($editls[3]) ];
} else {
scalar(@editls) == 1 || die;
return [
parseEdits($editls[0]),
undef,
undef,
undef ];
}
}
##
# Given array refs for two lists of edits, one corresponding to the
# nucleotide edit list and the other corresponding to the resolved
# ambiguous base list, eliminate any edits that appear in both lists.
# Really this shouldn't happen, but I observe that merman does report
# an edit in both categories if the reference base is being resolved to
# an incompatible nucleotide, e.g. R>C.
#
sub removeDups($$) {
my ($ned, $aed) = @_;
return unless (defined($ned) && defined($aed));
for my $i (0..scalar(@$ned)-1) {
next unless defined($ned->[$i]);
for my $j (0..scalar(@$aed)-1) {
next unless defined($aed->[$j]);
if($ned->[$i]->{qchr} eq $aed->[$j]->{qchr} &&
$ned->[$i]->{chr} eq $aed->[$j]->{chr} &&
$ned->[$i]->{pos} eq $aed->[$j]->{pos})
{
#print STDERR " Eliminated a duplicate edit\n";
$aed->[$j] = undef;
}
}
}
}
##
# Take all the references in %ref and make both Watson and Crick
# versions where the sequence is in-silico bisulfite treated.
#
sub bisulfiteC($) {
my $ref = shift;
for(keys %{$ref->{std}}) {
$ref->{bisc_fw}{$_} = $ref->{std}{$_};
$ref->{bisc_fw}{$_} = s/C/Y/g;
$ref->{bisc_rc}{$_} = $ref->{std}{$_};
$ref->{bisc_rc}{$_} = s/G/R/g;
}
}
##
# Take all the references in %ref and make both Watson and Crick
# versions where the sequence is in-silico bisulfite treated.
#
sub bisulfiteCpG($) {
my $ref = shift;
for(keys %{$ref->{std}}) {
$ref->{biscpg_fw}{$_} = $ref->{std}{$_};
$ref->{biscpg_fw}{$_} =~ s/CG/YG/g;
$ref->{biscpg_fw}{$_} =~ s/C/T/g;
$ref->{biscpg_rc}{$_} = $ref->{std}{$_};
$ref->{biscpg_rc}{$_} =~ s/CG/CR/g;
$ref->{biscpg_rc}{$_} =~ s/G/A/g;
}
}
##
# Given a Bowtie orientation string, return true iff the 5' end of the
# read is at the left of the printed sequence.
#
sub fivePrimeLeft($) {
return ($_[0] eq "+" || $_[0] eq "W" || $_[0] eq "CR");
}
##
# Given the orientation of the read and the state of the global
# bisulfite variables, determine which version of the reference to
# compare against.
#
sub calcRefType {
my ($self, $orient) = @_;
if($self->bisC || $self->bisCpG) {
if($orient eq "W" || $orient eq "WR" || $orient eq "+") {
return $self->bisC ? "bisc_fw" : "biscpg_fw";
} else {
$orient eq "C" || $orient eq "CR" || $orient eq "-" || die;
return $self->bisC ? "bisc_rc" : "biscpg_rc";
}
} else {
return "std";
}
}
##
# Parse a CIGAR string into parallel arrays of CIGAR operations (M, I, D)
#
sub cigarParse($$$) {
my ($cigar, $ops, $runs) = @_;
my $i = 0;
while($i < length($cigar)) {
substr($cigar, $i) =~ /^([0-9]+)/;
defined($1) || die "Could not parse number at pos $i: '$cigar'";
$i += length($1);
$i < length($cigar) || die;
push @$runs, $1;
my $op = substr($cigar, $i, 1);
defined($op) || die "Could not parse operation at pos $i: '$cigar'";
push @$ops, $op;
$i++;
}
}
##
# Trim a read sequence according to the soft clipping in the CIGAR string.
#
sub cigarTrim($$) {
my ($seq, $cigar) = @_;
my @ops = ();
my @runs = ();
cigarParse($cigar, \@ops, \@runs);
my ($trimup, $trimdn) = (0, 0);
if($ops[0] eq 'S') {
$runs[0] < length($seq) || die "Soft clipped entire alignment!";
$seq = substr($seq, $runs[0]);
$trimup = $runs[0];
}
if(scalar(@ops) > 1 && $ops[-1] eq 'S') {
$runs[-1] < length($seq) || die "Soft clipped entire alignment!";
$seq = substr($seq, 0, -$runs[-1]);
$trimdn = $runs[-1];
}
return ($seq, $trimup, $trimdn);
}
##
# Parse a CIGAR string into a string of operators. Operators are expanded into
# runs where appropriate. = and X are collapsed into M.
#
sub parse_cigar($) {
my ($cigar) = @_;
my $ret = "";
my $i = 0;
my ($rdlen, $rflen) = (0, 0);
while($i < length($cigar)) {
substr($cigar, $i) =~ /^([0-9]+)/;
defined($1) || die "Could not parse number at pos $i: '$cigar'";
my $runlen = $1;
$i += length($1);
$i < length($cigar) || die;
my $op = substr($cigar, $i, 1);
defined($op) || die "Could not parse operation at pos $i: '$cigar'";
if($op eq "X" || $op eq "=") {
$op = "M";
}
$rdlen += $runlen if $op ne "D";
$rflen += $runlen if $op ne "I";
$ret .= ($op x $runlen);
$i++;
}
return ($ret, $rdlen, $rflen);
}
##
# Parse an MD:Z string into a string with length equal to query length. Each
# position contains either a space, if the read matches the reference at that
# position, or a character, if the reference contains a character that doesn't
# match its opposite in the alignment. In the latter case, the character in
# the string is the reference character.
#
sub parse_md($) {
my ($md) = @_;
my $i = 0;
my $ret = "";
while($i < length($md)) {
# Starts with a number?
my $ch = substr($md, $i, 1);
if($ch =~ /[0-9]/) {
# Parse the number off the beginning
substr($md, $i) =~ /^([0-9]+)/;
defined($1) || die "Could not parse number at pos $i: '$md'";
my $runlen = $1;
$ret .= (" " x $runlen) if $runlen > 0;
$i += length($runlen);
} elsif($ch eq "^") {
# Read gap
$i++;
substr($md, $i) =~ /^([A-Za-z]+)/;
defined($1) || die "Could not parse read gap at pos $i: '$md'";
my $chrs = $1;
$i += length($chrs);
$ret .= $chrs;
} else {
# DNA character
$ch =~ /[ACGTN.]/i || die "Bad char '$ch' at pos $i: '$md'";
$ret .= $ch;
$i++;
}
}
return $ret;
}
##
# Given a read sequence (with characters in upstream-to-downstream order with
# respect to the reference - NOT necessarily 5'-to-3') and a CIGAR string and
# an MD:Z string, build the alignment strings. The alignment strings will only
# contain the portion of the read that aligned. Any portions that were either
# hard-trimmed or soft-trimmed are trimmed from this function's result.
#
# For now, I'm assuming that the MD:Z string only describes aligned characters,
# i.e. *after* trimming.
#
sub _read_md_cigar_to_al($$$) {
my ($seq, $md, $cigar) = @_;
my $alread = "";
my $alref = "";
$cigar ne "*" || die "CIGAR string was star!";
$seq ne "" || die "Empty sequence given to _read_md_cigar_to_al";
my $parsed_md = parse_md($md);
my ($parsed_cig, $cigar_rdlen, $cigar_rflen) = parse_cigar($cigar);
my ($rdoff, $mdoff) = (0, 0);
my ($htriml, $striml, $htrimr, $strimr) = (0, 0, 0, 0);
my $nonsh = 0; # have I seen a non-S, non-H CIGAR op?
my $nonh = 0; # have I seen a non-H CIGAR op?
for(my $i = 0; $i < length($parsed_cig); $i++) {
my $cigop = substr($parsed_cig, $i, 1);
$nonh++ if $cigop ne "H";
$nonsh++ if ($cigop ne "H" && $cigop ne "S");
if($cigop eq "S") {
if($nonsh) {
$strimr++;
} else {
$striml++;
}
$rdoff++;
next;
}
if($cigop eq "H") {
if($nonh) {
$htrimr++;
} else {
$htriml++;
}
next;
}
$cigop = "M" if $cigop eq "=" || $cigop eq "X";
if($cigop eq "P") {
# Padding
$alread .= "-";
$alref .= "-";
} elsif($cigop eq "M") {
my $rdc = substr($seq, $rdoff, 1);
$mdoff < length($parsed_md) ||
die "Bad mdoff ($mdoff)\nlength(parsed_md)=".length($parsed_md)."\nseq:\n$seq\ncigar:\n$cigar\nmd:\n$md\nparsed md:\n$parsed_md";
my $rfc = substr($parsed_md, $mdoff, 1);
$rfc = $rdc if $rfc eq " ";
$alread .= $rdc;
$alref .= $rfc;
$rdoff++;
$mdoff++;
} elsif($cigop eq "D") {
# Read gap
# Read: AAA-AAA
# Ref: AAAAAAA
my $rfc = substr($parsed_md, $mdoff, 1);
$rfc ne " " ||
die "Must have a ref character opposite a gap in the read:\n".
"cig: $parsed_cig ($i)\nmd: $parsed_md ($mdoff)\n";
$alread .= "-";
$alref .= $rfc;
$mdoff++;
} else {
# Reference gap
# Read: AAAAAAA
# Ref: AAA-AAA
$cigop eq "I" || die "Unsupported cigop: $cigop in cigar $cigar";
my $rdc = substr($seq, $rdoff, 1);
$alread .= $rdc;
$alref .= "-";
$rdoff++;
# $mdoff SHOULD NOT be incremented here
}
$rdoff <= length($seq) ||
die "Bad rdoff:$rdoff for seq '$seq' cigop=$cigop\nseq: $seq\ncigar=$cigar\nmd=$md";
}
return ($alread, $alref, $htriml, $striml, $htrimr, $strimr);
}
##
# Parse a line from a Bowtie alignment file and check that the
# alignment is sane and consistent with the reference.
#
sub parseBowtieLines {
my ($self, $lines) = @_;
for my $line (@$lines) {
chomp($line);
my ($rdname, $orient, $refname, $off, $seq, $qual, $oms, $editstr,
$flags) = split(/\t/, $line, -1);
next if $refname eq "*";
$flags =~ /XC:([^,\s]+)/;
my $cigar = $1;
defined($cigar) ||
die "Could not parse CIGAR string from flags: '$flags'";
defined($editstr) || die "Failed to get 8 tokens from line:\n$_";
$off == int($off) || die "Offset field (col 4) must be an integer:\n$_";
$oms == int($oms) || die "OMS field (col 7) must be an integer:\n$_";
my $reftype = $self->calcRefType($orient);
defined($self->refs->{$reftype}{$refname}) ||
die "No such refname as $refname for reftype $reftype:\n".
Dumper($self->refs->{$reftype});
my $edits4 = parseAllEdits($editstr);
my ($ned, $aed) = ($edits4->[0], $edits4->[1]);
removeDups($ned, $aed);
my $fpl = fivePrimeLeft($orient);
# Trim seq according to CIGAR string
my $rfseq = $seq;
my ($trimup, $trimdn);
($rfseq, $trimup, $trimdn) = cigarTrim($rfseq, $cigar);
invertEdits($ned, length($rfseq)) unless ($fpl || !defined($ned));
invertEdits($aed, length($rfseq)) unless ($fpl || !defined($aed));
$rfseq = applyEdits($rfseq, $ned, $line) if defined($ned);
$rfseq = applyEdits($rfseq, $aed, $line) if defined($aed);
# Check if our alignment falls off the end of the reference, in
# which case we need to pad the reference string with Ns
my $exoff = $off;
my $padleft = "";
my $exlen = length($rfseq);
my $tlen = length($self->refs->{$reftype}{$refname});
if($exoff < 0) {
# Alignment hangs off LHS; pad it
my $npad = -$exoff;
$padleft = "N" x $npad;
$exlen += $exoff;
$exlen >= 0 ||
die "Read was entirely off the LHS of the reference\n".
"Referemce len=$tlen\n".
"Alignment referemce len=$tlen\n".
"$line\n";
$exoff = 0;
}
my $padright = "";
my $roverhang = $off + length($rfseq) - $tlen;
if($roverhang > 0) {
$padright = "N" x $roverhang;
$exlen -= $roverhang;
$exlen >= 0 ||
die "Read was entirely off the RHS of the reference\n".
"Referemce len=$tlen\n".
"Alignment referemce len=$tlen\n".
"$line\n";
}
my $refsub = substr($self->refs->{$reftype}{$refname}, $exoff, $exlen);
length($refsub) == $exlen ||
die "Tried to extract ref substring of length $exlen, got ".
"\"$refsub\" from \"".$self->refs->{$reftype}{$refname}."\"".
"\n$line\n".
"\noff=$off, rfseq=$rfseq\n";
$refsub = DNA::iupacSubN($refsub);
my $trueRfseq = $padleft . $refsub . $padright;
length($trueRfseq) == length($rfseq) ||
die "Different lengths for edited read and ref:\n".
" Read: $seq\n".
"Edited read: $rfseq\n".
" Ref: $trueRfseq\n";
$rfseq eq $trueRfseq ||
die "Did not match:\n".
" Read: $seq\n".
"Edited read: $rfseq\n".
" Ref: $trueRfseq\n";
$self->{_nals}++;
}
}
##
# Parse a line from a SAM alignment file and check that the
# alignment is sane and consistent with the reference.
#
sub parseSamLines {
my ($self, $lines) = @_;
my ($lastseq, $lastqual) = ("", "");
my $idx = 0;
for my $line (@$lines) {
$idx++;
print STDERR "Processing line...\n";
chomp($line);
next if $line =~ /^\@/;
my @toks = split(/\t/, $line, -1);
my (
$qname, #1
$flag, #2
$rname, #3
$pos, #4
$mapq, #5
$cigar, #6
$rnext, #7
$pnext, #8
$tlen, #9
$seq, #10
$qual) = @toks;
defined($qual) || die "Not enough SAM tokens:\n$line\n";
my @opt_flags_list = @toks[11..$#toks];
my %opt_flags = ();
next if $cigar eq "*"; # Failed to align
# Get the read sequence & qualities from previous record if necessary
if($seq eq "*") {
$lastseq ne "" || die "Line #$idx:\n$line";
$seq = $lastseq;
$qual = $lastqual;
} else {
$lastseq = $seq;
$lastqual = $qual;
}
$seq ne "*" || die;
my ($parsed_cigar, $rdlen_cigar, $rflen_cigar) = parse_cigar($cigar);
length($seq) == $rdlen_cigar ||
die "Sequence length and parsed cigar string length ($rdlen_cigar) mismatch:\n".
"$seq\n$parsed_cigar\nLine:\n$line";
# Stick optional flags into a hash
for my $fl (@opt_flags_list) {
my @fs = split(/:/, $fl, -1);
scalar(@fs) > 2 || die "Bad optional flag: $fl\n$line\n";
$opt_flags{$fs[0]}{type} = $fs[1];
$opt_flags{$fs[0]}{value} = join(":", @fs[2..$#fs]);
}
defined($opt_flags{"MD"}) || die "No MD:Z flag:\n$line\n";
$opt_flags{"MD"}{type} eq "Z" || die "Bad type for MD:Z flag\n$line\n";
my $md = $opt_flags{"MD"}{value};
$pos == int($pos) || die "POS field (col 4) must be an int:\n$line\n";
$pnext == int($pnext) || die "PNEXT field (col 8) must be an int:\n$line\n";
$tlen == int($tlen) || die "TLEN field (col 9) must be an int:\n$line\n";
$mapq == int($mapq) || die "MAPQ field (col 5) must be an int:\n$line\n";
# TODO: deal with bisulfite strands??
my $fw = (($flag & 0x10) == 0);
my $orient = $fw ? "+" : "-";
my $reftype = $self->calcRefType($orient);
defined($self->refs->{$reftype}{$rname}) ||
die "No such refname as $rname for reftype $reftype:\n$line\n".
Dumper($self->refs->{$reftype});
my $exoff = $pos-1; # expected 0-based reference offset
my ($alread, $alref, $htriml, $striml, $htrimr, $strimr) =
_read_md_cigar_to_al($seq, $md, $cigar);
print STDERR "$alread\n$alref\n";
my $rfseq = $alref;
$rfseq =~ s/[ -]//g; # remove spaces & gaps
my $exlen = length($rfseq);
my $refsub = substr($self->refs->{$reftype}{$rname}, $exoff, $exlen);
length($refsub) == $exlen ||
die "Tried to extract ref substring of length $exlen from:\n".
$self->refs->{$reftype}{$rname}.
"\ngot string of length ".length($refsub).":\n".
$refsub.
"\nfrom:\n".
$line.
"\nexlen is the length of:\n$rfseq\npos=$pos, rfseq=$rfseq\n";
$refsub = DNA::iupacSubN($refsub);
my $trueRfseq = $refsub;
length($trueRfseq) == length($rfseq) ||
die "Different lengths for edited read and ref:\n".
" Read: $seq\n".
"Edited read: $rfseq\n".
" Ref: $trueRfseq\n";
$rfseq eq $trueRfseq ||
die "Did not match:\n".
" Read: $seq\n".
"Edited read: $rfseq\n".
" Ref: $trueRfseq\n".
"$line";
$self->{_nals}++;
}
}
##
# Parse lines from a Bowtie alignment file and check that the
# alignments are sane and consistent with the reference.
#
sub parseBowtie {
my ($self, $fh) = @_;
while(<$fh>) {
$self->parseBowtieLines([$_]);
}
}
##
# Parse lines from a SAM alignment file and check that alignments are
# sane and consistent with the reference.
#
sub parseSam {
my ($self, $fh) = @_;
my @lines = ();
while(<$fh>) { push @lines, $_; }
$self->parseSamLines(\@lines);
}
##
# Parse lines from an alignment file of the type given by self->format
#
sub parseLines {
my ($self, $lines) = @_;
if($self->format eq "bowtie") {
$self->parseBowtieLines($lines);
} else {
$self->format eq "sam" || die;
$self->parseSamLines($lines);
}
}
##
# Parse lines from an alignment file of the type given by self->format
#
sub parse {
my ($self, $fh) = @_;
if($self->format eq "bowtie") {
$self->parseBowtie($fh);
} else {
$self->format eq "sam" || die;
$self->parseSam($fh);
}
}
##
# Print summary of how many alignments and edits were checked.
#
sub printSummary {
my $self = shift;
print STDERR "--- Summary ---\n";
print STDERR "Read ".scalar(keys %{$self->refs})." reference strings\n";
print STDERR "Checked $self->{_nals} alignments, $self->{_nedits} edits\n";
print STDERR "---------------\n";
print STDERR "PASSED\n";
}
##
# Check the given batch of alignments. We check that they're
# internally consistent in some basic ways, and we check that the
# sequence and edits are consistent with the reference.
#
# The $als argument is either a list of (possibly compressed) filenames
# of files containing alignments, or a list of alignment strings. If
# the former, $filenames is non-zero.
#
sub checkAlignments {
my ($self, $als, $filenames) = @_;
if($self->bisC) {
print STDERR "Generating all-C bisulfite-treated references\n";
bisulfiteC($self->refs);
}
if($self->bisCpG) {
print STDERR "Generating all-CpG bisulfite-treated references\n";
bisulfiteCpG($self->refs);
}
if($filenames) {
foreach (@$als) {
my $alnpipe = $_;
print STDERR "Processing alignment file '$_'\n";
$alnpipe = "gzip -dc $_ |" if ($_ =~ /\.gz$/);
my $alnfh = undef;
open($alnfh, $alnpipe) || die "Could not open '$alnpipe' for reading";
$self->parse($alnfh);
close($alnfh);
}
} else {
$self->parseLines($als);
}
}
##
# Check simple alignments
#
sub test1 {
my $ac = AlignmentCheck->new(
"AlignmentCheck.pm test1 checker",
{ "r1" => "TTGTTCGT" },
"bowtie",
0,
0
);
$ac->checkAlignments([
"0\t+\tr1\t1\tTGTTCGT\tIIIIIII\t40\t-",
"1\t+\tr1\t0\tTTGTTCG\tIIIIIII\t40\t-",
"2\t+\tr1\t2\tGTTCGTA\tIIIIIII\t40\t6:N>A",
"3\t+\tr1\t-1\tATTGTTC\tIIIIIII\t40\t0:N>A"], 0);
return 1;
}
##
# Check simple alignments from files
#
sub test2 {
open(TMP, ">/tmp/.AlignmentCheck.pm.fa") || die;
print TMP ">r1\n";
print TMP "TTGTTCGT\n";
close(TMP);
my $ac = AlignmentCheck->new(
"AlignmentCheck.pm test1 checker",
[ "/tmp/.AlignmentCheck.pm.fa" ],
"bowtie",
0,
0
);
$ac->checkAlignments([
"0\t+\tr1\t1\tTGTTCGT\tIIIIIII\t40\t-",
"1\t+\tr1\t0\tTTGTTCG\tIIIIIII\t40\t-",
"2\t+\tr1\t2\tGTTCGTA\tIIIIIII\t40\t6:N>A",
"3\t+\tr1\t-1\tATTGTTC\tIIIIIII\t40\t0:N>A"], 0);
return 1;
}
if($0 =~ /[^0-9a-zA-Z_]?AlignmentCheck\.pm$/) {
my @fas = ();
my @als = ();
my $format = "sam";
my $bisC = 0;
my $bisCpG = 0;
my $test = 0;
use Getopt::Long;
GetOptions (
"test" => \$test,
"fasta|ref=s" => \@fas,
"als|aln=s" => \@als,
"format=s" => \$format,
"bis-C|bisulfite-C" => \$bisC,
"bis-CpG|bisulfite-CpG" => \$bisCpG,
"bowtie" => sub {$format = "bowtie"},
"sam" => sub {$format = "sam"}) || die;
if($test) {
use Test;
print "Running unit tests\n";
# Run unit tests
Test::shouldSucceed("test1", \&test1);
Test::shouldSucceed("test2", \&test2);
exit 0;
}
my $ac = AlignmentCheck->new(
"AlignmentCheck.pm checker",
\@fas,
$format,
$bisC,
$bisCpG);
$ac->checkAlignments(\@als, 1);
}
1;