860 lines
24 KiB
Perl
860 lines
24 KiB
Perl
|
#!/usr/bin/perl -w
|
||
|
|
||
|
#
|
||
|
# Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
|
||
|
#
|
||
|
# This file is part of Bowtie 2.
|
||
|
#
|
||
|
# Bowtie 2 is free software: you can redistribute it and/or modify
|
||
|
# it under the terms of the GNU General Public License as published by
|
||
|
# the Free Software Foundation, either version 3 of the License, or
|
||
|
# (at your option) any later version.
|
||
|
#
|
||
|
# Bowtie 2 is distributed in the hope that it will be useful,
|
||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
|
# GNU General Public License for more details.
|
||
|
#
|
||
|
# You should have received a copy of the GNU General Public License
|
||
|
# along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
|
||
|
#
|
||
|
|
||
|
##
|
||
|
# AlignmentCheck.pm
|
||
|
#
|
||
|
# Read in fasta files containing reference sequences that might be
|
||
|
# aligned to, then read in alignment files, checking each alignment to
|
||
|
# be sure it's sane and consistent with the reference sequence it
|
||
|
# aligns to.
|
||
|
#
|
||
|
|
||
|
package AlignmentCheck;
|
||
|
use strict;
|
||
|
use warnings;
|
||
|
use FindBin qw($Bin);
|
||
|
use lib $Bin;
|
||
|
use DNA;
|
||
|
use Data::Dumper;
|
||
|
|
||
|
##
|
||
|
# Parse a fasta file into the %ref hash
|
||
|
#
|
||
|
sub parseFasta($$) {
|
||
|
my ($fa, $ref) = @_;
|
||
|
print STDERR "Parsing FASTA file $fa...\n";
|
||
|
my $fapipe = "$fa";
|
||
|
$fapipe = "gzip -dc $fa |" if $fa =~ /\.gz$/;
|
||
|
open(FA, $fapipe) || die "Could not open '$fapipe' for reading";
|
||
|
my $name = "";
|
||
|
my $bases = 0;
|
||
|
while(<FA>) {
|
||
|
chomp;
|
||
|
if(/^>/) {
|
||
|
$name = substr($_, 1);
|
||
|
$name =~ s/\s.*//;
|
||
|
print STDERR " parsing sequence $name...\n";
|
||
|
$ref->{std}{$name} = "";
|
||
|
} else {
|
||
|
$name ne "" || die "sequence before name";
|
||
|
$bases += length($_);
|
||
|
$ref->{std}{$name} .= $_;
|
||
|
}
|
||
|
}
|
||
|
print STDERR "Parsed $bases reference bases\n";
|
||
|
close(FA);
|
||
|
}
|
||
|
|
||
|
##
|
||
|
# Create a new alignment checker
|
||
|
#
|
||
|
sub new {
|
||
|
my (
|
||
|
$class,
|
||
|
$name, # name of checker
|
||
|
$fas, # list of fasta files containing reference sequences
|
||
|
# or hash of the references themselves (former if it's
|
||
|
# an array ref latter if it's a hash ref)
|
||
|
$format, # alignment format
|
||
|
$bisC, # whether alignment was w/r/t ref w/ all Cs converted to Ys
|
||
|
$bisCpG, # whether alignment was w/r/t ref w/ all CpGs converted to YpGs
|
||
|
) = @_;
|
||
|
(defined($fas)) || die "Must specify non-empty list of fasta files";
|
||
|
# Set defaults for format, bisC, bisCpG, name
|
||
|
$format = "bowtie" unless defined($format);
|
||
|
$bisC = 0 unless defined($bisC);
|
||
|
$bisCpG = 0 unless defined($bisCpG);
|
||
|
$name = "noname" unless defined($name);
|
||
|
# Parse all the fasta files into the ref hash
|
||
|
my %ref = ();
|
||
|
if(ref($fas) eq "HASH") {
|
||
|
for (keys %$fas) {
|
||
|
$ref{std}{$_} = $fas->{$_}
|
||
|
}
|
||
|
} else {
|
||
|
ref($fas) eq "ARRAY" || die;
|
||
|
foreach (@$fas) { parseFasta($_, \%ref); }
|
||
|
}
|
||
|
return bless {
|
||
|
_name => $name,
|
||
|
_fas => $fas,
|
||
|
_format => $format,
|
||
|
_bisC => $bisC,
|
||
|
_bisCpG => $bisCpG,
|
||
|
_refs => \%ref,
|
||
|
_nals => 0,
|
||
|
_nedits => 0
|
||
|
}, $class;
|
||
|
}
|
||
|
sub name { return $_[0]->{_name} }
|
||
|
sub fas { return $_[0]->{_fas} }
|
||
|
sub format { return $_[0]->{_format} }
|
||
|
sub bisC { return $_[0]->{_bisC} }
|
||
|
sub bisCpG { return $_[0]->{_bisCpG} }
|
||
|
sub refs { return $_[0]->{_refs} }
|
||
|
sub nals { return $_[0]->{_nals} }
|
||
|
sub nedits { return $_[0]->{_nedits} }
|
||
|
sub nrefs { return scalar(keys %{$_[0]->{_refs}}); }
|
||
|
|
||
|
##
|
||
|
# Given a sequence that represents the read oriented s.t. the Watson-
|
||
|
# upstream end is on the left, and given a set of edits to apply to the
|
||
|
# read, also oriented assuming that Watson-upstream is on the left,
|
||
|
# return the string corresponding to the read mutated to match the
|
||
|
# reference.
|
||
|
#
|
||
|
my $nedits = 0;
|
||
|
sub applyEdits($$$) {
|
||
|
my ($seq, $edits, $line) = @_;
|
||
|
my $rfseq = $seq;
|
||
|
my $lpos = length($seq)+1;
|
||
|
$nedits += scalar(@$edits);
|
||
|
foreach (reverse @$edits) {
|
||
|
next unless defined($_);
|
||
|
#print STDERR "Applying edit at $_->{pos}\n";
|
||
|
# Apply the edit
|
||
|
$_->{pos} <= $lpos || die "Edit position $_->{pos} was greater than previous $lpos";
|
||
|
if($_->{qchr} eq "-") {
|
||
|
# Insert
|
||
|
$_->{pos} <= length($rfseq) || die "Read gap pos $_->{pos} was not <= string len ".length($rfseq)."\n$line";
|
||
|
substr($rfseq, $_->{pos}, 0) = $_->{chr};
|
||
|
} elsif($_->{chr} eq "-") {
|
||
|
# Deletion
|
||
|
$_->{pos} < length($rfseq) || die "Ref gap pos $_->{pos} was not < string len ".length($rfseq)."\n$line";
|
||
|
my $dc = substr($rfseq, $_->{pos}, 1);
|
||
|
$dc eq $_->{qchr} ||
|
||
|
die "Edit: $_->{pos}:$_->{chr}>$_->{qchr} but ref char was $dc".
|
||
|
"\n$rfseq\n$line";
|
||
|
substr($rfseq, $_->{pos}, 1) = "";
|
||
|
} else {
|
||
|
# Mismatch
|
||
|
$_->{pos} < length($rfseq) || die "Mismatch pos $_->{pos} was not < string len ".length($rfseq)."\n$line";
|
||
|
substr($rfseq, $_->{pos}, 1) eq $_->{qchr} ||
|
||
|
die "Edit: $_->{pos}:$_->{chr}>$_->{qchr}\n$rfseq\n$line";
|
||
|
substr($rfseq, $_->{pos}, 1) = $_->{chr};
|
||
|
}
|
||
|
}
|
||
|
return $rfseq;
|
||
|
}
|
||
|
|
||
|
##
|
||
|
# Given a list of Bowtie edits, invert them by reversing the list and
|
||
|
# changing the poss to be with respect to the other end of the read.
|
||
|
#
|
||
|
sub invertEdits($$) {
|
||
|
my ($edits, $len) = @_;
|
||
|
@$edits = reverse @$edits;
|
||
|
for (@$edits) {
|
||
|
next unless defined($_);
|
||
|
defined($_->{qchr}) || die;
|
||
|
if($_->{qchr} eq "-") {
|
||
|
$_->{pos} = $len - $_->{pos};
|
||
|
length($_->{chr}) >= 1 || die;
|
||
|
$_->{chr} = reverse $_->{chr};
|
||
|
} else {
|
||
|
$_->{pos} = $len - $_->{pos} - 1;
|
||
|
length($_->{chr}) == 1 || die;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
##
|
||
|
# Given an edit string, parses it into a list of hashes and returns
|
||
|
# the list.
|
||
|
#
|
||
|
sub parseEdits($) {
|
||
|
my $editstr = shift;
|
||
|
return undef if (!defined($editstr) || $editstr eq "-" || $editstr eq "");
|
||
|
my @edits = ();
|
||
|
# For each edit
|
||
|
for (split(/,/, $editstr)) {
|
||
|
# Parse pos
|
||
|
my ($pos, $ed) = split(/:/);
|
||
|
defined($ed) || die;
|
||
|
# Parse chr, qchr
|
||
|
my ($chr, $qchr) = split(/>/, $ed);
|
||
|
push @edits, { pos => $pos, chr => $chr, qchr => $qchr };
|
||
|
}
|
||
|
return \@edits;
|
||
|
}
|
||
|
|
||
|
##
|
||
|
# Given an edit string, possibly with 4 semicolon-delimited fields,
|
||
|
# parse it into a set of 4 lists of hashes and return the set as an
|
||
|
# array ref.
|
||
|
#
|
||
|
sub parseAllEdits($) {
|
||
|
my $editstr = shift;
|
||
|
return [ undef, undef, undef, undef ] if ($editstr eq "-" || $editstr eq "");
|
||
|
my @editls = split(/;/, $editstr, -1);
|
||
|
if(scalar(@editls) > 1) {
|
||
|
scalar(@editls) == 4 || die;
|
||
|
return [
|
||
|
parseEdits($editls[0]),
|
||
|
parseEdits($editls[1]),
|
||
|
parseEdits($editls[2]),
|
||
|
parseEdits($editls[3]) ];
|
||
|
} else {
|
||
|
scalar(@editls) == 1 || die;
|
||
|
return [
|
||
|
parseEdits($editls[0]),
|
||
|
undef,
|
||
|
undef,
|
||
|
undef ];
|
||
|
}
|
||
|
}
|
||
|
|
||
|
##
|
||
|
# Given array refs for two lists of edits, one corresponding to the
|
||
|
# nucleotide edit list and the other corresponding to the resolved
|
||
|
# ambiguous base list, eliminate any edits that appear in both lists.
|
||
|
# Really this shouldn't happen, but I observe that merman does report
|
||
|
# an edit in both categories if the reference base is being resolved to
|
||
|
# an incompatible nucleotide, e.g. R>C.
|
||
|
#
|
||
|
sub removeDups($$) {
|
||
|
my ($ned, $aed) = @_;
|
||
|
return unless (defined($ned) && defined($aed));
|
||
|
for my $i (0..scalar(@$ned)-1) {
|
||
|
next unless defined($ned->[$i]);
|
||
|
for my $j (0..scalar(@$aed)-1) {
|
||
|
next unless defined($aed->[$j]);
|
||
|
if($ned->[$i]->{qchr} eq $aed->[$j]->{qchr} &&
|
||
|
$ned->[$i]->{chr} eq $aed->[$j]->{chr} &&
|
||
|
$ned->[$i]->{pos} eq $aed->[$j]->{pos})
|
||
|
{
|
||
|
#print STDERR " Eliminated a duplicate edit\n";
|
||
|
$aed->[$j] = undef;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
##
|
||
|
# Take all the references in %ref and make both Watson and Crick
|
||
|
# versions where the sequence is in-silico bisulfite treated.
|
||
|
#
|
||
|
sub bisulfiteC($) {
|
||
|
my $ref = shift;
|
||
|
for(keys %{$ref->{std}}) {
|
||
|
$ref->{bisc_fw}{$_} = $ref->{std}{$_};
|
||
|
$ref->{bisc_fw}{$_} = s/C/Y/g;
|
||
|
$ref->{bisc_rc}{$_} = $ref->{std}{$_};
|
||
|
$ref->{bisc_rc}{$_} = s/G/R/g;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
##
|
||
|
# Take all the references in %ref and make both Watson and Crick
|
||
|
# versions where the sequence is in-silico bisulfite treated.
|
||
|
#
|
||
|
sub bisulfiteCpG($) {
|
||
|
my $ref = shift;
|
||
|
for(keys %{$ref->{std}}) {
|
||
|
$ref->{biscpg_fw}{$_} = $ref->{std}{$_};
|
||
|
$ref->{biscpg_fw}{$_} =~ s/CG/YG/g;
|
||
|
$ref->{biscpg_fw}{$_} =~ s/C/T/g;
|
||
|
$ref->{biscpg_rc}{$_} = $ref->{std}{$_};
|
||
|
$ref->{biscpg_rc}{$_} =~ s/CG/CR/g;
|
||
|
$ref->{biscpg_rc}{$_} =~ s/G/A/g;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
##
|
||
|
# Given a Bowtie orientation string, return true iff the 5' end of the
|
||
|
# read is at the left of the printed sequence.
|
||
|
#
|
||
|
sub fivePrimeLeft($) {
|
||
|
return ($_[0] eq "+" || $_[0] eq "W" || $_[0] eq "CR");
|
||
|
}
|
||
|
|
||
|
##
|
||
|
# Given the orientation of the read and the state of the global
|
||
|
# bisulfite variables, determine which version of the reference to
|
||
|
# compare against.
|
||
|
#
|
||
|
sub calcRefType {
|
||
|
my ($self, $orient) = @_;
|
||
|
if($self->bisC || $self->bisCpG) {
|
||
|
if($orient eq "W" || $orient eq "WR" || $orient eq "+") {
|
||
|
return $self->bisC ? "bisc_fw" : "biscpg_fw";
|
||
|
} else {
|
||
|
$orient eq "C" || $orient eq "CR" || $orient eq "-" || die;
|
||
|
return $self->bisC ? "bisc_rc" : "biscpg_rc";
|
||
|
}
|
||
|
} else {
|
||
|
return "std";
|
||
|
}
|
||
|
}
|
||
|
|
||
|
##
|
||
|
# Parse a CIGAR string into parallel arrays of CIGAR operations (M, I, D)
|
||
|
#
|
||
|
sub cigarParse($$$) {
|
||
|
my ($cigar, $ops, $runs) = @_;
|
||
|
my $i = 0;
|
||
|
while($i < length($cigar)) {
|
||
|
substr($cigar, $i) =~ /^([0-9]+)/;
|
||
|
defined($1) || die "Could not parse number at pos $i: '$cigar'";
|
||
|
$i += length($1);
|
||
|
$i < length($cigar) || die;
|
||
|
push @$runs, $1;
|
||
|
my $op = substr($cigar, $i, 1);
|
||
|
defined($op) || die "Could not parse operation at pos $i: '$cigar'";
|
||
|
push @$ops, $op;
|
||
|
$i++;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
##
|
||
|
# Trim a read sequence according to the soft clipping in the CIGAR string.
|
||
|
#
|
||
|
sub cigarTrim($$) {
|
||
|
my ($seq, $cigar) = @_;
|
||
|
my @ops = ();
|
||
|
my @runs = ();
|
||
|
cigarParse($cigar, \@ops, \@runs);
|
||
|
my ($trimup, $trimdn) = (0, 0);
|
||
|
if($ops[0] eq 'S') {
|
||
|
$runs[0] < length($seq) || die "Soft clipped entire alignment!";
|
||
|
$seq = substr($seq, $runs[0]);
|
||
|
$trimup = $runs[0];
|
||
|
}
|
||
|
if(scalar(@ops) > 1 && $ops[-1] eq 'S') {
|
||
|
$runs[-1] < length($seq) || die "Soft clipped entire alignment!";
|
||
|
$seq = substr($seq, 0, -$runs[-1]);
|
||
|
$trimdn = $runs[-1];
|
||
|
}
|
||
|
return ($seq, $trimup, $trimdn);
|
||
|
}
|
||
|
|
||
|
##
|
||
|
# Parse a CIGAR string into a string of operators. Operators are expanded into
|
||
|
# runs where appropriate. = and X are collapsed into M.
|
||
|
#
|
||
|
sub parse_cigar($) {
|
||
|
my ($cigar) = @_;
|
||
|
my $ret = "";
|
||
|
my $i = 0;
|
||
|
my ($rdlen, $rflen) = (0, 0);
|
||
|
while($i < length($cigar)) {
|
||
|
substr($cigar, $i) =~ /^([0-9]+)/;
|
||
|
defined($1) || die "Could not parse number at pos $i: '$cigar'";
|
||
|
my $runlen = $1;
|
||
|
$i += length($1);
|
||
|
$i < length($cigar) || die;
|
||
|
my $op = substr($cigar, $i, 1);
|
||
|
defined($op) || die "Could not parse operation at pos $i: '$cigar'";
|
||
|
if($op eq "X" || $op eq "=") {
|
||
|
$op = "M";
|
||
|
}
|
||
|
$rdlen += $runlen if $op ne "D";
|
||
|
$rflen += $runlen if $op ne "I";
|
||
|
$ret .= ($op x $runlen);
|
||
|
$i++;
|
||
|
}
|
||
|
return ($ret, $rdlen, $rflen);
|
||
|
}
|
||
|
|
||
|
##
|
||
|
# Parse an MD:Z string into a string with length equal to query length. Each
|
||
|
# position contains either a space, if the read matches the reference at that
|
||
|
# position, or a character, if the reference contains a character that doesn't
|
||
|
# match its opposite in the alignment. In the latter case, the character in
|
||
|
# the string is the reference character.
|
||
|
#
|
||
|
sub parse_md($) {
|
||
|
my ($md) = @_;
|
||
|
my $i = 0;
|
||
|
my $ret = "";
|
||
|
while($i < length($md)) {
|
||
|
# Starts with a number?
|
||
|
my $ch = substr($md, $i, 1);
|
||
|
if($ch =~ /[0-9]/) {
|
||
|
# Parse the number off the beginning
|
||
|
substr($md, $i) =~ /^([0-9]+)/;
|
||
|
defined($1) || die "Could not parse number at pos $i: '$md'";
|
||
|
my $runlen = $1;
|
||
|
$ret .= (" " x $runlen) if $runlen > 0;
|
||
|
$i += length($runlen);
|
||
|
} elsif($ch eq "^") {
|
||
|
# Read gap
|
||
|
$i++;
|
||
|
substr($md, $i) =~ /^([A-Za-z]+)/;
|
||
|
defined($1) || die "Could not parse read gap at pos $i: '$md'";
|
||
|
my $chrs = $1;
|
||
|
$i += length($chrs);
|
||
|
$ret .= $chrs;
|
||
|
} else {
|
||
|
# DNA character
|
||
|
$ch =~ /[ACGTN.]/i || die "Bad char '$ch' at pos $i: '$md'";
|
||
|
$ret .= $ch;
|
||
|
$i++;
|
||
|
}
|
||
|
}
|
||
|
return $ret;
|
||
|
}
|
||
|
|
||
|
##
|
||
|
# Given a read sequence (with characters in upstream-to-downstream order with
|
||
|
# respect to the reference - NOT necessarily 5'-to-3') and a CIGAR string and
|
||
|
# an MD:Z string, build the alignment strings. The alignment strings will only
|
||
|
# contain the portion of the read that aligned. Any portions that were either
|
||
|
# hard-trimmed or soft-trimmed are trimmed from this function's result.
|
||
|
#
|
||
|
# For now, I'm assuming that the MD:Z string only describes aligned characters,
|
||
|
# i.e. *after* trimming.
|
||
|
#
|
||
|
sub _read_md_cigar_to_al($$$) {
|
||
|
my ($seq, $md, $cigar) = @_;
|
||
|
my $alread = "";
|
||
|
my $alref = "";
|
||
|
$cigar ne "*" || die "CIGAR string was star!";
|
||
|
$seq ne "" || die "Empty sequence given to _read_md_cigar_to_al";
|
||
|
my $parsed_md = parse_md($md);
|
||
|
my ($parsed_cig, $cigar_rdlen, $cigar_rflen) = parse_cigar($cigar);
|
||
|
my ($rdoff, $mdoff) = (0, 0);
|
||
|
my ($htriml, $striml, $htrimr, $strimr) = (0, 0, 0, 0);
|
||
|
my $nonsh = 0; # have I seen a non-S, non-H CIGAR op?
|
||
|
my $nonh = 0; # have I seen a non-H CIGAR op?
|
||
|
for(my $i = 0; $i < length($parsed_cig); $i++) {
|
||
|
my $cigop = substr($parsed_cig, $i, 1);
|
||
|
$nonh++ if $cigop ne "H";
|
||
|
$nonsh++ if ($cigop ne "H" && $cigop ne "S");
|
||
|
if($cigop eq "S") {
|
||
|
if($nonsh) {
|
||
|
$strimr++;
|
||
|
} else {
|
||
|
$striml++;
|
||
|
}
|
||
|
$rdoff++;
|
||
|
next;
|
||
|
}
|
||
|
if($cigop eq "H") {
|
||
|
if($nonh) {
|
||
|
$htrimr++;
|
||
|
} else {
|
||
|
$htriml++;
|
||
|
}
|
||
|
next;
|
||
|
}
|
||
|
$cigop = "M" if $cigop eq "=" || $cigop eq "X";
|
||
|
if($cigop eq "P") {
|
||
|
# Padding
|
||
|
$alread .= "-";
|
||
|
$alref .= "-";
|
||
|
} elsif($cigop eq "M") {
|
||
|
my $rdc = substr($seq, $rdoff, 1);
|
||
|
$mdoff < length($parsed_md) ||
|
||
|
die "Bad mdoff ($mdoff)\nlength(parsed_md)=".length($parsed_md)."\nseq:\n$seq\ncigar:\n$cigar\nmd:\n$md\nparsed md:\n$parsed_md";
|
||
|
my $rfc = substr($parsed_md, $mdoff, 1);
|
||
|
$rfc = $rdc if $rfc eq " ";
|
||
|
$alread .= $rdc;
|
||
|
$alref .= $rfc;
|
||
|
$rdoff++;
|
||
|
$mdoff++;
|
||
|
} elsif($cigop eq "D") {
|
||
|
# Read gap
|
||
|
# Read: AAA-AAA
|
||
|
# Ref: AAAAAAA
|
||
|
my $rfc = substr($parsed_md, $mdoff, 1);
|
||
|
$rfc ne " " ||
|
||
|
die "Must have a ref character opposite a gap in the read:\n".
|
||
|
"cig: $parsed_cig ($i)\nmd: $parsed_md ($mdoff)\n";
|
||
|
$alread .= "-";
|
||
|
$alref .= $rfc;
|
||
|
$mdoff++;
|
||
|
} else {
|
||
|
# Reference gap
|
||
|
# Read: AAAAAAA
|
||
|
# Ref: AAA-AAA
|
||
|
$cigop eq "I" || die "Unsupported cigop: $cigop in cigar $cigar";
|
||
|
my $rdc = substr($seq, $rdoff, 1);
|
||
|
$alread .= $rdc;
|
||
|
$alref .= "-";
|
||
|
$rdoff++;
|
||
|
# $mdoff SHOULD NOT be incremented here
|
||
|
}
|
||
|
$rdoff <= length($seq) ||
|
||
|
die "Bad rdoff:$rdoff for seq '$seq' cigop=$cigop\nseq: $seq\ncigar=$cigar\nmd=$md";
|
||
|
}
|
||
|
return ($alread, $alref, $htriml, $striml, $htrimr, $strimr);
|
||
|
}
|
||
|
|
||
|
##
|
||
|
# Parse a line from a Bowtie alignment file and check that the
|
||
|
# alignment is sane and consistent with the reference.
|
||
|
#
|
||
|
sub parseBowtieLines {
|
||
|
my ($self, $lines) = @_;
|
||
|
for my $line (@$lines) {
|
||
|
chomp($line);
|
||
|
my ($rdname, $orient, $refname, $off, $seq, $qual, $oms, $editstr,
|
||
|
$flags) = split(/\t/, $line, -1);
|
||
|
next if $refname eq "*";
|
||
|
$flags =~ /XC:([^,\s]+)/;
|
||
|
my $cigar = $1;
|
||
|
defined($cigar) ||
|
||
|
die "Could not parse CIGAR string from flags: '$flags'";
|
||
|
defined($editstr) || die "Failed to get 8 tokens from line:\n$_";
|
||
|
$off == int($off) || die "Offset field (col 4) must be an integer:\n$_";
|
||
|
$oms == int($oms) || die "OMS field (col 7) must be an integer:\n$_";
|
||
|
my $reftype = $self->calcRefType($orient);
|
||
|
defined($self->refs->{$reftype}{$refname}) ||
|
||
|
die "No such refname as $refname for reftype $reftype:\n".
|
||
|
Dumper($self->refs->{$reftype});
|
||
|
my $edits4 = parseAllEdits($editstr);
|
||
|
my ($ned, $aed) = ($edits4->[0], $edits4->[1]);
|
||
|
removeDups($ned, $aed);
|
||
|
my $fpl = fivePrimeLeft($orient);
|
||
|
# Trim seq according to CIGAR string
|
||
|
my $rfseq = $seq;
|
||
|
my ($trimup, $trimdn);
|
||
|
($rfseq, $trimup, $trimdn) = cigarTrim($rfseq, $cigar);
|
||
|
invertEdits($ned, length($rfseq)) unless ($fpl || !defined($ned));
|
||
|
invertEdits($aed, length($rfseq)) unless ($fpl || !defined($aed));
|
||
|
$rfseq = applyEdits($rfseq, $ned, $line) if defined($ned);
|
||
|
$rfseq = applyEdits($rfseq, $aed, $line) if defined($aed);
|
||
|
# Check if our alignment falls off the end of the reference, in
|
||
|
# which case we need to pad the reference string with Ns
|
||
|
my $exoff = $off;
|
||
|
my $padleft = "";
|
||
|
my $exlen = length($rfseq);
|
||
|
my $tlen = length($self->refs->{$reftype}{$refname});
|
||
|
if($exoff < 0) {
|
||
|
# Alignment hangs off LHS; pad it
|
||
|
my $npad = -$exoff;
|
||
|
$padleft = "N" x $npad;
|
||
|
$exlen += $exoff;
|
||
|
$exlen >= 0 ||
|
||
|
die "Read was entirely off the LHS of the reference\n".
|
||
|
"Referemce len=$tlen\n".
|
||
|
"Alignment referemce len=$tlen\n".
|
||
|
"$line\n";
|
||
|
$exoff = 0;
|
||
|
}
|
||
|
my $padright = "";
|
||
|
my $roverhang = $off + length($rfseq) - $tlen;
|
||
|
if($roverhang > 0) {
|
||
|
$padright = "N" x $roverhang;
|
||
|
$exlen -= $roverhang;
|
||
|
$exlen >= 0 ||
|
||
|
die "Read was entirely off the RHS of the reference\n".
|
||
|
"Referemce len=$tlen\n".
|
||
|
"Alignment referemce len=$tlen\n".
|
||
|
"$line\n";
|
||
|
}
|
||
|
my $refsub = substr($self->refs->{$reftype}{$refname}, $exoff, $exlen);
|
||
|
length($refsub) == $exlen ||
|
||
|
die "Tried to extract ref substring of length $exlen, got ".
|
||
|
"\"$refsub\" from \"".$self->refs->{$reftype}{$refname}."\"".
|
||
|
"\n$line\n".
|
||
|
"\noff=$off, rfseq=$rfseq\n";
|
||
|
$refsub = DNA::iupacSubN($refsub);
|
||
|
my $trueRfseq = $padleft . $refsub . $padright;
|
||
|
length($trueRfseq) == length($rfseq) ||
|
||
|
die "Different lengths for edited read and ref:\n".
|
||
|
" Read: $seq\n".
|
||
|
"Edited read: $rfseq\n".
|
||
|
" Ref: $trueRfseq\n";
|
||
|
$rfseq eq $trueRfseq ||
|
||
|
die "Did not match:\n".
|
||
|
" Read: $seq\n".
|
||
|
"Edited read: $rfseq\n".
|
||
|
" Ref: $trueRfseq\n";
|
||
|
$self->{_nals}++;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
##
|
||
|
# Parse a line from a SAM alignment file and check that the
|
||
|
# alignment is sane and consistent with the reference.
|
||
|
#
|
||
|
sub parseSamLines {
|
||
|
my ($self, $lines) = @_;
|
||
|
my ($lastseq, $lastqual) = ("", "");
|
||
|
my $idx = 0;
|
||
|
for my $line (@$lines) {
|
||
|
$idx++;
|
||
|
print STDERR "Processing line...\n";
|
||
|
chomp($line);
|
||
|
next if $line =~ /^\@/;
|
||
|
my @toks = split(/\t/, $line, -1);
|
||
|
my (
|
||
|
$qname, #1
|
||
|
$flag, #2
|
||
|
$rname, #3
|
||
|
$pos, #4
|
||
|
$mapq, #5
|
||
|
$cigar, #6
|
||
|
$rnext, #7
|
||
|
$pnext, #8
|
||
|
$tlen, #9
|
||
|
$seq, #10
|
||
|
$qual) = @toks;
|
||
|
defined($qual) || die "Not enough SAM tokens:\n$line\n";
|
||
|
my @opt_flags_list = @toks[11..$#toks];
|
||
|
my %opt_flags = ();
|
||
|
next if $cigar eq "*"; # Failed to align
|
||
|
# Get the read sequence & qualities from previous record if necessary
|
||
|
if($seq eq "*") {
|
||
|
$lastseq ne "" || die "Line #$idx:\n$line";
|
||
|
$seq = $lastseq;
|
||
|
$qual = $lastqual;
|
||
|
} else {
|
||
|
$lastseq = $seq;
|
||
|
$lastqual = $qual;
|
||
|
}
|
||
|
$seq ne "*" || die;
|
||
|
my ($parsed_cigar, $rdlen_cigar, $rflen_cigar) = parse_cigar($cigar);
|
||
|
length($seq) == $rdlen_cigar ||
|
||
|
die "Sequence length and parsed cigar string length ($rdlen_cigar) mismatch:\n".
|
||
|
"$seq\n$parsed_cigar\nLine:\n$line";
|
||
|
# Stick optional flags into a hash
|
||
|
for my $fl (@opt_flags_list) {
|
||
|
my @fs = split(/:/, $fl, -1);
|
||
|
scalar(@fs) > 2 || die "Bad optional flag: $fl\n$line\n";
|
||
|
$opt_flags{$fs[0]}{type} = $fs[1];
|
||
|
$opt_flags{$fs[0]}{value} = join(":", @fs[2..$#fs]);
|
||
|
}
|
||
|
defined($opt_flags{"MD"}) || die "No MD:Z flag:\n$line\n";
|
||
|
$opt_flags{"MD"}{type} eq "Z" || die "Bad type for MD:Z flag\n$line\n";
|
||
|
my $md = $opt_flags{"MD"}{value};
|
||
|
$pos == int($pos) || die "POS field (col 4) must be an int:\n$line\n";
|
||
|
$pnext == int($pnext) || die "PNEXT field (col 8) must be an int:\n$line\n";
|
||
|
$tlen == int($tlen) || die "TLEN field (col 9) must be an int:\n$line\n";
|
||
|
$mapq == int($mapq) || die "MAPQ field (col 5) must be an int:\n$line\n";
|
||
|
# TODO: deal with bisulfite strands??
|
||
|
my $fw = (($flag & 0x10) == 0);
|
||
|
my $orient = $fw ? "+" : "-";
|
||
|
my $reftype = $self->calcRefType($orient);
|
||
|
defined($self->refs->{$reftype}{$rname}) ||
|
||
|
die "No such refname as $rname for reftype $reftype:\n$line\n".
|
||
|
Dumper($self->refs->{$reftype});
|
||
|
my $exoff = $pos-1; # expected 0-based reference offset
|
||
|
my ($alread, $alref, $htriml, $striml, $htrimr, $strimr) =
|
||
|
_read_md_cigar_to_al($seq, $md, $cigar);
|
||
|
print STDERR "$alread\n$alref\n";
|
||
|
my $rfseq = $alref;
|
||
|
$rfseq =~ s/[ -]//g; # remove spaces & gaps
|
||
|
my $exlen = length($rfseq);
|
||
|
my $refsub = substr($self->refs->{$reftype}{$rname}, $exoff, $exlen);
|
||
|
length($refsub) == $exlen ||
|
||
|
die "Tried to extract ref substring of length $exlen from:\n".
|
||
|
$self->refs->{$reftype}{$rname}.
|
||
|
"\ngot string of length ".length($refsub).":\n".
|
||
|
$refsub.
|
||
|
"\nfrom:\n".
|
||
|
$line.
|
||
|
"\nexlen is the length of:\n$rfseq\npos=$pos, rfseq=$rfseq\n";
|
||
|
$refsub = DNA::iupacSubN($refsub);
|
||
|
my $trueRfseq = $refsub;
|
||
|
length($trueRfseq) == length($rfseq) ||
|
||
|
die "Different lengths for edited read and ref:\n".
|
||
|
" Read: $seq\n".
|
||
|
"Edited read: $rfseq\n".
|
||
|
" Ref: $trueRfseq\n";
|
||
|
$rfseq eq $trueRfseq ||
|
||
|
die "Did not match:\n".
|
||
|
" Read: $seq\n".
|
||
|
"Edited read: $rfseq\n".
|
||
|
" Ref: $trueRfseq\n".
|
||
|
"$line";
|
||
|
$self->{_nals}++;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
##
|
||
|
# Parse lines from a Bowtie alignment file and check that the
|
||
|
# alignments are sane and consistent with the reference.
|
||
|
#
|
||
|
sub parseBowtie {
|
||
|
my ($self, $fh) = @_;
|
||
|
while(<$fh>) {
|
||
|
$self->parseBowtieLines([$_]);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
##
|
||
|
# Parse lines from a SAM alignment file and check that alignments are
|
||
|
# sane and consistent with the reference.
|
||
|
#
|
||
|
sub parseSam {
|
||
|
my ($self, $fh) = @_;
|
||
|
my @lines = ();
|
||
|
while(<$fh>) { push @lines, $_; }
|
||
|
$self->parseSamLines(\@lines);
|
||
|
}
|
||
|
|
||
|
##
|
||
|
# Parse lines from an alignment file of the type given by self->format
|
||
|
#
|
||
|
sub parseLines {
|
||
|
my ($self, $lines) = @_;
|
||
|
if($self->format eq "bowtie") {
|
||
|
$self->parseBowtieLines($lines);
|
||
|
} else {
|
||
|
$self->format eq "sam" || die;
|
||
|
$self->parseSamLines($lines);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
##
|
||
|
# Parse lines from an alignment file of the type given by self->format
|
||
|
#
|
||
|
sub parse {
|
||
|
my ($self, $fh) = @_;
|
||
|
if($self->format eq "bowtie") {
|
||
|
$self->parseBowtie($fh);
|
||
|
} else {
|
||
|
$self->format eq "sam" || die;
|
||
|
$self->parseSam($fh);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
##
|
||
|
# Print summary of how many alignments and edits were checked.
|
||
|
#
|
||
|
sub printSummary {
|
||
|
my $self = shift;
|
||
|
print STDERR "--- Summary ---\n";
|
||
|
print STDERR "Read ".scalar(keys %{$self->refs})." reference strings\n";
|
||
|
print STDERR "Checked $self->{_nals} alignments, $self->{_nedits} edits\n";
|
||
|
print STDERR "---------------\n";
|
||
|
print STDERR "PASSED\n";
|
||
|
}
|
||
|
|
||
|
##
|
||
|
# Check the given batch of alignments. We check that they're
|
||
|
# internally consistent in some basic ways, and we check that the
|
||
|
# sequence and edits are consistent with the reference.
|
||
|
#
|
||
|
# The $als argument is either a list of (possibly compressed) filenames
|
||
|
# of files containing alignments, or a list of alignment strings. If
|
||
|
# the former, $filenames is non-zero.
|
||
|
#
|
||
|
sub checkAlignments {
|
||
|
my ($self, $als, $filenames) = @_;
|
||
|
if($self->bisC) {
|
||
|
print STDERR "Generating all-C bisulfite-treated references\n";
|
||
|
bisulfiteC($self->refs);
|
||
|
}
|
||
|
if($self->bisCpG) {
|
||
|
print STDERR "Generating all-CpG bisulfite-treated references\n";
|
||
|
bisulfiteCpG($self->refs);
|
||
|
}
|
||
|
if($filenames) {
|
||
|
foreach (@$als) {
|
||
|
my $alnpipe = $_;
|
||
|
print STDERR "Processing alignment file '$_'\n";
|
||
|
$alnpipe = "gzip -dc $_ |" if ($_ =~ /\.gz$/);
|
||
|
my $alnfh = undef;
|
||
|
open($alnfh, $alnpipe) || die "Could not open '$alnpipe' for reading";
|
||
|
$self->parse($alnfh);
|
||
|
close($alnfh);
|
||
|
}
|
||
|
} else {
|
||
|
$self->parseLines($als);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
##
|
||
|
# Check simple alignments
|
||
|
#
|
||
|
sub test1 {
|
||
|
my $ac = AlignmentCheck->new(
|
||
|
"AlignmentCheck.pm test1 checker",
|
||
|
{ "r1" => "TTGTTCGT" },
|
||
|
"bowtie",
|
||
|
0,
|
||
|
0
|
||
|
);
|
||
|
$ac->checkAlignments([
|
||
|
"0\t+\tr1\t1\tTGTTCGT\tIIIIIII\t40\t-",
|
||
|
"1\t+\tr1\t0\tTTGTTCG\tIIIIIII\t40\t-",
|
||
|
"2\t+\tr1\t2\tGTTCGTA\tIIIIIII\t40\t6:N>A",
|
||
|
"3\t+\tr1\t-1\tATTGTTC\tIIIIIII\t40\t0:N>A"], 0);
|
||
|
return 1;
|
||
|
}
|
||
|
|
||
|
##
|
||
|
# Check simple alignments from files
|
||
|
#
|
||
|
sub test2 {
|
||
|
open(TMP, ">/tmp/.AlignmentCheck.pm.fa") || die;
|
||
|
print TMP ">r1\n";
|
||
|
print TMP "TTGTTCGT\n";
|
||
|
close(TMP);
|
||
|
my $ac = AlignmentCheck->new(
|
||
|
"AlignmentCheck.pm test1 checker",
|
||
|
[ "/tmp/.AlignmentCheck.pm.fa" ],
|
||
|
"bowtie",
|
||
|
0,
|
||
|
0
|
||
|
);
|
||
|
$ac->checkAlignments([
|
||
|
"0\t+\tr1\t1\tTGTTCGT\tIIIIIII\t40\t-",
|
||
|
"1\t+\tr1\t0\tTTGTTCG\tIIIIIII\t40\t-",
|
||
|
"2\t+\tr1\t2\tGTTCGTA\tIIIIIII\t40\t6:N>A",
|
||
|
"3\t+\tr1\t-1\tATTGTTC\tIIIIIII\t40\t0:N>A"], 0);
|
||
|
return 1;
|
||
|
}
|
||
|
|
||
|
if($0 =~ /[^0-9a-zA-Z_]?AlignmentCheck\.pm$/) {
|
||
|
my @fas = ();
|
||
|
my @als = ();
|
||
|
my $format = "sam";
|
||
|
my $bisC = 0;
|
||
|
my $bisCpG = 0;
|
||
|
my $test = 0;
|
||
|
|
||
|
use Getopt::Long;
|
||
|
GetOptions (
|
||
|
"test" => \$test,
|
||
|
"fasta|ref=s" => \@fas,
|
||
|
"als|aln=s" => \@als,
|
||
|
"format=s" => \$format,
|
||
|
"bis-C|bisulfite-C" => \$bisC,
|
||
|
"bis-CpG|bisulfite-CpG" => \$bisCpG,
|
||
|
"bowtie" => sub {$format = "bowtie"},
|
||
|
"sam" => sub {$format = "sam"}) || die;
|
||
|
|
||
|
if($test) {
|
||
|
use Test;
|
||
|
print "Running unit tests\n";
|
||
|
# Run unit tests
|
||
|
Test::shouldSucceed("test1", \&test1);
|
||
|
Test::shouldSucceed("test2", \&test2);
|
||
|
exit 0;
|
||
|
}
|
||
|
|
||
|
my $ac = AlignmentCheck->new(
|
||
|
"AlignmentCheck.pm checker",
|
||
|
\@fas,
|
||
|
$format,
|
||
|
$bisC,
|
||
|
$bisCpG);
|
||
|
$ac->checkAlignments(\@als, 1);
|
||
|
}
|
||
|
|
||
|
1;
|