1053 lines
30 KiB
Perl
1053 lines
30 KiB
Perl
#!/usr/bin/perl -w
|
|
|
|
#
|
|
# Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
|
|
#
|
|
# This file is part of Bowtie 2.
|
|
#
|
|
# Bowtie 2 is free software: you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# Bowtie 2 is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
|
|
#
|
|
|
|
package Sim;
|
|
use strict;
|
|
use Carp;
|
|
use FindBin qw($Bin);
|
|
use lib $Bin;
|
|
use DNA;
|
|
use Test;
|
|
use RandDNA;
|
|
use SampleRead;
|
|
use Mutate;
|
|
use AlignmentCheck;
|
|
use Math::Random;
|
|
use List::Util qw(max min);
|
|
use POSIX;
|
|
|
|
##
|
|
# Replacement for "die" that additionally writes error message to file so that
|
|
# run.pl can read it later.
|
|
#
|
|
sub mydie($) {
|
|
my $fn = ".run.pl.child.$$";
|
|
open(EO, ">$fn") || die "Could not open $fn for writing";
|
|
print EO "$_[0]\n";
|
|
close(EO);
|
|
confess $_[0];
|
|
}
|
|
|
|
# Generates random printable strings of a given length
|
|
sub randStr($) {
|
|
my $len = shift;
|
|
my @chars = ('a'..'z', 'A'..'Z', '0'..'9', '_');
|
|
my $str = "";
|
|
foreach (1..$len) {
|
|
$str .= $chars[int(rand(scalar(@chars)))];
|
|
}
|
|
return $str;
|
|
}
|
|
|
|
##
|
|
# Default random generator for number of reference per test case.
|
|
#
|
|
sub defaultRefNumGen() { return int(Math::Random::random_exponential(1, 8))+1; }
|
|
|
|
##
|
|
# Default random generator for reference length.
|
|
#
|
|
sub defaultRefLenGen() {
|
|
return int(Math::Random::random_exponential(1, 50000))+1;
|
|
}
|
|
|
|
##
|
|
# Default random generator for number of reference per test case.
|
|
#
|
|
sub defaultReadNumGen() {
|
|
return int(Math::Random::random_exponential(1, 10000))+1;
|
|
}
|
|
|
|
##
|
|
# Default random generator for read length.
|
|
#
|
|
sub defaultFragLenGen() {
|
|
return int(Math::Random::random_normal(1, 200, 40))+1;
|
|
}
|
|
|
|
##
|
|
# Default random generator for reference length.
|
|
#
|
|
sub defaultReadLenGen() {
|
|
my $r = int(rand(3));
|
|
if($r == 0) {
|
|
return int(Math::Random::random_exponential(1, 60))+1;
|
|
} elsif($r == 1) {
|
|
return int(Math::Random::random_exponential(1, 20))+1;
|
|
} else {
|
|
return int(Math::Random::random_exponential(1, 150))+1;
|
|
}
|
|
}
|
|
|
|
##
|
|
# Default random generator for fraction of reference characters = N.
|
|
#
|
|
sub defaultNGen() {
|
|
return Math::Random::random_uniform(1, 0, 0.05);
|
|
}
|
|
|
|
##
|
|
# Default random generator for fraction of reference characters = an
|
|
# ambiguous IUPAC code.
|
|
#
|
|
sub defaultIupacGen() {
|
|
return Math::Random::random_uniform(1, 0, 0.01);
|
|
}
|
|
|
|
##
|
|
# Default random generator for AT/ACGT fraction.
|
|
#
|
|
sub defaultAtGen() {
|
|
return min(max(Math::Random::random_normal(1, 0.5, 0.18), 0), 1);
|
|
}
|
|
|
|
##
|
|
# Default random generator for A/AT fraction.
|
|
#
|
|
sub defaultAGen() {
|
|
return min(max(Math::Random::random_normal(1, 0.5, 0.18), 0), 1);
|
|
}
|
|
|
|
##
|
|
# Default random generator for C/CG fraction.
|
|
#
|
|
sub defaultCGen() {
|
|
return min(max(Math::Random::random_normal(1, 0.5, 0.18), 0), 1);
|
|
}
|
|
|
|
##
|
|
# Default SNP rate generator. Doesn't generate the SNP per se, just
|
|
# the rate.
|
|
#
|
|
sub defaultSNPGen() {
|
|
return Math::Random::random_uniform(1, 0, 0.05);
|
|
}
|
|
|
|
##
|
|
# Default read gap rate generator. Doesn't generate the gaps or
|
|
# lengths, just the rate.
|
|
#
|
|
sub defaultRdGapGen() {
|
|
return Math::Random::random_uniform(1, 0, 0.005);
|
|
}
|
|
|
|
##
|
|
# Default reference gap rate generator. Doesn't generate the gaps or
|
|
# lengths, just the rate.
|
|
#
|
|
sub defaultRfGapGen() {
|
|
return Math::Random::random_uniform(1, 0, 0.005);
|
|
}
|
|
|
|
##
|
|
# Default rearrangement rate generator.
|
|
#
|
|
sub defaultRearrGen() {
|
|
return Math::Random::random_uniform(1, 0, 0.005);
|
|
}
|
|
|
|
##
|
|
# Default function for generating gap lengths when introducing a gap.
|
|
#
|
|
sub defaultGapLenGen($) {
|
|
return int(Math::Random::random_exponential(1, 3))+1;
|
|
}
|
|
|
|
##
|
|
# Default function for generating random sequence to insert into a gap.
|
|
#
|
|
sub defaultSeqGen($) {
|
|
my $len = shift;
|
|
($len == int($len) && $len > 0) ||
|
|
mydie("Bad length for sequence generator: $len");
|
|
my $ret = "";
|
|
for (1..$len) {
|
|
$ret .= substr("ACGT", int(rand(4)), 1);
|
|
}
|
|
return $ret;
|
|
}
|
|
|
|
##
|
|
# Default sequencing miscall rate generator.
|
|
#
|
|
sub defaultSeqMmGen() {
|
|
return Math::Random::random_uniform(1, 0, 0.1);
|
|
}
|
|
|
|
##
|
|
# Create a new test case simulator
|
|
#
|
|
sub new {
|
|
my (
|
|
$class,
|
|
$name, # name of simulator
|
|
$rfnumgen, # number of reference sequences
|
|
$rflengen, # reference length
|
|
$rdnumgen, # number of read sequences per run
|
|
$rdlengen, # read length generator
|
|
$fraglengen, # fragment length generator
|
|
$ngen, # N fraction
|
|
$iupacgen, # Non-A/C/G/T/N IUPAC fraction (after N fraction removed)
|
|
$atgen, # AT fraction (after N/IUPAC fractions removed)
|
|
$agen, # A fraction (of AT)
|
|
$cgen, # C fraction (of CG)
|
|
$snpgen, # SNP rate gen
|
|
$rdgapgen, # read gap generator
|
|
$rfgapgen, # ref gap generator
|
|
$rearrgen, # rearrangement generator
|
|
$gaplengen, # gap length generator
|
|
$seqgen, # gap filler sequence generator
|
|
$seqmm, # sequencing error generator
|
|
) = @_;
|
|
$rfnumgen = \&defaultRefNumGen unless defined($rfnumgen);
|
|
$rflengen = \&defaultRefLenGen unless defined($rflengen);
|
|
$rdnumgen = \&defaultReadNumGen unless defined($rdnumgen);
|
|
$rdlengen = \&defaultReadLenGen unless defined($rdlengen);
|
|
$fraglengen = \&defaultFragLenGen unless defined($fraglengen);
|
|
$ngen = \&defaultNGen unless defined($ngen);
|
|
$iupacgen = \&defaultIupacGen unless defined($iupacgen);
|
|
$atgen = \&defaultAtGen unless defined($atgen);
|
|
$agen = \&defaultAGen unless defined($agen);
|
|
$cgen = \&defaultCGen unless defined($cgen);
|
|
$snpgen = \&defaultSNPGen unless defined($snpgen);
|
|
$rdgapgen = \&defaultRdGapGen unless defined($rdgapgen);
|
|
$rfgapgen = \&defaultRfGapGen unless defined($rfgapgen);
|
|
$rearrgen = \&defaultRearrGen unless defined($rearrgen);
|
|
$gaplengen = \&defaultGapLenGen unless defined($gaplengen);
|
|
$seqgen = \&defaultSeqGen unless defined($seqgen);
|
|
$seqmm = \&defaultSeqMmGen unless defined($seqmm);
|
|
$name = "noname" unless defined($name);
|
|
return bless {
|
|
_name => $name,
|
|
_rfnumgen => $rfnumgen,
|
|
_rflengen => $rflengen,
|
|
_rdnumgen => $rdnumgen,
|
|
_rdlengen => $rdlengen,
|
|
_fraglengen => $fraglengen,
|
|
_ngen => $ngen,
|
|
_iupacgen => $iupacgen,
|
|
_atgen => $atgen,
|
|
_agen => $agen,
|
|
_cgen => $cgen,
|
|
_snpgen => $snpgen,
|
|
_rdgapgen => $rdgapgen,
|
|
_rfgapgen => $rfgapgen,
|
|
_rearrgen => $rearrgen,
|
|
_gaplengen => $gaplengen,
|
|
_seqgen => $seqgen,
|
|
_seqmm => $seqmm,
|
|
}, $class;
|
|
}
|
|
sub rfnumgen { return $_[0]->{_rfnumgen} }
|
|
sub rflengen { return $_[0]->{_rflengen} }
|
|
sub rdnumgen { return $_[0]->{_rdnumgen} }
|
|
sub rdlengen { return $_[0]->{_rdlengen} }
|
|
sub fraglengen { return $_[0]->{_fraglengen} }
|
|
sub ngen { return $_[0]->{_ngen} }
|
|
sub iupacgen { return $_[0]->{_iupacgen} }
|
|
sub atgen { return $_[0]->{_atgen} }
|
|
sub agen { return $_[0]->{_agen} }
|
|
sub cgen { return $_[0]->{_cgen} }
|
|
sub snpgen { return $_[0]->{_snpgen} }
|
|
sub rdgapgen { return $_[0]->{_rdgapgen} }
|
|
sub rfgapgen { return $_[0]->{_rfgapgen} }
|
|
sub rearrgen { return $_[0]->{_rearrgen} }
|
|
sub gaplengen { return $_[0]->{_gaplengen} }
|
|
sub seqgen { return $_[0]->{_seqgen} }
|
|
sub seqmm { return $_[0]->{_seqmm} }
|
|
|
|
##
|
|
# Generate DNA generator.
|
|
#
|
|
sub genDNAgen {
|
|
my $self = shift;
|
|
my $nfrac = $self->ngen->();
|
|
my $iupacfrac = $self->iupacgen->();
|
|
my $atfrac = $self->atgen->();
|
|
my $afrac = $self->agen->();
|
|
my $cfrac = $self->cgen->();
|
|
my $refdnagen = RandDNA->new(
|
|
"Sim.pm gen",
|
|
$nfrac,
|
|
$iupacfrac,
|
|
$atfrac,
|
|
$afrac,
|
|
$cfrac);
|
|
printf STDERR "Created DNA generator\n";
|
|
printf STDERR " N frac: %0.3f\n", $nfrac;
|
|
printf STDERR " IUPAC frac: %0.3f\n", $iupacfrac;
|
|
printf STDERR " AT/ACGT frac: %0.3f\n", $atfrac;
|
|
printf STDERR " A/AT frac: %0.3f\n", $afrac;
|
|
printf STDERR " C/CG frac: %0.3f\n", $cfrac;
|
|
return $refdnagen;
|
|
}
|
|
|
|
##
|
|
# Generate and print reference sequences to file of given name. Also,
|
|
# install reference sequences into hash ref $ref. To allow for
|
|
# "overhang" (alignment that hang off the end of the reference), we
|
|
# actually write out a little bit less than the full reference sequence
|
|
# for each sequence.
|
|
#
|
|
sub genRef {
|
|
my ($self, $ref, $refdnagen, $conf, $tmpfn) = @_;
|
|
# Get a generator for reference length
|
|
my $reflen = $self->rflengen;
|
|
# Generate the number of references
|
|
my $refnum = $self->rfnumgen->();
|
|
$refnum = sqrt($refnum) if $conf->{small};
|
|
$refnum = 1 if $refnum <= 0;
|
|
$refnum = sqrt($refnum) if $conf->{small};
|
|
$refnum = 1 if $refnum <= 0;
|
|
$refnum = ceil($refnum);
|
|
$refnum = $conf->{numrefs} if defined($conf->{numrefs});
|
|
# Open output file
|
|
open (FA, ">$tmpfn") ||
|
|
mydie("Could not open temporary fasta file '$tmpfn' for writing");
|
|
my %ccnt = ();
|
|
print STDERR "Generating $refnum references\n";
|
|
for (1..$refnum) {
|
|
# Randomly generate length
|
|
my $len = $reflen->();
|
|
$len = sqrt($len) if $conf->{small};
|
|
$len = 1 if $len <= 0;
|
|
$len = ceil($len);
|
|
my $seq = $refdnagen->nextSeq($len);
|
|
length($seq) >= $len || die;
|
|
my $name = "Sim.pm.$_";
|
|
$ref->{$name} = $seq;
|
|
# Select amount to trim from LHS
|
|
my $trimleft = int(Math::Random::random_exponential(1, 200));
|
|
# Select amount to trim from RHS
|
|
my $trimright = int(Math::Random::random_exponential(1, 200));
|
|
# Make sure we're leaving some sequence after trimming
|
|
while($trimleft + $trimright > $len) {
|
|
if(int(rand(2))) {
|
|
$trimleft = int($trimleft*0.5);
|
|
} else {
|
|
$trimright = int($trimright*0.5);
|
|
}
|
|
}
|
|
# Trim the sequence
|
|
substr($seq, 0, $trimleft) = "";
|
|
$seq = substr($seq, 0, length($seq)-$trimright);
|
|
my $trimlen = length($seq);
|
|
$trimlen == $len - $trimleft - $trimright ||
|
|
mydie("Unexpected trim combo: $len, $trimleft, $trimright, $trimlen");
|
|
print STDERR " Generated reference '$name' of untrimmed length $len, trimmed length $trimlen\n";
|
|
print FA ">$name\n";
|
|
my $buf = "";
|
|
length($seq) >= $trimlen || die;
|
|
for my $i (1..$trimlen) {
|
|
my $c = substr($seq, $i-1, 1);
|
|
defined($c) || die;
|
|
$ccnt{$c}++;
|
|
$buf .= $c;
|
|
$ref->{$name} .= $c;
|
|
if($i % 60 == 0) {
|
|
print FA "$buf\n";
|
|
$buf = "";
|
|
}
|
|
}
|
|
print FA "$buf\n" if $buf ne "";
|
|
}
|
|
close(FA);
|
|
print STDERR "Wrote references to $tmpfn\n";
|
|
for my $k (sort keys %ccnt) {
|
|
print STDERR " $k: $ccnt{$k}\n";
|
|
}
|
|
}
|
|
|
|
##
|
|
# Generate a hash of key/value arguments to pass to bowtie2.
|
|
#
|
|
sub genBuildArgs {
|
|
my ($self) = @_;
|
|
my %args = ();
|
|
my $r1 = int(rand(3));
|
|
if($r1 == 0) {
|
|
$args{"--bmaxdivn"} = int(Math::Random::random_exponential(1, 4))+1;
|
|
} elsif($r1 == 1) {
|
|
$args{"--bmax"} = int(Math::Random::random_exponential(1, 10000))+100;
|
|
}
|
|
my $r2 = int(rand(2));
|
|
if($r2 == 0) {
|
|
$args{"--dcv"} = 2 ** (int(rand(10))+4);
|
|
}
|
|
my $r3 = int(rand(5));
|
|
if($r3 == 0) {
|
|
$args{"--packed"} = "";
|
|
}
|
|
my $r4 = int(rand(3));
|
|
if($r4 == 0) {
|
|
$args{"--offrate"} = int(rand(8))+1;
|
|
}
|
|
return \%args;
|
|
}
|
|
|
|
##
|
|
# Given a fasta filename, an index basename, and a path to the
|
|
# bowtie2-build executable, build nucleotide-space and colorpace
|
|
# indexes for the sequences in the fasta file.
|
|
#
|
|
sub build {
|
|
my ($self, $fa, $idx, $conf, $args) = @_;
|
|
my $argstr = "";
|
|
for (keys %$args) {
|
|
$argstr .= " $_";
|
|
if($args->{$_} ne "") {
|
|
$argstr .= " ".$args->{$_};
|
|
}
|
|
}
|
|
$argstr .= " --sanity";
|
|
# Build nucleotide index
|
|
my $cmd = "$conf->{bowtie2_build_debug} $argstr $fa $idx";
|
|
print STDERR "$cmd\n";
|
|
system($cmd);
|
|
$? == 0 || mydie("Error running '$cmd'; exitlevel=$?");
|
|
print STDERR "Built nucleotide index '$idx'\n";
|
|
# Build colorspace index
|
|
unless($conf->{no_color}) {
|
|
$cmd = "$conf->{bowtie2_build_debug} $argstr -C $fa ${idx}.c";
|
|
print STDERR "$cmd\n";
|
|
system($cmd);
|
|
$? == 0 || mydie("Error running '$cmd'; exitlevel=$?");
|
|
print STDERR "Built colorspace index '$idx'\n";
|
|
}
|
|
}
|
|
|
|
##
|
|
# Given a hash of sequences, flatten all IUPAC codes into unambiguous
|
|
# nucleotides.
|
|
#
|
|
sub flattenIUPAC() {
|
|
my ($self, $h) = @_;
|
|
for my $c (keys %$h) {
|
|
my $len = length($h->{$c});
|
|
for my $i (0..$len-1) {
|
|
my $ch = uc substr($h->{$c}, $i, 1);
|
|
my $nc = $ch;
|
|
if(DNA::isIUPAC($ch) || $ch eq "N") {
|
|
if(rand() < $self->snpgen->()) {
|
|
$nc = DNA::pickIncompat($ch);
|
|
defined($nc) || mydie("Couldn't find incompatible base for $ch");
|
|
} else {
|
|
$nc = DNA::pickCompat($ch);
|
|
defined($nc) || mydie("Couldn't find compatible base for $ch");
|
|
}
|
|
}
|
|
if($ch ne $nc) {
|
|
substr($h->{$c}, $i, 1) = $nc;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
##
|
|
# Mutate reference genome into a subject genome.
|
|
#
|
|
sub mutate() {
|
|
my ($self, $refs) = @_;
|
|
my %subj = %$refs;
|
|
$self->flattenIUPAC(\%subj);
|
|
print STDERR "Flattened IUPAC characters\n";
|
|
my $mutator = Mutate->new(
|
|
"Sim.pm mutator",
|
|
$self->snpgen,
|
|
$self->rdgapgen,
|
|
$self->rfgapgen,
|
|
$self->rearrgen,
|
|
$self->gaplengen,
|
|
$self->seqgen);
|
|
my ($nsnp, $nrfgap, $nrdgap, $nrearr) = (0, 0, 0, 0);
|
|
for(keys %subj) {
|
|
print STDERR " Mutating sequence $_\n";
|
|
my ($nsnp_, $nrfgap_, $nrdgap_, $nrearr_) = $mutator->mutateSeq($_, \%subj);
|
|
$nsnp += $nsnp_;
|
|
$nrfgap += $nrfgap_;
|
|
$nrdgap += $nrdgap_;
|
|
$nrearr += $nrearr_;
|
|
}
|
|
print STDERR "Mutated reference genome to subject genome\n";
|
|
print STDERR " SNPs introduced: $nsnp\n";
|
|
print STDERR " Reference gaps introduced: $nrfgap\n";
|
|
print STDERR " Read gaps introduced: $nrdgap\n";
|
|
print STDERR " Rearrangements introduced: $nrearr\n";
|
|
return \%subj;
|
|
}
|
|
|
|
sub dumpFastq {
|
|
my ($self, $input, $fh1, $fh2) = @_;
|
|
for (1..scalar(@{$input->{seq1s}})) {
|
|
my $seq1 = $input->{seq1s}->[$_-1];
|
|
my $qual1 = $input->{qual1s}->[$_-1];
|
|
print {$fh1} "\@$_\n";
|
|
print {$fh1} "$seq1\n";
|
|
print {$fh1} "+$_\n";
|
|
print {$fh1} "$qual1\n";
|
|
if($input->{paired}) {
|
|
my $seq2 = $input->{seq2s}->[$_-1];
|
|
my $qual2 = $input->{qual2s}->[$_-1];
|
|
print {$fh2} "\@$_\n";
|
|
print {$fh2} "$seq2\n";
|
|
print {$fh2} "+$_\n";
|
|
print {$fh2} "$qual2\n";
|
|
}
|
|
}
|
|
}
|
|
|
|
sub dumpQseq {
|
|
my ($self, $input, $fh1, $fh2) = @_;
|
|
for (1..scalar(@{$input->{seq1s}})) {
|
|
my $seq1 = $input->{seq1s}->[$_-1];
|
|
my $qual1 = $input->{qual1s}->[$_-1];
|
|
print {$fh1} "R\t1\t1\t1\t$_\t$_\t1\t1\t$seq1\t$qual1\t1\n";
|
|
if($input->{paired}) {
|
|
my $seq2 = $input->{seq2s}->[$_-1];
|
|
my $qual2 = $input->{qual2s}->[$_-1];
|
|
print {$fh2} "R\t1\t1\t1\t$_\t$_\t1\t1\t$seq2\t$qual2\t1\n";
|
|
}
|
|
}
|
|
}
|
|
|
|
sub dumpFasta {
|
|
my ($self, $input, $fh1, $fh2) = @_;
|
|
for (1..scalar(@{$input->{seq1s}})) {
|
|
my $seq1 = $input->{seq1s}->[$_-1];
|
|
print {$fh1} ">$_\n";
|
|
print {$fh1} "$seq1\n";
|
|
if($input->{paired}) {
|
|
my $seq2 = $input->{seq2s}->[$_-1];
|
|
print {$fh2} ">$_\n";
|
|
print {$fh2} "$seq2\n";
|
|
}
|
|
}
|
|
}
|
|
|
|
sub dumpRaw {
|
|
my ($self, $input, $fh1, $fh2) = @_;
|
|
for (1..scalar(@{$input->{seq1s}})) {
|
|
my $seq1 = $input->{seq1s}->[$_-1];
|
|
print {$fh1} "$seq1\n";
|
|
if($input->{paired}) {
|
|
my $seq2 = $input->{seq2s}->[$_-1];
|
|
print {$fh2} "$seq2\n";
|
|
}
|
|
}
|
|
}
|
|
|
|
##
|
|
# Generate the input (reads plus paired/fragment information)
|
|
#
|
|
sub genInput {
|
|
my ($self, $refs, $conf) = @_;
|
|
# Select whether we're doing colorspace
|
|
my $color = int(rand(2));
|
|
$color = 0 if $conf->{no_color};
|
|
# Select whether we're doing unpaired or paired-end.
|
|
my $paired = int(rand(2));
|
|
$paired = 0 if $conf->{no_paired};
|
|
# Select format for read file
|
|
my @formats = ("fastq", "qseq", "fasta", "raw");
|
|
my @format_arg = ( "-q", "--qseq", "-f", "-r");
|
|
my $formati = int(rand(scalar(@formats)));
|
|
my $format = $formats[$formati];
|
|
my $format_arg = $format_arg[$formati];
|
|
my $tmprdfn1 = "$conf->{tempdir}/Sim.pm.$conf->{randstr}_1.$format";
|
|
my $tmprdfn2 = "$conf->{tempdir}/Sim.pm.$conf->{randstr}_2.$format";
|
|
# Generate reads from the subject genome; no sequencing error yet
|
|
my %input = (
|
|
seq1s => [],
|
|
seq2s => [],
|
|
qual1s => [],
|
|
qual2s => [],
|
|
mate1fw => 1,
|
|
mate2fw => 0,
|
|
paired => $paired,
|
|
color => $color,
|
|
format => $format,
|
|
format_arg => $format_arg,
|
|
file1 => $tmprdfn1,
|
|
file2 => $tmprdfn2 );
|
|
my $read_sampler = SampleRead->new(
|
|
"Sim.pm read sampler",
|
|
$self->fraglengen,
|
|
$self->rdlengen,
|
|
$self->rdlengen);
|
|
print STDERR "Created read sampler\n";
|
|
my $numreads = $self->rdnumgen->();
|
|
$numreads = ceil(sqrt($numreads)) if $conf->{small};
|
|
$numreads == int($numreads) || mydie("numreads $numreads not a number");
|
|
my $tmp = int(rand(3));
|
|
if($tmp == 0) {
|
|
$input{mate2fw} = 1;
|
|
} elsif($tmp == 1) {
|
|
$input{mate1fw} = 0;
|
|
$input{mate2fw} = 1;
|
|
}
|
|
print STDERR "Sampling $numreads reads\n";
|
|
ref($refs) eq "HASH" || mydie("Reference input must be hash ref");
|
|
if($paired) {
|
|
$read_sampler->genReadPairs(
|
|
$numreads, # number of reads/fragments to generate
|
|
$input{color}, # colorize?
|
|
$refs, # hash ref holding reference sequences
|
|
$input{mate1fw}, # orientation of mate 1 when fragment comes from Watson strand
|
|
$input{mate2fw}, # orientation of mate 2 when fragment comes from Watson strand
|
|
$input{seq1s}, # put generated mate1 sequences here
|
|
$input{seq2s}, # put generated mate2 sequences here
|
|
$input{qual1s}, # put generated mate1 quality sequences here
|
|
$input{qual2s}); # put generated mate2 quality sequences here
|
|
} else {
|
|
$read_sampler->genReads(
|
|
$numreads, # number of reads/fragments to generate
|
|
$input{color}, # colorize?
|
|
$refs, # hash ref holding reference sequences
|
|
$input{seq1s}, # put generated sequences here
|
|
$input{qual1s}); # put generated quality sequences here
|
|
}
|
|
# TODO: with some probability, sort the reads
|
|
print STDERR "Dumping reads to temporary files $tmprdfn1 & $tmprdfn2\n";
|
|
# Dump reads to output file
|
|
my ($fh1, $fh2);
|
|
open($fh1, ">$tmprdfn1") || mydie("Could not open '$tmprdfn1' for writing");
|
|
open($fh2, ">$tmprdfn2") || mydie("Could not open '$tmprdfn2' for writing");
|
|
if($format eq "fastq") {
|
|
$self->dumpFastq(\%input, $fh1, $fh2);
|
|
} elsif($format eq "qseq") {
|
|
$self->dumpQseq(\%input, $fh1, $fh2);
|
|
} elsif($format eq "fasta") {
|
|
$self->dumpFasta(\%input, $fh1, $fh2);
|
|
} elsif($format eq "raw") {
|
|
$self->dumpRaw(\%input, $fh1, $fh2);
|
|
}
|
|
close($fh1);
|
|
close($fh2);
|
|
return \%input;
|
|
}
|
|
|
|
##
|
|
# Mutate reads according to sequencing error model.
|
|
#
|
|
sub mutateSeq {
|
|
my ($self, $input) = @_;
|
|
return $input;
|
|
}
|
|
|
|
##
|
|
# Generate a setting for MA (match bonus).
|
|
#
|
|
sub genPolicyMA($) {
|
|
my $local = shift;
|
|
return "" if ($local || int(rand(2)) == 0);
|
|
return "MA=".Math::Random::random_uniform(1, 1, 40).";";
|
|
}
|
|
|
|
##
|
|
# Generate a setting for MMP (mismatch penalty).
|
|
#
|
|
sub genPolicyMMP() {
|
|
return "" if int(rand(2)) == 0;
|
|
my $op = substr("CQR", int(rand(3)), 1);
|
|
if($op eq "C") {
|
|
$op .= Math::Random::random_uniform(1, 1, 40);
|
|
}
|
|
return "MMP=$op;";
|
|
}
|
|
|
|
##
|
|
# Generate a setting for NP (penalty for a mismatch involving an N).
|
|
#
|
|
sub genPolicyNP() {
|
|
return "" if int(rand(2)) == 0;
|
|
my $op = substr("CQR", int(rand(3)), 1);
|
|
if($op eq "C") {
|
|
$op .= int(Math::Random::random_exponential(1, 3))+1;
|
|
}
|
|
return "NP=$op;";
|
|
}
|
|
|
|
##
|
|
# Generate a setting for RDG (read gap open and extend penalties).
|
|
#
|
|
sub genPolicyRDG() {
|
|
return undef if int(rand(2)) == 0;
|
|
my $op = Math::Random::random_uniform(1, 1, 50);
|
|
if(int(rand(2)) == 0) {
|
|
$op .= ",";
|
|
$op .= Math::Random::random_uniform(1, 1, 20);
|
|
}
|
|
return "$op";
|
|
}
|
|
|
|
##
|
|
# Generate a setting for RFG (ref gap open and extend penalties).
|
|
#
|
|
sub genPolicyRFG() {
|
|
return undef if int(rand(2)) == 0;
|
|
my $op = Math::Random::random_uniform(1, 1, 50);
|
|
if(int(rand(2)) == 0) {
|
|
$op .= ",";
|
|
$op .= Math::Random::random_uniform(1, 1, 20);
|
|
}
|
|
return "$op";
|
|
}
|
|
|
|
##
|
|
# Generate a setting for MIN (function determining minimum acceptable score).
|
|
#
|
|
sub genPolicyMIN($) {
|
|
my $local = shift;
|
|
return undef if ($local || int(rand(2)) == 0);
|
|
my $xx = Math::Random::random_uniform(1, 1, 10);
|
|
my $yy = Math::Random::random_uniform(1, 1, 10);
|
|
if(!$local) {
|
|
$xx = -$xx if int(rand(2)) == 0;
|
|
$yy = -$yy;
|
|
}
|
|
return "L,$xx,$yy";
|
|
}
|
|
|
|
##
|
|
# Generate a setting for NCEIL (function determining maximum number of Ns
|
|
# allowed).
|
|
#
|
|
sub genPolicyNCEIL() {
|
|
return undef if int(rand(2)) == 0;
|
|
my $xx = Math::Random::random_uniform(1, 0, 1.5);
|
|
my $yy = Math::Random::random_uniform(1, 0, 1.5);
|
|
return "$xx,$yy";
|
|
}
|
|
|
|
##
|
|
# Generate a setting for SEED (# mismatches, length, interval).
|
|
#
|
|
sub genPolicySEED() {
|
|
return undef if int(rand(2)) == 0;
|
|
# Pick a number of mismatches
|
|
my $sd = substr("012", int(rand(2)), 1);
|
|
if(rand() < 0.9) {
|
|
# Length
|
|
$sd .= ",".int(Math::Random::random_uniform(1, 12, 32));
|
|
}
|
|
return $sd;
|
|
}
|
|
|
|
##
|
|
# Generate a setting for -D (# DP fails in a row).
|
|
#
|
|
sub genPolicyFailStreak() {
|
|
return undef if int(rand(2)) == 0;
|
|
return int(Math::Random::random_uniform(1, 2, 50));
|
|
}
|
|
|
|
##
|
|
# Generate a setting for -R (# seeding rounds).
|
|
#
|
|
sub genPolicySeedRounds() {
|
|
return undef if int(rand(2)) == 0;
|
|
return int(Math::Random::random_uniform(1, 1, 5));
|
|
}
|
|
|
|
##
|
|
# Generate a setting for IVAL. Interval between seeds is a function of the
|
|
# read length OR sqaure root of read length OR cube root of read length.
|
|
#
|
|
sub genPolicyIVAL() {
|
|
return "" if int(rand(2)) == 0;
|
|
# Pick a number of mismatches
|
|
my $iv = substr("LSC", int(rand(3)), 1);
|
|
if($iv eq "L") {
|
|
if(rand() < 0.9) {
|
|
# Multiplier
|
|
$iv .= ",".Math::Random::random_uniform(1, 0.0, 0.5);
|
|
}
|
|
if(rand() < 0.3) {
|
|
# Offset
|
|
$iv .= ",".Math::Random::random_uniform(1, 0.0, 4.0);
|
|
}
|
|
} elsif($iv eq "S") {
|
|
if(rand() < 0.9) {
|
|
# Multiplier
|
|
$iv .= ",".Math::Random::random_uniform(1, 0.0, 3.0);
|
|
}
|
|
if(rand() < 0.3) {
|
|
# Offset
|
|
$iv .= ",".Math::Random::random_uniform(1, 0.0, 7.0);
|
|
}
|
|
} elsif($iv eq "C") {
|
|
if(rand() < 0.9) {
|
|
# Multiplier
|
|
$iv .= ",".Math::Random::random_uniform(1, 0.0, 5.0);
|
|
}
|
|
if(rand() < 0.3) {
|
|
# Offset
|
|
$iv .= ",".Math::Random::random_uniform(1, 0.0, 14.0);
|
|
}
|
|
}
|
|
return "IVAL=$iv;";
|
|
}
|
|
|
|
##
|
|
# Generate a random but meaningful string of policy arguments to specify using
|
|
# the -P option.
|
|
#
|
|
sub genPolicyArg($) {
|
|
my $local = shift;
|
|
my $args = "";
|
|
$args .= genPolicyMA($local);
|
|
$args .= genPolicyMMP();
|
|
$args .= genPolicyNP();
|
|
$args .= genPolicyIVAL();
|
|
if($args ne "") {
|
|
return substr($args, 0, -1);
|
|
} else { return ""; }
|
|
}
|
|
|
|
##
|
|
# Generate a hash of key/value arguments to pass to bowtie2.
|
|
#
|
|
sub genAlignArgs {
|
|
my ($self, $input, $color, $conf) = @_;
|
|
my %args = ();
|
|
my $local = int(rand(2)) == 0;
|
|
$args{"-u"} = $conf->{maxreads} if defined($conf->{maxreads});
|
|
$args{"--mm"} = "" if int(rand(2)) == 0;
|
|
#$args{"--overhang"} = "" if int(rand(2)) == 0;
|
|
$args{"--trim3"} = int(rand(10)) if int(rand(2)) == 0;
|
|
$args{"--trim5"} = int(rand(10)) if int(rand(2)) == 0;
|
|
$args{"--nofw"} = "" if int(rand(4)) == 0;
|
|
$args{"--norc"} = "" if int(rand(4)) == 0;
|
|
$args{"--col-keepends"} = "" if ($color && int(rand(3)) == 0);
|
|
$args{"--gbar"} = int(Math::Random::random_exponential(1, 3))+1 if int(rand(4)) == 0;
|
|
$args{"--local"} = "" if $local;
|
|
my $rep = int(rand(5));
|
|
if($rep == 0) {
|
|
$args{"-a"} = "";
|
|
} elsif($rep == 1) {
|
|
$args{"-k"} = int(Math::Random::random_exponential(1, 3))+2;
|
|
} elsif($rep == 2) {
|
|
$args{"-M"} = int(Math::Random::random_exponential(1, 3))+2;
|
|
}
|
|
$args{"--rdg"} = genPolicyRDG();
|
|
$args{"--rfg"} = genPolicyRFG();
|
|
$args{"--score-min"} = genPolicyMIN($local);
|
|
$args{"--n-ceil"} = genPolicyNCEIL();
|
|
$args{"-N"} = genPolicySEED();
|
|
$args{"-D"} = genPolicyFailStreak();
|
|
$args{"-R"} = genPolicySeedRounds();
|
|
$args{"--policy"} = ("\"".genPolicyArg($local)."\"") if rand() < 0.9;
|
|
$args{"--cp-min"} = int(Math::Random::random_exponential(1, 3)) + 2;
|
|
$args{"--cp-ival"} = int(Math::Random::random_exponential(1, 1)) + 1;
|
|
return \%args;
|
|
}
|
|
|
|
##
|
|
# Align the given input set against the given index using the given
|
|
# bowtie2 binary and arguments. Sanity-check the SAM output.
|
|
#
|
|
sub align {
|
|
my ($self, $fa, $idx, $input, $conf, $args) = @_;
|
|
my $argstr = "";
|
|
for (keys %$args) {
|
|
if(defined($args->{$_})) {
|
|
$argstr .= " $_";
|
|
if($args->{$_} ne "") {
|
|
$argstr .= " ".$args->{$_};
|
|
}
|
|
}
|
|
}
|
|
$argstr .= " -C" if $input->{color};
|
|
$argstr .= " ".$input->{format_arg};
|
|
$idx .= ".c" if $input->{color};
|
|
my $inputfn;
|
|
if($input->{paired}) {
|
|
$inputfn = "-1 $input->{file1} -2 $input->{file2}";
|
|
} else {
|
|
$inputfn = $input->{file1};
|
|
}
|
|
# Create object that will help us sanity-check alignments
|
|
my $ac = AlignmentCheck->new(
|
|
"Sim.pm alignment checker", # name
|
|
[ $fa ], # fasta
|
|
"sam", # SAM-formatted alignments
|
|
0, # no bis-C
|
|
0 # no bis-CpG
|
|
);
|
|
$ac->nrefs() > 0 || mydie("No references");
|
|
# Run normal (non-debug) Bowtie
|
|
defined($conf->{tempdir}) || mydie("No tmp dir");
|
|
my $als = "$conf->{tempdir}/Sim.pm.$conf->{randstr}.als";
|
|
my $als_debug = "$conf->{tempdir}/Sim.pm.$conf->{randstr}.debug.als";
|
|
my $als_px = "$conf->{tempdir}/Sim.pm.$conf->{randstr}.px.als";
|
|
my $als_px_reord = "$conf->{tempdir}/Sim.pm.$conf->{randstr}.px.reord.als";
|
|
my $cmd = "$conf->{bowtie2_debug} $argstr $idx $inputfn";
|
|
print "$cmd\n";
|
|
open(ALSDEB, ">$als_debug") || mydie("Could not open '$als_debug' for writing");
|
|
open(ALSDEBCMD, "$cmd |") || mydie("Could not open pipe '$cmd |'");
|
|
my $ival = 50;
|
|
my $nals = 0;
|
|
my @lines = ();
|
|
while(<ALSDEBCMD>) {
|
|
# Remove @PG line because CL: tag can legitimately differ
|
|
print ALSDEB $_ unless /^\@PG/;
|
|
push @lines, $_;
|
|
$nals++;
|
|
print STDERR " Read $nals alignments...\n" if ($nals % $ival) == 0;
|
|
}
|
|
close(ALSDEBCMD);
|
|
$ac->checkAlignments(\@lines, 0);
|
|
$? == 0 || mydie("bowtie2-align-debug exited with exitlevel $?:\n$cmd\n");
|
|
close(ALSDEB);
|
|
$ac->printSummary();
|
|
# With some probability, also run debug Bowtie and check that
|
|
# results are identical
|
|
if(int(rand(3)) == 0) {
|
|
print STDERR "ALSO checking that bowtie2 and bowtie2-align-debug match up\n";
|
|
# Remove @PG line because CL: tag can legitimately differ
|
|
$cmd = "$conf->{bowtie2} $argstr $idx $inputfn | grep -v '^\@PG' > $als";
|
|
print "$cmd\n";
|
|
system($cmd);
|
|
$? == 0 ||
|
|
mydie("Command '$cmd' failed with exitlevel $?");
|
|
$cmd = "diff -uw $als $als_debug";
|
|
print "$cmd\n";
|
|
system($cmd);
|
|
$? == 0 ||
|
|
mydie("diff found a difference between bowtie2 and bowtie2-align-debug ".
|
|
"output for same input (above)\n");
|
|
}
|
|
# With some probability, also run debug Bowtie in -p X mode with X > 1 and
|
|
# without the --reorder argument and check that results are identical
|
|
if(int(rand(3)) == 0) {
|
|
print STDERR "ALSO checking that bowtie2 and bowtie2 -p X w/ X > 1 match up\n";
|
|
my $p = int(rand(3))+2;
|
|
$cmd = "$conf->{bowtie2} $argstr -p $p $idx $inputfn | grep -v '^\@PG' > $als_px";
|
|
print "$cmd\n";
|
|
system($cmd);
|
|
$? == 0 ||
|
|
mydie("Command '$cmd' failed with exitlevel $?");
|
|
# Sort the $als_px and $als_debug files to guarantee that reads and
|
|
# alignments for a given read appear in the same order in both
|
|
$cmd = "sort -k 1,1 -n -k 2,2 -k 3,3 -k 4,4 < $als_px | grep -v '^\@PG' > $als_px.sorted";
|
|
print "$cmd\n";
|
|
system($cmd);
|
|
$? == 0 ||
|
|
mydie("Failed to sort alignment file $als_px\n");
|
|
# Sort the $als_px and $als_debug files to guarantee that reads and
|
|
# alignments for a given read appear in the same order in both
|
|
$cmd = "sort -k 1,1 -n -k 2,2 -k 3,3 -k 4,4 < $als_debug | grep -v '^\@PG' > $als_debug.sorted";
|
|
print "$cmd\n";
|
|
system($cmd);
|
|
$? == 0 ||
|
|
mydie("Failed to sort alignment file $als_debug\n");
|
|
$cmd = "diff -uw $als_debug.sorted $als_px.sorted";
|
|
print "$cmd\n";
|
|
system($cmd);
|
|
$? == 0 ||
|
|
mydie("diff found a difference between bowtie2-align-debug and bowtie2 ".
|
|
"-p output for same input (above)\n");
|
|
}
|
|
|
|
# With some probability, also run debug Bowtie in -p X mode with X > 1 and
|
|
# with the --reorder argument and check that results are identical
|
|
if(int(rand(3)) == 0) {
|
|
print STDERR "ALSO checking that bowtie2 and bowtie2 -p X --reorder w/ X > 1 match up\n";
|
|
my $p = int(rand(3))+2;
|
|
$cmd = "$conf->{bowtie2} $argstr -p $p $idx --reorder $inputfn | grep -v '^\@PG' > $als_px_reord";
|
|
print "$cmd\n";
|
|
system($cmd);
|
|
$? == 0 || mydie("Command '$cmd' failed with exitlevel $?");
|
|
$cmd = "diff -uw $als_debug $als_px_reord";
|
|
print "$cmd\n";
|
|
system($cmd);
|
|
$? == 0 ||
|
|
mydie("diff found a difference between bowtie2-align-debug and bowtie2 ".
|
|
"-p --reorder output for same input (above)\n");
|
|
}
|
|
}
|
|
|
|
##
|
|
# Generate a new test case
|
|
#
|
|
# Possible key/value pairs in $conf hash:
|
|
#
|
|
# 1. bowtie2_build: path to bowtie2-build binary
|
|
# 2. bowtie2: path to bowtie2 binary
|
|
# 3. bowtie2_build_debug: path to bowtie2-build-debug binary
|
|
# 4. bowtie2_debug: path to bowtie2-debug binary
|
|
# 5. tempdir: temporary directory for reference/reads/index
|
|
# 6. no_paired: defined & non-0 -> don't generate paired-end datasets
|
|
# 7. no_color: defined & non-0 -> don't generate colorspace datasets
|
|
# 8. single_thread: defined & non-0 -> don't use -p X where X > 1
|
|
#
|
|
sub nextCase {
|
|
my ($self, $conf) = @_;
|
|
|
|
$conf->{bowtie2_build} = "bowtie2-build" unless defined($conf->{bowtie2_build});
|
|
$conf->{bowtie2} = "bowtie2-align" unless defined($conf->{bowtie2});
|
|
$conf->{bowtie2_build_debug} = "bowtie2-build-debug" unless defined($conf->{bowtie2_build_debug});
|
|
$conf->{bowtie2_debug} = "bowtie2-align-debug" unless defined($conf->{bowtie2_debug});
|
|
$conf->{tempdir} = "/tmp" unless defined($conf->{tempdir});
|
|
srand(time ^ $$);
|
|
$conf->{randstr} = randStr(8);
|
|
|
|
print "*** TEST CASE ***\n";
|
|
|
|
# Generate the references
|
|
my $refdnagen = $self->genDNAgen();
|
|
# Generate references and write them to a temporary fasta file
|
|
my $tmpfn = "$conf->{tempdir}/Sim.pm.$conf->{randstr}.fa";
|
|
my %refs = ();
|
|
$self->genRef(\%refs, $refdnagen, $conf, $tmpfn);
|
|
# Run bowtie2-build
|
|
my $tmpidxfn = "$conf->{tempdir}/Sim.pm.$conf->{randstr}";
|
|
my $buildArgs = $self->genBuildArgs();
|
|
$self->build($tmpfn, $tmpidxfn, $conf, $buildArgs);
|
|
my $numruns = 10;
|
|
$numruns *= 10 if $conf->{small}; # Lots of short runs
|
|
# For each batch of reads / bowtie options
|
|
for(1..$numruns) {
|
|
print "*** Run $_ of $numruns\n";
|
|
# Generate mutated version of the reference as our subject genome
|
|
my $subj = $self->mutate(\%refs);
|
|
# Generate all the input, including reads, pairedness,
|
|
# fragment information, whether it's colorspace, etc
|
|
my $input = $self->genInput($subj, $conf);
|
|
# Mutate the input
|
|
my $mutinput = $self->mutateSeq($input);
|
|
# Select Bowtie arguments
|
|
my $args = $self->genAlignArgs($mutinput, $input->{color}, $conf);
|
|
$self->align($tmpfn, $tmpidxfn, $mutinput, $conf, $args);
|
|
# Sanity check output. Possible sanity checks are:
|
|
# 1. Check alignments & edits against reference
|
|
# 2. Compare bowtie2 and bowtie2-debug
|
|
# 3. Compare -p X>1 and -p 1
|
|
}
|
|
}
|
|
|
|
if($0 =~ /Sim\.pm$/) {
|
|
print "Running unit tests\n";
|
|
# Run unit tests
|
|
}
|
|
|
|
1;
|