hisat-3n/scripts/sim/Sim.pm

#!/usr/bin/perl -w

#
# Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
#
# This file is part of Bowtie 2.
#
# Bowtie 2 is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Bowtie 2 is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
#

package Sim;
use strict;
use Carp;
use FindBin qw($Bin); 
use lib $Bin;
use DNA;
use Test;
use RandDNA;
use SampleRead;
use Mutate;
use AlignmentCheck;
use Math::Random;
use List::Util qw(max min);
use POSIX;

##
# Replacement for "die" that additionally writes error message to file so that
# run.pl can read it later.
#
sub mydie($) {
	my $fn = ".run.pl.child.$$";
	open(EO, ">$fn") || die "Could not open $fn for writing";
	print EO "$_[0]\n";
	close(EO);
	confess $_[0];
}

# Generates random printable strings of a given length
sub randStr($) {
	my $len = shift;
	my @chars = ('a'..'z', 'A'..'Z', '0'..'9', '_');
	my $str = "";
	foreach (1..$len) {
		$str .= $chars[int(rand(scalar(@chars)))];
	}
	return $str;
}

##
# Default random generator for number of reference per test case.
#
sub defaultRefNumGen() { return int(Math::Random::random_exponential(1, 8))+1; }

##
# Default random generator for reference length.
#
sub defaultRefLenGen() {
	return int(Math::Random::random_exponential(1, 50000))+1;
}

##
# Default random generator for number of reference per test case.
#
sub defaultReadNumGen() {
	return int(Math::Random::random_exponential(1, 10000))+1;
}

##
# Default random generator for read length.
#
sub defaultFragLenGen() {
	return int(Math::Random::random_normal(1, 200, 40))+1;
}

##
# Default random generator for reference length.
#
sub defaultReadLenGen() {
	my $r = int(rand(3));
	if($r == 0) {
		return int(Math::Random::random_exponential(1, 60))+1;
	} elsif($r == 1) {
		return int(Math::Random::random_exponential(1, 20))+1;
	} else {
		return int(Math::Random::random_exponential(1, 150))+1;
	}
}

##
# Default random generator for fraction of reference characters = N.
#
sub defaultNGen() {
	return Math::Random::random_uniform(1, 0, 0.05);
}

##
# Default random generator for fraction of reference characters = an
# ambiguous IUPAC code.
#
sub defaultIupacGen() {
	return Math::Random::random_uniform(1, 0, 0.01);
}

##
# Default random generator for AT/ACGT fraction.
#
sub defaultAtGen() {
	return min(max(Math::Random::random_normal(1, 0.5, 0.18), 0), 1);
}

##
# Default random generator for A/AT fraction.
#
sub defaultAGen() {
	return min(max(Math::Random::random_normal(1, 0.5, 0.18), 0), 1);
}

##
# Default random generator for C/CG fraction.
#
sub defaultCGen() {
	return min(max(Math::Random::random_normal(1, 0.5, 0.18), 0), 1);
}

##
# Default SNP rate generator.  Doesn't generate the SNP per se, just
# the rate.
#
sub defaultSNPGen() {
	return Math::Random::random_uniform(1, 0, 0.05);
}

##
# Default read gap rate generator.  Doesn't generate the gaps or
# lengths, just the rate.
#
sub defaultRdGapGen() {
	return Math::Random::random_uniform(1, 0, 0.005);
}

##
# Default reference gap rate generator.  Doesn't generate the gaps or
# lengths, just the rate.
#
sub defaultRfGapGen() {
	return Math::Random::random_uniform(1, 0, 0.005);
}

##
# Default rearrangement rate generator.
#
sub defaultRearrGen() {
	return Math::Random::random_uniform(1, 0, 0.005);
}

##
# Default function for generating gap lengths when introducing a gap.
#
sub defaultGapLenGen($) {
	return int(Math::Random::random_exponential(1, 3))+1;
}

##
# Default function for generating random sequence to insert into a gap.
#
sub defaultSeqGen($) {
	my $len = shift;
	($len == int($len) && $len > 0) ||
		mydie("Bad length for sequence generator: $len");
	my $ret = "";
	for (1..$len) {
		$ret .= substr("ACGT", int(rand(4)), 1);
	}
	return $ret;
}

##
# Default sequencing miscall rate generator.
#
sub defaultSeqMmGen() {
	return Math::Random::random_uniform(1, 0, 0.1);
}

##
# Create a new test case simulator
#
sub new {
	my (
		$class,
		$name,       # name of simulator
		$rfnumgen,   # number of reference sequences
		$rflengen,   # reference length
		$rdnumgen,   # number of read sequences per run
		$rdlengen,   # read length generator
		$fraglengen, # fragment length generator
		$ngen,       # N fraction
		$iupacgen,   # Non-A/C/G/T/N IUPAC fraction (after N fraction removed)
		$atgen,      # AT fraction (after N/IUPAC fractions removed)
		$agen,       # A fraction (of AT)
		$cgen,       # C fraction (of CG)
		$snpgen,     # SNP rate gen
		$rdgapgen,   # read gap generator
		$rfgapgen,   # ref gap generator
		$rearrgen,   # rearrangement generator
		$gaplengen,  # gap length generator
		$seqgen,     # gap filler sequence generator
		$seqmm,      # sequencing error generator
	) = @_;
	$rfnumgen   = \&defaultRefNumGen  unless defined($rfnumgen);
	$rflengen   = \&defaultRefLenGen  unless defined($rflengen);
	$rdnumgen   = \&defaultReadNumGen unless defined($rdnumgen);
	$rdlengen   = \&defaultReadLenGen unless defined($rdlengen);
	$fraglengen = \&defaultFragLenGen unless defined($fraglengen);
	$ngen       = \&defaultNGen       unless defined($ngen);
	$iupacgen   = \&defaultIupacGen   unless defined($iupacgen);
	$atgen      = \&defaultAtGen      unless defined($atgen);
	$agen       = \&defaultAGen       unless defined($agen);
	$cgen       = \&defaultCGen       unless defined($cgen);
	$snpgen     = \&defaultSNPGen     unless defined($snpgen);
	$rdgapgen   = \&defaultRdGapGen   unless defined($rdgapgen);
	$rfgapgen   = \&defaultRfGapGen   unless defined($rfgapgen);
	$rearrgen   = \&defaultRearrGen   unless defined($rearrgen);
	$gaplengen  = \&defaultGapLenGen  unless defined($gaplengen);
	$seqgen     = \&defaultSeqGen     unless defined($seqgen);
	$seqmm      = \&defaultSeqMmGen   unless defined($seqmm);
	$name = "noname" unless defined($name);
	return bless {
		_name       => $name,
		_rfnumgen   => $rfnumgen,
		_rflengen   => $rflengen,
		_rdnumgen   => $rdnumgen,
		_rdlengen   => $rdlengen,
		_fraglengen => $fraglengen,
		_ngen       => $ngen,
		_iupacgen   => $iupacgen,
		_atgen      => $atgen,
		_agen       => $agen,
		_cgen       => $cgen,
		_snpgen     => $snpgen,
		_rdgapgen   => $rdgapgen,
		_rfgapgen   => $rfgapgen,
		_rearrgen   => $rearrgen,
		_gaplengen  => $gaplengen,
		_seqgen     => $seqgen,
		_seqmm      => $seqmm,
	}, $class;
}
sub rfnumgen   { return $_[0]->{_rfnumgen}   }
sub rflengen   { return $_[0]->{_rflengen}   }
sub rdnumgen   { return $_[0]->{_rdnumgen}   }
sub rdlengen   { return $_[0]->{_rdlengen}   }
sub fraglengen { return $_[0]->{_fraglengen} }
sub ngen       { return $_[0]->{_ngen}       }
sub iupacgen   { return $_[0]->{_iupacgen}   }
sub atgen      { return $_[0]->{_atgen}      }
sub agen       { return $_[0]->{_agen}       }
sub cgen       { return $_[0]->{_cgen}       }
sub snpgen     { return $_[0]->{_snpgen}     }
sub rdgapgen   { return $_[0]->{_rdgapgen}   }
sub rfgapgen   { return $_[0]->{_rfgapgen}   }
sub rearrgen   { return $_[0]->{_rearrgen}   }
sub gaplengen  { return $_[0]->{_gaplengen}  }
sub seqgen     { return $_[0]->{_seqgen}     }
sub seqmm      { return $_[0]->{_seqmm}      }

##
# Generate DNA generator.
#
sub genDNAgen {
	my $self = shift;
	my $nfrac     = $self->ngen->();
	my $iupacfrac = $self->iupacgen->();
	my $atfrac    = $self->atgen->();
	my $afrac     = $self->agen->();
	my $cfrac     = $self->cgen->();
	my $refdnagen = RandDNA->new(
		"Sim.pm gen",
		$nfrac,
		$iupacfrac,
		$atfrac,
		$afrac,
		$cfrac);
	printf STDERR "Created DNA generator\n";
	printf STDERR "  N frac: %0.3f\n", $nfrac;
	printf STDERR "  IUPAC frac: %0.3f\n", $iupacfrac;
	printf STDERR "  AT/ACGT frac: %0.3f\n", $atfrac;
	printf STDERR "  A/AT frac: %0.3f\n", $afrac;
	printf STDERR "  C/CG frac: %0.3f\n", $cfrac;
	return $refdnagen;
}

##
# Generate and print reference sequences to file of given name.  Also,
# install reference sequences into hash ref $ref.  To allow for
# "overhang" (alignment that hang off the end of the reference), we
# actually write out a little bit less than the full reference sequence
# for each sequence.
#
sub genRef {
	my ($self, $ref, $refdnagen, $conf, $tmpfn) = @_;
	# Get a generator for reference length
	my $reflen = $self->rflengen;
	# Generate the number of references
	my $refnum = $self->rfnumgen->();
	$refnum = sqrt($refnum) if $conf->{small};
	$refnum = 1 if $refnum <= 0;
	$refnum = sqrt($refnum) if $conf->{small};
	$refnum = 1 if $refnum <= 0;
	$refnum = ceil($refnum);
	$refnum = $conf->{numrefs} if defined($conf->{numrefs});
	# Open output file
	open (FA, ">$tmpfn") ||
		mydie("Could not open temporary fasta file '$tmpfn' for writing");
	my %ccnt = ();
	print STDERR "Generating $refnum references\n";
	for (1..$refnum) {
		# Randomly generate length
		my $len = $reflen->();
		$len = sqrt($len) if $conf->{small};
		$len = 1 if $len <= 0;
		$len = ceil($len);
		my $seq = $refdnagen->nextSeq($len);
		length($seq) >= $len || die;
		my $name = "Sim.pm.$_";
		$ref->{$name} = $seq;
		# Select amount to trim from LHS
		my $trimleft = int(Math::Random::random_exponential(1, 200));
		# Select amount to trim from RHS
		my $trimright = int(Math::Random::random_exponential(1, 200));
		# Make sure we're leaving some sequence after trimming
		while($trimleft + $trimright > $len) {
			if(int(rand(2))) {
				$trimleft = int($trimleft*0.5);
			} else {
				$trimright = int($trimright*0.5);
			}
		}
		# Trim the sequence
		substr($seq, 0, $trimleft) = "";
		$seq = substr($seq, 0, length($seq)-$trimright);
		my $trimlen = length($seq);
		$trimlen == $len - $trimleft - $trimright ||
			mydie("Unexpected trim combo: $len, $trimleft, $trimright, $trimlen");
		print STDERR "  Generated reference '$name' of untrimmed length $len, trimmed length $trimlen\n";
		print FA ">$name\n";
		my $buf = "";
		length($seq) >= $trimlen || die;
		for my $i (1..$trimlen) {
			my $c = substr($seq, $i-1, 1);
			defined($c) || die;
			$ccnt{$c}++;
			$buf .= $c;
			$ref->{$name} .= $c;
			if($i % 60 == 0) {
				print FA "$buf\n";
				$buf = "";
			}
		}
		print FA "$buf\n" if $buf ne "";
	}
	close(FA);
	print STDERR "Wrote references to $tmpfn\n";
	for my $k (sort keys %ccnt) {
		print STDERR "  $k: $ccnt{$k}\n";
	}
}

##
# Generate a hash of key/value arguments to pass to bowtie2.
#
sub genBuildArgs {
	my ($self) = @_;
	my %args = ();
	my $r1 = int(rand(3));
	if($r1 == 0) {
		$args{"--bmaxdivn"} = int(Math::Random::random_exponential(1, 4))+1;
	} elsif($r1 == 1) {
		$args{"--bmax"} = int(Math::Random::random_exponential(1, 10000))+100;
	}
	my $r2 = int(rand(2));
	if($r2 == 0) {
		$args{"--dcv"} = 2 ** (int(rand(10))+4);
	}
	my $r3 = int(rand(5));
	if($r3 == 0) {
		$args{"--packed"} = "";
	}
	my $r4 = int(rand(3));
	if($r4 == 0) {
		$args{"--offrate"} = int(rand(8))+1;
	}
	return \%args;
}

##
# Given a fasta filename, an index basename, and a path to the
# bowtie2-build executable, build nucleotide-space and colorpace
# indexes for the sequences in the fasta file.
#
sub build {
	my ($self, $fa, $idx, $conf, $args) = @_;
	my $argstr = "";
	for (keys %$args) {
		$argstr .= " $_";
		if($args->{$_} ne "") {
			$argstr .= " ".$args->{$_};
		}
	}
	$argstr .= " --sanity";
	# Build nucleotide index
	my $cmd = "$conf->{bowtie2_build_debug} $argstr $fa $idx";
	print STDERR "$cmd\n";
	system($cmd);
	$? == 0 || mydie("Error running '$cmd'; exitlevel=$?");
	print STDERR "Built nucleotide index '$idx'\n";
	# Build colorspace index
	unless($conf->{no_color}) {
		$cmd = "$conf->{bowtie2_build_debug} $argstr -C $fa ${idx}.c";
		print STDERR "$cmd\n";
		system($cmd);
		$? == 0 || mydie("Error running '$cmd'; exitlevel=$?");
		print STDERR "Built colorspace index '$idx'\n";
	}
}

##
# Given a hash of sequences, flatten all IUPAC codes into unambiguous
# nucleotides.
#
sub flattenIUPAC() {
	my ($self, $h) = @_;
	for my $c (keys %$h) {
		my $len = length($h->{$c});
		for my $i (0..$len-1) {
			my $ch = uc substr($h->{$c}, $i, 1);
			my $nc = $ch;
			if(DNA::isIUPAC($ch) || $ch eq "N") {
				if(rand() < $self->snpgen->()) {
					$nc = DNA::pickIncompat($ch);
					defined($nc) || mydie("Couldn't find incompatible base for $ch");
				} else {
					$nc = DNA::pickCompat($ch);
					defined($nc) || mydie("Couldn't find compatible base for $ch");
				}
			}
			if($ch ne $nc) {
				substr($h->{$c}, $i, 1) = $nc;
			}
		}
	}
}

##
# Mutate reference genome into a subject genome.
#
sub mutate() {
	my ($self, $refs) = @_;
	my %subj = %$refs;
	$self->flattenIUPAC(\%subj);
	print STDERR "Flattened IUPAC characters\n";
	my $mutator = Mutate->new(
		"Sim.pm mutator",
		$self->snpgen,
		$self->rdgapgen,
		$self->rfgapgen,
		$self->rearrgen,
		$self->gaplengen,
		$self->seqgen);
	my ($nsnp, $nrfgap, $nrdgap, $nrearr) = (0, 0, 0, 0);
	for(keys %subj) {
		print STDERR "  Mutating sequence $_\n";
		my ($nsnp_, $nrfgap_, $nrdgap_, $nrearr_) = $mutator->mutateSeq($_, \%subj);
		$nsnp   += $nsnp_;
		$nrfgap += $nrfgap_;
		$nrdgap += $nrdgap_;
		$nrearr += $nrearr_;
	}
	print STDERR "Mutated reference genome to subject genome\n";
	print STDERR "  SNPs introduced: $nsnp\n";
	print STDERR "  Reference gaps introduced: $nrfgap\n";
	print STDERR "  Read gaps introduced: $nrdgap\n";
	print STDERR "  Rearrangements introduced: $nrearr\n";
	return \%subj;
}

sub dumpFastq {
	my ($self, $input, $fh1, $fh2) = @_;
	for (1..scalar(@{$input->{seq1s}})) {
		my $seq1 = $input->{seq1s}->[$_-1];
		my $qual1 = $input->{qual1s}->[$_-1];
		print {$fh1} "\@$_\n";
		print {$fh1} "$seq1\n";
		print {$fh1} "+$_\n";
		print {$fh1} "$qual1\n";
		if($input->{paired}) {
			my $seq2 = $input->{seq2s}->[$_-1];
			my $qual2 = $input->{qual2s}->[$_-1];
			print {$fh2} "\@$_\n";
			print {$fh2} "$seq2\n";
			print {$fh2} "+$_\n";
			print {$fh2} "$qual2\n";
		}
	}
}

sub dumpQseq {
	my ($self, $input, $fh1, $fh2) = @_;
	for (1..scalar(@{$input->{seq1s}})) {
		my $seq1 = $input->{seq1s}->[$_-1];
		my $qual1 = $input->{qual1s}->[$_-1];
		print {$fh1} "R\t1\t1\t1\t$_\t$_\t1\t1\t$seq1\t$qual1\t1\n";
		if($input->{paired}) {
			my $seq2 = $input->{seq2s}->[$_-1];
			my $qual2 = $input->{qual2s}->[$_-1];
			print {$fh2} "R\t1\t1\t1\t$_\t$_\t1\t1\t$seq2\t$qual2\t1\n";
		}
	}
}

sub dumpFasta {
	my ($self, $input, $fh1, $fh2) = @_;
	for (1..scalar(@{$input->{seq1s}})) {
		my $seq1 = $input->{seq1s}->[$_-1];
		print {$fh1} ">$_\n";
		print {$fh1} "$seq1\n";
		if($input->{paired}) {
			my $seq2 = $input->{seq2s}->[$_-1];
			print {$fh2} ">$_\n";
			print {$fh2} "$seq2\n";
		}
	}
}

sub dumpRaw {
	my ($self, $input, $fh1, $fh2) = @_;
	for (1..scalar(@{$input->{seq1s}})) {
		my $seq1 = $input->{seq1s}->[$_-1];
		print {$fh1} "$seq1\n";
		if($input->{paired}) {
			my $seq2 = $input->{seq2s}->[$_-1];
			print {$fh2} "$seq2\n";
		}
	}
}

##
# Generate the input (reads plus paired/fragment information)
#
sub genInput {
	my ($self, $refs, $conf) = @_;
	# Select whether we're doing colorspace
	my $color = int(rand(2));
	$color = 0 if $conf->{no_color};
	# Select whether we're doing unpaired or paired-end.
	my $paired = int(rand(2));
	$paired = 0 if $conf->{no_paired};
	# Select format for read file
	my @formats    = ("fastq",   "qseq", "fasta", "raw");
	my @format_arg = (   "-q", "--qseq",    "-f",  "-r");
	my $formati = int(rand(scalar(@formats)));
	my $format     = $formats[$formati];
	my $format_arg = $format_arg[$formati];
	my $tmprdfn1 = "$conf->{tempdir}/Sim.pm.$conf->{randstr}_1.$format";
	my $tmprdfn2 = "$conf->{tempdir}/Sim.pm.$conf->{randstr}_2.$format";
	# Generate reads from the subject genome; no sequencing error yet
	my %input = (
		seq1s      => [],
		seq2s      => [],
		qual1s     => [],
		qual2s     => [],
		mate1fw    => 1,
		mate2fw    => 0,
		paired     => $paired,
		color      => $color,
		format     => $format,
		format_arg => $format_arg,
		file1      => $tmprdfn1,
		file2      => $tmprdfn2 );
	my $read_sampler = SampleRead->new(
		"Sim.pm read sampler",
		$self->fraglengen,
		$self->rdlengen,
		$self->rdlengen);
	print STDERR "Created read sampler\n";
	my $numreads = $self->rdnumgen->();
	$numreads = ceil(sqrt($numreads)) if $conf->{small};
	$numreads == int($numreads) || mydie("numreads $numreads not a number");
	my $tmp = int(rand(3));
	if($tmp == 0) {
		$input{mate2fw} = 1;
	} elsif($tmp == 1) {
		$input{mate1fw} = 0;
		$input{mate2fw} = 1;
	}
	print STDERR "Sampling $numreads reads\n";
	ref($refs) eq "HASH" || mydie("Reference input must be hash ref");
	if($paired) { 
		$read_sampler->genReadPairs(
			$numreads,       # number of reads/fragments to generate
			$input{color},   # colorize?
			$refs,           # hash ref holding reference sequences
			$input{mate1fw}, # orientation of mate 1 when fragment comes from Watson strand
			$input{mate2fw}, # orientation of mate 2 when fragment comes from Watson strand
			$input{seq1s},   # put generated mate1 sequences here
			$input{seq2s},   # put generated mate2 sequences here
			$input{qual1s},  # put generated mate1 quality sequences here
			$input{qual2s}); # put generated mate2 quality sequences here
	} else {
		$read_sampler->genReads(
			$numreads,       # number of reads/fragments to generate
			$input{color},   # colorize?
			$refs,           # hash ref holding reference sequences
			$input{seq1s},   # put generated sequences here
			$input{qual1s}); # put generated quality sequences here
	}
	# TODO: with some probability, sort the reads
	print STDERR "Dumping reads to temporary files $tmprdfn1 & $tmprdfn2\n";
	# Dump reads to output file
	my ($fh1, $fh2);
	open($fh1, ">$tmprdfn1") || mydie("Could not open '$tmprdfn1' for writing");
	open($fh2, ">$tmprdfn2") || mydie("Could not open '$tmprdfn2' for writing");
	if($format eq "fastq") {
		$self->dumpFastq(\%input, $fh1, $fh2);
	} elsif($format eq "qseq") {
		$self->dumpQseq(\%input, $fh1, $fh2);
	} elsif($format eq "fasta") {
		$self->dumpFasta(\%input, $fh1, $fh2);
	} elsif($format eq "raw") {
		$self->dumpRaw(\%input, $fh1, $fh2);
	}
	close($fh1);
	close($fh2);
	return \%input;
}

##
# Mutate reads according to sequencing error model.
#
sub mutateSeq {
	my ($self, $input) = @_;
	return $input;
}

##
# Generate a setting for MA (match bonus).
#
sub genPolicyMA($) {
	my $local = shift;
	return "" if ($local || int(rand(2)) == 0);
	return "MA=".Math::Random::random_uniform(1, 1, 40).";";
}

##
# Generate a setting for MMP (mismatch penalty).
#
sub genPolicyMMP() {
	return "" if int(rand(2)) == 0;
	my $op = substr("CQR", int(rand(3)), 1);
	if($op eq "C") {
		$op .= Math::Random::random_uniform(1, 1, 40);
	}
	return "MMP=$op;";
}

##
# Generate a setting for NP (penalty for a mismatch involving an N).
#
sub genPolicyNP() {
	return "" if int(rand(2)) == 0;
	my $op = substr("CQR", int(rand(3)), 1);
	if($op eq "C") {
		$op .= int(Math::Random::random_exponential(1, 3))+1;
	}
	return "NP=$op;";
}

##
# Generate a setting for RDG (read gap open and extend penalties).
#
sub genPolicyRDG() {
	return undef if int(rand(2)) == 0;
	my $op = Math::Random::random_uniform(1, 1, 50);
	if(int(rand(2)) == 0) {
		$op .= ",";
		$op .= Math::Random::random_uniform(1, 1, 20);
	}
	return "$op";
}

##
# Generate a setting for RFG (ref gap open and extend penalties).
#
sub genPolicyRFG() {
	return undef if int(rand(2)) == 0;
	my $op = Math::Random::random_uniform(1, 1, 50);
	if(int(rand(2)) == 0) {
		$op .= ",";
		$op .= Math::Random::random_uniform(1, 1, 20);
	}
	return "$op";
}

##
# Generate a setting for MIN (function determining minimum acceptable score).
#
sub genPolicyMIN($) {
	my $local = shift;
	return undef if ($local || int(rand(2)) == 0);
	my $xx = Math::Random::random_uniform(1, 1, 10);
	my $yy = Math::Random::random_uniform(1, 1, 10);
	if(!$local) {
		$xx = -$xx if int(rand(2)) == 0;
		$yy = -$yy;
	}
	return "L,$xx,$yy";
}

##
# Generate a setting for NCEIL (function determining maximum number of Ns
# allowed).
#
sub genPolicyNCEIL() {
	return undef if int(rand(2)) == 0;
	my $xx = Math::Random::random_uniform(1, 0, 1.5);
	my $yy = Math::Random::random_uniform(1, 0, 1.5);
	return "$xx,$yy";
}

##
# Generate a setting for SEED (# mismatches, length, interval).
#
sub genPolicySEED() {
	return undef if int(rand(2)) == 0;
	# Pick a number of mismatches
	my $sd = substr("012", int(rand(2)), 1);
	if(rand() < 0.9) {
		# Length
		$sd .= ",".int(Math::Random::random_uniform(1, 12, 32));
	}
	return $sd;
}

##
# Generate a setting for -D (# DP fails in a row).
#
sub genPolicyFailStreak() {
	return undef if int(rand(2)) == 0;
	return int(Math::Random::random_uniform(1, 2, 50));
}

##
# Generate a setting for -R (# seeding rounds).
#
sub genPolicySeedRounds() {
	return undef if int(rand(2)) == 0;
	return int(Math::Random::random_uniform(1, 1, 5));
}

##
# Generate a setting for IVAL.  Interval between seeds is a function of the
# read length OR sqaure root of read length OR cube root of read length.
#
sub genPolicyIVAL() {
	return "" if int(rand(2)) == 0;
	# Pick a number of mismatches
	my $iv = substr("LSC", int(rand(3)), 1);
	if($iv eq "L") {
		if(rand() < 0.9) {
			# Multiplier
			$iv .= ",".Math::Random::random_uniform(1, 0.0, 0.5);
		}
		if(rand() < 0.3) {
			# Offset
			$iv .= ",".Math::Random::random_uniform(1, 0.0, 4.0);
		}
	} elsif($iv eq "S") {
		if(rand() < 0.9) {
			# Multiplier
			$iv .= ",".Math::Random::random_uniform(1, 0.0, 3.0);
		}
		if(rand() < 0.3) {
			# Offset
			$iv .= ",".Math::Random::random_uniform(1, 0.0, 7.0);
		}
	} elsif($iv eq "C") {
		if(rand() < 0.9) {
			# Multiplier
			$iv .= ",".Math::Random::random_uniform(1, 0.0, 5.0);
		}
		if(rand() < 0.3) {
			# Offset
			$iv .= ",".Math::Random::random_uniform(1, 0.0, 14.0);
		}
	}
	return "IVAL=$iv;";
}

##
# Generate a random but meaningful string of policy arguments to specify using
# the -P option.
#
sub genPolicyArg($) {
	my $local = shift;
	my $args = "";
	$args .= genPolicyMA($local);
	$args .= genPolicyMMP();
	$args .= genPolicyNP();
	$args .= genPolicyIVAL();
	if($args ne "") {
		return substr($args, 0, -1);
	} else { return ""; }
}

##
# Generate a hash of key/value arguments to pass to bowtie2.
#
sub genAlignArgs {
	my ($self, $input, $color, $conf) = @_;
	my %args = ();
	my $local = int(rand(2)) == 0;
	$args{"-u"}         = $conf->{maxreads} if defined($conf->{maxreads});
	$args{"--mm"}       = "" if int(rand(2)) == 0;
	#$args{"--overhang"} = "" if int(rand(2)) == 0;
	$args{"--trim3"}    = int(rand(10)) if int(rand(2)) == 0;
	$args{"--trim5"}    = int(rand(10)) if int(rand(2)) == 0;
	$args{"--nofw"}     = "" if int(rand(4)) == 0;
	$args{"--norc"}     = "" if int(rand(4)) == 0;
	$args{"--col-keepends"} = "" if ($color && int(rand(3)) == 0);
	$args{"--gbar"}     = int(Math::Random::random_exponential(1, 3))+1 if int(rand(4)) == 0;
	$args{"--local"}    = "" if $local;
	my $rep = int(rand(5));
	if($rep == 0) {
		$args{"-a"} = "";
	} elsif($rep == 1) {
		$args{"-k"} = int(Math::Random::random_exponential(1, 3))+2;
	} elsif($rep == 2) {
		$args{"-M"} = int(Math::Random::random_exponential(1, 3))+2;
	}
	$args{"--rdg"} = genPolicyRDG();
	$args{"--rfg"} = genPolicyRFG();
	$args{"--score-min"} = genPolicyMIN($local);
	$args{"--n-ceil"} = genPolicyNCEIL();
	$args{"-N"} = genPolicySEED();
	$args{"-D"} = genPolicyFailStreak();
	$args{"-R"} = genPolicySeedRounds();
	$args{"--policy"} = ("\"".genPolicyArg($local)."\"") if rand() < 0.9;
	$args{"--cp-min"} = int(Math::Random::random_exponential(1, 3)) + 2;
	$args{"--cp-ival"} = int(Math::Random::random_exponential(1, 1)) + 1;
	return \%args;
}

##
# Align the given input set against the given index using the given
# bowtie2 binary and arguments.  Sanity-check the SAM output.
#
sub align {
	my ($self, $fa, $idx, $input, $conf, $args) = @_;
	my $argstr = "";
	for (keys %$args) {
		if(defined($args->{$_})) {
			$argstr .= " $_";
			if($args->{$_} ne "") {
				$argstr .= " ".$args->{$_};
			}
		}
	}
	$argstr .= " -C" if $input->{color};
	$argstr .= " ".$input->{format_arg};
	$idx .= ".c" if $input->{color};
	my $inputfn;
	if($input->{paired}) {
		$inputfn = "-1 $input->{file1} -2 $input->{file2}";
	} else {
		$inputfn = $input->{file1};
	}
	# Create object that will help us sanity-check alignments
	my $ac = AlignmentCheck->new(
		"Sim.pm alignment checker", # name
		[ $fa ],                    # fasta
		"sam",                      # SAM-formatted alignments
		0,                          # no bis-C
		0                           # no bis-CpG
	);
	$ac->nrefs() > 0 || mydie("No references");
	# Run normal (non-debug) Bowtie
	defined($conf->{tempdir}) || mydie("No tmp dir");
	my $als       = "$conf->{tempdir}/Sim.pm.$conf->{randstr}.als";
	my $als_debug = "$conf->{tempdir}/Sim.pm.$conf->{randstr}.debug.als";
	my $als_px    = "$conf->{tempdir}/Sim.pm.$conf->{randstr}.px.als";
	my $als_px_reord = "$conf->{tempdir}/Sim.pm.$conf->{randstr}.px.reord.als";
	my $cmd = "$conf->{bowtie2_debug} $argstr $idx $inputfn";
	print "$cmd\n";
	open(ALSDEB, ">$als_debug") || mydie("Could not open '$als_debug' for writing");
	open(ALSDEBCMD, "$cmd |") || mydie("Could not open pipe '$cmd |'");
	my $ival = 50;
	my $nals = 0;
	my @lines = ();
	while(<ALSDEBCMD>) {
		# Remove @PG line because CL: tag can legitimately differ
		print ALSDEB $_ unless /^\@PG/;
		push @lines, $_;
		$nals++;
		print STDERR "  Read $nals alignments...\n" if ($nals % $ival) == 0;
	}
	close(ALSDEBCMD);
	$ac->checkAlignments(\@lines, 0);
	$? == 0 || mydie("bowtie2-align-debug exited with exitlevel $?:\n$cmd\n");
	close(ALSDEB);
	$ac->printSummary();
	# With some probability, also run debug Bowtie and check that
	# results are identical
	if(int(rand(3)) == 0) {
		print STDERR "ALSO checking that bowtie2 and bowtie2-align-debug match up\n";
		# Remove @PG line because CL: tag can legitimately differ
		$cmd = "$conf->{bowtie2} $argstr $idx $inputfn | grep -v '^\@PG' > $als";
		print "$cmd\n";
		system($cmd);
		$? == 0 ||
			mydie("Command '$cmd' failed with exitlevel $?");
		$cmd = "diff -uw $als $als_debug";
		print "$cmd\n";
		system($cmd);
		$? == 0 ||
			mydie("diff found a difference between bowtie2 and bowtie2-align-debug ".
			      "output for same input (above)\n");
	}
	# With some probability, also run debug Bowtie in -p X mode with  X > 1 and
	# without the --reorder argument and check that results are identical
	if(int(rand(3)) == 0) {
		print STDERR "ALSO checking that bowtie2 and bowtie2 -p X w/ X > 1 match up\n";
		my $p = int(rand(3))+2;
		$cmd = "$conf->{bowtie2} $argstr -p $p $idx $inputfn | grep -v '^\@PG' > $als_px";
		print "$cmd\n";
		system($cmd);
		$? == 0 ||
			mydie("Command '$cmd' failed with exitlevel $?");
		# Sort the $als_px and $als_debug files to guarantee that reads and
		# alignments for a given read appear in the same order in both
		$cmd = "sort -k 1,1 -n -k 2,2 -k 3,3 -k 4,4 < $als_px | grep -v '^\@PG' > $als_px.sorted";
		print "$cmd\n";
		system($cmd);
		$? == 0 ||
			mydie("Failed to sort alignment file $als_px\n");
		# Sort the $als_px and $als_debug files to guarantee that reads and
		# alignments for a given read appear in the same order in both
		$cmd = "sort -k 1,1 -n -k 2,2 -k 3,3 -k 4,4 < $als_debug | grep -v '^\@PG' > $als_debug.sorted";
		print "$cmd\n";
		system($cmd);
		$? == 0 ||
			mydie("Failed to sort alignment file $als_debug\n");
		$cmd = "diff -uw $als_debug.sorted $als_px.sorted";
		print "$cmd\n";
		system($cmd);
		$? == 0 ||
			mydie("diff found a difference between bowtie2-align-debug and bowtie2 ".
			      "-p output for same input (above)\n");
	}

	# With some probability, also run debug Bowtie in -p X mode with X > 1 and
	# with the --reorder argument and check that results are identical
	if(int(rand(3)) == 0) {
		print STDERR "ALSO checking that bowtie2 and bowtie2 -p X --reorder w/ X > 1 match up\n";
		my $p = int(rand(3))+2;
		$cmd = "$conf->{bowtie2} $argstr -p $p $idx --reorder $inputfn | grep -v '^\@PG' > $als_px_reord";
		print "$cmd\n";
		system($cmd);
		$? == 0 || mydie("Command '$cmd' failed with exitlevel $?");
		$cmd = "diff -uw $als_debug $als_px_reord";
		print "$cmd\n";
		system($cmd);
		$? == 0 ||
			mydie("diff found a difference between bowtie2-align-debug and bowtie2 ".
			      "-p --reorder output for same input (above)\n");
	}
}

##
# Generate a new test case
#
# Possible key/value pairs in $conf hash:
#
# 1. bowtie2_build:       path to bowtie2-build binary
# 2. bowtie2:             path to bowtie2 binary
# 3. bowtie2_build_debug: path to bowtie2-build-debug binary
# 4. bowtie2_debug:       path to bowtie2-debug binary
# 5. tempdir:             temporary directory for reference/reads/index
# 6. no_paired:           defined & non-0 -> don't generate paired-end datasets
# 7. no_color:            defined & non-0 -> don't generate colorspace datasets
# 8. single_thread:       defined & non-0 -> don't use -p X where X > 1
#
sub nextCase {
	my ($self, $conf) = @_;

	$conf->{bowtie2_build}       = "bowtie2-build"       unless defined($conf->{bowtie2_build});
	$conf->{bowtie2}             = "bowtie2-align"       unless defined($conf->{bowtie2});
	$conf->{bowtie2_build_debug} = "bowtie2-build-debug" unless defined($conf->{bowtie2_build_debug});
	$conf->{bowtie2_debug}       = "bowtie2-align-debug" unless defined($conf->{bowtie2_debug});
	$conf->{tempdir}             = "/tmp"                unless defined($conf->{tempdir});
	srand(time ^ $$);
	$conf->{randstr} = randStr(8);

	print "*** TEST CASE ***\n";
	
	# Generate the references
	my $refdnagen = $self->genDNAgen();
	# Generate references and write them to a temporary fasta file
	my $tmpfn = "$conf->{tempdir}/Sim.pm.$conf->{randstr}.fa";
	my %refs = ();
	$self->genRef(\%refs, $refdnagen, $conf, $tmpfn);
	# Run bowtie2-build
	my $tmpidxfn = "$conf->{tempdir}/Sim.pm.$conf->{randstr}";
	my $buildArgs = $self->genBuildArgs();
	$self->build($tmpfn, $tmpidxfn, $conf, $buildArgs);
	my $numruns = 10;
	$numruns *= 10 if $conf->{small}; # Lots of short runs
	# For each batch of reads / bowtie options
	for(1..$numruns) {
		print "*** Run $_ of $numruns\n";
		# Generate mutated version of the reference as our subject genome
		my $subj = $self->mutate(\%refs);
		# Generate all the input, including reads, pairedness,
		# fragment information, whether it's colorspace, etc
		my $input = $self->genInput($subj, $conf);
		# Mutate the input
		my $mutinput = $self->mutateSeq($input);
		# Select Bowtie arguments
		my $args = $self->genAlignArgs($mutinput, $input->{color}, $conf);
		$self->align($tmpfn, $tmpidxfn, $mutinput, $conf, $args);
		# Sanity check output.  Possible sanity checks are:
		# 1. Check alignments & edits against reference
		# 2. Compare bowtie2 and bowtie2-debug
		# 3. Compare -p X>1 and -p 1
	}
}

if($0 =~ /Sim\.pm$/) {
	print "Running unit tests\n";
	# Run unit tests
}

1;