hisat-3n/scripts/gen_occ_lookup.pl

258 lines
6.5 KiB
Perl
Raw Permalink Normal View History

2025-01-18 13:09:52 +00:00
#!/usr/bin/perl -w
#
# Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
#
# This file is part of Bowtie 2.
#
# Bowtie 2 is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Bowtie 2 is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
#
#
# Generate lookup tables that, given a packed DNA byte (four bases, two
# bits each) and a character (A, C, G or T), return how many times that
# character occurs in that packed byte. Useful for quickly counting
# character occurrences in long strings. The generated array,
# cCntLUT_4[4][4][256], is indexed first by how many of the byte's least
# significant bit pairs are counted (0 = all four, then 1, 2 and 3), then
# by character (0-3 for A, C, G, T), then by byte (0-255).
#
# Larger lookup tables are also possible, though they seem
# counterproductive. E.g., looking up eight bases at a time yields a
# 256K LUT, which doesn't fit in L1. A single four-base LUT is 1KB (4KB
# for all four prefix lengths), easily fitting in L1.
#
# See ebwt.h.
#
use strict;
use warnings;

# Occurrence-count tables, each indexed by packed byte value (0-255).
# Naming: the letter is the base being counted (a/c/g/t, stored as the
# 2-bit codes 0/1/2/3) and the digit is how many of the byte's least
# significant bit pairs are inspected (4 = the whole byte).
#
# Every array is declared inside a parenthesised my() list. Written as
# `my @as4 = (), @as3 = (), ...` the comma operator ends the declaration
# after the first array, which leaves the remaining ones as undeclared
# package globals that `use strict` rejects.
my (@as4, @as3, @as2, @as1);
my (@cs4, @cs3, @cs2, @cs1);
my (@gs4, @gs3, @gs2, @gs1);
my (@ts4, @ts3, @ts2, @ts1);

# Kept as a file-scoped declaration so that any later top-level
# `for($i = 0; ...)` loops remain valid under `use strict`.
my $i;

# $count_tables[$code][$width - 1] is the table that counts base code
# $code over the $width least significant bit pairs of a byte.
my @count_tables = (
    [ \@as1, \@as2, \@as3, \@as4 ],
    [ \@cs1, \@cs2, \@cs3, \@cs4 ],
    [ \@gs1, \@gs2, \@gs3, \@gs4 ],
    [ \@ts1, \@ts2, \@ts3, \@ts4 ],
);

# Compile character arrays
for my $byte (0 .. 255) {
    # Split the byte into its four 2-bit codes, least significant pair first.
    my @codes = map { ($byte >> (2 * $_)) & 3 } 0 .. 3;

    for my $code (0 .. 3) {
        # Running total: once pair number $width has been examined it holds
        # the count over the $width least significant pairs. It is always a
        # plain integer, so a zero count is stored as 0 rather than as
        # Perl's empty-string false value.
        my $seen = 0;
        for my $width (1 .. 4) {
            $seen++ if $codes[$width - 1] == $code;
            push @{ $count_tables[$code][$width - 1] }, $seen;
        }
    }
}
# Number of table entries emitted on each line of the generated C source.
my $entsPerLine = 16;

# Render one innermost array of the LUT as the text of a C initializer body.
#
# Parameters:
#   $counts   - array reference holding the entries to emit, in index order
#   $per_line - how many entries to place on each output line
#
# Returns a string in which each line starts with three tabs, every entry
# is followed by ", " (the last one too; a trailing comma is legal in a C
# initializer list) and every line, including a final partial one, ends
# with a newline.
sub format_lut_row {
    my ($counts, $per_line) = @_;

    my $last = $#{$counts};
    my $text = '';
    for (my $start = 0; $start <= $last; $start += $per_line) {
        my $end = $start + $per_line - 1;
        $end = $last if $end > $last;
        $text .= "\t\t\t"
               . join('', map { "$_, " } @{$counts}[$start .. $end])
               . "\n";
    }
    return $text;
}

# Outer dimension of cCntLUT_4, in emission order: slot 0 counts all four
# bit pairs, slots 1-3 count only that many least significant pairs. Each
# section lists its per-base tables in A, C, G, T order.
my @sections = (
    [ 'All 4 bit pairs',               [ \@as4, \@cs4, \@gs4, \@ts4 ] ],
    [ 'Least significant 1 bit pair',  [ \@as1, \@cs1, \@gs1, \@ts1 ] ],
    [ 'Least significant 2 bit pairs', [ \@as2, \@cs2, \@gs2, \@ts2 ] ],
    [ 'Least significant 3 bit pairs', [ \@as3, \@cs3, \@gs3, \@ts3 ] ],
);
my @base_labels = ('As', 'Cs', 'Gs', 'Ts');

print "#include <stdint.h>\n\n";
# The banner names this script, so the generated file can be traced back to
# its generator.
print "/* Generated by gen_occ_lookup.pl */\n\n";

print "uint8_t cCntLUT_4[4][4][256] = {\n";
for my $section_idx (0 .. $#sections) {
    my ($title, $tables) = @{ $sections[$section_idx] };

    print "\t/* $title */ {\n";
    for my $base_idx (0 .. $#{$tables}) {
        print "\t\t/* $base_labels[$base_idx] */ {\n";
        print format_lut_row($tables->[$base_idx], $entsPerLine);
        # Sibling initializers are comma-separated; the last one in each
        # group closes without a comma.
        print $base_idx < $#{$tables} ? "\t\t},\n" : "\t\t}\n";
    }
    print $section_idx < $#sections ? "\t},\n" : "\t}\n";
}
print "};\n";