commit 5e601d04015cc5c502053468535f5ac4116cfaa2
Author: Yaossg
Date: Sat Jan 18 21:09:52 2025 +0800
initial commit
diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..8edfb9b
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1 @@
+*.pbxproj binary merge=union
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..7f38173
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,48 @@
+*~
+*.dSYM
+.DS_Store
+tags
+*-debug
+*-s
+*-l
+hisat2.xcodeproj/project.xcworkspace
+hisat2.xcodeproj/xcuserdata
+hisat2.xcodeproj/xcshareddata
+*.patch
+
+build_automaton
+build_index
+clean_alignment
+determinize
+gcsa_alignment
+gcsa_test
+hisat2-repeat
+
+hisat2_test/*.bt2
+hisat2_test/*.ht2
+hisat2_test/*.sam
+hisat2_test/paper_example.malignment.automaton
+hisat2_test/paper_example.malignment.backbone
+hisat2_test/paper_example.malignment.gcsa
+hisat2_test/kim_example*.malignment.automaton
+hisat2_test/kim_example*.malignment.backbone
+hisat2_test/kim_example*.malignment.gcsa
+hisat2_test/genome*
+hisat2_test/2*
+hisat2_test/snp142*
+hisat2_test/testset*
+
+.idea
+.vscode
+
+.ht2lib-obj*
+*.a
+*.so
+docs/_site
+docs/*.lock
+docs/.*-cache
+*.tar.gz
+*.ipynb
+*.pyc
+
+cmake*
\ No newline at end of file
diff --git a/AUTHORS b/AUTHORS
new file mode 100644
index 0000000..d22b8b2
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1,29 @@
+Ben Langmead wrote Bowtie 2, which is based partially on
+Bowtie. Bowtie was written by Ben Langmead and Cole Trapnell.
+
+ Bowtie & Bowtie 2: http://bowtie-bio.sf.net
+
+A DLL from the pthreads for Win32 library is distributed with the Win32 version
+of Bowtie 2. The pthreads for Win32 library and the GnuWin32 package have many
+contributors (see their respective web sites).
+
+ pthreads for Win32: http://sourceware.org/pthreads-win32
+ GnuWin32: http://gnuwin32.sf.net
+
+The ForkManager.pm Perl module is used in Bowtie 2's random testing framework,
+and is included as scripts/sim/contrib/ForkManager.pm. ForkManager.pm is
+written by dLux (Szabo, Balazs), with contributions by others. See the perldoc
+in ForkManager.pm for the complete list.
+
+The file ls.h includes an implementation of the Larsson-Sadakane suffix sorting
+algorithm. The implementation is by N. Jesper Larsson and was adapted somewhat
+for use in Bowtie 2.
+
+TinyThreads is a portable thread implementation, written by Marcus Geelnard,
+that provides a fairly compatible subset of the C++11 thread management
+classes. For more information, see http://tinythreadpp.bitsnbites.eu/
+
+Various users have kindly supplied patches, bug reports and feature requests
+over the years. Many, many thanks go to them.
+
+September 2011
diff --git a/HISAT2-genotype.png b/HISAT2-genotype.png
new file mode 100644
index 0000000..1327c4c
Binary files /dev/null and b/HISAT2-genotype.png differ
diff --git a/HISAT2_VERSION b/HISAT2_VERSION
new file mode 100644
index 0000000..ddd34cc
--- /dev/null
+++ b/HISAT2_VERSION
@@ -0,0 +1 @@
+2.2.1-3n-0.0.3
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..94a9ed0
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,674 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+ The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works. By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users. We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors. You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+ To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights. Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received. You must make sure that they, too, receive
+or can get the source code. And you must show them these terms so they
+know their rights.
+
+ Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+ For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software. For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+ Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so. This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software. The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable. Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products. If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+ Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary. To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ TERMS AND CONDITIONS
+
+ 0. Definitions.
+
+ "This License" refers to version 3 of the GNU General Public License.
+
+ "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+ "The Program" refers to any copyrightable work licensed under this
+License. Each licensee is addressed as "you". "Licensees" and
+"recipients" may be individuals or organizations.
+
+ To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy. The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+ A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+ To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy. Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+ To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies. Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+ An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License. If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+ 1. Source Code.
+
+ The "source code" for a work means the preferred form of the work
+for making modifications to it. "Object code" means any non-source
+form of a work.
+
+ A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+ The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form. A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+ The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities. However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work. For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+ The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+ The Corresponding Source for a work in source code form is that
+same work.
+
+ 2. Basic Permissions.
+
+ All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met. This License explicitly affirms your unlimited
+permission to run the unmodified Program. The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work. This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+ You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force. You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright. Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+ Conveying under any other circumstances is permitted solely under
+the conditions stated below. Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+ No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+ When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+ 4. Conveying Verbatim Copies.
+
+ You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+ You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+ 5. Conveying Modified Source Versions.
+
+ You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+ a) The work must carry prominent notices stating that you modified
+ it, and giving a relevant date.
+
+ b) The work must carry prominent notices stating that it is
+ released under this License and any conditions added under section
+ 7. This requirement modifies the requirement in section 4 to
+ "keep intact all notices".
+
+ c) You must license the entire work, as a whole, under this
+ License to anyone who comes into possession of a copy. This
+ License will therefore apply, along with any applicable section 7
+ additional terms, to the whole of the work, and all its parts,
+ regardless of how they are packaged. This License gives no
+ permission to license the work in any other way, but it does not
+ invalidate such permission if you have separately received it.
+
+ d) If the work has interactive user interfaces, each must display
+ Appropriate Legal Notices; however, if the Program has interactive
+ interfaces that do not display Appropriate Legal Notices, your
+ work need not make them do so.
+
+ A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit. Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+ 6. Conveying Non-Source Forms.
+
+ You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+ a) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by the
+ Corresponding Source fixed on a durable physical medium
+ customarily used for software interchange.
+
+ b) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by a
+ written offer, valid for at least three years and valid for as
+ long as you offer spare parts or customer support for that product
+ model, to give anyone who possesses the object code either (1) a
+ copy of the Corresponding Source for all the software in the
+ product that is covered by this License, on a durable physical
+ medium customarily used for software interchange, for a price no
+ more than your reasonable cost of physically performing this
+ conveying of source, or (2) access to copy the
+ Corresponding Source from a network server at no charge.
+
+ c) Convey individual copies of the object code with a copy of the
+ written offer to provide the Corresponding Source. This
+ alternative is allowed only occasionally and noncommercially, and
+ only if you received the object code with such an offer, in accord
+ with subsection 6b.
+
+ d) Convey the object code by offering access from a designated
+ place (gratis or for a charge), and offer equivalent access to the
+ Corresponding Source in the same way through the same place at no
+ further charge. You need not require recipients to copy the
+ Corresponding Source along with the object code. If the place to
+ copy the object code is a network server, the Corresponding Source
+ may be on a different server (operated by you or a third party)
+ that supports equivalent copying facilities, provided you maintain
+ clear directions next to the object code saying where to find the
+ Corresponding Source. Regardless of what server hosts the
+ Corresponding Source, you remain obligated to ensure that it is
+ available for as long as needed to satisfy these requirements.
+
+ e) Convey the object code using peer-to-peer transmission, provided
+ you inform other peers where the object code and Corresponding
+ Source of the work are being offered to the general public at no
+ charge under subsection 6d.
+
+ A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+ A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling. In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage. For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product. A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+ "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source. The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+ If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information. But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+ The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed. Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+ Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+ 7. Additional Terms.
+
+ "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law. If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+ When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it. (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.) You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+ Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+ a) Disclaiming warranty or limiting liability differently from the
+ terms of sections 15 and 16 of this License; or
+
+ b) Requiring preservation of specified reasonable legal notices or
+ author attributions in that material or in the Appropriate Legal
+ Notices displayed by works containing it; or
+
+ c) Prohibiting misrepresentation of the origin of that material, or
+ requiring that modified versions of such material be marked in
+ reasonable ways as different from the original version; or
+
+ d) Limiting the use for publicity purposes of names of licensors or
+ authors of the material; or
+
+ e) Declining to grant rights under trademark law for use of some
+ trade names, trademarks, or service marks; or
+
+ f) Requiring indemnification of licensors and authors of that
+ material by anyone who conveys the material (or modified versions of
+ it) with contractual assumptions of liability to the recipient, for
+ any liability that these contractual assumptions directly impose on
+ those licensors and authors.
+
+ All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10. If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term. If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+ If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+ Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+ 8. Termination.
+
+ You may not propagate or modify a covered work except as expressly
+provided under this License. Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+ However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+ Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+ Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License. If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+ 9. Acceptance Not Required for Having Copies.
+
+ You are not required to accept this License in order to receive or
+run a copy of the Program. Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance. However,
+nothing other than this License grants you permission to propagate or
+modify any covered work. These actions infringe copyright if you do
+not accept this License. Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+ 10. Automatic Licensing of Downstream Recipients.
+
+ Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License. You are not responsible
+for enforcing compliance by third parties with this License.
+
+ An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations. If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+ You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License. For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+ 11. Patents.
+
+ A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based. The
+work thus licensed is called the contributor's "contributor version".
+
+ A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version. For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+ In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement). To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+ If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients. "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+ If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+ A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License. You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+ Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+ 12. No Surrender of Others' Freedom.
+
+ If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all. For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+ 13. Use with the GNU Affero General Public License.
+
+ Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work. The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+ 14. Revised Versions of this License.
+
+ The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation. If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+ If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+ Later license versions may give you additional or different
+permissions. However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+ 15. Disclaimer of Warranty.
+
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. Limitation of Liability.
+
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+ 17. Interpretation of Sections 15 and 16.
+
+ If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+ If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+ <program> Copyright (C) <year> <name of author>
+ This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+ You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<https://www.gnu.org/licenses/>.
+
+ The GNU General Public License does not permit incorporating your program
+into proprietary programs. If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License. But first, please read
+<https://www.gnu.org/licenses/why-not-lgpl.html>.
diff --git a/MANUAL b/MANUAL
new file mode 100644
index 0000000..56eb557
--- /dev/null
+++ b/MANUAL
@@ -0,0 +1,1467 @@
+
+Introduction
+============
+
+What is HISAT2?
+-----------------
+
+HISAT2 is a fast and sensitive alignment program for mapping next-generation sequencing reads
+(whole-genome, transcriptome, and exome sequencing data) against the general human population
+(as well as against a single reference genome). Based on [GCSA] (an extension of [BWT] for a graph), we designed and implemented a graph FM index (GFM),
+an original approach and its first implementation to the best of our knowledge.
+In addition to using one global GFM index that represents general population,
+HISAT2 uses a large set of small GFM indexes that collectively cover the whole genome
+(each index representing a genomic region of 56 Kbp, with 55,000 indexes needed to cover human population).
+These small indexes (called local indexes) combined with several alignment strategies enable effective alignment of sequencing reads.
+This new indexing scheme is called Hierarchical Graph FM index (HGFM).
+We have developed HISAT2 based on the [HISAT] and [Bowtie2] implementations.
+HISAT2 outputs alignments in [SAM] format, enabling interoperation with a large number of other tools (e.g. [SAMtools], [GATK]) that use SAM.
+HISAT2 is distributed under the [GPLv3 license], and it runs on the command line under
+Linux, Mac OS X and Windows.
+
+[HISAT2]: http://ccb.jhu.edu/software/hisat2
+[HISAT]: http://ccb.jhu.edu/software/hisat
+[Bowtie2]: http://bowtie-bio.sf.net/bowtie2
+[Bowtie]: http://bowtie-bio.sf.net
+[Bowtie1]: http://bowtie-bio.sf.net
+[GCSA]: http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=6698337&tag=1
+[Burrows-Wheeler Transform]: http://en.wikipedia.org/wiki/Burrows-Wheeler_transform
+[BWT]: http://en.wikipedia.org/wiki/Burrows-Wheeler_transform
+[FM Index]: http://en.wikipedia.org/wiki/FM-index
+[SAM]: http://samtools.sourceforge.net/SAM1.pdf
+[SAMtools]: http://samtools.sourceforge.net
+[GATK]: http://www.broadinstitute.org/gsa/wiki/index.php/The_Genome_Analysis_Toolkit
+[TopHat2]: http://ccb.jhu.edu/software/tophat
+[Cufflinks]: http://cufflinks.cbcb.umd.edu/
+[Crossbow]: http://bowtie-bio.sf.net/crossbow
+[Myrna]: http://bowtie-bio.sf.net/myrna
+[Bowtie paper]: http://genomebiology.com/2009/10/3/R25
+[GPLv3 license]: http://www.gnu.org/licenses/gpl-3.0.html
+
+Obtaining HISAT2
+==================
+
+Download HISAT2 sources and binaries from the Releases sections on the right side.
+Binaries are available for Intel architectures (`x86_64`) running Linux, and Mac OS X.
+
+Building from source
+--------------------
+
+Building HISAT2 from source requires a GNU-like environment with GCC, GNU Make
+and other basics. It should be possible to build HISAT2 on most vanilla Linux
+installations or on a Mac installation with [Xcode] installed. HISAT2 can
+also be built on Windows using [Cygwin] or [MinGW] (MinGW recommended). For a
+MinGW build the choice of what compiler is to be used is important since this
+will determine if a 32 or 64 bit code can be successfully compiled using it. If
+there is a need to generate both 32 and 64 bit on the same machine then a multilib
+MinGW has to be properly installed. [MSYS], the [zlib] library, and depending on
+architecture [pthreads] library are also required. We are recommending a 64 bit
+build since it has some clear advantages in real life research problems. In order
+to simplify the MinGW setup it might be worth investigating popular MinGW personal
+builds since these are coming already prepared with most of the toolchains needed.
+
+First, download the [source package] from the Releases section on the right side.
+Unzip the file, change to the unzipped directory, and build the
+HISAT2 tools by running GNU `make` (usually with the command `make`, but
+sometimes with `gmake`) with no arguments. If building with MinGW, run `make`
+from the MSYS environment.
+
+HISAT2 is using the multithreading software model in order to speed up
+execution times on SMP architectures where this is possible. On POSIX
+platforms (like Linux, Mac OS, etc.) it needs the pthread library. Although
+it is possible to use pthread library on non-POSIX platform like Windows, due
+to performance reasons HISAT2 will try to use Windows native multithreading
+if possible.
+
+For the support of SRA data access in HISAT2, please download and install the [NCBI-NGS] toolkit.
+When running `make`, specify additional variables as follows:
+`make USE_SRA=1 NCBI_NGS_DIR=/path/to/NCBI-NGS-directory NCBI_VDB_DIR=/path/to/NCBI-NGS-directory`,
+where `NCBI_NGS_DIR` and `NCBI_VDB_DIR` will be used in Makefile for -I and -L compilation options.
+For example, $(NCBI_NGS_DIR)/include and $(NCBI_NGS_DIR)/lib64 will be used.
+
+[Cygwin]: http://www.cygwin.com/
+[MinGW]: http://www.mingw.org/
+[MSYS]: http://www.mingw.org/wiki/msys
+[zlib]: http://cygwin.com/packages/mingw-zlib/
+[pthreads]: http://sourceware.org/pthreads-win32/
+[GnuWin32]: http://gnuwin32.sf.net/packages/coreutils.htm
+[Download]: https://sourceforge.net/projects/bowtie-bio/files/bowtie2/
+[sourceforge site]: https://sourceforge.net/projects/bowtie-bio/files/bowtie2/
+[source package]: http://ccb.jhu.edu/software/hisat2/downloads/hisat2-2.0.0-beta-source.zip
+[Xcode]: http://developer.apple.com/xcode/
+[NCBI-NGS]: https://github.com/ncbi/ngs/wiki/Downloads
+
+Running HISAT2
+=============
+
+Adding to PATH
+--------------
+
+By adding your new HISAT2 directory to your [PATH environment variable], you
+ensure that whenever you run `hisat2`, `hisat2-build` or `hisat2-inspect`
+from the command line, you will get the version you just installed without
+having to specify the entire path. This is recommended for most users. To do
+this, follow your operating system's instructions for adding the directory to
+your [PATH].
+
+If you would like to install HISAT2 by copying the HISAT2 executable files
+to an existing directory in your [PATH], make sure that you copy all the
+executables, including `hisat2`, `hisat2-align-s`, `hisat2-align-l`, `hisat2-build`, `hisat2-build-s`, `hisat2-build-l`, `hisat2-inspect`, `hisat2-inspect-s` and
+`hisat2-inspect-l`.
+
+[PATH environment variable]: http://en.wikipedia.org/wiki/PATH_(variable)
+[PATH]: http://en.wikipedia.org/wiki/PATH_(variable)
+
+Reporting
+---------
+
+The reporting mode governs how many alignments HISAT2 looks for, and how to
+report them.
+
+In general, when we say that a read has an alignment, we mean that it has a
+[valid alignment]. When we say that a read has multiple alignments, we mean
+that it has multiple alignments that are valid and distinct from one another.
+
+By default, HISAT2 may soft-clip reads near their 5' and 3' ends. Users can control this behavior by setting different penalties for soft-clipping (`--sp`) or by disallowing soft-clipping (`--no-softclip`).
+
+### Distinct alignments map a read to different places
+
+Two alignments for the same individual read are "distinct" if they map the same
+read to different places. Specifically, we say that two alignments are distinct
+if there are no alignment positions where a particular read offset is aligned
+opposite a particular reference offset in both alignments with the same
+orientation. E.g. if the first alignment is in the forward orientation and
+aligns the read character at read offset 10 to the reference character at
+chromosome 3, offset 3,445,245, and the second alignment is also in the forward
+orientation and also aligns the read character at read offset 10 to the
+reference character at chromosome 3, offset 3,445,245, they are not distinct
+alignments.
+
+Two alignments for the same pair are distinct if either the mate 1s in the two
+paired-end alignments are distinct or the mate 2s in the two alignments are
+distinct or both.
+
+### Default mode: search for one or more alignments, report each
+
+HISAT2 searches for up to N distinct, primary alignments for
+each read, where N equals the integer specified with the `-k` parameter.
+Primary alignments are alignments whose alignment score is equal to or higher than that of any other alignment.
+It is possible that multiple distinct alignments have the same score.
+That is, if `-k 2` is specified, HISAT2 will search for at most 2 distinct
+alignments. The alignment score for a paired-end alignment equals the sum of the
+alignment scores of the individual mates. Each reported read or pair alignment
+beyond the first has the SAM 'secondary' bit (which equals 256) set in its FLAGS
+field. See the [SAM specification] for details.
+
+HISAT2 does not "find" alignments in any specific order, so for reads that
+have more than N distinct, valid alignments, HISAT2 does not guarantee that
+the N alignments reported are the best possible in terms of alignment score.
+Still, this mode can be effective and fast in situations where the user cares
+more about whether a read aligns (or aligns a certain number of times) than
+where exactly it originated.
+
+[SAM specification]: http://samtools.sourceforge.net/SAM1.pdf
+
+Alignment summary
+------------------
+
+When HISAT2 finishes running, it prints messages summarizing what happened.
+These messages are printed to the "standard error" ("stderr") filehandle. For
+datasets consisting of unpaired reads, the summary might look like this:
+
+ 20000 reads; of these:
+ 20000 (100.00%) were unpaired; of these:
+ 1247 (6.24%) aligned 0 times
+ 18739 (93.69%) aligned exactly 1 time
+ 14 (0.07%) aligned >1 times
+ 93.77% overall alignment rate
+
+For datasets consisting of pairs, the summary might look like this:
+
+ 10000 reads; of these:
+ 10000 (100.00%) were paired; of these:
+ 650 (6.50%) aligned concordantly 0 times
+ 8823 (88.23%) aligned concordantly exactly 1 time
+ 527 (5.27%) aligned concordantly >1 times
+ ----
+ 650 pairs aligned concordantly 0 times; of these:
+ 34 (5.23%) aligned discordantly 1 time
+ ----
+ 616 pairs aligned 0 times concordantly or discordantly; of these:
+ 1232 mates make up the pairs; of these:
+ 660 (53.57%) aligned 0 times
+ 571 (46.35%) aligned exactly 1 time
+ 1 (0.08%) aligned >1 times
+ 96.70% overall alignment rate
+
+The indentation indicates how subtotals relate to totals.
+
+Wrapper
+-------
+
+The `hisat2`, `hisat2-build` and `hisat2-inspect` executables are actually
+wrapper scripts that call binary programs as appropriate. The wrappers shield
+users from having to distinguish between "small" and "large" index formats,
+discussed briefly in the following section. Also, the `hisat2` wrapper
+provides some key functionality, like the ability to handle compressed inputs,
+and the functionality for `--un`, `--al` and related options.
+
+It is recommended that you always run the hisat2 wrappers and not run the
+binaries directly.
+
+Small and large indexes
+-----------------------
+
+`hisat2-build` can index reference genomes of any size. For genomes less than
+about 4 billion nucleotides in length, `hisat2-build` builds a "small" index
+using 32-bit numbers in various parts of the index. When the genome is longer,
+`hisat2-build` builds a "large" index using 64-bit numbers. Small indexes are
+stored in files with the `.ht2` extension, and large indexes are stored in
+files with the `.ht2l` extension. The user need not worry about whether a
+particular index is small or large; the wrapper scripts will automatically build
+and use the appropriate index.
+
+Performance tuning
+------------------
+
+1. If your computer has multiple processors/cores, use `-p`
+
+ The `-p` option causes HISAT2 to launch a specified number of parallel
+ search threads. Each thread runs on a different processor/core and all
+ threads find alignments in parallel, increasing alignment throughput by
+ approximately a multiple of the number of threads (though in practice,
+ speedup is somewhat worse than linear).
+
+Command Line
+------------
+
+### Setting function options
+
+Some HISAT2 options specify a function rather than an individual number or
+setting. In these cases the user specifies three parameters: (a) a function
+type `F`, (b) a constant term `B`, and (c) a coefficient `A`. The available
+function types are constant (`C`), linear (`L`), square-root (`S`), and natural
+log (`G`). The parameters are specified as `F,B,A` - that is, the function type,
+the constant term, and the coefficient are separated by commas with no
+whitespace. The constant term and coefficient may be negative and/or
+floating-point numbers.
+
+For example, if the function specification is `L,-0.4,-0.6`, then the function
+defined is:
+
+ f(x) = -0.4 + -0.6 * x
+
+If the function specification is `G,1,5.4`, then the function defined is:
+
+ f(x) = 1.0 + 5.4 * ln(x)
+
+See the documentation for the option in question to learn what the parameter `x`
+is for. For example, in the case of the `--score-min` option, the function
+`f(x)` sets the minimum alignment score necessary for an alignment to be
+considered valid, and `x` is the read length.
+
+### Usage
+
+ hisat2 [options]* -x <hisat2-idx> {-1 <m1> -2 <m2> | -U <r> | --sra-acc <SRA accession number>} [-S <hit>]
+
+### Main arguments
+
+ -x <hisat2-idx>
+
+The basename of the index for the reference genome. The basename is the name of
+any of the index files up to but not including the final `.1.ht2` / etc.
+`hisat2` looks for the specified index first in the current directory,
+then in the directory specified in the `HISAT2_INDEXES` environment variable.
+
+ -1 <m1>
+
+Comma-separated list of files containing mate 1s (filename usually includes
+`_1`), e.g. `-1 flyA_1.fq,flyB_1.fq`. Sequences specified with this option must
+correspond file-for-file and read-for-read with those specified in `<m2>`. Reads
+may be a mix of different lengths. If `-` is specified, `hisat2` will read the
+mate 1s from the "standard in" or "stdin" filehandle.
+
+ -2 <m2>
+
+Comma-separated list of files containing mate 2s (filename usually includes
+`_2`), e.g. `-2 flyA_2.fq,flyB_2.fq`. Sequences specified with this option must
+correspond file-for-file and read-for-read with those specified in `<m1>`. Reads
+may be a mix of different lengths. If `-` is specified, `hisat2` will read the
+mate 2s from the "standard in" or "stdin" filehandle.
+
+ -U <r>
+
+Comma-separated list of files containing unpaired reads to be aligned, e.g.
+`lane1.fq,lane2.fq,lane3.fq,lane4.fq`. Reads may be a mix of different lengths.
+If `-` is specified, `hisat2` gets the reads from the "standard in" or "stdin"
+filehandle.
+
+ --sra-acc <SRA accession number>
+
+Comma-separated list of SRA accession numbers, e.g. `--sra-acc SRR353653,SRR353654`.
+Information about read types is available at http://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?sp=runinfo&acc=sra-acc&retmode=xml,
+where sra-acc is SRA accession number. If users run HISAT2 on a computer cluster, it is recommended to disable SRA-related caching (see the instruction at [SRA-MANUAL]).
+
+[SRA-MANUAL]: https://github.com/ncbi/sra-tools/wiki/Toolkit-Configuration
+
+ -S <hit>
+
+File to write SAM alignments to. By default, alignments are written to the
+"standard out" or "stdout" filehandle (i.e. the console).
+
+### Options
+
+#### Input options
+
+ -q
+
+Reads (specified with `<m1>`, `<m2>`, `<s>`) are FASTQ files. FASTQ files
+usually have extension `.fq` or `.fastq`. FASTQ is the default format. See
+also: `--solexa-quals` and `--int-quals`.
+
+ --qseq
+
+Reads (specified with `<m1>`, `<m2>`, `<s>`) are QSEQ files. QSEQ files usually
+end in `_qseq.txt`. See also: `--solexa-quals` and `--int-quals`.
+
+ -f
+
+Reads (specified with `<m1>`, `<m2>`, `<s>`) are FASTA files. FASTA files
+usually have extension `.fa`, `.fasta`, `.mfa`, `.fna` or similar. FASTA files
+do not have a way of specifying quality values, so when `-f` is set, the result
+is as if `--ignore-quals` is also set.
+
+ -r
+
+Reads (specified with `<m1>`, `<m2>`, `<s>`) are files with one input sequence
+per line, without any other information (no read names, no qualities). When
+`-r` is set, the result is as if `--ignore-quals` is also set.
+
+ -c
+
+The read sequences are given on command line. I.e. `<m1>`, `<m2>` and
+`<singles>` are comma-separated lists of reads rather than lists of read files.
+There is no way to specify read names or qualities, so `-c` also implies
+`--ignore-quals`.
+
+ -s/--skip <int>
+
+Skip (i.e. do not align) the first `<int>` reads or pairs in the input.
+
+ -u/--qupto <int>
+
+Align the first `<int>` reads or read pairs from the input (after the
+`-s`/`--skip` reads or pairs have been skipped), then stop. Default: no limit.
+
+ -5/--trim5 <int>
+
+Trim `<int>` bases from 5' (left) end of each read before alignment (default: 0).
+
+ -3/--trim3 <int>
+
+Trim `<int>` bases from 3' (right) end of each read before alignment (default:
+0).
+
+ --phred33
+
+Input qualities are ASCII chars equal to the [Phred quality] plus 33. This is
+also called the "Phred+33" encoding, which is used by the very latest Illumina
+pipelines.
+
+[Phred quality]: http://en.wikipedia.org/wiki/Phred_quality_score
+
+ --phred64
+
+Input qualities are ASCII chars equal to the [Phred quality] plus 64. This is
+also called the "Phred+64" encoding.
+
+ --solexa-quals
+
+Convert input qualities from [Solexa][Phred quality] (which can be negative) to
+[Phred][Phred quality] (which can't). This scheme was used in older Illumina GA
+Pipeline versions (prior to 1.3). Default: off.
+
+ --int-quals
+
+Quality values are represented in the read input file as space-separated ASCII
+integers, e.g., `40 40 30 40`..., rather than ASCII characters, e.g., `II?I`....
+ Integers are treated as being on the [Phred quality] scale unless
+`--solexa-quals` is also specified. Default: off.
+
+#### Alignment options
+
+ --n-ceil <func>
+
+Sets a function governing the maximum number of ambiguous characters (usually
+`N`s and/or `.`s) allowed in a read as a function of read length. For instance,
+specifying `L,0,0.15` sets the N-ceiling function `f` to `f(x) = 0 + 0.15 * x`,
+where x is the read length. See also: [setting function options]. Reads
+exceeding this ceiling are [filtered out]. Default: `L,0,0.15`.
+
+ --ignore-quals
+
+When calculating a mismatch penalty, always consider the quality value at the
+mismatched position to be the highest possible, regardless of the actual value.
+I.e. input is treated as though all quality values are high. This is also the
+default behavior when the input doesn't specify quality values (e.g. in `-f`,
+`-r`, or `-c` modes).
+
+ --nofw/--norc
+
+If `--nofw` is specified, `hisat2` will not attempt to align unpaired reads to
+the forward (Watson) reference strand. If `--norc` is specified, `hisat2` will
+not attempt to align unpaired reads against the reverse-complement (Crick)
+reference strand. In paired-end mode, `--nofw` and `--norc` pertain to the
+fragments; i.e. specifying `--nofw` causes `hisat2` to explore only those
+paired-end configurations corresponding to fragments from the reverse-complement
+(Crick) strand. Default: both strands enabled.
+
+#### Scoring options
+
+ --mp MX,MN
+
+Sets the maximum (`MX`) and minimum (`MN`) mismatch penalties, both integers. A
+number less than or equal to `MX` and greater than or equal to `MN` is
+subtracted from the alignment score for each position where a read character
+aligns to a reference character, the characters do not match, and neither is an
+`N`. If `--ignore-quals` is specified, the number subtracted equals `MX`.
+Otherwise, the number subtracted is `MN + floor( (MX-MN)(MIN(Q, 40.0)/40.0) )`
+where Q is the Phred quality value. Default: `MX` = 6, `MN` = 2.
+
+ --sp MX,MN
+
+Sets the maximum (`MX`) and minimum (`MN`) penalties for soft-clipping per base,
+both integers. A number less than or equal to `MX` and greater than or equal to `MN` is
+subtracted from the alignment score for each position.
+The number subtracted is `MN + floor( (MX-MN)(MIN(Q, 40.0)/40.0) )`
+where Q is the Phred quality value. Default: `MX` = 2, `MN` = 1.
+
+ --no-softclip
+
+Disallow soft-clipping.
+
+ --np <int>
+
+Sets penalty for positions where the read, reference, or both, contain an
+ambiguous character such as `N`. Default: 1.
+
+ --rdg <int1>,<int2>
+
+Sets the read gap open (`<int1>`) and extend (`<int2>`) penalties. A read gap of
+length N gets a penalty of `<int1>` + N * `<int2>`. Default: 5, 3.
+
+ --rfg <int1>,<int2>
+
+Sets the reference gap open (`<int1>`) and extend (`<int2>`) penalties. A
+reference gap of length N gets a penalty of `<int1>` + N * `<int2>`. Default:
+5, 3.
+
+ --score-min <func>
+
+Sets a function governing the minimum alignment score needed for an alignment to
+be considered "valid" (i.e. good enough to report). This is a function of read
+length. For instance, specifying `L,0,-0.6` sets the minimum-score function `f`
+to `f(x) = 0 + -0.6 * x`, where `x` is the read length. See also: [setting
+function options]. The default is `L,0,-0.2`.
+
+#### Spliced alignment options
+
+ --pen-cansplice <int>
+
+Sets the penalty for each pair of canonical splice sites (e.g. GT/AG). Default: 0.
+
+ --pen-noncansplice <int>
+
+Sets the penalty for each pair of non-canonical splice sites (e.g. non-GT/AG). Default: 12.
+
+ --pen-canintronlen <func>
+
+Sets the penalty for long introns with canonical splice sites so that alignments with shorter introns are preferred
+to those with longer ones. Default: G,-8,1
+
+ --pen-noncanintronlen <func>
+
+Sets the penalty for long introns with noncanonical splice sites so that alignments with shorter introns are preferred
+to those with longer ones. Default: G,-8,1
+
+ --min-intronlen <int>
+
+Sets minimum intron length. Default: 20
+
+ --max-intronlen <int>
+
+Sets maximum intron length. Default: 500000
+
+ --known-splicesite-infile <path>
+
+With this mode, you can provide a list of known splice sites, which HISAT2 makes use of to align reads with small anchors.
+You can create such a list using `python hisat2_extract_splice_sites.py genes.gtf > splicesites.txt`,
+where `hisat2_extract_splice_sites.py` is included in the HISAT2 package, `genes.gtf` is a gene annotation file,
+and `splicesites.txt` is a list of splice sites with which you provide HISAT2 in this mode.
+Note that using indexes built with annotated transcripts (such as genome_tran or genome_snp_tran) works better
+than using this option. Providing splice sites that are already included in the indexes has no effect.
+
+ --novel-splicesite-outfile <path>
+
+In this mode, HISAT2 reports a list of splice sites in the file `<path>`:
+ chromosome name `<tab>` genomic position of the flanking base on the left side of an intron `<tab>` genomic position of the flanking base on the right `<tab>` strand (+, -, and .)
+ '.' indicates an unknown strand for non-canonical splice sites.
+
+ --novel-splicesite-infile <path>
+
+With this mode, you can provide a list of novel splice sites that were generated from the above option "--novel-splicesite-outfile".
+
+ --no-temp-splicesite
+
+HISAT2, by default, makes use of splice sites found by earlier reads to align later reads in the same run,
+in particular, reads with small anchors (<= 15 bp).
+The option disables this default alignment strategy.
+
+ --no-spliced-alignment
+
+Disable spliced alignment.
+
+ --rna-strandness <string>
+
+Specify strand-specific information: the default is unstranded.
+For single-end reads, use F or R.
+ 'F' means a read corresponds to a transcript.
+ 'R' means a read corresponds to the reverse complemented counterpart of a transcript.
+For paired-end reads, use either FR or RF.
+With this option being used, every read alignment will have an XS attribute tag:
+ '+' means a read belongs to a transcript on '+' strand of genome.
+ '-' means a read belongs to a transcript on '-' strand of genome.
+
+(TopHat has a similar option, --library-type option, where fr-firststrand corresponds to R and RF; fr-secondstrand corresponds to F and FR.)
+
+ --tmo/--transcriptome-mapping-only
+
+Report only those alignments within known transcripts.
+
+ --dta/--downstream-transcriptome-assembly
+
+Report alignments tailored for transcript assemblers including StringTie.
+With this option, HISAT2 requires longer anchor lengths for de novo discovery of splice sites.
+This leads to fewer alignments with short-anchors,
+which helps transcript assemblers improve significantly in computation and memory usage.
+
+ --dta-cufflinks
+
+Report alignments tailored specifically for Cufflinks. In addition to what HISAT2 does with the above option (--dta),
+with this option, HISAT2 looks for novel splice sites with three signals (GT/AG, GC/AG, AT/AC), but all user-provided splice sites are used irrespective of their signals.
+HISAT2 produces an optional field, XS:A:[+-], for every spliced alignment.
+
+ --no-templatelen-adjustment
+
+Disables template length adjustment for RNA-seq reads.
+
+#### Reporting options
+
+ -k <int>
+
+It searches for at most `<int>` distinct, primary alignments for each read.
+Primary alignments are alignments whose alignment score is equal to or higher than that of any other alignment.
+The search terminates when it can't find more distinct valid alignments, or when it
+finds `<int>`, whichever happens first. The alignment score for a paired-end
+alignment equals the sum of the alignment scores of the individual mates. Each
+reported read or pair alignment beyond the first has the SAM 'secondary' bit
+(which equals 256) set in its FLAGS field. For reads that have more than
+`<int>` distinct, valid alignments, `hisat2` does not guarantee that the
+`<int>` alignments reported are the best possible in terms of alignment score. Default: 5 (HFM) or 10 (HGFM)
+
+Note: HISAT2 is not designed with large values for `-k` in mind, and when
+aligning reads to long, repetitive genomes large `-k` can be very, very slow.
+
+ --max-seeds <int>
+
+HISAT2, like other aligners, uses seed-and-extend approaches. HISAT2 tries to extend seeds to full-length alignments. In HISAT2, `--max-seeds` is used to control the maximum number of seeds that will be extended. HISAT2 extends up to this many seeds and skips the rest of the seeds. Large values for `--max-seeds` may improve alignment sensitivity, but HISAT2 is not designed with large values for `--max-seeds` in mind, and when aligning reads to long, repetitive genomes large `--max-seeds` can be very, very slow. The default value is the maximum of 5 and the value that comes with `-k`.
+
+ --secondary
+
+Report secondary alignments.
+
+#### Paired-end options
+
+ -I/--minins
+
+The minimum fragment length for valid paired-end alignments. This option is valid only with --no-spliced-alignment.
+E.g. if `-I 60` is specified and a paired-end alignment consists of two 20-bp alignments in the
+appropriate orientation with a 20-bp gap between them, that alignment is
+considered valid (as long as `-X` is also satisfied). A 19-bp gap would not
+be valid in that case. If trimming options `-3` or `-5` are also used, the
+`-I` constraint is applied with respect to the untrimmed mates.
+
+The larger the difference between `-I` and `-X`, the slower HISAT2 will
+run. This is because larger differences between `-I` and `-X` require that
+HISAT2 scan a larger window to determine if a concordant alignment exists.
+For typical fragment length ranges (200 to 400 nucleotides), HISAT2 is very
+efficient.
+
+Default: 0 (essentially imposing no minimum)
+
+ -X/--maxins
+
+The maximum fragment length for valid paired-end alignments. This option is valid only with --no-spliced-alignment.
+E.g. if `-X 100` is specified and a paired-end alignment consists of two 20-bp alignments in the
+proper orientation with a 60-bp gap between them, that alignment is considered
+valid (as long as `-I` is also satisfied). A 61-bp gap would not be valid in
+that case. If trimming options `-3` or `-5` are also used, the `-X`
+constraint is applied with respect to the untrimmed mates, not the trimmed
+mates.
+
+The larger the difference between `-I` and `-X`, the slower HISAT2 will
+run. This is because larger differences between `-I` and `-X` require that
+HISAT2 scan a larger window to determine if a concordant alignment exists.
+For typical fragment length ranges (200 to 400 nucleotides), HISAT2 is very
+efficient.
+
+Default: 500.
+
+ --fr/--rf/--ff
+
+The upstream/downstream mate orientations for a valid paired-end alignment
+against the forward reference strand. E.g., if `--fr` is specified and there is
+a candidate paired-end alignment where mate 1 appears upstream of the reverse
+complement of mate 2 and the fragment length constraints (`-I` and `-X`) are
+met, that alignment is valid. Also, if mate 2 appears upstream of the reverse
+complement of mate 1 and all other constraints are met, that too is valid.
+`--rf` likewise requires that an upstream mate1 be reverse-complemented and a
+downstream mate2 be forward-oriented. `--ff` requires both an upstream mate 1
+and a downstream mate 2 to be forward-oriented. Default: `--fr` (appropriate
+for Illumina's Paired-end Sequencing Assay).
+
+ --no-mixed
+
+By default, when `hisat2` cannot find a concordant or discordant alignment for
+a pair, it then tries to find alignments for the individual mates. This option
+disables that behavior.
+
+ --no-discordant
+
+By default, `hisat2` looks for discordant alignments if it cannot find any
+concordant alignments. A discordant alignment is an alignment where both mates
+align uniquely, but that does not satisfy the paired-end constraints
+(`--fr`/`--rf`/`--ff`, `-I`, `-X`). This option disables that behavior.
+
+#### Output options
+
+ -t/--time
+
+Print the wall-clock time required to load the index files and align the reads.
+This is printed to the "standard error" ("stderr") filehandle. Default: off.
+
+ --un
+ --un-gz
+ --un-bz2
+
+Write unpaired reads that fail to align to file at `<path>`. These reads
+correspond to the SAM records with the FLAGS `0x4` bit set and neither the
+`0x40` nor `0x80` bits set. If `--un-gz` is specified, output will be gzip
+compressed. If `--un-bz2` is specified, output will be bzip2 compressed. Reads
+written in this way will appear exactly as they did in the input file, without
+any modification (same sequence, same name, same quality string, same quality
+encoding). Reads will not necessarily appear in the same order as they did in
+the input.
+
+ --al
+ --al-gz
+ --al-bz2
+
+Write unpaired reads that align at least once to file at `<path>`. These reads
+correspond to the SAM records with the FLAGS `0x4`, `0x40`, and `0x80` bits
+unset. If `--al-gz` is specified, output will be gzip compressed. If `--al-bz2`
+is specified, output will be bzip2 compressed. Reads written in this way will
+appear exactly as they did in the input file, without any modification (same
+sequence, same name, same quality string, same quality encoding). Reads will
+not necessarily appear in the same order as they did in the input.
+
+ --un-conc
+ --un-conc-gz
+ --un-conc-bz2
+
+Write paired-end reads that fail to align concordantly to file(s) at `<path>`.
+These reads correspond to the SAM records with the FLAGS `0x4` bit set and
+either the `0x40` or `0x80` bit set (depending on whether it's mate #1 or #2).
+`.1` and `.2` strings are added to the filename to distinguish which file
+contains mate #1 and mate #2. If a percent symbol, `%`, is used in `<path>`,
+the percent symbol is replaced with `1` or `2` to make the per-mate filenames.
+Otherwise, `.1` or `.2` are added before the final dot in `<path>` to make the
+per-mate filenames. Reads written in this way will appear exactly as they did
+in the input files, without any modification (same sequence, same name, same
+quality string, same quality encoding). Reads will not necessarily appear in
+the same order as they did in the inputs.
+
+ --al-conc
+ --al-conc-gz
+ --al-conc-bz2
+
+Write paired-end reads that align concordantly at least once to file(s) at
+`<path>`. These reads correspond to the SAM records with the FLAGS `0x4` bit
+unset and either the `0x40` or `0x80` bit set (depending on whether it's mate #1
+or #2). `.1` and `.2` strings are added to the filename to distinguish which
+file contains mate #1 and mate #2. If a percent symbol, `%`, is used in
+`<path>`, the percent symbol is replaced with `1` or `2` to make the per-mate
+filenames. Otherwise, `.1` or `.2` are added before the final dot in `<path>` to
+make the per-mate filenames. Reads written in this way will appear exactly as
+they did in the input files, without any modification (same sequence, same name,
+same quality string, same quality encoding). Reads will not necessarily appear
+in the same order as they did in the inputs.
+
+ --quiet
+
+Print nothing besides alignments and serious errors.
+
+ --summary-file
+
+Print alignment summary to this file.
+
+ --new-summary
+
+Print alignment summary in a new style, which is more machine-friendly.
+
+ --met-file
+
+Write `hisat2` metrics to file `<path>`. Having alignment metrics can be useful
+for debugging certain problems, especially performance issues. See also:
+`--met`. Default: metrics disabled.
+
+ --met-stderr
+
+Write `hisat2` metrics to the "standard error" ("stderr") filehandle. This is
+not mutually exclusive with `--met-file`. Having alignment metrics can be
+useful for debugging certain problems, especially performance issues. See also:
+`--met`. Default: metrics disabled.
+
+ --met
+
+Write a new `hisat2` metrics record every `<int>` seconds. Only matters if
+either `--met-stderr` or `--met-file` is specified. Default: 1.
+
+#### SAM options
+
+ --no-unal
+
+Suppress SAM records for reads that failed to align.
+
+ --no-hd
+
+Suppress SAM header lines (starting with `@`).
+
+ --no-sq
+
+Suppress `@SQ` SAM header lines.
+
+ --rg-id
+
+Set the read group ID to `<text>`. This causes the SAM `@RG` header line to be
+printed, with `<text>` as the value associated with the `ID:` tag. It also
+causes the `RG:Z:` extra field to be attached to each SAM output record, with
+value set to `<text>`.
+
+ --rg
+
+Add `<text>` (usually of the form `TAG:VAL`, e.g. `SM:Pool1`) as a field on the
+`@RG` header line. Note: in order for the `@RG` line to appear, `--rg-id`
+must also be specified. This is because the `ID` tag is required by the [SAM
+Spec][SAM]. Specify `--rg` multiple times to set multiple fields. See the
+[SAM Spec][SAM] for details about what fields are legal.
+
+ --remove-chrname
+
+Remove 'chr' from reference names in alignment (e.g., chr18 to 18)
+
+ --add-chrname
+
+Add 'chr' to reference names in alignment (e.g., 18 to chr18)
+
+ --omit-sec-seq
+
+When printing secondary alignments, HISAT2 by default will write out the `SEQ`
+and `QUAL` strings. Specifying this option causes HISAT2 to print an asterisk
+in those fields instead.
+
+#### Performance options
+
+ -o/--offrate
+
+Override the offrate of the index with `<int>`. If `<int>` is greater
+than the offrate used to build the index, then some row markings are
+discarded when the index is read into memory. This reduces the memory
+footprint of the aligner but requires more time to calculate text
+offsets. `<int>` must be greater than the value used to build the
+index.
+
+ -p/--threads NTHREADS
+
+Launch `NTHREADS` parallel search threads (default: 1). Threads will run on
+separate processors/cores and synchronize when parsing reads and outputting
+alignments. Searching for alignments is highly parallel, and speedup is close
+to linear. Increasing `-p` increases HISAT2's memory footprint. E.g. when
+aligning to a human genome index, increasing `-p` from 1 to 8 increases the
+memory footprint by a few hundred megabytes. This option is only available if
+`hisat2` is linked with the `pthreads` library (i.e. if `HISAT2_PTHREADS=0` is
+not specified at build time).
+
+ --reorder
+
+Guarantees that output SAM records are printed in an order corresponding to the
+order of the reads in the original input file, even when `-p` is set greater
+than 1. Specifying `--reorder` and setting `-p` greater than 1 causes HISAT2
+to run somewhat slower and use somewhat more memory than if `--reorder` were
+not specified. Has no effect if `-p` is set to 1, since output order will
+naturally correspond to input order in that case.
+
+ --mm
+
+Use memory-mapped I/O to load the index, rather than typical file I/O.
+Memory-mapping allows many concurrent `hisat2` processes on the same computer to
+share the same memory image of the index (i.e. you pay the memory overhead just
+once). This facilitates memory-efficient parallelization of `hisat2` in
+situations where using `-p` is not possible or not preferable.
+
+#### Other options
+
+ --qc-filter
+
+Filter out reads for which the QSEQ filter field is non-zero. Only has an
+effect when read format is `--qseq`. Default: off.
+
+ --seed
+
+Use `<int>` as the seed for the pseudo-random number generator. Default: 0.
+
+ --non-deterministic
+
+Normally, HISAT2 re-initializes its pseudo-random generator for each read. It
+seeds the generator with a number derived from (a) the read name, (b) the
+nucleotide sequence, (c) the quality sequence, (d) the value of the `--seed`
+option. This means that if two reads are identical (same name, same
+nucleotides, same qualities) HISAT2 will find and report the same alignment(s)
+for both, even if there was ambiguity. When `--non-deterministic` is specified,
+HISAT2 re-initializes its pseudo-random generator for each read using the
+current time. This means that HISAT2 will not necessarily report the same
+alignment for two identical reads. This is counter-intuitive for some users,
+but might be more appropriate in situations where the input consists of many
+identical reads.
+
+ --version
+
+Print version information and quit.
+
+ -h/--help
+
+Print usage information and quit.
+
+SAM output
+----------
+
+Following is a brief description of the [SAM] format as output by `hisat2`.
+For more details, see the [SAM format specification][SAM].
+
+By default, `hisat2` prints a SAM header with `@HD`, `@SQ` and `@PG` lines.
+When one or more `--rg` arguments are specified, `hisat2` will also print
+an `@RG` line that includes all user-specified `--rg` tokens separated by
+tabs.
+
+Each subsequent line describes an alignment or, if the read failed to align, a
+read. Each line is a collection of at least 12 fields separated by tabs; from
+left to right, the fields are:
+
+1. Name of read that aligned.
+
+ Note that the [SAM specification] disallows whitespace in the read name.
+ If the read name contains any whitespace characters, HISAT2 will truncate
+ the name at the first whitespace character. This is similar to the
+ behavior of other tools.
+
+2. Sum of all applicable flags. Flags relevant to HISAT2 are:
+
+ 1
+
+ The read is one of a pair
+
+ 2
+
+ The alignment is one end of a proper paired-end alignment
+
+ 4
+
+ The read has no reported alignments
+
+ 8
+
+ The read is one of a pair and has no reported alignments
+
+ 16
+
+ The alignment is to the reverse reference strand
+
+ 32
+
+ The other mate in the paired-end alignment is aligned to the
+ reverse reference strand
+
+ 64
+
+ The read is mate 1 in a pair
+
+ 128
+
+ The read is mate 2 in a pair
+
+ Thus, an unpaired read that aligns to the reverse reference strand
+ will have flag 16. A paired-end read that aligns and is the first
+ mate in the pair will have flag 83 (= 64 + 16 + 2 + 1).
+
+3. Name of reference sequence where alignment occurs
+
+4. 1-based offset into the forward reference strand where leftmost
+ character of the alignment occurs
+
+5. Mapping quality
+
+6. CIGAR string representation of alignment
+
+7. Name of reference sequence where mate's alignment occurs. Set to `=` if the
+mate's reference sequence is the same as this alignment's, or `*` if there is no
+mate.
+
+8. 1-based offset into the forward reference strand where leftmost character of
+the mate's alignment occurs. Offset is 0 if there is no mate.
+
+9. Inferred fragment length. Size is negative if the mate's alignment occurs
+upstream of this alignment. Size is 0 if the mates did not align concordantly.
+However, size is non-0 if the mates aligned discordantly to the same
+chromosome.
+
+10. Read sequence (reverse-complemented if aligned to the reverse strand)
+
+11. ASCII-encoded read qualities (reversed if the read aligned to
+the reverse strand). The encoded quality values are on the [Phred quality]
+scale and the encoding is ASCII-offset by 33 (ASCII char `!`), similarly to a
+[FASTQ] file.
+
+12. Optional fields. Fields are tab-separated. `hisat2` outputs zero or more
+of these optional fields for each alignment, depending on the type of the
+alignment:
+
+ AS:i:
+
+ Alignment score. Can be negative. Only present if SAM record is for
+ an aligned read.
+
+ ZS:i:
+
+ Alignment score for the best-scoring alignment found other than the
+ alignment reported. Can be negative. Only present if the SAM record is
+ for an aligned read and more than one alignment was found for the read.
+ Note that, when the read is part of a concordantly-aligned pair, this score
+ could be greater than `AS:i`.
+
+ YS:i:
+
+ Alignment score for opposite mate in the paired-end alignment. Only present
+ if the SAM record is for a read that aligned as part of a paired-end
+ alignment.
+
+ XN:i:
+
+ The number of ambiguous bases in the reference covering this alignment.
+ Only present if SAM record is for an aligned read.
+
+ XM:i:
+
+ The number of mismatches in the alignment. Only present if SAM record is
+ for an aligned read.
+
+ XO:i:
+
+ The number of gap opens, for both read and reference gaps, in the alignment.
+ Only present if SAM record is for an aligned read.
+
+ XG:i:
+
+ The number of gap extensions, for both read and reference gaps, in the
+ alignment. Only present if SAM record is for an aligned read.
+
+ NM:i:
+
+ The edit distance; that is, the minimal number of one-nucleotide edits
+ (substitutions, insertions and deletions) needed to transform the read
+ string into the reference string. Only present if SAM record is for an
+ aligned read.
+
+ YF:Z:
+
+ String indicating reason why the read was filtered out. See also:
+ [Filtering]. Only appears for reads that were filtered out.
+
+ YT:Z:
+
+ Value of `UU` indicates the read was not part of a pair. Value of `CP`
+ indicates the read was part of a pair and the pair aligned concordantly.
+ Value of `DP` indicates the read was part of a pair and the pair aligned
+ discordantly. Value of `UP` indicates the read was part of a pair but the
+ pair failed to align either concordantly or discordantly.
+
+ MD:Z:
+
+ A string representation of the mismatched reference bases in the alignment.
+ See [SAM] format specification for details. Only present if SAM record is
+ for an aligned read.
+
+ XS:A:
+
+ Values of `+` and `-` indicate the read is mapped to transcripts on sense and anti-sense
+ strands, respectively. Spliced alignments need to have this field, which is required in Cufflinks and StringTie.
+ We can report this field for the canonical-splice site (GT/AG), but not for non-canonical splice sites.
+ You can direct HISAT2 not to output such alignments (involving non-canonical splice sites) using "--pen-noncansplice 1000000".
+
+ NH:i:
+
+ The number of mapped locations for the read or the pair.
+
+ Zs:Z:
+
+ When the alignment of a read involves SNPs that are in the index, this field is used to indicate where exactly the read involves the SNPs.
+ This optional field is similar to the above MD:Z field.
+ For example, `Zs:Z:1|S|rs3747203,97|S|rs16990981` indicates the second base of the read corresponds to a known SNP (ID: rs3747203).
+ After a further 97 bases, counted from the third base (the base right after the first SNP), the 100th base of the read involves another known SNP (ID: rs16990981).
+ 'S' indicates a single nucleotide polymorphism. 'D' and 'I' indicate a deletion and an insertion, respectively.
+
+[SAM format specification]: http://samtools.sf.net/SAM1.pdf
+[FASTQ]: http://en.wikipedia.org/wiki/FASTQ_format
+
+The `hisat2-build` indexer
+===========================
+
+`hisat2-build` builds a HISAT2 index from a set of DNA sequences.
+`hisat2-build` outputs a set of 8 files with suffixes `.1.ht2`, `.2.ht2`,
+`.3.ht2`, `.4.ht2`, `.5.ht2`, `.6.ht2`, `.7.ht2`, and `.8.ht2`. In the case of a large
+index these suffixes will have a `ht2l` termination. These files together
+constitute the index: they are all that is needed to align reads to that
+reference. The original sequence FASTA files are no longer used by HISAT2
+once the index is built.
+
+Use of Karkkainen's [blockwise algorithm] allows `hisat2-build` to trade off
+between running time and memory usage. `hisat2-build` has three options
+governing how it makes this trade: `-p`/`--packed`, `--bmax`/`--bmaxdivn`,
+and `--dcv`. By default, `hisat2-build` will automatically search for the
+settings that yield the best running time without exhausting memory. This
+behavior can be disabled using the `-a`/`--noauto` option.
+
+The indexer provides options pertaining to the "shape" of the index, e.g.
+`--offrate` governs the fraction of [Burrows-Wheeler]
+rows that are "marked" (i.e., the density of the suffix-array sample; see the
+original [FM Index] paper for details). All of these options are potentially
+profitable trade-offs depending on the application. They have been set to
+defaults that are reasonable for most cases according to our experiments. See
+[Performance tuning] for details.
+
+`hisat2-build` can generate either [small or large indexes]. The wrapper
+will decide which based on the length of the input genome. If the reference
+does not exceed 4 billion characters but a large index is preferred, the user
+can specify `--large-index` to force `hisat2-build` to build a large index
+instead.
+
+The HISAT2 index is based on the [FM Index] of Ferragina and Manzini, which in
+turn is based on the [Burrows-Wheeler] transform. The algorithm used to build
+the index is based on the [blockwise algorithm] of Karkkainen.
+
+[Blockwise algorithm]: http://portal.acm.org/citation.cfm?id=1314852
+[Burrows-Wheeler]: http://en.wikipedia.org/wiki/Burrows-Wheeler_transform
+
+Command Line
+------------
+
+Usage:
+
+ hisat2-build [options]* <reference_in> <ht2_base>
+
+### Notes
+ If you use --snp, --ss, and/or --exon, hisat2-build will need about 200GB RAM for the human genome size as index building involves a graph construction.
+ Otherwise, you will be able to build an index on your desktop with 8GB RAM.
+
+### Main arguments
+
+A comma-separated list of FASTA files containing the reference sequences to be
+aligned to, or, if `-c` is specified, the sequences
+themselves. E.g., `<reference_in>` might be `chr1.fa,chr2.fa,chrX.fa,chrY.fa`,
+or, if `-c` is specified, this might be
+`GGTCATCCT,ACGGGTCGT,CCGTTCTATGCGGCTTA`.
+
+The basename of the index files to write. By default, `hisat2-build` writes
+files named `NAME.1.ht2`, `NAME.2.ht2`, `NAME.3.ht2`, `NAME.4.ht2`,
+`NAME.5.ht2`, `NAME.6.ht2`, `NAME.7.ht2`, and `NAME.8.ht2` where `NAME` is `<ht2_base>`.
+
+### Options
+
+ -f
+
+The reference input files (specified as `<reference_in>`) are FASTA files
+(usually having extension `.fa`, `.mfa`, `.fna` or similar).
+
+ -c
+
+The reference sequences are given on the command line. I.e. `<reference_in>` is
+a comma-separated list of sequences rather than a list of FASTA files.
+
+ --large-index
+
+Force `hisat2-build` to build a [large index], even if the reference is less
+than ~ 4 billion nucleotides long.
+
+ -a/--noauto
+
+Disable the default behavior whereby `hisat2-build` automatically selects
+values for the `--bmax`, `--dcv` and `--packed` parameters according to
+available memory. Instead, user may specify values for those parameters. If
+memory is exhausted during indexing, an error message will be printed; it is up
+to the user to try new parameters.
+
+ --bmax
+
+The maximum number of suffixes allowed in a block. Allowing more suffixes per
+block makes indexing faster, but increases peak memory usage. Setting this
+option overrides any previous setting for `--bmax`, or `--bmaxdivn`.
+Default (in terms of the `--bmaxdivn` parameter) is `--bmaxdivn` 4. This is
+configured automatically by default; use `-a`/`--noauto` to configure manually.
+
+ --bmaxdivn
+
+The maximum number of suffixes allowed in a block, expressed as a fraction of
+the length of the reference. Setting this option overrides any previous setting
+for `--bmax`, or `--bmaxdivn`. Default: `--bmaxdivn` 4. This is
+configured automatically by default; use `-a`/`--noauto` to configure manually.
+
+ --dcv
+
+Use `<int>` as the period for the difference-cover sample. A larger period
+yields less memory overhead, but may make suffix sorting slower, especially if
+repeats are present. Must be a power of 2 no greater than 4096. Default: 1024.
+ This is configured automatically by default; use `-a`/`--noauto` to configure
+manually.
+
+ --nodc
+
+Disable use of the difference-cover sample. Suffix sorting becomes
+quadratic-time in the worst case (where the worst case is an extremely
+repetitive reference). Default: off.
+
+ -r/--noref
+
+Do not build the `NAME.3.ht2` and `NAME.4.ht2` portions of the index, which
+contain a bitpacked version of the reference sequences and are used for
+paired-end alignment.
+
+ -3/--justref
+
+Build only the `NAME.3.ht2` and `NAME.4.ht2` portions of the index, which
+contain a bitpacked version of the reference sequences and are used for
+paired-end alignment.
+
+ -o/--offrate
+
+To map alignments back to positions on the reference sequences, it's necessary
+to annotate ("mark") some or all of the [Burrows-Wheeler] rows with their
+corresponding location on the genome.
+`-o`/`--offrate` governs how many rows get marked:
+the indexer will mark every 2^`<int>` rows. Marking more rows makes
+reference-position lookups faster, but requires more memory to hold the
+annotations at runtime. The default is 4 (every 16th row is marked; for human
+genome, annotations occupy about 680 megabytes).
+
+ -t/--ftabchars
+
+The ftab is the lookup table used to calculate an initial [Burrows-Wheeler]
+range with respect to the first `<int>` characters of the query. A larger
+`<int>` yields a larger lookup table but faster query times. The ftab has size
+4^(`<int>`+1) bytes. The default setting is 10 (ftab is 4MB).
+
+ --localoffrate
+
+This option governs how many rows get marked in a local index:
+the indexer will mark every 2^`<int>` rows. Marking more rows makes
+reference-position lookups faster, but requires more memory to hold the
+annotations at runtime. The default is 3 (every 8th row is marked,
+this occupies about 16KB per local index).
+
+ --localftabchars
+
+The local ftab is the lookup table in a local index.
+The default setting is 6 (ftab is 8KB per local index).
+
+ -p
+
+Launch `NTHREADS` parallel build threads (default: 1).
+
+ --snp
+
+Provide a list of SNPs (in the HISAT2's own format) as follows (five columns).
+
+ SNP ID `<tab>` snp type (single, deletion, or insertion) `<tab>` chromosome name `<tab>` zero-offset based genomic position of a SNP `<tab>` alternative base (single), the length of SNP (deletion), or insertion sequence (insertion)
+
+ For example,
+ rs58784443 single 13 18447947 T
+
+Use `hisat2_extract_snps_haplotypes_UCSC.py` (in the HISAT2 package) to extract SNPs and haplotypes from a dbSNP file (e.g. http://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/snp144Common.txt.gz),
+or `hisat2_extract_snps_haplotypes_VCF.py` to extract SNPs and haplotypes from a VCF file (e.g. ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/supporting/GRCh38_positions/ALL.chr22.phase3_shapeit2_mvncall_integrated_v3plus_nounphased.rsID.genotypes.GRCh38_dbSNP_no_SVs.vcf.gz).
+
+ --haplotype
+
+Provide a list of haplotypes (in the HISAT2's own format) as follows (five columns).
+
+ Haplotype ID `<tab>` chromosome name `<tab>` zero-offset based left coordinate of haplotype `<tab>` zero-offset based right coordinate of haplotype `<tab>` a comma separated list of SNP ids in the haplotype
+
+ For example,
+ ht35 13 18446877 18446945 rs12381094,rs12381056,rs192016659,rs538569910
+
+See the above option, --snp, about how to extract haplotypes. This option is not required, but haplotype information can keep the index construction from exploding and reduce the index size substantially.
+
+ --ss
+
+Note this option should be used with the following --exon option.
+Provide a list of splice sites (in the HISAT2's own format) as follows (four columns).
+
+ chromosome name `<tab>` zero-offset based genomic position of the flanking base on the left side of an intron `<tab>` zero-offset based genomic position of the flanking base on the right `<tab>` strand
+
+Use `hisat2_extract_splice_sites.py` (in the HISAT2 package) to extract splice sites from a GTF file.
+
+ --exon
+
+Note this option should be used with the above --ss option.
+Provide a list of exons (in the HISAT2's own format) as follows (three columns).
+
+ chromosome name `<tab>` zero-offset based left genomic position of an exon `<tab>` zero-offset based right genomic position of an exon
+
+Use `hisat2_extract_exons.py` (in the HISAT2 package) to extract exons from a GTF file.
+
+ --seed
+
+Use `<int>` as the seed for the pseudo-random number generator.
+
+ --cutoff
+
+Index only the first `<int>` bases of the reference sequences (cumulative across
+sequences) and ignore the rest.
+
+ -q/--quiet
+
+`hisat2-build` is verbose by default. With this option `hisat2-build` will
+print only error messages.
+
+ -h/--help
+
+Print usage information and quit.
+
+ --version
+
+Print version information and quit.
+
+The `hisat2-inspect` index inspector
+=====================================
+
+`hisat2-inspect` extracts information from a HISAT2 index about what kind of
+index it is and what reference sequences were used to build it. When run without
+any options, the tool will output a FASTA file containing the sequences of the
+original references (with all non-`A`/`C`/`G`/`T` characters converted to `N`s).
+ It can also be used to extract just the reference sequence names using the
+`-n`/`--names` option or a more verbose summary using the `-s`/`--summary`
+option.
+
+Command Line
+------------
+
+Usage:
+
+ hisat2-inspect [options]* <ht2_base>
+
+### Main arguments
+
+The basename of the index to be inspected. The basename is the name of any of the
+index files but with the `.X.ht2` suffix omitted.
+`hisat2-inspect` first looks in the current directory for the index files, then
+in the directory specified in the `HISAT2_INDEXES` environment variable.
+
+### Options
+
+ -a/--across
+
+When printing FASTA output, output a newline character every `<int>` bases
+(default: 60).
+
+ -n/--names
+
+Print reference sequence names, one per line, and quit.
+
+ -s/--summary
+
+Print a summary that includes information about index settings, as well as the
+names and lengths of the input sequences. The summary has this format:
+
+ Colorspace <0 or 1>
+ SA-Sample 1 in <sample>
+ FTab-Chars <chars>
+ Sequence-1 <name> <len>
+ Sequence-2 <name> <len>
+ ...
+ Sequence-N <name> <len>
+
+Fields are separated by tabs. Colorspace is always set to 0 for HISAT2.
+
+ --snp
+
+Print SNPs, and quit.
+
+ --ss
+
+Print splice sites, and quit.
+
+ --ss-all
+
+Print splice sites including those not in the global index, and quit.
+
+ --exon
+
+Print exons, and quit.
+
+ -v/--verbose
+
+Print verbose output (for debugging).
+
+ --version
+
+Print version information and quit.
+
+ -h/--help
+
+Print usage information and quit.
+
+Getting started with HISAT2
+===================================================
+
+HISAT2 comes with some example files to get you started. The example files
+are not scientifically significant; these files will simply let you start running HISAT2 and
+downstream tools right away.
+
+First follow the manual instructions to [obtain HISAT2]. Set the `HISAT2_HOME`
+environment variable to point to the new HISAT2 directory containing the
+`hisat2`, `hisat2-build` and `hisat2-inspect` binaries. This is important,
+as the `HISAT2_HOME` variable is used in the commands below to refer to that
+directory.
+
+Indexing a reference genome
+---------------------------
+
+To create an index for the genomic region (1 million bps from the human chromosome 22 between 20,000,000 and 20,999,999)
+included with HISAT2, create a new temporary directory (it doesn't matter where), change into that directory, and run:
+
+ $HISAT2_HOME/hisat2-build $HISAT2_HOME/example/reference/22_20-21M.fa --snp $HISAT2_HOME/example/reference/22_20-21M.snp 22_20-21M_snp
+
+The command should print many lines of output then quit. When the command
+completes, the current directory will contain eight new files that all start with
+`22_20-21M_snp` and end with `.1.ht2`, `.2.ht2`, `.3.ht2`, `.4.ht2`, `.5.ht2`, `.6.ht2`,
+`.7.ht2`, and `.8.ht2`. These files constitute the index - you're done!
+
+You can use `hisat2-build` to create an index for a set of FASTA files obtained
+from any source, including sites such as [UCSC], [NCBI], and [Ensembl]. When
+indexing multiple FASTA files, specify all the files using commas to separate
+file names. For more details on how to create an index with `hisat2-build`,
+see the [manual section on index building]. You may also want to bypass this
+process by obtaining a pre-built index.
+
+[UCSC]: http://genome.ucsc.edu/cgi-bin/hgGateway
+[NCBI]: http://www.ncbi.nlm.nih.gov/sites/genome
+[Ensembl]: http://www.ensembl.org/
+
+Aligning example reads
+----------------------
+
+Stay in the directory created in the previous step, which now contains the
+`22_20-21M_snp` index files. Next, run:
+
+ $HISAT2_HOME/hisat2 -f -x $HISAT2_HOME/example/index/22_20-21M_snp -U $HISAT2_HOME/example/reads/reads_1.fa -S eg1.sam
+
+This runs the HISAT2 aligner, which aligns a set of unpaired reads to the
+genome region using the index generated in the previous step.
+The alignment results in SAM format are written to the file `eg1.sam`, and a
+short alignment summary is written to the console. (Actually, the summary is
+written to the "standard error" or "stderr" filehandle, which is typically
+printed to the console.)
+
+To see the first few lines of the SAM output, run:
+
+ head eg1.sam
+
+You will see something like this:
+
+ @HD VN:1.0 SO:unsorted
+ @SQ SN:22:20000001-21000000 LN:1000000
+ @PG ID:hisat2 PN:hisat2 VN:2.0.0-beta
+ 1 0 22:20000001-21000000 397984 255 100M * 0 0 GCCTGTGAGGGAGCCCCGGACCCGGTCAGAGCAGGAGCCTGGCCTGGGGCCAAGTTCACCTTATGGACTCTCTTCCCTGCCCTTCCAGGAGCAGCTCACT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU NH:i:1
+ 2 16 22:20000001-21000000 398131 255 100M * 0 0 ATGACACACTGTACACACCAGGGGCCCTGTGCTCCCCAGGAAGAGGGCCCTCACTTGAAGCGGGGCCCGATGGCCGCCACGTGCCGGTTCATGCTCCCCT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:80A19 YT:Z:UU NH:i:1 Zs:Z:80|S|rs576159895
+ 3 16 22:20000001-21000000 398222 255 100M * 0 0 TGCTCCCCTTGGCCCCGCCGATGTTCAGGGACATGGAGCGCTGCAGCAGGCTGGAGAAGATCTCCACTTGGTCAGAGCTGCAGTACTTGGCGATCTCAAA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:16A83 YT:Z:UU NH:i:1 Zs:Z:16|S|rs2629364
+ 4 16 22:20000001-21000000 398247 255 90M200N10M * 0 0 CAGGGACATGGAGCGCTGCAGCAGGCTGGAGAAGATCTCCACTTGGTCAGAGCTGCAGTACTTGGCGATCTCAAACCGCTGCACCAGGAAGTCGATCCAG IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU XS:A:- NH:i:1
+ 5 16 22:20000001-21000000 398194 255 100M * 0 0 GGCCCGATGGCCGCCACGTGCCGGTTCATGCTCCCCTTGGCCCCGCCGATGTTCAGGGACATGGAGCGCTGCAGCAGGCTGGAGAAGATCTCCACTTGGT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:17A26A55 YT:Z:UU NH:i:1 Zs:Z:17|S|rs576159895,26|S|rs2629364
+ 6 0 22:20000001-21000000 398069 255 100M * 0 0 CAGGAGCAGCTCACTGAAATGTGTTCCCCGTCTACAGAAGTACCGTGATACACAGACGCCCCATGACACACTGTACACACCAGGGGCCCTGTGCTCCCCA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU NH:i:1
+ 7 0 22:20000001-21000000 397896 255 100M * 0 0 GTGGAGTAGATCTTCTCGCGAAGCACATTGCAGATGGTTGCATTTGGAACCACATCGGCATGCAGGAGGGACAGCCCCAGGGTCAGCAGCCTGTGAGGGA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:31G68 YT:Z:UU NH:i:1 Zs:Z:31|S|rs562662261
+ 8 0 22:20000001-21000000 398150 255 100M * 0 0 AGGGGCCCTGTGCTCCCCAGGAAGAGGGCCCTCACTTGAAGCGGGGCCCGATGGCCGCCACGTGCCGGTTCATGCTCCCCTTGGCCCCGCCGATGTTCAG IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:61A26A11 YT:Z:UU NH:i:1 Zs:Z:61|S|rs576159895,26|S|rs2629364
+ 9 16 22:20000001-21000000 398329 255 8M200N92M * 0 0 ACCAGGAAGTCGATCCAGATGTAGTGGGGGGTCACTTCGGGGGGACAGGGTTTGGGTTGACTTGCTTCCGAGGCAGCCAGGGGGTCTGCTTCCTTTATCT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU XS:A:- NH:i:1
+ 10 16 22:20000001-21000000 398184 255 100M * 0 0 CTTGAAGCGGGGCCCGATGGCCGCCACGTGCCGGTTCATGCTCCCCTTGGCCCCGCCGATGTTCAGGGACATGGAGCGCTGCAGCAGGCTGGAGAAGATC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:27A26A45 YT:Z:UU NH:i:1 Zs:Z:27|S|rs576159895,26|S|rs2629364
+
+The first few lines (beginning with `@`) are SAM header lines, and the rest of
+the lines are SAM alignments, one line per read or mate. See the [HISAT2
+manual section on SAM output] and the [SAM specification] for details about how
+to interpret the SAM file format.
+
+Paired-end example
+------------------
+
+To align paired-end reads included with HISAT2, stay in the same directory and
+run:
+
+ $HISAT2_HOME/hisat2 -f -x $HISAT2_HOME/example/index/22_20-21M_snp -1 $HISAT2_HOME/example/reads/reads_1.fa -2 $HISAT2_HOME/example/reads/reads_2.fa -S eg2.sam
+
+This aligns a set of paired-end reads to the reference genome, with results
+written to the file `eg2.sam`.
+
+Using SAMtools/BCFtools downstream
+----------------------------------
+
+[SAMtools] is a collection of tools for manipulating and analyzing SAM and BAM
+alignment files. [BCFtools] is a collection of tools for calling variants and
+manipulating VCF and BCF files, and it is typically distributed with [SAMtools].
+Using these tools together allows you to get from alignments in SAM format to
+variant calls in VCF format. This example assumes that `samtools` and
+`bcftools` are installed and that the directories containing these binaries are
+in your [PATH environment variable].
+
+Run the paired-end example:
+
+ $HISAT2_HOME/hisat2 -f -x $HISAT2_HOME/example/index/22_20-21M_snp -1 $HISAT2_HOME/example/reads/reads_1.fa -2 $HISAT2_HOME/example/reads/reads_2.fa -S eg2.sam
+
+Use `samtools view` to convert the SAM file into a BAM file. BAM is the
+binary format corresponding to the SAM text format. Run:
+
+ samtools view -bS eg2.sam > eg2.bam
+
+Use `samtools sort` to convert the BAM file to a sorted BAM file. The following command requires samtools version 1.2 or higher.
+
+ samtools sort eg2.bam -o eg2.sorted.bam
+
+We now have a sorted BAM file called `eg2.sorted.bam`. Sorted BAM is a useful
+format because the alignments are (a) compressed, which is convenient for
+long-term storage, and (b) sorted, which is convenient for variant discovery.
+To generate variant calls in VCF format, run:
+
+ samtools mpileup -uf $HISAT2_HOME/example/reference/22_20-21M.fa eg2.sorted.bam | bcftools view -bvcg - > eg2.raw.bcf
+
+Then to view the variants, run:
+
+ bcftools view eg2.raw.bcf
+
+See the official SAMtools guide to [Calling SNPs/INDELs with SAMtools/BCFtools]
+for more details and variations on this process.
+
+[BCFtools]: http://samtools.sourceforge.net/mpileup.shtml
+[Calling SNPs/INDELs with SAMtools/BCFtools]: http://samtools.sourceforge.net/mpileup.shtml
diff --git a/MANUAL.markdown b/MANUAL.markdown
new file mode 100644
index 0000000..a88b0f6
--- /dev/null
+++ b/MANUAL.markdown
@@ -0,0 +1,2437 @@
+
+
+Introduction
+============
+
+What is HISAT2?
+-----------------
+
+HISAT2 is a fast and sensitive alignment program for mapping next-generation sequencing reads
+(whole-genome, transcriptome, and exome sequencing data) against the general human population
+(as well as against a single reference genome). Based on [GCSA] (an extension of [BWT] for a graph), we designed and implemented a graph FM index (GFM),
+an original approach and its first implementation to the best of our knowledge.
+In addition to using one global GFM index that represents the general population,
+HISAT2 uses a large set of small GFM indexes that collectively cover the whole genome
+(each index representing a genomic region of 56 Kbp, with 55,000 indexes needed to cover the human population).
+These small indexes (called local indexes) combined with several alignment strategies enable effective alignment of sequencing reads.
+This new indexing scheme is called Hierarchical Graph FM index (HGFM).
+We have developed HISAT2 based on the [HISAT] and [Bowtie2] implementations.
+HISAT2 outputs alignments in [SAM] format, enabling interoperation with a large number of other tools (e.g. [SAMtools], [GATK]) that use SAM.
+HISAT2 is distributed under the [GPLv3 license], and it runs on the command line under
+Linux, Mac OS X and Windows.
+
+[HISAT2]: http://ccb.jhu.edu/software/hisat2
+[HISAT]: http://ccb.jhu.edu/software/hisat
+[Bowtie2]: http://bowtie-bio.sf.net/bowtie2
+[Bowtie]: http://bowtie-bio.sf.net
+[Bowtie1]: http://bowtie-bio.sf.net
+[GCSA]: http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=6698337&tag=1
+[Burrows-Wheeler Transform]: http://en.wikipedia.org/wiki/Burrows-Wheeler_transform
+[BWT]: http://en.wikipedia.org/wiki/Burrows-Wheeler_transform
+[FM Index]: http://en.wikipedia.org/wiki/FM-index
+[SAM]: http://samtools.sourceforge.net/SAM1.pdf
+[SAMtools]: http://samtools.sourceforge.net
+[GATK]: http://www.broadinstitute.org/gsa/wiki/index.php/The_Genome_Analysis_Toolkit
+[TopHat2]: http://ccb.jhu.edu/software/tophat
+[Cufflinks]: http://cufflinks.cbcb.umd.edu/
+[Crossbow]: http://bowtie-bio.sf.net/crossbow
+[Myrna]: http://bowtie-bio.sf.net/myrna
+[Bowtie paper]: http://genomebiology.com/2009/10/3/R25
+[GPLv3 license]: http://www.gnu.org/licenses/gpl-3.0.html
+
+
+Obtaining HISAT2
+==================
+
+Download HISAT2 sources and binaries from the Releases sections on the right side.
+Binaries are available for Intel architectures (`x86_64`) running Linux, and Mac OS X.
+
+Building from source
+--------------------
+
+Building HISAT2 from source requires a GNU-like environment with GCC, GNU Make
+and other basics. It should be possible to build HISAT2 on most vanilla Linux
+installations or on a Mac installation with [Xcode] installed. HISAT2 can
+also be built on Windows using [Cygwin] or [MinGW] (MinGW recommended). For a
+MinGW build the choice of what compiler is to be used is important since this
+will determine if a 32 or 64 bit code can be successfully compiled using it. If
+there is a need to generate both 32 and 64 bit on the same machine then a multilib
+MinGW has to be properly installed. [MSYS], the [zlib] library, and depending on
+architecture [pthreads] library are also required. We are recommending a 64 bit
+build since it has some clear advantages in real life research problems. In order
+to simplify the MinGW setup it might be worth investigating popular MinGW personal
+builds since these are coming already prepared with most of the toolchains needed.
+
+First, download the [source package] from the Releases section on the right side.
+Unzip the file, change to the unzipped directory, and build the
+HISAT2 tools by running GNU `make` (usually with the command `make`, but
+sometimes with `gmake`) with no arguments. If building with MinGW, run `make`
+from the MSYS environment.
+
+HISAT2 is using the multithreading software model in order to speed up
+execution times on SMP architectures where this is possible. On POSIX
+platforms (like Linux, Mac OS, etc.) it needs the pthread library. Although
+it is possible to use pthread library on non-POSIX platform like Windows, due
+to performance reasons HISAT2 will try to use Windows native multithreading
+if possible.
+
+For the support of SRA data access in HISAT2, please download and install the [NCBI-NGS] toolkit.
+When running `make`, specify additional variables as follows:
+`make USE_SRA=1 NCBI_NGS_DIR=/path/to/NCBI-NGS-directory NCBI_VDB_DIR=/path/to/NCBI-NGS-directory`,
+where `NCBI_NGS_DIR` and `NCBI_VDB_DIR` will be used in Makefile for -I and -L compilation options.
+For example, $(NCBI_NGS_DIR)/include and $(NCBI_NGS_DIR)/lib64 will be used.
+
+[Cygwin]: http://www.cygwin.com/
+[MinGW]: http://www.mingw.org/
+[MSYS]: http://www.mingw.org/wiki/msys
+[zlib]: http://cygwin.com/packages/mingw-zlib/
+[pthreads]: http://sourceware.org/pthreads-win32/
+[GnuWin32]: http://gnuwin32.sf.net/packages/coreutils.htm
+[Download]: https://sourceforge.net/projects/bowtie-bio/files/bowtie2/
+[sourceforge site]: https://sourceforge.net/projects/bowtie-bio/files/bowtie2/
+[source package]: http://ccb.jhu.edu/software/hisat2/downloads/hisat2-2.0.0-beta-source.zip
+[Xcode]: http://developer.apple.com/xcode/
+[NCBI-NGS]: https://github.com/ncbi/ngs/wiki/Downloads
+
+Running HISAT2
+=============
+
+Adding to PATH
+--------------
+
+By adding your new HISAT2 directory to your [PATH environment variable], you
+ensure that whenever you run `hisat2`, `hisat2-build` or `hisat2-inspect`
+from the command line, you will get the version you just installed without
+having to specify the entire path. This is recommended for most users. To do
+this, follow your operating system's instructions for adding the directory to
+your [PATH].
+
+If you would like to install HISAT2 by copying the HISAT2 executable files
+to an existing directory in your [PATH], make sure that you copy all the
+executables, including `hisat2`, `hisat2-align-s`, `hisat2-align-l`, `hisat2-build`, `hisat2-build-s`, `hisat2-build-l`, `hisat2-inspect`, `hisat2-inspect-s` and
+`hisat2-inspect-l`.
+
+[PATH environment variable]: http://en.wikipedia.org/wiki/PATH_(variable)
+[PATH]: http://en.wikipedia.org/wiki/PATH_(variable)
+
+Reporting
+---------
+
+The reporting mode governs how many alignments HISAT2 looks for, and how to
+report them.
+
+In general, when we say that a read has an alignment, we mean that it has a
+[valid alignment]. When we say that a read has multiple alignments, we mean
+that it has multiple alignments that are valid and distinct from one another.
+
+[valid alignment]: #valid-alignments-meet-or-exceed-the-minimum-score-threshold
+
+By default, HISAT2 may soft-clip reads near their 5' and 3' ends. Users can control this behavior by setting different penalties for soft-clipping ([`--sp`]) or by disallowing soft-clipping ([`--no-softclip`]).
+
+### Distinct alignments map a read to different places
+
+Two alignments for the same individual read are "distinct" if they map the same
+read to different places. Specifically, we say that two alignments are distinct
+if there are no alignment positions where a particular read offset is aligned
+opposite a particular reference offset in both alignments with the same
+orientation. E.g. if the first alignment is in the forward orientation and
+aligns the read character at read offset 10 to the reference character at
+chromosome 3, offset 3,445,245, and the second alignment is also in the forward
+orientation and also aligns the read character at read offset 10 to the
+reference character at chromosome 3, offset 3,445,245, they are not distinct
+alignments.
+
+Two alignments for the same pair are distinct if either the mate 1s in the two
+paired-end alignments are distinct or the mate 2s in the two alignments are
+distinct or both.
+
+### Default mode: search for one or more alignments, report each
+
+HISAT2 searches for up to N distinct, primary alignments for
+each read, where N equals the integer specified with the `-k` parameter.
+Primary alignments are alignments whose alignment score is equal to or higher than that of any other alignment.
+It is possible that multiple distinct alignments have the same score.
+That is, if `-k 2` is specified, HISAT2 will search for at most 2 distinct
+alignments. The alignment score for a paired-end alignment equals the sum of the
+alignment scores of the individual mates. Each reported read or pair alignment
+beyond the first has the SAM 'secondary' bit (which equals 256) set in its FLAGS
+field. See the [SAM specification] for details.
+
+HISAT2 does not "find" alignments in any specific order, so for reads that
+have more than N distinct, valid alignments, HISAT2 does not guarantee that
+the N alignments reported are the best possible in terms of alignment score.
+Still, this mode can be effective and fast in situations where the user cares
+more about whether a read aligns (or aligns a certain number of times) than
+where exactly it originated.
+
+
+[SAM specification]: http://samtools.sourceforge.net/SAM1.pdf
+
+Alignment summary
+------------------
+
+When HISAT2 finishes running, it prints messages summarizing what happened.
+These messages are printed to the "standard error" ("stderr") filehandle. For
+datasets consisting of unpaired reads, the summary might look like this:
+
+ 20000 reads; of these:
+ 20000 (100.00%) were unpaired; of these:
+ 1247 (6.24%) aligned 0 times
+ 18739 (93.69%) aligned exactly 1 time
+ 14 (0.07%) aligned >1 times
+ 93.77% overall alignment rate
+
+For datasets consisting of pairs, the summary might look like this:
+
+ 10000 reads; of these:
+ 10000 (100.00%) were paired; of these:
+ 650 (6.50%) aligned concordantly 0 times
+ 8823 (88.23%) aligned concordantly exactly 1 time
+ 527 (5.27%) aligned concordantly >1 times
+ ----
+ 650 pairs aligned concordantly 0 times; of these:
+ 34 (5.23%) aligned discordantly 1 time
+ ----
+ 616 pairs aligned 0 times concordantly or discordantly; of these:
+ 1232 mates make up the pairs; of these:
+ 660 (53.57%) aligned 0 times
+ 571 (46.35%) aligned exactly 1 time
+ 1 (0.08%) aligned >1 times
+ 96.70% overall alignment rate
+
+The indentation indicates how subtotals relate to totals.
+
+Wrapper
+-------
+
+The `hisat2`, `hisat2-build` and `hisat2-inspect` executables are actually
+wrapper scripts that call binary programs as appropriate. The wrappers shield
+users from having to distinguish between "small" and "large" index formats,
+discussed briefly in the following section. Also, the `hisat2` wrapper
+provides some key functionality, like the ability to handle compressed inputs,
+and the functionality for [`--un`], [`--al`] and related options.
+
+It is recommended that you always run the hisat2 wrappers and not run the
+binaries directly.
+
+Small and large indexes
+-----------------------
+
+`hisat2-build` can index reference genomes of any size. For genomes less than
+about 4 billion nucleotides in length, `hisat2-build` builds a "small" index
+using 32-bit numbers in various parts of the index. When the genome is longer,
+`hisat2-build` builds a "large" index using 64-bit numbers. Small indexes are
+stored in files with the `.ht2` extension, and large indexes are stored in
+files with the `.ht2l` extension. The user need not worry about whether a
+particular index is small or large; the wrapper scripts will automatically build
+and use the appropriate index.
+
+Performance tuning
+------------------
+
+1. If your computer has multiple processors/cores, use `-p`
+
+ The [`-p`] option causes HISAT2 to launch a specified number of parallel
+ search threads. Each thread runs on a different processor/core and all
+ threads find alignments in parallel, increasing alignment throughput by
+ approximately a multiple of the number of threads (though in practice,
+ speedup is somewhat worse than linear).
+
+Command Line
+------------
+
+### Setting function options
+
+Some HISAT2 options specify a function rather than an individual number or
+setting. In these cases the user specifies three parameters: (a) a function
+type `F`, (b) a constant term `B`, and (c) a coefficient `A`. The available
+function types are constant (`C`), linear (`L`), square-root (`S`), and natural
+log (`G`). The parameters are specified as `F,B,A` - that is, the function type,
+the constant term, and the coefficient are separated by commas with no
+whitespace. The constant term and coefficient may be negative and/or
+floating-point numbers.
+
+For example, if the function specification is `L,-0.4,-0.6`, then the function
+defined is:
+
+ f(x) = -0.4 + -0.6 * x
+
+If the function specification is `G,1,5.4`, then the function defined is:
+
+ f(x) = 1.0 + 5.4 * ln(x)
+
+See the documentation for the option in question to learn what the parameter `x`
+is for. For example, in the case of the [`--score-min`] option, the function
+`f(x)` sets the minimum alignment score necessary for an alignment to be
+considered valid, and `x` is the read length.
+
+### Usage
+
+ hisat2 [options]* -x <hisat2-idx> {-1 <m1> -2 <m2> | -U <r> | --sra-acc <SRA accession number>} [-S <hit>]
+
+### Main arguments
+
+
+
+[`-x`]: #hisat2-options-x
+
+ -x
+
+ |
+
+The basename of the index for the reference genome. The basename is the name of
+any of the index files up to but not including the final `.1.ht2` / etc.
+`hisat2` looks for the specified index first in the current directory,
+then in the directory specified in the `HISAT2_INDEXES` environment variable.
+
+ |
+
+[`-1`]: #hisat2-options-1
+
+ -1
+
+ |
+
+Comma-separated list of files containing mate 1s (filename usually includes
+`_1`), e.g. `-1 flyA_1.fq,flyB_1.fq`. Sequences specified with this option must
+correspond file-for-file and read-for-read with those specified in `<m2>`. Reads
+may be a mix of different lengths. If `-` is specified, `hisat2` will read the
+mate 1s from the "standard in" or "stdin" filehandle.
+
+ |
+
+[`-2`]: #hisat2-options-2
+
+ -2
+
+ |
+
+Comma-separated list of files containing mate 2s (filename usually includes
+`_2`), e.g. `-2 flyA_2.fq,flyB_2.fq`. Sequences specified with this option must
+correspond file-for-file and read-for-read with those specified in `<m1>`. Reads
+may be a mix of different lengths. If `-` is specified, `hisat2` will read the
+mate 2s from the "standard in" or "stdin" filehandle.
+
+ |
+
+[`-U`]: #hisat2-options-U
+
+ -U
+
+ |
+
+Comma-separated list of files containing unpaired reads to be aligned, e.g.
+`lane1.fq,lane2.fq,lane3.fq,lane4.fq`. Reads may be a mix of different lengths.
+If `-` is specified, `hisat2` gets the reads from the "standard in" or "stdin"
+filehandle.
+
+ |
+
+[`--sra-acc`]: #hisat2-options-sra-acc
+
+ --sra-acc
+
+ |
+
+Comma-separated list of SRA accession numbers, e.g. `--sra-acc SRR353653,SRR353654`.
+Information about read types is available at http://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?sp=runinfo&acc=sra-acc&retmode=xml,
+where sra-acc is SRA accession number. If users run HISAT2 on a computer cluster, it is recommended to disable SRA-related caching (see the instruction at [SRA-MANUAL]).
+
+[SRA-MANUAL]: https://github.com/ncbi/sra-tools/wiki/Toolkit-Configuration
+
+ |
+
+[`-S`]: #hisat2-options-S
+
+ -S
+
+ |
+
+File to write SAM alignments to. By default, alignments are written to the
+"standard out" or "stdout" filehandle (i.e. the console).
+
+ |
+
+### Options
+
+#### Input options
+
+
+
+
+[`-q`]: #hisat2-options-q
+
+ -q
+
+ |
+
+Reads (specified with `<m1>`, `<m2>`, `<r>`) are FASTQ files. FASTQ files
+usually have extension `.fq` or `.fastq`. FASTQ is the default format. See
+also: [`--solexa-quals`] and [`--int-quals`].
+
+ |
+
+
+[`--qseq`]: #hisat2-options-qseq
+
+ --qseq
+
+ |
+
+Reads (specified with `<m1>`, `<m2>`, `<r>`) are QSEQ files. QSEQ files usually
+end in `_qseq.txt`. See also: [`--solexa-quals`] and [`--int-quals`].
+
+ |
+
+
+[`-f`]: #hisat2-options-f
+
+ -f
+
+ |
+
+Reads (specified with `<m1>`, `<m2>`, `<r>`) are FASTA files. FASTA files
+usually have extension `.fa`, `.fasta`, `.mfa`, `.fna` or similar. FASTA files
+do not have a way of specifying quality values, so when `-f` is set, the result
+is as if `--ignore-quals` is also set.
+
+ |
+
+
+[`-r`]: #hisat2-options-r
+
+ -r
+
+ |
+
+Reads (specified with `<m1>`, `<m2>`, `<r>`) are files with one input sequence
+per line, without any other information (no read names, no qualities). When
+`-r` is set, the result is as if `--ignore-quals` is also set.
+
+ |
+
+
+[`-c`]: #hisat2-options-c
+
+ -c
+
+ |
+
+The read sequences are given on command line. I.e. `<m1>`, `<m2>` and
+`<r>` are comma-separated lists of reads rather than lists of read files.
+There is no way to specify read names or qualities, so `-c` also implies
+`--ignore-quals`.
+
+ |
+
+
+[`-s`/`--skip`]: #hisat2-options-s
+[`-s`]: #hisat2-options-s
+
+ -s/--skip
+
+ |
+
+Skip (i.e. do not align) the first `<int>` reads or pairs in the input.
+
+ |
+
+
+[`-u`/`--qupto`]: #hisat2-options-u
+[`-u`]: #hisat2-options-u
+
+ -u/--qupto
+
+ |
+
+Align the first `<int>` reads or read pairs from the input (after the
+[`-s`/`--skip`] reads or pairs have been skipped), then stop. Default: no limit.
+
+ |
+
+
+[`-5`/`--trim5`]: #hisat2-options-5
+[`-5`]: #hisat2-options-5
+
+ -5/--trim5
+
+ |
+
+Trim `<int>` bases from 5' (left) end of each read before alignment (default: 0).
+
+ |
+
+
+[`-3`/`--trim3`]: #hisat2-options-3
+[`-3`]: #hisat2-options-3
+
+ -3/--trim3
+
+ |
+
+Trim `<int>` bases from 3' (right) end of each read before alignment (default:
+0).
+
+ |
+
+[`--phred33`]: #hisat2-options-phred33-quals
+
+ --phred33
+
+ |
+
+Input qualities are ASCII chars equal to the [Phred quality] plus 33. This is
+also called the "Phred+33" encoding, which is used by the very latest Illumina
+pipelines.
+
+[Phred quality]: http://en.wikipedia.org/wiki/Phred_quality_score
+
+ |
+
+
+[`--phred64`]: #hisat2-options-phred64-quals
+
+ --phred64
+
+ |
+
+Input qualities are ASCII chars equal to the [Phred quality] plus 64. This is
+also called the "Phred+64" encoding.
+
+ |
+
+
+[`--solexa-quals`]: #hisat2-options-solexa-quals
+
+ --solexa-quals
+
+ |
+
+Convert input qualities from [Solexa][Phred quality] (which can be negative) to
+[Phred][Phred quality] (which can't). This scheme was used in older Illumina GA
+Pipeline versions (prior to 1.3). Default: off.
+
+ |
+
+
+[`--int-quals`]: #hisat2-options-int-quals
+
+ --int-quals
+
+ |
+
+Quality values are represented in the read input file as space-separated ASCII
+integers, e.g., `40 40 30 40`..., rather than ASCII characters, e.g., `II?I`....
+ Integers are treated as being on the [Phred quality] scale unless
+[`--solexa-quals`] is also specified. Default: off.
+
+ |
+
+#### Alignment options
+
+
+
+
+
+[`--n-ceil`]: #hisat2-options-n-ceil
+
+ --n-ceil
+
+ |
+
+Sets a function governing the maximum number of ambiguous characters (usually
+`N`s and/or `.`s) allowed in a read as a function of read length. For instance,
+specifying `L,0,0.15` sets the N-ceiling function `f` to `f(x) = 0 + 0.15 * x`,
+where x is the read length. See also: [setting function options]. Reads
+exceeding this ceiling are [filtered out]. Default: `L,0,0.15`.
+
+[filtered out]: #filtering
+
+ |
+
+
+
+[`--ignore-quals`]: #hisat2-options-ignore-quals
+
+ --ignore-quals
+
+ |
+
+When calculating a mismatch penalty, always consider the quality value at the
+mismatched position to be the highest possible, regardless of the actual value.
+I.e. input is treated as though all quality values are high. This is also the
+default behavior when the input doesn't specify quality values (e.g. in [`-f`],
+[`-r`], or [`-c`] modes).
+
+ |
+
+
+[`--nofw`]: #hisat2-options-nofw
+
+ --nofw/--norc
+
+ |
+
+If `--nofw` is specified, `hisat2` will not attempt to align unpaired reads to
+the forward (Watson) reference strand. If `--norc` is specified, `hisat2` will
+not attempt to align unpaired reads against the reverse-complement (Crick)
+reference strand. In paired-end mode, `--nofw` and `--norc` pertain to the
+fragments; i.e. specifying `--nofw` causes `hisat2` to explore only those
+paired-end configurations corresponding to fragments from the reverse-complement
+(Crick) strand. Default: both strands enabled.
+
+ |
+
+
+
+#### Scoring options
+
+
+
+
+
+[`--mp`]: #hisat2-options-mp
+
+ --mp MX,MN
+
+ |
+
+Sets the maximum (`MX`) and minimum (`MN`) mismatch penalties, both integers. A
+number less than or equal to `MX` and greater than or equal to `MN` is
+subtracted from the alignment score for each position where a read character
+aligns to a reference character, the characters do not match, and neither is an
+`N`. If [`--ignore-quals`] is specified, the number subtracted equals `MX`.
+Otherwise, the number subtracted is `MN + floor( (MX-MN)(MIN(Q, 40.0)/40.0) )`
+where Q is the Phred quality value. Default: `MX` = 6, `MN` = 2.
+
+ |
+
+
+[`--sp`]: #hisat2-options-sp
+
+ --sp MX,MN
+
+ |
+
+Sets the maximum (`MX`) and minimum (`MN`) penalties for soft-clipping per base,
+both integers. A number less than or equal to `MX` and greater than or equal to `MN` is
+subtracted from the alignment score for each position.
+The number subtracted is `MN + floor( (MX-MN)(MIN(Q, 40.0)/40.0) )`
+where Q is the Phred quality value. Default: `MX` = 2, `MN` = 1.
+
+ |
+
+
+[`--no-softclip`]: #hisat2-options-no-softclip
+
+ --no-softclip
+
+ |
+
+Disallow soft-clipping.
+
+ |
+
+
+
+[`--np`]: #hisat2-options-np
+
+ --np
+
+ |
+
+Sets penalty for positions where the read, reference, or both, contain an
+ambiguous character such as `N`. Default: 1.
+
+ |
+
+
+[`--rdg`]: #hisat2-options-rdg
+
+ --rdg ,
+
+ |
+
+Sets the read gap open (`<int1>`) and extend (`<int2>`) penalties. A read gap of
+length N gets a penalty of `<int1>` + N * `<int2>`. Default: 5, 3.
+
+ |
+
+
+[`--rfg`]: #hisat2-options-rfg
+
+ --rfg ,
+
+ |
+
+Sets the reference gap open (`<int1>`) and extend (`<int2>`) penalties. A
+reference gap of length N gets a penalty of `<int1>` + N * `<int2>`. Default:
+5, 3.
+
+ |
+
+
+[`--score-min`]: #hisat2-options-score-min
+
+ --score-min
+
+ |
+
+Sets a function governing the minimum alignment score needed for an alignment to
+be considered "valid" (i.e. good enough to report). This is a function of read
+length. For instance, specifying `L,0,-0.6` sets the minimum-score function `f`
+to `f(x) = 0 + -0.6 * x`, where `x` is the read length. See also: [setting
+function options]. The default is `L,0,-0.2`.
+
+ |
+
+
+#### Spliced alignment options
+
+
+
+
+
+[`--pen-cansplice`]: #hisat2-options-pen-cansplice
+
+ --pen-cansplice
+
+ |
+
+Sets the penalty for each pair of canonical splice sites (e.g. GT/AG). Default: 0.
+
+ |
+
+
+
+[`--pen-noncansplice`]: #hisat2-options-pen-noncansplice
+
+ --pen-noncansplice
+
+ |
+
+Sets the penalty for each pair of non-canonical splice sites (e.g. non-GT/AG). Default: 12.
+
+ |
+
+
+[`--pen-canintronlen`]: #hisat2-options-pen-canintronlen
+
+ --pen-canintronlen
+
+ |
+
+Sets the penalty for long introns with canonical splice sites so that alignments with shorter introns are preferred
+to those with longer ones. Default: G,-8,1
+
+ |
+
+
+[`--pen-noncanintronlen`]: #hisat2-options-pen-noncanintronlen
+
+ --pen-noncanintronlen
+
+ |
+
+Sets the penalty for long introns with noncanonical splice sites so that alignments with shorter introns are preferred
+to those with longer ones. Default: G,-8,1
+
+
+ |
+
+
+[`--min-intronlen`]: #hisat2-options-min-intronlen
+
+ --min-intronlen
+
+ |
+
+Sets minimum intron length. Default: 20
+
+ |
+
+
+
+
+[`--max-intronlen`]: #hisat2-options-max-intronlen
+
+ --max-intronlen
+
+ |
+
+Sets maximum intron length. Default: 500000
+
+ |
+
+
+
+[`--splice-infile`]: #hisat2-options-known-splicesite-infile
+
+ --known-splicesite-infile
+
+ |
+
+With this mode, you can provide a list of known splice sites, which HISAT2 makes use of to align reads with small anchors.
+You can create such a list using `python hisat2_extract_splice_sites.py genes.gtf > splicesites.txt`,
+where `hisat2_extract_splice_sites.py` is included in the HISAT2 package, `genes.gtf` is a gene annotation file,
+and `splicesites.txt` is a list of splice sites with which you provide HISAT2 in this mode.
+Note that using indexes built with annotated transcripts (such as genome_tran or genome_snp_tran) works better
+than using this option. Providing splice sites that are already included in the indexes has no effect.
+
+ |
+
+
+
+[`--novel-splicesite-outfile`]: #hisat2-options-novel-splicesite-outfile
+
+ --novel-splicesite-outfile
+
+ |
+
+In this mode, HISAT2 reports a list of splice sites in the file `<path>`:
+ chromosome name `<tab>` genomic position of the flanking base on the left side of an intron `<tab>` genomic position of the flanking base on the right `<tab>` strand (+, -, and .)
+ '.' indicates an unknown strand for non-canonical splice sites.
+
+ |
+
+
+
+[`--novel-splicesite-infile`]: #hisat2-options-novel-splicesite-infile
+
+ --novel-splicesite-infile
+
+ |
+
+With this mode, you can provide a list of novel splice sites that were generated from the above option "--novel-splicesite-outfile".
+
+ |
+
+
+
+[`--no-temp-splicesite`]: #hisat2-options-no-temp-splicesite
+
+ --no-temp-splicesite
+
+ |
+
+HISAT2, by default, makes use of splice sites found by earlier reads to align later reads in the same run,
+in particular, reads with small anchors (<= 15 bp).
+The option disables this default alignment strategy.
+
+ |
+
+
+
+[`--no-spliced-alignment`]: #hisat2-options-no-spliced-alignment
+
+ --no-spliced-alignment
+
+ |
+
+Disable spliced alignment.
+
+ |
+
+
+
+[`--rna-strandness`]: #hisat2-options-rna-strandness
+
+ --rna-strandness
+
+ |
+
+Specify strand-specific information: the default is unstranded.
+For single-end reads, use F or R.
+ 'F' means a read corresponds to a transcript.
+ 'R' means a read corresponds to the reverse complemented counterpart of a transcript.
+For paired-end reads, use either FR or RF.
+With this option being used, every read alignment will have an XS attribute tag:
+ '+' means a read belongs to a transcript on '+' strand of genome.
+ '-' means a read belongs to a transcript on '-' strand of genome.
+
+(TopHat has a similar option, --library-type option, where fr-firststrand corresponds to R and RF; fr-secondstrand corresponds to F and FR.)
+ |
+
+
+[`--tmo/--transcriptome-mapping-only`]: #hisat2-options-tmo
+
+ --tmo/--transcriptome-mapping-only
+
+ |
+
+Report only those alignments within known transcripts.
+
+ |
+
+
+[`--dta/--downstream-transcriptome-assembly`]: #hisat2-options-dta
+
+ --dta/--downstream-transcriptome-assembly
+
+ |
+
+Report alignments tailored for transcript assemblers including StringTie.
+With this option, HISAT2 requires longer anchor lengths for de novo discovery of splice sites.
+This leads to fewer alignments with short anchors,
+which helps transcript assemblers improve significantly in computation and memory usage.
+
+ |
+
+
+[`--dta-cufflinks`]: #hisat2-options-dta-cufflinks
+
+ --dta-cufflinks
+
+ |
+
+Report alignments tailored specifically for Cufflinks. In addition to what HISAT2 does with the above option (--dta),
+with this option HISAT2 looks for novel splice sites with three signals (GT/AG, GC/AG, AT/AC), but all user-provided splice sites are used irrespective of their signals.
+HISAT2 produces an optional field, XS:A:[+-], for every spliced alignment.
+
+ |
+
+
+[`--avoid-pseudogene`]: #hisat2-options-avoid-pseudogene
+
+ --avoid-pseudogene
+
+ |
+
+Try to avoid aligning reads to pseudogenes. Note this option is experimental and needs further investigation.
+
+ |
+
+
+[`--no-templatelen-adjustment`]: #hisat2-options-no-templatelen-adjustment
+
+ --no-templatelen-adjustment
+
+ |
+
+Disables template length adjustment for RNA-seq reads.
+
+ |
+
+
+
+#### Reporting options
+
+
+
+
+
+[`-k`]: #hisat2-options-k
+
+ -k
+
+ |
+
+It searches for at most `<int>` distinct, primary alignments for each read.
+Primary alignments are alignments whose alignment score is equal to or higher than that of any other alignment.
+The search terminates when it can't find more distinct valid alignments, or when it
+finds `<int>`, whichever happens first. The alignment score for a paired-end
+alignment equals the sum of the alignment scores of the individual mates. Each
+reported read or pair alignment beyond the first has the SAM 'secondary' bit
+(which equals 256) set in its FLAGS field. For reads that have more than
+`<int>` distinct, valid alignments, `hisat2` does not guarantee that the
+`<int>` alignments reported are the best possible in terms of alignment score. Default: 5 (HFM) or 10 (HGFM)
+
+Note: HISAT2 is not designed with large values for `-k` in mind, and when
+aligning reads to long, repetitive genomes large `-k` can be very, very slow.
+
+ |
+
+
+[`--max-seeds`]: #hisat2-options-max-seeds
+
+ --max-seeds
+
+ |
+
+HISAT2, like other aligners, uses seed-and-extend approaches. HISAT2 tries to extend seeds to full-length alignments. In HISAT2, --max-seeds is used to control the maximum number of seeds that will be extended. HISAT2 extends up to this many seeds and skips the rest of the seeds. Large values for `--max-seeds` may improve alignment sensitivity, but HISAT2 is not designed with large values for `--max-seeds` in mind, and when aligning reads to long, repetitive genomes large `--max-seeds` can be very, very slow. The default value is the maximum of 5 and the value that comes with `-k`.
+
+ |
+
+
+
+[`--secondary`]: #hisat2-options-secondary
+
+ --secondary
+
+ |
+
+Report secondary alignments.
+
+ |
+
+
+
+#### Paired-end options
+
+
+
+
+
+[`-I`/`--minins`]: #hisat2-options-I
+[`-I`]: #hisat2-options-I
+
+ -I/--minins
+
+ |
+
+The minimum fragment length for valid paired-end alignments. This option is valid only with --no-spliced-alignment.
+E.g. if `-I 60` is specified and a paired-end alignment consists of two 20-bp alignments in the
+appropriate orientation with a 20-bp gap between them, that alignment is
+considered valid (as long as [`-X`] is also satisfied). A 19-bp gap would not
+be valid in that case. If trimming options [`-3`] or [`-5`] are also used, the
+[`-I`] constraint is applied with respect to the untrimmed mates.
+
+The larger the difference between [`-I`] and [`-X`], the slower HISAT2 will
+run. This is because larger differences between [`-I`] and [`-X`] require that
+HISAT2 scan a larger window to determine if a concordant alignment exists.
+For typical fragment length ranges (200 to 400 nucleotides), HISAT2 is very
+efficient.
+
+Default: 0 (essentially imposing no minimum)
+
+ |
+
+
+[`-X`/`--maxins`]: #hisat2-options-X
+[`-X`]: #hisat2-options-X
+
+ -X/--maxins
+
+ |
+
+The maximum fragment length for valid paired-end alignments. This option is valid only with --no-spliced-alignment.
+E.g. if `-X 100` is specified and a paired-end alignment consists of two 20-bp alignments in the
+proper orientation with a 60-bp gap between them, that alignment is considered
+valid (as long as [`-I`] is also satisfied). A 61-bp gap would not be valid in
+that case. If trimming options [`-3`] or [`-5`] are also used, the `-X`
+constraint is applied with respect to the untrimmed mates, not the trimmed
+mates.
+
+The larger the difference between [`-I`] and [`-X`], the slower HISAT2 will
+run. This is because larger differences between [`-I`] and [`-X`] require that
+HISAT2 scan a larger window to determine if a concordant alignment exists.
+For typical fragment length ranges (200 to 400 nucleotides), HISAT2 is very
+efficient.
+
+Default: 500.
+
+ |
+
+
+[`--fr`/`--rf`/`--ff`]: #hisat2-options-fr
+[`--fr`]: #hisat2-options-fr
+[`--rf`]: #hisat2-options-fr
+[`--ff`]: #hisat2-options-fr
+
+ --fr/--rf/--ff
+
+ |
+
+The upstream/downstream mate orientations for a valid paired-end alignment
+against the forward reference strand. E.g., if `--fr` is specified and there is
+a candidate paired-end alignment where mate 1 appears upstream of the reverse
+complement of mate 2 and the fragment length constraints ([`-I`] and [`-X`]) are
+met, that alignment is valid. Also, if mate 2 appears upstream of the reverse
+complement of mate 1 and all other constraints are met, that too is valid.
+`--rf` likewise requires that an upstream mate1 be reverse-complemented and a
+downstream mate2 be forward-oriented. `--ff` requires both an upstream mate 1
+and a downstream mate 2 to be forward-oriented. Default: `--fr` (appropriate
+for Illumina's Paired-end Sequencing Assay).
+
+ |
+
+
+[`--no-mixed`]: #hisat2-options-no-mixed
+
+ --no-mixed
+
+ |
+
+By default, when `hisat2` cannot find a concordant or discordant alignment for
+a pair, it then tries to find alignments for the individual mates. This option
+disables that behavior.
+
+ |
+
+
+[`--no-discordant`]: #hisat2-options-no-discordant
+
+ --no-discordant
+
+ |
+
+By default, `hisat2` looks for discordant alignments if it cannot find any
+concordant alignments. A discordant alignment is an alignment where both mates
+align uniquely, but that does not satisfy the paired-end constraints
+([`--fr`/`--rf`/`--ff`], [`-I`], [`-X`]). This option disables that behavior.
+
+ |
+
+#### Output options
+
+
+
+
+
+[`-t`/`--time`]: #hisat2-options-t
+[`-t`]: #hisat2-options-t
+
+ -t/--time
+
+ |
+
+Print the wall-clock time required to load the index files and align the reads.
+This is printed to the "standard error" ("stderr") filehandle. Default: off.
+
+ |
+
+
+[`--un`]: #hisat2-options-un
+[`--un-gz`]: #hisat2-options-un
+[`--un-bz2`]: #hisat2-options-un
+
+ --un
+ --un-gz
+ --un-bz2
+
+ |
+
+Write unpaired reads that fail to align to file at `<path>`. These reads
+correspond to the SAM records with the FLAGS `0x4` bit set and neither the
+`0x40` nor `0x80` bits set. If `--un-gz` is specified, output will be gzip
+compressed. If `--un-bz2` is specified, output will be bzip2 compressed. Reads
+written in this way will appear exactly as they did in the input file, without
+any modification (same sequence, same name, same quality string, same quality
+encoding). Reads will not necessarily appear in the same order as they did in
+the input.
+
+ |
+
+
+[`--al`]: #hisat2-options-al
+[`--al-gz`]: #hisat2-options-al
+[`--al-bz2`]: #hisat2-options-al
+
+ --al
+ --al-gz
+ --al-bz2
+
+ |
+
+Write unpaired reads that align at least once to file at `<path>`. These reads
+correspond to the SAM records with the FLAGS `0x4`, `0x40`, and `0x80` bits
+unset. If `--al-gz` is specified, output will be gzip compressed. If `--al-bz2`
+is specified, output will be bzip2 compressed. Reads written in this way will
+appear exactly as they did in the input file, without any modification (same
+sequence, same name, same quality string, same quality encoding). Reads will
+not necessarily appear in the same order as they did in the input.
+
+ |
+
+
+[`--un-conc`]: #hisat2-options-un-conc
+[`--un-conc-gz`]: #hisat2-options-un-conc
+[`--un-conc-bz2`]: #hisat2-options-un-conc
+
+ --un-conc
+ --un-conc-gz
+ --un-conc-bz2
+
+ |
+
+Write paired-end reads that fail to align concordantly to file(s) at `<path>`.
+These reads correspond to the SAM records with the FLAGS `0x4` bit set and
+either the `0x40` or `0x80` bit set (depending on whether it's mate #1 or #2).
+`.1` and `.2` strings are added to the filename to distinguish which file
+contains mate #1 and mate #2. If a percent symbol, `%`, is used in `<path>`,
+the percent symbol is replaced with `1` or `2` to make the per-mate filenames.
+Otherwise, `.1` or `.2` are added before the final dot in `<path>` to make the
+per-mate filenames. Reads written in this way will appear exactly as they did
+in the input files, without any modification (same sequence, same name, same
+quality string, same quality encoding). Reads will not necessarily appear in
+the same order as they did in the inputs.
+
+ |
+
+
+[`--al-conc`]: #hisat2-options-al-conc
+[`--al-conc-gz`]: #hisat2-options-al-conc
+[`--al-conc-bz2`]: #hisat2-options-al-conc
+
+ --al-conc
+ --al-conc-gz
+ --al-conc-bz2
+
+ |
+
+Write paired-end reads that align concordantly at least once to file(s) at
+`<path>`. These reads correspond to the SAM records with the FLAGS `0x4` bit
+unset and either the `0x40` or `0x80` bit set (depending on whether it's mate #1
+or #2). `.1` and `.2` strings are added to the filename to distinguish which
+file contains mate #1 and mate #2. If a percent symbol, `%`, is used in
+`<path>`, the percent symbol is replaced with `1` or `2` to make the per-mate
+filenames. Otherwise, `.1` or `.2` are added before the final dot in `<path>` to
+make the per-mate filenames. Reads written in this way will appear exactly as
+they did in the input files, without any modification (same sequence, same name,
+same quality string, same quality encoding). Reads will not necessarily appear
+in the same order as they did in the inputs.
+
+ |
+
+
+[`--quiet`]: #hisat2-options-quiet
+
+ --quiet
+
+ |
+
+Print nothing besides alignments and serious errors.
+
+ |
+
+
+
+[`--summary-file`]: #hisat2-options-summary-file
+
+ --summary-file
+
+ |
+
+Print alignment summary to this file.
+
+ |
+
+
+
+[`--new-summary`]: #hisat2-options-new-summary
+
+ --new-summary
+
+ |
+
+Print alignment summary in a new style, which is more machine-friendly.
+
+ |
+
+
+
+[`--met-file`]: #hisat2-options-met-file
+
+ --met-file
+
+ |
+
+Write `hisat2` metrics to file `<path>`. Having alignment metrics can be useful
+for debugging certain problems, especially performance issues. See also:
+[`--met`]. Default: metrics disabled.
+
+ |
+
+
+[`--met-stderr`]: #hisat2-options-met-stderr
+
+ --met-stderr
+
+ |
+
+Write `hisat2` metrics to the "standard error" ("stderr") filehandle. This is
+not mutually exclusive with [`--met-file`]. Having alignment metrics can be
+useful for debugging certain problems, especially performance issues. See also:
+[`--met`]. Default: metrics disabled.
+
+ |
+
+
+[`--met`]: #hisat2-options-met
+
+ --met
+
+ |
+
+Write a new `hisat2` metrics record every `<int>` seconds. Only matters if
+either [`--met-stderr`] or [`--met-file`] are specified. Default: 1.
+
+ |
+
+
+#### SAM options
+
+
+
+
+
+[`--no-unal`]: #hisat2-options-no-unal
+
+ --no-unal
+
+ |
+
+Suppress SAM records for reads that failed to align.
+
+ |
+
+
+[`--no-hd`]: #hisat2-options-no-hd
+
+ --no-hd
+
+ |
+
+Suppress SAM header lines (starting with `@`).
+
+ |
+
+
+[`--no-sq`]: #hisat2-options-no-sq
+
+ --no-sq
+
+ |
+
+Suppress `@SQ` SAM header lines.
+
+ |
+
+
+[`--rg-id`]: #hisat2-options-rg-id
+
+ --rg-id
+
+ |
+
+Set the read group ID to `<text>`. This causes the SAM `@RG` header line to be
+printed, with `<text>` as the value associated with the `ID:` tag. It also
+causes the `RG:Z:` extra field to be attached to each SAM output record, with
+value set to `<text>`.
+
+ |
+
+
+[`--rg`]: #hisat2-options-rg
+
+ --rg
+
+ |
+
+Add `<text>` (usually of the form `TAG:VAL`, e.g. `SM:Pool1`) as a field on the
+`@RG` header line. Note: in order for the `@RG` line to appear, [`--rg-id`]
+must also be specified. This is because the `ID` tag is required by the [SAM
+Spec][SAM]. Specify `--rg` multiple times to set multiple fields. See the
+[SAM Spec][SAM] for details about what fields are legal.
+
+
+ |
+
+
+[`--remove-chrname`]: #hisat2-options-remove-chrname
+
+ --remove-chrname
+
+ |
+
+Remove 'chr' from reference names in alignment (e.g., chr18 to 18)
+
+ |
+
+
+[`--add-chrname`]: #hisat2-options-add-chrname
+
+ --add-chrname
+
+ |
+
+Add 'chr' to reference names in alignment (e.g., 18 to chr18)
+
+ |
+
+
+[`--omit-sec-seq`]: #hisat2-options-omit-sec-seq
+
+ --omit-sec-seq
+
+ |
+
+When printing secondary alignments, HISAT2 by default will write out the `SEQ`
+and `QUAL` strings. Specifying this option causes HISAT2 to print an asterisk
+in those fields instead.
+
+ |
+
+
+
+
+#### Performance options
+
+
+
+
+
+[`-o`/`--offrate`]: #hisat2-options-o
+[`-o`]: #hisat2-options-o
+[`--offrate`]: #hisat2-options-o
+
+ -o/--offrate
+
+ |
+
+Override the offrate of the index with `<int>`. If `<int>` is greater
+than the offrate used to build the index, then some row markings are
+discarded when the index is read into memory. This reduces the memory
+footprint of the aligner but requires more time to calculate text
+offsets. `<int>` must be greater than the value used to build the
+index.
+
+ |
+
+
+[`-p`/`--threads`]: #hisat2-options-p
+[`-p`]: #hisat2-options-p
+
+ -p/--threads NTHREADS
+
+ |
+
+Launch `NTHREADS` parallel search threads (default: 1). Threads will run on
+separate processors/cores and synchronize when parsing reads and outputting
+alignments. Searching for alignments is highly parallel, and speedup is close
+to linear. Increasing `-p` increases HISAT2's memory footprint. E.g. when
+aligning to a human genome index, increasing `-p` from 1 to 8 increases the
+memory footprint by a few hundred megabytes. This option is only available if
+`hisat2` is linked with the `pthreads` library (i.e. if `BOWTIE_PTHREADS=0` is
+not specified at build time).
+
+ |
+
+
+[`--reorder`]: #hisat2-options-reorder
+
+ --reorder
+
+ |
+
+Guarantees that output SAM records are printed in an order corresponding to the
+order of the reads in the original input file, even when [`-p`] is set greater
+than 1. Specifying `--reorder` and setting [`-p`] greater than 1 causes HISAT2
+to run somewhat slower and use somewhat more memory than if `--reorder` were
+not specified. Has no effect if [`-p`] is set to 1, since output order will
+naturally correspond to input order in that case.
+
+ |
+
+
+[`--mm`]: #hisat2-options-mm
+
+ --mm
+
+ |
+
+Use memory-mapped I/O to load the index, rather than typical file I/O.
+Memory-mapping allows many concurrent `hisat2` processes on the same computer to
+share the same memory image of the index (i.e. you pay the memory overhead just
+once). This facilitates memory-efficient parallelization of `hisat2` in
+situations where using [`-p`] is not possible or not preferable.
+
+ |
+
+#### Other options
+
+
+
+
+[`--qc-filter`]: #hisat2-options-qc-filter
+
+ --qc-filter
+
+ |
+
+Filter out reads for which the QSEQ filter field is non-zero. Only has an
+effect when read format is [`--qseq`]. Default: off.
+
+ |
+
+
+[`--seed`]: #hisat2-options-seed
+
+ --seed
+
+ |
+
+Use `<int>` as the seed for pseudo-random number generator. Default: 0.
+
+ |
+
+
+[`--non-deterministic`]: #hisat2-options-non-deterministic
+
+ --non-deterministic
+
+ |
+
+Normally, HISAT2 re-initializes its pseudo-random generator for each read. It
+seeds the generator with a number derived from (a) the read name, (b) the
+nucleotide sequence, (c) the quality sequence, (d) the value of the [`--seed`]
+option. This means that if two reads are identical (same name, same
+nucleotides, same qualities) HISAT2 will find and report the same alignment(s)
+for both, even if there was ambiguity. When `--non-deterministic` is specified,
+HISAT2 re-initializes its pseudo-random generator for each read using the
+current time. This means that HISAT2 will not necessarily report the same
+alignment for two identical reads. This is counter-intuitive for some users,
+but might be more appropriate in situations where the input consists of many
+identical reads.
+
+ |
+
+
+[`--version`]: #hisat2-options-version
+
+ --version
+
+ |
+
+Print version information and quit.
+
+ |
+
+
+ -h/--help
+
+ |
+
+Print usage information and quit.
+
+ |
+
+SAM output
+----------
+
+Following is a brief description of the [SAM] format as output by `hisat2`.
+For more details, see the [SAM format specification][SAM].
+
+By default, `hisat2` prints a SAM header with `@HD`, `@SQ` and `@PG` lines.
+When one or more [`--rg`] arguments are specified, `hisat2` will also print
+an `@RG` line that includes all user-specified [`--rg`] tokens separated by
+tabs.
+
+Each subsequent line describes an alignment or, if the read failed to align, a
+read. Each line is a collection of at least 12 fields separated by tabs; from
+left to right, the fields are:
+
+1. Name of read that aligned.
+
+ Note that the [SAM specification] disallows whitespace in the read name.
+ If the read name contains any whitespace characters, HISAT2 will truncate
+ the name at the first whitespace character. This is similar to the
+ behavior of other tools.
+
+2. Sum of all applicable flags. Flags relevant to HISAT2 are:
+
+
+
+ 1
+
+ |
+
+ The read is one of a pair
+
+ |
+
+ 2
+
+ |
+
+ The alignment is one end of a proper paired-end alignment
+
+ |
+
+ 4
+
+ |
+
+ The read has no reported alignments
+
+ |
+
+ 8
+
+ |
+
+ The read is one of a pair and has no reported alignments
+
+ |
+
+ 16
+
+ |
+
+ The alignment is to the reverse reference strand
+
+ |
+
+ 32
+
+ |
+
+ The other mate in the paired-end alignment is aligned to the
+ reverse reference strand
+
+ |
+
+ 64
+
+ |
+
+ The read is mate 1 in a pair
+
+ |
+
+ 128
+
+ |
+
+ The read is mate 2 in a pair
+
+ |
+
+ Thus, an unpaired read that aligns to the reverse reference strand
+ will have flag 16. A paired-end read that aligns and is the first
+ mate in the pair will have flag 83 (= 64 + 16 + 2 + 1).
+
+3. Name of reference sequence where alignment occurs
+
+4. 1-based offset into the forward reference strand where leftmost
+ character of the alignment occurs
+
+5. Mapping quality. Mapping quality of HISAT2
+
+6. CIGAR string representation of alignment
+
+7. Name of reference sequence where mate's alignment occurs. Set to `=` if the
+mate's reference sequence is the same as this alignment's, or `*` if there is no
+mate.
+
+8. 1-based offset into the forward reference strand where leftmost character of
+the mate's alignment occurs. Offset is 0 if there is no mate.
+
+9. Inferred fragment length. Size is negative if the mate's alignment occurs
+upstream of this alignment. Size is 0 if the mates did not align concordantly.
+However, size is non-0 if the mates aligned discordantly to the same
+chromosome.
+
+10. Read sequence (reverse-complemented if aligned to the reverse strand)
+
+11. ASCII-encoded read qualities (reversed if the read aligned to
+the reverse strand). The encoded quality values are on the [Phred quality]
+scale and the encoding is ASCII-offset by 33 (ASCII char `!`), similarly to a
+[FASTQ] file.
+
+12. Optional fields. Fields are tab-separated. `hisat2` outputs zero or more
+of these optional fields for each alignment, depending on the type of the
+alignment:
+
+
+
+
+ AS:i:
+
+ |
+
+
+ Alignment score. Can be negative. Only present if SAM record is for
+ an aligned read.
+
+ |
+
+
+ ZS:i:
+
+ |
+
+ Alignment score for the best-scoring alignment found other than the
+ alignment reported. Can be negative. Only present if the SAM record is
+ for an aligned read and more than one alignment was found for the read.
+ Note that, when the read is part of a concordantly-aligned pair, this score
+ could be greater than [`AS:i`].
+
+ |
+
+
+ YS:i:
+
+ |
+
+
+ Alignment score for opposite mate in the paired-end alignment. Only present
+ if the SAM record is for a read that aligned as part of a paired-end
+ alignment.
+
+ |
+
+
+ XN:i:
+
+ |
+
+
+ The number of ambiguous bases in the reference covering this alignment.
+ Only present if SAM record is for an aligned read.
+
+ |
+
+
+ XM:i:
+
+ |
+
+
+ The number of mismatches in the alignment. Only present if SAM record is
+ for an aligned read.
+
+ |
+
+
+ XO:i:
+
+ |
+
+
+ The number of gap opens, for both read and reference gaps, in the alignment.
+ Only present if SAM record is for an aligned read.
+
+ |
+
+
+ XG:i:
+
+ |
+
+
+ The number of gap extensions, for both read and reference gaps, in the
+ alignment. Only present if SAM record is for an aligned read.
+
+ |
+
+
+ NM:i:
+
+ |
+
+
+ The edit distance; that is, the minimal number of one-nucleotide edits
+ (substitutions, insertions and deletions) needed to transform the read
+ string into the reference string. Only present if SAM record is for an
+ aligned read.
+
+ |
+
+
+ YF:Z:
+
+ |
+
+ String indicating reason why the read was filtered out. See also:
+ [Filtering]. Only appears for reads that were filtered out.
+
+ |
+
+
+ YT:Z:
+
+ |
+
+ Value of `UU` indicates the read was not part of a pair. Value of `CP`
+ indicates the read was part of a pair and the pair aligned concordantly.
+ Value of `DP` indicates the read was part of a pair and the pair aligned
+ discordantly. Value of `UP` indicates the read was part of a pair but the
+ pair failed to align either concordantly or discordantly.
+
+ |
+
+
+ MD:Z:
+
+ |
+
+ A string representation of the mismatched reference bases in the alignment.
+ See [SAM] format specification for details. Only present if SAM record is
+ for an aligned read.
+
+ |
+
+
+ XS:A:
+
+ |
+
+ Values of `+` and `-` indicate the read is mapped to transcripts on sense and anti-sense
+ strands, respectively. Spliced alignments need to have this field, which is required in Cufflinks and StringTie.
+ We can report this field for the canonical-splice site (GT/AG), but not for non-canonical splice sites.
+ You can direct HISAT2 not to output such alignments (involving non-canonical splice sites) using "--pen-noncansplice 1000000".
+
+ |
+
+
+ NH:i:
+
+ |
+
+ The number of mapped locations for the read or the pair.
+
+ |
+
+
+ Zs:Z:
+
+ |
+
+ When the alignment of a read involves SNPs that are in the index, this option is used to indicate where exactly the read involves the SNPs.
+ This optional field is similar to the above MD:Z field.
+ For example, `Zs:Z:1|S|rs3747203,97|S|rs16990981` indicates the second base of the read corresponds to a known SNP (ID: rs3747203).
+ 97 bases after the third base (the base after the second one), the read at 100th base involves another known SNP (ID: rs16990981).
+ 'S' indicates a single nucleotide polymorphism. 'D' and 'I' indicate a deletion and an insertion, respectively.
+ |
+
+
+
+[SAM format specification]: http://samtools.sf.net/SAM1.pdf
+[FASTQ]: http://en.wikipedia.org/wiki/FASTQ_format
+[`-S`/`--sam`]: #hisat2-options-S
+[`-m`]: #hisat2-options-m
+
+The `hisat2-build` indexer
+===========================
+
+`hisat2-build` builds a HISAT2 index from a set of DNA sequences.
+`hisat2-build` outputs a set of 8 files with suffixes `.1.ht2`, `.2.ht2`,
+`.3.ht2`, `.4.ht2`, `.5.ht2`, `.6.ht2`, `.7.ht2`, and `.8.ht2`. In the case of a large
+index these suffixes will have a `ht2l` termination. These files together
+constitute the index: they are all that is needed to align reads to that
+reference. The original sequence FASTA files are no longer used by HISAT2
+once the index is built.
+
+Use of Karkkainen's [blockwise algorithm] allows `hisat2-build` to trade off
+between running time and memory usage. `hisat2-build` has three options
+governing how it makes this trade: [`-p`/`--packed`], [`--bmax`]/[`--bmaxdivn`],
+and [`--dcv`]. By default, `hisat2-build` will automatically search for the
+settings that yield the best running time without exhausting memory. This
+behavior can be disabled using the [`-a`/`--noauto`] option.
+
+The indexer provides options pertaining to the "shape" of the index, e.g.
+[`--offrate`](#hisat2-build-options-o) governs the fraction of [Burrows-Wheeler]
+rows that are "marked" (i.e., the density of the suffix-array sample; see the
+original [FM Index] paper for details). All of these options are potentially
+profitable trade-offs depending on the application. They have been set to
+defaults that are reasonable for most cases according to our experiments. See
+[Performance tuning] for details.
+
+`hisat2-build` can generate either [small or large indexes](#small-and-large-indexes). The wrapper
+will decide which based on the length of the input genome. If the reference
+does not exceed 4 billion characters but a large index is preferred, the user
+can specify [`--large-index`] to force `hisat2-build` to build a large index
+instead.
+
+The HISAT2 index is based on the [FM Index] of Ferragina and Manzini, which in
+turn is based on the [Burrows-Wheeler] transform. The algorithm used to build
+the index is based on the [blockwise algorithm] of Karkkainen.
+
+[Blockwise algorithm]: http://portal.acm.org/citation.cfm?id=1314852
+[Burrows-Wheeler]: http://en.wikipedia.org/wiki/Burrows-Wheeler_transform
+[Performance tuning]: #performance-tuning
+
+Command Line
+------------
+
+Usage:
+
+ hisat2-build [options]*
+
+### Notes
+ If you use --snp, --ss, and/or --exon, hisat2-build will need about 200GB RAM for the human genome size as index building involves a graph construction.
+ Otherwise, you will be able to build an index on your desktop with 8GB RAM.
+
+### Main arguments
+
+
+
+
+
+ |
+
+A comma-separated list of FASTA files containing the reference sequences to be
+aligned to, or, if [`-c`](#hisat2-build-options-c) is specified, the sequences
+themselves. E.g., `<reference_in>` might be `chr1.fa,chr2.fa,chrX.fa,chrY.fa`,
+or, if [`-c`](#hisat2-build-options-c) is specified, this might be
+`GGTCATCCT,ACGGGTCGT,CCGTTCTATGCGGCTTA`.
+
+ |
+
+
+
+ |
+
+The basename of the index files to write. By default, `hisat2-build` writes
+files named `NAME.1.ht2`, `NAME.2.ht2`, `NAME.3.ht2`, `NAME.4.ht2`,
+`NAME.5.ht2`, `NAME.6.ht2`, `NAME.7.ht2`, and `NAME.8.ht2` where `NAME` is `<ht2_base>`.
+
+ |
+
+### Options
+
+
+
+ -f
+
+ |
+
+The reference input files (specified as ``) are FASTA files
+(usually having extension `.fa`, `.mfa`, `.fna` or similar).
+
+ |
+
+ -c
+
+ |
+
+The reference sequences are given on the command line. I.e. `` is
+a comma-separated list of sequences rather than a list of FASTA files.
+
+ |
+
+
+[`--large-index`]: #hisat2-build-options-large-index
+
+ --large-index
+
+ |
+
+Force `hisat2-build` to build a [large index](#small-and-large-indexes), even if the reference is less
+than ~ 4 billion nucleotides long.
+
+ |
+
+
+[`-a`/`--noauto`]: #hisat2-build-options-a
+
+ -a/--noauto
+
+ |
+
+Disable the default behavior whereby `hisat2-build` automatically selects
+values for the [`--bmax`], [`--dcv`] and [`--packed`] parameters according to
+available memory. Instead, user may specify values for those parameters. If
+memory is exhausted during indexing, an error message will be printed; it is up
+to the user to try new parameters.
+
+ |
+
+[`--bmax`]: #hisat2-build-options-bmax
+
+ --bmax
+
+ |
+
+The maximum number of suffixes allowed in a block. Allowing more suffixes per
+block makes indexing faster, but increases peak memory usage. Setting this
+option overrides any previous setting for [`--bmax`], or [`--bmaxdivn`].
+Default (in terms of the [`--bmaxdivn`] parameter) is [`--bmaxdivn`] 4. This is
+configured automatically by default; use [`-a`/`--noauto`] to configure manually.
+
+ |
+
+[`--bmaxdivn`]: #hisat2-build-options-bmaxdivn
+
+ --bmaxdivn
+
+ |
+
+The maximum number of suffixes allowed in a block, expressed as a fraction of
+the length of the reference. Setting this option overrides any previous setting
+for [`--bmax`], or [`--bmaxdivn`]. Default: [`--bmaxdivn`] 4. This is
+configured automatically by default; use [`-a`/`--noauto`] to configure manually.
+
+ |
+
+[`--dcv`]: #hisat2-build-options-dcv
+
+ --dcv
+
+ |
+
+Use `` as the period for the difference-cover sample. A larger period
+yields less memory overhead, but may make suffix sorting slower, especially if
+repeats are present. Must be a power of 2 no greater than 4096. Default: 1024.
+ This is configured automatically by default; use [`-a`/`--noauto`] to configure
+manually.
+
+ |
+
+[`--nodc`]: #hisat2-build-options-nodc
+
+ --nodc
+
+ |
+
+Disable use of the difference-cover sample. Suffix sorting becomes
+quadratic-time in the worst case (where the worst case is an extremely
+repetitive reference). Default: off.
+
+ |
+
+ -r/--noref
+
+ |
+
+Do not build the `NAME.3.ht2` and `NAME.4.ht2` portions of the index, which
+contain a bitpacked version of the reference sequences and are used for
+paired-end alignment.
+
+ |
+
+ -3/--justref
+
+ |
+
+Build only the `NAME.3.ht2` and `NAME.4.ht2` portions of the index, which
+contain a bitpacked version of the reference sequences and are used for
+paired-end alignment.
+
+ |
+
+ -o/--offrate
+
+ |
+
+To map alignments back to positions on the reference sequences, it's necessary
+to annotate ("mark") some or all of the [Burrows-Wheeler] rows with their
+corresponding location on the genome.
+[`-o`/`--offrate`](#hisat2-build-options-o) governs how many rows get marked:
+the indexer will mark every 2^`<int>` rows. Marking more rows makes
+reference-position lookups faster, but requires more memory to hold the
+annotations at runtime. The default is 4 (every 16th row is marked; for human
+genome, annotations occupy about 680 megabytes).
+
+ |
+
+ -t/--ftabchars
+
+ |
+
+The ftab is the lookup table used to calculate an initial [Burrows-Wheeler]
+range with respect to the first `<int>` characters of the query. A larger
+`<int>` yields a larger lookup table but faster query times. The ftab has size
+4^(`<int>`+1) bytes. The default setting is 10 (ftab is 4MB).
+
+
+ |
+
+ --localoffrate
+
+ |
+
+This option governs how many rows get marked in a local index:
+the indexer will mark every 2^`` rows. Marking more rows makes
+reference-position lookups faster, but requires more memory to hold the
+annotations at runtime. The default is 3 (every 8th row is marked,
+this occupies about 16KB per local index).
+
+ |
+
+ --localftabchars
+
+ |
+
+The local ftab is the lookup table in a local index.
+The default setting is 6 (ftab is 8KB per local index).
+
+ |
+
+ -p
+
+ |
+
+Launch `NTHREADS` parallel build threads (default: 1).
+
+ |
+
+ --snp
+
+ |
+
+Provide a list of SNPs (in HISAT2's own format) as follows (five columns).
+
+ SNP ID `<tab>` snp type (single, deletion, or insertion) `<tab>` chromosome name `<tab>` zero-offset based genomic position of a SNP `<tab>` alternative base (single), the length of SNP (deletion), or insertion sequence (insertion)
+
+ For example,
+ rs58784443 single 13 18447947 T
+
+Use `hisat2_extract_snps_haplotypes_UCSC.py` (in the HISAT2 package) to extract SNPs and haplotypes from a dbSNP file (e.g. http://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/snp144Common.txt.gz),
+or `hisat2_extract_snps_haplotypes_VCF.py` to extract SNPs and haplotypes from a VCF file (e.g. ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/supporting/GRCh38_positions/ALL.chr22.phase3_shapeit2_mvncall_integrated_v3plus_nounphased.rsID.genotypes.GRCh38_dbSNP_no_SVs.vcf.gz).
+
+ |
+
+ --haplotype
+
+ |
+
+Provide a list of haplotypes (in HISAT2's own format) as follows (five columns).
+
+ Haplotype ID `<tab>` chromosome name `<tab>` zero-offset based left coordinate of haplotype `<tab>` zero-offset based right coordinate of haplotype `<tab>` a comma separated list of SNP ids in the haplotype
+
+ For example,
+ ht35 13 18446877 18446945 rs12381094,rs12381056,rs192016659,rs538569910
+
+See the above option, --snp, about how to extract haplotypes. This option is not required, but haplotype information can keep the index construction from exploding and reduce the index size substantially.
+
+ |
+
+ --ss
+
+ |
+
+Note this option should be used with the following --exon option.
+Provide a list of splice sites (in the HISAT2's own format) as follows (four columns).
+
+ chromosome name `<tab>` zero-offset based genomic position of the flanking base on the left side of an intron `<tab>` zero-offset based genomic position of the flanking base on the right `<tab>` strand
+
+Use `hisat2_extract_splice_sites.py` (in the HISAT2 package) to extract splice sites from a GTF file.
+
+ |
+
+ --exon
+
+ |
+
+Note this option should be used with the above --ss option.
+Provide a list of exons (in the HISAT2's own format) as follows (three columns).
+
+ chromosome name `<tab>` zero-offset based left genomic position of an exon `<tab>` zero-offset based right genomic position of an exon
+
+Use `hisat2_extract_exons.py` (in the HISAT2 package) to extract exons from a GTF file.
+
+ |
+
+ --seed
+
+ |
+
+Use `<int>` as the seed for the pseudo-random number generator.
+
+ |
+
+ --cutoff
+
+ |
+
+Index only the first `<int>` bases of the reference sequences (cumulative across
+sequences) and ignore the rest.
+
+ |
+
+ -q/--quiet
+
+ |
+
+`hisat2-build` is verbose by default. With this option `hisat2-build` will
+print only error messages.
+
+ |
+
+ -h/--help
+
+ |
+
+Print usage information and quit.
+
+ |
+
+ --version
+
+ |
+
+Print version information and quit.
+
+ |
+
+The `hisat2-inspect` index inspector
+=====================================
+
+`hisat2-inspect` extracts information from a HISAT2 index about what kind of
+index it is and what reference sequences were used to build it. When run without
+any options, the tool will output a FASTA file containing the sequences of the
+original references (with all non-`A`/`C`/`G`/`T` characters converted to `N`s).
+ It can also be used to extract just the reference sequence names using the
+[`-n`/`--names`] option or a more verbose summary using the [`-s`/`--summary`]
+option.
+
+Command Line
+------------
+
+Usage:
+
+ hisat2-inspect [options]* <ht2_base>
+
+### Main arguments
+
+
+
+
+
+ |
+
+The basename of the index to be inspected. The basename is the name of any of the
+index files but with the `.X.ht2` suffix omitted.
+`hisat2-inspect` first looks in the current directory for the index files, then
+in the directory specified in the `HISAT2_INDEXES` environment variable.
+
+ |
+
+### Options
+
+
+
+ -a/--across
+
+ |
+
+When printing FASTA output, output a newline character every `<int>` bases
+(default: 60).
+
+ |
+
+[`-n`/`--names`]: #hisat2-inspect-options-n
+
+ -n/--names
+
+ |
+
+Print reference sequence names, one per line, and quit.
+
+ |
+
+[`-s`/`--summary`]: #hisat2-inspect-options-s
+
+ -s/--summary
+
+ |
+
+Print a summary that includes information about index settings, as well as the
+names and lengths of the input sequences. The summary has this format:
+
+ Colorspace <0 or 1>
+ SA-Sample 1 in <sample>
+ FTab-Chars <chars>
+ Sequence-1 <name> <len>
+ Sequence-2 <name> <len>
+ ...
+ Sequence-N <name> <len>
+
+Fields are separated by tabs. Colorspace is always set to 0 for HISAT2.
+
+ |
+
+[`--snp`]: #hisat2-inspect-options-snp
+
+ --snp
+
+ |
+
+Print SNPs, and quit.
+
+ |
+
+[`--ss`]: #hisat2-inspect-options-ss
+
+ --ss
+
+ |
+
+Print splice sites, and quit.
+
+ |
+
+[`--ss-all`]: #hisat2-inspect-options-ss-all
+
+ --ss-all
+
+ |
+
+Print splice sites including those not in the global index, and quit.
+
+ |
+
+[`--exon`]: #hisat2-inspect-options-exon
+
+ --exon
+
+ |
+
+Print exons, and quit.
+
+ |
+
+ -v/--verbose
+
+ |
+
+Print verbose output (for debugging).
+
+ |
+
+ --version
+
+ |
+
+Print version information and quit.
+
+ |
+
+ -h/--help
+
+ |
+
+Print usage information and quit.
+
+ |
+
+Getting started with HISAT2
+===================================================
+
+HISAT2 comes with some example files to get you started. The example files
+are not scientifically significant; these files will simply let you start running HISAT2 and
+downstream tools right away.
+
+First follow the manual instructions to [obtain HISAT2]. Set the `HISAT2_HOME`
+environment variable to point to the new HISAT2 directory containing the
+`hisat2`, `hisat2-build` and `hisat2-inspect` binaries. This is important,
+as the `HISAT2_HOME` variable is used in the commands below to refer to that
+directory.
+
+[obtain HISAT2]: #obtaining-hisat2
+
+Indexing a reference genome
+---------------------------
+
+To create an index for the genomic region (1 million bps from the human chromosome 22 between 20,000,000 and 20,999,999)
+included with HISAT2, create a new temporary directory (it doesn't matter where), change into that directory, and run:
+
+ $HISAT2_HOME/hisat2-build $HISAT2_HOME/example/reference/22_20-21M.fa --snp $HISAT2_HOME/example/reference/22_20-21M.snp 22_20-21M_snp
+
+The command should print many lines of output then quit. When the command
+completes, the current directory will contain eight new files that all start with
+`22_20-21M_snp` and end with `.1.ht2`, `.2.ht2`, `.3.ht2`, `.4.ht2`, `.5.ht2`, `.6.ht2`,
+`.7.ht2`, and `.8.ht2`. These files constitute the index - you're done!
+
+You can use `hisat2-build` to create an index for a set of FASTA files obtained
+from any source, including sites such as [UCSC], [NCBI], and [Ensembl]. When
+indexing multiple FASTA files, specify all the files using commas to separate
+file names. For more details on how to create an index with `hisat2-build`,
+see the [manual section on index building]. You may also want to bypass this
+process by obtaining a pre-built index.
+
+[UCSC]: http://genome.ucsc.edu/cgi-bin/hgGateway
+[NCBI]: http://www.ncbi.nlm.nih.gov/sites/genome
+[Ensembl]: http://www.ensembl.org/
+[manual section on index building]: #the-hisat2-build-indexer
+[using a pre-built index]: #using-a-pre-built-index
+
+Aligning example reads
+----------------------
+
+Stay in the directory created in the previous step, which now contains the
+`22_20-21M_snp` index files. Next, run:
+
+ $HISAT2_HOME/hisat2 -f -x $HISAT2_HOME/example/index/22_20-21M_snp -U $HISAT2_HOME/example/reads/reads_1.fa -S eg1.sam
+
+This runs the HISAT2 aligner, which aligns a set of unpaired reads to the
+genome region using the index generated in the previous step.
+The alignment results in SAM format are written to the file `eg1.sam`, and a
+short alignment summary is written to the console. (Actually, the summary is
+written to the "standard error" or "stderr" filehandle, which is typically
+printed to the console.)
+
+To see the first few lines of the SAM output, run:
+
+ head eg1.sam
+
+You will see something like this:
+
+ @HD VN:1.0 SO:unsorted
+ @SQ SN:22:20000001-21000000 LN:1000000
+ @PG ID:hisat2 PN:hisat2 VN:2.0.0-beta
+ 1 0 22:20000001-21000000 397984 255 100M * 0 0 GCCTGTGAGGGAGCCCCGGACCCGGTCAGAGCAGGAGCCTGGCCTGGGGCCAAGTTCACCTTATGGACTCTCTTCCCTGCCCTTCCAGGAGCAGCTCACT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU NH:i:1
+ 2 16 22:20000001-21000000 398131 255 100M * 0 0 ATGACACACTGTACACACCAGGGGCCCTGTGCTCCCCAGGAAGAGGGCCCTCACTTGAAGCGGGGCCCGATGGCCGCCACGTGCCGGTTCATGCTCCCCT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:80A19 YT:Z:UU NH:i:1 Zs:Z:80|S|rs576159895
+ 3 16 22:20000001-21000000 398222 255 100M * 0 0 TGCTCCCCTTGGCCCCGCCGATGTTCAGGGACATGGAGCGCTGCAGCAGGCTGGAGAAGATCTCCACTTGGTCAGAGCTGCAGTACTTGGCGATCTCAAA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:16A83 YT:Z:UU NH:i:1 Zs:Z:16|S|rs2629364
+ 4 16 22:20000001-21000000 398247 255 90M200N10M * 0 0 CAGGGACATGGAGCGCTGCAGCAGGCTGGAGAAGATCTCCACTTGGTCAGAGCTGCAGTACTTGGCGATCTCAAACCGCTGCACCAGGAAGTCGATCCAG IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU XS:A:- NH:i:1
+ 5 16 22:20000001-21000000 398194 255 100M * 0 0 GGCCCGATGGCCGCCACGTGCCGGTTCATGCTCCCCTTGGCCCCGCCGATGTTCAGGGACATGGAGCGCTGCAGCAGGCTGGAGAAGATCTCCACTTGGT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:17A26A55 YT:Z:UU NH:i:1 Zs:Z:17|S|rs576159895,26|S|rs2629364
+ 6 0 22:20000001-21000000 398069 255 100M * 0 0 CAGGAGCAGCTCACTGAAATGTGTTCCCCGTCTACAGAAGTACCGTGATACACAGACGCCCCATGACACACTGTACACACCAGGGGCCCTGTGCTCCCCA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU NH:i:1
+ 7 0 22:20000001-21000000 397896 255 100M * 0 0 GTGGAGTAGATCTTCTCGCGAAGCACATTGCAGATGGTTGCATTTGGAACCACATCGGCATGCAGGAGGGACAGCCCCAGGGTCAGCAGCCTGTGAGGGA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:31G68 YT:Z:UU NH:i:1 Zs:Z:31|S|rs562662261
+ 8 0 22:20000001-21000000 398150 255 100M * 0 0 AGGGGCCCTGTGCTCCCCAGGAAGAGGGCCCTCACTTGAAGCGGGGCCCGATGGCCGCCACGTGCCGGTTCATGCTCCCCTTGGCCCCGCCGATGTTCAG IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:61A26A11 YT:Z:UU NH:i:1 Zs:Z:61|S|rs576159895,26|S|rs2629364
+ 9 16 22:20000001-21000000 398329 255 8M200N92M * 0 0 ACCAGGAAGTCGATCCAGATGTAGTGGGGGGTCACTTCGGGGGGACAGGGTTTGGGTTGACTTGCTTCCGAGGCAGCCAGGGGGTCTGCTTCCTTTATCT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU XS:A:- NH:i:1
+ 10 16 22:20000001-21000000 398184 255 100M * 0 0 CTTGAAGCGGGGCCCGATGGCCGCCACGTGCCGGTTCATGCTCCCCTTGGCCCCGCCGATGTTCAGGGACATGGAGCGCTGCAGCAGGCTGGAGAAGATC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:27A26A45 YT:Z:UU NH:i:1 Zs:Z:27|S|rs576159895,26|S|rs2629364
+
+The first few lines (beginning with `@`) are SAM header lines, and the rest of
+the lines are SAM alignments, one line per read or mate. See the [HISAT2
+manual section on SAM output] and the [SAM specification] for details about how
+to interpret the SAM file format.
+
+[HISAT2 manual section on SAM output]: #sam-output
+
+Paired-end example
+------------------
+
+To align paired-end reads included with HISAT2, stay in the same directory and
+run:
+
+ $HISAT2_HOME/hisat2 -f -x $HISAT2_HOME/example/index/22_20-21M_snp -1 $HISAT2_HOME/example/reads/reads_1.fa -2 $HISAT2_HOME/example/reads/reads_2.fa -S eg2.sam
+
+This aligns a set of paired-end reads to the reference genome, with results
+written to the file `eg2.sam`.
+
+Using SAMtools/BCFtools downstream
+----------------------------------
+
+[SAMtools] is a collection of tools for manipulating and analyzing SAM and BAM
+alignment files. [BCFtools] is a collection of tools for calling variants and
+manipulating VCF and BCF files, and it is typically distributed with [SAMtools].
+Using these tools together allows you to get from alignments in SAM format to
+variant calls in VCF format. This example assumes that `samtools` and
+`bcftools` are installed and that the directories containing these binaries are
+in your [PATH environment variable].
+
+Run the paired-end example:
+
+ $HISAT2_HOME/hisat2 -f -x $HISAT2_HOME/example/index/22_20-21M_snp -1 $HISAT2_HOME/example/reads/reads_1.fa -2 $HISAT2_HOME/example/reads/reads_2.fa -S eg2.sam
+
+Use `samtools view` to convert the SAM file into a BAM file. BAM is the
+binary format corresponding to the SAM text format. Run:
+
+ samtools view -bS eg2.sam > eg2.bam
+
+Use `samtools sort` to convert the BAM file to a sorted BAM file. The following command requires samtools version 1.2 or higher.
+
+ samtools sort eg2.bam -o eg2.sorted.bam
+
+We now have a sorted BAM file called `eg2.sorted.bam`. Sorted BAM is a useful
+format because the alignments are (a) compressed, which is convenient for
+long-term storage, and (b) sorted, which is convenient for variant discovery.
+To generate variant calls in VCF format, run:
+
+ samtools mpileup -uf $HISAT2_HOME/example/reference/22_20-21M.fa eg2.sorted.bam | bcftools view -bvcg - > eg2.raw.bcf
+
+Then to view the variants, run:
+
+ bcftools view eg2.raw.bcf
+
+See the official SAMtools guide to [Calling SNPs/INDELs with SAMtools/BCFtools]
+for more details and variations on this process.
+
+[BCFtools]: http://samtools.sourceforge.net/mpileup.shtml
+[Calling SNPs/INDELs with SAMtools/BCFtools]: http://samtools.sourceforge.net/mpileup.shtml
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..0abfad9
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,590 @@
+#
+# Copyright 2015, Daehwan Kim
+#
+# This file is part of HISAT2.
+#
+# HISAT 2 is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# HISAT 2 is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with HISAT. If not, see <http://www.gnu.org/licenses/>.
+#
+#
+# Makefile for hisat2-align, hisat2-build, hisat2-inspect
+#
+
+INC =
+GCC_PREFIX = $(shell dirname `which gcc`)
+GCC_SUFFIX =
+CC = $(GCC_PREFIX)/gcc$(GCC_SUFFIX)
+CPP = $(GCC_PREFIX)/g++$(GCC_SUFFIX)
+CXX = $(CPP)
+HEADERS = $(wildcard *.h)
+BOWTIE_MM = 1
+BOWTIE_SHARED_MEM = 0
+
+# Detect Cygwin or MinGW
+WINDOWS = 0
+CYGWIN = 0
+MINGW = 0
+ifneq (,$(findstring CYGWIN,$(shell uname)))
+ WINDOWS = 1
+ CYGWIN = 1
+ # POSIX memory-mapped files not currently supported on Windows
+ BOWTIE_MM = 0
+ BOWTIE_SHARED_MEM = 0
+else
+ ifneq (,$(findstring MINGW,$(shell uname)))
+ WINDOWS = 1
+ MINGW = 1
+ # POSIX memory-mapped files not currently supported on Windows
+ BOWTIE_MM = 0
+ BOWTIE_SHARED_MEM = 0
+ endif
+endif
+
+MACOS = 0
+ifneq (,$(findstring Darwin,$(shell uname)))
+ MACOS = 1
+endif
+
+EXTRA_FLAGS += -DPOPCNT_CAPABILITY -std=c++11
+INC += -I. -I third_party
+
+MM_DEF =
+
+ifeq (1,$(BOWTIE_MM))
+ MM_DEF = -DBOWTIE_MM
+endif
+
+SHMEM_DEF =
+
+ifeq (1,$(BOWTIE_SHARED_MEM))
+ SHMEM_DEF = -DBOWTIE_SHARED_MEM
+endif
+
+PTHREAD_PKG =
+PTHREAD_LIB =
+
+ifeq (1,$(MINGW))
+ PTHREAD_LIB =
+else
+ PTHREAD_LIB = -lpthread
+endif
+
+SEARCH_LIBS =
+BUILD_LIBS =
+INSPECT_LIBS =
+
+ifeq (1,$(MINGW))
+ BUILD_LIBS =
+ INSPECT_LIBS =
+endif
+
+USE_SRA = 0
+SRA_DEF =
+SRA_LIB =
+SEARCH_INC =
+ifeq (1,$(USE_SRA))
+ SRA_DEF = -DUSE_SRA
+ SRA_LIB = -lncbi-ngs-c++-static -lngs-c++-static -lncbi-vdb-static -ldl
+ SEARCH_INC += -I$(NCBI_NGS_DIR)/include -I$(NCBI_VDB_DIR)/include
+ SEARCH_LIBS += -L$(NCBI_NGS_DIR)/lib64 -L$(NCBI_VDB_DIR)/lib64
+endif
+
+LIBS = $(PTHREAD_LIB)
+
+HT2LIB_DIR = hisat2lib
+
+HT2LIB_CPPS = $(HT2LIB_DIR)/ht2_init.cpp \
+ $(HT2LIB_DIR)/ht2_repeat.cpp \
+ $(HT2LIB_DIR)/ht2_index.cpp
+
+SHARED_CPPS = ccnt_lut.cpp ref_read.cpp alphabet.cpp shmem.cpp \
+ edit.cpp gfm.cpp \
+ reference.cpp ds.cpp multikey_qsort.cpp limit.cpp \
+ random_source.cpp tinythread.cpp utility_3n.cpp
+SEARCH_CPPS = qual.cpp pat.cpp \
+ read_qseq.cpp aligner_seed_policy.cpp \
+ aligner_seed.cpp \
+ aligner_seed2.cpp \
+ aligner_sw.cpp \
+ aligner_sw_driver.cpp aligner_cache.cpp \
+ aligner_result.cpp ref_coord.cpp mask.cpp \
+ pe.cpp aln_sink.cpp dp_framer.cpp \
+ scoring.cpp presets.cpp unique.cpp \
+ simple_func.cpp \
+ random_util.cpp \
+ aligner_bt.cpp sse_util.cpp \
+ aligner_swsse.cpp outq.cpp \
+ aligner_swsse_loc_i16.cpp \
+ aligner_swsse_ee_i16.cpp \
+ aligner_swsse_loc_u8.cpp \
+ aligner_swsse_ee_u8.cpp \
+ aligner_driver.cpp \
+ splice_site.cpp \
+ alignment_3n.cpp \
+ position_3n.cpp \
+ $(HT2LIB_CPPS)
+
+BUILD_CPPS = diff_sample.cpp
+
+REPEAT_CPPS = \
+ mask.cpp \
+ qual.cpp \
+ aligner_bt.cpp \
+ scoring.cpp \
+ simple_func.cpp \
+ dp_framer.cpp \
+ aligner_result.cpp \
+ aligner_sw_driver.cpp \
+ aligner_sw.cpp \
+ aligner_swsse_ee_i16.cpp \
+ aligner_swsse_ee_u8.cpp \
+ aligner_swsse_loc_i16.cpp \
+ aligner_swsse_loc_u8.cpp \
+ aligner_swsse.cpp \
+ bit_packed_array.cpp \
+ repeat_builder.cpp
+
+THREE_N_HEADERS = \
+ position_3n_table.h \
+ alignment_3n_table.h \
+ utility_3n_table.h
+
+HISAT2_CPPS_MAIN = $(SEARCH_CPPS) hisat2_main.cpp
+HISAT2_BUILD_CPPS_MAIN = $(BUILD_CPPS) hisat2_build_main.cpp
+HISAT2_REPEAT_CPPS_MAIN = $(REPEAT_CPPS) $(BUILD_CPPS) hisat2_repeat_main.cpp
+
+SEARCH_FRAGMENTS = $(wildcard search_*_phase*.c)
+VERSION := $(shell cat HISAT2_VERSION)
+
+# Convert BITS=?? to a -m flag
+BITS=32
+ifeq (x86_64,$(shell uname -m))
+BITS=64
+endif
+# msys will always be 32 bit so look at the cpu arch instead.
+ifneq (,$(findstring AMD64,$(PROCESSOR_ARCHITEW6432)))
+ ifeq (1,$(MINGW))
+ BITS=64
+ endif
+endif
+BITS_FLAG =
+
+ifeq (32,$(BITS))
+ BITS_FLAG = -m32
+endif
+
+ifeq (64,$(BITS))
+ BITS_FLAG = -m64
+endif
+SSE_FLAG=-msse2
+
+DEBUG_FLAGS = -O0 -g3 $(BITS_FLAG) $(SSE_FLAG)
+DEBUG_DEFS = -DCOMPILER_OPTIONS="\"$(DEBUG_FLAGS) $(EXTRA_FLAGS)\""
+RELEASE_FLAGS = -O3 $(BITS_FLAG) $(SSE_FLAG) -funroll-loops -g3
+RELEASE_DEFS = -DCOMPILER_OPTIONS="\"$(RELEASE_FLAGS) $(EXTRA_FLAGS)\""
+NOASSERT_FLAGS = -DNDEBUG
+FILE_FLAGS = -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE
+HT2LIB_FLAGS = -DHISAT2_BUILD_LIB
+ifeq (1,$(USE_SRA))
+ ifeq (1, $(MACOS))
+ SRA_LIB += -stdlib=libc++
+ DEBUG_FLAGS += -mmacosx-version-min=10.10
+ RELEASE_FLAGS += -mmacosx-version-min=10.10
+ endif
+endif
+
+
+HISAT2_BIN_LIST = hisat2-build-s \
+ hisat2-build-l \
+ hisat2-align-s \
+ hisat2-align-l \
+ hisat2-inspect-s \
+ hisat2-inspect-l \
+ hisat2-repeat \
+ hisat-3n-table
+
+HISAT2_BIN_LIST_AUX = hisat2-build-s-debug \
+ hisat2-build-l-debug \
+ hisat2-align-s-debug \
+ hisat2-align-l-debug \
+ hisat2-inspect-s-debug \
+ hisat2-inspect-l-debug \
+ hisat2-repeat-debug
+
+HT2LIB_SRCS = $(SHARED_CPPS) \
+ $(HT2LIB_CPPS)
+
+HT2LIB_OBJS = $(HT2LIB_SRCS:.cpp=.o)
+
+HT2LIB_DEBUG_OBJS = $(addprefix .ht2lib-obj-debug/,$(HT2LIB_OBJS))
+HT2LIB_RELEASE_OBJS = $(addprefix .ht2lib-obj-release/,$(HT2LIB_OBJS))
+HT2LIB_SHARED_DEBUG_OBJS = $(addprefix .ht2lib-obj-debug-shared/,$(HT2LIB_OBJS))
+HT2LIB_SHARED_RELEASE_OBJS = $(addprefix .ht2lib-obj-release-shared/,$(HT2LIB_OBJS))
+
+HT2LIB_PKG_SRC = \
+ $(HT2LIB_DIR)/ht2_init.cpp \
+ $(HT2LIB_DIR)/ht2_repeat.cpp \
+ $(HT2LIB_DIR)/ht2_index.cpp \
+ $(HT2LIB_DIR)/ht2.h \
+ $(HT2LIB_DIR)/ht2_handle.h \
+ $(HT2LIB_DIR)/java_jni/Makefile \
+ $(HT2LIB_DIR)/java_jni/ht2module.c \
+ $(HT2LIB_DIR)/java_jni/HT2Module.java \
+ $(HT2LIB_DIR)/java_jni/HT2ModuleExample.java \
+ $(HT2LIB_DIR)/pymodule/Makefile \
+ $(HT2LIB_DIR)/pymodule/ht2module.c \
+ $(HT2LIB_DIR)/pymodule/setup.py \
+ $(HT2LIB_DIR)/pymodule/ht2example.py
+
+
+GENERAL_LIST = $(wildcard scripts/*.sh) \
+ $(wildcard scripts/*.pl) \
+ $(wildcard *.py) \
+ $(wildcard example/index/*.ht2) \
+ $(wildcard example/reads/*.fa) \
+ example/reference/22_20-21M.fa \
+ example/reference/22_20-21M.snp \
+ $(PTHREAD_PKG) \
+ hisat2 \
+ hisat2-build \
+ hisat2-inspect \
+ AUTHORS \
+ LICENSE \
+ NEWS \
+ MANUAL \
+ MANUAL.markdown \
+ TUTORIAL \
+ HISAT2_VERSION
+
+ifeq (1,$(WINDOWS))
+ HISAT2_BIN_LIST := $(HISAT2_BIN_LIST) hisat2.bat hisat2-build.bat hisat2-inspect.bat
+endif
+
+# This is helpful on Windows under MinGW/MSYS, where Make might go for
+# the Windows FIND tool instead.
+FIND=$(shell which find)
+
+SRC_PKG_LIST = $(wildcard *.h) \
+ $(wildcard *.hh) \
+ $(wildcard *.c) \
+ $(wildcard *.cpp) \
+ $(HT2LIB_PKG_SRC) \
+ Makefile \
+ CMakeLists.txt \
+ $(GENERAL_LIST)
+
+BIN_PKG_LIST = $(GENERAL_LIST)
+
+.PHONY: all allall both both-debug repeat repeat-debug
+
+all: $(HISAT2_BIN_LIST)
+
+allall: $(HISAT2_BIN_LIST) $(HISAT2_BIN_LIST_AUX)
+
+both: hisat2-align-s hisat2-align-l hisat2-build-s hisat2-build-l
+
+both-debug: hisat2-align-s-debug hisat2-align-l-debug hisat2-build-s-debug hisat2-build-l-debug
+
+repeat: hisat2-repeat
+
+repeat-debug: hisat2-repeat-debug
+
+DEFS :=-fno-strict-aliasing \
+ -DHISAT2_VERSION="\"`cat HISAT2_VERSION`\"" \
+ -DBUILD_HOST="\"`hostname`\"" \
+ -DBUILD_TIME="\"`date`\"" \
+ -DCOMPILER_VERSION="\"`$(CXX) -v 2>&1 | tail -1`\"" \
+ $(FILE_FLAGS) \
+ $(PREF_DEF) \
+ $(MM_DEF) \
+ $(SHMEM_DEF)
+
+#
+# hisat-bp targets
+#
+
+hisat-bp-bin: hisat_bp.cpp $(SEARCH_CPPS) $(SHARED_CPPS) $(HEADERS) $(SEARCH_FRAGMENTS)
+ $(CXX) $(RELEASE_FLAGS) $(RELEASE_DEFS) $(EXTRA_FLAGS) \
+ $(DEFS) -DBOWTIE2 $(NOASSERT_FLAGS) -Wall \
+ $(INC) \
+ -o $@ $< \
+ $(SHARED_CPPS) $(HISAT_CPPS_MAIN) \
+ $(LIBS) $(SEARCH_LIBS)
+
+hisat-bp-bin-debug: hisat_bp.cpp $(SEARCH_CPPS) $(SHARED_CPPS) $(HEADERS) $(SEARCH_FRAGMENTS)
+ $(CXX) $(DEBUG_FLAGS) \
+ $(DEBUG_DEFS) $(EXTRA_FLAGS) \
+ $(DEFS) -DBOWTIE2 -Wall \
+ $(INC) \
+ -o $@ $< \
+ $(SHARED_CPPS) $(HISAT_CPPS_MAIN) \
+ $(LIBS) $(SEARCH_LIBS)
+
+#
+# hisat2-repeat targets
+#
+
+hisat2-repeat: hisat2_repeat.cpp $(REPEAT_CPPS) $(SHARED_CPPS) $(HEADERS)
+ $(CXX) $(RELEASE_FLAGS) $(RELEASE_DEFS) $(EXTRA_FLAGS) \
+ $(DEFS) -DBOWTIE2 -DBOWTIE_64BIT_INDEX $(NOASSERT_FLAGS) -Wall \
+ $(INC) \
+ -o $@ $< \
+ $(SHARED_CPPS) $(HISAT2_REPEAT_CPPS_MAIN) \
+ $(LIBS) $(BUILD_LIBS)
+
+hisat2-repeat-debug: hisat2_repeat.cpp $(REPEAT_CPPS) $(SHARED_CPPS) $(HEADERS)
+ $(CXX) $(DEBUG_FLAGS) $(DEBUG_DEFS) $(EXTRA_FLAGS) \
+ $(DEFS) -DBOWTIE2 -DBOWTIE_64BIT_INDEX -Wall \
+ $(INC) \
+ -o $@ $< \
+ $(SHARED_CPPS) $(HISAT2_REPEAT_CPPS_MAIN) \
+ $(LIBS) $(BUILD_LIBS)
+
+
+#
+# hisat2-build targets
+#
+
+hisat2-build-s: hisat2_build.cpp $(SHARED_CPPS) $(HEADERS)
+ $(CXX) $(RELEASE_FLAGS) $(RELEASE_DEFS) $(EXTRA_FLAGS) \
+ $(DEFS) -DBOWTIE2 $(NOASSERT_FLAGS) -Wall -DMASSIVE_DATA_RLCSA \
+ $(INC) \
+ -o $@ $< \
+ $(SHARED_CPPS) $(HISAT2_BUILD_CPPS_MAIN) \
+ $(LIBS) $(BUILD_LIBS)
+
+hisat2-build-l: hisat2_build.cpp $(SHARED_CPPS) $(HEADERS)
+ $(CXX) $(RELEASE_FLAGS) $(RELEASE_DEFS) $(EXTRA_FLAGS) \
+ $(DEFS) -DBOWTIE2 -DBOWTIE_64BIT_INDEX $(NOASSERT_FLAGS) -Wall \
+ $(INC) \
+ -o $@ $< \
+ $(SHARED_CPPS) $(HISAT2_BUILD_CPPS_MAIN) \
+ $(LIBS) $(BUILD_LIBS)
+
+hisat2-build-s-debug: hisat2_build.cpp $(SHARED_CPPS) $(HEADERS)
+ $(CXX) $(DEBUG_FLAGS) $(DEBUG_DEFS) $(EXTRA_FLAGS) \
+ $(DEFS) -DBOWTIE2 -Wall -DMASSIVE_DATA_RLCSA \
+ $(INC) \
+ -o $@ $< \
+ $(SHARED_CPPS) $(HISAT2_BUILD_CPPS_MAIN) \
+ $(LIBS) $(BUILD_LIBS)
+
+hisat2-build-l-debug: hisat2_build.cpp $(SHARED_CPPS) $(HEADERS)
+ $(CXX) $(DEBUG_FLAGS) $(DEBUG_DEFS) $(EXTRA_FLAGS) \
+ $(DEFS) -DBOWTIE2 -DBOWTIE_64BIT_INDEX -Wall \
+ $(INC) \
+ -o $@ $< \
+ $(SHARED_CPPS) $(HISAT2_BUILD_CPPS_MAIN) \
+ $(LIBS) $(BUILD_LIBS)
+
+#
+# hisat2 targets
+#
+
+hisat2-align-s: hisat2.cpp $(SEARCH_CPPS) $(SHARED_CPPS) $(HEADERS) $(SEARCH_FRAGMENTS)
+ $(CXX) $(RELEASE_FLAGS) $(RELEASE_DEFS) $(EXTRA_FLAGS) \
+ $(DEFS) $(SRA_DEF) -DBOWTIE2 $(NOASSERT_FLAGS) -Wall \
+ $(INC) $(SEARCH_INC) \
+ -o $@ $< \
+ $(SHARED_CPPS) $(HISAT2_CPPS_MAIN) \
+ $(LIBS) $(SRA_LIB) $(SEARCH_LIBS)
+
+hisat2-align-l: hisat2.cpp $(SEARCH_CPPS) $(SHARED_CPPS) $(HEADERS) $(SEARCH_FRAGMENTS)
+ $(CXX) $(RELEASE_FLAGS) $(RELEASE_DEFS) $(EXTRA_FLAGS) \
+ $(DEFS) $(SRA_DEF) -DBOWTIE2 -DBOWTIE_64BIT_INDEX $(NOASSERT_FLAGS) -Wall \
+ $(INC) $(SEARCH_INC) \
+ -o $@ $< \
+ $(SHARED_CPPS) $(HISAT2_CPPS_MAIN) \
+ $(LIBS) $(SRA_LIB) $(SEARCH_LIBS)
+
+hisat2-align-s-debug: hisat2.cpp $(SEARCH_CPPS) $(SHARED_CPPS) $(HEADERS) $(SEARCH_FRAGMENTS)
+ $(CXX) $(DEBUG_FLAGS) \
+ $(DEBUG_DEFS) $(EXTRA_FLAGS) \
+ $(DEFS) $(SRA_DEF) -DBOWTIE2 -Wall \
+ $(INC) $(SEARCH_INC) \
+ -o $@ $< \
+ $(SHARED_CPPS) $(HISAT2_CPPS_MAIN) \
+ $(LIBS) $(SRA_LIB) $(SEARCH_LIBS)
+
+hisat2-align-l-debug: hisat2.cpp $(SEARCH_CPPS) $(SHARED_CPPS) $(HEADERS) $(SEARCH_FRAGMENTS)
+ $(CXX) $(DEBUG_FLAGS) \
+ $(DEBUG_DEFS) $(EXTRA_FLAGS) \
+ $(DEFS) $(SRA_DEF) -DBOWTIE2 -DBOWTIE_64BIT_INDEX -Wall \
+ $(INC) $(SEARCH_INC) \
+ -o $@ $< \
+ $(SHARED_CPPS) $(HISAT2_CPPS_MAIN) \
+ $(LIBS) $(SRA_LIB) $(SEARCH_LIBS)
+
+#
+# hisat2-inspect targets
+#
+
+hisat2-inspect-s: hisat2_inspect.cpp $(HEADERS) $(SHARED_CPPS)
+ $(CXX) $(RELEASE_FLAGS) \
+ $(RELEASE_DEFS) $(EXTRA_FLAGS) \
+ $(DEFS) -DBOWTIE2 -DHISAT2_INSPECT_MAIN -Wall \
+ $(INC) -I . \
+ -o $@ $< \
+ $(SHARED_CPPS) \
+ $(LIBS) $(INSPECT_LIBS)
+
+hisat2-inspect-l: hisat2_inspect.cpp $(HEADERS) $(SHARED_CPPS)
+ $(CXX) $(RELEASE_FLAGS) \
+ $(RELEASE_DEFS) $(EXTRA_FLAGS) \
+ $(DEFS) -DBOWTIE2 -DBOWTIE_64BIT_INDEX -DHISAT2_INSPECT_MAIN -Wall \
+ $(INC) -I . \
+ -o $@ $< \
+ $(SHARED_CPPS) \
+ $(LIBS) $(INSPECT_LIBS)
+
+hisat2-inspect-s-debug: hisat2_inspect.cpp $(HEADERS) $(SHARED_CPPS)
+ $(CXX) $(DEBUG_FLAGS) \
+ $(DEBUG_DEFS) $(EXTRA_FLAGS) \
+ $(DEFS) -DBOWTIE2 -DHISAT2_INSPECT_MAIN -Wall \
+ $(INC) -I . \
+ -o $@ $< \
+ $(SHARED_CPPS) \
+ $(LIBS) $(INSPECT_LIBS)
+
+hisat2-inspect-l-debug: hisat2_inspect.cpp $(HEADERS) $(SHARED_CPPS)
+ $(CXX) $(DEBUG_FLAGS) \
+ $(DEBUG_DEFS) $(EXTRA_FLAGS) \
+ $(DEFS) -DBOWTIE2 -DBOWTIE_64BIT_INDEX -DHISAT2_INSPECT_MAIN -Wall \
+ $(INC) -I . \
+ -o $@ $< \
+ $(SHARED_CPPS) \
+ $(LIBS) $(INSPECT_LIBS)
+
+#
+# hisat-3n-table targets
+#
+
+hisat-3n-table: hisat_3n_table.cpp $(THREE_N_HEADERS)
+ $(CXX) $(RELEASE_FLAGS) $(RELEASE_DEFS) $(EXTRA_FLAGS) $(NOASSERT_FLAGS) $(DEFS) -pthread -o $@ $<
+
+#
+# HT2LIB targets
+#
+
+ht2lib: libhisat2lib-debug.a libhisat2lib.a libhisat2lib-debug.so libhisat2lib.so
+
+libhisat2lib-debug.a: $(HT2LIB_DEBUG_OBJS)
+ ar rc $@ $(HT2LIB_DEBUG_OBJS)
+
+libhisat2lib.a: $(HT2LIB_RELEASE_OBJS)
+ ar rc $@ $(HT2LIB_RELEASE_OBJS)
+
+libhisat2lib-debug.so: $(HT2LIB_SHARED_DEBUG_OBJS)
+ $(CXX) $(DEBUG_FLAGS) $(DEBUG_DEFS) $(EXTRA_FLAGS) $(DEFS) $(SRA_DEF) -DBOWTIE2 -Wall $(INC) $(SEARCH_INC) \
+ -shared -o $@ $(HT2LIB_SHARED_DEBUG_OBJS) $(LIBS) $(SRA_LIB) $(SEARCH_LIBS)
+
+libhisat2lib.so: $(HT2LIB_SHARED_RELEASE_OBJS)
+ $(CXX) $(RELEASE_FLAGS) $(RELEASE_DEFS) $(EXTRA_FLAGS) $(DEFS) $(SRA_DEF) -DBOWTIE2 $(NOASSERT_FLAGS) -Wall $(INC) $(SEARCH_INC)\
+ -shared -o $@ $(HT2LIB_SHARED_RELEASE_OBJS) $(LIBS) $(SRA_LIB) $(SEARCH_LIBS)
+
+.ht2lib-obj-debug/%.o: %.cpp
+ @mkdir -p $(dir $@)
+ $(CXX) -fPIC $(DEBUG_FLAGS) $(DEBUG_DEFS) $(EXTRA_FLAGS) $(DEFS) $(SRA_DEF) $(HT2LIB_FLAGS) -DBOWTIE2 -Wall $(INC) $(SEARCH_INC) \
+ -c -o $@ $<
+
+.ht2lib-obj-release/%.o: %.cpp
+ @mkdir -p $(dir $@)
+ $(CXX) -fPIC $(RELEASE_FLAGS) $(RELEASE_DEFS) $(EXTRA_FLAGS) $(DEFS) $(SRA_DEF) $(HT2LIB_FLAGS) -DBOWTIE2 $(NOASSERT_FLAGS) -Wall $(INC) $(SEARCH_INC) \
+ -c -o $@ $<
+
+.ht2lib-obj-debug-shared/%.o: %.cpp
+ @mkdir -p $(dir $@)
+ $(CXX) -fPIC $(DEBUG_FLAGS) $(DEBUG_DEFS) $(EXTRA_FLAGS) $(DEFS) $(SRA_DEF) $(HT2LIB_FLAGS) -DBOWTIE2 -Wall $(INC) $(SEARCH_INC) \
+ -c -o $@ $<
+
+.ht2lib-obj-release-shared/%.o: %.cpp
+ @mkdir -p $(dir $@)
+ $(CXX) -fPIC $(RELEASE_FLAGS) $(RELEASE_DEFS) $(EXTRA_FLAGS) $(DEFS) $(SRA_DEF) $(HT2LIB_FLAGS) -DBOWTIE2 $(NOASSERT_FLAGS) -Wall $(INC) $(SEARCH_INC) \
+ -c -o $@ $<
+
+#
+# repeatexp: links repeatexp.cpp against the static libhisat2lib.a
+#
+repeatexp: repeatexp.cpp libhisat2lib.a
+ $(CXX) -o $@ $< -I hisat2lib libhisat2lib.a
+
+hisat2: ;
+
+hisat2.bat:
+ echo "@echo off" > hisat2.bat
+ echo "perl %~dp0/hisat2 %*" >> hisat2.bat
+
+hisat2-build.bat:
+ echo "@echo off" > hisat2-build.bat
+ echo "python %~dp0/hisat2-build %*" >> hisat2-build.bat
+
+hisat2-inspect.bat:
+ echo "@echo off" > hisat2-inspect.bat
+ echo "python %~dp0/hisat2-inspect %*" >> hisat2-inspect.bat
+
+
+.PHONY: hisat2-src
+hisat2-src: $(SRC_PKG_LIST)
+ chmod a+x scripts/*.sh scripts/*.pl
+ rm -rf .src.tmp
+ mkdir -p .src.tmp/hisat2-$(VERSION)
+ zip tmp.zip $(SRC_PKG_LIST)
+ mv tmp.zip .src.tmp/hisat2-$(VERSION)
+ cd .src.tmp/hisat2-$(VERSION) ; unzip tmp.zip ; rm -f tmp.zip
+ cd .src.tmp ; zip -r hisat2-$(VERSION)-source.zip hisat2-$(VERSION)
+ cp .src.tmp/hisat2-$(VERSION)-source.zip .
+ rm -rf .src.tmp
+
+.PHONY: hisat2-bin
+hisat2-bin: $(BIN_PKG_LIST) $(HISAT2_BIN_LIST) $(HISAT2_BIN_LIST_AUX)
+ chmod a+x scripts/*.sh scripts/*.pl
+ rm -rf .bin.tmp
+ mkdir .bin.tmp
+ mkdir .bin.tmp/hisat2-$(VERSION)
+ if [ -f hisat2.exe ] ; then \
+ zip tmp.zip $(BIN_PKG_LIST) $(addsuffix .exe,$(HISAT2_BIN_LIST) $(HISAT2_BIN_LIST_AUX)) ; \
+ else \
+ zip tmp.zip $(BIN_PKG_LIST) $(HISAT2_BIN_LIST) $(HISAT2_BIN_LIST_AUX) ; \
+ fi
+ mv tmp.zip .bin.tmp/hisat2-$(VERSION)
+ cd .bin.tmp/hisat2-$(VERSION) ; unzip tmp.zip ; rm -f tmp.zip
+ cd .bin.tmp ; zip -r hisat2-$(VERSION)-$(BITS).zip hisat2-$(VERSION)
+ cp .bin.tmp/hisat2-$(VERSION)-$(BITS).zip .
+ rm -rf .bin.tmp
+
+.PHONY: doc
+doc: doc/manual.inc.html MANUAL
+
+doc/manual.inc.html: MANUAL.markdown
+ pandoc -T "HISAT2 Manual" -o $@ \
+ --from markdown --to HTML --toc $^
+ perl -i -ne \
+ '$$w=0 if m|^</body>|;print if $$w;$$w=1 if m|^<body>|;' $@
+
+MANUAL: MANUAL.markdown
+ perl doc/strip_markdown.pl < $^ > $@
+
+.PHONY: clean
+clean:
+ rm -f $(HISAT2_BIN_LIST) $(HISAT2_BIN_LIST_AUX) \
+ $(addsuffix .exe,$(HISAT2_BIN_LIST) $(HISAT2_BIN_LIST_AUX)) \
+ hisat2-src.zip hisat2-bin.zip
+ rm -f core.* .tmp.head
+ rm -rf *.dSYM
+ rm -rf .ht2lib-obj*
+ rm -f libhisat2lib*.a libhisat2lib*.so
+
+
+.PHONY: push-doc
+push-doc: doc/manual.inc.html
+ scp doc/*.*html doc/indexes.txt salz-dmz:/ccb/salz7-data/www/ccb.jhu.edu/html/software/hisat2/
diff --git a/NEWS b/NEWS
new file mode 100644
index 0000000..56be177
--- /dev/null
+++ b/NEWS
@@ -0,0 +1,16 @@
+HISAT 2 NEWS
+=============
+
+HISAT 2 is now available for download from the project website,
+http://bowtie-bio.sf.net/bowtie2. 2.0.0-beta is the first version released to
+the public and 2.0.7 is the latest version. HISAT 2 is licensed under
+the GPLv3 license. See `LICENSE' file for details.
+
+
+Version Release History
+=======================
+
+Version 2.0.0-beta - August XX, 2015
+ * Improved multithreading support so that Bowtie 2 now uses native Windows
+ threads when compiled on Windows and uses a faster mutex. Threading
+ performance should improve on all platforms.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..1278115
--- /dev/null
+++ b/README.md
@@ -0,0 +1,247 @@
+HISAT-3N
+============
+
+Overview
+-----------------
+HISAT-3N (hierarchical indexing for spliced alignment of transcripts - 3 nucleotides)
+is an ultrafast and memory-efficient sequence aligner designed for nucleotide conversion
+sequencing technologies. A HISAT-3N index contains two HISAT2 indexes and requires little memory:
+for the human genome, it requires 9 GB for the standard 3N-index and 10.5 GB for the repeat 3N-index.
+The repeat 3N-index can align a read to thousands of positions 3 times faster than the standard 3N-index.
+HISAT-3N is developed based on [HISAT2],
+which is particularly optimized for RNA sequencing technology. HISAT-3N supports both strand-specific and non-strand-specific reads.
+HISAT-3N can be used for any base-converted sequencing reads, including [BS-seq], [SLAM-seq], [scBS-seq], [scSLAM-seq], and [TAPS].
+See the [HISAT-3N] website for more information.
+
+[HISAT2]:https://github.com/DaehwanKimLab/hisat2
+[BS-seq]: https://en.wikipedia.org/wiki/Bisulfite_sequencing
+[SLAM-seq]: https://www.nature.com/articles/nmeth.4435
+[scBS-seq]: https://www.nature.com/articles/nmeth.3035
+[scSLAM-seq]: https://www.nature.com/articles/s41586-019-1369-y
+[TAPS]: https://www.nature.com/articles/s41587-019-0041-2
+[HISAT-3N]:https://daehwankimlab.github.io/hisat2/hisat-3n
+
+
+Getting started
+============
+HISAT-3N requires a 64-bit computer running either Linux or Mac OS X and at least 16 GB of RAM.
+
+A few notes:
+
+1. Building the standard 3N index requires 16GB of RAM or less.
+2. Building the repeat 3N index requires 256GB of RAM.
+3. The alignment process using either the standard or repeat index requires less than 16GB of RAM.
+4. [SAMtools] is required to sort SAM files in order to generate a HISAT-3N table.
+
+Install
+------------
+
+ git clone https://github.com/DaehwanKimLab/hisat2.git hisat-3n
+ cd hisat-3n
+ git checkout -b hisat-3n origin/hisat-3n
+ make
+
+Build a HISAT-3N index with `hisat-3n-build`
+-----------
+`hisat-3n-build` builds a 3N-index, which contains two hisat2 indexes, from a set of DNA sequences. For standard 3N-index,
+each index contains 16 files with suffix `.3n.*.*.ht2`.
+For repeat 3N-index, there are 16 more files in addition to the standard 3N-index, and they have the suffix
+`.3n.*.rep.*.ht2`.
+These files constitute the hisat-3n index and no other file is needed to align reads to the reference.
+
+* `--base-change <chr1,chr2>` argument is required for `hisat-3n-build` and `hisat-3n`.
+ Specify which base is converted to another base during the sequencing process. Please enter
+ 2 letters separated by ',' for this argument. The first letter (chr1) should be the base that is converted, and the second letter (chr2) should be
+ the base it is converted to. For example, during slam-seq, some 'T' is converted to 'C',
+ please enter `--base-change T,C`. During bisulfite-seq, some 'C' is converted to 'T', please enter `--base-change C,T`.
+* Different conversion types may build the same hisat-3n index. Please check the table below for more detail.
+ Once you build the hisat-3n index with C to T conversion (for example BS-seq),
+ you can align the T to C conversion reads (for example SLAM-seq reads) with the same index.
+
+
+| Conversion Types | HISAT-3N index suffix |
+|:----------------------------------:|:-----------------------------:|
+|C -> T<br>T -> C<br>A -> G<br>G -> A|.3n.CT.\*.ht2<br>.3n.GA.\*.ht2|
+|A -> C<br>C -> A<br>G -> T<br>T -> G|.3n.AC.\*.ht2<br>.3n.TG.\*.ht2|
+|A -> T<br>T -> A |.3n.AT.\*.ht2<br>.3n.TA.\*.ht2|
+|C -> G<br>G -> C |.3n.CG.\*.ht2<br>.3n.GC.\*.ht2|
+
+#### Examples:
+ # Build the standard HISAT-3N index (with C to T conversion):
+ hisat-3n-build --base-change C,T genome.fa genome
+
+ # Build the repeat HISAT-3N index (with T to C conversion, require 256 GB memory for human genome index):
+ hisat-3n-build --base-change T,C --repeat-index genome.fa genome
+
+It is optional to make the graph index and add SNP or splice site information to the index, to increase the alignment accuracy.
+The graph index building may require more memory than the linear index building.
+For more detail, please check the [HISAT2 manual].
+
+[HISAT2 manual]:https://daehwankimlab.github.io/hisat2/manual/
+
+#### Examples:
+ # Build the standard HISAT-3N integrated index with SNP information
+ hisat-3n-build --base-change C,T --snp genome.snp genome.fa genome
+
+ # Build the standard HISAT-3N integrated index with splice site information
+ hisat-3n-build --base-change C,T --ss genome.ss --exon genome.exon genome.fa genome
+
+ # Build the repeat HISAT-3N integrated index with SNP information
+ hisat-3n-build --base-change C,T --repeat-index --snp genome.snp genome.fa genome
+
+ # Build the repeat HISAT-3N integrated index with splice site information
+ hisat-3n-build --base-change C,T --repeat-index --ss genome.ss --exon genome.exon genome.fa genome
+
+
+Alignment with `hisat-3n`
+------------
+After building the HISAT-3N index, you are ready to use `hisat-3n` for alignment.
+HISAT-3N has the same set of parameters as in HISAT2 with some additional arguments. Please refer to the [HISAT2 manual] for more details.
+
+For the human reference genome, HISAT-3N requires about 9GB for alignment with the standard 3N-index and 10.5GB for the repeat 3N-index.
+
+* `--base-change <nt1,nt2>`
+ Specify the nucleotide conversion type (e.g., C to T in bisulfite-sequencing reads). The parameter option is two characters separated by ','. Type the original nucleotide for the first character (nt1) and type the converted nucleotide as the second character (nt2). For example, if performing [SLAM-seq] where some 'T's are converted to 'C's, input `--base-change T,C`.
+ As another example, if performing bisulfite-seq, where some 'C's are converted to 'T's, please input `--base-change C,T`.
+ If you want to align non-converted reads to the regular HISAT2 index, then omit this command.
+
+* `--index/-x <hisat-3n-idx>`
+ Specify the index file basename for HISAT-3N. The basename is the name of the index files up to but not including the suffix `.3n.*.*.ht2` / etc.
+ For example, if you build your index with basename 'genome' using `hisat-3n-build`, please input `--index genome`.
+
+* `--directional-mapping`
+ Make directional mapping. Please use this option only if your sequencing reads are generated from a strand-specific library.
+ The directional mapping mode is about 2x faster than the standard (non-directional) mapping mode.
+
+* `--repeat-limit <int>`
+ Set the number of alignments to be checked for each repeat alignment. You may increase the number to direct hisat-3n
+ to output more alignments if a read has multiple mapping locations. We suggest that you limit the repeat number for paired-end read alignment to no more
+ than 1,000,000. Default: 1000.
+
+* `--unique-only`
+ Only output uniquely aligned reads.
+
+
+#### Examples:
+* Single-end [SLAM-seq] read (T to C conversion) alignment with standard 3N-index:
+ `hisat-3n --index genome -f -U read.fa -S output.sam --base-change T,C`
+
+* Paired-end strand-specific bisulfite-seq read (C to T conversion) alignment with repeat 3N-index:
+ `hisat-3n --index genome -f -1 read_1.fa -2 read_2.fa -S output.sam --base-change C,T --directional-mapping`
+
+* Single-end TAPS reads (C to T conversion) alignment with repeat 3N-index and only output unique aligned results:
+ `hisat-3n --index genome -q -U read.fq -S output.sam --base-change C,T --unique-only`
+
+
+
+#### Extra SAM tags generated by HISAT-3N:
+
+* `Yf:i:`: Number of conversions detected in the read.
+* `Zf:i:`: Number of unconverted bases detected in the read. Yf + Zf = total number of bases which can be converted in the read sequence.
+* `YZ:A:`: The value `+` or `-` indicates the read is mapped to REF-3N (`+`) or REF-RC-3N (`-`), respectively.
+
+Generate a 3N-conversion-table with `hisat-3n-table`
+------------
+### Preparation
+
+To generate a 3N-conversion-table, users need to sort the `hisat-3n` generated SAM alignment file.
+
+[SAMtools] is required for this sorting process.
+
+Use `samtools sort` to convert the SAM file into a sorted SAM file.
+
+ samtools sort output.sam -o output_sorted.sam -O sam
+
+Generate 3N-conversion-table with `hisat-3n-table`:
+
+### Usage
+ hisat-3n-table [options]* --alignments <alignmentFile> --ref <refFile> --base-change <char1,char2>
+
+#### Main arguments
+* `--alignments <alignmentFile>`
+ SORTED SAM file. Please enter `-` for standard input.
+
+* `--ref <refFile>`
+ The reference genome file (FASTA format) for generating HISAT-3N index.
+
+* `--output-name <outputFile>`
+ Filename to write 3N-conversion-table (tsv format) to. By default, table is written to the "standard out" or "stdout" filehandle (i.e. the console).
+
+* `--base-change <char1,char2>`
+ The base-change rule. Enter exactly the same `--base-change` argument that was used with hisat-3n.
+ For example, please enter `--base-change C,T` for bisulfite sequencing reads.
+
+#### Input options
+* `-u/--unique-only`
+ Only count the unique aligned reads into 3N-conversion-table.
+
+* `-m/--multiple-only`
+ Only count the multiple aligned reads into 3N-conversion-table.
+
+* `-c/--CG-only`
+ Only count the CpG sites in reference genome. This option is designed for bisulfite sequencing reads.
+
+* `--added-chrname`
+ Please add this option if you use `--add-chrname` during `hisat-3n` alignment.
+ During `hisat-3n` alignment, the prefix "chr" is added in front of the chromosome name and shown in the SAM output when the user chooses `--add-chrname`.
+ `hisat-3n-table` cannot find the chromosome name on reference because it has an additional "chr" prefix. This option is to help `hisat-3n-table`
+ find the matching chromosome name on reference file. The 3n-table provides the same chromosome name as SAM file.
+
+* `--removed-chrname`
+ Please add this option if you use `--remove-chrname` during `hisat-3n` alignment.
+ During `hisat-3n` alignment, the prefix "chr" is removed from the front of the chromosome name in the SAM output when the user chooses `--remove-chrname`.
+ `hisat-3n-table` cannot find the chromosome name on reference because it has no "chr" prefix. This option is to help `hisat-3n-table`
+ find the matching chromosome name on reference file. The 3n-table provides the same chromosome name as SAM file.
+
+#### Other options:
+* `-p/--threads <int>`
+ Launch `int` parallel threads (default: 1) for table building.
+
+* `-h/--help`
+ Print usage information and quit.
+
+#### Examples:
+ # Generate the 3N-conversion-table for bisulfite sequencing data:
+ hisat-3n-table -p 16 --alignments sorted_alignment_result.sam --ref genome.fa --output-name output.tsv --base-change C,T
+
+ # Generate the 3N-conversion-table for TAPS data and only count base in CpG site and uniquely aligned:
+ hisat-3n-table -p 16 --alignments sorted_alignment_result.sam --ref genome.fa --output-name output.tsv --base-change C,T --CG-only --unique-only
+
+ # Generate the 3N-conversion-table for bisulfite sequencing data from sorted BAM file:
+ samtools view -h sorted_alignment_result.bam | hisat-3n-table --ref genome.fa --alignments - --output-name output.tsv --base-change C,T
+
+ # Generate the 3N-conversion-table for bisulfite sequencing data from unsorted BAM file:
+ samtools sort alignment_result.bam -O sam | hisat-3n-table --ref genome.fa --alignments - --output-name output.tsv --base-change C,T
+
+
+#### Note:
+There are 7 columns in the 3N-conversion-table:
+
+1. `ref`: the chromosome name.
+2. `pos`: 1-based position in `ref`.
+3. `strand`: '+' for forward strand. '-' for reverse strand.
+4. `convertedBaseQualities`: the qualities of the converted bases in read-level measurement. The length of this string is equal to the number of converted bases.
+5. `convertedBaseCount`: the number of distinct read positions where converted bases in read-level measurements were found.
+ this number is equal to the length of convertedBaseQualities.
+6. `unconvertedBaseQualities`: the qualities of the unconverted bases in read-level measurement. The length of this string is equal to the number of unconverted bases in read-level measurement.
+7. `unconvertedBaseCount`: the number of distinct read positions where unconverted bases in read-level measurements were found.
+ this number is equal to the length of unconvertedBaseQualities.
+
+##### Sample 3N-conversion-table:
+ ref pos strand convertedBaseQualities convertedBaseCount unconvertedBaseQualities unconvertedBaseCount
+ 1 11874 + FFFFFB
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "aligner_bt.h"
+#include "mask.h"
+
+using namespace std;
+
+#define CHECK_ROW_COL(rowc, colc) \
+ if(rowc >= 0 && colc >= 0) { \
+ if(!sawcell_[colc].insert(rowc)) { \
+ /* was already in there */ \
+ abort = true; \
+ return; \
+ } \
+ assert(local || prob_.cper_->debugCell(rowc, colc, hefc)); \
+ }
+
+/**
+ * Fill in a triangle of the DP table and backtrace from the given cell to
+ * a cell in the previous checkpoint, or to the terminal cell.
+ *
+ * The function works in two phases. First it recomputes, into tri_, the
+ * diagonals leading up to and including the diagonal of the target cell;
+ * each in-bounds cell gets its H, E and F scores (sc[0], sc[1], sc[2]) and
+ * a bitmask of the moves that lead into it (sc[3]). Then it walks
+ * backwards from (rw, cl), recording mismatches and gaps as branches in
+ * bs_, until one of the following happens:
+ *
+ * - CHECK_ROW_COL finds the cell already recorded in sawcell_: abort is
+ * set to true and the function returns without writing the other
+ * out-parameters;
+ * - the walk leaves the table (row or column < 0) or, for local
+ * alignment, lands on a cell of this triangle whose H score is 0: done
+ * is set to true and the current branch is passed to addSolution();
+ * - the walk crosses into an earlier triangle: row_new, col_new, hef_new
+ * and targ_new are set to describe that cell and done is set to false
+ * (or to true, with addSolution(), when local and targ_new is 0).
+ *
+ * targ and targ_final are only read by assertions in this function, and
+ * rnd is not referenced in this function body.
+ */
+void BtBranchTracer::triangleFill(
+ int64_t rw, // row of cell to backtrace from
+ int64_t cl, // column of cell to backtrace from
+ int hef, // cell to backtrace from is H (0), E (1), or F (2)
+ TAlScore targ, // score of cell to backtrace from (assert-only here)
+ TAlScore targ_final, // score of alignment we're looking for (assert-only here)
+ RandomSource& rnd, // pseudo-random generator (not referenced in this body)
+ int64_t& row_new, // out: row we ended up in after backtrace
+ int64_t& col_new, // out: column we ended up in after backtrace
+ int& hef_new, // out: H/E/F after backtrace
+ TAlScore& targ_new, // out: score up to cell we ended up in
+ bool& done, // out: finished tracing out an alignment?
+ bool& abort) // out: aborted b/c cell was seen before?
+{
+ assert_geq(rw, 0);
+ assert_geq(cl, 0);
+ assert_range(0, 2, hef);
+ assert_lt(rw, (int64_t)prob_.qrylen_);
+ assert_lt(cl, (int64_t)prob_.reflen_);
+ assert(prob_.usecp_ && prob_.fill_);
+ int64_t row = rw, col = cl;
+ const int64_t colmin = 0;
+ const int64_t rowmin = 0;
+ const int64_t colmax = prob_.reflen_ - 1;
+ const int64_t rowmax = prob_.qrylen_ - 1;
+ assert_leq(prob_.reflen_, (TRefOff)sawcell_.size());
+ assert_leq(col, (int64_t)prob_.cper_->hicol());
+ assert_geq(col, (int64_t)prob_.cper_->locol());
+ assert_geq(prob_.cper_->per(), 2);
+ size_t mod = (row + col) & prob_.cper_->lomask();
+ assert_lt(mod, prob_.cper_->per());
+ // Allocate room for diags (mod+1 of them; the last one contains the target cell)
+ size_t depth = mod+1;
+ assert_leq(depth, prob_.cper_->per());
+ size_t breadth = depth;
+ tri_.resize(depth);
+ // Allocate room for each diag (diag i holds breadth - i cells)
+ for(size_t i = 0; i < depth; i++) {
+ tri_[i].resize(breadth - i);
+ }
+ bool upperleft = false;
+ size_t off = (row + col) >> prob_.cper_->perpow2();
+ if(off == 0) {
+ upperleft = true;
+ } else {
+ off--;
+ }
+ const TAlScore sc_rdo = prob_.sc_->readGapOpen();
+ const TAlScore sc_rde = prob_.sc_->readGapExtend();
+ const TAlScore sc_rfo = prob_.sc_->refGapOpen();
+ const TAlScore sc_rfe = prob_.sc_->refGapExtend();
+ const bool local = !prob_.sc_->monotone;
+ int64_t row_lo = row - (int64_t)mod;
+ const CpQuad *prev2 = NULL, *prev1 = NULL;
+ if(!upperleft) {
+ // Read-only pointer to cells in diagonal -2. Start one row above the
+ // target row.
+ prev2 = prob_.cper_->qdiag1sPtr() + (off * prob_.cper_->nrow() + row_lo - 1);
+ // Read-only pointer to cells in diagonal -1. Start one row above the
+ // target row
+ prev1 = prob_.cper_->qdiag2sPtr() + (off * prob_.cper_->nrow() + row_lo - 1);
+#ifndef NDEBUG
+ if(row >= (int64_t)mod) {
+ size_t rowc = row - mod, colc = col;
+ if(rowc > 0 && prob_.cper_->isCheckpointed(rowc-1, colc)) {
+ TAlScore al = prev1[0].sc[0];
+ if(al == MIN_I16) al = MIN_I64;
+ assert_eq(prob_.cper_->scoreTriangle(rowc-1, colc, 0), al);
+ }
+ if(rowc > 0 && colc > 0 && prob_.cper_->isCheckpointed(rowc-1, colc-1)) {
+ TAlScore al = prev2[0].sc[0];
+ if(al == MIN_I16) al = MIN_I64;
+ assert_eq(prob_.cper_->scoreTriangle(rowc-1, colc-1, 0), al);
+ }
+ }
+#endif
+ }
+ // Pointer to cells in current diagonal
+ // For each diagonal we need to fill in
+ for(size_t i = 0; i < depth; i++) {
+ CpQuad * cur = tri_[i].ptr();
+ CpQuad * curc = cur;
+ size_t doff = mod - i; // # diagonals we are away from target diag
+ //assert_geq(row, (int64_t)doff);
+ int64_t rowc = row - doff;
+ int64_t colc = col;
+ size_t neval = 0; // # cells evaluated in this diag
+ ASSERT_ONLY(const CpQuad *last = NULL);
+ // Fill this diagonal from upper right to lower left
+ for(size_t j = 0; j < breadth; j++) {
+ if(rowc >= rowmin && rowc <= rowmax &&
+ colc >= colmin && colc <= colmax)
+ {
+ neval++;
+ int64_t fromend = prob_.qrylen_ - rowc - 1;
+ bool allowGaps = fromend >= prob_.sc_->gapbar && rowc >= prob_.sc_->gapbar;
+ // Fill this cell
+ // Some things we might want to calculate about this cell up front:
+ // 1. How many matches are possible from this cell to the cell in
+ // row, col, in case this allows us to prune
+ // Get character from read
+ int qc = prob_.qry_[rowc];
+ // Get quality value from read
+ int qq = prob_.qual_[rowc];
+ assert_geq(qq, 33);
+ // Get character from reference
+ int rc = prob_.ref_[colc];
+ assert_range(0, 16, rc);
+ int16_t sc_diag = prob_.sc_->score(qc, rc, qq - 33);
+ int16_t sc_h_up = MIN_I16;
+ int16_t sc_f_up = MIN_I16;
+ int16_t sc_h_lf = MIN_I16;
+ int16_t sc_e_lf = MIN_I16;
+ if(allowGaps) {
+ if(rowc > 0) {
+ assert(local || prev1[j+0].sc[2] < 0);
+ if(prev1[j+0].sc[0] > MIN_I16) {
+ sc_h_up = prev1[j+0].sc[0] - sc_rfo;
+ if(local) sc_h_up = max(sc_h_up, 0);
+ }
+ if(prev1[j+0].sc[2] > MIN_I16) {
+ sc_f_up = prev1[j+0].sc[2] - sc_rfe;
+ if(local) sc_f_up = max(sc_f_up, 0);
+ }
+#ifndef NDEBUG
+ TAlScore hup = prev1[j+0].sc[0];
+ TAlScore fup = prev1[j+0].sc[2];
+ if(hup == MIN_I16) hup = MIN_I64;
+ if(fup == MIN_I16) fup = MIN_I64;
+ if(local) {
+ hup = max(hup, 0);
+ fup = max(fup, 0);
+ }
+ if(prob_.cper_->isCheckpointed(rowc-1, colc)) {
+ assert_eq(hup, prob_.cper_->scoreTriangle(rowc-1, colc, 0));
+ assert_eq(fup, prob_.cper_->scoreTriangle(rowc-1, colc, 2));
+ }
+#endif
+ }
+ if(colc > 0) {
+ assert(local || prev1[j+1].sc[1] < 0);
+ if(prev1[j+1].sc[0] > MIN_I16) {
+ sc_h_lf = prev1[j+1].sc[0] - sc_rdo;
+ if(local) sc_h_lf = max(sc_h_lf, 0);
+ }
+ if(prev1[j+1].sc[1] > MIN_I16) {
+ sc_e_lf = prev1[j+1].sc[1] - sc_rde;
+ if(local) sc_e_lf = max(sc_e_lf, 0);
+ }
+#ifndef NDEBUG
+ TAlScore hlf = prev1[j+1].sc[0];
+ TAlScore elf = prev1[j+1].sc[1];
+ if(hlf == MIN_I16) hlf = MIN_I64;
+ if(elf == MIN_I16) elf = MIN_I64;
+ if(local) {
+ hlf = max(hlf, 0);
+ elf = max(elf, 0);
+ }
+ if(prob_.cper_->isCheckpointed(rowc, colc-1)) {
+ assert_eq(hlf, prob_.cper_->scoreTriangle(rowc, colc-1, 0));
+ assert_eq(elf, prob_.cper_->scoreTriangle(rowc, colc-1, 1));
+ }
+#endif
+ }
+ }
+ assert(rowc <= 1 || colc <= 0 || prev2 != NULL);
+ int16_t sc_h_dg = ((rowc > 0 && colc > 0) ? prev2[j+0].sc[0] : 0);
+ if(colc == 0 && rowc > 0 && !local) {
+ sc_h_dg = MIN_I16;
+ }
+ if(sc_h_dg > MIN_I16) {
+ sc_h_dg += sc_diag;
+ }
+ if(local) sc_h_dg = max(sc_h_dg, 0);
+ // cerr << sc_diag << " " << sc_h_dg << " " << sc_h_up << " " << sc_f_up << " " << sc_h_lf << " " << sc_e_lf << endl;
+ int mask = 0;
+ // Calculate best ways into H, E, F cells starting with H. NOTE(review): the candidate scores are int16_t, so if MIN_I64 is the 64-bit minimum the "> MIN_I64" tests below can never fail; MIN_I16 may have been intended -- confirm before changing.
+ // Mask bits:
+ // H: 1=diag, 2=hhoriz, 4=ehoriz, 8=hvert, 16=fvert
+ // E: 32=hhoriz, 64=ehoriz
+ // F: 128=hvert, 256=fvert
+ int16_t sc_best = sc_h_dg;
+ if(sc_h_dg > MIN_I64) {
+ mask = 1;
+ }
+ if(colc > 0 && sc_h_lf >= sc_best && sc_h_lf > MIN_I64) {
+ if(sc_h_lf > sc_best) mask = 0;
+ mask |= 2;
+ sc_best = sc_h_lf;
+ }
+ if(colc > 0 && sc_e_lf >= sc_best && sc_e_lf > MIN_I64) {
+ if(sc_e_lf > sc_best) mask = 0;
+ mask |= 4;
+ sc_best = sc_e_lf;
+ }
+ if(rowc > 0 && sc_h_up >= sc_best && sc_h_up > MIN_I64) {
+ if(sc_h_up > sc_best) mask = 0;
+ mask |= 8;
+ sc_best = sc_h_up;
+ }
+ if(rowc > 0 && sc_f_up >= sc_best && sc_f_up > MIN_I64) {
+ if(sc_f_up > sc_best) mask = 0;
+ mask |= 16;
+ sc_best = sc_f_up;
+ }
+ // Calculate best way into E cell
+ int16_t sc_e_best = sc_h_lf;
+ if(colc > 0) {
+ if(sc_h_lf >= sc_e_lf && sc_h_lf > MIN_I64) {
+ if(sc_h_lf == sc_e_lf) {
+ mask |= 64;
+ }
+ mask |= 32;
+ } else if(sc_e_lf > MIN_I64) {
+ sc_e_best = sc_e_lf;
+ mask |= 64;
+ }
+ }
+ if(sc_e_best > sc_best) {
+ sc_best = sc_e_best;
+ mask &= ~31; // don't go diagonal
+ }
+ // Calculate best way into F cell
+ int16_t sc_f_best = sc_h_up;
+ if(rowc > 0) {
+ if(sc_h_up >= sc_f_up && sc_h_up > MIN_I64) {
+ if(sc_h_up == sc_f_up) {
+ mask |= 256;
+ }
+ mask |= 128;
+ } else if(sc_f_up > MIN_I64) {
+ sc_f_best = sc_f_up;
+ mask |= 256;
+ }
+ }
+ if(sc_f_best > sc_best) {
+ sc_best = sc_f_best;
+ mask &= ~127; // don't go horizontal or diagonal
+ }
+ // Install results in cur
+ assert(!prob_.sc_->monotone || sc_best <= 0);
+ assert(!prob_.sc_->monotone || sc_e_best <= 0);
+ assert(!prob_.sc_->monotone || sc_f_best <= 0);
+ curc->sc[0] = sc_best;
+ assert( local || sc_e_best < 0);
+ assert( local || sc_f_best < 0);
+ assert(!local || sc_e_best >= 0 || sc_e_best == MIN_I16);
+ assert(!local || sc_f_best >= 0 || sc_f_best == MIN_I16);
+ curc->sc[1] = sc_e_best;
+ curc->sc[2] = sc_f_best;
+ curc->sc[3] = mask;
+ // cerr << curc->sc[0] << " " << curc->sc[1] << " " << curc->sc[2] << " " << curc->sc[3] << endl;
+ ASSERT_ONLY(last = curc);
+#ifndef NDEBUG
+ if(prob_.cper_->isCheckpointed(rowc, colc)) {
+ if(local) {
+ sc_e_best = max(sc_e_best, 0);
+ sc_f_best = max(sc_f_best, 0);
+ }
+ TAlScore sc_best64 = sc_best; if(sc_best == MIN_I16) sc_best64 = MIN_I64;
+ TAlScore sc_e_best64 = sc_e_best; if(sc_e_best == MIN_I16) sc_e_best64 = MIN_I64;
+ TAlScore sc_f_best64 = sc_f_best; if(sc_f_best == MIN_I16) sc_f_best64 = MIN_I64;
+ assert_eq(prob_.cper_->scoreTriangle(rowc, colc, 0), sc_best64);
+ assert_eq(prob_.cper_->scoreTriangle(rowc, colc, 1), sc_e_best64);
+ assert_eq(prob_.cper_->scoreTriangle(rowc, colc, 2), sc_f_best64);
+ }
+#endif
+ }
+ // Update row, col
+ assert_lt(rowc, (int64_t)prob_.qrylen_);
+ rowc++;
+ colc--;
+ curc++;
+ } // for(size_t j = 0; j < breadth; j++)
+ if(i == depth-1) {
+ // Final iteration
+ assert(last != NULL);
+ assert_eq(1, neval);
+ assert_neq(0, last->sc[3]);
+ assert_eq(targ, last->sc[hef]);
+ } else {
+ breadth--;
+ prev2 = prev1 + 1;
+ prev1 = cur;
+ }
+ } // for(size_t i = 0; i < depth; i++)
+ //
+ // Now backtrack through the triangle. Abort as soon as we enter a cell
+ // that was visited by a previous backtrace.
+ // Moves are taken from the stored mask in a fixed order; rnd plays no part.
+ int64_t rowc = row, colc = col;
+ size_t curid;
+ int hefc = hef;
+ if(bs_.empty()) {
+ // Start an initial branch
+ CHECK_ROW_COL(rowc, colc);
+ curid = bs_.alloc();
+ assert_eq(0, curid);
+ Edit e;
+ bs_[curid].init(
+ prob_,
+ 0, // parent ID
+ 0, // penalty
+ 0, // score_en
+ rowc, // row
+ colc, // col
+ e, // edit
+ 0, // hef
+ true, // I am the root
+ false); // don't try to extend with exact matches
+ bs_[curid].len_ = 0;
+ } else {
+ curid = bs_.size()-1;
+ }
+ size_t idx_orig = (row + col) >> prob_.cper_->perpow2();
+ while(true) {
+ // What depth are we?
+ size_t mod = (rowc + colc) & prob_.cper_->lomask();
+ assert_lt(mod, prob_.cper_->per());
+ CpQuad * cur = tri_[mod].ptr();
+ int64_t row_off = rowc - row_lo - mod;
+ assert(!local || cur[row_off].sc[0] > 0);
+ assert_geq(row_off, 0);
+ int mask = cur[row_off].sc[3];
+ assert_gt(mask, 0);
+ int sel = -1;
+ // Select what type of move to make, which depends on whether we're
+ // currently in H, E, F (sel: 0=diag; 1,2=left from H; 3,4=up from H; 5,6=left from E; 7,8=up from F):
+ if(hefc == 0) {
+ if( (mask & 1) != 0) {
+ // diagonal
+ sel = 0;
+ } else if((mask & 8) != 0) {
+ // up to H
+ sel = 3;
+ } else if((mask & 16) != 0) {
+ // up to F
+ sel = 4;
+ } else if((mask & 2) != 0) {
+ // left to H
+ sel = 1;
+ } else if((mask & 4) != 0) {
+ // left to E
+ sel = 2;
+ }
+ } else if(hefc == 1) {
+ if( (mask & 32) != 0) {
+ // left to H
+ sel = 5;
+ } else if((mask & 64) != 0) {
+ // left to E
+ sel = 6;
+ }
+ } else {
+ assert_eq(2, hefc);
+ if( (mask & 128) != 0) {
+ // up to H
+ sel = 7;
+ } else if((mask & 256) != 0) {
+ // up to F
+ sel = 8;
+ }
+ }
+ assert_geq(sel, 0);
+ // Get character from read
+ int qc = prob_.qry_[rowc], qq = prob_.qual_[rowc];
+ // Get character from reference
+ int rc = prob_.ref_[colc];
+ assert_range(0, 16, rc);
+ // Now that we know what type of move to make, make it, updating our
+ // row and column and updating the branch.
+ if(sel == 0) {
+ assert_geq(rowc, 0);
+ assert_geq(colc, 0);
+ TAlScore scd = prob_.sc_->score(qc, rc, qq - 33);
+ if((rc & (1 << qc)) == 0) {
+ // Mismatch
+ size_t id = curid;
+ // Check if the previous branch was the initial (bottommost)
+ // branch with no matches. If so, the mismatch should be added
+ // to the initial branch, instead of starting a new branch.
+ bool empty = (bs_[curid].len_ == 0 && curid == 0);
+ if(!empty) {
+ id = bs_.alloc();
+ }
+ Edit e((int)rowc, mask2dna[rc], "ACGTN"[qc], EDIT_TYPE_MM);
+ assert_lt(scd, 0);
+ TAlScore score_en = bs_[curid].score_st_ + scd;
+ bs_[id].init(
+ prob_,
+ curid, // parent ID
+ -scd, // penalty
+ score_en, // score_en
+ rowc, // row
+ colc, // col
+ e, // edit
+ hefc, // hef
+ empty, // root?
+ false); // don't try to extend with exact matches
+ //assert(!local || bs_[id].score_st_ >= 0);
+ curid = id;
+ } else {
+ // Match
+ bs_[curid].score_st_ += prob_.sc_->match();
+ bs_[curid].len_++;
+ assert_leq((int64_t)bs_[curid].len_, bs_[curid].row_ + 1);
+ }
+ rowc--;
+ colc--;
+ assert(local || bs_[curid].score_st_ >= targ_final);
+ hefc = 0;
+ } else if((sel >= 1 && sel <= 2) || (sel >= 5 && sel <= 6)) {
+ assert_gt(colc, 0);
+ // Read gap
+ size_t id = bs_.alloc();
+ Edit e((int)rowc+1, mask2dna[rc], '-', EDIT_TYPE_READ_GAP);
+ TAlScore gapp = prob_.sc_->readGapOpen();
+ if(bs_[curid].len_ == 0 && bs_[curid].e_.inited() && bs_[curid].e_.isReadGap()) {
+ gapp = prob_.sc_->readGapExtend();
+ }
+ TAlScore score_en = bs_[curid].score_st_ - gapp;
+ bs_[id].init(
+ prob_,
+ curid, // parent ID
+ gapp, // penalty
+ score_en, // score_en
+ rowc, // row
+ colc-1, // col
+ e, // edit
+ hefc, // hef
+ false, // root?
+ false); // don't try to extend with exact matches
+ colc--;
+ curid = id;
+ assert( local || bs_[curid].score_st_ >= targ_final);
+ //assert(!local || bs_[curid].score_st_ >= 0);
+ if(sel == 1 || sel == 5) {
+ hefc = 0;
+ } else {
+ hefc = 1;
+ }
+ } else {
+ assert_gt(rowc, 0);
+ // Reference gap
+ size_t id = bs_.alloc();
+ Edit e((int)rowc, '-', "ACGTN"[qc], EDIT_TYPE_REF_GAP);
+ TAlScore gapp = prob_.sc_->refGapOpen();
+ if(bs_[curid].len_ == 0 && bs_[curid].e_.inited() && bs_[curid].e_.isRefGap()) {
+ gapp = prob_.sc_->refGapExtend();
+ }
+ TAlScore score_en = bs_[curid].score_st_ - gapp;
+ bs_[id].init(
+ prob_,
+ curid, // parent ID
+ gapp, // penalty
+ score_en, // score_en
+ rowc-1, // row
+ colc, // col
+ e, // edit
+ hefc, // hef
+ false, // root?
+ false); // don't try to extend with exact matches
+ rowc--;
+ curid = id;
+ //assert(!local || bs_[curid].score_st_ >= 0);
+ if(sel == 3 || sel == 7) {
+ hefc = 0;
+ } else {
+ hefc = 2;
+ }
+ }
+ CHECK_ROW_COL(rowc, colc);
+ size_t mod_new = (rowc + colc) & prob_.cper_->lomask();
+ size_t idx = (rowc + colc) >> prob_.cper_->perpow2();
+ assert_lt(mod_new, prob_.cper_->per());
+ int64_t row_off_new = rowc - row_lo - mod_new;
+ CpQuad * cur_new = NULL;
+ if(colc >= 0 && rowc >= 0 && idx == idx_orig) {
+ cur_new = tri_[mod_new].ptr();
+ }
+ bool hit_new_tri = (idx < idx_orig && colc >= 0 && rowc >= 0);
+ // Check whether we made it to the top row or to a cell with score 0
+ if(colc < 0 || rowc < 0 ||
+ (cur_new != NULL && (local && cur_new[row_off_new].sc[0] == 0)))
+ {
+ done = true;
+ assert(bs_[curid].isSolution(prob_));
+ addSolution(curid);
+#ifndef NDEBUG
+ // A check to see if any two adjacent branches in the backtrace
+ // overlap. If they do, the whole alignment will be filtered out
+ // in trySolution(...)
+ size_t cur = curid;
+ if(!bs_[cur].root_) {
+ size_t next = bs_[cur].parentId_;
+ while(!bs_[next].root_) {
+ assert_neq(cur, next);
+ if(bs_[next].len_ != 0 || bs_[cur].len_ == 0) {
+ assert(!bs_[cur].overlap(prob_, bs_[next]));
+ }
+ cur = next;
+ next = bs_[cur].parentId_;
+ }
+ }
+#endif
+ return;
+ }
+ if(hit_new_tri) {
+ assert(rowc < 0 || colc < 0 || prob_.cper_->isCheckpointed(rowc, colc));
+ row_new = rowc; col_new = colc;
+ hef_new = hefc;
+ done = false;
+ if(rowc < 0 || colc < 0) {
+ assert(local);
+ targ_new = 0;
+ } else {
+ targ_new = prob_.cper_->scoreTriangle(rowc, colc, hefc);
+ }
+ if(local && targ_new == 0) {
+ done = true;
+ assert(bs_[curid].isSolution(prob_));
+ addSolution(curid);
+ }
+ assert((row_new >= 0 && col_new >= 0) || done);
+ return;
+ }
+ }
+ assert(false);
+}
+
+#ifndef NDEBUG
+#define DEBUG_CHECK(ss, row, col, hef) { /* assert that score ss agrees with cper_->debugCell(row, col, hef) */ \
+ if(prob_.cper_->debug() && row >= 0 && col >= 0) { /* skipped unless cper_->debug() is true and row/col are non-negative */ \
+ TAlScore s = ss; \
+ if(s == MIN_I16) s = MIN_I64; /* map MIN_I16 to MIN_I64 before comparing */ \
+ if(local && s < 0) s = 0; /* when local is true, negative values compare as 0 */ \
+ TAlScore deb = prob_.cper_->debugCell(row, col, hef); \
+ if(local && deb < 0) deb = 0; \
+ assert_eq(s, deb); \
+ } \
+}
+#else
+#define DEBUG_CHECK(ss, row, col, hef) /* expands to nothing when NDEBUG is defined */
+#endif
+
+
+/**
+ * Fill in a square of the DP table and backtrace from the given cell to
+ * a cell in the previous checkpoint, or to the terminal cell.
+ */
+void BtBranchTracer::squareFill(
+ int64_t rw, // row of cell to backtrace from
+ int64_t cl, // column of cell to backtrace from
+ int hef, // cell to backtrace from is H (0), E (1), or F (2)
+ TAlScore targ, // score of cell to backtrace from
+ TAlScore targ_final, // score of alignment we're looking for
+ RandomSource& rnd, // pseudo-random generator
+ int64_t& row_new, // out: row we ended up in after backtrace
+ int64_t& col_new, // out: column we ended up in after backtrace
+ int& hef_new, // out: H/E/F after backtrace
+ TAlScore& targ_new, // out: score up to cell we ended up in
+ bool& done, // out: finished tracing out an alignment?
+ bool& abort) // out: aborted b/c cell was seen before?
+{
+ assert_geq(rw, 0);
+ assert_geq(cl, 0);
+ assert_range(0, 2, hef);
+ assert_lt(rw, (int64_t)prob_.qrylen_);
+ assert_lt(cl, (int64_t)prob_.reflen_);
+ assert(prob_.usecp_ && prob_.fill_);
+ const bool is8_ = prob_.cper_->is8_;
+ int64_t row = rw, col = cl;
+ assert_leq(prob_.reflen_, (TRefOff)sawcell_.size());
+ assert_leq(col, (int64_t)prob_.cper_->hicol());
+ assert_geq(col, (int64_t)prob_.cper_->locol());
+ assert_geq(prob_.cper_->per(), 2);
+ size_t xmod = col & prob_.cper_->lomask();
+ size_t ymod = row & prob_.cper_->lomask();
+ size_t xdiv = col >> prob_.cper_->perpow2();
+ size_t ydiv = row >> prob_.cper_->perpow2();
+ size_t sq_ncol = xmod+1, sq_nrow = ymod+1;
+ sq_.resize(sq_ncol * sq_nrow);
+ bool upper = ydiv == 0;
+ bool left = xdiv == 0;
+ const TAlScore sc_rdo = prob_.sc_->readGapOpen();
+ const TAlScore sc_rde = prob_.sc_->readGapExtend();
+ const TAlScore sc_rfo = prob_.sc_->refGapOpen();
+ const TAlScore sc_rfe = prob_.sc_->refGapExtend();
+ const bool local = !prob_.sc_->monotone;
+ const CpQuad *qup = NULL;
+ const __m128i *qlf = NULL;
+ size_t per = prob_.cper_->per_;
+ ASSERT_ONLY(size_t nrow = prob_.cper_->nrow());
+ size_t ncol = prob_.cper_->ncol();
+ assert_eq(prob_.qrylen_, nrow);
+ assert_eq(prob_.reflen_, (TRefOff)ncol);
+ size_t niter = prob_.cper_->niter_;
+ if(!upper) {
+ qup = prob_.cper_->qrows_.ptr() + (ncol * (ydiv-1)) + xdiv * per;
+ }
+ if(!left) {
+ // Set up the column pointers to point to the first __m128i word in the
+ // relevant column
+ size_t off = (niter << 2) * (xdiv-1);
+ qlf = prob_.cper_->qcols_.ptr() + off;
+ }
+ size_t xedge = xdiv * per; // absolute offset of leftmost cell in square
+ size_t yedge = ydiv * per; // absolute offset of topmost cell in square
+ size_t xi = xedge, yi = yedge; // iterators for columns, rows
+ size_t ii = 0; // iterator into packed square
+ // Iterate over rows, then over columns
+ size_t m128mod = yi % prob_.cper_->niter_;
+ size_t m128div = yi / prob_.cper_->niter_;
+ int16_t sc_h_dg_lastrow = MIN_I16;
+ for(size_t i = 0; i <= ymod; i++, yi++) {
+ assert_lt(yi, nrow);
+ xi = xedge;
+ // Handling for first column is done outside the loop
+ size_t fromend = prob_.qrylen_ - yi - 1;
+ bool allowGaps = fromend >= (size_t)prob_.sc_->gapbar && yi >= (size_t)prob_.sc_->gapbar;
+ // Get character, quality from read
+ int qc = prob_.qry_[yi], qq = prob_.qual_[yi];
+ assert_geq(qq, 33);
+ int16_t sc_h_lf_last = MIN_I16;
+ int16_t sc_e_lf_last = MIN_I16;
+ for(size_t j = 0; j <= xmod; j++, xi++) {
+ assert_lt(xi, ncol);
+ // Get character from reference
+ int rc = prob_.ref_[xi];
+ assert_range(0, 16, rc);
+ int16_t sc_diag = prob_.sc_->score(qc, rc, qq - 33);
+ int16_t sc_h_up = MIN_I16, sc_f_up = MIN_I16,
+ sc_h_lf = MIN_I16, sc_e_lf = MIN_I16,
+ sc_h_dg = MIN_I16;
+ int16_t sc_h_up_c = MIN_I16, sc_f_up_c = MIN_I16,
+ sc_h_lf_c = MIN_I16, sc_e_lf_c = MIN_I16,
+ sc_h_dg_c = MIN_I16;
+ if(yi == 0) {
+ // If I'm in the first first row or column set it to 0
+ sc_h_dg = 0;
+ } else if(xi == 0) {
+ // Do nothing; leave it at min
+ if(local) {
+ sc_h_dg = 0;
+ }
+ } else if(i == 0 && j == 0) {
+ // Otherwise, if I'm in the upper-left square corner, I can get
+ // it from the checkpoint
+ sc_h_dg = qup[-1].sc[0];
+ } else if(j == 0) {
+ // Otherwise, if I'm in the leftmost cell of this row, I can
+ // get it from sc_h_lf in first column of previous row
+ sc_h_dg = sc_h_dg_lastrow;
+ } else {
+ // Otherwise, I can get it from qup
+ sc_h_dg = qup[j-1].sc[0];
+ }
+ if(yi > 0 && xi > 0) DEBUG_CHECK(sc_h_dg, yi-1, xi-1, 2);
+
+ // If we're in the leftmost column, calculate sc_h_lf regardless of
+ // allowGaps.
+ if(j == 0 && xi > 0) {
+ // Get values for left neighbors from the checkpoint
+ if(is8_) {
+ size_t vecoff = (m128mod << 6) + m128div;
+ sc_e_lf = ((uint8_t*)(qlf + 0))[vecoff];
+ sc_h_lf = ((uint8_t*)(qlf + 2))[vecoff];
+ if(local) {
+ // No adjustment
+ } else {
+ if(sc_h_lf == 0) sc_h_lf = MIN_I16;
+ else sc_h_lf -= 0xff;
+ if(sc_e_lf == 0) sc_e_lf = MIN_I16;
+ else sc_e_lf -= 0xff;
+ }
+ } else {
+ size_t vecoff = (m128mod << 5) + m128div;
+ sc_e_lf = ((int16_t*)(qlf + 0))[vecoff];
+ sc_h_lf = ((int16_t*)(qlf + 2))[vecoff];
+ if(local) {
+ sc_h_lf += 0x8000; assert_geq(sc_h_lf, 0);
+ sc_e_lf += 0x8000; assert_geq(sc_e_lf, 0);
+ } else {
+ if(sc_h_lf != MIN_I16) sc_h_lf -= 0x7fff;
+ if(sc_e_lf != MIN_I16) sc_e_lf -= 0x7fff;
+ }
+ }
+ DEBUG_CHECK(sc_e_lf, yi, xi-1, 0);
+ DEBUG_CHECK(sc_h_lf, yi, xi-1, 2);
+ sc_h_dg_lastrow = sc_h_lf;
+ }
+
+ if(allowGaps) {
+ if(j == 0 /* at left edge */ && xi > 0 /* not extreme */) {
+ sc_h_lf_c = sc_h_lf;
+ sc_e_lf_c = sc_e_lf;
+ if(sc_h_lf_c != MIN_I16) sc_h_lf_c -= sc_rdo;
+ if(sc_e_lf_c != MIN_I16) sc_e_lf_c -= sc_rde;
+ assert_leq(sc_h_lf_c, prob_.cper_->perf_);
+ assert_leq(sc_e_lf_c, prob_.cper_->perf_);
+ } else if(xi > 0) {
+ // Get values for left neighbors from the previous iteration
+ if(sc_h_lf_last != MIN_I16) {
+ sc_h_lf = sc_h_lf_last;
+ sc_h_lf_c = sc_h_lf - sc_rdo;
+ }
+ if(sc_e_lf_last != MIN_I16) {
+ sc_e_lf = sc_e_lf_last;
+ sc_e_lf_c = sc_e_lf - sc_rde;
+ }
+ }
+ if(yi > 0 /* not extreme */) {
+ // Get column values
+ assert(qup != NULL);
+ assert(local || qup[j].sc[2] < 0);
+ if(qup[j].sc[0] > MIN_I16) {
+ DEBUG_CHECK(qup[j].sc[0], yi-1, xi, 2);
+ sc_h_up = qup[j].sc[0];
+ sc_h_up_c = sc_h_up - sc_rfo;
+ }
+ if(qup[j].sc[2] > MIN_I16) {
+ DEBUG_CHECK(qup[j].sc[2], yi-1, xi, 1);
+ sc_f_up = qup[j].sc[2];
+ sc_f_up_c = sc_f_up - sc_rfe;
+ }
+ }
+ if(local) {
+ sc_h_up_c = max(sc_h_up_c, 0);
+ sc_f_up_c = max(sc_f_up_c, 0);
+ sc_h_lf_c = max(sc_h_lf_c, 0);
+ sc_e_lf_c = max(sc_e_lf_c, 0);
+ }
+ }
+
+ if(sc_h_dg > MIN_I16) {
+ sc_h_dg_c = sc_h_dg + sc_diag;
+ }
+ if(local) sc_h_dg_c = max(sc_h_dg_c, 0);
+
+ int mask = 0;
+ // Calculate best ways into H, E, F cells starting with H.
+ // Mask bits:
+ // H: 1=diag, 2=hhoriz, 4=ehoriz, 8=hvert, 16=fvert
+ // E: 32=hhoriz, 64=ehoriz
+ // F: 128=hvert, 256=fvert
+ int16_t sc_best = sc_h_dg_c;
+ if(sc_h_dg_c > MIN_I64) {
+ mask = 1;
+ }
+ if(xi > 0 && sc_h_lf_c >= sc_best && sc_h_lf_c > MIN_I64) {
+ if(sc_h_lf_c > sc_best) mask = 0;
+ mask |= 2;
+ sc_best = sc_h_lf_c;
+ }
+ if(xi > 0 && sc_e_lf_c >= sc_best && sc_e_lf_c > MIN_I64) {
+ if(sc_e_lf_c > sc_best) mask = 0;
+ mask |= 4;
+ sc_best = sc_e_lf_c;
+ }
+ if(yi > 0 && sc_h_up_c >= sc_best && sc_h_up_c > MIN_I64) {
+ if(sc_h_up_c > sc_best) mask = 0;
+ mask |= 8;
+ sc_best = sc_h_up_c;
+ }
+ if(yi > 0 && sc_f_up_c >= sc_best && sc_f_up_c > MIN_I64) {
+ if(sc_f_up_c > sc_best) mask = 0;
+ mask |= 16;
+ sc_best = sc_f_up_c;
+ }
+ // Calculate best way into E cell
+ int16_t sc_e_best = sc_h_lf_c;
+ if(xi > 0) {
+ if(sc_h_lf_c >= sc_e_lf_c && sc_h_lf_c > MIN_I64) {
+ if(sc_h_lf_c == sc_e_lf_c) {
+ mask |= 64;
+ }
+ mask |= 32;
+ } else if(sc_e_lf_c > MIN_I64) {
+ sc_e_best = sc_e_lf_c;
+ mask |= 64;
+ }
+ }
+ if(sc_e_best > sc_best) {
+ sc_best = sc_e_best;
+ mask &= ~31; // don't go diagonal
+ }
+ // Calculate best way into F cell
+ int16_t sc_f_best = sc_h_up_c;
+ if(yi > 0) {
+ if(sc_h_up_c >= sc_f_up_c && sc_h_up_c > MIN_I64) {
+ if(sc_h_up_c == sc_f_up_c) {
+ mask |= 256;
+ }
+ mask |= 128;
+ } else if(sc_f_up_c > MIN_I64) {
+ sc_f_best = sc_f_up_c;
+ mask |= 256;
+ }
+ }
+ if(sc_f_best > sc_best) {
+ sc_best = sc_f_best;
+ mask &= ~127; // don't go horizontal or diagonal
+ }
+ // Install results in cur
+ assert( local || sc_best <= 0);
+ sq_[ii+j].sc[0] = sc_best;
+ assert( local || sc_e_best < 0);
+ assert( local || sc_f_best < 0);
+ assert(!local || sc_e_best >= 0 || sc_e_best == MIN_I16);
+ assert(!local || sc_f_best >= 0 || sc_f_best == MIN_I16);
+ sq_[ii+j].sc[1] = sc_e_best;
+ sq_[ii+j].sc[2] = sc_f_best;
+ sq_[ii+j].sc[3] = mask;
+ DEBUG_CHECK(sq_[ii+j].sc[0], yi, xi, 2); // H
+ DEBUG_CHECK(sq_[ii+j].sc[1], yi, xi, 0); // E
+ DEBUG_CHECK(sq_[ii+j].sc[2], yi, xi, 1); // F
+ // Update sc_h_lf_last, sc_e_lf_last
+ sc_h_lf_last = sc_best;
+ sc_e_lf_last = sc_e_best;
+ }
+ // Update m128mod, m128div
+ m128mod++;
+ if(m128mod == prob_.cper_->niter_) {
+ m128mod = 0;
+ m128div++;
+ }
+ // update qup
+ ii += sq_ncol;
+ // dimensions of sq_
+ qup = sq_.ptr() + sq_ncol * i;
+ }
+ assert_eq(targ, sq_[ymod * sq_ncol + xmod].sc[hef]);
+ //
+ // Now backtrack through the triangle. Abort as soon as we enter a cell
+ // that was visited by a previous backtrace.
+ //
+ int64_t rowc = row, colc = col;
+ size_t curid;
+ int hefc = hef;
+ if(bs_.empty()) {
+ // Start an initial branch
+ CHECK_ROW_COL(rowc, colc);
+ curid = bs_.alloc();
+ assert_eq(0, curid);
+ Edit e;
+ bs_[curid].init(
+ prob_,
+ 0, // parent ID
+ 0, // penalty
+ 0, // score_en
+ rowc, // row
+ colc, // col
+ e, // edit
+ 0, // hef
+ true, // root?
+ false); // don't try to extend with exact matches
+ bs_[curid].len_ = 0;
+ } else {
+ curid = bs_.size()-1;
+ }
+ size_t ymodTimesNcol = ymod * sq_ncol;
+ while(true) {
+ // What depth are we?
+ assert_eq(ymodTimesNcol, ymod * sq_ncol);
+ CpQuad * cur = sq_.ptr() + ymodTimesNcol + xmod;
+ int mask = cur->sc[3];
+ assert_gt(mask, 0);
+ int sel = -1;
+ // Select what type of move to make, which depends on whether we're
+ // currently in H, E, F:
+ if(hefc == 0) {
+ if( (mask & 1) != 0) {
+ // diagonal
+ sel = 0;
+ } else if((mask & 8) != 0) {
+ // up to H
+ sel = 3;
+ } else if((mask & 16) != 0) {
+ // up to F
+ sel = 4;
+ } else if((mask & 2) != 0) {
+ // left to H
+ sel = 1;
+ } else if((mask & 4) != 0) {
+ // left to E
+ sel = 2;
+ }
+ } else if(hefc == 1) {
+ if( (mask & 32) != 0) {
+ // left to H
+ sel = 5;
+ } else if((mask & 64) != 0) {
+ // left to E
+ sel = 6;
+ }
+ } else {
+ assert_eq(2, hefc);
+ if( (mask & 128) != 0) {
+ // up to H
+ sel = 7;
+ } else if((mask & 256) != 0) {
+ // up to F
+ sel = 8;
+ }
+ }
+ assert_geq(sel, 0);
+ // Get character from read
+ int qc = prob_.qry_[rowc], qq = prob_.qual_[rowc];
+ // Get character from reference
+ int rc = prob_.ref_[colc];
+ assert_range(0, 16, rc);
+ bool xexit = false, yexit = false;
+ // Now that we know what type of move to make, make it, updating our
+ // row and column and moving updating the branch.
+ if(sel == 0) {
+ assert_geq(rowc, 0);
+ assert_geq(colc, 0);
+ TAlScore scd = prob_.sc_->score(qc, rc, qq - 33);
+ if((rc & (1 << qc)) == 0) {
+ // Mismatch
+ size_t id = curid;
+ // Check if the previous branch was the initial (bottommost)
+ // branch with no matches. If so, the mismatch should be added
+ // to the initial branch, instead of starting a new branch.
+ bool empty = (bs_[curid].len_ == 0 && curid == 0);
+ if(!empty) {
+ id = bs_.alloc();
+ }
+ Edit e((int)rowc, mask2dna[rc], "ACGTN"[qc], EDIT_TYPE_MM);
+ assert_lt(scd, 0);
+ TAlScore score_en = bs_[curid].score_st_ + scd;
+ bs_[id].init(
+ prob_,
+ curid, // parent ID
+ -scd, // penalty
+ score_en, // score_en
+ rowc, // row
+ colc, // col
+ e, // edit
+ hefc, // hef
+ empty, // root?
+ false); // don't try to extend with exact matches
+ curid = id;
+ //assert(!local || bs_[curid].score_st_ >= 0);
+ } else {
+ // Match
+ bs_[curid].score_st_ += prob_.sc_->match();
+ bs_[curid].len_++;
+ assert_leq((int64_t)bs_[curid].len_, bs_[curid].row_ + 1);
+ }
+ if(xmod == 0) xexit = true;
+ if(ymod == 0) yexit = true;
+ rowc--; ymod--; ymodTimesNcol -= sq_ncol;
+ colc--; xmod--;
+ assert(local || bs_[curid].score_st_ >= targ_final);
+ hefc = 0;
+ } else if((sel >= 1 && sel <= 2) || (sel >= 5 && sel <= 6)) {
+ assert_gt(colc, 0);
+ // Read gap
+ size_t id = bs_.alloc();
+ Edit e((int)rowc+1, mask2dna[rc], '-', EDIT_TYPE_READ_GAP);
+ TAlScore gapp = prob_.sc_->readGapOpen();
+ if(bs_[curid].len_ == 0 && bs_[curid].e_.inited() && bs_[curid].e_.isReadGap()) {
+ gapp = prob_.sc_->readGapExtend();
+ }
+ //assert(!local || bs_[curid].score_st_ >= gapp);
+ TAlScore score_en = bs_[curid].score_st_ - gapp;
+ bs_[id].init(
+ prob_,
+ curid, // parent ID
+ gapp, // penalty
+ score_en, // score_en
+ rowc, // row
+ colc-1, // col
+ e, // edit
+ hefc, // hef
+ false, // root?
+ false); // don't try to extend with exact matches
+ if(xmod == 0) xexit = true;
+ colc--; xmod--;
+ curid = id;
+ assert( local || bs_[curid].score_st_ >= targ_final);
+ //assert(!local || bs_[curid].score_st_ >= 0);
+ if(sel == 1 || sel == 5) {
+ hefc = 0;
+ } else {
+ hefc = 1;
+ }
+ } else {
+ assert_gt(rowc, 0);
+ // Reference gap
+ size_t id = bs_.alloc();
+ Edit e((int)rowc, '-', "ACGTN"[qc], EDIT_TYPE_REF_GAP);
+ TAlScore gapp = prob_.sc_->refGapOpen();
+ if(bs_[curid].len_ == 0 && bs_[curid].e_.inited() && bs_[curid].e_.isRefGap()) {
+ gapp = prob_.sc_->refGapExtend();
+ }
+ //assert(!local || bs_[curid].score_st_ >= gapp);
+ TAlScore score_en = bs_[curid].score_st_ - gapp;
+ bs_[id].init(
+ prob_,
+ curid, // parent ID
+ gapp, // penalty
+ score_en, // score_en
+ rowc-1, // row
+ colc, // col
+ e, // edit
+ hefc, // hef
+ false, // root?
+ false); // don't try to extend with exact matches
+ if(ymod == 0) yexit = true;
+ rowc--; ymod--; ymodTimesNcol -= sq_ncol;
+ curid = id;
+ assert( local || bs_[curid].score_st_ >= targ_final);
+ //assert(!local || bs_[curid].score_st_ >= 0);
+ if(sel == 3 || sel == 7) {
+ hefc = 0;
+ } else {
+ hefc = 2;
+ }
+ }
+ CHECK_ROW_COL(rowc, colc);
+ CpQuad * cur_new = NULL;
+ if(!xexit && !yexit) {
+ cur_new = sq_.ptr() + ymodTimesNcol + xmod;
+ }
+ // Check whether we made it to the top row or to a cell with score 0
+ if(colc < 0 || rowc < 0 ||
+ (cur_new != NULL && local && cur_new->sc[0] == 0))
+ {
+ done = true;
+ assert(bs_[curid].isSolution(prob_));
+ addSolution(curid);
+#ifndef NDEBUG
+ // A check to see if any two adjacent branches in the backtrace
+ // overlap. If they do, the whole alignment will be filtered out
+ // in trySolution(...)
+ size_t cur = curid;
+ if(!bs_[cur].root_) {
+ size_t next = bs_[cur].parentId_;
+ while(!bs_[next].root_) {
+ assert_neq(cur, next);
+ if(bs_[next].len_ != 0 || bs_[cur].len_ == 0) {
+ assert(!bs_[cur].overlap(prob_, bs_[next]));
+ }
+ cur = next;
+ next = bs_[cur].parentId_;
+ }
+ }
+#endif
+ return;
+ }
+ assert(!xexit || hefc == 0 || hefc == 1);
+ assert(!yexit || hefc == 0 || hefc == 2);
+ if(xexit || yexit) {
+ //assert(rowc < 0 || colc < 0 || prob_.cper_->isCheckpointed(rowc, colc));
+ row_new = rowc; col_new = colc;
+ hef_new = hefc;
+ done = false;
+ if(rowc < 0 || colc < 0) {
+ assert(local);
+ targ_new = 0;
+ } else {
+ // TODO: Don't use scoreSquare
+ targ_new = prob_.cper_->scoreSquare(rowc, colc, hefc);
+ assert(local || targ_new >= targ);
+ assert(local || targ_new >= targ_final);
+ }
+ if(local && targ_new == 0) {
+ assert_eq(0, hefc);
+ done = true;
+ assert(bs_[curid].isSolution(prob_));
+ addSolution(curid);
+ }
+ assert((row_new >= 0 && col_new >= 0) || done);
+ return;
+ }
+ }
+ assert(false);
+}
+
+/**
+ * Set up a branch from score_en, row and col. score_st_ and len_ are derived
+ * here; when 'extend' is true we walk up-left while read and ref chars match.
+ *
+ * If this branch comes after a mismatch, (row, col) describe the cell that the
+ * mismatch occurs in. len_ is initially set to 1, and the next cell we test
+ * is the next cell up and to the left (row-1, col-1).
+ *
+ * If this branch comes after a read gap, (row, col) describe the leftmost cell
+ * involved in the gap. len_ is initially set to 0, and the next cell we test
+ * is the current cell (row, col).
+ *
+ * If this branch comes after a reference gap, (row, col) describe the upper
+ * cell involved in the gap. len_ is initially set to 0, and the next cell we
+ * test is the current cell (row, col).
+ */
+void BtBranch::init(
+ const BtBranchProblem& prob, // supplies sequences, scoring, checkpointer and target score
+ size_t parentId, // id of the parent branch; asserted to be 0 for a root
+ TAlScore penalty, // stored as-is in penalty_
+ TAlScore score_en, // stored in score_en_; score_st_ starts out equal to it
+ int64_t row, // row of the branch's anchor cell (see the cases above)
+ int64_t col, // column of the branch's anchor cell
+ Edit e, // edit this branch follows; tested with inited() before use
+ int hef, // forwarded to scoreTriangle() when pruning against checkpoints
+ bool root, // stored in root_
+ bool extend) // if true, extend the branch along matching cells
+{
+ score_en_ = score_en;
+ penalty_ = penalty;
+ score_st_ = score_en_; // grows by match() for each matching cell found below
+ row_ = row;
+ col_ = col;
+ parentId_ = parentId;
+ e_ = e;
+ root_ = root;
+ assert(!root_ || parentId == 0);
+ assert_lt(row, (int64_t)prob.qrylen_);
+ assert_lt(col, (int64_t)prob.reflen_);
+ // Start testing at the anchor cell; after a mismatch, skip that cell and
+ // start diagonally above and to the left of it
+ int64_t rowc = row;
+ int64_t colc = col;
+ len_ = 0;
+ if(e.inited() && e.isMismatch()) {
+ rowc--; colc--;
+ len_ = 1; // provisional; recomputed from rowc after the loop
+ }
+ int64_t match = prob.sc_->match();
+ bool cp = prob.usecp_;
+ size_t iters = 0; // counts diagonal steps; never read in this function
+ curtailed_ = false;
+ if(extend) {
+ while(rowc >= 0 && colc >= 0) {
+ int rfm = prob.ref_[colc]; // used as a bit mask below
+ assert_range(0, 16, rfm);
+ int rdc = prob.qry_[rowc];
+ bool matches = (rfm & (1 << rdc)) != 0; // is bit 'rdc' set in the mask?
+ if(!matches) {
+ // Characters differ: the extension stops before this cell
+ break;
+ }
+ // Credit the match to the running score
+ score_st_ += match;
+ if(cp && rowc - 1 >= 0 && colc - 1 >= 0 &&
+ prob.cper_->isCheckpointed(rowc - 1, colc - 1))
+ {
+ // Possibly prune
+ int16_t cpsc;
+ cpsc = prob.cper_->scoreTriangle(rowc - 1, colc - 1, hef);
+ if(cpsc + score_st_ < prob.targ_) { // checkpoint score + ours falls short of the target
+ curtailed_ = true;
+ break; // NOTE(review): leaves rowc on this cell, so len_ excludes it although its match was credited
+ }
+ }
+ iters++;
+ rowc--; colc--;
+ }
+ }
+ assert_geq(rowc, -1);
+ assert_geq(colc, -1);
+ len_ = (int64_t)row - rowc; // rows stepped past, counting the skipped mismatch cell if any
+ assert_leq((int64_t)len_, row_+1);
+ assert_leq((int64_t)len_, col_+1);
+ assert_leq((int64_t)score_st_, (int64_t)prob.qrylen_ * match);
+}
+
+/**
+ * Allocate a new branch hanging off 'parentId' and let it extend along exact
+ * matches. A branch that is already a solution is recorded as one; otherwise
+ * it is queued if isValid() accepts it and discarded (popped) if it does not.
+ */
+void BtBranchTracer::examineBranch(
+	int64_t row,
+	int64_t col,
+	const Edit& e,
+	TAlScore pen, // penalty associated with edit
+	TAlScore sc,
+	size_t parentId)
+{
+	const size_t fresh = bs_.alloc();
+	bs_[fresh].init(prob_, parentId, pen, sc, row, col, e, 0, false, true); // hef=0, non-root, extend
+	if(bs_[fresh].isSolution(prob_)) {
+		assert(bs_[fresh].isValid(prob_));
+		addSolution(fresh);
+		return;
+	}
+	// Not a solution: keep the branch only if it passes the validity check
+	if(bs_[fresh].isValid(prob_)) {
+		add(fresh);
+	} else {
+		bs_.pop();
+	}
+}
+
+/**
+ * Take all possible ways of leaving branch 'bid' (read gap, ref gap, mismatch)
+ * and pass each to examineBranch(). Offsets 0..len_ (inclusive) are visited.
+ */
+void BtBranchTracer::addOffshoots(size_t bid) {
+	BtBranch b = bs_[bid]; // by-value snapshot: examineBranch() allocates from bs_, which may relocate it (TODO confirm)
+	TAlScore sc = b.score_en_; // score at the current offset; grows by 'match' per step
+	int64_t match = prob_.sc_->match();
+	int64_t scoreFloor = prob_.sc_->monotone ? MIN_I64 : 0;
+	bool cp = prob_.usecp_; // Are there are any checkpoints?
+	ASSERT_ONLY(TAlScore perfectScore = prob_.sc_->perfectScore(prob_.qrylen_));
+	assert_leq(prob_.targ_, perfectScore);
+	// For each cell in the branch, plus the cell just past its end (i == len_)
+	for(size_t i = 0 ; i <= b.len_; i++) {
+		assert_leq((int64_t)i, b.row_+1);
+		assert_leq((int64_t)i, b.col_+1);
+		int64_t row = b.row_ - i, col = b.col_ - i; // either may be -1 when i == len_
+		int64_t bonusLeft = (row + 1) * match; // most that matching every remaining row could add
+		int64_t fromend = prob_.qrylen_ - row - 1;
+		bool allowGaps = fromend >= prob_.sc_->gapbar && row >= prob_.sc_->gapbar;
+		if(allowGaps && row >= 0 && col >= 0) {
+			if(col > 0) {
+				// Try a read gap - it's either an extension or an open
+				bool extend = b.e_.inited() && b.e_.isReadGap() && i == 0;
+				TAlScore rdgapPen = extend ?
+					prob_.sc_->readGapExtend() : prob_.sc_->readGapOpen();
+				bool prune = false;
+				assert_gt(rdgapPen, 0);
+				if(cp && prob_.cper_->isCheckpointed(row, col - 1)) {
+					// Possibly prune
+					int16_t cpsc = (int16_t)prob_.cper_->scoreTriangle(row, col - 1, 0);
+					assert_leq(cpsc, perfectScore);
+					assert_geq(prob_.sc_->readGapOpen(), prob_.sc_->readGapExtend());
+					TAlScore bonus = prob_.sc_->readGapOpen() - prob_.sc_->readGapExtend();
+					assert_geq(bonus, 0);
+					if(cpsc + bonus + sc - rdgapPen < prob_.targ_) {
+						prune = true;
+					}
+				}
+				if(prune) {
+					if(extend) { nrdexPrune_++; } else { nrdopPrune_++; }
+				} else if(sc - rdgapPen >= scoreFloor && sc - rdgapPen + bonusLeft >= prob_.targ_) {
+					// Yes, we can introduce a read gap here
+					Edit e((int)row + 1, mask2dna[(int)prob_.ref_[col]], '-', EDIT_TYPE_READ_GAP);
+					assert(e.isReadGap());
+					examineBranch(row, col - 1, e, rdgapPen, sc - rdgapPen, bid);
+					if(extend) { nrdex_++; } else { nrdop_++; }
+				}
+			}
+			if(row > 0) {
+				// Try a reference gap - it's either an extension or an open
+				bool extend = b.e_.inited() && b.e_.isRefGap() && i == 0;
+				TAlScore rfgapPen = extend ? // only a gap adjacent to the branch's own ref gap (i == 0) extends it
+					prob_.sc_->refGapExtend() : prob_.sc_->refGapOpen();
+				bool prune = false;
+				assert_gt(rfgapPen, 0);
+				if(cp && prob_.cper_->isCheckpointed(row - 1, col)) {
+					// Possibly prune
+					int16_t cpsc = (int16_t)prob_.cper_->scoreTriangle(row - 1, col, 0);
+					assert_leq(cpsc, perfectScore);
+					assert_geq(prob_.sc_->refGapOpen(), prob_.sc_->refGapExtend());
+					TAlScore bonus = prob_.sc_->refGapOpen() - prob_.sc_->refGapExtend();
+					assert_geq(bonus, 0);
+					if(cpsc + bonus + sc - rfgapPen < prob_.targ_) {
+						prune = true;
+					}
+				}
+				if(prune) {
+					if(extend) { nrfexPrune_++; } else { nrfopPrune_++; }
+				} else if(sc - rfgapPen >= scoreFloor && sc - rfgapPen + bonusLeft >= prob_.targ_) {
+					// Yes, we can introduce a ref gap here
+					Edit e((int)row, '-', "ACGTN"[(int)prob_.qry_[row]], EDIT_TYPE_REF_GAP);
+					assert(e.isRefGap());
+					examineBranch(row - 1, col, e, rfgapPen, sc - rfgapPen, bid);
+					if(extend) { nrfex_++; } else { nrfop_++; }
+				}
+			}
+		}
+		// If we're at the top of the branch but not yet at the top of
+		// the DP table, a mismatch branch is also possible.
+		if(i == b.len_ && !b.curtailed_ && row >= 0 && col >= 0) {
+			int rfm = prob_.ref_[col];
+			assert_lt(row, (int64_t)prob_.qrylen_);
+			int rdc = prob_.qry_[row];
+			int rdq = prob_.qual_[row];
+			int scdiff = prob_.sc_->score(rdc, rfm, rdq - 33);
+			assert_lt(scdiff, 0); // at end of branch, so can't match
+			bool prune = false;
+			if(cp && row > 0 && col > 0 && prob_.cper_->isCheckpointed(row - 1, col - 1)) {
+				// Possibly prune
+				int16_t cpsc = prob_.cper_->scoreTriangle(row - 1, col - 1, 0);
+				assert_leq(cpsc, perfectScore);
+				assert_leq(cpsc + scdiff + sc, perfectScore);
+				if(cpsc + scdiff + sc < prob_.targ_) {
+					prune = true;
+				}
+			}
+			if(prune) {
+				nmm_++; // NOTE(review): the gap cases count prunes in dedicated *Prune_ counters; confirm nmm_ is intended here
+			} else {
+				// Yes, we can introduce a mismatch here
+				if(sc + scdiff >= scoreFloor && sc + scdiff + bonusLeft >= prob_.targ_) {
+					Edit e((int)row, mask2dna[rfm], "ACGTN"[rdc], EDIT_TYPE_MM);
+					bool nmm = (mask2dna[rfm] == 'N' || rdc > 4);
+					assert_neq(e.chr, e.qchr);
+					assert_lt(scdiff, 0);
+					examineBranch(row - 1, col - 1, e, -scdiff, sc + scdiff, bid);
+					if(nmm) { nnmm_++; } else { nmm_++; }
+				}
+			}
+		}
+		sc += match;
+	}
+}
+
+/**
+ * Sort the unsorted branches and merge them with the unconsumed part of the sorted list.
+ */
+void BtBranchTracer::flushUnsorted() {
+ if(unsorted_.empty()) {
+ return; // nothing pending
+ }
+ unsorted_.sort();
+ unsorted_.reverse(); // flip the sorted order
+#ifndef NDEBUG
+ for(size_t i = 1; i < unsorted_.size(); i++) {
+ assert_leq(bs_[unsorted_[i].second].score_st_, bs_[unsorted_[i-1].second].score_st_); // score_st_ must be non-increasing
+ }
+#endif
+ EList *src2 = sortedSel_ ? &sorted1_ : &sorted2_; // list filled by the previous flush
+ EList *dest = sortedSel_ ? &sorted2_ : &sorted1_; // the other list receives the merge
+ // Merge unsorted_ and the part of src2 from cur_ onward into dest
+ dest->clear();
+ size_t cur1 = 0, cur2 = cur_; // cur1 walks unsorted_, cur2 walks src2
+ while(cur1 < unsorted_.size() || cur2 < src2->size()) {
+ // Take from 1 or 2 next?
+ bool take1 = true;
+ if(cur1 == unsorted_.size()) {
+ take1 = false; // unsorted_ exhausted
+ } else if(cur2 == src2->size()) {
+ take1 = true; // src2 exhausted
+ } else {
+ assert_neq(unsorted_[cur1].second, (*src2)[cur2]);
+ take1 = bs_[unsorted_[cur1].second] < bs_[(*src2)[cur2]]; // BtBranch comparison; src2 wins when not less
+ }
+ if(take1) {
+ dest->push_back(unsorted_[cur1++].second); // Take from list 1
+ } else {
+ dest->push_back((*src2)[cur2++]); // Take from list 2
+ }
+ }
+ assert_eq(cur1, unsorted_.size());
+ assert_eq(cur2, src2->size());
+ sortedSel_ = !sortedSel_; // dest becomes the source for the next flush
+ cur_ = 0; // dest holds only unconsumed entries, so restart at its front
+ unsorted_.clear();
+}
+
+/**
+ * Try the accumulated solutions in order and stop at the first one that
+ * trySolution() accepts (BT_FOUND). Returns false if there was nothing to
+ * try; otherwise returns true and reports the outcome through 'success'.
+ */
+bool BtBranchTracer::trySolutions(
+	bool lookForOlap,
+	SwResult& res,
+	size_t& off,
+	size_t& nrej,
+	RandomSource& rnd,
+	bool& success)
+{
+	if(solutions_.size() == 0) {
+		return false; // there were no solutions to check
+	}
+	for(size_t si = 0; si < solutions_.size(); si++) {
+		const int outcome = trySolution(solutions_[si], lookForOlap, res, off, nrej, rnd);
+		if(outcome == BT_FOUND) {
+			success = true;
+			return true; // there were solutions and one was good
+		}
+	}
+	success = false;
+	solutions_.clear(); // every candidate was rejected; drop them all
+	return true; // there were solutions but none were good
+}
+
+/**
+ * Given the id of a branch that completes a successful backtrace, walk the chain of
+ * branches up to the root, collect its edits into 'res' and return a BT_* status.
+ */
+int BtBranchTracer::trySolution(
+ size_t id, // branch that ended the backtrace
+ bool lookForOlap, // check against, and record into, seenPaths_?
+ SwResult& res, // out: edits, score and shape; reset() on rejection
+ size_t& off, // out: set to br->leftmostCol() on success
+ size_t& nrej, // incremented once for each rejection
+ RandomSource& rnd) // not referenced in this function
+{
+ AlnScore score;
+ BtBranch *br = &bs_[id];
+ // 'br' corresponds to the leftmost edit in a right-to-left
+ // chain of edits.
+ EList& ned = res.alres.ned();
+ const BtBranch *cur = br, *prev = NULL;
+ size_t ns = 0, nrefns = 0; // mismatches involving an 'N'; edits whose chr is 'N'
+ size_t ngap = 0; // gap edits
+ while(true) { // pass 1: from 'br' to the root, gathering edits and counts
+ if(cur->e_.inited()) {
+ if(cur->e_.isMismatch()) {
+ if(cur->e_.qchr == 'N' || cur->e_.chr == 'N') {
+ ns++;
+ }
+ } else if(cur->e_.isGap()) {
+ ngap++;
+ }
+ if(cur->e_.chr == 'N') {
+ nrefns++;
+ }
+ ned.push_back(cur->e_);
+ }
+ if(cur->root_) {
+ break;
+ }
+ cur = &bs_[cur->parentId_];
+ }
+ if(ns > prob_.nceil_) {
+ // Alignment has too many Ns in it!
+ res.reset();
+ assert(res.alres.ned().empty());
+ nrej++;
+ return BT_REJECTED_N;
+ }
+ // Update 'seenPaths_'
+ cur = br;
+ bool rejSeen = false; // becomes true if we overlap a previously seen path
+ bool rejCore = true; // cleared once any branch lies on a core diagonal
+ while(true) { // pass 2: same walk, checking the core diagonal and overlaps
+ // Consider row, col, len, then do something
+ int64_t row = cur->row_, col = cur->col_;
+ assert_lt(row, (int64_t)prob_.qrylen_);
+ size_t fromend = prob_.qrylen_ - row - 1;
+ size_t diag = fromend + col; // index into seenPaths_
+ // Calculate the diagonal within the *trimmed* rectangle,
+ // i.e. the rectangle we dealt with in align, gather and
+ // backtrack.
+ int64_t diagi = col - row;
+ // Now adjust to the diagonal within the *untrimmed*
+ // rectangle by adding on the amount trimmed from the left.
+ diagi += prob_.rect_->triml;
+ assert_lt(diag, seenPaths_.size());
+ // Does it overlap a core diagonal?
+ if(diagi >= 0) {
+ size_t diag = (size_t)diagi; // shadows the outer 'diag' inside this block only
+ if(diag >= prob_.rect_->corel &&
+ diag <= prob_.rect_->corer)
+ {
+ // Yes it does - it's OK
+ rejCore = false;
+ }
+ }
+ if(lookForOlap) {
+ int64_t newlo, newhi; // half-open row interval [newlo, newhi) covered by this branch
+ if(cur->len_ == 0) {
+ if(prev != NULL && prev->len_ > 0) { // 'prev' is the branch visited just before 'cur' in this walk
+ // If there's a gap at the base of a non-0 length branch, the
+ // gap will appear to overlap the branch if we give it length 1.
+ newhi = newlo = 0;
+ } else {
+ // Read or ref gap with no matches coming off of it
+ newlo = row;
+ newhi = row + 1;
+ }
+ } else {
+ // Diagonal with matches
+ newlo = row - (cur->len_ - 1);
+ newhi = row + 1;
+ }
+ assert_geq(newlo, 0);
+ assert_geq(newhi, 0);
+ // Does the diagonal cover cells?
+ if(newhi > newlo) {
+ // Check whether there is any overlap with previously traversed
+ // cells
+ bool added = false; // was the new interval merged into an existing one?
+ const size_t sz = seenPaths_[diag].size();
+ for(size_t i = 0; i < sz; i++) {
+ // Does the new interval overlap this already-seen
+ // interval? Also of interest: does it abut this
+ // already-seen interval? If so, we should merge them.
+ size_t lo = seenPaths_[diag][i].first;
+ size_t hi = seenPaths_[diag][i].second;
+ assert_lt(lo, hi);
+ size_t lo_sm = newlo, hi_sm = newhi;
+ if(hi - lo < hi_sm - lo_sm) { // make (lo, hi) the longer of the two intervals
+ swap(lo, lo_sm);
+ swap(hi, hi_sm);
+ }
+ if((lo <= lo_sm && hi > lo_sm) ||
+ (lo < hi_sm && hi >= hi_sm))
+ {
+ // One or both of the shorter interval's end points
+ // are contained in the longer interval - so they
+ // overlap.
+ rejSeen = true;
+ // Merge them into one longer interval
+ seenPaths_[diag][i].first = min(lo, lo_sm);
+ seenPaths_[diag][i].second = max(hi, hi_sm);
+#ifndef NDEBUG
+ for(int64_t ii = seenPaths_[diag][i].first;
+ ii < (int64_t)seenPaths_[diag][i].second;
+ ii++)
+ {
+ //cerr << "trySolution rejected (" << ii << ", " << (ii + col - row) << ")" << endl;
+ }
+#endif
+ added = true;
+ break;
+ } else if(hi == lo_sm || lo == hi_sm) { // the intervals abut without overlapping
+ // Merge them into one longer interval
+ seenPaths_[diag][i].first = min(lo, lo_sm);
+ seenPaths_[diag][i].second = max(hi, hi_sm);
+#ifndef NDEBUG
+ for(int64_t ii = seenPaths_[diag][i].first;
+ ii < (int64_t)seenPaths_[diag][i].second;
+ ii++)
+ {
+ //cerr << "trySolution rejected (" << ii << ", " << (ii + col - row) << ")" << endl;
+ }
+#endif
+ added = true;
+ // Keep going in case it overlaps one of the other
+ // intervals
+ }
+ }
+ if(!added) {
+ seenPaths_[diag].push_back(make_pair(newlo, newhi));
+ }
+ }
+ }
+ // After the merging that may have occurred above, it's no
+ // longer guaranteed that all the overlapping intervals in
+ // the list have been merged. That's OK though. We'll
+ // still get correct answers to overlap queries.
+ if(cur->root_) {
+ assert_eq(0, cur->parentId_);
+ break;
+ }
+ prev = cur;
+ cur = &bs_[cur->parentId_];
+ } // while(true)
+ if(rejSeen) { // note: seenPaths_ keeps the intervals recorded above even though we reject
+ res.reset();
+ assert(res.alres.ned().empty());
+ nrej++;
+ return BT_NOT_FOUND;
+ }
+ if(rejCore) {
+ res.reset();
+ assert(res.alres.ned().empty());
+ nrej++;
+ return BT_REJECTED_CORE_DIAG;
+ }
+ off = br->leftmostCol();
+ score.score_ = prob_.targ_; // the target score is reported; nothing is recomputed here
+ score.ns_ = ns;
+ score.gaps_ = ngap;
+ res.alres.setScore(score);
+ res.alres.setRefNs(nrefns);
+ size_t trimBeg = br->uppermostRow(); // read rows above the alignment
+ size_t trimEnd = prob_.qrylen_ - prob_.row_ - 1; // read rows below the starting row
+ assert_leq(trimBeg, prob_.qrylen_);
+ assert_leq(trimEnd, prob_.qrylen_);
+ TRefOff refoff = off + prob_.refoff_ + prob_.rect_->refl;
+ res.alres.setShape(
+ prob_.refid_, // ref id
+ refoff, // 0-based ref offset
+ prob_.treflen(), // ref length
+ prob_.fw_, // aligned to Watson?
+ prob_.qrylen_, // read length
+ 0, // read id
+ true, // pretrim soft?
+ 0, // pretrim 5' end
+ 0, // pretrim 3' end
+ true, // alignment trim soft?
+ prob_.fw_ ? trimBeg : trimEnd, // alignment trim 5' end
+ prob_.fw_ ? trimEnd : trimBeg); // alignment trim 3' end
+ return BT_FOUND;
+}
+
+/**
+ * Get the next valid alignment by searching the space of backtraces. Keeps
+ * expanding the best queued branch until solutions turn up (they are tried and
+ * the verdict returned), the queue empties, or 'maxiter' is hit. Can be slow.
+ */
+bool BtBranchTracer::nextAlignmentBacktrace(
+	size_t maxiter,
+	SwResult& res,
+	size_t& off,
+	size_t& nrej,
+	size_t& niter,
+	RandomSource& rnd)
+{
+	assert(!empty() || !emptySolution());
+	assert(prob_.inited());
+	// Subtlety in local-alignment mode: a backtrace that starts at the
+	// highest-scoring cell of the table can always be completed without the
+	// running score dipping below 0. Starting from any other cell it may
+	// dip below 0, which implies that a shorter local alignment with a
+	// better score exists. It is therefore fair to abandon any path that
+	// drops below the floor, and as a consequence the queue can run empty
+	// before we finish.
+	bool verdict = false;
+	for(niter = 0; !empty(); ) {
+		// Any solutions waiting? If so, their verdict is our answer.
+		if(trySolutions(true, res, off, nrej, rnd, verdict)) {
+			return verdict;
+		}
+		// Budget check; niter is bumped even on the round that gives up
+		const bool spent = (niter >= maxiter);
+		niter++;
+		if(spent) {
+			break;
+		}
+		size_t brid = best(rnd); // id of the best queued branch
+		assert(!seen_.contains(brid));
+		ASSERT_ONLY(seen_.insert(brid));
+#if 0
+		BtBranch *br = &bs_[brid];
+		cerr << brid
+		     << ": targ:" << prob_.targ_
+		     << ", sc:" << br->score_st_
+		     << ", row:" << br->uppermostRow()
+		     << ", nmm:" << nmm_
+		     << ", nnmm:" << nnmm_
+		     << ", nrdop:" << nrdop_
+		     << ", nrfop:" << nrfop_
+		     << ", nrdex:" << nrdex_
+		     << ", nrfex:" << nrfex_
+		     << ", nrdop_pr: " << nrdopPrune_
+		     << ", nrfop_pr: " << nrfopPrune_
+		     << ", nrdex_pr: " << nrdexPrune_
+		     << ", nrfex_pr: " << nrfexPrune_
+		     << endl;
+#endif
+		addOffshoots(brid);
+	}
+	// Queue drained or budget spent: one last look at pending solutions
+	const bool tried = trySolutions(true, res, off, nrej, rnd, verdict);
+	return tried && verdict;
+}
+
+/**
+ * Fill-mode way of getting the next valid alignment: try the solutions that
+ * have already been accumulated, with overlap checking off. Returns true iff
+ * one was accepted. 'maxiter' and 'niter' are accepted but not touched here.
+ */
+bool BtBranchTracer::nextAlignmentFill(
+	size_t maxiter,
+	SwResult& res,
+	size_t& off,
+	size_t& nrej,
+	size_t& niter,
+	RandomSource& rnd)
+{
+	assert(prob_.inited());
+	assert(!emptySolution());
+	bool accepted = false;
+	// lookForOlap is passed as false on this path
+	const bool tried = trySolutions(false, res, off, nrej, rnd, accepted);
+	// If nothing was tried, 'accepted' still holds its initial false
+	return tried && accepted;
+}
+
+/**
+ * Get the next valid alignment given the backtrace problem, delegating to
+ * nextAlignmentFill() when prob_.fill_ is set and to nextAlignmentBacktrace() otherwise.
+ */
+bool BtBranchTracer::nextAlignment(
+	size_t maxiter,
+	SwResult& res,
+	size_t& off,
+	size_t& nrej,
+	size_t& niter,
+	RandomSource& rnd)
+{
+	// All six arguments are forwarded untouched; only the strategy
+	// differs between the two delegates.
+	if(!prob_.fill_) {
+		// prob_.fill_ unset: search the space of backtraces
+		return nextAlignmentBacktrace(
+			maxiter, res,
+			off, nrej,
+			niter, rnd);
+	}
+	// prob_.fill_ set: try the solutions gathered so far
+	return nextAlignmentFill(
+		maxiter,
+		res,
+		off,
+		nrej,
+		niter,
+		rnd);
+}
+
+#ifdef MAIN_ALIGNER_BT
+// Stand-alone driver, built only when MAIN_ALIGNER_BT is defined: traces one exact 8-mer alignment
+#include <iostream>
+
+int main(int argc, char **argv) {
+	size_t off = 0;
+	RandomSource rnd(77);
+	BtBranchTracer tr;
+	Scoring sc = Scoring::base1();
+	SwResult res;
+	tr.init( // NOTE(review): init() is declared outside this chunk; confirm these arguments still match it
+		"ACGTACGT", // in: read sequence
+		"IIIIIIII", // in: quality sequence
+		8,          // in: read sequence length
+		"ACGTACGT", // in: reference sequence
+		8,          // in: reference sequence length
+		0,          // in: reference id
+		0,          // in: reference offset
+		true,       // in: orientation
+		sc,         // in: scoring scheme
+		0,          // in: N ceiling
+		8,          // in: alignment score
+		7,          // start in this row
+		7,          // start in this column
+		rnd);       // random gen, to choose among equal paths
+	size_t nrej = 0, niter = 0;
+	tr.nextAlignment(
+		1000,        // maxiter: iteration budget, ample for an 8x8 problem
+		res, off,    // out: result and leftmost column
+		nrej, niter, // out: rejection and iteration counts
+		rnd);
+}
+
+#endif /*def MAIN_ALIGNER_BT*/
diff --git a/aligner_bt.h b/aligner_bt.h
new file mode 100644
index 0000000..8056b7a
--- /dev/null
+++ b/aligner_bt.h
@@ -0,0 +1,947 @@
+/*
+ * Copyright 2011, Ben Langmead
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef ALIGNER_BT_H_
+#define ALIGNER_BT_H_
+
+#include
+#include
+#include "aligner_sw_common.h"
+#include "aligner_result.h"
+#include "scoring.h"
+#include "edit.h"
+#include "limit.h"
+#include "dp_framer.h"
+#include "sse_util.h"
+
+/* Say we've filled in a DP matrix in a cost-only manner, not saving the scores
+ * for each of the cells. At the end, we obtain a list of candidate cells and
+ * we'd like to backtrace from them. The per-cell scores are gone, but we have
+ * to re-create the correct path somehow. Hopefully we can do this without
+ * recreating most or all of the score matrix, since this takes too much memory.
+ *
+ * Approach 1: Naively refill the matrix.
+ *
+ * Just refill the matrix, perhaps backwards starting from the backtrace cell.
+ * Since this involves recreating all or most of the score matrix, this is not
+ * a good approach.
+ *
+ * Approach 2: Naive backtracking.
+ *
+ * Conduct a search through the space of possible backtraces, rooted at the
+ * candidate cell. To speed things along, we can prioritize paths that have a
+ * high score and that align more characters from the read.
+ *
+ * The approach is simple, but it's neither fast nor memory-efficient in
+ * general.
+ *
+ * Approach 3: Refilling with checkpoints.
+ *
+ * Refill the matrix "backwards" starting from the candidate cell, but use
+ * checkpoints to ensure that only a series of relatively small triangles or
+ * rectangles need to be refilled. The checkpoints must include elements from
+ * the H, E and F matrices; not just H. After each refill, we backtrace
+ * through the refilled area, then discard/reuse the fill memory. I call each
+ * such fill/backtrace a mini-fill/backtrace.
+ *
+ * If there's only one path to be found, then this is O(m+n). But what if
+ * there are many? And what if we would like to avoid paths that overlap in
+ * one or more cells? There are two ways we can make this more efficient:
+ *
+ * 1. Remember the re-calculated E/F/H values and try to retrieve them
+ * 2. Keep a record of cells that have already been traversed
+ *
+ * Legend:
+ *
+ * 1: Candidate cell
+ * 2: Final cell from first mini-fill/backtrace
+ * 3: Final cell from second mini-fill/backtrace (third not shown)
+ * +: Checkpointed cell
+ * *: Cell filled from first or second mini-fill/backtrace
+ * -: Unfilled cell
+ *
+ * ---++--------++--------++----
+ * --++--------++*-------++-----
+ * -++--(etc)-++**------++------
+ * ++--------+3***-----++-------
+ * +--------++****----++--------
+ * --------++*****---++--------+
+ * -------++******--++--------++
+ * ------++*******-++*-------++-
+ * -----++********++**------++--
+ * ----++********2+***-----++---
+ * ---++--------++****----++----
+ * --++--------++*****---++-----
+ * -++--------++*****1--++------
+ * ++--------++--------++-------
+ *
+ * Approach 4: Backtracking with checkpoints.
+ *
+ * Conduct a search through the space of possible backtraces, rooted at the
+ * candidate cell. Use "checkpoints" to prune. That is, when a backtrace
+ * moves through a cell with a checkpointed score, consider the score
+ * accumulated so far and the cell's saved score; abort if those two scores
+ * add to something less than a valid score. Note we're only checkpointing H
+ * in this case (possibly; see "subtle point"), not E or F.
+ *
+ * Subtle point: checkpoint scores are a result of moving forward through
+ * the matrix whereas backtracking scores result from moving backward. This
+ * matters because the two paths that meet up at a cell might have both
+ * factored in a gap open penalty for the same gap, in which case we will
+ * underestimate the overall score and prune a good path. Here are two ideas
+ * for how to resolve this:
+ *
+ * Idea 1: when we combine the forward and backward scores to find an overall
+ * score, and our backtrack procedure *just* made a horizontal or vertical
+ * move, add in a "bonus" equal to the gap open penalty of the appropriate
+ * type (read gap open for horizontal, ref gap open for vertical). This might
+ * overcompensate, since the bonus is added even when the gap was not double-counted.
+ *
+ * Idea 2: keep the E and F values for the checkpoints around, in addition to
+ * the H values. When it comes time to combine the score from the forward
+ * and backward paths, we consider the last move we made in the backward
+ * backtrace. If it's a read gap (horizontal move), then we calculate the
+ * overall score as:
+ *
+ * max(Score-backward + H-forward, Score-backward + E-forward + read-open)
+ *
+ * If it's a reference gap (vertical move), then we calculate the overall
+ * score as:
+ *
+ * max(Score-backward + H-forward, Score-backward + F-forward + ref-open)
+ *
+ * What does it mean to abort a backtrack? If we're starting a new branch
+ * and there is a checkpoint in the bottommost cell of the branch, and the
+ * overall score is less than the target, then we can simply ignore the
+ * branch. If the checkpoint occurs in the middle of a string of matches, we
+ * need to curtail the branch such that it doesn't include the checkpointed
+ * cell and we won't ever try to enter the checkpointed cell, e.g., on a
+ * mismatch.
+ *
+ * Approaches 3 and 4 seem reasonable, and could be combined. For simplicity,
+ * we implement only approach 4 for now.
+ *
+ * Checkpoint information is propagated from the fill process to the backtracer
+ * via a Checkpointer object (see BtBranchProblem::cper_).
+ */
+
+enum { // outcome codes for one backtrace attempt (presumably what BtBranchTracer::trySolution() returns - TODO confirm)
+ BT_NOT_FOUND = 1, // could not obtain the backtrace because it
+ // overlapped a previous solution
+ BT_FOUND, // obtained a valid backtrace
+ BT_REJECTED_N, // backtrace rejected because it had too many Ns
+ BT_REJECTED_CORE_DIAG // backtrace rejected because it failed to overlap a
+ // core diagonal
+};
+
+/**
+ * Parameters for a matrix of potential backtrace problems to solve.
+ * Encapsulates information about:
+ *
+ * The problem given a particular reference substring:
+ *
+ * - The query string (nucleotides and qualities)
+ * - The reference substring (incl. orientation, offset into overall sequence)
+ * - Checkpoints (i.e. values of matrix cells)
+ * - Scoring scheme and other thresholds
+ *
+ * The problem given a particular reference substring AND a particular row and
+ * column from which to backtrace:
+ *
+ * - The row and column
+ * - The target score
+ */
+class BtBranchProblem {
+
+public:
+
+ /**
+ * Create new uninitialized problem.
+ */
+ BtBranchProblem() { reset(); } // reset() nulls qry_, so inited() stays false until initRef() is called
+
+ /**
+ * Initialize a new problem.
+ */
+ void initRef(
+     const char *qry, // query string (along rows)
+     const char *qual, // query quality string (along rows)
+     size_t qrylen, // query string (along rows) length
+     const char *ref, // reference string (along columns)
+     TRefOff reflen, // in-rectangle reference string length
+     TRefOff treflen,// total reference string length
+     TRefId refid, // reference id
+     TRefOff refoff, // reference offset
+     bool fw, // orientation of problem
+     const DPRect* rect, // dynamic programming rectangle filled out
+     const Checkpointer* cper, // checkpointer
+     const Scoring *sc, // scoring scheme
+     size_t nceil) // max # Ns allowed in alignment
+ {
+     // Query side (rows)
+     qry_ = qry; qual_ = qual; qrylen_ = qrylen;
+     // Reference side (columns)
+     ref_ = ref; reflen_ = reflen; treflen_ = treflen;
+     refid_ = refid; refoff_ = refoff;
+     fw_ = fw;
+     // Helpers are stored as raw pointers; this class never frees them
+     rect_ = rect;
+     cper_ = cper;
+     sc_ = sc;
+     // Ceiling on the number of Ns in an alignment
+     nceil_ = nceil;
+ }
+
+ /**
+ * Initialize a new problem.
+ */
+ void initBt(
+     size_t row, // row to start the backtrace from
+     size_t col, // column to start the backtrace from
+     bool fill, // use a filling rather than a backtracking strategy
+     bool usecp, // use checkpoints to short-circuit while backtracking
+     TAlScore targ) // target score
+ {
+     // The fill strategy is only accepted together with checkpointing;
+     // checked up front, before any state is overwritten.
+     assert(!fill || usecp);
+     row_ = row;
+     col_ = col;
+     targ_ = targ;
+     fill_ = fill;
+     usecp_ = usecp;
+ }
+
+ /**
+ * Reset to uninitialized state.
+ */
+ void reset() {
+     // Null every stored pointer; inited() keys off qry_
+     qry_ = NULL; qual_ = NULL; ref_ = NULL;
+     rect_ = NULL; cper_ = NULL; sc_ = NULL;
+     qrylen_ = 0; row_ = 0; col_ = 0; nceil_ = 0; targ_ = 0;
+     reflen_ = 0; treflen_ = 0; refid_ = 0; refoff_ = 0;
+     fill_ = false; fw_ = false; usecp_ = false;
+ }
+
+ /**
+ * Return true iff the BtBranchProblem has been initialized.
+ */
+ bool inited() const {
+     return !(qry_ == NULL); // the query pointer doubles as the "initialized" flag
+ }
+
+#ifndef NDEBUG
+ /**
+ * Sanity-check the problem.
+ */
+ bool repOk() const { // always returns true; violations surface as assertion failures
+ assert_gt(qrylen_, 0);
+ assert_gt(reflen_, 0);
+ assert_gt(treflen_, 0);
+ assert_lt(row_, qrylen_); // start row must lie inside the query
+ assert_lt((TRefOff)col_, reflen_); // start column must lie inside the in-rectangle reference
+ return true;
+ }
+#endif
+
+ size_t reflen() const { return reflen_; } // in-rectangle reference length (stored as TRefOff, returned as size_t)
+ size_t treflen() const { return treflen_; } // total reference length (stored as TRefOff, returned as size_t)
+
+protected:
+
+ const char *qry_; // query string (along rows)
+ const char *qual_; // query quality string (along rows)
+ size_t qrylen_; // query string (along rows) length
+ const char *ref_; // reference string (along columns)
+ TRefOff reflen_; // in-rectangle reference string length
+ TRefOff treflen_;// total reference string length
+ TRefId refid_; // reference id
+ TRefOff refoff_; // reference offset
+ bool fw_; // orientation of problem
+ const DPRect* rect_; // dynamic programming rectangle filled out
+ size_t row_; // starting row
+ size_t col_; // starting column
+ TAlScore targ_; // target score
+ const Checkpointer *cper_; // checkpointer
+ bool fill_; // use mini-fills
+ bool usecp_; // use checkpointing?
+ const Scoring *sc_; // scoring scheme
+ size_t nceil_; // max # Ns allowed in alignment
+
+ friend class BtBranch;
+ friend class BtBranchQ;
+ friend class BtBranchTracer;
+};
+
+/**
+ * Encapsulates a "branch" which is a diagonal of cells (possibly of length 0)
+ * in the matrix where all the cells are matches. These stretches are linked
+ * together by edits to form a full backtrace path through the matrix. Lengths
+ * are measured w/r/t the number of rows traversed by the path, so a branch
+ * that represents a read gap extension could have length = 0.
+ *
+ * At the end of the day, the full backtrace path is represented as a list of
+ * BtBranch's where each BtBranch represents a stretch of matching cells (and
+ * up to one mismatching cell at its bottom extreme) ending in an edit (or in
+ * the bottommost row, in which case the edit is uninitialized). Each
+ * BtBranch's row and col fields indicate the bottommost cell involved in the
+ * diagonal stretch of matches, and the len_ field indicates the length of the
+ * stretch of matches. Note that the edits themselves also correspond to
+ * movement through the matrix.
+ *
+ * A related issue is how we record which cells have been visited so that we
+ * never report a pair of paths both traversing the same (row, col) of the
+ * overall DP matrix. This gets a little tricky because we have to take into
+ * account the cells covered by *edits* in addition to the cells covered by the
+ * stretches of matches. For instance: imagine a mismatch. That takes up a
+ * cell of the DP matrix, but it may or may not be preceded by a string of
+ * matches. It's hard to imagine how to represent this unless we let the
+ * mismatch "count toward" the len_ of the branch and let (row, col) refer to
+ * the cell where the mismatch occurs.
+ *
+ * We need BtBranches to "live forever" so that we can make some BtBranches
+ * parents of others using parent pointers. For this reason, BtBranch's are
+ * stored in an EFactory object in the BtBranchTracer class.
+ */
+class BtBranch {
+
+public:
+
+ BtBranch() { reset(); } // default-construct in the reset() state
+
+ BtBranch( // convenience constructor: forwards every argument, in order, to init()
+ const BtBranchProblem& prob,
+ size_t parentId,
+ TAlScore penalty,
+ TAlScore score_en,
+ int64_t row,
+ int64_t col,
+ Edit e,
+ int hef,
+ bool root,
+ bool extend)
+ {
+ init(prob, parentId, penalty, score_en, row, col, e, hef, root, extend);
+ }
+
+ /**
+ * Reset to uninitialized state.
+ */
+ void reset() {
+     parentId_ = 0; len_ = 0; row_ = col_ = 0;
+     penalty_ = score_st_ = score_en_ = 0; // penalty_ used to be skipped and stayed indeterminate
+     root_ = curtailed_ = false; // root_ used to be skipped too; repOk() reads it
+     e_.reset();
+ }
+
+ /**
+ * Caller gives us score_en, row and col. We figure out score_st and len_
+ * by comparing characters from the strings.
+ */
+ void init(
+ const BtBranchProblem& prob,
+ size_t parentId,
+ TAlScore penalty,
+ TAlScore score_en,
+ int64_t row,
+ int64_t col,
+ Edit e,
+ int hef,
+ bool root,
+ bool extend);
+
+ /**
+ * Return true iff this branch ends in a solution to the backtrace problem.
+ */
+ bool isSolution(const BtBranchProblem& prob) const {
+     const bool needTopRow = prob.sc_->monotone; // monotone scoring is treated as end-to-end
+     return (score_st_ == prob.targ_) && (needTopRow ? endsInFirstRow() : true);
+ }
+
+ /**
+ * Return true iff this branch could potentially lead to a valid alignment.
+ */
+ bool isValid(const BtBranchProblem& prob) const {
+     // Non-monotone scoring has a floor of 0; monotone scoring effectively
+     // has none (MIN_I64).
+     const int64_t scoreFloor = prob.sc_->monotone ? MIN_I64 : 0;
+     if(score_st_ < scoreFloor) {
+         return false;
+     }
+     if(isSolution(prob)) {
+         return true;
+     }
+     const bool reachedTop = (int64_t)len_ > row_;
+     if(reachedTop) {
+         // No rows remain above the branch, so the score cannot change
+         return score_st_ == prob.targ_;
+     }
+     // Best case: every row still above the branch scores a match
+     const int64_t perMatch = prob.sc_->match();
+     const int64_t bonusLeft = (row_ + 1 - len_) * perMatch;
+     return prob.targ_ <= score_st_ + bonusLeft;
+ }
+
+ /**
+ * Return true iff this branch overlaps with the given branch.
+ */
+ bool overlap(const BtBranchProblem& prob, const BtBranch& bt) const {
+ // Calculate this branch's diagonal
+ assert_lt(row_, (int64_t)prob.qrylen_);
+ size_t fromend = prob.qrylen_ - row_ - 1; // rows between this cell and the last query row
+ size_t diag = fromend + col_; // diagonal id: equal for cells on the same diagonal
+ int64_t lo = 0, hi = row_ + 1; // half-open row interval [lo, hi) covered by this branch
+ if(len_ == 0) {
+ lo = row_; // a zero-length branch still occupies its own row
+ } else {
+ lo = row_ - (len_ - 1);
+ }
+ // Calculate other branch's diagonal
+ assert_lt(bt.row_, (int64_t)prob.qrylen_);
+ size_t ofromend = prob.qrylen_ - bt.row_ - 1;
+ size_t odiag = ofromend + bt.col_;
+ if(diag != odiag) {
+ return false; // different diagonals never share a cell
+ }
+ int64_t olo = 0, ohi = bt.row_ + 1; // same half-open interval, for the other branch
+ if(bt.len_ == 0) {
+ olo = bt.row_;
+ } else {
+ olo = bt.row_ - (bt.len_ - 1);
+ }
+ int64_t losm = olo, hism = ohi;
+ if(hi - lo < ohi - olo) { // make [lo, hi) the longer interval and [losm, hism) the shorter
+ swap(lo, losm);
+ swap(hi, hism);
+ }
+ if((lo <= losm && hi > losm) || (lo < hism && hi >= hism)) { // longer interval holds the shorter one's first or last row
+ return true;
+ }
+ return false;
+ }
+
+ /**
+ * Return true iff this branch is higher priority than the branch 'o'.
+ */
+ bool operator<(const BtBranch& o) const {
+     // Keys, most significant first: uppermost row (smaller wins), start
+     // score (larger wins), row (smaller), column (larger), parent id (larger)
+     const size_t top = uppermostRow(), otop = o.uppermostRow();
+     if(top != otop) { return top < otop; }
+     if(score_st_ != o.score_st_) { return o.score_st_ < score_st_; }
+     if(row_ != o.row_) { return row_ < o.row_; }
+     if(col_ != o.col_) { return o.col_ < col_; }
+     if(parentId_ != o.parentId_) { return o.parentId_ < parentId_; }
+     assert(false); // a tie on every key is treated as a bug
+     return false;
+ }
+
+ /**
+ * Return true iff the topmost cell involved in this branch is in the top
+ * row.
+ */
+ bool endsInFirstRow() const {
+     assert_leq((int64_t)len_, row_ + 1);
+     return row_ + 1 == (int64_t)len_; // the branch then spans rows 0..row_
+ }
+
+ /**
+ * Return the uppermost row covered by this branch.
+ */
+ size_t uppermostRow() const {
+     const int64_t nrows = (int64_t)len_; assert_geq(row_ + 1, nrows);
+     return row_ + 1 - nrows; // bottom row minus the rows covered, plus one
+ }
+
+ /**
+ * Return the leftmost column covered by this branch.
+ */
+ size_t leftmostCol() const {
+     const int64_t ncols = (int64_t)len_; assert_geq(col_ + 1, ncols);
+     return col_ + 1 - ncols; // rightmost column minus the columns covered, plus one
+ }
+
+#ifndef NDEBUG
+ /**
+ * Sanity-check this BtBranch.
+ */
+ bool repOk() const { // always returns true; violations trip the assertions below
+ assert(root_ || e_.inited()); // only the root branch may lack an edit
+ assert_gt(len_, 0); // NOTE(review): the class comment allows length-0 branches - confirm which is intended
+ assert_geq(col_ + 1, (int64_t)len_); // branch must not extend left of column 0
+ assert_geq(row_ + 1, (int64_t)len_); // branch must not extend above row 0
+ return true;
+ }
+#endif
+
+protected:
+
+ // ID of the parent branch.
+ size_t parentId_;
+
+ // Penalty associated with the edit at the bottom of this branch (0 if
+ // there is no edit)
+ TAlScore penalty_;
+
+ // Score at the beginning of the branch
+ TAlScore score_st_;
+
+ // Score at the end of the branch (taking the edit into account)
+ TAlScore score_en_;
+
+ // Length of the branch. That is, the total number of diagonal cells
+ // involved in all the matches and in the edit (if any). Should always be
+ // > 0.
+ size_t len_;
+
+ // The row of the final (bottommost) cell in the branch. This might be the
+ // bottommost match if the branch has no associated edit. Otherwise, it's
+ // the cell occupied by the edit.
+ int64_t row_;
+
+ // The column of the final (bottommost) cell in the branch.
+ int64_t col_;
+
+ // The edit at the bottom of the branch. If this is the bottommost branch
+ // in the alignment and it does not end in an edit, then this remains
+ // uninitialized.
+ Edit e_;
+
+ // True iff this is the bottommost branch in the alignment. We can't just
+ // use row_ to tell us this because local alignments don't necessarily end
+ // in the last row.
+ bool root_;
+
+ bool curtailed_; // true -> pruned at a checkpoint where we otherwise
+ // would have had a match
+
+friend class BtBranchQ;
+friend class BtBranchTracer;
+
+};
+
+/**
+ * Instantiate and solve best-first branch-based backtraces.
+ */
+class BtBranchTracer {
+
+public:
+
+ explicit BtBranchTracer() : // sortedSel_ and cur_ are read by size()/empty(), so they get defined values up front
+     prob_(), bs_(), seenPaths_(DP_CAT), sawcell_(DP_CAT), sortedSel_(true), cur_(0), row_(0), col_(0), doTri_() { }
+
+ /**
+ * Add a branch to the queue.
+ */
+ void add(size_t id) {
+     const BtBranch& br = bs_[id]; assert(!br.isSolution(prob_));
+     unsorted_.push_back(make_pair(br.score_st_, id)); // queued as (start score, branch id)
+ }
+
+ /**
+ * Add a branch to the list of solutions.
+ */
+ void addSolution(size_t id) {
+     assert(bs_[id].isSolution(prob_)); // non-solutions are queued through add() instead
+     solutions_.push_back(id); // only the branch id is recorded
+ }
+
+ /**
+ * Given a potential branch to add to the queue, see if we can follow the
+ * branch a little further first. If it's still valid, or if we reach a
+ * choice between valid outgoing paths, go ahead and add it to the queue.
+ */
+ void examineBranch(
+ int64_t row,
+ int64_t col,
+ const Edit& e,
+ TAlScore pen,
+ TAlScore sc,
+ size_t parentId);
+
+ /**
+ * Take all possible ways of leaving the given branch and add them to the
+ * branch queue.
+ */
+ void addOffshoots(size_t bid);
+
+ /**
+ * Get the best branch and remove it from the priority queue.
+ */
+ size_t best(RandomSource& rnd) {
+     assert(!empty());
+     flushUnsorted();
+     assert_gt(sortedSel_ ? sorted1_.size() : sorted2_.size(), cur_);
+     // Ties for first place are not shuffled, so rnd goes unused here.
+     // Take the entry under the cursor, then advance the cursor past it.
+     const size_t pick = cur_++;
+     return sortedSel_ ? sorted1_[pick] : sorted2_[pick];
+ }
+
+ /**
+ * Return true iff there are no branches left to try.
+ */
+ bool empty() const {
+     return 0 == size(); // nothing unsorted and nothing left past the cursor
+ }
+
+ /**
+ * Return the size, i.e. the total number of branches contained.
+ */
+ size_t size() const {
+     const size_t nsorted = sortedSel_ ? sorted1_.size() : sorted2_.size();
+     return unsorted_.size() + nsorted - cur_; // unsorted entries plus sorted entries not yet consumed
+ }
+
+ /**
+ * Return true iff there are no solutions left to try.
+ */
+ bool emptySolution() const {
+     return 0 == sizeSolution(); // no solution branches recorded
+ }
+
+ /**
+ * Return the size of the solution set so far.
+ */
+ size_t sizeSolution() const {
+     const size_t nsol = solutions_.size(); return nsol; // count of recorded solution ids
+ }
+
+ /**
+ * Sort unsorted branches, merge them with master sorted list.
+ */
+ void flushUnsorted();
+
+#ifndef NDEBUG
+ /**
+ * Sanity-check the queue.
+ */
+ bool repOk() const { // always returns true; a bad cursor trips the assertion
+ assert_lt(cur_, (sortedSel_ ? sorted1_.size() : sorted2_.size())); // cursor must index into the active sorted list
+ return true;
+ }
+#endif
+
+ /**
+ * Initialize the tracer with respect to a new read. This involves
+ * resetting all the state relating to the set of cells already visited
+ */
+ void initRef(
+     const char* rd, // in: read sequence
+     const char* qu, // in: quality sequence
+     size_t rdlen, // in: read sequence length
+     const char* rf, // in: reference sequence
+     size_t rflen, // in: in-rectangle reference sequence length
+     TRefOff trflen, // in: total reference sequence length
+     TRefId refid, // in: reference id
+     TRefOff refoff, // in: reference offset
+     bool fw, // in: orientation
+     const DPRect *rect, // in: DP rectangle
+     const Checkpointer *cper, // in: checkpointer
+     const Scoring& sc, // in: scoring scheme
+     size_t nceil) // in: N ceiling
+ {
+     prob_.initRef(rd, qu, rdlen, rf, rflen, trflen, refid, refoff, fw, rect, cper, &sc, nceil);
+     // One "seen paths" list per diagonal of the rdlen x rflen matrix;
+     // empty them all so nothing carries over from the previous call.
+     const size_t ndiag = rflen + rdlen - 1;
+     seenPaths_.resize(ndiag);
+     for(size_t i = 0; i < ndiag; i++) {
+         seenPaths_[i].clear();
+     }
+     // One visited-cell set per column; the list is only grown, never shrunk
+     if(sawcell_.size() < rflen) {
+         sawcell_.resize(rflen);
+     }
+     // A single pass tags and empties every set in use.  Previously, newly
+     // added sets were tagged twice: once on growth and again in this loop.
+     for(size_t i = 0; i < rflen; i++) {
+         sawcell_[i].setCat(DP_CAT);
+         sawcell_[i].clear();
+     }
+ }
+
+ /**
+ * Initialize with a new backtrace.
+ */
+ void initBt(
+ TAlScore escore, // in: alignment score
+ size_t row, // in: start in this row
+ size_t col, // in: start in this column
+ bool fill, // in: use mini-filling?
+ bool usecp, // in: use checkpointing?
+ bool doTri, // in: triangle-shaped mini-fills?
+ RandomSource& rnd) // in: random gen, to choose among equal paths
+ {
+ prob_.initBt(row, col, fill, usecp, escore);
+ Edit e; e.reset();
+ unsorted_.clear();
+ solutions_.clear();
+ sorted1_.clear();
+ sorted2_.clear();
+ cur_ = 0;
+ nmm_ = 0; // number of mismatches attempted
+ nnmm_ = 0; // number of mismatches involving N attempted
+ nrdop_ = 0; // number of read gap opens attempted
+ nrfop_ = 0; // number of ref gap opens attempted
+ nrdex_ = 0; // number of read gap extensions attempted
+ nrfex_ = 0; // number of ref gap extensions attempted
+ nmmPrune_ = 0; // presumably: number of mismatches pruned (only the reset is visible here)
+ nnmmPrune_ = 0; // presumably: number of mismatches involving N pruned
+ nrdopPrune_ = 0; // presumably: number of read gap opens pruned
+ nrfopPrune_ = 0; // presumably: number of ref gap opens pruned
+ nrdexPrune_ = 0; // presumably: number of read gap extensions pruned
+ nrfexPrune_ = 0; // presumably: number of ref gap extensions pruned
+ row_ = row;
+ col_ = col;
+ doTri_ = doTri;
+ bs_.clear();
+ if(!prob_.fill_) {
+ size_t id = bs_.alloc();
+ bs_[id].init(
+ prob_,
+ 0, // parent id
+ 0, // penalty
+ 0, // starting score
+ row, // row
+ col, // column
+ e,
+ 0,
+ true, // this is the root
+ true); // this should be extended with exact matches
+ if(bs_[id].isSolution(prob_)) {
+ addSolution(id);
+ } else {
+ add(id);
+ }
+ } else {
+ int64_t row = row_, col = col_; // signed copies that shadow the parameters; the fill calls update them in place
+ TAlScore targsc = prob_.targ_;
+ int hef = 0;
+ bool done = false, abort = false;
+ size_t depth = 0;
+ while(!done && !abort) {
+ // Accumulate edits as we go. We can do this by adding
+ // BtBranches to the bs_ structure. Each step of the backtrace
+ // either involves an edit (thereby starting a new branch) or
+ // extends the previous branch by one more position.
+ //
+ // Note: if the BtBranches are in line, then trySolution can be
+ // used to populate the SwResult and check for various
+ // situations where we might reject the alignment (i.e. due to
+ // a cell having been visited previously).
+ if(doTri_) {
+ triangleFill(
+ row, // row of cell to backtrace from
+ col, // column of cell to backtrace from
+ hef, // cell to bt from: H (0), E (1), or F (2)
+ targsc, // score of cell to backtrace from
+ prob_.targ_, // score of alignment we're looking for
+ rnd, // pseudo-random generator
+ row, // out: row we ended up in after bt
+ col, // out: column we ended up in after bt
+ hef, // out: H/E/F after backtrace
+ targsc, // out: score up to cell we ended up in
+ done, // out: finished tracing out an alignment?
+ abort); // out: aborted b/c cell was seen before?
+ } else {
+ squareFill(
+ row, // row of cell to backtrace from
+ col, // column of cell to backtrace from
+ hef, // cell to bt from: H (0), E (1), or F (2)
+ targsc, // score of cell to backtrace from
+ prob_.targ_, // score of alignment we're looking for
+ rnd, // pseudo-random generator
+ row, // out: row we ended up in after bt
+ col, // out: column we ended up in after bt
+ hef, // out: H/E/F after backtrace
+ targsc, // out: score up to cell we ended up in
+ done, // out: finished tracing out an alignment?
+ abort); // out: aborted b/c cell was seen before?
+ }
+ if(depth >= ndep_.size()) { // ndep_[d] counts how many times a mini-fill ran at depth d
+ ndep_.resize(depth+1);
+ ndep_[depth] = 1;
+ } else {
+ ndep_[depth]++;
+ }
+ depth++;
+ assert((row >= 0 && col >= 0) || done);
+ }
+ }
+ ASSERT_ONLY(seen_.clear());
+ }
+
+ /**
+ * Get the next valid alignment given the backtrace problem. Return false
+ * if there is no valid solution (the search strategy depends on prob_.fill_).
+ */
+ bool nextAlignment(
+ size_t maxiter,
+ SwResult& res,
+ size_t& off,
+ size_t& nrej,
+ size_t& niter,
+ RandomSource& rnd);
+
+ /**
+ * Return true iff this tracer has been initialized
+ */
+ bool inited() const {
+ return prob_.inited(); // the tracer counts as initialized exactly when its problem is
+ }
+
+ /**
+ * Return true iff the mini-fills are triangle-shaped.
+ */
+ bool doTri() const { return doTri_; } // value last passed to initBt(); false selects square mini-fills
+
+ /**
+ * Fill in a triangle of the DP table and backtrace from the given cell to
+ * a cell in the previous checkpoint, or to the terminal cell.
+ */
+ void triangleFill(
+ int64_t rw, // row of cell to backtrace from
+ int64_t cl, // column of cell to backtrace from
+ int hef, // cell to backtrace from is H (0), E (1), or F (2)
+ TAlScore targ, // score of cell to backtrace from
+ TAlScore targ_final, // score of alignment we're looking for
+ RandomSource& rnd, // pseudo-random generator
+ int64_t& row_new, // out: row we ended up in after backtrace
+ int64_t& col_new, // out: column we ended up in after backtrace
+ int& hef_new, // out: H/E/F after backtrace
+ TAlScore& targ_new, // out: score up to cell we ended up in
+ bool& done, // out: finished tracing out an alignment?
+ bool& abort); // out: aborted b/c cell was seen before?
+
+ /**
+ * Fill in a square of the DP table and backtrace from the given cell to
+ * a cell in the previous checkpoint, or to the terminal cell.
+ */
+ void squareFill(
+ int64_t rw, // row of cell to backtrace from
+ int64_t cl, // column of cell to backtrace from
+ int hef, // cell to backtrace from is H (0), E (1), or F (2)
+ TAlScore targ, // score of cell to backtrace from
+ TAlScore targ_final, // score of alignment we're looking for
+ RandomSource& rnd, // pseudo-random generator
+ int64_t& row_new, // out: row we ended up in after backtrace
+ int64_t& col_new, // out: column we ended up in after backtrace
+ int& hef_new, // out: H/E/F after backtrace
+ TAlScore& targ_new, // out: score up to cell we ended up in
+ bool& done, // out: finished tracing out an alignment?
+ bool& abort); // out: aborted b/c cell was seen before?
+
+protected:
+
+ /**
+ * Get the next valid alignment given a backtrace problem. Return false
+ * if there is no valid solution. Use a backtracking search to find the
+ * solution. This can be very slow.
+ */
+ bool nextAlignmentBacktrace(
+ size_t maxiter,
+ SwResult& res,
+ size_t& off,
+ size_t& nrej,
+ size_t& niter,
+ RandomSource& rnd);
+
+ /**
+ * Get the next valid alignment given a backtrace problem. Return false
+ * if there is no valid solution. Use a triangle-fill backtrace to find
+ * the solution. This is usually fast (it's O(m + n)).
+ */
+ bool nextAlignmentFill(
+ size_t maxiter,
+ SwResult& res,
+ size_t& off,
+ size_t& nrej,
+ size_t& niter,
+ RandomSource& rnd);
+
+ /**
+ * Try all the solutions accumulated so far. Solutions might be rejected
+ * if they, for instance, overlap a previous solution, have too many Ns,
+ * fail to overlap a core diagonal, etc.
+ */
+ bool trySolutions(
+ bool lookForOlap,
+ SwResult& res,
+ size_t& off,
+ size_t& nrej,
+ RandomSource& rnd,
+ bool& success);
+
+ /**
+ * See if a given solution branch works as a solution (i.e. doesn't overlap
+ * another one, have too many Ns, fail to overlap a core diagonal, etc.)
+ */
+ int trySolution(
+ size_t id,
+ bool lookForOlap,
+ SwResult& res,
+ size_t& off,
+ size_t& nrej,
+ RandomSource& rnd);
+
+ BtBranchProblem prob_; // problem configuration
+ EFactory bs_; // global BtBranch factory
+
+ // already reported alignments going through these diagonal segments
+ ELList > seenPaths_;
+ ELSet sawcell_; // cells already backtraced through
+
+ EList > unsorted_; // unsorted list of as-yet-unflushed BtBranches
+ EList sorted1_; // list of BtBranch, sorted by score
+ EList sorted2_; // list of BtBranch, sorted by score
+ EList solutions_; // list of solution branches
+ bool sortedSel_; // true -> 1, false -> 2
+ size_t cur_; // cursor into sorted list to start from
+
+ size_t nmm_; // number of mismatches attempted
+ size_t nnmm_; // number of mismatches involving N attempted
+ size_t nrdop_; // number of read gap opens attempted
+ size_t nrfop_; // number of ref gap opens attempted
+ size_t nrdex_; // number of read gap extensions attempted
+ size_t nrfex_; // number of ref gap extensions attempted
+
+ size_t nmmPrune_; // presumably: number of mismatches pruned - TODO confirm
+ size_t nnmmPrune_; // presumably: number of mismatches involving N pruned
+ size_t nrdopPrune_; // presumably: number of read gap opens pruned
+ size_t nrfopPrune_; // presumably: number of ref gap opens pruned
+ size_t nrdexPrune_; // presumably: number of read gap extensions pruned
+ size_t nrfexPrune_; // presumably: number of ref gap extensions pruned
+
+ size_t row_; // row
+ size_t col_; // column
+
+ bool doTri_; // true -> fill in triangles; false -> squares
+ EList sq_; // square to fill when doing mini-fills
+ ELList tri_; // triangle to fill when doing mini-fills
+ EList ndep_; // # triangles mini-filled at various depths
+
+#ifndef NDEBUG
+ ESet seen_; // seen branch ids; should never see same twice
+#endif
+};
+
+#endif /*ndef ALIGNER_BT_H_*/
diff --git a/aligner_cache.cpp b/aligner_cache.cpp
new file mode 100644
index 0000000..7a8de26
--- /dev/null
+++ b/aligner_cache.cpp
@@ -0,0 +1,181 @@
+/*
+ * Copyright 2011, Ben Langmead
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "aligner_cache.h"
+#include "tinythread.h"
+
+#ifdef ALIGNER_CACHE_MAIN
+
+#include
+#include
+#include
+#include "random_source.h"
+
+using namespace std;
+
+enum { // ids for options that have no single-character form
+ ARG_TESTS = 256 // --tests; 256 lies above every single-char option value
+};
+
+static const char *short_opts = "vCt";
+static struct option long_opts[] = { // long options accepted by main(); parsed with getopt_long()
+    {(char*)"verbose", no_argument, 0, 'v'},
+    {(char*)"tests", no_argument, 0, ARG_TESTS},
+    {0, 0, 0, 0} }; // getopt_long() requires an all-zero terminating entry; it was missing
+
+static void printUsage(ostream& os) { // write the four-line usage summary to os
+    os << "Usage: sawhi-cache [options]*" << endl
+       << "Options:" << endl
+       << " --tests run unit tests" << endl
+       << " -v/--verbose talkative mode" << endl;
+}
+
+int gVerbose = 0;
+
+static void add( // test helper: wrap a DNA literal in a QKey and insert it into the tree with a NULL value
+ RedBlack& t,
+ Pool& p,
+ const char *dna)
+{
+ QKey qk;
+ qk.init(BTDnaString(dna, true));
+ t.add(p, qk, NULL);
+}
+
+/**
+ * Small tests for the AlignmentCache.
+ */
+static void aligner_cache_tests() {
+ RedBlack rb(1024);
+ Pool p(64 * 1024, 1024);
+ // Small test: seven distinct 11-mers must yield seven entries
+ add(rb, p, "ACGTCGATCGT");
+ add(rb, p, "ACATCGATCGT");
+ add(rb, p, "ACGACGATCGT");
+ add(rb, p, "ACGTAGATCGT");
+ add(rb, p, "ACGTCAATCGT");
+ add(rb, p, "ACGTCGCTCGT");
+ add(rb, p, "ACGTCGAACGT");
+ assert_eq(7, rb.size());
+ rb.clear();
+ p.clear();
+ // Another small test: keys differ near the front
+ add(rb, p, "ACGTCGATCGT");
+ add(rb, p, "CCGTCGATCGT");
+ add(rb, p, "TCGTCGATCGT");
+ add(rb, p, "GCGTCGATCGT");
+ add(rb, p, "AAGTCGATCGT");
+ assert_eq(5, rb.size());
+ rb.clear();
+ p.clear();
+ // Regression test (attempt to make it smaller); no size check follows this batch
+ add(rb, p, "CCTA");
+ add(rb, p, "AGAA");
+ add(rb, p, "TCTA");
+ add(rb, p, "GATC");
+ add(rb, p, "CTGC");
+ add(rb, p, "TTGC");
+ add(rb, p, "GCCG");
+ add(rb, p, "GGAT");
+ rb.clear();
+ p.clear();
+ // Regression test
+ add(rb, p, "CCTA");
+ add(rb, p, "AGAA");
+ add(rb, p, "TCTA");
+ add(rb, p, "GATC");
+ add(rb, p, "CTGC");
+ add(rb, p, "CATC");
+ add(rb, p, "CAAA");
+ add(rb, p, "CTAT");
+ add(rb, p, "CTCA");
+ add(rb, p, "TTGC");
+ add(rb, p, "GCCG");
+ add(rb, p, "GGAT");
+ assert_eq(12, rb.size());
+ rb.clear();
+ p.clear();
+ // Larger random test: first enumerate all 4^4 = 256 four-mers
+ EList strs;
+ char buf[5]; // four bases plus the terminating NUL
+ for(int i = 0; i < 4; i++) {
+ for(int j = 0; j < 4; j++) {
+ for(int k = 0; k < 4; k++) {
+ for(int m = 0; m < 4; m++) {
+ buf[0] = "ACGT"[i];
+ buf[1] = "ACGT"[j];
+ buf[2] = "ACGT"[k];
+ buf[3] = "ACGT"[m];
+ buf[4] = '\0';
+ strs.push_back(BTDnaString(buf, true));
+ }
+ }
+ }
+ }
+ // Add all of the 4-mers in several different random orders
+ RandomSource rand;
+ for(uint32_t runs = 0; runs < 100; runs++) {
+ rb.clear();
+ p.clear();
+ assert_eq(0, rb.size());
+ rand.init(runs); // the run number is the seed, so each run's order is reproducible
+ EList used;
+ used.resize(256);
+ for(int i = 0; i < 256; i++) used[i] = false;
+ for(int i = 0; i < 256; i++) {
+ int r = rand.nextU32() % (256-i); // choose the r-th 4-mer among those still unused
+ int unused = 0;
+ bool added = false;
+ for(int j = 0; j < 256; j++) {
+ if(!used[j] && unused == r) {
+ used[j] = true;
+ QKey qk;
+ qk.init(strs[j]);
+ rb.add(p, qk, NULL);
+ added = true;
+ break;
+ }
+ if(!used[j]) unused++;
+ }
+ assert(added); // r < number of unused entries, so one must have been picked
+ }
+ }
+}
+
+/**
+ * A way of feeding simple tests to the seed alignment infrastructure.
+ */
+int main(int argc, char **argv) {
+    int option_index = 0;
+    int next_option;
+    // Pull options until getopt_long() reports the end of the list (-1)
+    while((next_option = getopt_long(argc, argv, short_opts, long_opts, &option_index)) != -1) {
+        if(next_option == 'v') {
+            gVerbose = true;
+        } else if(next_option == ARG_TESTS) { // run the unit tests and stop
+            aligner_cache_tests();
+            return 0;
+        } else {
+            cerr << "Unknown option: " << (char)next_option << endl;
+            printUsage(cerr);
+            exit(1);
+        }
+    }
+}
+#endif
diff --git a/aligner_cache.h b/aligner_cache.h
new file mode 100644
index 0000000..2237071
--- /dev/null
+++ b/aligner_cache.h
@@ -0,0 +1,1013 @@
+/*
+ * Copyright 2011, Ben Langmead
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef ALIGNER_CACHE_H_
+#define ALIGNER_CACHE_H_
+
+/**
+ * CACHING
+ *
+ * By caching the results of some alignment sub-problems, we hope to
+ * enable a "fast path" for read alignment whereby answers are mostly
+ * looked up rather than calculated from scratch. This is particularly
+ * effective when the input is sorted or otherwise grouped in a way
+ * that brings together reads with (at least some) seed sequences in
+ * common.
+ *
+ * But the cache is also where results are held, regardless of whether
+ * the results are maintained & re-used across reads.
+ *
+ * The cache consists of two linked portions:
+ *
+ * 1. A multimap from seed strings (i.e. read substrings) to reference strings
+ * that are within some edit distance (roughly speaking). This is the "seed
+ * multimap".
+ *
+ * Key: Read substring (2-bit-per-base encoded + length)
+ * Value: Set of reference substrings (i.e. keys into the suffix
+ * array multimap).
+ *
+ * 2. A multimap from reference strings to the corresponding elements of the
+ * suffix array. Elements are filled in with reference-offset info as it's
+ * calculated. This is the "suffix array multimap"
+ *
+ * Key: Reference substring (2-bit-per-base encoded + length)
+ * Value: (a) top from BWT, (b) length of range, (c) offset of first
+ * range element in the salist
+ *
+ * For both multimaps, we use a combo Red-Black tree and EList. The payload in
+ * the Red-Black tree nodes points to a range in the EList.
+ */
+
+#include
+#include "ds.h"
+#include "read.h"
+#include "threading.h"
+#include "mem_ids.h"
+#include "simple_func.h"
+#include "btypes.h"
+
+#define CACHE_PAGE_SZ (16 * 1024)
+
+typedef PListSlice TSlice;
+
+/**
+ * Key for the query multimap: the read substring and its length.
+ *
+ * The substring is packed two bits per base into the 64-bit 'seq' field,
+ * so at most 32 bases fit. A 'len' of 0xffffffff marks the key as
+ * invalid / not cacheable.
+ */
+struct QKey {
+
+    /**
+     * Initialize invalid QKey.
+     */
+    QKey() { reset(); }
+
+    /**
+     * Initialize QKey from DNA string. In assert-enabled builds 'tmp' is
+     * scratch space used to verify the encoding round-trips.
+     */
+    QKey(const BTDnaString& s ASSERT_ONLY(, BTDnaString& tmp)) {
+        init(s ASSERT_ONLY(, tmp));
+    }
+
+    /**
+     * Initialize QKey from DNA string. Rightmost character is placed in the
+     * least significant bitpair.
+     *
+     * Returns true iff the string was cacheable, i.e. it has at most 32
+     * characters and none of them has code 4. On failure 'len' is set to
+     * 0xffffffff; 'seq' is left holding whatever had been packed so far.
+     */
+    bool init(
+        const BTDnaString& s
+        ASSERT_ONLY(, BTDnaString& tmp))
+    {
+        seq = 0;
+        len = (uint32_t)s.length();
+        ASSERT_ONLY(tmp.clear());
+        if(len > 32) {
+            len = 0xffffffff;
+            return false; // wasn't cacheable
+        } else {
+            // Rightmost char of 's' goes in the least significant bitpair
+            for(size_t i = 0; i < 32 && i < s.length(); i++) {
+                int c = (int)s.get(i);
+                assert_range(0, 4, c);
+                if(c == 4) {
+                    len = 0xffffffff;
+                    return false;
+                }
+                // Reuse the already-fetched character instead of reading it twice
+                seq = (seq << 2) | (uint64_t)c;
+            }
+            ASSERT_ONLY(toString(tmp));
+            assert(sstr_eq(tmp, s));
+            assert_leq(len, 32);
+            return true; // was cacheable
+        }
+    }
+
+    /**
+     * Convert this key to a DNA string. 's' is resized to 'len' and filled
+     * from the last position backwards, since the last character lives in
+     * the least significant bitpair.
+     */
+    void toString(BTDnaString& s) const {
+        s.resize(len);
+        uint64_t sq = seq;
+        // Count down with an unsigned index; the previous 'int i = len-1'
+        // form depended on unsigned wrap-around when len == 0.
+        for(uint32_t i = len; i > 0; i--) {
+            s.set((uint32_t)(sq & 3), i - 1);
+            sq >>= 2;
+        }
+    }
+
+    /**
+     * Return true iff the read substring is cacheable.
+     */
+    bool cacheable() const { return len != 0xffffffff; }
+
+    /**
+     * Reset to uninitialized state.
+     */
+    void reset() { seq = 0; len = 0xffffffff; }
+
+    /**
+     * True -> my key is less than the given key. Keys are ordered by
+     * packed sequence first and by length second.
+     */
+    bool operator<(const QKey& o) const {
+        return seq < o.seq || (seq == o.seq && len < o.len);
+    }
+
+    /**
+     * True -> my key is greater than the given key.
+     */
+    bool operator>(const QKey& o) const {
+        return !(*this < o || *this == o);
+    }
+
+    /**
+     * True -> my key is equal to the given key.
+     */
+    bool operator==(const QKey& o) const {
+        return seq == o.seq && len == o.len;
+    }
+
+
+    /**
+     * True -> my key is not equal to the given key.
+     */
+    bool operator!=(const QKey& o) const {
+        return !(*this == o);
+    }
+
+#ifndef NDEBUG
+    /**
+     * Check that this is a valid, initialized QKey.
+     */
+    bool repOk() const {
+        return len != 0xffffffff;
+    }
+#endif
+
+    uint64_t seq; // sequence
+    uint32_t len; // length of sequence
+};
+
+template
+class AlignmentCache;
+
+/**
+ * Payload for the query multimap: describes a run of entries in the
+ * reference string list (the qlist) by its starting index, the number of
+ * ranges and the total number of elements. A range count equal to
+ * (index_t)OFF_MASK marks the value as invalid.
+ */
+template
+class QVal {
+
+public:
+
+    QVal() { reset(); }
+
+    /**
+     * Index in the qlist of the first reference substring.
+     */
+    index_t offset() const { return i_; }
+
+    /**
+     * Number of reference substrings associated with the read substring.
+     * Asserts that the QVal is valid.
+     */
+    index_t numRanges() const {
+        assert(valid());
+        return rangen_;
+    }
+
+    /**
+     * Total number of elements over all associated reference substrings.
+     * Asserts that the QVal is valid.
+     */
+    index_t numElts() const {
+        assert(valid());
+        return eltn_;
+    }
+
+    /**
+     * True iff no reference substring is associated with the read
+     * substring. Asserts that the QVal is valid.
+     */
+    bool empty() const {
+        assert(valid());
+        return rangen_ == 0;
+    }
+
+    /**
+     * True iff the QVal has been initialized.
+     */
+    bool valid() const {
+        return !(rangen_ == (index_t)OFF_MASK);
+    }
+
+    /**
+     * Put the QVal back into the invalid state.
+     */
+    void reset() {
+        const index_t invalid = (index_t)OFF_MASK;
+        i_      = 0;
+        eltn_   = invalid;
+        rangen_ = invalid;
+    }
+
+    /**
+     * Set the starting index, range count and element count.
+     */
+    void init(index_t i, index_t ranges, index_t elts) {
+        i_      = i;
+        rangen_ = ranges;
+        eltn_   = elts;
+    }
+
+    /**
+     * Account for one more range holding 'numElts' elements.
+     */
+    void addRange(index_t numElts) {
+        eltn_ += numElts;
+        ++rangen_;
+    }
+
+#ifndef NDEBUG
+    /**
+     * Check that this QVal is internally consistent and consistent
+     * with the contents of the given cache.
+     */
+    bool repOk(const AlignmentCache& ac) const;
+#endif
+
+protected:
+
+    index_t i_;      // idx of first elt in qlist
+    index_t rangen_; // # ranges (= # associated reference substrings)
+    index_t eltn_;   // # elements (total)
+};
+
+/**
+ * Key for the suffix array multimap: the reference substring and its
+ * length. Same as QKey so I typedef it.
+ */
+typedef QKey SAKey;
+
+/**
+ * Payload for the suffix array multimap: (a) the top element of the
+ * range in BWT, (b) the offset of the first elt in the salist, (c)
+ * length of the range. A length equal to (index_t)OFF_MASK marks the
+ * value as invalid.
+ */
+template
+struct SAVal {
+
+    // Cast the sentinel the same way valid() does so both sides of the
+    // comparison always agree.
+    SAVal() : topf(), topb(), i(), len((index_t)OFF_MASK) { }
+
+    /**
+     * Return true iff the SAVal is valid. Const so that it can be called
+     * through a const reference to the payload.
+     */
+    bool valid() const { return len != (index_t)OFF_MASK; }
+
+#ifndef NDEBUG
+    /**
+     * Check that this SAVal is internally consistent and consistent
+     * with the contents of the given cache.
+     */
+    bool repOk(const AlignmentCache& ac) const;
+#endif
+
+    /**
+     * Initialize the SAVal.
+     *
+     * tf: top in BWT; tb: top in BWT'; ii: index of the first element in
+     * the salist; ln: length of the range.
+     */
+    void init(
+        index_t tf,
+        index_t tb,
+        index_t ii,
+        index_t ln)
+    {
+        topf = tf;
+        topb = tb;
+        i = ii;
+        len = ln;
+    }
+
+    index_t topf; // top in BWT
+    index_t topb; // top in BWT'
+    index_t i;    // idx of first elt in salist
+    index_t len;  // length of range
+};
+
+/**
+ * Bundles everything cached for one reference substring: its key, the
+ * tops of its ranges in the BWT and BWT' indexes, and a slice of offsets.
+ * Useful for summarizing what info should be added to the cache for a
+ * partial alignment.
+ */
+template
+class SATuple {
+
+public:
+
+    SATuple() { reset(); }
+
+    SATuple(SAKey k, index_t tf, index_t tb, TSlice o) {
+        init(k, tf, tb, o);
+    }
+
+    /**
+     * Set all four fields at once.
+     */
+    void init(SAKey k, index_t tf, index_t tb, TSlice o) {
+        key  = k;
+        topf = tf;
+        topb = tb;
+        offs = o;
+    }
+
+    /**
+     * Initialize this SATuple from a subrange of the SATuple 'src',
+     * delimited by 'first' and 'last'. The forward top is advanced by
+     * 'first'; the top in BWT' is not known for a subrange and is set to
+     * (index_t)OFF_MASK.
+     */
+    void init(const SATuple& src, index_t first, index_t last) {
+        assert_neq((index_t)OFF_MASK, src.topb);
+        key  = src.key;
+        topf = (index_t)(src.topf + first);
+        topb = (index_t)OFF_MASK; // unknown!
+        offs.init(src.offs, first, last);
+    }
+
+#ifndef NDEBUG
+    /**
+     * Check that the offsets slice passes its own repOk().
+     */
+    bool repOk() const {
+        assert(offs.repOk());
+        return true;
+    }
+#endif
+
+    /**
+     * Ordering used when prioritizing which SATuples to explore first
+     * when extending seed hits into full alignments: smaller ranges come
+     * first, and 'topf' breaks ties (any tie-break would do).
+     */
+    bool operator<(const SATuple& o) const {
+        if(offs.size() != o.offs.size()) {
+            return offs.size() < o.offs.size();
+        }
+        return topf < o.topf;
+    }
+
+    /**
+     * Mirror image of operator<: larger ranges first, then larger 'topf'.
+     */
+    bool operator>(const SATuple& o) const {
+        if(offs.size() != o.offs.size()) {
+            return offs.size() > o.offs.size();
+        }
+        return topf > o.topf;
+    }
+
+    /**
+     * Field-by-field equality.
+     */
+    bool operator==(const SATuple& o) const {
+        return key == o.key &&
+               topf == o.topf &&
+               topb == o.topb &&
+               offs == o.offs;
+    }
+
+    /**
+     * Invalidate both tops and reset the offsets slice. The key is left
+     * as it was.
+     */
+    void reset() {
+        topb = (index_t)OFF_MASK;
+        topf = topb;
+        offs.reset();
+    }
+
+    /**
+     * Set the length to be at most the original length.
+     */
+    void setLength(index_t nlen) {
+        assert_leq(nlen, offs.size());
+        offs.setLength(nlen);
+    }
+
+    /**
+     * Return the number of times this reference substring occurs in the
+     * reference, which is also the size of the 'offs' TSlice.
+     */
+    index_t size() const { return (index_t)offs.size(); }
+
+    // bot/length of SA range equals offs.size()
+    SAKey   key;  // sequence key
+    index_t topf; // top in BWT index
+    index_t topb; // top in BWT' index
+    TSlice  offs; // offsets
+};
+
+/**
+ * Encapsulate the data structures and routines that constitute a
+ * particular cache, i.e., a particular stratum of the cache system,
+ * which might comprise many strata.
+ *
+ * Each thread has a "current-read" AlignmentCache which is used to
+ * build and store subproblem results as alignment is performed. When
+ * we're finished with a read, we might copy the cached results for
+ * that read (and perhaps a bundle of other recently-aligned reads) to
+ * a higher-level "across-read" cache. Higher-level caches may or may
+ * not be shared among threads.
+ *
+ * A cache consists chiefly of two multimaps, each implemented as a
+ * Red-Black tree map backed by a PList (TQList / TSAList). A 'version'
+ * counter is incremented every time the cache is cleared.
+ */
+template
+class AlignmentCache {
+
+ typedef RedBlackNode > QNode;
+ typedef RedBlackNode > SANode;
+
+ typedef PList TQList;
+ typedef PList TSAList;
+
+public:
+
+ AlignmentCache(
+ uint64_t bytes,
+ bool shared) :
+ pool_(bytes, CACHE_PAGE_SZ, CA_CAT),
+ qmap_(CACHE_PAGE_SZ, CA_CAT),
+ qlist_(CA_CAT),
+ samap_(CACHE_PAGE_SZ, CA_CAT),
+ salist_(CA_CAT),
+ shared_(shared),
+ mutex_m(),
+ version_(0)
+ {
+ }
+
+ /**
+ * Given a QVal, append to 'satups' one SATuple for each of its
+ * reference substrings whose cached SA range is non-empty. 'nrange'
+ * and 'nelt' are incremented (not reset) by what was appended.
+ */
+ template
+ void queryQval(
+ const QVal& qv,
+ EList, S>& satups,
+ index_t& nrange,
+ index_t& nelt,
+ bool getLock = true)
+ {
+ ThreadSafe ts(lockPtr(), shared_ && getLock);
+ assert(qv.repOk(*this));
+ const index_t refi = qv.offset();
+ const index_t reff = refi + qv.numRanges();
+ // For each reference sequence sufficiently similar to the
+ // query sequence in the QKey...
+ for(index_t i = refi; i < reff; i++) {
+ // Get corresponding SAKey, containing similar reference
+ // sequence & length
+ SAKey sak = qlist_.get(i);
+ // Shouldn't have identical keys in qlist_
+ assert(i == refi || qlist_.get(i) != qlist_.get(i-1));
+ // Get corresponding SANode
+ SANode *n = samap_.lookup(sak);
+ assert(n != NULL);
+ const SAVal& sav = n->payload;
+ assert(sav.repOk(*this));
+ if(sav.len > 0) {
+ nrange++;
+ satups.expand();
+ satups.back().init(sak, sav.topf, sav.topb, TSlice(salist_, sav.i, sav.len));
+ nelt += sav.len;
+#ifndef NDEBUG
+ // Shouldn't add consecutive identical entries to satups
+ if(i > refi) {
+ const SATuple b1 = satups.back();
+ const SATuple b2 = satups[satups.size()-2];
+ assert(b1.key != b2.key || b1.topf != b2.topf || b1.offs != b2.offs);
+ }
+#endif
+ }
+ }
+ }
+
+ /**
+ * Return true iff the cache has no entries in it.
+ */
+ bool empty() const {
+ bool ret = qmap_.empty();
+ assert(!ret || qlist_.empty());
+ assert(!ret || samap_.empty());
+ assert(!ret || salist_.empty());
+ return ret;
+ }
+
+ /**
+ * Add a new query key ('qk', usually a 2-bit encoded substring of
+ * the read) as the key in a new Red-Black node in the qmap and
+ * return a pointer to the node's QVal, or NULL if qmap_.add()
+ * returned NULL. '*added' is filled in by qmap_.add().
+ *
+ * The expectation is that the caller is about to set about finding
+ * associated reference substrings, and that there will be future
+ * calls to addOnTheFly to add associations to reference substrings
+ * found.
+ */
+ QVal* add(
+ const QKey& qk,
+ bool *added,
+ bool getLock = true)
+ {
+ ThreadSafe ts(lockPtr(), shared_ && getLock);
+ assert(qk.cacheable());
+ QNode *n = qmap_.add(pool(), qk, added);
+ return (n != NULL ? &n->payload : NULL);
+ }
+
+ /**
+ * Add a new association between the read sequence whose QVal is 'qv'
+ * and the reference sequence 'sak'; returns false on failure.
+ */
+ bool addOnTheFly(
+ QVal& qv, // qval that points to the range of reference substrings
+ const SAKey& sak, // the key holding the reference substring
+ index_t topf, // top range elt in BWT index
+ index_t botf, // bottom range elt in BWT index
+ index_t topb, // top range elt in BWT' index
+ index_t botb, // bottom range elt in BWT' index
+ bool getLock = true);
+
+ /**
+ * Clear the cache, i.e. turn it over. All HitGens referring to
+ * ranges in this cache will become invalid and the corresponding
+ * reads will have to be re-aligned. Also bumps the version counter.
+ */
+ void clear(bool getLock = true) {
+ ThreadSafe ts(lockPtr(), shared_ && getLock);
+ pool_.clear();
+ qmap_.clear();
+ qlist_.clear();
+ samap_.clear();
+ salist_.clear();
+ version_++;
+ }
+
+ /**
+ * Return the number of keys in the query multimap.
+ */
+ index_t qNumKeys() const { return (index_t)qmap_.size(); }
+
+ /**
+ * Return the number of keys in the suffix array multimap.
+ */
+ index_t saNumKeys() const { return (index_t)samap_.size(); }
+
+ /**
+ * Return the number of elements in the reference substring list.
+ */
+ index_t qSize() const { return (index_t)qlist_.size(); }
+
+ /**
+ * Return the number of elements in the SA range list.
+ */
+ index_t saSize() const { return (index_t)salist_.size(); }
+
+ /**
+ * Return the pool.
+ */
+ Pool& pool() { return pool_; }
+
+ /**
+ * Return the lock object.
+ */
+ MUTEX_T& lock() {
+ return mutex_m;
+ }
+
+ /**
+ * Return a non-const pointer to the lock object from a const
+ * context. This allows const member functions to grab the lock.
+ */
+ MUTEX_T* lockPtr() const {
+ return const_cast(&mutex_m);
+ }
+
+ /**
+ * Return true iff this cache is shared among threads.
+ */
+ bool shared() const { return shared_; }
+
+ /**
+ * Return the current "version" of the cache, i.e. the total number
+ * of times it has turned over since its creation.
+ */
+ uint32_t version() const { return version_; }
+
+protected:
+
+ Pool pool_; // dispenses memory pages
+ RedBlack > qmap_; // map from query substrings to reference substrings
+ TQList qlist_; // list of reference substrings
+ RedBlack > samap_; // map from reference substrings to SA ranges
+ TSAList salist_; // list of SA ranges
+
+ bool shared_; // true -> this cache is global
+ MUTEX_T mutex_m; // mutex used for synchronization in case the cache is shared.
+ uint32_t version_; // cache version
+};
+
+/**
+ * Interface used to query and update a set of caches: the current-read
+ * cache (required; asserted non-NULL), a thread-local unsynchronized
+ * cache and a shared synchronized cache (either may be NULL).
+ */
+template
+class AlignmentCacheIface {
+
+public:
+
+ AlignmentCacheIface(
+ AlignmentCache *current,
+ AlignmentCache *local,
+ AlignmentCache *shared) :
+ qk_(),
+ qv_(NULL),
+ cacheable_(false),
+ rangen_(0),
+ eltsn_(0),
+ current_(current),
+ local_(local),
+ shared_(shared)
+ {
+ assert(current_ != NULL);
+ }
+
+#if 0
+ /**
+ * Query the relevant set of caches, looking for a QVal to go with
+ * the provided QKey. If the QVal is found in a cache other than
+ * the current-read cache, it is copied into the current-read cache
+ * first and the QVal pointer for the current-read cache is
+ * returned. This function never returns a pointer from any cache
+ * other than the current-read cache. If the QVal could not be
+ * found in any cache OR if the QVal was found in a cache other
+ * than the current-read cache but could not be copied into the
+ * current-read cache, NULL is returned.
+ */
+ QVal* queryCopy(const QKey& qk, bool getLock = true) {
+ assert(qk.cacheable());
+ AlignmentCache* caches[3] = { current_, local_, shared_ };
+ for(int i = 0; i < 3; i++) {
+ if(caches[i] == NULL) continue;
+ QVal* qv = caches[i]->query(qk, getLock);
+ if(qv != NULL) {
+ if(i == 0) return qv;
+ if(!current_->copy(qk, *qv, *caches[i], getLock)) {
+ // Exhausted memory in the current cache while
+ // attempting to copy in the qk
+ return NULL;
+ }
+ QVal* curqv = current_->query(qk, getLock);
+ assert(curqv != NULL);
+ return curqv;
+ }
+ }
+ return NULL;
+ }
+
+ /**
+ * Query the relevant set of caches, looking for a QVal to go with
+ * the provided QKey. If a QVal is found and 'which' is non-NULL,
+ * *which is set to point to the cache the QVal was found in (the
+ * current-read cache, the local across-read cache or the shared
+ * across-read cache). Returns NULL if no cache holds the key.
+ */
+ inline QVal* query(
+ const QKey& qk,
+ AlignmentCache** which,
+ bool getLock = true)
+ {
+ assert(qk.cacheable());
+ AlignmentCache* caches[3] = { current_, local_, shared_ };
+ for(int i = 0; i < 3; i++) {
+ if(caches[i] == NULL) continue;
+ QVal* qv = caches[i]->query(qk, getLock);
+ if(qv != NULL) {
+ if(which != NULL) *which = caches[i];
+ return qv;
+ }
+ }
+ return NULL;
+ }
+#endif
+
+ /**
+ * This function is called whenever we start to align a new read or
+ * read substring. We make a key for it and store the key in qk_.
+ * If the sequence is uncacheable, we don't actually add it to the
+ * map (qvbuf_ is used as the QVal instead) but the corresponding
+ * reference substrings are still added to the qlist_.
+ *
+ * Returns:
+ * -1 if no QVal could be obtained because current_->add() returned
+ * NULL (treated as out of memory)
+ * 0 otherwise: the QVal was reset and the caller must do the search.
+ * The cache-lookup path below is commented out, so 1 is never returned.
+ */
+ int beginAlign(
+ const BTDnaString& seq,
+ const BTString& qual,
+ QVal& qv, // out: currently unused (cache-lookup path is commented out)
+ bool getLock = true)
+ {
+ assert(repOk());
+ qk_.init(seq ASSERT_ONLY(, tmpdnastr_));
+ //if(qk_.cacheable() && (qv_ = current_->query(qk_, getLock)) != NULL) {
+ // // qv_ holds the answer
+ // assert(qv_->valid());
+ // qv = *qv_;
+ // resetRead();
+ // return 1; // found in cache
+ //} else
+ if(qk_.cacheable()) {
+ // Make a QNode for this key and possibly add the QNode to the
+ // Red-Black map; but if 'seq' isn't cacheable, just create the
+ // QNode (without adding it to the map).
+ qv_ = current_->add(qk_, &cacheable_, getLock);
+ } else {
+ qv_ = &qvbuf_;
+ }
+ if(qv_ == NULL) {
+ resetRead();
+ return -1; // Could not obtain a QVal (out of memory)
+ }
+ qv_->reset();
+ return 0; // Need to search for it
+ }
+ ASSERT_ONLY(BTDnaString tmpdnastr_);
+
+ /**
+ * Called when the caller has finished aligning a read (and so has
+ * finished adding associated reference strings). Returns a copy of
+ * the final QVal object and resets the per-read state of this
+ * interface. Must only be called while aligning() is true.
+ *
+ * The code that would commit a cacheable alignment to the next cache
+ * up in the hierarchy is currently compiled out (#if 0).
+ */
+ QVal finishAlign(bool getLock = true) {
+ if(!qv_->valid()) {
+ qv_->init(0, 0, 0);
+ }
+ // Copy this pointer because we're about to reset the qv_ field
+ // to NULL
+ QVal* qv = qv_;
+ // Commit the contents of the current-read cache to the next
+ // cache up in the hierarchy.
+ // If qk is cacheable, then it must be in the cache
+#if 0
+ if(qk_.cacheable()) {
+ AlignmentCache* caches[3] = { current_, local_, shared_ };
+ ASSERT_ONLY(AlignmentCache* which);
+ ASSERT_ONLY(QVal* qv2 = query(qk_, &which, true));
+ assert(qv2 == qv);
+ assert(which == current_);
+ for(int i = 1; i < 3; i++) {
+ if(caches[i] != NULL) {
+ // Copy this key/value pair to the higher
+ // level cache and, if its memory is exhausted,
+ // clear the cache and try again.
+ caches[i]->clearCopy(qk_, *qv_, *current_, getLock);
+ break;
+ }
+ }
+ }
+#endif
+ // Reset the state in this iface in preparation for the next
+ // alignment.
+ resetRead();
+ assert(repOk());
+ return *qv;
+ }
+
+ /**
+ * A call to this member indicates that the caller has finished
+ * with the last read (if any) and is ready to work on the next.
+ * The current-read cache is cleared and the per-read state of this
+ * interface is reset.
+ */
+ void nextRead() {
+ current_->clear();
+ resetRead();
+ assert(!aligning());
+ }
+
+ /**
+ * Return true iff we're in the middle of aligning a sequence.
+ */
+ bool aligning() const {
+ return qv_ != NULL;
+ }
+
+ /**
+ * Clears the current-read, local and shared caches (those non-NULL).
+ */
+ void clear() {
+ if(current_ != NULL) current_->clear();
+ if(local_ != NULL) local_->clear();
+ if(shared_ != NULL) shared_->clear();
+ }
+
+ /**
+ * Add a reference substring to the running list being compiled for
+ * the current read in the current-read cache; true iff it was added.
+ */
+ bool addOnTheFly(
+ const BTDnaString& rfseq, // reference sequence close to read seq
+ index_t topf, // top in BWT index
+ index_t botf, // bot in BWT index
+ index_t topb, // top in BWT' index
+ index_t botb, // bot in BWT' index
+ bool getLock = true) // true -> lock is not held by caller
+ {
+
+ assert(aligning());
+ assert(repOk());
+ ASSERT_ONLY(BTDnaString tmp);
+ SAKey sak(rfseq ASSERT_ONLY(, tmp));
+ //assert(sak.cacheable());
+ if(current_->addOnTheFly((*qv_), sak, topf, botf, topb, botb, getLock)) {
+ rangen_++;
+ eltsn_ += (botf-topf);
+ return true;
+ }
+ return false;
+ }
+
+ /**
+ * Given a QVal, populate the given EList of SATuples with records
+ * describing all of the cached information about the QVal's
+ * reference substrings. Forwards to the current-read cache.
+ */
+ template
+ void queryQval(
+ const QVal& qv,
+ EList, S>& satups,
+ index_t& nrange,
+ index_t& nelt,
+ bool getLock = true)
+ {
+ current_->queryQval(qv, satups, nrange, nelt, getLock);
+ }
+
+ /**
+ * Return a pointer to the current-read cache object.
+ */
+ const AlignmentCache* currentCache() const { return current_; }
+
+ index_t curNumRanges() const { return rangen_; }
+ index_t curNumElts() const { return eltsn_; }
+
+#ifndef NDEBUG
+ /**
+ * Check that AlignmentCacheIface is internally consistent.
+ */
+ bool repOk() const {
+ assert(current_ != NULL);
+ assert_geq(eltsn_, rangen_);
+ if(qv_ == NULL) {
+ assert_eq(0, rangen_);
+ assert_eq(0, eltsn_);
+ }
+ return true;
+ }
+#endif
+
+ /**
+ * Return the alignment cache for the current read.
+ */
+ const AlignmentCache& current() {
+ return *current_;
+ }
+
+protected:
+
+ /**
+ * Reset fields encoding info about the in-process read.
+ */
+ void resetRead() {
+ cacheable_ = false;
+ rangen_ = eltsn_ = 0;
+ qv_ = NULL;
+ }
+
+ QKey qk_; // key representation for current read substring
+ QVal *qv_; // pointer to value representation for current read substring
+ QVal qvbuf_; // buffer for when key is uncacheable but we need a qv
+ bool cacheable_; // true iff the read substring currently being aligned is cacheable
+
+ index_t rangen_; // number of ranges since last alignment job began
+ index_t eltsn_; // number of elements since last alignment job began
+
+ AlignmentCache *current_; // cache dedicated to the current read
+ AlignmentCache *local_; // local, unsynchronized cache
+ AlignmentCache *shared_; // shared, synchronized cache
+};
+
+#ifndef NDEBUG
+/**
+ * Debug-only sanity check for a QVal against the cache 'ac': when the
+ * QVal has at least one range, its run of entries must lie inside the
+ * cache's reference substring list, and the element count must be at
+ * least the range count. Always returns true; violations trip asserts.
+ */
+template
+bool QVal::repOk(const AlignmentCache& ac) const {
+    const bool hasRanges = (rangen_ > 0);
+    if(hasRanges) {
+        assert_lt(i_, ac.qSize());
+        assert_leq(i_ + rangen_, ac.qSize());
+    }
+    assert_geq(eltn_, rangen_);
+    return true;
+}
+#endif
+
+#ifndef NDEBUG
+/**
+ * Debug-only sanity check for an SAVal against the cache 'ac': a
+ * non-empty range must start inside the cache's SA range list, and the
+ * whole range must end within it. Always returns true; violations trip
+ * asserts.
+ */
+template
+bool SAVal::repOk(const AlignmentCache& ac) const {
+    const bool emptyRange = (len == 0);
+    assert(emptyRange || i < ac.saSize());
+    assert_leq(i + len, ac.saSize());
+    return true;
+}
+#endif
+
+/**
+ * Add a new association between the read sequence whose QVal is 'qv' and
+ * the reference sequence 'sak'.
+ *
+ * 'sak' is appended to qlist_ and tallied in 'qv' ('qv' is initialized
+ * first if it is not yet valid). If 'sak' was not already a key of samap_,
+ * its payload is filled in from topf/topb and (botf - topf) slots holding
+ * 0xffffffff are appended to salist_ for it. 'botb' is not used.
+ *
+ * Returns false if a pool-backed add failed (pool memory exhausted), true
+ * otherwise. 'qv' only tallies the range once the key is on qlist_, so a
+ * failure to store the key does not leave 'qv' counting it.
+ */
+template
+bool AlignmentCache::addOnTheFly(
+    QVal& qv, // qval that points to the range of reference substrings
+    const SAKey& sak, // the key holding the reference substring
+    index_t topf, // top range elt in BWT index
+    index_t botf, // bottom range elt in BWT index
+    index_t topb, // top range elt in BWT' index
+    index_t botb, // bottom range elt in BWT' index
+    bool getLock)
+{
+    ThreadSafe ts(lockPtr(), shared_ && getLock);
+    const index_t nelt = botf - topf; // number of elements in the SA range
+    // If this is the first reference sequence we're associating with
+    // the query sequence, initialize the QVal.
+    if(!qv.valid()) {
+        qv.init((index_t)qlist_.size(), 0, 0);
+    }
+    if(!qlist_.add(pool(), sak)) {
+        return false; // Exhausted pool memory; nothing tallied in 'qv'
+    }
+    // Update tally for # ranges and # elts only now that the key is stored
+    qv.addRange(nelt);
+#ifndef NDEBUG
+    // Consecutive keys recorded for this QVal must differ
+    for(index_t i = qv.offset(); i < qlist_.size(); i++) {
+        if(i > qv.offset()) {
+            assert(qlist_.get(i) != qlist_.get(i-1));
+        }
+    }
+#endif
+    assert_eq(qv.offset() + qv.numRanges(), qlist_.size());
+    bool added = true;
+    SANode *s = samap_.add(pool(), sak, &added);
+    if(s == NULL) {
+        return false; // Exhausted pool memory
+    }
+    assert(s->key.repOk());
+    if(added) {
+        s->payload.i = (index_t)salist_.size();
+        s->payload.len = nelt;
+        s->payload.topf = topf;
+        s->payload.topb = topb;
+        for(index_t j = 0; j < nelt; j++) {
+            if(!salist_.add(pool(), (index_t)0xffffffff)) {
+                // Shrink the payload to the slots actually reserved; keep
+                // the value in index_t rather than truncating to 32 bits
+                s->payload.len = j;
+                return false; // Exhausted pool memory
+            }
+        }
+        assert(s->payload.repOk(*this));
+    }
+    return true;
+}
+
+#endif /*ALIGNER_CACHE_H_*/
diff --git a/aligner_driver.cpp b/aligner_driver.cpp
new file mode 100644
index 0000000..00703de
--- /dev/null
+++ b/aligner_driver.cpp
@@ -0,0 +1,80 @@
+/*
+ * Copyright 2012, Ben Langmead
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "aligner_driver.h"
+
+/**
+ * Append extension roots (to 'roots') and a matching configuration for
+ * each (to 'confs') for read 'q'.
+ *
+ * For each permitted orientation (forward unless 'nofw', reverse-complement
+ * unless 'norc'), left-to-right roots are laid down first, followed by
+ * right-to-left roots. Roots are placed every 'interval' positions for as
+ * long as a landing of length landing_ still fits in the read; at least
+ * one root is always emitted per direction. The interval comes from
+ * rootIval_ evaluated at the read length and is boosted by 20% when an
+ * opposite mate 'qo' is supplied. All roots get priority 0.
+ */
+void AlignerDriverRootSelector::select(
+    const Read& q,
+    const Read* qo,
+    bool nofw,
+    bool norc,
+    EList& confs,
+    EList& roots)
+{
+    // Calculate interval length for both mates
+    int interval = rootIval_.f((double)q.length());
+    if(qo != NULL) {
+        // Boost interval length by 20% for paired-end reads
+        interval = (int)(interval * 1.2 + 0.5);
+    }
+    // A non-positive interval would never advance 'i' in the loops below,
+    // so they would never terminate; clamp to the smallest usable step.
+    if(interval < 1) {
+        interval = 1;
+    }
+    float pri = 0.0f;
+    for(int fwi = 0; fwi < 2; fwi++) {
+        bool fw = (fwi == 0);
+        if((fw && nofw) || (!fw && norc)) {
+            continue;
+        }
+        // First pass puts down left-to-right roots, second pass puts down
+        // right-to-left roots, both w/r/t the current orientation
+        for(int diri = 0; diri < 2; diri++) {
+            const bool l2r = (diri == 0);
+            bool first = true;
+            size_t i = 0;
+            while(first || (i + landing_ <= q.length())) {
+                confs.expand();
+                confs.back().cons.init(landing_, consExp_);
+                roots.expand();
+                roots.back().init(
+                    l2r ? i : q.length() - i - 1, // offset from 5' end
+                    l2r,                          // left-to-right?
+                    fw,                           // fw?
+                    q.length(),                   // query length
+                    pri);                         // root priority
+                i += interval;
+                first = false;
+            }
+        }
+    }
+}
+
diff --git a/aligner_driver.h b/aligner_driver.h
new file mode 100644
index 0000000..97f06bf
--- /dev/null
+++ b/aligner_driver.h
@@ -0,0 +1,247 @@
+/*
+ * Copyright 2012, Ben Langmead
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * aligner_driver.h
+ *
+ * REDUNDANT SEED HITS
+ *
+ * We say that two seed hits are redundant if they trigger identical
+ * seed-extend dynamic programming problems. Put another way, they both lie on
+ * the same diagonal of the overall read/reference dynamic programming matrix.
+ * Detecting redundant seed hits is simple when the seed hits are ungapped. We
+ * do this after offset resolution but before the offset is converted to genome
+ * coordinates (see uses of the seenDiags1_/seenDiags2_ fields for examples).
+ *
+ * REDUNDANT ALIGNMENTS
+ *
+ * In an unpaired context, we say that two alignments are redundant if they
+ * share any cells in the global DP table. Roughly speaking, this is like
+ * saying that two alignments are redundant if any read character aligns to the
+ * same reference character (same reference sequence, same strand, same offset)
+ * in both alignments.
+ *
+ * In a paired-end context, we say that two paired-end alignments are redundant
+ * if the mate #1s are redundant and the mate #2s are redundant.
+ *
+ * How do we enforce this? In the unpaired context, this is relatively simple:
+ * the cells from each alignment are checked against a set containing all cells
+ * from all previous alignments. Given a new alignment, for each cell in the
+ * new alignment we check whether it is in the set. If there is any overlap,
+ * the new alignment is rejected as redundant. Otherwise, the new alignment is
+ * accepted and its cells are added to the set.
+ *
+ * Enforcement in a paired context is a little trickier. Consider the
+ * following approaches:
+ *
+ * 1. Skip anchors that are redundant with any previous anchor or opposite
+ * alignment. This is sufficient to ensure no two concordant alignments
+ * found are redundant.
+ *
+ * 2. Same as scheme 1, but with a "transitive closure" scheme for finding all
+ * concordant pairs in the vicinity of an anchor. Consider a scenario with
+ * two concordant pairs, AB and AC, that share alignment A. If B is the
+ * anchor alignment, we will find AB but not AC. But under this scheme, once
+ * we find AB we then let B be a new anchor and immediately look for its
+ * opposites. Likewise, if we find any opposite, we make them anchors and
+ * continue searching. We don't stop searching until every opposite is used
+ * as an anchor.
+ *
+ * 3. Skip anchors that are redundant with any previous anchor alignment (but
+ * allow anchors that are redundant with previous opposite alignments).
+ * This isn't sufficient to avoid redundant concordant alignments. To avoid
+ * redundant concordants, we need an additional procedure that checks each
+ * new concordant alignment one-by-one against a list of previous concordant
+ * alignments to see if it is redundant.
+ *
+ * We take approach 1.
+ */
+
+#ifndef ALIGNER_DRIVER_H_
+#define ALIGNER_DRIVER_H_
+
+#include "aligner_seed2.h"
+#include "simple_func.h"
+#include "aln_sink.h"
+
+/**
+ * DescentRootSelector implementation that spaces roots at a regular interval
+ * along the read. The interval is obtained by evaluating a user-configurable
+ * function. A candidate root is dropped when fewer than 'landing' positions
+ * remain between it and the end of the read in the direction of the search.
+ */
+class AlignerDriverRootSelector : public DescentRootSelector {
+
+public:
+
+	/**
+	 * Store the parameters that select() consults later.
+	 */
+	AlignerDriverRootSelector(
+		double consExp,
+		const SimpleFunc& rootIval,
+		size_t landing) :
+		consExp_(consExp),
+		rootIval_(rootIval),
+		landing_(landing)
+	{ }
+
+	virtual ~AlignerDriverRootSelector() { }
+
+	/**
+	 * Choose roots for read 'q', appending one config to 'confs' for every
+	 * root appended to 'roots'.
+	 */
+	virtual void select(
+		const Read& q,   // read that we're selecting roots for
+		const Read* qo,  // opposite mate, if applicable
+		bool nofw,       // don't add roots for fw read
+		bool norc,       // don't add roots for rc read
+		EList& confs,    // put DescentConfigs here
+		EList& roots);   // put DescentRoot here
+
+protected:
+
+	double consExp_;      // handed to each config's cons.init() along with landing_
+	SimpleFunc rootIval_; // evaluated to obtain the spacing between roots
+	size_t landing_;      // minimum room required beyond a root
+};
+
+/**
+ * Status codes returned by extendSeeds and extendSeedsPaired.
+ */
+enum {
+	// Every candidate was examined
+	ALDRIVER_EXHAUSTED_CANDIDATES = 1,
+	// The policy is satisfied; there is no need to look any further
+	ALDRIVER_POLICY_FULFILLED = 2,
+	// Stopped after hitting a limit on the amount of work allowed for one set
+	// of seed ranges, e.g. the limit on the number of consecutive unproductive
+	// DP extensions
+	ALDRIVER_EXCEEDED_LIMIT = 3
+};
+
+/**
+ * This class is the glue between a DescentDriver and the dynamic programming
+ * implementations in Bowtie 2. The DescentDriver is used to find some very
+ * high-scoring alignments, but is additionally used to rank partial alignments
+ * so that they can be extended using dynamic programming.
+ */
+template
+class AlignerDriver {
+
+public:
+
+	/**
+	 * Construct a driver. consExp, rootIval and landing configure the root
+	 * selector; veryVerbose is forwarded to both descent drivers; totsz and
+	 * totfmops are functions of read length that initRead() evaluates to
+	 * obtain the stopping conditions for the search.
+	 */
+	AlignerDriver(
+		double consExp,
+		const SimpleFunc& rootIval,
+		size_t landing,
+		bool veryVerbose,
+		const SimpleFunc& totsz,
+		const SimpleFunc& totfmops) :
+		sel_(consExp, rootIval, landing),
+		alsel_(),
+		dr1_(veryVerbose),
+		dr2_(veryVerbose),
+		paired_(false), // defined value even before the first initRead()
+		totsz_(totsz),
+		totfmops_(totfmops)
+	{ }
+
+	/**
+	 * Initialize driver with respect to a new read or pair. When q2 is NULL
+	 * the read is treated as unpaired and the mate-2 driver is reset. The
+	 * stopping conditions are derived from the length of q1 only.
+	 */
+	void initRead(
+		const Read& q1,
+		bool nofw,
+		bool norc,
+		TAlScore minsc,
+		TAlScore maxpen,
+		const Read* q2)
+	{
+		dr1_.initRead(q1, nofw, norc, minsc, maxpen, q2, &sel_);
+		red1_.init(q1.length());
+		paired_ = false;
+		if(q2 != NULL) {
+			// Mate 2 gets its own driver, with q1 as its opposite mate
+			dr2_.initRead(*q2, nofw, norc, minsc, maxpen, &q1, &sel_);
+			red2_.init(q2->length());
+			paired_ = true;
+		} else {
+			dr2_.reset();
+		}
+		size_t totsz = totsz_.f(q1.length());
+		size_t totfmops = totfmops_.f(q1.length());
+		stop_.init(
+			totsz,
+			0,
+			true,
+			totfmops);
+	}
+
+	/**
+	 * Start the driver. The driver will begin by conducting a best-first,
+	 * index-assisted search through the space of possible full and partial
+	 * alignments. This search may be followed up with a dynamic programming
+	 * extension step, taking a prioritized set of partial SA ranges found
+	 * during the search and extending each with DP. The process might also be
+	 * iterated, with the search being occasionally halted so that DPs can be
+	 * tried, then restarted, etc.
+	 */
+	int go(
+		const Scoring& sc,
+		const GFM& gfmFw,
+		const GFM& gfmBw,
+		const BitPairReference& ref,
+		DescentMetrics& met,
+		WalkMetrics& wlm,
+		PerReadMetrics& prm,
+		RandomSource& rnd,
+		AlnSinkWrap& sink);
+
+	/**
+	 * Reset state of all DescentDrivers and of both redundancy databases.
+	 */
+	void reset() {
+		dr1_.reset();
+		dr2_.reset();
+		red1_.reset();
+		red2_.reset();
+	}
+
+protected:
+
+	AlignerDriverRootSelector sel_; // selects where roots should go
+	DescentAlignmentSelector alsel_; // one selector can deal with >1 drivers
+	DescentDriver dr1_; // driver for mate 1/unpaired reads
+	DescentDriver dr2_; // driver for mate 2 of paired-end reads
+	DescentStoppingConditions stop_; // when to pause index-assisted BFS
+	bool paired_; // current read is paired?
+
+	SimpleFunc totsz_; // memory limit on best-first search data
+	SimpleFunc totfmops_; // max # FM ops for best-first search
+
+	// For detecting redundant alignments
+	RedundantAlns red1_; // database of cells used for mate 1 alignments
+	RedundantAlns red2_; // database of cells used for mate 2 alignments
+
+	// For AlnRes::matchesRef
+	ASSERT_ONLY(SStringExpandable raw_refbuf_);
+	ASSERT_ONLY(SStringExpandable raw_destU32_);
+	ASSERT_ONLY(EList raw_matches_);
+	ASSERT_ONLY(BTDnaString tmp_rf_);
+	ASSERT_ONLY(BTDnaString tmp_rdseq_);
+	ASSERT_ONLY(BTString tmp_qseq_);
+	ASSERT_ONLY(EList tmp_reflens_);
+	ASSERT_ONLY(EList tmp_refoffs_);
+};
+
+#endif /* defined(ALIGNER_DRIVER_H_) */
diff --git a/aligner_metrics.h b/aligner_metrics.h
new file mode 100644
index 0000000..c0b0182
--- /dev/null
+++ b/aligner_metrics.h
@@ -0,0 +1,352 @@
+/*
+ * Copyright 2011, Ben Langmead
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef ALIGNER_METRICS_H_
+#define ALIGNER_METRICS_H_
+
+#include
+#include
+#include "alphabet.h"
+#include "timer.h"
+#include "sstring.h"
+
+using namespace std;
+
+/**
+ * Accumulates count, total, mean and sample variance of a stream of values in
+ * a single pass, without storing the values. Borrowed from
+ * http://www.johndcook.com/standard_deviation.html, which in turn is borrowed
+ * from Knuth.
+ */
+class RunningStat {
+public:
+	RunningStat() :
+		m_n(0),
+		m_tot(0.0),
+		m_oldM(0.0),
+		m_newM(0.0),
+		m_oldS(0.0),
+		m_newS(0.0)
+	{ }
+
+	/**
+	 * Forget all samples seen so far.
+	 */
+	void clear() {
+		m_n = 0;
+		m_tot = 0.0;
+		// Not strictly needed (the accessors guard on m_n) but keeps the
+		// object free of stale values
+		m_oldM = m_newM = 0.0;
+		m_oldS = m_newS = 0.0;
+	}
+
+	/**
+	 * Add one sample.
+	 */
+	void push(float x) {
+		m_n++;
+		m_tot += x;
+		// See Knuth TAOCP vol 2, 3rd edition, page 232
+		if (m_n == 1) {
+			m_oldM = m_newM = x;
+			m_oldS = m_newS = 0.0;
+		} else {
+			m_newM = m_oldM + (x - m_oldM)/m_n;
+			m_newS = m_oldS + (x - m_oldM)*(x - m_newM);
+			// set up for next iteration
+			m_oldM = m_newM;
+			m_oldS = m_newS;
+		}
+	}
+
+	/** Number of samples pushed since construction or the last clear(). */
+	int num() const {
+		return m_n;
+	}
+
+	/** Sum of all samples. */
+	double tot() const {
+		return m_tot;
+	}
+
+	/** Mean of the samples, or 0 when there are none. */
+	double mean() const {
+		return (m_n > 0) ? m_newM : 0.0;
+	}
+
+	/** Sample variance (n - 1 denominator), or 0 with fewer than 2 samples. */
+	double variance() const {
+		return ( (m_n > 1) ? m_newS/(m_n - 1) : 0.0 );
+	}
+
+	/** Square root of variance(). */
+	double stddev() const {
+		return sqrt(variance());
+	}
+
+private:
+	int m_n;       // number of samples
+	double m_tot;  // running sum
+	// Running mean (M) and sum of squared deviations (S): previous and current
+	double m_oldM, m_newM, m_oldS, m_newS;
+};
+
+/**
+ * Encapsulates a set of metrics that we would like an aligner to keep
+ * track of, so that we can possibly use it to diagnose performance
+ * issues.
+ *
+ * Usage: call nextRead() at the start of each read, bump the public
+ * curBacktracks_/curBwtOps_ counters while working on it, and call
+ * printSummary() to write the accumulated statistics to cout.
+ */
+class AlignerMetrics {
+
+public:
+
+	AlignerMetrics() :
+		curBacktracks_(0),
+		curBwtOps_(0),
+		first_(true),
+		curIsLowEntropy_(false),
+		curIsHomoPoly_(false),
+		curHadRanges_(false),
+		curNumNs_(0),
+		reads_(0),
+		homoReads_(0),
+		lowEntReads_(0),
+		hiEntReads_(0),
+		alignedReads_(0),
+		unalignedReads_(0),
+		threeOrMoreNReads_(0),
+		lessThanThreeNRreads_(0),
+		bwtOpsPerRead_(),
+		backtracksPerRead_(),
+		bwtOpsPerHomoRead_(),
+		backtracksPerHomoRead_(),
+		bwtOpsPerLoEntRead_(),
+		backtracksPerLoEntRead_(),
+		bwtOpsPerHiEntRead_(),
+		backtracksPerHiEntRead_(),
+		bwtOpsPerAlignedRead_(),
+		backtracksPerAlignedRead_(),
+		bwtOpsPerUnalignedRead_(),
+		backtracksPerUnalignedRead_(),
+		bwtOpsPer0nRead_(),
+		backtracksPer0nRead_(),
+		bwtOpsPer1nRead_(),
+		backtracksPer1nRead_(),
+		bwtOpsPer2nRead_(),
+		backtracksPer2nRead_(),
+		bwtOpsPer3orMoreNRead_(),
+		backtracksPer3orMoreNRead_(),
+		timer_(cout, "", false)
+	{ }
+
+	/**
+	 * Commit the read in progress (if any) and print all accumulated
+	 * statistics to cout.
+	 */
+	void printSummary() {
+		if(!first_) {
+			finishRead();
+			// The read in progress is now committed. Without this, a
+			// second printSummary() or a following nextRead() would
+			// commit the same read again and count it twice.
+			first_ = true;
+		}
+		cout << "AlignerMetrics:" << endl;
+		cout << " # Reads: " << reads_ << endl;
+		cout << " % homo-polymeric: " << pctOfReads(homoReads_) << endl;
+		cout << " % low-entropy: " << pctOfReads(lowEntReads_) << endl;
+		cout << " % unaligned: " << pctOfReads(unalignedReads_) << endl;
+		cout << " % with 3 or more Ns: " << pctOfReads(threeOrMoreNReads_) << endl;
+		cout << endl;
+		cout << " Total BWT ops: avg: " << bwtOpsPerRead_.mean() << ", stddev: " << bwtOpsPerRead_.stddev() << endl;
+		cout << " Total Backtracks: avg: " << backtracksPerRead_.mean() << ", stddev: " << backtracksPerRead_.stddev() << endl;
+		time_t elapsed = timer_.elapsed();
+		// Report a rate of 0 rather than inf/nan when no time has elapsed
+		double bwtOpsPerSec = (elapsed > 0) ? (bwtOpsPerRead_.tot()/elapsed) : 0.0;
+		double backtracksPerSec = (elapsed > 0) ? (backtracksPerRead_.tot()/elapsed) : 0.0;
+		cout << " BWT ops per second: " << bwtOpsPerSec << endl;
+		cout << " Backtracks per second: " << backtracksPerSec << endl;
+		cout << endl;
+		printStatPair(" Homo-poly:", bwtOpsPerHomoRead_, backtracksPerHomoRead_);
+		printStatPair(" Low-entropy:", bwtOpsPerLoEntRead_, backtracksPerLoEntRead_);
+		printStatPair(" High-entropy:", bwtOpsPerHiEntRead_, backtracksPerHiEntRead_);
+		cout << endl;
+		printStatPair(" Unaligned:", bwtOpsPerUnalignedRead_, backtracksPerUnalignedRead_);
+		printStatPair(" Aligned:", bwtOpsPerAlignedRead_, backtracksPerAlignedRead_);
+		cout << endl;
+		printStatPair(" 0 Ns:", bwtOpsPer0nRead_, backtracksPer0nRead_);
+		printStatPair(" 1 N:", bwtOpsPer1nRead_, backtracksPer1nRead_);
+		printStatPair(" 2 Ns:", bwtOpsPer2nRead_, backtracksPer2nRead_);
+		printStatPair(" >2 Ns:", bwtOpsPer3orMoreNRead_, backtracksPer3orMoreNRead_);
+		cout << endl;
+	}
+
+	/**
+	 * Commit the previous read (if any) and start tracking a new one: the
+	 * per-read counters are zeroed and the Ns in 'read' are counted.
+	 *
+	 * The entropy calculation is currently disabled, so the entropy is taken
+	 * to be 0 and every read is flagged as both low-entropy and
+	 * homo-polymeric.
+	 */
+	void nextRead(const BTDnaString& read) {
+		if(!first_) {
+			finishRead();
+		}
+		first_ = false;
+		//float ent = entropyDna5(read);
+		float ent = 0.0f;
+		curIsLowEntropy_ = (ent < 0.75f);
+		curIsHomoPoly_ = (ent < 0.001f);
+		curHadRanges_ = false;
+		curBwtOps_ = 0;
+		curBacktracks_ = 0;
+		// Count Ns (a character value of 4 is counted as an N)
+		curNumNs_ = 0;
+		const size_t len = read.length();
+		for(size_t i = 0; i < len; i++) {
+			if((int)read[i] == 4) curNumNs_++;
+		}
+	}
+
+	/**
+	 * Record that the current read has had at least one range reported.
+	 */
+	void setReadHasRange() {
+		curHadRanges_ = true;
+	}
+
+	/**
+	 * Commit the per-read counters for the current read to the overall
+	 * totals and to the totals of every category the read falls in.
+	 */
+	void finishRead() {
+		const float ops = (float)curBwtOps_;
+		const float bts = (float)curBacktracks_;
+		reads_++;
+		bwtOpsPerRead_.push(ops);
+		backtracksPerRead_.push(bts);
+		// Drill down by entropy; homo-polymer takes precedence over
+		// low-entropy
+		if(curIsHomoPoly_) {
+			homoReads_++;
+			bwtOpsPerHomoRead_.push(ops);
+			backtracksPerHomoRead_.push(bts);
+		} else if(curIsLowEntropy_) {
+			lowEntReads_++;
+			bwtOpsPerLoEntRead_.push(ops);
+			backtracksPerLoEntRead_.push(bts);
+		} else {
+			hiEntReads_++;
+			bwtOpsPerHiEntRead_.push(ops);
+			backtracksPerHiEntRead_.push(bts);
+		}
+		// Drill down by whether it aligned
+		if(curHadRanges_) {
+			alignedReads_++;
+			bwtOpsPerAlignedRead_.push(ops);
+			backtracksPerAlignedRead_.push(bts);
+		} else {
+			unalignedReads_++;
+			bwtOpsPerUnalignedRead_.push(ops);
+			backtracksPerUnalignedRead_.push(bts);
+		}
+		// Drill down by number of Ns
+		switch(curNumNs_) {
+			case 0:
+				lessThanThreeNRreads_++;
+				bwtOpsPer0nRead_.push(ops);
+				backtracksPer0nRead_.push(bts);
+				break;
+			case 1:
+				lessThanThreeNRreads_++;
+				bwtOpsPer1nRead_.push(ops);
+				backtracksPer1nRead_.push(bts);
+				break;
+			case 2:
+				lessThanThreeNRreads_++;
+				bwtOpsPer2nRead_.push(ops);
+				backtracksPer2nRead_.push(bts);
+				break;
+			default:
+				threeOrMoreNReads_++;
+				bwtOpsPer3orMoreNRead_.push(ops);
+				backtracksPer3orMoreNRead_.push(bts);
+				break;
+		}
+	}
+
+	// Running-total of the number of backtracks and BWT ops for the
+	// current read
+	uint32_t curBacktracks_;
+	uint32_t curBwtOps_;
+
+protected:
+
+	/**
+	 * Return 'count' as a percentage of all committed reads, or 0 if no read
+	 * has been committed yet.
+	 */
+	float pctOfReads(uint32_t count) const {
+		float frac = (reads_ > 0) ? ((float)count/(float)(reads_)) : (0.0f);
+		return frac * 100.0f;
+	}
+
+	/**
+	 * Print a heading followed by the mean/stddev of a BWT-op distribution
+	 * and of the matching backtrack distribution.
+	 */
+	static void printStatPair(
+		const char *heading,
+		const RunningStat& ops,
+		const RunningStat& bts)
+	{
+		cout << heading << endl;
+		cout << " BWT ops: avg: " << ops.mean() << ", stddev: " << ops.stddev() << endl;
+		cout << " Backtracks: avg: " << bts.mean() << ", stddev: " << bts.stddev() << endl;
+	}
+
+	// true iff no read is currently in progress (nothing to commit)
+	bool first_;
+
+	// true iff the current read is low entropy
+	bool curIsLowEntropy_;
+	// true if current read is all 1 char (or very close)
+	bool curIsHomoPoly_;
+	// true iff the current read has had one or more ranges reported
+	bool curHadRanges_;
+	// number of Ns in current read
+	int curNumNs_;
+
+	// # reads
+	uint32_t reads_;
+	// # homo-poly reads
+	uint32_t homoReads_;
+	// # low-entropy reads
+	uint32_t lowEntReads_;
+	// # high-entropy reads
+	uint32_t hiEntReads_;
+	// # reads with alignments
+	uint32_t alignedReads_;
+	// # reads without alignments
+	uint32_t unalignedReads_;
+	// # reads with 3 or more Ns
+	uint32_t threeOrMoreNReads_;
+	// # reads with < 3 Ns
+	uint32_t lessThanThreeNRreads_;
+
+	// Distribution of BWT operations per read
+	RunningStat bwtOpsPerRead_;
+	RunningStat backtracksPerRead_;
+
+	// Distribution of BWT operations per homo-poly read
+	RunningStat bwtOpsPerHomoRead_;
+	RunningStat backtracksPerHomoRead_;
+
+	// Distribution of BWT operations per low-entropy read
+	RunningStat bwtOpsPerLoEntRead_;
+	RunningStat backtracksPerLoEntRead_;
+
+	// Distribution of BWT operations per high-entropy read
+	RunningStat bwtOpsPerHiEntRead_;
+	RunningStat backtracksPerHiEntRead_;
+
+	// Distribution of BWT operations per read that "aligned" (for
+	// which a range was arrived at - range may not have necessarily
+	// lead to an alignment)
+	RunningStat bwtOpsPerAlignedRead_;
+	RunningStat backtracksPerAlignedRead_;
+
+	// Distribution of BWT operations per read that didn't align
+	RunningStat bwtOpsPerUnalignedRead_;
+	RunningStat backtracksPerUnalignedRead_;
+
+	// Distribution of BWT operations/backtracks per read with no Ns
+	RunningStat bwtOpsPer0nRead_;
+	RunningStat backtracksPer0nRead_;
+
+	// Distribution of BWT operations/backtracks per read with one N
+	RunningStat bwtOpsPer1nRead_;
+	RunningStat backtracksPer1nRead_;
+
+	// Distribution of BWT operations/backtracks per read with two Ns
+	RunningStat bwtOpsPer2nRead_;
+	RunningStat backtracksPer2nRead_;
+
+	// Distribution of BWT operations/backtracks per read with three or
+	// more Ns
+	RunningStat bwtOpsPer3orMoreNRead_;
+	RunningStat backtracksPer3orMoreNRead_;
+
+	// Supplies the elapsed time used for the per-second rates
+	Timer timer_;
+};
+
+#endif /* ALIGNER_METRICS_H_ */
diff --git a/aligner_report.h b/aligner_report.h
new file mode 100644
index 0000000..c5cd8db
--- /dev/null
+++ b/aligner_report.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright 2011, Ben Langmead
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef ALIGNER_REPORT_H_
+#define ALIGNER_REPORT_H_
+
+#include "aligner_cache.h"
+
+/**
+ * Placeholder reporter: it accepts a cache and a query value but does nothing
+ * with either.
+ */
+class Reporter {
+public:
+	/**
+	 * Ignore both arguments and return true, which tells the caller not to
+	 * retry.
+	 */
+	bool report(const AlignmentCacheIface& cache, const QVal& qv) {
+		(void)cache;
+		(void)qv;
+		const bool dontRetry = true;
+		return dontRetry;
+	}
+};
+
+#endif /*ALIGNER_REPORT_H_*/
diff --git a/aligner_result.cpp b/aligner_result.cpp
new file mode 100644
index 0000000..9043a11
--- /dev/null
+++ b/aligner_result.cpp
@@ -0,0 +1,2162 @@
+/*
+ * Copyright 2011, Ben Langmead
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include
+#include "reference.h"
+#include "aligner_result.h"
+#include "read.h"
+#include "edit.h"
+#include "sstring.h"
+#include "ds.h"
+#include "util.h"
+#include "alphabet.h"
+
+using namespace std;
+
+/**
+ * Clear all contents. The edit lists, if they have been allocated, are
+ * emptied but stay allocated; every other field returns to its "no
+ * alignment" value.
+ */
+void AlnRes::reset() {
+	if(ned_ != NULL) {
+		assert(aed_ != NULL);
+		ned_->clear();
+		aed_->clear();
+	}
+	score_.invalidate();
+	refcoord_.reset();
+	refival_.reset();
+	shapeSet_ = false;
+	// Read and reference dimensions
+	rdlen_ = 0;
+	rdid_ = 0;
+	reflen_ = 0;
+	rdrows_ = 0;
+	rdextent_ = 0;
+	rdexrows_ = 0;
+	rfextent_ = 0;
+	refns_ = 0;
+	type_ = ALN_RES_TYPE_UNPAIRED;
+	fraglen_ = -1;
+	fraglenSet_ = false;
+	// Trimming
+	trimSoft_ = false;
+	trim5p_ = 0;
+	trim3p_ = 0;
+	pretrimSoft_ = true;
+	pretrim5p_ = 0;
+	pretrim3p_ = 0;
+	// Alignment parameters
+	seedmms_ = 0; // number of mismatches allowed in seed
+	seedlen_ = 0; // length of seed
+	seedival_ = 0; // interval between seeds
+	minsc_ = 0; // minimum score
+	nuc5p_ = 0;
+	nuc3p_ = 0;
+	num_spliced_ = 0;
+	// init() sets this flag, so clear it here too; otherwise a stale value
+	// survives a reset
+	repeat_ = false;
+	assert(!refcoord_.inited());
+	assert(!refival_.inited());
+}
+
+/**
+ * Record where on the reference the alignment starts and how the read was
+ * trimmed, then derive the read extent, the reference extent and the
+ * reference interval from that information.
+ */
+void AlnRes::setShape(
+	TRefId id,        // id of reference aligned to
+	TRefOff off,      // offset of first aligned char into ref seq
+	TRefOff reflen,   // length of reference sequence aligned to
+	bool fw,          // aligned to Watson strand?
+	size_t rdlen,     // length of read after hard trimming, before soft
+	TReadId rdid,     // read ID
+	bool pretrimSoft, // whether trimming prior to alignment was soft
+	size_t pretrim5p, // # positions trimmed from 5' end before alignment
+	size_t pretrim3p, // # positions trimmed from 3' end before alignment
+	bool trimSoft,    // whether local-alignment trimming was soft
+	size_t trim5p,    // # positions trimmed from 5' end during alignment
+	size_t trim3p)    // # positions trimmed from 3' end during alignment
+{
+	// Copy the arguments into the corresponding fields
+	rdlen_ = rdlen;
+	rdid_ = rdid;
+	rdrows_ = rdlen;
+	refcoord_.init(id, off, fw);
+	pretrimSoft_ = pretrimSoft;
+	pretrim5p_ = pretrim5p;
+	pretrim3p_ = pretrim3p;
+	trimSoft_ = trimSoft;
+	trim5p_ = trim5p;
+	trim3p_ = trim3p;
+
+	// Propagate trimming to the edits. The pos fields of the edits are
+	// assumed to be relative to the rows of the dynamic programming table,
+	// i.e. trimming has not been applied to them yet.
+	//
+	// TODO: The division of labor between the aligner and the AlnRes is not
+	// clean. Perhaps the trimming and *all* of its side-effects should be
+	// handled by the aligner.
+	//
+	// daehwan - check this out - the strand-dependent choice
+	// (fw ? trim5p : trim3p) doesn't seem to work with SAWHI, so the 5' trim
+	// is used regardless of strand.
+	const size_t shift = trim5p;
+	if(shift != 0) {
+		const size_t nedits = ned_->size();
+		size_t k = 0;
+		while(k < nedits) {
+			// Edits are w/r/t the 5' end, so slide each one down by the
+			// 5' trim
+			assert_geq((*ned_)[k].pos, shift);
+			(*ned_)[k].pos -= (uint32_t)shift;
+			k++;
+		}
+	}
+
+	// Read length left after soft pre-trimming (if any) and after whatever
+	// trimming, soft or hard, took place during alignment
+	rdextent_ = rdlen
+		- (pretrimSoft_ ? (pretrim5p + pretrim3p) : 0)
+		- (trim5p + trim3p);
+	assert_gt(rdextent_, 0);
+	rdexrows_ = rdextent_;
+	calcRefExtent();
+	refival_.init(id, off, fw, rfextent_);
+	reflen_ = reflen;
+	shapeSet_ = true;
+}
+
+/**
+ * Initialize new AlnRes. The edit lists are taken from 'raw_edits' the first
+ * time through and reused afterwards; the requested sub-ranges of 'ned' and
+ * 'aed' are copied into them. The shape is then established via setShape()
+ * and the number of splice edits is counted.
+ */
+void AlnRes::init(
+	size_t rdlen, // # chars after hard trimming
+	TReadId rdid, // read ID
+	AlnScore score, // alignment score
+	const EList* ned, // nucleotide edits
+	size_t ned_i, // first position to copy
+	size_t ned_n, // # positions to copy
+	const EList* aed, // ambiguous base resolutions
+	size_t aed_i, // first position to copy
+	size_t aed_n, // # positions to copy
+	Coord refcoord, // leftmost ref pos of 1st al char
+	TRefOff reflen, // length of ref aligned to
+	LinkedEList >* raw_edits,
+	int seedmms, // # seed mms allowed
+	int seedlen, // seed length
+	int seedival, // space between seeds
+	int64_t minsc, // minimum score for valid aln
+	int nuc5p,
+	int nuc3p,
+	bool pretrimSoft,
+	size_t pretrim5p, // trimming prior to alignment
+	size_t pretrim3p, // trimming prior to alignment
+	bool trimSoft,
+	size_t trim5p, // trimming from alignment
+	size_t trim3p, // trimming from alignment
+	bool repeat) // repeat
+{
+	assert(raw_edits != NULL);
+	assert(raw_edits_ == NULL || raw_edits_ == raw_edits);
+	raw_edits_ = raw_edits;
+	if(ned_ == NULL) {
+		if(raw_edits_ != NULL) {
+			// First use: obtain one node for each edit list
+			assert(aed_ == NULL);
+			assert(ned_node_ == NULL && aed_node_ == NULL);
+			ned_node_ = raw_edits_->new_node();
+			aed_node_ = raw_edits_->new_node();
+			assert(ned_node_ != NULL && aed_node_ != NULL);
+			ned_ = &(ned_node_->payload);
+			aed_ = &(aed_node_->payload);
+		}
+	} else {
+		assert(aed_ != NULL);
+	}
+
+	rdlen_ = rdlen;
+	rdid_ = rdid;
+	rdrows_ = rdlen;
+	score_ = score;
+
+	// Start from empty edit lists, then copy in the requested ranges
+	ned_->clear();
+	aed_->clear();
+	if(ned != NULL) {
+		const size_t ned_end = ned_i + ned_n;
+		for(size_t k = ned_i; k < ned_end; k++) {
+			ned_->push_back((*ned)[k]);
+		}
+	}
+	if(aed != NULL) {
+		const size_t aed_end = aed_i + aed_n;
+		for(size_t k = aed_i; k < aed_end; k++) {
+			aed_->push_back((*aed)[k]);
+		}
+	}
+
+	refcoord_ = refcoord;
+	reflen_ = reflen;
+	seedmms_ = seedmms;
+	seedlen_ = seedlen;
+	seedival_ = seedival;
+	minsc_ = minsc;
+	nuc5p_ = nuc5p;
+	nuc3p_ = nuc3p;
+	pretrimSoft_ = pretrimSoft;
+	pretrim5p_ = pretrim5p;
+	pretrim3p_ = pretrim3p;
+	trimSoft_ = trimSoft;
+	trim5p_ = trim5p;
+	trim3p_ = trim3p;
+	repeat_ = repeat;
+
+	// Provisional extent: read characters after hard trimming, minus any
+	// soft trimming. setShape() below assigns rdextent_ again.
+	rdextent_ = rdlen;
+	if(pretrimSoft) {
+		rdextent_ -= (pretrim5p + pretrim3p);
+	}
+	if(trimSoft) {
+		rdextent_ -= (trim5p + trim3p);
+	}
+	rdexrows_ = rdextent_;
+	calcRefExtent();
+	setShape(
+		refcoord.ref(), // id of reference aligned to
+		refcoord.off(), // offset of first aligned char into ref seq
+		reflen,         // length of reference sequence aligned to
+		refcoord.fw(),  // aligned to Watson strand?
+		rdlen,          // length of read after hard trimming, before soft
+		rdid,           // read ID
+		pretrimSoft,    // whether trimming prior to alignment was soft
+		pretrim5p,      // # positions trimmed from 5' end before alignment
+		pretrim3p,      // # positions trimmed from 3' end before alignment
+		trimSoft,       // whether local-alignment trimming was soft
+		trim5p,         // # positions trimmed from 5' end during alignment
+		trim3p);        // # positions trimmed from 3' end during alignment
+	shapeSet_ = true;
+
+	// Count the splice edits among the nucleotide edits
+	num_spliced_ = 0;
+	const size_t nedits = ned_->size();
+	for(size_t k = 0; k < nedits; k++) {
+		num_spliced_ += ((*ned_)[k].type == EDIT_TYPE_SPL) ? 1 : 0;
+	}
+}
+
+/**
+ * Soft-clip 'rd_amt' read characters and 'rf_amt' reference characters from
+ * the Watson-upstream end of the alignment. For a forward alignment that is
+ * the read's 5' end; otherwise it is the 3' end. The reference coordinate
+ * and interval are moved by 'rf_amt' to match.
+ */
+void AlnRes::clipLeft(size_t rd_amt, size_t rf_amt) {
+	assert_geq(rd_amt, 0);
+	assert_geq(rf_amt, 0);
+	assert_leq(rd_amt, rdexrows_);
+	assert_leq(rf_amt, rfextent_);
+	assert(trimSoft_);
+	if(!fw()) {
+		// Reverse strand: the Watson-upstream end is the read's 3' end
+		trim3p_ += rd_amt;
+		Edit::clipHi(*ned_, rdexrows_, rd_amt);
+		Edit::clipHi(*aed_, rdexrows_, rd_amt);
+	} else {
+		// Forward strand: the Watson-upstream end is the read's 5' end
+		trim5p_ += rd_amt;
+		Edit::clipLo(*ned_, rdexrows_, rd_amt);
+		Edit::clipLo(*aed_, rdexrows_, rd_amt);
+	}
+	// Shrink the read and reference extents
+	rdexrows_ -= rd_amt;
+	rdextent_ -= rd_amt;
+	rfextent_ -= rf_amt;
+	// The leftmost aligned reference position moves with the clip
+	refcoord_.adjustOff(rf_amt);
+	refival_.adjustOff(rf_amt);
+	// Adjust refns_?
+}
+
+/**
+ * Soft-clip 'rd_amt' read characters and 'rf_amt' reference characters from
+ * the Watson-downstream end of the alignment. For a forward alignment that
+ * is the read's 3' end; otherwise it is the 5' end. The reference coordinate
+ * is left where it is.
+ */
+void AlnRes::clipRight(size_t rd_amt, size_t rf_amt) {
+	assert_geq(rd_amt, 0);
+	assert_geq(rf_amt, 0);
+	assert_leq(rd_amt, rdexrows_);
+	assert_leq(rf_amt, rfextent_);
+	assert(trimSoft_);
+	if(!fw()) {
+		// Reverse strand: the Watson-downstream end is the read's 5' end
+		trim5p_ += rd_amt;
+		Edit::clipLo(*ned_, rdexrows_, rd_amt);
+		Edit::clipLo(*aed_, rdexrows_, rd_amt);
+	} else {
+		// Forward strand: the Watson-downstream end is the read's 3' end
+		trim3p_ += rd_amt;
+		Edit::clipHi(*ned_, rdexrows_, rd_amt);
+		Edit::clipHi(*aed_, rdexrows_, rd_amt);
+	}
+	// Shrink the read and reference extents
+	rdexrows_ -= rd_amt;
+	rdextent_ -= rd_amt;
+	rfextent_ -= rf_amt;
+	// Adjust refns_?
+}
+
+/**
+ * Clip away portions of the alignment that fall outside the reference
+ * window delimited by 'refi' and 'reff'. The left overhang is handled first,
+ * then the right overhang, which is measured from the (possibly updated)
+ * reference offset. Assuming for now that there isn't any other clipping.
+ *
+ * Clipping is expressed in terms of read positions, so the amount clipped
+ * from the read starts out equal to the reference overhang and grows by one
+ * for every reference gap encountered within the overhanging portion.
+ *
+ * NOTE(review): the 'soft' argument is not consulted here, and
+ * clipLeft()/clipRight() assert that trimming is soft -- confirm whether hard
+ * clipping was ever meant to be supported.
+ */
+void AlnRes::clipOutside(bool soft, TRefOff refi, TRefOff reff) {
+	// Overhang on LHS
+	const TRefOff lhs = refcoord_.off();
+	if(lhs < refi) {
+		const size_t rfClip = (size_t)(refi - lhs);
+		size_t rdClip = rfClip;
+		const size_t nedits = ned_->size();
+		// Edit positions are w/r/t the 5' end; flip them for a reverse-strand
+		// alignment so they run from the left end
+		const bool flip = !fw();
+		if(flip) {
+			Edit::invertPoss(*ned_, rdexrows_, false);
+		}
+		size_t k = 0;
+		while(k < nedits) {
+			assert_lt((*ned_)[k].pos, rdexrows_);
+			if((*ned_)[k].pos > rdClip) break;
+			if((*ned_)[k].isRefGap()) rdClip++;
+			k++;
+		}
+		if(flip) {
+			// Restore the original orientation of the edits
+			Edit::invertPoss(*ned_, rdexrows_, false);
+		}
+		clipLeft(rdClip, rfClip);
+	}
+	// Overhang on RHS
+	const TRefOff rhs = refcoord_.off() + refNucExtent();
+	if(rhs > reff) {
+		const size_t rfClip = (size_t)(rhs - reff);
+		size_t rdClip = rfClip;
+		const size_t nedits = ned_->size();
+		// Here forward-strand edits are the ones flipped, so positions run
+		// from the right end
+		const bool flip = fw();
+		if(flip) {
+			Edit::invertPoss(*ned_, rdexrows_, false);
+		}
+		size_t k = 0;
+		while(k < nedits) {
+			assert_lt((*ned_)[k].pos, rdexrows_);
+			if((*ned_)[k].pos > rdClip) break;
+			if((*ned_)[k].isRefGap()) rdClip++;
+			k++;
+		}
+		if(flip) {
+			// Restore the original orientation of the edits
+			Edit::invertPoss(*ned_, rdexrows_, false);
+		}
+		clipRight(rdClip, rfClip);
+	}
+}
+
+/**
+ * Return true iff this AlnRes and the given AlnRes overlap. Two alignments
+ * overlap if they share a cell in the overall dynamic programming table:
+ * i.e. if there exists a read position s.t. that position in both reads
+ * matches up with the same reference character. E.g., the following
+ * alignments (drawn schematically as paths through a dynamic programming
+ * table) are redundant:
+ *
+ * a b a b
+ * \ \ \ \
+ * \ \ \ \
+ * \ \ \ \
+ * ---\ \ \
+ * \ ---\---
+ * ---\ \ \
+ * \ \ \ \
+ * \ \ \ \
+ * \ \ \ \
+ * a b a b
+ *
+ * We iterate over each read position that hasn't been hard-trimmed, but only
+ * overlaps at positions that have also not been soft-trimmed are considered.
+ * 'res' is non-const: reverse-strand edits are inverted for the scan and inverted back afterwards.
+ */
+bool AlnRes::overlap(AlnRes& res) {
+ if(fw() != res.fw() || refid() != res.refid()) {
+ // Must be same reference and same strand in order to overlap
+ return false;
+ }
+ TRefOff my_left = refoff(); // my leftmost aligned char
+ TRefOff other_left = res.refoff(); // other leftmost aligned char
+ TRefOff my_right = my_left + refExtent();
+ TRefOff other_right = other_left + res.refExtent();
+ if(my_right < other_left || other_right < my_left) {
+ // The rectangular hulls of the two alignments don't overlap, so
+ // they can't overlap at any cell
+ return false;
+ }
+ // Reference and strand are the same and hulls overlap. Now go read
+ // position by read position testing if any align identically with the
+ // reference.
+
+ // Edits are ordered and indexed from 5' to 3' to start with. We
+ // reorder them to go from left to right along the Watson strand.
+ if(!fw()) {
+ invertEdits();
+ }
+ if(!res.fw()) {
+ res.invertEdits();
+ }
+ size_t nedidx = 0, onedidx = 0; // cursors into this->ned_ and res.ned_ respectively
+ bool olap = false;
+ // For each row, going left to right along Watson reference strand...
+ for(size_t i = 0; i < rdexrows_; i++) { // NOTE(review): bound is this alignment's row count; res's is not consulted
+ size_t diff = 1; // amount to shift to right for next round
+ size_t odiff = 1; // amount to shift to right for next round
+ // Unless there are insertions before the next position, we say
+ // that there is one cell in this row involved in the alignment
+ my_right = my_left + 1; // from here on *_right is the exclusive right edge of this row's cells
+ other_right = other_left + 1;
+ while(nedidx < ned_->size() && (*ned_)[nedidx].pos == i) { // consume my edits at row i
+ if((*ned_)[nedidx].isRefGap()) {
+ // Next my_left will be in same column as this round
+ diff = 0;
+ }
+ nedidx++;
+ }
+ while(onedidx < res.ned_->size() && (*res.ned_)[onedidx].pos == i) { // consume res's edits at row i
+ if((*res.ned_)[onedidx].isRefGap()) {
+ // Next other_left will be in same column as this round
+ odiff = 0;
+ }
+ onedidx++;
+ }
+ if(i < rdexrows_ - 1) {
+ // See how many inserts there are before the next read
+ // character
+ size_t nedidx_next = nedidx; // look-ahead only: the real cursors are not advanced here
+ size_t onedidx_next = onedidx;
+ while(nedidx_next < ned_->size() &&
+ (*ned_)[nedidx_next].pos == i+1)
+ {
+ if((*ned_)[nedidx_next].isReadGap()) {
+ my_right++; // each read gap at the next row widens this row's span by one column
+ }
+ nedidx_next++;
+ }
+ while(onedidx_next < res.ned_->size() &&
+ (*res.ned_)[onedidx_next].pos == i+1)
+ {
+ if((*res.ned_)[onedidx_next].isReadGap()) {
+ other_right++;
+ }
+ onedidx_next++;
+ }
+ }
+ // Contained?
+ olap =
+ (my_left >= other_left && my_right <= other_right) ||
+ (other_left >= my_left && other_right <= my_right);
+ // Overlapping but not contained?
+ if(!olap) {
+ olap =
+ (my_left <= other_left && my_right > other_left) ||
+ (other_left <= my_left && other_right > my_left);
+ }
+ if(olap) {
+ break; // the edit order is still restored below before returning
+ }
+ // Advance both left edges to the column where the next row starts
+ my_left = my_right + diff - 1;
+ other_left = other_right + odiff - 1;
+ }
+ if(!fw()) { // undo the temporary re-ordering done before the scan
+ invertEdits();
+ }
+ if(!res.fw()) {
+ res.invertEdits();
+ }
+ return olap;
+}
+
+#ifndef NDEBUG
+
+/**
+ * Debug-only check: assuming this AlnRes is an alignment for 'rd', verify that 'rd' with the edits applied
+ * agrees with the reference fetched from 'ref'. Returns true iff all positions agree; else prints a diagnostic to cerr.
+ */
+bool AlnRes::matchesRef(
+ const Read& rd,
+ const BitPairReference& ref,
+ BTDnaString& rf, // scratch/out: reference characters implied by read + edits (filled by Edit::toRef)
+ BTDnaString& rdseq, // scratch/out: read sequence, reverse-complemented for reverse-strand alignments
+ BTString& qseq, // not used by the code below
+ SStringExpandable& raw_refbuf, // scratch/out: concatenated reference segments fetched from 'ref'
+ SStringExpandable& destU32, // scratch handed through to ref.getStretch()
+ EList& matches, // scratch/out: per-position agreement flags
+ SStringExpandable& raw_refbuf2, // scratch: buffer for a single fetched segment
+ EList& reflens, // scratch/out: length of each reference segment between splices
+ EList& refoffs) // scratch/out: start offset of each reference segment
+{
+ assert(!empty());
+ assert(repOk());
+ assert(refcoord_.inited());
+ size_t rdlen = rd.length();
+ bool fw = refcoord_.fw();
+ if(!fw) { // presumably flips edit positions to left-to-right order; the same call below undoes it
+ assert_lt(trim3p_, rdlen);
+ Edit::invertPoss(const_cast&>(*ned_), rdlen - trim5p_ - trim3p_, false);
+ }
+ size_t refallen = 0; // total reference length over all segments
+ reflens.clear(); refoffs.clear();
+ int64_t reflen = 0; // length of the segment currently being measured
+ int64_t refoff = refcoord_.off();
+ refoffs.push_back((uint32_t)refoff);
+ size_t eidx = 0;
+ assert_lt(trim5p_ + trim3p_, rdlen);
+ for(size_t i = 0; i < rdlen - trim5p_ - trim3p_; i++, reflen++, refoff++) { // by default each read row consumes one reference position
+ while(eidx < ned_->size() && (*ned_)[eidx].pos == i) {
+ if((*ned_)[eidx].isReadGap()) { // read gap: one extra reference position is consumed
+ reflen++;
+ refoff++;
+ } else if((*ned_)[eidx].isRefGap()) { // ref gap: cancels this row's default advance
+ reflen--;
+ refoff--;
+ }
+ if((*ned_)[eidx].isSpliced()) { // close the current segment and jump over the spliced-out stretch
+ assert_gt(reflen, 0);
+ refallen += (uint32_t)reflen;
+ reflens.push_back((uint32_t)reflen);
+ reflen = 0;
+ refoff += (*ned_)[eidx].splLen;
+ assert_gt(refoff, 0);
+ refoffs.push_back((uint32_t)refoff);
+ }
+ eidx++;
+ }
+ }
+ assert_gt(reflen, 0);
+ refallen += (uint32_t)reflen; // account for the final segment
+ reflens.push_back((uint32_t)reflen);
+ assert_gt(reflens.size(), 0);
+ assert_gt(refoffs.size(), 0);
+ assert_eq(reflens.size(), refoffs.size());
+ if(!fw) { // restore the edit positions changed at the top
+ assert_lt(trim3p_, rdlen);
+ Edit::invertPoss(const_cast&>(*ned_), rdlen - trim5p_ - trim3p_, false);
+ }
+
+ // Adjust reference string length according to edits
+#ifndef NDEBUG
+ if(reflens.size() == 1) {
+ assert_eq(refallen, refNucExtent());
+ }
+#endif
+
+ assert_geq(refcoord_.ref(), 0);
+ int nsOnLeft = 0; // how far the alignment hangs off the left end of the reference
+ if(refcoord_.off() < 0) {
+ nsOnLeft = -((int)refcoord_.off());
+ }
+ raw_refbuf.resize(refallen);
+ raw_refbuf.clear(); // NOTE(review): resize directly followed by clear — presumably just reserves capacity; confirm
+ raw_refbuf2.clear();
+ for(size_t i = 0; i < reflens.size(); i++) {
+ assert_gt(reflens[i], 0);
+#ifndef NDEBUG
+ if(i > 0) {
+ assert_gt(refoffs[i], refoffs[i-1]);
+ }
+#endif
+ raw_refbuf2.resize(reflens[i] + 16); // 16 chars of slack; see 'off' below
+ raw_refbuf2.clear();
+ int off = ref.getStretch(
+ reinterpret_cast(raw_refbuf2.wbuf()),
+ (size_t)refcoord_.ref(),
+ (size_t)max(refoffs[i], 0),
+ reflens[i],
+ destU32);
+ assert_leq(off, 16);
+ raw_refbuf.append(raw_refbuf2.wbuf() + off, reflens[i]); // the fetched segment starts 'off' chars into the buffer
+ }
+ char *refbuf = raw_refbuf.wbuf();
+ size_t trim5 = 0, trim3 = 0; // only soft trimming is counted toward these
+ if(trimSoft_) {
+ trim5 += trim5p_;
+ trim3 += trim3p_;
+ }
+ if(pretrimSoft_) {
+ trim5 += pretrim5p_;
+ trim3 += pretrim3p_;
+ }
+ rf.clear();
+ rdseq.clear();
+ rdseq = rd.patFw;
+ if(!fw) {
+ rdseq.reverseComp(false);
+ }
+ assert_eq(rdrows_, rdseq.length());
+ // rdseq is the nucleotide sequence from upstream to downstream on the
+ // Watson strand. ned_ are the nucleotide edits from upstream to
+ // downstream. rf contains the reference characters.
+ assert(Edit::repOk(*ned_, rdseq, fw, trim5, trim3));
+ Edit::toRef(rdseq, *ned_, rf, fw, trim5, trim3);
+ assert_eq(refallen, rf.length());
+ matches.clear();
+ bool matchesOverall = true;
+ matches.resize(refallen);
+ matches.fill(true);
+ for(size_t i = 0; i < refallen; i++) {
+ if((int)i < nsOnLeft) { // positions off the left end of the reference must be code 4 ('N' in "ACGTN")
+ if((int)rf[i] != 4) {
+ matches[i] = false;
+ matchesOverall = false;
+ }
+ } else {
+ if((int)rf[i] != (int)refbuf[i-nsOnLeft]) { // refbuf holds no entries for the overhang, hence the shift
+ matches[i] = false;
+ matchesOverall = false;
+ }
+ }
+ }
+ if(!matchesOverall) {
+ // Print a friendly message showing the difference between the
+ // reference sequence obtained with Edit::toRef and the actual
+ // reference sequence
+ cerr << endl;
+ Edit::printQAlignNoCheck(
+ cerr,
+ " ",
+ rdseq,
+ *ned_);
+ cerr << " ";
+ for(size_t i = 0; i < refallen; i++) {
+ cerr << (matches[i] ? " " : "*"); // '*' marks each disagreeing position
+ }
+ cerr << endl;
+ cerr << " ";
+ for(size_t i = 0; i < refallen-nsOnLeft; i++) {
+ cerr << "ACGTN"[(int)refbuf[i]];
+ }
+ cerr << endl;
+ Edit::printQAlign(
+ cerr,
+ " ",
+ rdseq,
+ *ned_);
+ cerr << endl;
+ }
+ return matchesOverall;
+}
+
+#endif /*ndef NDEBUG*/
+
+#define COPY_BUF() { /* copy NUL-terminated 'buf' to 'occ' (both taken from the expanding scope); 'occ' ends just past the copy, no NUL is written */ \
+ char *bufc = buf; \
+ while(*bufc != '\0') { \
+ *occ = *bufc; \
+ occ++; \
+ bufc++; \
+ } \
+}
+
+/**
+ * Initialize the stacked alignment with respect to a read string, a list of
+ * edits (expressed left-to-right), and integers indicating how much hard and
+ * soft trimming has occurred on either end of the read. Every per-column
+ * stack, including the splice-length stack, is rebuilt from scratch.
+ *
+ * s: read sequence
+ * ed: all relevant edits, including ambiguous nucleotides; positions are relative to the soft-trimmed read
+ * trimLS, trimRS: # bases soft-trimmed from LHS / RHS
+ * trimLH, trimRH: # bases hard-trimmed from LHS / RHS (only recorded here)
+ */
+void StackedAln::init(
+    const BTDnaString& s,
+    const EList& ed,
+    size_t trimLS,
+    size_t trimLH,
+    size_t trimRS,
+    size_t trimRH)
+{
+    trimLS_ = trimLS;
+    trimLH_ = trimLH;
+    trimRS_ = trimRS;
+    trimRH_ = trimRH;
+    ASSERT_ONLY(size_t ln_postsoft = s.length() - trimLS - trimRS);
+    stackRef_.clear();
+    stackRel_.clear();
+    stackSNP_.clear();
+    stackRead_.clear();
+    stackSkip_.clear(); // buildCigar() reads splice lengths by index from 0, so stale ones must not survive
+    size_t rdoff = trimLS; // next read character to be stacked
+    for(size_t i = 0; i < ed.size(); i++) {
+        assert_lt(ed[i].pos, ln_postsoft);
+        size_t pos = ed[i].pos + trimLS; // edit position in untrimmed read coordinates
+        while(rdoff < pos) { // plain matches up to the edit
+            int c = s[rdoff++];
+            assert_range(0, 4, c);
+            stackRef_.push_back("ACGTN"[c]);
+            stackRel_.push_back('=');
+            stackSNP_.push_back(false);
+            stackRead_.push_back("ACGTN"[c]);
+        }
+        if(ed[i].isMismatch()) {
+            int c = s[rdoff++];
+            assert_range(0, 4, c);
+            assert_eq(c, asc2dna[(int)ed[i].qchr]);
+            assert_neq(c, asc2dna[(int)ed[i].chr]);
+            stackRef_.push_back(ed[i].chr);
+            stackRel_.push_back('X');
+            stackSNP_.push_back(ed[i].snpID != (uint32_t)INDEX_MAX);
+            stackRead_.push_back("ACGTN"[c]);
+        } else if(ed[i].isRefGap()) { // insertion: consumes a read character, '-' on the reference row
+            int c = s[rdoff++];
+            assert_range(0, 4, c);
+            assert_eq(c, asc2dna[(int)ed[i].qchr]);
+            stackRef_.push_back('-');
+            stackRel_.push_back('I');
+            stackSNP_.push_back(ed[i].snpID != (uint32_t)INDEX_MAX);
+            stackRead_.push_back("ACGTN"[c]);
+        } else if(ed[i].isReadGap()) { // deletion: no read character consumed, '-' on the read row
+            stackRef_.push_back(ed[i].chr);
+            stackRel_.push_back('D');
+            stackSNP_.push_back(ed[i].snpID != (uint32_t)INDEX_MAX);
+            stackRead_.push_back('-');
+        } else if(ed[i].isSpliced()) { // splice: a single 'N' column, its length kept in stackSkip_
+            stackRef_.push_back('N');
+            stackRel_.push_back('N');
+            stackSNP_.push_back(false);
+            stackRead_.push_back('N');
+            assert_gt(ed[i].splLen, 0);
+            stackSkip_.push_back(ed[i].splLen);
+        }
+    }
+    while(rdoff < s.length() - trimRS) { // matches after the last edit, up to the right soft trim
+        int c = s[rdoff++];
+        assert_range(0, 4, c);
+        stackRef_.push_back("ACGTN"[c]);
+        stackRel_.push_back('=');
+        stackSNP_.push_back(false);
+        stackRead_.push_back("ACGTN"[c]);
+    }
+    inited_ = true;
+}
+
+/**
+ * Left-align all the gaps. If this changes the alignment, the cached CIGAR
+ * and MD:Z lists are flagged stale so that they get rebuilt on next use.
+ *
+ * We left-align gaps in the following way: for each gap, we check whether
+ * the character opposite the rightmost gap character is the same as the
+ * character opposite the character just to the left of the gap. If so, we
+ * slide the gap one column to the left, turning the rightmost position
+ * previously covered by the gap into a non-gap, and repeat. Gap columns flagged
+ * in stackSNP_ are skipped, and a gap is never moved into column 0.
+ *
+ * pastMms: if true a gap may slide past mismatch ('X') and splice ('N')
+ * columns; if false sliding stops there. BWA does seem to allow this; it is less
+ * clearly right here, since it can swap one mismatch's base quality for another's.
+ */
+void StackedAln::leftAlign(bool pastMms) {
+    assert(inited_);
+    bool changed = false;
+    size_t ln = stackRef_.size();
+    // Scan left-to-right
+    for(size_t i = 0; i < ln; i++) {
+        int rel = stackRel_[i];
+        if(rel != '=' && rel != 'X' && rel != 'N') {
+            // Neither a match nor a mismatch - must be a gap
+            assert(rel == 'I' || rel == 'D');
+            if(stackSNP_[i]) continue;
+            size_t glen = 1;
+            // Scan further right to measure length of gap
+            for(size_t j = i+1; j < ln; j++) {
+                if(rel != (int)stackRel_[j]) break;
+                glen++;
+            }
+            // We've identified a gap of type 'rel' (D = deletion or read
+            // gap, I = insertion or ref gap) with length 'glen'. Now we
+            // can try to slide it to the left repeatedly.
+            size_t l = i - 1; // wraps when i == 0; the 'i > 0' guard below keeps it from being used
+            size_t r = l + glen; // rightmost column of the gap
+            EList& gp = ((rel == 'I') ? stackRef_ : stackRead_); // row holding the '-' characters
+            const EList& ngp = ((rel == 'I') ? stackRead_ : stackRef_); // row opposite the gap
+            while(i > 0 && l > 0 && ngp[l] == ngp[r]) {
+                if(stackRel_[l] == 'I' || stackRel_[l] == 'D') break; // never merge into another gap
+                assert(stackRel_[l] == '=' || stackRel_[l] == 'X' || stackRel_[l] == 'N');
+                assert(stackRel_[r] == 'D' || stackRel_[r] == 'I');
+                if(!pastMms && (stackRel_[l] == 'X' || stackRel_[l] == 'N')) {
+                    break;
+                }
+                swap(gp[l], gp[r]);
+                swap(stackRel_[l], stackRel_[r]);
+                assert_neq('-', gp[r]);
+                assert_eq('-', gp[l]);
+                l--; r--;
+                changed = true;
+            }
+            i += (glen-1); // resume the scan after the gap's original extent
+        }
+    }
+    if(changed) {
+        cigCalc_ = mdzCalc_ = false;
+    }
+}
+
+/**
+ * Populate cigOp_ / cigRun_ from the stacked alignment unless they are
+ * already up to date. Returns true iff the lists were built by this call.
+ * When 'xeq' is false, '=' and 'X' columns are both reported as 'M'.
+ */
+bool StackedAln::buildCigar(bool xeq) {
+    assert(inited_);
+    if(cigCalc_) return false; // cached lists are still valid
+    cigOp_.clear();
+    cigRun_.clear();
+    if(trimLS_ > 0) { // leading soft clip
+        cigOp_.push_back('S');
+        cigRun_.push_back(trimLS_);
+    }
+    const size_t ncols = stackRef_.size();
+    size_t nextSkip = 0; // next unused entry of stackSkip_
+    size_t col = 0;
+    // Walk the columns left to right, emitting one (op, run) pair per run
+    while(col < ncols) {
+        char cur = stackRel_[col];
+        if(!xeq && (cur == 'X' || cur == '=')) cur = 'M';
+        size_t len;
+        if(cur == 'N') {
+            // Each 'N' column is an op of its own; its run length is the
+            // next entry of stackSkip_ rather than a column count
+            assert_lt(nextSkip, stackSkip_.size());
+            len = stackSkip_[nextSkip];
+            nextSkip++;
+            col++;
+        } else {
+            // Grow the run for as long as the (collapsed) op repeats
+            len = 1;
+            while(col + len < ncols) {
+                char nxt = stackRel_[col + len];
+                if(!xeq && (nxt == 'X' || nxt == '=')) nxt = 'M';
+                if(nxt != cur) break;
+                len++;
+            }
+            col += len;
+        }
+        cigOp_.push_back(cur);
+        cigRun_.push_back(len);
+    }
+    if(trimRS_ > 0) { // trailing soft clip
+        cigOp_.push_back('S');
+        cigRun_.push_back(trimRS_);
+    }
+    cigCalc_ = true;
+    return true;
+}
+
+/**
+ * Build the MD:Z op lists (mdzOp_, mdzChr_, mdzRun_) from the stacked
+ * alignment, if they haven't been built already. Returns true iff they
+ * were built by this call.
+ */
+bool StackedAln::buildMdz() {
+    assert(inited_);
+    if(mdzCalc_) return false; // cached lists are still valid
+    mdzOp_.clear();
+    mdzChr_.clear();
+    mdzRun_.clear();
+    const size_t ncols = stackRef_.size();
+    size_t col = 0;
+    while(col < ncols) {
+        const char rel = stackRel_[col];
+        size_t step = 1; // columns consumed by this iteration
+        switch(rel) {
+        case '=': {
+            // A match run also swallows 'I' and 'N' columns, but only '=' columns count toward its length
+            size_t nmatch = 1;
+            for(; col + step < ncols; step++) {
+                const char nxt = stackRel_[col + step];
+                if(nxt == '=') nmatch++;
+                else if(nxt != 'I' && nxt != 'N') break;
+            }
+            mdzOp_.push_back('='); // = X or G
+            mdzChr_.push_back('-');
+            mdzRun_.push_back(nmatch);
+            break;
+        }
+        case 'X': // mismatch: record the reference character
+            assert_neq(stackRef_[col], stackRead_[col]);
+            mdzOp_.push_back('X');
+            mdzChr_.push_back(stackRef_[col]);
+            mdzRun_.push_back(1);
+            break;
+        case 'D': // read gap: record the reference character under op 'G'
+            assert_neq('-', stackRef_[col]);
+            mdzOp_.push_back('G');
+            mdzChr_.push_back(stackRef_[col]);
+            mdzRun_.push_back(1);
+            break;
+        default: break; // any other column outside a match run contributes nothing
+        }
+        col += step;
+    }
+    mdzCalc_ = true;
+    return true;
+}
+
+/**
+ * Write a CIGAR representation of the alignment (as held in cigOp_ / cigRun_) to the given string
+ * and/or char buffer. Zero-length runs are skipped; 'occ', if given, is NUL-terminated. No-op if both are NULL.
+ */
+void StackedAln::writeCigar(
+ BTString* o, // if non-NULL, string to append to
+ char* occ) const // if non-NULL, character string to append to
+{
+ const EList& op = cigOp_;
+ const EList& run = cigRun_;
+ assert_eq(op.size(), run.size());
+ if(o != NULL || occ != NULL) {
+ char buf[128]; // scratch for the decimal text of one run length
+ ASSERT_ONLY(bool printed = false);
+ for(size_t i = 0; i < op.size(); i++) {
+ size_t r = run[i];
+ if(r > 0) {
+ itoa10(r, buf);
+ ASSERT_ONLY(printed = true);
+ if(o != NULL) {
+ o->append(buf);
+ o->append(op[i]);
+ }
+ if(occ != NULL) {
+ COPY_BUF(); // copies 'buf' to 'occ' and advances 'occ' past it
+ *occ = op[i];
+ occ++;
+ }
+ }
+ }
+ assert(printed); // at least one non-empty op is expected
+ if(occ != NULL) {
+ *occ = '\0';
+ }
+ }
+}
+
+/**
+ * Write a CIGAR representation of the alignment to 'o' and/or the char buffer 'occ'.
+ * For 'o': text is appended to cigarString, one (run, op) pair per op to cigarSegments, and
+ * the runs are added to cigarLength. 'occ' is NUL-terminated. Either may be NULL; no-op if both are.
+ */
+void StackedAln::writeCigar(Alignment* o, char* occ) const {
+    const EList& op = cigOp_;
+    const EList& run = cigRun_;
+    assert_eq(op.size(), run.size());
+    if(o == NULL && occ == NULL) return; // nowhere to write
+    char buf[128]; // scratch for the decimal text of one run length
+    ASSERT_ONLY(bool printed = false);
+    if(o != NULL) o->cigarSegments.reserve(op.size()); // 'o' may be NULL when only 'occ' is wanted
+    for(size_t i = 0; i < op.size(); i++) {
+        size_t r = run[i];
+        if(r == 0) continue; // zero-length runs are not emitted
+        itoa10(r, buf);
+        ASSERT_ONLY(printed = true);
+        if(o != NULL) {
+            o->cigarString.append(buf);
+            o->cigarString.append(op[i]);
+            o->cigarSegments.emplace_back(r, op[i]);
+            o->cigarLength += r;
+        }
+        if(occ != NULL) {
+            COPY_BUF(); // copies 'buf' to 'occ' and advances 'occ' past it
+            *occ++ = op[i];
+        }
+    }
+    assert(printed); // at least one non-empty op is expected
+    if(occ != NULL) *occ = '\0';
+}
+
+/**
+ * Write an MD:Z representation of the alignment (from mdzOp_ / mdzChr_ / mdzRun_) to the given string
+ * and/or char buffer. 'occ', if given, is NUL-terminated; entries whose run is zero are skipped.
+ */
+void StackedAln::writeMdz(BTString* o, char* occ) const {
+ char buf[128]; // scratch for the decimal text of one match run
+ bool mm_last = false; // previous item written was a mismatch
+ bool rdgap_last = false; // previous item written was a read-gap character
+ bool first_print = true; // nothing has been written yet
+ const EList& op = mdzOp_;
+ const EList& ch = mdzChr_;
+ const EList& run = mdzRun_;
+ for(size_t i = 0; i < op.size(); i++) {
+ size_t r = run[i];
+ if(r > 0) {
+ if(op[i] == '=') {
+ // Write run length
+ itoa10(r, buf);
+ if(o != NULL) { o->append(buf); }
+ if(occ != NULL) { COPY_BUF(); }
+ first_print = false;
+ mm_last = false;
+ rdgap_last = false;
+ } else if(op[i] == 'X') {
+ if(o != NULL) {
+ if(rdgap_last || mm_last || first_print) { // no match run precedes this mismatch: emit an explicit 0
+ o->append('0');
+ }
+ o->append(ch[i]);
+ }
+ if(occ != NULL) {
+ if(rdgap_last || mm_last || first_print) {
+ *occ = '0';
+ occ++;
+ }
+ *occ = ch[i];
+ occ++;
+ }
+ first_print = false;
+ mm_last = true;
+ rdgap_last = false;
+ } else if(op[i] == 'G') {
+ if(o != NULL) {
+ if(mm_last || first_print) { // a gap right after a mismatch or at the very start also gets a 0
+ o->append('0');
+ }
+ if(!rdgap_last) { // '^' opens a run of gap characters; consecutive ones share it
+ o->append('^');
+ }
+ o->append(ch[i]);
+ }
+ if(occ != NULL) {
+ if(mm_last || first_print) {
+ *occ = '0'; occ++;
+ }
+ if(!rdgap_last) {
+ *occ = '^'; occ++;
+ }
+ *occ = ch[i];
+ occ++;
+ }
+ first_print = false;
+ mm_last = false;
+ rdgap_last = true;
+ }
+ } // if r > 0
+ } // for loop over ops
+ if(mm_last || rdgap_last) { // output must not end on a mismatch or gap: close with a 0
+ if(o != NULL) { o->append('0'); }
+ if(occ != NULL) { *occ = '0'; occ++; }
+ }
+ if(occ != NULL) { *occ = '\0'; }
+}
+
+/**
+ * Append the aligned read's nucleotide sequence to 'o' using the letters A,
+ * C, G and T. The characters come from 'dns', which must be non-NULL and
+ * is emitted in full, in stored order; 'rd' is only consulted by an assertion.
+ */
+void AlnRes::printSeq(
+    const Read& rd, // read
+    const BTDnaString* dns, // already-decoded nucleotides
+    BTString& o) const // buffer to write to
+{
+    assert(!rd.patFw.empty());
+    assert(dns != NULL);
+    ASSERT_ONLY(size_t written = 0);
+    const size_t ntotal = dns->length();
+    size_t idx = 0;
+    while(idx < ntotal) {
+        const int code = dns->get(idx);
+        assert_range(0, 3, code); // only codes 0..3 index into "ACGT"
+        o.append("ACGT"[code]);
+        ASSERT_ONLY(written++);
+        idx++;
+    }
+#ifndef NDEBUG
+    // Edits must sit inside the sequence; a read gap may also sit just past its end
+    for(size_t e = 0; e < ned_->size(); e++) {
+        if((*ned_)[e].isReadGap()) {
+            assert_leq((*ned_)[e].pos, dns->length());
+        } else {
+            assert_lt((*ned_)[e].pos, dns->length());
+        }
+    }
+#endif
+}
+
+/**
+ * Append the decoded qualities in 'dqs' to 'o', excluding its first and last entry (nothing is appended
+ * for fewer than three entries). NOTE(review): printSeq() emits every entry; confirm the trim is intended.
+ */
+void AlnRes::printQuals(
+    const Read& rd, // read (not used)
+    const BTString* dqs, // already-decoded qualities
+    BTString& o) const // output stream to write to
+{
+    assert(dqs != NULL);
+    size_t len = dqs->length();
+    // 'i + 1 < len' rather than 'i < len - 1': len is unsigned, so len - 1 would wrap around when len == 0
+    for(size_t i = 1; i + 1 < len; i++) {
+        o.append(dqs->get(i));
+    }
+}
+
+/**
+ * Add all of the cells involved in the given alignment to the database (cells_ needs one entry per read row).
+ */
+void RedundantAlns::add(const AlnRes& res) {
+ assert(!cells_.empty());
+ TRefOff left = res.refoff(), right; // [left, right) is the reference span of the current row
+ const size_t len = res.readExtentRows();
+ if(!res.fw()) { // 'res' is modified despite being const; the matching call at the end reverts it
+ const_cast(res).invertEdits();
+ }
+ const EList& ned = res.ned();
+ size_t nedidx = 0;
+ assert_leq(len, cells_.size());
+ // For each row...
+ for(size_t i = 0; i < len; i++) {
+ size_t diff = 1; // amount to shift to right for next round
+ right = left + 1;
+ while(nedidx < ned.size() && ned[nedidx].pos == i) {
+ if(ned[nedidx].isRefGap()) {
+ // Next 'left' will be in same column as this round
+ diff = 0;
+ }
+ nedidx++;
+ }
+ if(i < len - 1) {
+ // See how many inserts there are before the next read
+ // character
+ size_t nedidx_next = nedidx; // look-ahead only; nedidx itself is not advanced here
+ while(nedidx_next < ned.size() && ned[nedidx_next].pos == i+1)
+ {
+ if(ned[nedidx_next].isReadGap()) {
+ right++;
+ }
+ nedidx_next++;
+ }
+ }
+ for(TRefOff j = left; j < right; j++) {
+ // Add to db
+ RedundantCell c(res.refid(), res.fw(), j, i);
+ ASSERT_ONLY(bool ret =) cells_[i].insert(c);
+ assert(ret); // the insert is expected to succeed (presumably: cell not yet present)
+ }
+ left = right + diff - 1;
+ }
+ if(!res.fw()) {
+ const_cast(res).invertEdits();
+ }
+}
+
+/**
+ * Return true iff the given alignment has at least one cell that overlaps
+ * one of the cells in the database. The database itself is not modified.
+ */
+bool RedundantAlns::overlap(const AlnRes& res) {
+ assert(!cells_.empty());
+ TRefOff left = res.refoff(), right; // [left, right) is the reference span of the current row
+ const size_t len = res.readExtentRows();
+ if(!res.fw()) { // 'res' is modified despite being const; the matching call at the end reverts it
+ const_cast(res).invertEdits();
+ }
+ const EList& ned = res.ned();
+ size_t nedidx = 0;
+ // For each row...
+ bool olap = false;
+ assert_leq(len, cells_.size());
+ for(size_t i = 0; i < len; i++) {
+ size_t diff = 1; // amount to shift to right for next round
+ right = left + 1;
+ while(nedidx < ned.size() && ned[nedidx].pos == i) {
+ if(ned[nedidx].isRefGap()) {
+ // Next 'left' will be in same column as this round
+ diff = 0;
+ }
+ nedidx++;
+ }
+ if(i < len - 1) {
+ // See how many inserts there are before the next read
+ // character
+ size_t nedidx_next = nedidx; // look-ahead only; nedidx itself is not advanced here
+ while(nedidx_next < ned.size() && ned[nedidx_next].pos == i+1)
+ {
+ if(ned[nedidx_next].isReadGap()) {
+ right++;
+ }
+ nedidx_next++;
+ }
+ }
+ for(TRefOff j = left; j < right; j++) {
+ // Look the cell up in the db
+ RedundantCell c(res.refid(), res.fw(), j, i);
+ if(cells_[i].contains(c)) {
+ olap = true;
+ break;
+ }
+ }
+ if(olap) {
+ break; // the edit order is still restored below before returning
+ }
+ left = right + diff - 1;
+ }
+ if(!res.fw()) {
+ const_cast(res).invertEdits();
+ }
+ return olap;
+}
+
+/**
+ * Given all the paired (rs1/rs2: both NULL or both set) and unpaired (rs1u/rs2u) results involving
+ * mates #1 and #2, calculate best and second-best scores per mate and per pair and pass them to the other
+ * init() overload; with no alignments at all, reset() and keep only orefid/orefoff/repeat. Used for future MAPQ calculations.
+ */
+void AlnSetSumm::init(
+ const Read* rd1,
+ const Read* rd2,
+ const EList* rs1,
+ const EList* rs2,
+ const EList* rs1u,
+ const EList* rs2u,
+ bool exhausted1,
+ bool exhausted2,
+ TRefId orefid,
+ TRefOff orefoff,
+ bool repeat)
+{
+ assert(rd1 != NULL || rd2 != NULL);
+ assert((rs1 == NULL) == (rs2 == NULL)); // the paired lists come together or not at all
+ AlnScore best[2], secbest[2], bestPaired, secbestPaired;
+ size_t szs[2];
+ best[0].invalidate(); secbest[0].invalidate();
+ best[1].invalidate(); secbest[1].invalidate();
+ bestPaired.invalidate(); secbestPaired.invalidate();
+ bool paired = (rs1 != NULL && rs2 != NULL);
+ szs[0] = szs[1] = 0;
+ TNumAlns numAlns1 = 0, numAlns2 = 0, numAlnsPaired = 0;
+ if(paired) {
+ // Paired alignments
+ assert_eq(rs1->size(), rs2->size());
+ szs[0] = szs[1] = rs1->size();
+ assert_gt(szs[0], 0);
+ numAlnsPaired = szs[0];
+ for(size_t i = 0; i < rs1->size(); i++) {
+ AlnScore sc = (*rs1)[i].score() + (*rs2)[i].score(); // pair score = sum of the two mates' scores
+ if(sc > bestPaired) {
+ secbestPaired = bestPaired; // the old best becomes the runner-up
+ bestPaired = sc;
+ assert(VALID_AL_SCORE(bestPaired));
+ } else if(sc > secbestPaired) { // a score equal to the current best also ends up here
+ secbestPaired = sc;
+ assert(VALID_AL_SCORE(bestPaired));
+ assert(VALID_AL_SCORE(secbestPaired));
+ }
+ }
+ }
+ for(int j = 0; j < 2; j++) {
+ const EList* rs = (j == 0 ? rs1u : rs2u);
+ if(rs == NULL) {
+ continue;
+ }
+ assert(rs != NULL); // always true here; the NULL case was skipped just above
+ szs[j] = rs->size(); // replaces the paired count stored above, if any
+ if(j == 0) {
+ numAlns1 = szs[j];
+ } else {
+ numAlns2 = szs[j];
+ }
+ //assert_gt(szs[j], 0);
+ for(size_t i = 0; i < rs->size(); i++) {
+ AlnScore sc = (*rs)[i].score();
+ if(sc > best[j]) {
+ secbest[j] = best[j];
+ best[j] = sc;
+ assert(VALID_AL_SCORE(best[j]));
+ } else if(sc > secbest[j]) {
+ secbest[j] = sc;
+ assert(VALID_AL_SCORE(best[j]));
+ assert(VALID_AL_SCORE(secbest[j]));
+ }
+ }
+ }
+ if(szs[0] > 0 || szs[1] > 0) {
+ init(
+ best[0],
+ secbest[0],
+ best[1],
+ secbest[1],
+ bestPaired,
+ secbestPaired,
+ (szs[0] == 0) ? 0 : (szs[0] - 1), // count minus one, floored at 0 — presumably alignments besides the best; TODO confirm
+ (szs[1] == 0) ? 0 : (szs[1] - 1),
+ paired,
+ exhausted1,
+ exhausted2,
+ orefid,
+ orefoff,
+ repeat,
+ numAlns1,
+ numAlns2,
+ numAlnsPaired);
+ } else {
+ reset(); // nothing aligned: start from a clean summary, then record the three pass-through values
+ orefid_ = orefid;
+ orefoff_ = orefoff;
+ repeat_ = repeat;
+ }
+}
+
+/**
+ * If one of the filter flags is unset, append a YF:Z optional field naming the first such
+ * filter (LN, NS, SC, QC, checked in that order), preceded by a tab unless 'first' is set.
+ * Returns false iff a field was written, true if there was nothing to report.
+ */
+bool AlnFlags::printYF(BTString& o, bool first) const {
+    const char *code = NULL;
+    // The first unset flag decides which code is reported
+    if(!lenfilt_)      code = "LN";
+    else if(!nfilt_)   code = "NS";
+    else if(!scfilt_)  code = "SC";
+    else if(!qcfilt_)  code = "QC";
+    if(code == NULL) return true; // every filter flag is set
+    if(!first) o.append('\t');
+    o.append("YF:Z:");
+    o.append(code);
+    return false;
+}
+
+
+/**
+ * Append the YM:i optional field, which indicates whether the mate per se
+ * aligned repetitively: 1 when maxed() is true, 0 otherwise.
+ */
+void AlnFlags::printYM(BTString& o) const {
+    o.append("YM:i:");
+    if(maxed()) { o.append('1'); } else { o.append('0'); }
+}
+
+/**
+ * Append the YP:i optional field, which indicates whether the pair containing
+ * the mate aligned repetitively: 1 when maxedPair() is true, 0 otherwise.
+ */
+void AlnFlags::printYP(BTString& o) const {
+    o.append("YP:i:");
+    if(maxedPair()) { o.append('1'); } else { o.append('0'); }
+}
+
+/**
+ * Append the YT:Z optional field: CP, DP, UP or UU according to the first of alignedConcordant(),
+ * alignedDiscordant(), alignedUnpairedMate(), alignedUnpaired() that holds. Throws 1 if none does.
+ */
+void AlnFlags::printYT(BTString& o) const {
+    o.append("YT:Z:");
+    const char *code = NULL;
+    // Checked in order of precedence; the first predicate that holds decides
+    if(alignedConcordant())        code = "CP";
+    else if(alignedDiscordant())   code = "DP";
+    else if(alignedUnpairedMate()) code = "UP";
+    else if(alignedUnpaired())     code = "UU";
+    if(code == NULL) throw 1; // note: the "YT:Z:" prefix has already been appended at this point
+    o.append(code);
+}
+
+#ifdef ALIGNER_RESULT_MAIN
+
+#include "mem_ids.h"
+
+int main() {
+ EList op;
+ EList ch;
+ EList run;
+ {
+ // On top of each other, same length
+ cerr << "Test case 1, simple overlap 1 ... ";
+ AlnRes res1;
+ res1.init(
+ 10,
+ AlnScore(),
+ NULL,
+ NULL,
+ NULL,
+ Coord(0, 0, true),
+ false);
+ AlnRes res2;
+ res2.init(
+ 10,
+ AlnScore(),
+ NULL,
+ NULL,
+ NULL,
+ Coord(0, 0, true),
+ false);
+ assert(res1.overlap(res2));
+
+ // Try again, but using the redundant-alignment database
+ RedundantAlns ra;
+ ra.reset();
+ ra.init(10);
+ ra.add(res1);
+ assert(ra.overlap(res1));
+ assert(ra.overlap(res2));
+
+ char buf1[1024];
+ res1.printCigar(false, false, false, op, run, NULL, buf1);
+ assert_eq(0, strcmp(buf1, "10M"));
+ res1.printCigar(false, false, true, op, run, NULL, buf1);
+ assert_eq(0, strcmp(buf1, "10="));
+
+ char buf2[1024];
+ res2.printCigar(false, false, false, op, run, NULL, buf2);
+ assert_eq(0, strcmp(buf2, "10M"));
+ res2.printCigar(false, false, true, op, run, NULL, buf2);
+ assert_eq(0, strcmp(buf2, "10="));
+
+ char buf3[1024];
+ res1.printMD(false, false, op, ch, run, NULL, buf3);
+ assert_eq(0, strcmp(buf3, "10"));
+ res1.printMD(false, true, op, ch, run, NULL, buf3);
+ assert_eq(0, strcmp(buf3, "8"));
+
+ char buf4[1024];
+ res2.printMD(false, false, op, ch, run, NULL, buf4);
+ assert_eq(0, strcmp(buf4, "10"));
+ res2.printMD(false, true, op, ch, run, NULL, buf4);
+ assert_eq(0, strcmp(buf4, "8"));
+
+ cerr << "PASSED" << endl;
+ }
+
+ {
+ // On top of each other, different lengths
+ cerr << "Test case 2, simple overlap 2 ... ";
+ AlnRes res1;
+ res1.init(
+ 10,
+ AlnScore(),
+ NULL,
+ NULL,
+ NULL,
+ Coord(0, 0, true),
+ false);
+ AlnRes res2;
+ res2.init(
+ 11,
+ AlnScore(),
+ NULL,
+ NULL,
+ NULL,
+ Coord(0, 0, true),
+ false);
+ assert(res1.overlap(res2));
+
+ // Try again, but using the redundant-alignment database
+ RedundantAlns ra;
+ ra.reset();
+ ra.init(11);
+ ra.add(res1);
+ assert(ra.overlap(res1));
+ assert(ra.overlap(res2));
+
+ char buf1[1024];
+ res1.printCigar(false, false, false, op, run, NULL, buf1);
+ assert_eq(0, strcmp(buf1, "10M"));
+ res1.printCigar(false, false, true, op, run, NULL, buf1);
+ assert_eq(0, strcmp(buf1, "10="));
+
+ char buf2[1024];
+ res2.printCigar(false, false, false, op, run, NULL, buf2);
+ assert_eq(0, strcmp(buf2, "11M"));
+ res2.printCigar(false, false, true, op, run, NULL, buf2);
+ assert_eq(0, strcmp(buf2, "11="));
+
+ char buf3[1024];
+ res1.printMD(false, false, op, ch, run, NULL, buf3);
+ assert_eq(0, strcmp(buf3, "10"));
+ res1.printMD(false, true, op, ch, run, NULL, buf3);
+ assert_eq(0, strcmp(buf3, "8"));
+
+ char buf4[1024];
+ res2.printMD(false, false, op, ch, run, NULL, buf4);
+ assert_eq(0, strcmp(buf4, "11"));
+ res2.printMD(false, true, op, ch, run, NULL, buf4);
+ assert_eq(0, strcmp(buf4, "9"));
+
+ cerr << "PASSED" << endl;
+ }
+
+ {
+ // Different references
+ cerr << "Test case 3, simple overlap 3 ... ";
+ AlnRes res1;
+ res1.init(
+ 10,
+ AlnScore(),
+ NULL,
+ NULL,
+ NULL,
+ Coord(0, 1, true),
+ false);
+ AlnRes res2;
+ res2.init(
+ 11,
+ AlnScore(),
+ NULL,
+ NULL,
+ NULL,
+ Coord(0, 0, true),
+ false);
+ assert(!res1.overlap(res2));
+
+ // Try again, but using the redundant-alignment database
+ RedundantAlns ra;
+ ra.reset();
+ ra.init(11);
+ ra.add(res1);
+ assert(ra.overlap(res1));
+ assert(!ra.overlap(res2));
+
+ cerr << "PASSED" << endl;
+ }
+
+ {
+ // Different references
+ cerr << "Test case 4, simple overlap 4 ... ";
+ AlnRes res1;
+ res1.init(
+ 10,
+ AlnScore(),
+ NULL,
+ NULL,
+ NULL,
+ Coord(0, 0, true),
+ false);
+ AlnRes res2;
+ res2.init(
+ 10,
+ AlnScore(),
+ NULL,
+ NULL,
+ NULL,
+ Coord(1, 0, true),
+ false);
+ assert(!res1.overlap(res2));
+
+ // Try again, but using the redundant-alignment database
+ RedundantAlns ra;
+ ra.reset();
+ ra.init(10);
+ ra.add(res1);
+ assert(ra.overlap(res1));
+ assert(!ra.overlap(res2));
+
+ cerr << "PASSED" << endl;
+ }
+
+ {
+ // Different strands
+ cerr << "Test case 5, simple overlap 5 ... ";
+ AlnRes res1;
+ res1.init(
+ 10,
+ AlnScore(),
+ NULL,
+ NULL,
+ NULL,
+ Coord(0, 0, true),
+ false);
+ AlnRes res2;
+ res2.init(
+ 10,
+ AlnScore(),
+ NULL,
+ NULL,
+ NULL,
+ Coord(0, 0, false),
+ false);
+ assert(!res1.overlap(res2));
+
+ // Try again, but using the redundant-alignment database
+ RedundantAlns ra;
+ ra.reset();
+ ra.init(10);
+ ra.add(res1);
+ assert(ra.overlap(res1));
+ assert(!ra.overlap(res2));
+
+ cerr << "PASSED" << endl;
+ }
+
+ {
+ // Different strands
+ cerr << "Test case 6, simple overlap 6 ... ";
+ EList ned1(RES_CAT);
+ ned1.expand();
+ // 1 step to the right in the middle of the alignment
+ ned1.back().init(5, 'A' /*chr*/, '-' /*qchr*/, EDIT_TYPE_READ_GAP);
+ AlnRes res1;
+ res1.init(
+ 10,
+ AlnScore(),
+ &ned1,
+ NULL,
+ NULL,
+ Coord(0, 5, false),
+ false);
+ AlnRes res2;
+ res2.init(
+ 10,
+ AlnScore(),
+ NULL,
+ NULL,
+ NULL,
+ Coord(0, 6, false),
+ false);
+ assert(res1.overlap(res2));
+
+ // Try again, but using the redundant-alignment database
+ RedundantAlns ra;
+ ra.reset();
+ ra.init(10);
+ ra.add(res1);
+ assert(ra.overlap(res1));
+ assert(ra.overlap(res2));
+
+ char buf1[1024];
+ res1.printCigar(false, false, false, op, run, NULL, buf1);
+ assert_eq(0, strcmp(buf1, "5M1D5M"));
+ res1.printCigar(false, false, true, op, run, NULL, buf1);
+ assert_eq(0, strcmp(buf1, "5=1D5="));
+
+ char buf2[1024];
+ res2.printCigar(false, false, false, op, run, NULL, buf2);
+ assert_eq(0, strcmp(buf2, "10M"));
+ res2.printCigar(false, false, true, op, run, NULL, buf2);
+ assert_eq(0, strcmp(buf2, "10="));
+
+ char buf3[1024];
+ res1.printMD(false, false, op, ch, run, NULL, buf3);
+ assert_eq(0, strcmp(buf3, "5^A5"));
+ res1.printMD(false, true, op, ch, run, NULL, buf3);
+ assert_eq(0, strcmp(buf3, "4^A4"));
+
+ char buf4[1024];
+ res2.printMD(false, false, op, ch, run, NULL, buf4);
+ assert_eq(0, strcmp(buf4, "10"));
+ res2.printMD(false, true, op, ch, run, NULL, buf4);
+ assert_eq(0, strcmp(buf4, "8"));
+
+ cerr << "PASSED" << endl;
+ }
+
+ {
+ // Different strands
+ cerr << "Test case 7, simple overlap 7 ... ";
+ EList ned1(RES_CAT);
+ // 3 steps to the right in the middle of the alignment
+ ned1.push_back(Edit(5, 'A', '-', EDIT_TYPE_READ_GAP));
+ ned1.push_back(Edit(5, 'C', '-', EDIT_TYPE_READ_GAP));
+ ned1.push_back(Edit(5, 'G', '-', EDIT_TYPE_READ_GAP));
+ AlnRes res1;
+ res1.init(
+ 10,
+ AlnScore(),
+ &ned1,
+ NULL,
+ NULL,
+ Coord(0, 5, false),
+ false);
+ AlnRes res2;
+ res2.init(
+ 10,
+ AlnScore(),
+ NULL,
+ NULL,
+ NULL,
+ Coord(0, 6, false),
+ false);
+ assert(res1.overlap(res2));
+
+ // Try again, but using the redundant-alignment database
+ RedundantAlns ra;
+ ra.reset();
+ ra.init(10);
+ ra.add(res1);
+ assert(ra.overlap(res1));
+ assert(ra.overlap(res2));
+
+ char buf1[1024];
+ res1.printCigar(false, false, false, op, run, NULL, buf1);
+ assert_eq(0, strcmp(buf1, "5M3D5M"));
+ res1.printCigar(false, false, true, op, run, NULL, buf1);
+ assert_eq(0, strcmp(buf1, "5=3D5="));
+
+ char buf2[1024];
+ res2.printCigar(false, false, false, op, run, NULL, buf2);
+ assert_eq(0, strcmp(buf2, "10M"));
+ res2.printCigar(false, false, true, op, run, NULL, buf2);
+ assert_eq(0, strcmp(buf2, "10="));
+
+ char buf3[1024];
+ res1.printMD(false, false, op, ch, run, NULL, buf3);
+ assert_eq(0, strcmp(buf3, "5^GCA5"));
+ res1.printMD(false, true, op, ch, run, NULL, buf3);
+ assert_eq(0, strcmp(buf3, "4^GCA4"));
+
+ char buf4[1024];
+ res2.printMD(false, false, op, ch, run, NULL, buf4);
+ assert_eq(0, strcmp(buf4, "10"));
+ res2.printMD(false, true, op, ch, run, NULL, buf4);
+ assert_eq(0, strcmp(buf4, "8"));
+
+ cerr << "PASSED" << endl;
+ }
+
+ {
+ // Both with horizontal movements; overlap
+ cerr << "Test case 8, simple overlap 8 ... ";
+ EList ned1(RES_CAT);
+ // 2 steps to the right in the middle of the alignment
+ ned1.push_back(Edit(5, 'A', '-', EDIT_TYPE_READ_GAP));
+ ned1.push_back(Edit(5, 'C', '-', EDIT_TYPE_READ_GAP));
+ AlnRes res1;
+ res1.init(
+ 10,
+ AlnScore(),
+ &ned1,
+ NULL,
+ NULL,
+ Coord(0, 5, false),
+ false);
+ EList ned2(RES_CAT);
+ // 2 steps to the right in the middle of the alignment
+ ned2.push_back(Edit(5, 'A', '-', EDIT_TYPE_READ_GAP));
+ ned2.push_back(Edit(5, 'C', '-', EDIT_TYPE_READ_GAP));
+ AlnRes res2;
+ res2.init(
+ 10,
+ AlnScore(),
+ &ned2,
+ NULL,
+ NULL,
+ Coord(0, 6, false),
+ false);
+ assert(res1.overlap(res2));
+
+ // Try again, but using the redundant-alignment database
+ RedundantAlns ra;
+ ra.reset();
+ ra.init(10);
+ ra.add(res1);
+ assert(ra.overlap(res1));
+ assert(ra.overlap(res2));
+
+ char buf1[1024];
+ res1.printCigar(false, false, false, op, run, NULL, buf1);
+ assert_eq(0, strcmp(buf1, "5M2D5M"));
+ res1.printCigar(false, false, true, op, run, NULL, buf1);
+ assert_eq(0, strcmp(buf1, "5=2D5="));
+
+ char buf2[1024];
+ res2.printCigar(false, false, false, op, run, NULL, buf2);
+ assert_eq(0, strcmp(buf2, "5M2D5M"));
+ res2.printCigar(false, false, true, op, run, NULL, buf2);
+ assert_eq(0, strcmp(buf2, "5=2D5="));
+
+ cerr << "PASSED" << endl;
+ }
+
+ {
+ // Both with horizontal movements; no overlap
+ cerr << "Test case 9, simple overlap 9 ... ";
+ EList ned1(RES_CAT);
+ // 2 steps to the right in the middle of the alignment
+ ned1.push_back(Edit(6, 'A', '-', EDIT_TYPE_READ_GAP));
+ ned1.push_back(Edit(6, 'C', '-', EDIT_TYPE_READ_GAP));
+ AlnRes res1;
+ res1.init(
+ 10,
+ AlnScore(),
+ &ned1,
+ NULL,
+ NULL,
+ Coord(0, 5, true),
+ false);
+ EList ned2(RES_CAT);
+ // 2 steps to the right in the middle of the alignment
+ ned2.push_back(Edit(5, 'A', '-', EDIT_TYPE_READ_GAP));
+ ned2.push_back(Edit(5, 'C', '-', EDIT_TYPE_READ_GAP));
+ AlnRes res2;
+ res2.init(
+ 10,
+ AlnScore(),
+ &ned2,
+ NULL,
+ NULL,
+ Coord(0, 6, true),
+ false);
+ assert(!res1.overlap(res2));
+
+ // Try again, but using the redundant-alignment database
+ RedundantAlns ra;
+ ra.reset();
+ ra.init(10);
+ ra.add(res1);
+ assert(ra.overlap(res1));
+ assert(!ra.overlap(res2));
+
+ char buf1[1024];
+ res1.printCigar(false, false, false, op, run, NULL, buf1);
+ assert_eq(0, strcmp(buf1, "6M2D4M"));
+ res1.printCigar(false, false, true, op, run, NULL, buf1);
+ assert_eq(0, strcmp(buf1, "6=2D4="));
+
+ char buf2[1024];
+ res2.printCigar(false, false, false, op, run, NULL, buf2);
+ assert_eq(0, strcmp(buf2, "5M2D5M"));
+ res2.printCigar(false, false, true, op, run, NULL, buf2);
+ assert_eq(0, strcmp(buf2, "5=2D5="));
+
+ cerr << "PASSED" << endl;
+ }
+
+ {
+ // Both with horizontal movements; no overlap. Reverse strand.
+ cerr << "Test case 10, simple overlap 10 ... ";
+ EList ned1(RES_CAT);
+ // 2 steps to the right in the middle of the alignment
+ ned1.push_back(Edit(5, 'A', '-', EDIT_TYPE_READ_GAP));
+ ned1.push_back(Edit(5, 'C', '-', EDIT_TYPE_READ_GAP));
+ AlnRes res1;
+ res1.init(
+ 10,
+ AlnScore(),
+ &ned1,
+ NULL,
+ NULL,
+ Coord(0, 5, false),
+ false);
+ EList ned2(RES_CAT);
+ // 2 steps to the right in the middle of the alignment
+ ned2.push_back(Edit(6, 'A', '-', EDIT_TYPE_READ_GAP));
+ ned2.push_back(Edit(6, 'C', '-', EDIT_TYPE_READ_GAP));
+ AlnRes res2;
+ res2.init(
+ 10,
+ AlnScore(),
+ &ned2,
+ NULL,
+ NULL,
+ Coord(0, 6, false),
+ false);
+ assert(!res1.overlap(res2));
+
+ // Try again, but using the redundant-alignment database
+ RedundantAlns ra;
+ ra.reset();
+ ra.init(10);
+ ra.add(res1);
+ assert(ra.overlap(res1));
+ assert(!ra.overlap(res2));
+
+ char buf1[1024];
+ res1.printCigar(false, false, false, op, run, NULL, buf1);
+ assert_eq(0, strcmp(buf1, "5M2D5M"));
+ res1.printCigar(false, false, true, op, run, NULL, buf1);
+ assert_eq(0, strcmp(buf1, "5=2D5="));
+
+ char buf2[1024];
+ res2.printCigar(false, false, false, op, run, NULL, buf2);
+ assert_eq(0, strcmp(buf2, "4M2D6M"));
+ res2.printCigar(false, false, true, op, run, NULL, buf2);
+ assert_eq(0, strcmp(buf2, "4=2D6="));
+
+ cerr << "PASSED" << endl;
+ }
+
+ {
+ // Both with vertical movements; no overlap
+ cerr << "Test case 11, simple overlap 11 ... ";
+ EList ned1(RES_CAT);
+ // 2 steps to the right in the middle of the alignment
+ ned1.push_back(Edit(5, '-', 'A', EDIT_TYPE_REF_GAP));
+ ned1.push_back(Edit(6, '-', 'C', EDIT_TYPE_REF_GAP));
+ AlnRes res1;
+ res1.init(
+ 10,
+ AlnScore(),
+ &ned1,
+ NULL,
+ NULL,
+ Coord(0, 5, true),
+ false);
+ EList ned2(RES_CAT);
+ // 2 steps to the right in the middle of the alignment
+ ned2.push_back(Edit(6, '-', 'A', EDIT_TYPE_REF_GAP));
+ ned2.push_back(Edit(7, '-', 'C', EDIT_TYPE_REF_GAP));
+ AlnRes res2;
+ res2.init(
+ 10,
+ AlnScore(),
+ &ned2,
+ NULL,
+ NULL,
+ Coord(0, 6, true),
+ false);
+ assert(!res1.overlap(res2));
+
+ // Try again, but using the redundant-alignment database
+ RedundantAlns ra;
+ ra.reset();
+ ra.init(10);
+ ra.add(res1);
+ assert(ra.overlap(res1));
+ assert(!ra.overlap(res2));
+
+ char buf1[1024];
+ res1.printCigar(false, false, false, op, run, NULL, buf1);
+ assert_eq(0, strcmp(buf1, "5M2I3M"));
+ res1.printCigar(false, false, true, op, run, NULL, buf1);
+ assert_eq(0, strcmp(buf1, "5=2I3="));
+
+ char buf2[1024];
+ res2.printCigar(false, false, false, op, run, NULL, buf2);
+ assert_eq(0, strcmp(buf2, "6M2I2M"));
+ res2.printCigar(false, false, true, op, run, NULL, buf2);
+ assert_eq(0, strcmp(buf2, "6=2I2="));
+
+ cerr << "PASSED" << endl;
+ }
+
+ {
+ // Both with vertical movements; no overlap
+ cerr << "Test case 12, simple overlap 12 ... ";
+ EList ned1(RES_CAT);
+ // 2 steps to the right in the middle of the alignment
+ ned1.push_back(Edit(5, '-', 'A', EDIT_TYPE_REF_GAP));
+ ned1.push_back(Edit(6, '-', 'C', EDIT_TYPE_REF_GAP));
+ AlnRes res1;
+ res1.init(
+ 10,
+ AlnScore(),
+ &ned1,
+ NULL,
+ NULL,
+ Coord(0, 5, true),
+ false);
+ EList ned2(RES_CAT);
+ // 2 steps to the right in the middle of the alignment
+ ned2.push_back(Edit(5, '-', 'A', EDIT_TYPE_REF_GAP));
+ ned2.push_back(Edit(6, '-', 'C', EDIT_TYPE_REF_GAP));
+ AlnRes res2;
+ res2.init(
+ 10,
+ AlnScore(),
+ &ned2,
+ NULL,
+ NULL,
+ Coord(0, 6, true),
+ false);
+ assert(!res1.overlap(res2));
+
+ // Try again, but using the redundant-alignment database
+ RedundantAlns ra;
+ ra.reset();
+ ra.init(10);
+ ra.add(res1);
+ assert(ra.overlap(res1));
+ assert(!ra.overlap(res2));
+
+ char buf1[1024];
+ res1.printCigar(false, false, false, op, run, NULL, buf1);
+ assert_eq(0, strcmp(buf1, "5M2I3M"));
+ res1.printCigar(false, false, true, op, run, NULL, buf1);
+ assert_eq(0, strcmp(buf1, "5=2I3="));
+
+ char buf2[1024];
+ res2.printCigar(false, false, false, op, run, NULL, buf2);
+ assert_eq(0, strcmp(buf2, "5M2I3M"));
+ res2.printCigar(false, false, true, op, run, NULL, buf2);
+ assert_eq(0, strcmp(buf2, "5=2I3="));
+
+ cerr << "PASSED" << endl;
+ }
+
+ {
+ // Both with vertical movements; overlap
+ cerr << "Test case 13, simple overlap 13 ... ";
+ EList ned1(RES_CAT);
+ // 2 steps to the right in the middle of the alignment
+ ned1.push_back(Edit(5, '-', 'A', EDIT_TYPE_REF_GAP));
+ ned1.push_back(Edit(6, '-', 'C', EDIT_TYPE_REF_GAP));
+ AlnRes res1;
+ res1.init(
+ 10,
+ AlnScore(),
+ &ned1,
+ NULL,
+ NULL,
+ Coord(0, 5, true),
+ false);
+ EList ned2(RES_CAT);
+ // 2 steps to the right in the middle of the alignment
+ ned2.push_back(Edit(4, '-', 'A', EDIT_TYPE_REF_GAP));
+ ned2.push_back(Edit(5, '-', 'C', EDIT_TYPE_REF_GAP));
+ AlnRes res2;
+ res2.init(
+ 10,
+ AlnScore(),
+ &ned2,
+ NULL,
+ NULL,
+ Coord(0, 6, true),
+ false);
+ assert(res1.overlap(res2));
+
+ // Try again, but using the redundant-alignment database
+ RedundantAlns ra;
+ ra.reset();
+ ra.init(10);
+ ra.add(res1);
+ assert(ra.overlap(res1));
+ assert(ra.overlap(res2));
+
+ char buf1[1024];
+ res1.printCigar(false, false, false, op, run, NULL, buf1);
+ assert_eq(0, strcmp(buf1, "5M2I3M"));
+ res1.printCigar(false, false, true, op, run, NULL, buf1);
+ assert_eq(0, strcmp(buf1, "5=2I3="));
+
+ char buf2[1024];
+ res2.printCigar(false, false, false, op, run, NULL, buf2);
+ assert_eq(0, strcmp(buf2, "4M2I4M"));
+ res2.printCigar(false, false, true, op, run, NULL, buf2);
+ assert_eq(0, strcmp(buf2, "4=2I4="));
+
+ cerr << "PASSED" << endl;
+ }
+
+ {
+ // Not even close
+ cerr << "Test case 14, simple overlap 14 ... ";
+ EList ned1(RES_CAT);
+ // 2 steps to the right in the middle of the alignment
+ ned1.push_back(Edit(5, '-', 'A', EDIT_TYPE_REF_GAP));
+ ned1.push_back(Edit(6, '-', 'C', EDIT_TYPE_REF_GAP));
+ AlnRes res1;
+ res1.init(
+ 10,
+ AlnScore(),
+ &ned1,
+ NULL,
+ NULL,
+ Coord(0, 5, true),
+ false);
+ EList ned2(RES_CAT);
+ // 2 steps to the right in the middle of the alignment
+ ned2.push_back(Edit(4, '-', 'A', EDIT_TYPE_REF_GAP));
+ ned2.push_back(Edit(5, '-', 'C', EDIT_TYPE_REF_GAP));
+ AlnRes res2;
+ res2.init(
+ 10,
+ AlnScore(),
+ &ned2,
+ NULL,
+ NULL,
+ Coord(0, 400, true),
+ false);
+ assert(!res1.overlap(res2));
+
+ // Try again, but using the redundant-alignment database
+ RedundantAlns ra;
+ ra.reset();
+ ra.init(10);
+ ra.add(res1);
+ assert(ra.overlap(res1));
+ assert(!ra.overlap(res2));
+
+ char buf1[1024];
+ res1.printCigar(false, false, false, op, run, NULL, buf1);
+ assert_eq(0, strcmp(buf1, "5M2I3M"));
+ res1.printCigar(false, false, true, op, run, NULL, buf1);
+ assert_eq(0, strcmp(buf1, "5=2I3="));
+
+ char buf2[1024];
+ res2.printCigar(false, false, false, op, run, NULL, buf2);
+ assert_eq(0, strcmp(buf2, "4M2I4M"));
+ res2.printCigar(false, false, true, op, run, NULL, buf2);
+ assert_eq(0, strcmp(buf2, "4=2I4="));
+
+ cerr << "PASSED" << endl;
+ }
+
+ {
+ cerr << "Test case 15, CIGAR string with mismatches ... ";
+ EList ned(RES_CAT);
+ // 2 steps to the right in the middle of the alignment
+ ned.push_back(Edit(0, 'C', 'A', EDIT_TYPE_MM));
+ ned.push_back(Edit(4, '-', 'C', EDIT_TYPE_REF_GAP));
+ ned.push_back(Edit(6, '-', 'C', EDIT_TYPE_REF_GAP));
+ ned.push_back(Edit(7, '-', 'C', EDIT_TYPE_REF_GAP));
+ ned.push_back(Edit(9, '-', 'A', EDIT_TYPE_READ_GAP));
+ ned.push_back(Edit(9, '-', 'A', EDIT_TYPE_READ_GAP));
+ ned.push_back(Edit(9, '-', 'A', EDIT_TYPE_READ_GAP));
+ ned.push_back(Edit(9, '-', 'A', EDIT_TYPE_READ_GAP));
+ ned.push_back(Edit(10, '-', 'A', EDIT_TYPE_MM));
+ AlnRes res; res.init(
+ 11,
+ AlnScore(),
+ &ned,
+ NULL,
+ NULL,
+ Coord(0, 44, true),
+ false);
+ char buf[1024];
+ res.printCigar(false, false, false, op, run, NULL, buf);
+ assert_eq(0, strcmp(buf, "4M1I1M2I1M4D2M"));
+ res.printCigar(false, false, true, op, run, NULL, buf);
+ assert_eq(0, strcmp(buf, "1X3=1I1=2I1=4D1=1X"));
+ cerr << "PASSED" << endl;
+ }
+
+ {
+ cerr << "Test case 17, Overhang ... ";
+ EList ned(RES_CAT);
+ // 2 steps to the right in the middle of the alignment
+ ned.push_back(Edit(0, 'N', 'A', EDIT_TYPE_MM));
+ ned.push_back(Edit(5, 'C', 'A', EDIT_TYPE_MM));
+ AlnRes res; res.init(
+ 10,
+ AlnScore(),
+ &ned,
+ NULL,
+ NULL,
+ Coord(0, -1, true),
+ false);
+
+ char buf[1024];
+ res.printCigar(false, false, false, op, run, NULL, buf);
+ assert_eq(0, strcmp(buf, "10M"));
+ res.printCigar(false, false, true, op, run, NULL, buf);
+ assert_eq(0, strcmp(buf, "1X4=1X4="));
+ res.printMD(false, false, op, ch, run, NULL, buf);
+ assert_eq(0, strcmp(buf, "0N4C4"));
+
+ #if 0
+ AlnRes res2(res);
+ // Now soft-clip away the overhang
+ res2.clipOutside(
+ true, // soft clip
+ 0, // ref begins
+ 40); // ref ends (excl)
+ res2.printCigar(false, false, false, op, run, NULL, buf);
+ assert_eq(0, strcmp(buf, "1S9M"));
+ res2.printCigar(false, false, true, op, run, NULL, buf);
+ assert_eq(0, strcmp(buf, "4=1X4="));
+ res2.printMD(false, false, op, ch, run, NULL, buf);
+ assert_eq(0, strcmp(buf, "4C4"));
+
+ AlnRes res3 = res;
+ // Now hard-clip away the overhang
+ res3.clipOutside(
+ false, // hard clip
+ 0, // ref begins
+ 40); // ref ends (excl)
+ res3.printCigar(false, false, false, op, run, NULL, buf);
+ assert_eq(0, strcmp(buf, "9M"));
+ res3.printCigar(false, false, true, op, run, NULL, buf);
+ assert_eq(0, strcmp(buf, "4=1X4="));
+ res3.printMD(false, false, op, ch, run, NULL, buf);
+ assert_eq(0, strcmp(buf, "4C4"));
+ #endif
+
+ cerr << "PASSED" << endl;
+ }
+}
+
+#endif /*def ALIGNER_RESULT_MAIN*/
diff --git a/aligner_result.h b/aligner_result.h
new file mode 100644
index 0000000..745647e
--- /dev/null
+++ b/aligner_result.h
@@ -0,0 +1,2325 @@
+/*
+ * Copyright 2011, Ben Langmead
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef ALIGNER_RESULT_H_
+#define ALIGNER_RESULT_H_
+
+#include
+#include
+#include "mem_ids.h"
+#include "ref_coord.h"
+#include "read.h"
+#include "filebuf.h"
+#include "ds.h"
+#include "edit.h"
+#include "limit.h"
+#include "splice_site.h"
+#include "alignment_3n.h"
+
+typedef int64_t TAlScore;
+
+#define VALID_AL_SCORE(x) ((x).score_ > MIN_I64)
+#define VALID_SCORE(x) ((x) > MIN_I64)
+#define INVALIDATE_SCORE(x) ((x) = MIN_I64)
+
+/**
+ * A generic score object for an alignment. Used for accounting during
+ * SW and elsewhere. Encapsulates the score, the number of N positions
+ * and the number of gaps in the alignment.
+ *
+ * The scale for 'score' is such that a perfect alignment score is 0
+ * and a score with non-zero penalty is less than 0. So differences
+ * between scores work as expected, but interpreting an individual
+ * score (larger is better) as a penalty (smaller is better) requires
+ * taking the absolute value.
+ */
+class AlnScore {
+
+public:
+
+ /**
+ * Construct a zeroed score and mark it invalid until proven valid.
+ */
+ inline AlnScore() {
+ reset();
+ invalidate();
+ assert(!valid());
+ }
+
+ /**
+ * Construct a score from explicit components; hisat2_score_ is derived from them.
+ */
+ inline AlnScore(
+ TAlScore score,
+ TAlScore ns,
+ TAlScore gaps,
+ bool repeat = false,
+ TAlScore splicescore = 0,
+ bool knownTranscripts = false,
+ bool nearSpliceSites = false,
+ int leftTrim = 0,
+ int rightTrim = 0) {
+ score_ = score;
+ ns_ = ns;
+ gaps_ = gaps;
+ repeat_ = repeat;
+ splicescore_ = splicescore;
+ knownTranscripts_ = knownTranscripts;
+ nearSpliceSites_ = nearSpliceSites;
+ leftTrim_ = leftTrim;
+ rightTrim_ = rightTrim;
+ hisat2_score_ = calculate_hisat2_score();
+ assert(valid());
+ }
+
+ /**
+ * Reset every field to zero / false.
+ */
+ void reset() {
+ score_ = hisat2_score_ = ns_ = gaps_ = 0;
+ repeat_ = false;
+ splicescore_ = 0;
+ knownTranscripts_ = false;
+ nearSpliceSites_ = false;
+ leftTrim_ = 0;
+ rightTrim_ = 0;
+ }
+
+ /**
+ * Return an invalid SwScore.
+ */
+ inline static AlnScore INVALID() {
+ AlnScore s;
+ s.invalidate();
+ assert(!s.valid());
+ return s;
+ }
+
+ /**
+ * Return true iff this score has a valid value.
+ */
+ inline bool valid() const {
+ return score_ != MIN_I64;
+ }
+
+ /**
+ * Make this score invalid (and therefore <= all other scores).
+ */
+ inline void invalidate() {
+ score_ = MIN_I64;
+ assert(!valid());
+ }
+
+ /**
+ * Increment the number of Ns. If the new count exceeds nceil, the
+ * score is invalidated.
+ */
+ inline void incNs(int nceil) {
+ if(++ns_ > nceil) {
+ invalidate();
+ }
+ assert_lt(ns_, 0x7fffffff);
+ }
+
+ /**
+ * Return true iff this score is > score o.
+ * Note: An "invalid" score is <= all other scores.
+ */
+ inline bool operator>(const AlnScore& o) const {
+ if(!VALID_AL_SCORE(o)) {
+ if(!VALID_AL_SCORE(*this)) {
+ // both invalid
+ return false;
+ } else {
+ // I'm valid, other is invalid
+ return true;
+ }
+ } else if(!VALID_AL_SCORE(*this)) {
+ // I'm invalid, other is valid
+ return false;
+ }
+ return score_ > o.score_ || (score_ == o.score_ && hisat2_score_ > o.hisat2_score_);
+ }
+
+ /**
+ * Copy every field of o into this score.
+ */
+ inline AlnScore& operator=(const AlnScore& o) {
+ // Profiling shows many cache misses on following lines
+ gaps_ = o.gaps_;
+ ns_ = o.ns_;
+ score_ = o.score_;
+ repeat_ = o.repeat_;
+ hisat2_score_ = o.hisat2_score_;
+ splicescore_ = o.splicescore_;
+ knownTranscripts_ = o.knownTranscripts_;
+ nearSpliceSites_ = o.nearSpliceSites_;
+ leftTrim_ = o.leftTrim_;
+ rightTrim_ = o.rightTrim_;
+ assert_lt(ns_, 0x7fffffff);
+ return *this;
+ }
+
+ /**
+ * Scores are equal iff both are valid and score_ and hisat2_score_ match.
+ */
+ inline bool operator==(const AlnScore& o) const {
+ // Profiling shows cache misses on following line
+ return VALID_AL_SCORE(*this) && VALID_AL_SCORE(o) && score_ == o.score_ && hisat2_score_ == o.hisat2_score_;
+ }
+
+ /**
+ * Return true iff the two scores are unequal.
+ */
+ inline bool operator!=(const AlnScore& o) const {
+ return !(*this == o);
+ }
+
+ /**
+ * Return true iff this score is >= score o (false when both are invalid).
+ */
+ inline bool operator>=(const AlnScore& o) const {
+ if(!VALID_AL_SCORE(o)) {
+ if(!VALID_AL_SCORE(*this)) {
+ // both invalid
+ return false;
+ } else {
+ // I'm valid, other is invalid
+ return true;
+ }
+ } else if(!VALID_AL_SCORE(*this)) {
+ // I'm invalid, other is valid
+ return false;
+ }
+ return score_ > o.score_ || (score_ == o.score_ && hisat2_score_ >= o.hisat2_score_);
+ }
+
+ /**
+ * Return true iff this score is < score o.
+ */
+ inline bool operator<(const AlnScore& o) const {
+ return !operator>=(o);
+ }
+
+ /**
+ * Return true iff this score is <= score o.
+ */
+ inline bool operator<=(const AlnScore& o) const {
+ return !operator>(o);
+ }
+
+ /**
+ * Difference of two SwScores: gaps_, score_ and splicescore_ are subtracted; ns_ is kept.
+ */
+ inline AlnScore operator-(const AlnScore& o) const {
+ if(!VALID_AL_SCORE(*this)) return *this;
+ AlnScore s;
+ s.gaps_ = gaps_ - o.gaps_;
+ s.ns_ = ns_;
+ s.score_ = score_ - o.score_;
+ s.splicescore_ = splicescore_ - o.splicescore_;
+ assert_lt(s.ns_, 0x7fffffff);
+ return s;
+ }
+
+ /**
+ * Calculate sum of two SwScores.
+ */
+ inline AlnScore operator+(const AlnScore& o) const {
+ if(!VALID_AL_SCORE(*this)) return *this;
+ AlnScore s;
+ s.gaps_ = gaps_ + o.gaps_;
+ s.ns_ = ns_;
+ s.score_ = score_ + o.score_;
+ s.repeat_ = repeat_ | o.repeat_;
+ s.splicescore_ = splicescore_ + o.splicescore_;
+ s.hisat2_score_ = hisat2_score_ + o.hisat2_score_;
+ s.knownTranscripts_ = knownTranscripts_ | o.knownTranscripts_;
+ s.nearSpliceSites_ = nearSpliceSites_ | o.nearSpliceSites_;
+ s.leftTrim_ = leftTrim_ + o.leftTrim_;
+ s.rightTrim_ = rightTrim_ + o.rightTrim_;
+ assert_lt(s.ns_, 0x7fffffff);
+ return s;
+ }
+
+ /**
+ * Add given SwScore into this one.
+ */
+ inline AlnScore operator+=(const AlnScore& o) {
+ if(VALID_AL_SCORE(*this)) {
+ gaps_ += o.gaps_;
+ score_ += o.score_;
+ repeat_ |= o.repeat_;
+ splicescore_ += o.splicescore_;
+ hisat2_score_ += o.hisat2_score_;
+ knownTranscripts_ |= o.knownTranscripts_;
+ nearSpliceSites_ |= o.nearSpliceSites_;
+ leftTrim_ += o.leftTrim_;
+ rightTrim_ += o.rightTrim_;
+ }
+ return (*this);
+ }
+
+ /**
+ * Subtract given SwScore from this one.
+ */
+ inline AlnScore operator-=(const AlnScore& o) {
+ if(VALID_AL_SCORE(*this)) {
+ gaps_ -= o.gaps_;
+ score_ -= o.score_;
+ // splicescore_ -= o.splicescore_;
+ }
+ return (*this);
+ }
+
+ /**
+ * Subtract an integer from a SwScore (delegates to operator+(int) with -o).
+ */
+ inline AlnScore operator-(int o) const {
+ return (*this) + -o;
+ }
+
+ /**
+ * Calculate sum of a SwScore and an integer.
+ */
+ inline AlnScore operator+(int o) const {
+ if(!VALID_AL_SCORE(*this)) return *this;
+ AlnScore s;
+ s.gaps_ = gaps_;
+ s.ns_ = ns_;
+ s.score_ = score_ + o;
+ // s.splicescore_ = splicescore_;
+ assert_lt(s.ns_, 0x7fffffff);
+ return s;
+ }
+
+ TAlScore score() const { return score_; }
+ TAlScore hisat2_score() const { return hisat2_score_; }
+ TAlScore penalty() const { return -score_; }
+ TAlScore gaps() const { return gaps_; }
+ TAlScore ns() const { return ns_; }
+ bool repeat() const { return repeat_;}
+ TAlScore splicescore() const { return splicescore_; }
+ bool knownTranscripts() const { return knownTranscripts_; }
+ bool nearSpliceSites() const { return nearSpliceSites_; }
+ bool trimed() const { return leftTrim_ > 0 || rightTrim_ > 0; }
+
+ TAlScore calculate_hisat2_score() const
+ {
+ // Upper 32 bits hold score_, clamped to the int32 range
+ TAlScore score = score_; // may be negative: scaled by 2^32 below instead of shifted (negative << is UB before C++20)
+ if(score > MAX_I32) score = MAX_I32;
+ else if(score < MIN_I32) score = MIN_I32;
+
+ // Next 4 bits for repeat score
+ TAlScore repeat_score = 0;
+ if(repeat_) repeat_score = 1;
+
+ // Next 4 bits for alignments against transcripts
+ TAlScore transcript_score = 0;
+ if(knownTranscripts_) transcript_score = 2;
+ else if(nearSpliceSites_) transcript_score = 1;
+
+ // Next 8 bits for splice site score
+ TAlScore splicescore = splicescore_ / 100;
+ if(splicescore > MAX_U8) splicescore = 0;
+ else splicescore = MAX_U8 - splicescore;
+
+ // Remaining 16 bits (rightmost 16 bits) for sum of left and right trim lengths
+ TAlScore trim = leftTrim_ + rightTrim_;
+ if(trim > MAX_U16) trim = 0;
+ else trim = MAX_U16 - trim;
+ return (score * ((TAlScore)1 << 32)) | (repeat_score << 28) | (transcript_score << 24) | (splicescore << 16) | trim;
+ }
+
+ // Score accumulated so far (penalties are subtracted starting at 0)
+ TAlScore score_;
+
+ // HISAT2 score, which is used internally to distinguish the alignments of RNA-seq reads
+ TAlScore hisat2_score_;
+
+ // Ns accumulated so far. An N opposite a non-gap counts as 1 N
+ // (even if it's N-to-N)
+ TAlScore ns_;
+
+ // # gaps encountered so far, unless that number exceeds the
+ // target, in which case the score becomes invalid and therefore <=
+ // all other scores
+ TAlScore gaps_;
+
+ bool repeat_;
+
+ // splice scores
+ TAlScore splicescore_;
+
+ // mapped to known transcripts?
+ bool knownTranscripts_;
+
+ // continuous alignment near (known) splice sites?
+ bool nearSpliceSites_;
+
+ int leftTrim_;
+ int rightTrim_;
+};
+
+enum {
+ // Mates of a concordant paired-end alignment: this alignment and its
+ // partner together form a concordant alignment for the read
+ ALN_FLAG_PAIR_CONCORD_MATE1 = 1,
+ ALN_FLAG_PAIR_CONCORD_MATE2 = 2,
+
+ // Mates of a discordant paired-end alignment: this alignment and its
+ // partner together form a discordant alignment for the read
+ ALN_FLAG_PAIR_DISCORD_MATE1 = 3,
+ ALN_FLAG_PAIR_DISCORD_MATE2 = 4,
+
+ // Unpaired alignment of a mate that belongs to a paired read; usually
+ // this happens because the read had no reportable paired-end
+ // alignments
+ ALN_FLAG_PAIR_UNPAIRED_MATE1 = 5,
+ ALN_FLAG_PAIR_UNPAIRED_MATE2 = 6,
+
+ // Unpaired alignment of an unpaired read (largest value in this enum)
+ ALN_FLAG_PAIR_UNPAIRED = 7
+};
+
+/**
+ * Encapsulates some general information about an alignment that doesn't belong
+ * in AlnRes. Specifically:
+ *
+ * 1. Whether the alignment is paired
+ * 2. If it's paired, whether it's concordant or discordant
+ * 3. Whether this alignment was found after the paired-end categories were
+ * maxed out
+ * 4. Whether the relevant unpaired category was maxed out
+ */
+class AlnFlags {
+
+public:
+
+ AlnFlags() { // default: unpaired pairing, every boolean flag false
+ init(
+ ALN_FLAG_PAIR_UNPAIRED,
+ false, // canMax
+ false, // maxed
+ false, // maxedPair
+ false, // nfilt
+ false, // scfilt
+ false, // lenfilt
+ false, // qcfilt
+ false, // mixedMode
+ false, // primary
+ false, // oppAligned
+ false); // oppFw
+ }
+
+ AlnFlags(
+ int pairing,
+ bool canMax,
+ bool maxed,
+ bool maxedPair,
+ bool nfilt,
+ bool scfilt,
+ bool lenfilt,
+ bool qcfilt,
+ bool mixedMode,
+ bool primary,
+ bool oppAligned, // opposite mate aligned?
+ bool oppFw) // opposite mate aligned forward?
+ {
+ init(pairing, canMax, maxed, maxedPair, nfilt, scfilt,
+ lenfilt, qcfilt, mixedMode, primary, oppAligned, oppFw);
+ }
+
+ /**
+ * Initialize all settings from the given values; oppFw is accepted but not stored.
+ */
+ void init(
+ int pairing,
+ bool canMax,
+ bool maxed,
+ bool maxedPair,
+ bool nfilt,
+ bool scfilt,
+ bool lenfilt,
+ bool qcfilt,
+ bool mixedMode,
+ bool primary,
+ bool oppAligned,
+ bool oppFw)
+ {
+ assert_gt(pairing, 0);
+ assert_leq(pairing, ALN_FLAG_PAIR_UNPAIRED);
+ pairing_ = pairing;
+ canMax_ = canMax;
+ maxed_ = maxed;
+ maxedPair_ = maxedPair;
+ nfilt_ = nfilt;
+ scfilt_ = scfilt;
+ lenfilt_ = lenfilt;
+ qcfilt_ = qcfilt;
+ mixedMode_ = mixedMode;
+ primary_ = primary;
+ oppAligned_ = oppAligned;
+ }
+
+ /**
+ * Return true iff this alignment is from a paired-end read.
+ */
+ bool partOfPair() const {
+ assert_gt(pairing_, 0);
+ return pairing_ < ALN_FLAG_PAIR_UNPAIRED;
+ }
+
+#ifndef NDEBUG
+ /**
+ * Check that the flags are internally consistent.
+ */
+ bool repOk() const {
+ assert(partOfPair() || !maxedPair_);
+ return true;
+ }
+#endif
+
+ /**
+ * Print out string representation of YF:i flag for indicating whether and
+ * why the mate was filtered.
+ */
+ bool printYF(BTString& o, bool first) const;
+
+ /**
+ * Print out string representation of YM:i flag for indicating whether the
+ * mate per se aligned repetitively.
+ */
+ void printYM(BTString& o) const;
+
+ /**
+ * Print out string representation of YP:i flag for indicating whether the
+ * pair containing the mate aligned repetitively.
+ */
+ void printYP(BTString& o) const;
+
+ /**
+ * Print out string representation of these flags.
+ */
+ void printYT(BTString& o) const;
+
+ inline int pairing() const { return pairing_; }
+ inline bool maxed() const { return maxed_; }
+ inline bool maxedPair() const { return maxedPair_; }
+
+ /**
+ * Return the primary flag: true iff this alignment has been marked as the
+ * primary alignment (see setPrimary).
+ */
+ inline bool isPrimary() const {
+ return primary_;
+ }
+
+ /**
+ * Set the primary flag.
+ */
+ void setPrimary(bool primary) {
+ primary_ = primary;
+ }
+
+ /**
+ * Return whether both paired and unpaired alignments are considered for
+ * pairs & their constituent mates
+ */
+ inline bool isMixedMode() const {
+ return mixedMode_;
+ }
+
+ /**
+ * Return true iff the alignment params are such that it's possible for a
+ * read to be suppressed for being repetitive.
+ */
+ inline bool canMax() const {
+ return canMax_;
+ }
+
+ /**
+ * Return true iff the alignment was filtered out, i.e. any of the four *filt_ flags is false.
+ */
+ bool filtered() const {
+ return !nfilt_ || !scfilt_ || !lenfilt_ || !qcfilt_;
+ }
+
+ /**
+ * Return true iff the read is mate #1 of a pair, regardless of whether it
+ * aligned as a pair.
+ */
+ bool readMate1() const {
+ return pairing_ == ALN_FLAG_PAIR_CONCORD_MATE1 ||
+ pairing_ == ALN_FLAG_PAIR_DISCORD_MATE1 ||
+ pairing_ == ALN_FLAG_PAIR_UNPAIRED_MATE1;
+ }
+
+ /**
+ * Return true iff the read is mate #2 of a pair, regardless of whether it
+ * aligned as a pair.
+ */
+ bool readMate2() const {
+ return pairing_ == ALN_FLAG_PAIR_CONCORD_MATE2 ||
+ pairing_ == ALN_FLAG_PAIR_DISCORD_MATE2 ||
+ pairing_ == ALN_FLAG_PAIR_UNPAIRED_MATE2;
+ }
+
+ /**
+ * Return true iff the read aligned as either mate of a concordant pair.
+ */
+ bool alignedConcordant() const {
+ return pairing_ == ALN_FLAG_PAIR_CONCORD_MATE1 ||
+ pairing_ == ALN_FLAG_PAIR_CONCORD_MATE2;
+ }
+
+ /**
+ * Return true iff the read aligned as either mate of a discordant pair.
+ */
+ bool alignedDiscordant() const {
+ return pairing_ == ALN_FLAG_PAIR_DISCORD_MATE1 ||
+ pairing_ == ALN_FLAG_PAIR_DISCORD_MATE2;
+ }
+
+ /**
+ * Return true iff the read aligned as either mate of a pair, concordant or
+ * discordant.
+ */
+ bool alignedPaired() const {
+ return alignedConcordant() || alignedDiscordant();
+ }
+
+ /**
+ * Return true iff the read aligned as an unpaired read.
+ */
+ bool alignedUnpaired() const {
+ return pairing_ == ALN_FLAG_PAIR_UNPAIRED;
+ }
+
+ /**
+ * Return true iff the read aligned as an unpaired mate from a paired read.
+ */
+ bool alignedUnpairedMate() const {
+ return pairing_ == ALN_FLAG_PAIR_UNPAIRED_MATE1 ||
+ pairing_ == ALN_FLAG_PAIR_UNPAIRED_MATE2;
+ }
+
+ bool mateAligned() const { // true iff the opposite mate aligned (oppAligned_)
+ return oppAligned_;
+ }
+
+protected:
+
+ // See ALN_FLAG_PAIR_* above
+ int pairing_;
+
+ // True iff the alignment params are such that it's possible for a read to
+ // be suppressed for being repetitive
+ bool canMax_;
+
+ // This alignment is sampled from among many alignments that, taken
+ // together, cause this mate to align non-uniquely
+ bool maxed_;
+
+ // The paired-end read of which this mate is part has repetitive concordant
+ // alignments
+ bool maxedPair_;
+
+ bool nfilt_; // read/mate filtered b/c proportion of Ns exceeded ceil
+ bool scfilt_; // read/mate filtered b/c length can't provide min score
+ bool lenfilt_; // read/mate filtered b/c less than or equal to seed mms
+ bool qcfilt_; // read/mate filtered by upstream qc
+
+ // Whether both paired and unpaired alignments are considered for pairs &
+ // their constituent mates
+ bool mixedMode_;
+
+ // The read is the primary read
+ bool primary_;
+
+ // True iff the opposite mate aligned
+ bool oppAligned_;
+};
+
+static inline ostream& operator<<(ostream& os, const AlnScore& o) {
+ // Only the raw score() value is written; the stream is returned for chaining.
+ return os << o.score();
+}
+
+// Forward declaration
+class BitPairReference;
+
+// A given AlnRes can be one of these five types
+enum {
+ ALN_RES_TYPE_UNPAIRED = 1, // unpaired alignment
+ ALN_RES_TYPE_UNPAIRED_MATE1 = 2, // mate #1 in pair, aligned unpaired
+ ALN_RES_TYPE_UNPAIRED_MATE2 = 3, // mate #2 in pair, aligned unpaired
+ ALN_RES_TYPE_MATE1 = 4, // mate #1 in paired-end alignment
+ ALN_RES_TYPE_MATE2 = 5 // mate #2 in paired-end alignment
+};
+
+/**
+ * Seed alignment summary: a bundle of size_t fields, all zeroed by reset().
+ */
+struct SeedAlSumm {
+
+ SeedAlSumm() { reset(); }
+
+ void reset() {
+ nonzTot = 0; nonzFw = 0; nonzRc = 0;
+ nrangeTot = 0; nrangeFw = 0; nrangeRc = 0;
+ neltTot = 0; neltFw = 0; neltRc = 0;
+ minNonzRangeFw = 0; minNonzRangeRc = 0;
+ maxNonzRangeFw = 0; maxNonzRangeRc = 0;
+ minNonzEltFw = 0; minNonzEltRc = 0;
+ maxNonzEltFw = 0; maxNonzEltRc = 0;
+ }
+
+ size_t nonzTot;
+ size_t nonzFw;
+ size_t nonzRc;
+
+ size_t nrangeTot;
+ size_t nrangeFw;
+ size_t nrangeRc;
+
+ size_t neltTot;
+ size_t neltFw;
+ size_t neltRc;
+
+ size_t minNonzRangeFw;
+ size_t minNonzRangeRc;
+
+ size_t maxNonzRangeFw;
+ size_t maxNonzRangeRc;
+
+ size_t minNonzEltFw;
+ size_t minNonzEltRc;
+
+ size_t maxNonzEltFw;
+ size_t maxNonzEltRc;
+};
+
+/**
+ * Encapsulates a stacked alignment, a nice intermediate format for alignments
+ * from which to left-align gaps, print CIGAR strings, and print MD:Z strings.
+ */
+class StackedAln {
+
+public:
+
+ StackedAln() :
+ stackRef_(RES_CAT),
+ stackRel_(RES_CAT),
+ stackSNP_(RES_CAT),
+ stackRead_(RES_CAT),
+ stackSkip_(RES_CAT),
+ cigOp_(RES_CAT),
+ cigRun_(RES_CAT),
+ mdzOp_(RES_CAT),
+ mdzChr_(RES_CAT),
+ mdzRun_(RES_CAT)
+ {
+ reset();
+ }
+
+ /**
+ * Reset to an uninitialized state.
+ */
+ void reset() {
+ inited_ = false;
+ trimLS_ = trimLH_ = trimRS_ = trimRH_ = 0;
+ stackRef_.clear();
+ stackRel_.clear();
+ stackSNP_.clear();
+ stackRead_.clear();
+ stackSkip_.clear();
+ cigDistMm_ = cigCalc_ = false;
+ cigOp_.clear();
+ cigRun_.clear();
+ mdzCalc_ = false;
+ mdzOp_.clear();
+ mdzChr_.clear();
+ mdzRun_.clear();
+ }
+
+ /**
+ * Return true iff the stacked alignment has been initialized.
+ */
+ bool inited() const { return inited_; }
+
+ /**
+ * Initialized the stacked alignment with respect to a read string, a list of
+ * edits (expressed left-to-right), and integers indicating how much hard and
+ * soft trimming has occurred on either end of the read.
+ *
+ * s: read sequence
+ * ed: all relevant edits, including ambiguous nucleotides
+ * trimLS: # bases soft-trimmed from LHS
+ * trimLH: # bases hard-trimmed from LHS
+ * trimRS: # bases soft-trimmed from RHS
+ * trimRH: # bases hard-trimmed from RHS
+ */
+ void init(
+ const BTDnaString& s,
+ const EList& ed,
+ size_t trimLS,
+ size_t trimLH,
+ size_t trimRS,
+ size_t trimRH);
+
+ /**
+ * Left-align all the gaps. If this changes the alignment and the CIGAR or
+ * MD:Z strings have already been calculated, this renders them invalid.
+ *
+ * We left-align gaps with in the following way: for each gap, we check
+ * whether the character opposite the rightmost gap character is the same
+ * as the character opposite the character just to the left of the gap. If
+ * this is the case, we can slide the gap to the left and make the
+ * rightmost position previously covered by the gap into a non-gap.
+ *
+ * This scheme allows us to push the gap past a mismatch. BWA does seem to
+ * allow this. It's not clear that Bowtie 2 should, since moving the
+ * mismatch could cause a mismatch with one base quality to be replaced
+ * with a mismatch with a different base quality.
+ */
+ void leftAlign(bool pastMms);
+
+ /**
+ * Build the CIGAR list, if it hasn't already been built. Returns true iff it
+ * was built for the first time.
+ */
+ bool buildCigar(bool xeq);
+
+ /**
+ * Build the MD:Z list, if it hasn't already been built. Returns true iff it
+ * was built for the first time.
+ */
+ bool buildMdz();
+
+ /**
+ * Write a CIGAR representation of the alignment to the given string and/or
+ * char buffer.
+ */
+ void writeCigar(BTString* o, char* oc) const;
+
+ /**
+ * Write a CIGAR representation of the alignment to the given string and/or
+ * char buffer. This function is for HISAT-3N.
+ */
+ void writeCigar(Alignment* o, char* oc) const;
+
+ /**
+ * Write an MD:Z representation of the alignment to the given string and/or
+ * char buffer.
+ */
+ void writeMdz(BTString* o, char* oc) const;
+
+ /**
+ * Check internal consistency.
+ */
+#ifndef NDEBUG
+ bool repOk() const {
+     if(!inited_) {
+         // Nothing is checked while the object is uninitialized.
+         return true;
+     }
+     // The reference, read and relation rows must all have the same length.
+     assert_eq(stackRef_.size(), stackRead_.size());
+     assert_eq(stackRef_.size(), stackRel_.size());
+     return true;
+ }
+#endif
+
+protected:
+
+ bool inited_; // true iff stacked alignment is initialized
+
+ size_t trimLS_; // amount soft-trimmed from the LHS
+ size_t trimLH_; // amount hard-trimmed from the LHS
+ size_t trimRS_; // amount soft-trimmed from the RHS
+ size_t trimRH_; // amount hard-trimmed from the RHS
+
+ EList stackRef_; // reference characters
+ EList stackRel_; // bars relating reference to read characters
+ EList stackSNP_; // known SNP?
+ EList stackRead_; // read characters
+ EList stackSkip_;
+
+ bool cigDistMm_; // distinguish between =/X, rather than just M
+ bool cigCalc_; // whether we've calculated CIGAR ops/runs
+ EList cigOp_; // CIGAR operations
+ EList cigRun_; // CIGAR run lengths
+
+ bool mdzCalc_; // whether we've calculated MD:Z ops/runs
+ EList mdzOp_; // MD:Z operations
+ EList mdzChr_; // MD:Z operations
+ EList mdzRun_; // MD:Z run lengths
+};
+
+/**
+ * Encapsulates an alignment result. The result comprises:
+ *
+ * 1. All the nucleotide edits for both mates ('ned').
+ * 2. All "edits" where an ambiguous reference char is resolved to an
+ * unambiguous char ('aed').
+ * 3. The score for the alignment, including summary information about the
+ * number of gaps and Ns involved.
+ * 4. The reference id, strand, and 0-based offset of the leftmost character
+ * involved in the alignment.
+ * 5. Information about trimming prior to alignment and whether it was hard or
+ * soft.
+ * 6. Information about trimming during alignment and whether it was hard or
+ * soft. Local-alignment trimming is usually soft when aligning nucleotide
+ * reads.
+ *
+ * Note that the AlnRes, together with the Read and an AlnSetSumm (*and* the
+ * opposite mate's AlnRes and Read in the case of a paired-end alignment),
+ * should contain enough information to print an entire alignment record.
+ *
+ * TRIMMING
+ *
+ * Accounting for trimming is tricky. Trimming affects:
+ *
+ * 1. The values of the trim* and pretrim* fields.
+ * 2. The offsets of the Edits in the ELists.
+ * 3. The read extent, if the trimming is soft.
+ * 4. The read extent and the read sequence and length, if trimming is hard.
+ *
+ * Handling 1. is not too difficult. 2., 3., and 4. are handled in setShape().
+ */
+class AlnRes {
+
+public:
+
+ /**
+  * Construct an empty result with no edit storage attached. The edit-list
+  * pointers, their nodes and the owning pool all start out NULL; every
+  * other field is put into its cleared state by reset().
+  */
+ AlnRes() :
+     ned_(NULL),
+     aed_(NULL),
+     ned_node_(NULL),
+     aed_node_(NULL),
+     raw_edits_(NULL)
+ {
+     reset();
+ }
+
+ /**
+  * Copy-construct from another AlnRes.
+  *
+  * All value fields are copied member by member. If 'other' is attached to
+  * an edit pool (raw_edits_ != NULL), two fresh nodes are taken from that
+  * same pool and the contents of other's edit lists are copied into them;
+  * otherwise this object is left with no edit storage.
+  */
+ AlnRes(const AlnRes& other) :
+     ned_(NULL),
+     aed_(NULL),
+     ned_node_(NULL),
+     aed_node_(NULL),
+     raw_edits_(NULL)
+ {
+     shapeSet_ = other.shapeSet_;
+     rdlen_ = other.rdlen_;
+     rdid_ = other.rdid_;
+     rdrows_ = other.rdrows_;
+     score_ = other.score_;
+     oscore_ = other.oscore_;
+     refcoord_ = other.refcoord_;
+     reflen_ = other.reflen_;
+     refival_ = other.refival_;
+     rdextent_ = other.rdextent_;
+     rdexrows_ = other.rdexrows_;
+     rfextent_ = other.rfextent_;
+     seedmms_ = other.seedmms_;
+     seedlen_ = other.seedlen_;
+     // seedival_ must be copied too: operator== compares it and seedival()
+     // exposes it, so leaving it out would hand back an uninitialized value.
+     seedival_ = other.seedival_;
+     minsc_ = other.minsc_;
+     nuc5p_ = other.nuc5p_;
+     nuc3p_ = other.nuc3p_;
+     refns_ = other.refns_;
+     type_ = other.type_;
+     fraglenSet_ = other.fraglenSet_;
+     fraglen_ = other.fraglen_;
+     pretrimSoft_ = other.pretrimSoft_;
+     pretrim5p_ = other.pretrim5p_;
+     pretrim3p_ = other.pretrim3p_;
+     trimSoft_ = other.trimSoft_;
+     trim5p_ = other.trim5p_;
+     trim3p_ = other.trim3p_;
+     repeat_ = other.repeat_;
+
+     num_spliced_ = other.num_spliced_;
+     raw_edits_ = other.raw_edits_;
+     if(raw_edits_ != NULL) {
+         assert(ned_ == NULL && aed_ == NULL);
+         assert(ned_node_ == NULL && aed_node_ == NULL);
+         // Take our own nodes from the shared pool; never alias other's lists.
+         ned_node_ = raw_edits_->new_node();
+         aed_node_ = raw_edits_->new_node();
+         assert(ned_node_ != NULL && aed_node_ != NULL);
+         ned_ = &(ned_node_->payload);
+         aed_ = &(aed_node_->payload);
+         assert(other.ned_ != NULL && other.aed_ != NULL);
+         *ned_ = *(other.ned_);
+         *aed_ = *(other.aed_);
+     }
+ }
+
+ /**
+  * Assign from another AlnRes.
+  *
+  * All value fields are copied member by member. Edit storage is handled as
+  * follows: if this object already owns edit lists they are cleared and
+  * reused; otherwise, if 'other' is attached to an edit pool, two fresh
+  * nodes are taken from that pool. The contents of other's edit lists are
+  * then copied in when 'other' has any. Self-assignment is a no-op.
+  */
+ AlnRes& operator=(const AlnRes& other) {
+     if(this == &other) return *this;
+     shapeSet_ = other.shapeSet_;
+     rdlen_ = other.rdlen_;
+     rdid_ = other.rdid_;
+     rdrows_ = other.rdrows_;
+     score_ = other.score_;
+     oscore_ = other.oscore_;
+     refcoord_ = other.refcoord_;
+     reflen_ = other.reflen_;
+     refival_ = other.refival_;
+     rdextent_ = other.rdextent_;
+     rdexrows_ = other.rdexrows_;
+     rfextent_ = other.rfextent_;
+     seedmms_ = other.seedmms_;
+     seedlen_ = other.seedlen_;
+     // seedival_ must be assigned too: operator== compares it and seedival()
+     // exposes it, so skipping it would leave a stale value in the target.
+     seedival_ = other.seedival_;
+     minsc_ = other.minsc_;
+     nuc5p_ = other.nuc5p_;
+     nuc3p_ = other.nuc3p_;
+     refns_ = other.refns_;
+     type_ = other.type_;
+     fraglenSet_ = other.fraglenSet_;
+     fraglen_ = other.fraglen_;
+     pretrimSoft_ = other.pretrimSoft_;
+     pretrim5p_ = other.pretrim5p_;
+     pretrim3p_ = other.pretrim3p_;
+     trimSoft_ = other.trimSoft_;
+     trim5p_ = other.trim5p_;
+     trim3p_ = other.trim3p_;
+     repeat_ = other.repeat_;
+
+     num_spliced_ = other.num_spliced_;
+     // Both sides must draw from the same pool (or this side from none yet).
+     assert(raw_edits_ == NULL || raw_edits_ == other.raw_edits_);
+     raw_edits_ = other.raw_edits_;
+     if(ned_ != NULL) {
+         // Reuse the lists we already own.
+         assert(aed_ != NULL);
+         ned_->clear();
+         aed_->clear();
+     } else if(raw_edits_ != NULL) {
+         // No storage yet: take two nodes from the pool.
+         assert(aed_ == NULL);
+         assert(ned_node_ == NULL && aed_node_ == NULL);
+         ned_node_ = raw_edits_->new_node();
+         aed_node_ = raw_edits_->new_node();
+         assert(ned_node_ != NULL && aed_node_ != NULL);
+         ned_ = &(ned_node_->payload);
+         aed_ = &(aed_node_->payload);
+     }
+
+     if(other.ned_ != NULL) {
+         assert(other.aed_ != NULL);
+         *ned_ = *(other.ned_);
+         *aed_ = *(other.aed_);
+     }
+
+     return *this;
+ }
+
+ /**
+  * Destroy the result. If edit storage is attached, the two edit lists are
+  * cleared and their nodes are handed back to the pool they came from.
+  */
+ ~AlnRes()
+ {
+#ifndef NDEBUG
+     // Storage is all-or-nothing: either everything is NULL or nothing is.
+     if(ned_node_ == NULL || aed_node_ == NULL) {
+         assert(ned_node_ == NULL && aed_node_ == NULL);
+         assert(ned_ == NULL && aed_ == NULL);
+         assert(raw_edits_ == NULL);
+     } else {
+         assert(ned_node_ != NULL && aed_node_ != NULL);
+         assert(ned_ != NULL && aed_ != NULL);
+         assert(raw_edits_ != NULL);
+     }
+#endif
+     if(ned_ == NULL) {
+         // No edit storage was ever attached; nothing to release.
+         return;
+     }
+     ned_->clear();
+     aed_->clear();
+     raw_edits_->delete_node(ned_node_);
+     raw_edits_->delete_node(aed_node_);
+     ned_ = NULL;
+     aed_ = NULL;
+     ned_node_ = NULL;
+     aed_node_ = NULL;
+     raw_edits_ = NULL;
+ }
+
+ /* DK - temporary implementation */
+ void init_raw_edits(LinkedEList >* raw_edits) {
+ // Attach this result to an edit pool and take two nodes from it, one for
+ // the nucleotide edits (ned_) and one for the ambiguous-base edits (aed_).
+ // A NULL pool is ignored and leaves the object without edit storage.
+ // NOTE(review): the template arguments of the parameter type appear to
+ // have been lost in this copy of the file - confirm against the original.
+ if(raw_edits == NULL)
+ return;
+ raw_edits_ = raw_edits;
+ // Must not already own storage; this is a one-time attachment.
+ assert(ned_ == NULL && aed_ == NULL);
+ assert(ned_node_ == NULL && aed_node_ == NULL);
+ ned_node_ = raw_edits_->new_node();
+ aed_node_ = raw_edits_->new_node();
+ assert(ned_node_ != NULL && aed_node_ != NULL);
+ // The edit-list pointers alias the payloads of the pool nodes.
+ ned_ = &(ned_node_->payload);
+ aed_ = &(aed_node_->payload);
+ }
+
+ /**
+ * Clear all contents.
+ */
+ void reset();
+
+ /**
+ * Reverse all edit lists.
+ */
+ void reverseEdits() {
+     // Both lists are dereferenced unconditionally, so edit storage must
+     // already be attached.
+     ned_->reverse();
+     aed_->reverse();
+ }
+
+ /**
+ * Invert positions of edits so that they're with respect to the other end
+ * of the alignment. The assumption is that the .pos fields of the edits
+ * in the ned_/aed_/ced_ structures are offsets with respect to the first
+ * aligned character (i.e. after all trimming).
+ */
+ void invertEdits() {
+ // Only valid once the shape is set and the read has nonzero length/rows.
+ assert(shapeSet_);
+ assert_gt(rdlen_, 0);
+ assert_gt(rdrows_, 0);
+ // Both lists are inverted against the same extent: the number of read
+ // rows involved in the alignment (rdexrows_).
+ Edit::invertPoss(*ned_, rdexrows_, false);
+ Edit::invertPoss(*aed_, rdexrows_, false);
+ }
+
+ /**
+ * Return true iff no result has been installed.
+ */
+ bool empty() const {
+     if(VALID_AL_SCORE(score_)) {
+         // A valid score means a result has been installed.
+         return false;
+     }
+     // Without a valid score there must be no edits and no coordinates.
+     assert(ned_ == NULL || ned_->empty());
+     assert(aed_ == NULL || aed_->empty());
+     assert(!refcoord_.inited());
+     assert(!refival_.inited());
+     return true;
+ }
+
+ /**
+ * Return the identifier for the reference that the alignment
+ * occurred in.
+ */
+ inline TRefId refid() const {
+ // Only meaningful after setShape() has been called (shapeSet_ is true).
+ assert(shapeSet_);
+ return refcoord_.ref();
+ }
+
+ /**
+ * Return the orientation that the alignment occurred in.
+ */
+ inline int orient() const {
+ // Only meaningful after setShape() has been called (shapeSet_ is true).
+ assert(shapeSet_);
+ return refcoord_.orient();
+ }
+
+ /**
+ * Return the 0-based offset of the alignment into the reference
+ * sequence it aligned to.
+ */
+ inline TRefOff refoff() const {
+ // Only meaningful after setShape() has been called (shapeSet_ is true).
+ assert(shapeSet_);
+ return refcoord_.off();
+ }
+
+ /**
+ * Set arguments to coordinates for the upstream-most and downstream-most
+ * reference positions involved in the alignment.
+ */
+ inline void getCoords(
+     Coord& st, // out: install starting coordinate here
+     Coord& en, // out: install ending coordinate here
+     Coord& st2,
+     Coord& en2)
+     const
+ {
+     assert(shapeSet_);
+     // First pair is anchored at the stored leftmost coordinate and spans
+     // refExtent() positions to the right of it.
+     st.init(refcoord_);
+     en.init(refcoord_);
+     en.adjustOff(refExtent() - 1);
+     // Second pair is anchored at refcoord_right() and spans refExtent()
+     // positions to the left of it.
+     const Coord rightmost = refcoord_right();
+     en2.init(rightmost);
+     st2.init(rightmost);
+     st2.adjustOff(1 - refExtent());
+ }
+
+ /**
+ * Set arguments to coordinates for the upstream-most and downstream-most
+ * reference positions covered by the read taking any read trimming into
+ * account. I.e. if the upstream-most offset involved in an alignment is
+ * 40 but the read was hard-trimmed by 5 on that end, the inferred
+ * upstream-most covered position is 35.
+ */
+ inline void getExtendedCoords(
+     Coord& st, // out: install starting coordinate here
+     Coord& en, // out: install ending coordinate here
+     Coord& st2,
+     Coord& en2)
+     const
+ {
+     getCoords(st, en, st2, en2);
+     // On the forward strand the 5' end is the left (start) side; on the
+     // reverse strand the 3' end is. Both alignment-time and pre-alignment
+     // trimming are added together for each side.
+     int64_t leftTrim;
+     int64_t rightTrim;
+     if(fw()) {
+         leftTrim = trim5p_;
+         rightTrim = trim3p_;
+         leftTrim += pretrim5p_;
+         rightTrim += pretrim3p_;
+     } else {
+         leftTrim = trim3p_;
+         rightTrim = trim5p_;
+         leftTrim += pretrim3p_;
+         rightTrim += pretrim5p_;
+     }
+     // Push both start coordinates left and both end coordinates right.
+     st.adjustOff(-leftTrim);
+     st2.adjustOff(-leftTrim);
+     en.adjustOff(rightTrim);
+     en2.adjustOff(rightTrim);
+ }
+
+ /**
+ * Set the upstream-most reference offset involved in the alignment, and
+ * the extent of the alignment (w/r/t the reference)
+ */
+ void setShape(
+ TRefId id, // id of reference aligned to
+ TRefOff off, // offset of first aligned char into ref seq
+ TRefOff reflen, // length of reference sequence aligned to
+ bool fw, // aligned to Watson strand?
+ size_t rdlen, // length of read after hard trimming, before soft
+ TReadId rdid, // read ID
+ bool pretrimSoft, // whether trimming prior to alignment was soft
+ size_t pretrim5p, // # poss trimmed from 5p end before alignment
+ size_t pretrim3p, // # poss trimmed from 3p end before alignment
+ bool trimSoft, // whether local-alignment trimming was soft
+ size_t trim5p, // # poss trimmed from 5p end during alignment
+ size_t trim3p); // # poss trimmed from 3p end during alignment
+
+ /**
+ * Return true iff the reference chars involved in this alignment result
+ * are entirely within with given bounds.
+ */
+ bool within(
+     TRefId id,
+     TRefOff off,
+     bool fw,
+     size_t extent) const
+ {
+     // Wrong reference or wrong strand can never be "within".
+     if(refcoord_.ref() != id || refcoord_.fw() != fw) {
+         return false;
+     }
+     // The alignment must start at or after 'off' and end at or before
+     // 'off + extent'.
+     return refcoord_.off() >= off &&
+            refcoord_.off() + refExtent() <= off + extent;
+ }
+
+ /**
+ * Set alignment score for this alignment.
+ */
+ void setScore(AlnScore score) {
+     // Only score_ is replaced; the opposite mate's score (oscore_) is not.
+     score_ = score;
+ }
+
+ /**
+ * Set the upstream-most and downstream-most nucleotides.
+ */
+ void setNucs(bool fw, int nup, int ndn) {
+     if(fw) {
+         // Forward strand: upstream is the 5' end.
+         nuc5p_ = nup;
+         nuc3p_ = ndn;
+     } else {
+         // Reverse strand: upstream is the 3' end.
+         nuc5p_ = ndn;
+         nuc3p_ = nup;
+     }
+ }
+
+ /**
+ * Return the 0-based offset of the leftmost reference position involved in
+ * the alignment.
+ */
+ const Coord& refcoord() const { return refcoord_; } // read-only view of the stored coordinate
+
+ /**
+ * Return the reference interval (coordinate plus length) covered by the
+ * alignment.
+ */
+ const Interval& refival() const { return refival_; } // read-only view of the stored interval
+
+ /**
+ * Return the 0-based offset of the leftmost reference position involved in
+ * the alignment.
+ */
+ Coord& refcoord() { return refcoord_; } // non-const overload: the stored coordinate can be modified through it
+
+ /**
+ * Return the 0-based offset of the rightmost reference position involved in
+ * the alignment.
+ */
+ Coord refcoord_right() const {
+     // Start from the last position covered when there are no splices...
+     TRefOff rightmost = refcoord_.off() + rfextent_ - 1;
+     // ...then push it right by the length of every splice edit.
+     const size_t numEdits = ned_->size();
+     for(size_t e = 0; e < numEdits; e++) {
+         const Edit& cur = (*ned_)[e];
+         if(cur.type != EDIT_TYPE_SPL) {
+             continue;
+         }
+         rightmost += cur.splLen;
+     }
+     // Same reference and orientation as the left coordinate, new offset.
+     Coord result = refcoord_;
+     result.setOff(rightmost);
+     return result;
+ }
+
+ /**
+ * Return true if this alignment is to the Watson strand.
+ */
+ inline bool fw() const { return refcoord_.fw(); } // strand is stored in the reference coordinate
+
+ // Score of this alignment and score recorded for the opposite mate.
+ AlnScore score() const { return score_; }
+ AlnScore oscore() const { return oscore_; }
+ // Edit lists. These dereference ned_/aed_ without a NULL check, so they may
+ // only be called once edit storage has been attached.
+ EList& ned() { return *ned_; }
+ EList& aed() { return *aed_; }
+ const EList& ned() const { return *ned_; }
+ const EList& aed() const { return *aed_; }
+ // Read extent (in characters and in rows), read length and read id.
+ size_t readExtent() const { return rdextent_; }
+ size_t readExtentRows() const { return rdexrows_; }
+ size_t readLength() const { return rdlen_; }
+ TReadId readID() const { return rdid_; }
+ // An alignment counts as spliced when num_spliced_ is nonzero.
+ bool spliced() const { return num_spliced_ > 0; }
+ size_t num_spliced() const { return num_spliced_; }
+ /**
+ * Return the splice direction shared by the splice edits of this
+ * alignment, or SPL_UNKNOWN if the alignment is not spliced, if no splice
+ * edit has a known direction, or if the splice edits disagree.
+ *
+ * SPL_FW and SPL_SEMI_FW are treated as one family, SPL_RC and SPL_SEMI_RC
+ * as the other. The returned value is the direction of the first splice
+ * edit whose direction is known.
+ */
+ uint8_t spliced_whichsense_transcript() const {
+ uint8_t whichsense = SPL_UNKNOWN;
+ if(spliced()) {
+ for(size_t i = 0; i < ned_->size(); i++) {
+ const Edit& ed = (*ned_)[i];
+ // Only splice edits carry a direction.
+ if(ed.type != EDIT_TYPE_SPL) continue;
+ if(whichsense == SPL_UNKNOWN) {
+ // Nothing known yet: adopt this edit's direction (may itself be unknown).
+ whichsense = ed.splDir;
+ } else if(ed.splDir != SPL_UNKNOWN) {
+ assert_neq(whichsense, SPL_UNKNOWN);
+ // A later edit from the other family makes the result unknown
+ // and ends the scan.
+ if(whichsense == SPL_FW || whichsense == SPL_SEMI_FW) {
+ if(ed.splDir != SPL_FW && ed.splDir != SPL_SEMI_FW) {
+ whichsense = SPL_UNKNOWN;
+ break;
+ }
+ }
+ if(whichsense == SPL_RC || whichsense == SPL_SEMI_RC) {
+ if(ed.splDir != SPL_RC && ed.splDir != SPL_SEMI_RC) {
+ whichsense = SPL_UNKNOWN;
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ return whichsense;
+ }
+
+ /**
+ * Return the number of reference nucleotides involved in the alignment
+ * (i.e. the number of characters in the inclusive range from the first
+ * matched-up ref char to the last).
+ */
+ size_t refExtent() const { return rfextent_; } // stored value; not recomputed here
+
+ /**
+ * Return length of reference sequence aligned to.
+ */
+ TRefOff reflen() const { return reflen_; } // stored value; not recomputed here
+
+ /**
+ * Return the number of reference nucleotides in the alignment (i.e. the
+ * number of characters in the inclusive range from the first matched-up
+ * ref char to the last).
+ */
+ size_t refNucExtent() const { return rfextent_; } // same stored value that refExtent() returns
+
+ /**
+ * Print the sequence for the read that aligned using A, C, G and
+ * T. This will simply print the read sequence (or its reverse
+ * complement).
+ */
+ void printSeq(
+ const Read& rd,
+ const BTDnaString* dns,
+ BTString& o) const;
+
+ /**
+ * Print the quality string for the read that aligned. This will
+ * simply print the read qualities (or their reverse).
+ */
+ void printQuals(
+ const Read& rd,
+ const BTString* dqs,
+ BTString& o) const;
+
+ /**
+ * Print a stacked alignment with the reference on top, query on bottom,
+ * and lines connecting matched-up positions.
+ */
+ void printStacked(
+     const Read& rd,
+     std::ostream& o) const
+ {
+     // Use the read sequence for the strand the alignment is on, then
+     // delegate to the sequence-based overload.
+     if(refcoord_.fw()) {
+         printStacked(rd.patFw, o);
+     } else {
+         printStacked(rd.patRc, o);
+     }
+ }
+
+ /**
+ * Print a stacked alignment with the reference on bottom, query on top,
+ * and lines connecting matched-up positions.
+ */
+ void printStacked(
+     const BTDnaString& seq,
+     std::ostream& o) const
+ {
+     // The stacked rows themselves are produced by Edit::printQAlign.
+     Edit::printQAlign(o, seq, *ned_);
+     // Then a caret line and the "(ref,offset)" pair of the alignment.
+     o << "^" << std::endl
+       << "(" << refcoord_.ref() << "," << refcoord_.off() << ")" << std::endl;
+ }
+
+#ifndef NDEBUG
+ /**
+ * Check that alignment score is internally consistent.
+ */
+ bool repOk() const {
+ // Debug-only invariant check for the whole result; always returns true
+ // so that it can be wrapped in assert().
+ assert(refcoord_.repOk());
+ if(shapeSet_) {
+ // Once the shape is set the offset must lie inside the reference.
+ assert_lt(refoff(), reflen_);
+ }
+ assert(refival_.repOk());
+ // Without a valid score there must be no edits.
+ assert(VALID_AL_SCORE(score_) || ned_ == NULL || ned_->empty());
+ assert(VALID_AL_SCORE(score_) || aed_ == NULL || aed_->empty());
+ // A non-empty result must have coordinates and nonzero extents.
+ assert(empty() || refcoord_.inited());
+ assert(empty() || refival_.inited());
+ assert_geq(rdexrows_, rdextent_);
+ assert(empty() || rdextent_ > 0);
+ assert(empty() || rfextent_ > 0);
+ return true;
+ }
+
+ /**
+ * Check that alignment score is internally consistent.
+ */
+ bool repOk(const Read& rd) const {
+ // First check the nucleotide edits against the read sequence for the
+ // aligned strand, passing the soft-trimmed amounts at each end, then run
+ // the read-independent checks.
+ assert(Edit::repOk(*ned_, refcoord_.fw() ? rd.patFw : rd.patRc,
+ refcoord_.fw(), trimmed5p(true), trimmed3p(true)));
+ return repOk();
+ }
+#endif
+
+#ifndef NDEBUG
+ /**
+ * Assuming this AlnRes is an alignment for 'rd', check that the
+ * alignment and 'rd' are compatible with the corresponding
+ * reference sequence.
+ */
+ bool matchesRef(
+ const Read& rd,
+ const BitPairReference& ref,
+ BTDnaString& rf,
+ BTDnaString& rdseq,
+ BTString& qseq,
+ SStringExpandable& raw_refbuf,
+ SStringExpandable& destU32,
+ EList& matches,
+ SStringExpandable& raw_refbuf2,
+ EList& reflens,
+ EList& refoffs);
+#endif
+
+ /**
+ * Set information about the alignment parameters that led to this
+ * alignment.
+ */
+ void setParams(
+     int seedmms,
+     int seedlen,
+     int seedival,
+     int64_t minsc)
+ {
+     // Minimum score threshold.
+     minsc_ = minsc;
+     // Seed settings: mismatches allowed, length, and interval.
+     seedmms_ = seedmms;
+     seedlen_ = seedlen;
+     seedival_ = seedival;
+ }
+
+ // Accessors for alignment parameters
+ // These return the values last stored by setParams().
+ int seedmms() const { return seedmms_; }
+ int seedlen() const { return seedlen_; }
+ int seedival() const { return seedival_; }
+ int64_t minScore() const { return minsc_; }
+
+ /**
+ * Is the ith row from the 5' end of the DP table one of the ones
+ * soft-trimmed away by local alignment?
+ */
+ inline bool trimmedRow5p(size_t i) const {
+     // Inside the 5' trimmed prefix?
+     if(i < trim5p_) {
+         return true;
+     }
+     // Otherwise: inside the 3' trimmed suffix, measured in rows?
+     return rdrows_ - i - 1 < trim3p_;
+ }
+
+ /**
+ * Is the ith character from the 5' end of read sequence one of the ones
+ * soft-trimmed away by local alignment?
+ */
+ inline bool trimmedPos5p(size_t i) const {
+     // Inside the 5' trimmed prefix?
+     if(i < trim5p_) {
+         return true;
+     }
+     // Otherwise: inside the 3' trimmed suffix, measured in read characters?
+     return rdlen_ - i - 1 < trim3p_;
+ }
+
+ /**
+ * Is the ith row from the 5' end of the DP table one of the ones that
+ * survived local-alignment soft trimming?
+ */
+ inline bool alignedRow5p(size_t i) const {
+     // A row survived trimming exactly when it was not trimmed.
+     const bool wasTrimmed = trimmedRow5p(i);
+     return !wasTrimmed;
+ }
+
+ /**
+ * Is the ith character from the 5' end of the read sequence one of the
+ * ones that survived local-alignment soft trimming?
+ */
+ inline bool alignedPos5p(size_t i) const {
+     // A position survived trimming exactly when it was not trimmed.
+     const bool wasTrimmed = trimmedPos5p(i);
+     return !wasTrimmed;
+ }
+
+ /**
+ * Return true iff this AlnRes and the given AlnRes overlap. Two AlnRess
+ * overlap if they share a cell in the overall dynamic programming table:
+ * i.e. if there exists a read position s.t. that position in both reads
+ * matches up with the same reference character. E.g., the following
+ * alignments (drawn schematically as paths through a dynamic programming
+ * table) are redundant:
+ *
+ * a b a b
+ * \ \ \ \
+ * \ \ \ \
+ * \ \ \ \
+ * ---\ \ \
+ * \ ---\---
+ * ---\ \ \
+ * \ \ \ \
+ * \ \ \ \
+ * \ \ \ \
+ * a b b a
+ *
+ * We iterate over each read position that hasn't been hard-trimmed, but
+ * only overlaps at positions that have also not been soft-trimmed are
+ * considered.
+ */
+ bool overlap(AlnRes& res);
+
+ /**
+ * Return true iff this read was unpaired to begin with.
+ */
+ inline bool readUnpaired() const {
+     assert_gt(type_, 0);
+     const bool unpaired = (type_ == ALN_RES_TYPE_UNPAIRED);
+     return unpaired;
+ }
+
+ /**
+ * Return true iff this alignment aligned in an unpaired fashion; not part
+ * of a concordant or discordant pair.
+ */
+ inline bool alignedUnpaired() const {
+     assert_gt(type_, 0);
+     // Any of the three unpaired types qualifies.
+     if(type_ == ALN_RES_TYPE_UNPAIRED) {
+         return true;
+     }
+     if(type_ == ALN_RES_TYPE_UNPAIRED_MATE1) {
+         return true;
+     }
+     return type_ == ALN_RES_TYPE_UNPAIRED_MATE2;
+ }
+
+ /**
+ * Return true iff this alignment aligned as mate #1 or mate #2 in a pair,
+ * either concordant or discordant.
+ */
+ inline bool alignedPaired() const {
+     assert_gt(type_, 0);
+     // Either paired mate type qualifies.
+     if(type_ == ALN_RES_TYPE_MATE1) {
+         return true;
+     }
+     return type_ == ALN_RES_TYPE_MATE2;
+ }
+
+ /**
+ * Return true iff this read started as mate #1 in a pair.
+ */
+ inline bool readMate1() const {
+     assert_gt(type_, 0);
+     // Mate #1 whether it ended up aligned in a pair or on its own.
+     if(type_ == ALN_RES_TYPE_MATE1) {
+         return true;
+     }
+     return type_ == ALN_RES_TYPE_UNPAIRED_MATE1;
+ }
+
+ /**
+ * Return true iff this read aligned as mate #1 in a concordant or
+ * discordant pair.
+ */
+ inline bool alignedMate1() const {
+     assert_gt(type_, 0);
+     const bool pairedMate1 = (type_ == ALN_RES_TYPE_MATE1);
+     return pairedMate1;
+ }
+
+ /**
+ * Return true iff this read started as mate #2 in a pair.
+ */
+ inline bool readMate2() const {
+     assert_gt(type_, 0);
+     // Mate #2 whether it ended up aligned in a pair or on its own.
+     if(type_ == ALN_RES_TYPE_MATE2) {
+         return true;
+     }
+     return type_ == ALN_RES_TYPE_UNPAIRED_MATE2;
+ }
+
+ /**
+ * Return true iff this read aligned as mate #2 in a concordant or
+ * discordant pair.
+ */
+ inline bool alignedMate2() const {
+     assert_gt(type_, 0);
+     const bool pairedMate2 = (type_ == ALN_RES_TYPE_MATE2);
+     return pairedMate2;
+ }
+
+ /**
+ * Return true iff fragment length is set.
+ */
+ bool isFraglenSet() const { return fraglenSet_; } // set to true by setFragmentLength()
+
+ /**
+ * Set whether this alignment is unpaired, or is mate #1 or mate #2 in a
+ * paired-end alignment.
+ */
+ void setMateParams(
+     int type,
+     const AlnRes* omate, // alignment result for the opposite mate
+     const AlnFlags& flags, // flags for this mate
+     const SpliceSiteDB* ssdb = NULL, // splice sites
+     uint64_t threads_rids_mindist = 0,
+     EList* spliceSites = NULL)
+ {
+     assert_gt(type, 0);
+     type_ = type;
+     fraglen_ = 0;
+     if(omate == NULL) {
+         // No opposite mate: nothing more to record.
+         return;
+     }
+     oscore_ = omate->score_;
+     // When should we calculate a fragment length here? There are a
+     // couple reasonable ideas:
+     // 1. When mates align concordantly
+     // 2. When both mates align to the same reference string
+     // BWA seems to do 2., so that's what we'll do here.
+     const bool sameRef = (refcoord_.ref() == omate->refcoord_.ref());
+     if(sameRef || flags.alignedConcordant()) {
+         setFragmentLength(*omate, ssdb, threads_rids_mindist, spliceSites);
+     } else {
+         assert(!isFraglenSet());
+     }
+ }
+
+ /**
+ * Assuming this alignment and the given alignment are at the extreme ends
+ * of a fragment, return the length of the fragment. We take all clipping,
+ * both hard and soft, into account here. Any clipping that occurred
+ * earlier and isn't accounted for within Bowtie2 should be accounted for
+ * by the user in how they set the maximum and minimum fragment length
+ * settings.
+ */
+ int64_t setFragmentLength(const AlnRes& omate,
+ const SpliceSiteDB* ssdb = NULL, // splice sites
+ uint64_t threads_rids_mindist = 0,
+ EList* spliceSites = NULL) {
+ // Trimming-extended coordinates for this mate (st, en, st2, en2) and for
+ // the opposite mate (ost, oen, ost2, oen2). The un-suffixed pair is
+ // anchored at the left coordinate, the "2" pair at refcoord_right().
+ Coord st, en, st2, en2;
+ Coord ost, oen, ost2, oen2;
+ assert_eq(refid(), omate.refid());
+ getExtendedCoords(st, en, st2, en2);
+ omate.getExtendedCoords(ost, oen, ost2, oen2);
+ bool imUpstream = false;
+
+ // This mate is upstream if it starts further left. On a tie it is
+ // upstream when both are forward and this is mate #1, or when this mate
+ // is forward and the other is not.
+ if(st.off() < ost.off()) {
+ imUpstream = true;
+ } else if(st.off() == ost.off()) {
+ if(st.fw() && ost.fw() && readMate1()) {
+ imUpstream = true;
+ } else if(st.fw() && !ost.fw()) {
+ imUpstream = true;
+ } else {
+ imUpstream = false;
+ }
+ } else {
+ imUpstream = false;
+ }
+
+ // up/dn are the outermost fragment bounds; up_right/dn_left bound the gap
+ // between the two mates. The upstream mate contributes its right-anchored
+ // coordinates, the downstream mate its left-anchored ones.
+ TRefOff up, dn, up_right, dn_left;
+ if(imUpstream) {
+ up = std::min(st2.off(), ost.off());
+ up_right = std::min(en2.off(), oen.off());
+ dn_left = std::max(st2.off(), ost.off());
+ dn = std::max(en2.off(), oen.off());
+ } else {
+ up = std::min(st.off(), ost2.off());
+ up_right = std::min(en.off(), oen2.off());
+ dn_left = std::max(st.off(), ost2.off());
+ dn = std::max(en.off(), oen2.off());
+ }
+ assert_geq(dn, up);
+ // Look for an intron between the mates only when a splice-site database
+ // is supplied, this is not a repeat alignment, and the gap between the
+ // mates is more than 100 bases.
+ TRefOff intron_len = 0;
+ if(ssdb != NULL &&
+ !repeat() &&
+ up_right + 100 < dn_left) {
+ assert(spliceSites != NULL);
+ // The caller's list is filled from the database only if it is empty;
+ // a non-empty list is used as given.
+ if(spliceSites->size() == 0) {
+ ssdb->getRightSpliceSites(refid(), up_right, dn_left - up_right, *spliceSites);
+ }
+ for(size_t si = 0; si < spliceSites->size(); si++) {
+ const SpliceSite& ss = (*spliceSites)[si];
+ // Skip sites that did not come from a file and whose read id plus
+ // threads_rids_mindist exceeds this read's id.
+ if(!ss._fromfile && ss._readid + threads_rids_mindist > rdid_) continue;
+ // Skip sites that are not strictly inside (up, dn).
+ if(ss.left() <= up || ss.right() >= dn) continue;
+ // Keep the longest qualifying intron.
+ TRefOff tmp_intron_len = ss.intron_len();
+ if(intron_len < tmp_intron_len) {
+ intron_len = tmp_intron_len;
+ }
+ }
+ }
+ // Fragment length is the inclusive span minus the longest intron found;
+ // it is stored negated when this mate is not the upstream one.
+ fraglen_ = 1 + dn - up;
+ assert_geq(fraglen_, intron_len);
+ fraglen_ -= intron_len;
+ if(!imUpstream) {
+ fraglen_ = -fraglen_;
+ }
+ fraglenSet_ = true;
+ return fraglen_;
+ }
+
+ /**
+ * Return the fragment length previously inferred for a paired-end
+ * alignment. The value is negative when this mate is not the upstream one;
+ * a fragment length must already have been set.
+ */
+ int64_t fragmentLength() const {
+     // Requires a mate type and a previously inferred fragment length.
+     assert_gt(type_, 0);
+     assert(fraglenSet_);
+     const int64_t len = fraglen_;
+     return len;
+ }
+
+ /**
+ * Initialize new AlnRes.
+ */
+ void init(
+ size_t rdlen, // # chars after hard trimming
+ TReadId rdid, // read ID
+ AlnScore score, // alignment score
+ const EList* ned, // nucleotide edits
+ size_t ned_i, // first position to copy
+ size_t ned_n, // # positions to copy
+ const EList* aed, // ambiguous base resolutions
+ size_t aed_i, // first position to copy
+ size_t aed_n, // # positions to copy
+ Coord refcoord, // leftmost ref pos of 1st al char
+ TRefOff reflen, // length of the reference
+ LinkedEList >* raw_edits,
+ int seedmms = -1,// # seed mms allowed
+ int seedlen = -1,// seed length
+ int seedival = -1,// space between seeds
+ int64_t minsc = -1,// minimum score for valid aln
+ int nuc5p = -1,//
+ int nuc3p = -1,
+ bool pretrimSoft = false,
+ size_t pretrim5p = 0, // trimming prior to alignment
+ size_t pretrim3p = 0, // trimming prior to alignment
+ bool trimSoft = true,
+ size_t trim5p = 0, // trimming from alignment
+ size_t trim3p = 0, // trimming from alignment
+ bool repeat = false); // repeat
+
+ /**
+ * Return number of bases trimmed from the 5' end. Argument determines
+ * whether we're counting hard- or soft-trimmed bases.
+ */
+ size_t trimmed5p(bool soft) const {
+     // Each trimming stage counts only if its softness matches the request.
+     const size_t fromPretrim = (pretrimSoft_ == soft) ? pretrim5p_ : 0;
+     const size_t fromAlign = (trimSoft_ == soft) ? trim5p_ : 0;
+     return fromPretrim + fromAlign;
+ }
+
+ /**
+ * Return number of bases trimmed from the 3' end. Argument determines
+ * whether we're counting hard- or soft-trimmed bases.
+ */
+ size_t trimmed3p(bool soft) const {
+     // Each trimming stage counts only if its softness matches the request.
+     const size_t fromPretrim = (pretrimSoft_ == soft) ? pretrim3p_ : 0;
+     const size_t fromAlign = (trimSoft_ == soft) ? trim3p_ : 0;
+     return fromPretrim + fromAlign;
+ }
+
+ /**
+ * Return number of bases trimmed from the left end. Argument determines
+ * whether we're counting hard- or soft-trimmed bases.
+ */
+ size_t trimmedLeft(bool soft) const {
+     // The left end is the 5' end on the forward strand, the 3' end otherwise.
+     if(fw()) {
+         return trimmed5p(soft);
+     }
+     return trimmed3p(soft);
+ }
+
+ /**
+ * Return number of bases trimmed from the right end. Argument determines
+ * whether we're counting hard- or soft-trimmed bases.
+ */
+ size_t trimmedRight(bool soft) const {
+     // The right end is the 3' end on the forward strand, the 5' end otherwise.
+     if(fw()) {
+         return trimmed3p(soft);
+     }
+     return trimmed5p(soft);
+ }
+
+ bool repeat() const {
+     // Reports the stored repeat flag.
+     return repeat_;
+ }
+
+ /**
+ * Set the number of reference Ns covered by the alignment.
+ */
+ void setRefNs(size_t refns) { refns_ = refns; } // overwrites the stored count
+
+ /**
+ * Return the number of reference Ns covered by the alignment.
+ */
+ size_t refNs() const {
+     // Reports the count last stored by setRefNs().
+     return refns_;
+ }
+
+ /**
+ * Clip away portions of the alignment that are outside the given bounds.
+ * Clipping is soft if soft == true, hard otherwise.
+ */
+ void clipOutside(bool soft, TRefOff refi, TRefOff reff);
+
+ /**
+ * Soft trim bases from the LHS of the alignment.
+ */
+ void clipLeft(size_t rd_amt, size_t rf_amt);
+
+ /**
+ * Soft trim bases from the RHS of the alignment.
+ */
+ void clipRight(size_t rd_amt, size_t rf_amt);
+
+ /**
+ * In debug mode, we put a copy of the decoded nucleotide sequence here.
+ */
+ ASSERT_ONLY(BTDnaString drd);
+
+ /**
+ * Return true iff this AlnRes should come before the given AlnRes in a
+ * prioritized list of results.
+ */
+ bool operator<(const AlnRes& o) const {
+     // Ordering looks at score_ only: this result comes first when its
+     // score compares greater than the other's.
+     const bool higherScore = score_ > o.score_;
+     return higherScore;
+ }
+
+ bool operator==(const AlnRes& o) const {
+ // Field-by-field equality. Not compared: the opposite mate's score
+ // (oscore_, deliberately commented out below) and fraglenSet_.
+ // Both edit lists are dereferenced unconditionally, so both operands must
+ // have edit storage attached.
+ return
+ shapeSet_ == o.shapeSet_ &&
+ rdlen_ == o.rdlen_ &&
+ rdid_ == o.rdid_ &&
+ rdrows_ == o.rdrows_ &&
+ score_ == o.score_ &&
+ //oscore_ == o.oscore_ &&
+ *ned_ == *(o.ned_) &&
+ *aed_ == *(o.aed_) &&
+ refcoord_ == o.refcoord_ &&
+ reflen_ == o.reflen_ &&
+ refival_ == o.refival_ &&
+ rdextent_ == o.rdextent_ &&
+ rdexrows_ == o.rdexrows_ &&
+ rfextent_ == o.rfextent_ &&
+ seedmms_ == o.seedmms_ &&
+ seedlen_ == o.seedlen_ &&
+ seedival_ == o.seedival_ &&
+ minsc_ == o.minsc_ &&
+ nuc5p_ == o.nuc5p_ &&
+ nuc3p_ == o.nuc3p_ &&
+ refns_ == o.refns_ &&
+ type_ == o.type_ &&
+ fraglen_ == o.fraglen_ &&
+ pretrimSoft_ == o.pretrimSoft_ &&
+ pretrim5p_ == o.pretrim5p_ &&
+ pretrim3p_ == o.pretrim3p_ &&
+ trimSoft_ == o.trimSoft_ &&
+ trim5p_ == o.trim5p_ &&
+ trim3p_ == o.trim3p_ &&
+ repeat_ == o.repeat_ &&
+ num_spliced_ == o.num_spliced_;
+ }
+
+ /**
+ * Initialize a StackedAln (stacked alignment) object w/r/t this alignment.
+ */
+ void initStacked(const Read& rd, StackedAln& st) const {
+ // Soft (S) and hard (H) trimming, first expressed 5'/3' and stored in the
+ // left/right variables as if the alignment were on the forward strand.
+ size_t trimLS = trimmed5p(true);
+ size_t trimLH = trimmed5p(false);
+ size_t trimRS = trimmed3p(true);
+ size_t trimRH = trimmed3p(false);
+ // Read length with the soft-trimmed bases at both ends removed.
+ size_t len_trimmed = rd.length() - trimLS - trimRS;
+ if(!fw()) {
+ // Reverse strand: flip edit positions to the other end and swap the
+ // left/right trimming amounts. This temporarily modifies *ned_ in place
+ // even though the method is const; it is flipped back below.
+ // NOTE(review): the const_cast target type appears to have been lost in
+ // this copy of the file - confirm against the original.
+ Edit::invertPoss(const_cast&>(*ned_), len_trimmed, false);
+ swap(trimLS, trimRS);
+ swap(trimLH, trimRH);
+ }
+ st.init(
+ fw() ? rd.patFw : rd.patRc,
+ *ned_, trimLS, trimLH, trimRS, trimRH);
+ if(!fw()) {
+ // Flip the edit positions back to how they were on entry.
+ Edit::invertPoss(const_cast&>(*ned_), len_trimmed, false);
+ }
+ }
+
+protected:
+
+ /**
+ * Given that rdextent_ and ned_ are already set, calculate rfextent_.
+ */
+ void calcRefExtent() {
+     assert_gt(rdextent_, 0);
+     // Start from the read extent, then correct for gaps: a reference gap
+     // consumes no reference character, a read gap consumes one extra.
+     size_t extent = rdextent_;
+     const size_t numEdits = ned_->size();
+     for(size_t e = 0; e < numEdits; e++) {
+         if((*ned_)[e].isRefGap()) {
+             extent--;
+         }
+         if((*ned_)[e].isReadGap()) {
+             extent++;
+         }
+     }
+     rfextent_ = extent;
+ }
+
+ bool shapeSet_; // true iff setShape() has been called
+ size_t rdlen_; // length of the original read
+ TReadId rdid_; // read id
+ size_t rdrows_; // # rows in alignment problem
+ AlnScore score_; // best SW score found
+ AlnScore oscore_; // score of opposite mate
+ EList* ned_; // base edits
+ EList* aed_; // ambiguous base resolutions
+ Coord refcoord_; // ref coordinates (seq idx, offset, orient)
+ TRefOff reflen_; // reference length
+ Interval refival_; // ref interval (coord + length)
+ size_t rdextent_; // number of read chars involved in alignment
+ size_t rdexrows_; // number of read rows involved in alignment
+ size_t rfextent_; // number of ref chars involved in alignment
+ int seedmms_; // number of mismatches allowed in seed
+ int seedlen_; // length of seed
+ int seedival_; // interval between seeds
+ int64_t minsc_; // minimum score
+ int nuc5p_; // 5'-most decoded base; clipped if excluding end
+ int nuc3p_; // 3'-most decoded base; clipped if excluding end
+ size_t refns_; // # of reference Ns overlapped
+ int type_; // unpaired or mate #1 or mate #2?
+ bool fraglenSet_; // true iff a fragment length has been inferred
+ int64_t fraglen_; // inferred fragment length
+
+ // A tricky aspect of trimming is that we have to decide what the units are:
+ // read positions, reference positions??? We choose read positions here.
+ // In other words, if an alignment overhangs the end of the reference and
+ // part of the overhanging portion is a reference gap, we have to make sure
+ // the trim amount reflects the number of *read characters* to trim
+ // including the character opposite the reference gap.
+
+ // Nucleotide-sequence trimming
+ bool pretrimSoft_; // trimming prior to alignment is soft?
+ size_t pretrim5p_; // # bases trimmed from 5p end prior to alignment
+ size_t pretrim3p_; // # bases trimmed from 3p end prior to alignment
+ bool trimSoft_; // trimming by local alignment is soft?
+ size_t trim5p_; // # bases trimmed from 5p end by local alignment
+ size_t trim3p_; // # bases trimmed from 3p end by local alignment
+ bool repeat_; // repeat?
+
+ size_t num_spliced_;
+ LinkedEListNode >* ned_node_;
+ LinkedEListNode >* aed_node_;
+ LinkedEList >* raw_edits_;
+};
+
+/**
+ * Unique ID for a cell in the overall DP table. This is a helpful concept
+ * because of our definition of "redundant". Two alignments are redundant iff
+ * they have at least one cell in common in the overall DP table.
+ */
+struct RedundantCell {
+
+    /**
+     * Construct the cell at reference 0, forward orientation, column 0,
+     * row 0.
+     */
+    RedundantCell() {
+        init(0, true, 0, 0);
+    }
+
+    /**
+     * Construct a cell from its four components.
+     */
+    RedundantCell(
+        TRefId rfid_,
+        bool fw_,
+        TRefOff rfoff_,
+        size_t rdoff_) :
+        rfid(rfid_),
+        fw(fw_),
+        rfoff(rfoff_),
+        rdoff(rdoff_)
+    {
+    }
+
+    /**
+     * Overwrite all four components of the cell.
+     */
+    void init(
+        TRefId rfid_,
+        bool fw_,
+        TRefOff rfoff_,
+        size_t rdoff_)
+    {
+        rdoff = rdoff_;
+        rfoff = rfoff_;
+        fw = fw_;
+        rfid = rfid_;
+    }
+
+    /**
+     * Return true iff this RedundantCell is less than the given RedundantCell.
+     * Cells are ordered by reference id, then orientation (reverse before
+     * forward), then column, then row.
+     */
+    inline bool operator<(const RedundantCell& c) const {
+        if(rfid != c.rfid) {
+            return rfid < c.rfid;
+        }
+        if(fw != c.fw) {
+            // Orientations differ: we are smaller iff the other is forward.
+            return c.fw;
+        }
+        if(rfoff != c.rfoff) {
+            return rfoff < c.rfoff;
+        }
+        return rdoff < c.rdoff;
+    }
+
+    /**
+     * Return true iff this RedundantCell is greater than the given
+     * RedundantCell, i.e. iff the given cell is less than this one.
+     */
+    inline bool operator>(const RedundantCell& c) const {
+        return c < *this;
+    }
+
+    /**
+     * Return true iff this RedundantCell is equal to the given RedundantCell,
+     * i.e. iff all four components match.
+     */
+    inline bool operator==(const RedundantCell& c) const {
+        if(rfid != c.rfid) return false;
+        if(fw != c.fw) return false;
+        if(rfoff != c.rfoff) return false;
+        return rdoff == c.rdoff;
+    }
+
+    TRefId rfid; // reference id
+    bool fw; // orientation
+    TRefOff rfoff; // column
+    size_t rdoff; // row
+};
+
+/**
+ * Encapsulates data structures and routines allowing a client to determine
+ * whether one alignment is redundant with (has a DP cell in common with) any
+ * of a set of others.
+ *
+ * Adding cells to and checking cells against this data structure can get
+ * rather slow when there are many alignments in play. Dividing the burden
+ * over read-position bins helps some.
+ */
+class RedundantAlns {
+
+public:
+
+ RedundantAlns(int cat = DP_CAT) : cells_(cat) { } // 'cat' is passed straight to the cells_ constructor
+
+ /**
+ * Empty the cell database.
+ */
+ void reset() { cells_.clear(); }
+
+ /**
+ * Resize the list of sets to npos entries (the read length) and empty each.
+ */
+ void init(size_t npos) {
+ cells_.resize(npos);
+ for(size_t i = 0; i < npos; i++) {
+ cells_[i].clear();
+ }
+ }
+
+ /**
+ * Add all of the cells involved in the given alignment to the database.
+ */
+ void add(const AlnRes& res);
+
+ /**
+ * Return true iff the given alignment has at least one cell that overlaps
+ * one of the cells in the database.
+ */
+ bool overlap(const AlnRes& res);
+
+protected:
+
+ EList > cells_; // one entry per read position; NOTE(review): template arguments look truncated here - confirm
+};
+
+typedef uint64_t TNumAlns; // unsigned 64-bit type used for alignment counts
+
+/**
+ * Encapsulates a concise summary of a set of alignment results for a
+ * given pair or mate: best and second-best scores, alignment counts and
+ * status flags. Referring to the fields of this object should provide
+ * enough information to print output records for the read.
+ */
+class AlnSetSumm {
+
+public:
+
+    /** Construct a summary in the reset() state. */
+    AlnSetSumm() { reset(); }
+
+    /**
+     * Summarize results for an unpaired read (in either rd1 or rd2) or a
+     * read pair (mate 1 in rd1, mate 2 in rd2). Forwards every argument
+     * to the init() overload with the same parameter list.
+     */
+    explicit AlnSetSumm(
+        const Read* rd1,
+        const Read* rd2,
+        const EList* rs1,
+        const EList* rs2,
+        const EList* rs1u,
+        const EList* rs2u,
+        bool exhausted1,
+        bool exhausted2,
+        TRefId orefid,
+        TRefOff orefoff,
+        bool repeat)
+    {
+        init(
+            rd1, rd2,
+            rs1, rs2, rs1u, rs2u,
+            exhausted1, exhausted2,
+            orefid, orefoff, repeat);
+    }
+
+    /**
+     * Construct directly from precomputed field values. Forwards every
+     * argument to the init() overload with the same parameter list.
+     */
+    explicit AlnSetSumm(
+        AlnScore best1,
+        AlnScore secbest1,
+        AlnScore best2,
+        AlnScore secbest2,
+        AlnScore bestPaired,
+        AlnScore secbestPaired,
+        TNumAlns other1,
+        TNumAlns other2,
+        bool paired,
+        bool exhausted1,
+        bool exhausted2,
+        TRefId orefid,
+        TRefOff orefoff,
+        bool repeat,
+        TNumAlns numAlns1,
+        TNumAlns numAlns2,
+        TNumAlns numAlnsPaired)
+    {
+        init(
+            best1, secbest1,
+            best2, secbest2,
+            bestPaired, secbestPaired,
+            other1, other2,
+            paired,
+            exhausted1, exhausted2,
+            orefid, orefoff, repeat,
+            numAlns1, numAlns2, numAlnsPaired);
+    }
+
+    /**
+     * Set to uninitialized state: scores invalidated, counts zeroed,
+     * flags cleared, orefid_/orefoff_ set to -1.
+     */
+    void reset() {
+        bestPaired_.invalidate();
+        secbestPaired_.invalidate();
+        best1_.invalidate();
+        secbest1_.invalidate();
+        best2_.invalidate();
+        secbest2_.invalidate();
+        other1_ = 0;
+        other2_ = 0;
+        paired_ = false;
+        exhausted1_ = false;
+        exhausted2_ = false;
+        orefid_ = -1;
+        orefoff_ = -1;
+        repeat_ = false;
+        numAlns1_ = 0;
+        numAlns2_ = 0;
+        numAlnsPaired_ = 0;
+    }
+
+    /**
+     * Initialize from reads and result lists; parameters are as for the
+     * constructor with the same list. Defined outside the class body.
+     */
+    void init(
+        const Read* rd1,
+        const Read* rd2,
+        const EList* rs1,
+        const EList* rs2,
+        const EList* rs1u,
+        const EList* rs2u,
+        bool exhausted1,
+        bool exhausted2,
+        TRefId orefid,
+        TRefOff orefoff,
+        bool repeat);
+
+    /**
+     * Initialize given fields: each argument is copied into the member of
+     * the same name, then repOk() is asserted.
+     */
+    void init(
+        AlnScore best1,
+        AlnScore secbest1,
+        AlnScore best2,
+        AlnScore secbest2,
+        AlnScore bestPaired,
+        AlnScore secbestPaired,
+        TNumAlns other1,
+        TNumAlns other2,
+        bool paired,
+        bool exhausted1,
+        bool exhausted2,
+        TRefId orefid,
+        TRefOff orefoff,
+        bool repeat,
+        TNumAlns numAlns1,
+        TNumAlns numAlns2,
+        TNumAlns numAlnsPaired)
+    {
+        best1_         = best1;
+        secbest1_      = secbest1;
+        best2_         = best2;
+        secbest2_      = secbest2;
+        bestPaired_    = bestPaired;
+        secbestPaired_ = secbestPaired;
+        other1_        = other1;
+        other2_        = other2;
+        paired_        = paired;
+        exhausted1_    = exhausted1;
+        exhausted2_    = exhausted2;
+        orefid_        = orefid;
+        orefoff_       = orefoff;
+        repeat_        = repeat;
+        numAlns1_      = numAlns1;
+        numAlns2_      = numAlns2;
+        numAlnsPaired_ = numAlnsPaired;
+        assert(repOk());
+    }
+
+    /**
+     * Return true iff best1_ is not a valid score, i.e. the summary holds
+     * no best alignment score in that field.
+     */
+    bool empty() const {
+        assert(repOk());
+        return !VALID_AL_SCORE(best1_);
+    }
+
+#ifndef NDEBUG
+    /**
+     * Check that the summary is internally consistent: for each mate the
+     * "other" count is non-zero exactly when the second-best score is valid.
+     */
+    bool repOk() const {
+        assert(other1_ == 0 ||  VALID_AL_SCORE(secbest1_));
+        assert(other1_ != 0 || !VALID_AL_SCORE(secbest1_));
+        assert(other2_ == 0 ||  VALID_AL_SCORE(secbest2_));
+        assert(other2_ != 0 || !VALID_AL_SCORE(secbest2_));
+        return true;
+    }
+#endif
+
+    // Read-only accessors
+    AlnScore best1()         const { return best1_;         }
+    AlnScore secbest1()      const { return secbest1_;      }
+    AlnScore best2()         const { return best2_;         }
+    AlnScore secbest2()      const { return secbest2_;      }
+    AlnScore bestPaired()    const { return bestPaired_;    }
+    AlnScore secbestPaired() const { return secbestPaired_; }
+    TNumAlns other1()        const { return other1_;        }
+    TNumAlns other2()        const { return other2_;        }
+    bool     paired()        const { return paired_;        }
+    bool     exhausted1()    const { return exhausted1_;    }
+    bool     exhausted2()    const { return exhausted2_;    }
+    TRefId   orefid()        const { return orefid_;        }
+    TRefOff  orefoff()       const { return orefoff_;       }
+    bool     repeat()        const { return repeat_;        }
+
+    TNumAlns numAlns1()      const { return numAlns1_;      }
+    TNumAlns numAlns2()      const { return numAlns2_;      }
+    TNumAlns numAlnsPaired() const { return numAlnsPaired_; }
+
+    // Setters for the alignment counts
+    void numAlns1(TNumAlns numAlns1) { numAlns1_ = numAlns1; }
+    void numAlns2(TNumAlns numAlns2) { numAlns2_ = numAlns2; }
+    void numAlnsPaired(TNumAlns numAlnsPaired) { numAlnsPaired_ = numAlnsPaired; }
+
+    /** Return the best score for the specified mate. */
+    AlnScore best(bool mate1) const {
+        if(mate1) return best1_;
+        return best2_;
+    }
+
+    /** Return the exhausted flag for the specified mate. */
+    bool exhausted(bool mate1) const {
+        if(mate1) return exhausted1_;
+        return exhausted2_;
+    }
+
+    /** Return the second-best score recorded for the specified mate. */
+    AlnScore secbestMate(bool mate1) const {
+        if(mate1) return secbest1_;
+        return secbest2_;
+    }
+
+    /**
+     * Return the second-best score for the specified mate. The per-mate
+     * score is returned whether or not the results are paired;
+     * secbestPaired_ is not consulted. This allows us to treat mates
+     * separately, so that repetitive paired-end alignments don't trump
+     * potentially unique unpaired alignments.
+     */
+    AlnScore secbest(bool mate1) const {
+        return secbestMate(mate1);
+    }
+
+protected:
+
+    AlnScore bestPaired_;    // best score, paired alignments
+    AlnScore secbestPaired_; // second-best score, paired alignments
+    AlnScore best1_;         // best score, mate 1
+    AlnScore secbest1_;      // second-best score, mate 1
+    AlnScore best2_;         // best score, mate 2
+    AlnScore secbest2_;      // second-best score, mate 2
+    TNumAlns other1_;        // # more alignments within N points of second-best
+    TNumAlns other2_;        // # more alignments within N points of second-best
+    bool     paired_;        // results are paired
+    bool     exhausted1_;    // searched exhaustively for mate 1 alignments?
+    bool     exhausted2_;    // searched exhaustively for mate 2 alignments?
+    TRefId   orefid_;        // reset() sets this to -1
+    TRefOff  orefoff_;       // reset() sets this to -1
+    bool     repeat_;        // reset() sets this to false
+
+    TNumAlns numAlns1_;      // number of alignments for mate 1 as singleton or discordantly mapped
+    TNumAlns numAlns2_;      // number of alignments for mate 2 as singleton or discordantly mapped
+    TNumAlns numAlnsPaired_; // number of concordant pair alignments
+};
+
+#endif
diff --git a/aligner_seed.cpp b/aligner_seed.cpp
new file mode 100644
index 0000000..5fe0419
--- /dev/null
+++ b/aligner_seed.cpp
@@ -0,0 +1,530 @@
+/*
+ * Copyright 2011, Ben Langmead
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "aligner_cache.h"
+#include "aligner_seed.h"
+#include "search_globals.h"
+#include "gfm.h"
+
+using namespace std;
+
+/**
+ * Build a constraint whose edits, mms, ins, dels and penalty fields are all 0.
+ */
+Constraint Constraint::exact() {
+    Constraint none;
+    none.penalty = none.dels = none.ins = none.mms = none.edits = 0;
+    return none;
+}
+
+/**
+ * Build a default-constructed constraint and set only its total penalty
+ * field to 'pen'; every other field keeps its default.
+ */
+Constraint Constraint::penaltyBased(int pen) {
+    Constraint byPenalty;
+    byPenalty.penalty = pen;
+    return byPenalty;
+}
+
+/**
+ * Build a default-constructed constraint and set only its penalty function
+ * to 'f' (a total penalty constraint related to the length of the read).
+ */
+Constraint Constraint::penaltyFuncBased(const SimpleFunc& f) {
+    Constraint byFunc;
+    byFunc.penFunc = f;
+    return byFunc;
+}
+
+/**
+ * Build a mismatch-based constraint: the mms field is set to 'mms' while
+ * edits, dels and ins are all set to 0.
+ */
+Constraint Constraint::mmBased(int mms) {
+    Constraint byMms;
+    byMms.ins = byMms.dels = byMms.edits = 0;
+    byMms.mms = mms;
+    return byMms;
+}
+
+/**
+ * Build an edit-based constraint: the edits field is set to 'edits' while
+ * dels, ins and mms are all set to 0.
+ */
+Constraint Constraint::editBased(int edits) {
+    Constraint byEdits;
+    byEdits.mms = byEdits.ins = byEdits.dels = 0;
+    byEdits.edits = edits;
+    return byEdits;
+}
+
+//
+// Some static methods for constructing some standard SeedPolicies
+//
+
+/**
+ * Given a read, seed sequence/qualities, depth and orientation, fill in the
+ * steps & zones arrays, constraints and bookkeeping fields of 'is'. Returns
+ * false iff an N in the seed cannot be charged to its zone's constraint.
+ */
+bool
+Seed::instantiate(
+ const Read& read,
+ const BTDnaString& seq, // seed read sequence
+ const BTString& qual, // seed quality sequence
+ const Scoring& pens,
+ int depth,
+ int seedoffidx,
+ int seedtypeidx, // not referenced in this function's body
+ bool fw,
+ InstantiatedSeed& is) const
+{
+ assert(overall != NULL);
+ int seedlen = len;
+ if((int)read.length() < seedlen) {
+ // Shrink seed length to fit read if necessary
+ seedlen = (int)read.length();
+ }
+ assert_gt(seedlen, 0);
+ is.steps.resize(seedlen);
+ is.zones.resize(seedlen);
+ // Fill in 'steps' and 'zones'
+ //
+ // The 'steps' list indicates which read character should be
+ // incorporated at each step of the search process. Often we will
+ // simply proceed from one end to the other, in which case the
+ // 'steps' list is ascending or descending. In some cases (e.g.
+ // the 2mm case), we might want to switch directions at least once
+ // during the search, in which case 'steps' will jump in the
+ // middle. The sign of a 'steps' element gives the direction (positive ->
+ // left-to-right) and its magnitude minus one is the offset into the seed.
+ //
+ // The 'zones' list indicates which zone constraint is active at
+ // each step. Each element of the 'zones' list is a pair; the
+ // first pair element indicates the applicable zone when
+ // considering either mismatch or delete (ref gap) events, while
+ // the second pair element indicates the applicable zone when
+ // considering insertion (read gap) events. When either pair
+ // element is a negative number, that indicates that we are about
+ // to leave the zone for good, at which point we may need to
+ // evaluate whether we have reached the zone's budget.
+ //
+ switch(type) {
+ case SEED_TYPE_EXACT: {
+ for(int k = 0; k < seedlen; k++) {
+ is.steps[k] = -(seedlen - k);
+ // Zone 0 all the way
+ is.zones[k].first = is.zones[k].second = 0;
+ }
+ break;
+ }
+ case SEED_TYPE_LEFT_TO_RIGHT: {
+ for(int k = 0; k < seedlen; k++) {
+ is.steps[k] = k+1;
+ // Zone 0 from 0 up to ceil(len/2), then 1
+ is.zones[k].first = is.zones[k].second = ((k < (seedlen+1)/2) ? 0 : 1);
+ }
+ // Zone 1 ends at the RHS
+ is.zones[seedlen-1].first = is.zones[seedlen-1].second = -1;
+ break;
+ }
+ case SEED_TYPE_RIGHT_TO_LEFT: {
+ for(int k = 0; k < seedlen; k++) {
+ is.steps[k] = -(seedlen - k);
+ // Zone 0 from 0 up to floor(len/2), then 1
+ is.zones[k].first = ((k < seedlen/2) ? 0 : 1);
+ // Inserts: Zone 0 while k < ceil(len/2)+1, then 1. NOTE(review): this comment used to say ceil(len/2)-1; confirm which is intended
+ is.zones[k].second = ((k < (seedlen+1)/2+1) ? 0 : 1);
+ }
+ is.zones[seedlen-1].first = is.zones[seedlen-1].second = -1;
+ break;
+ }
+ case SEED_TYPE_INSIDE_OUT: {
+ // Zone 0 from ceil(N/4) up to N-floor(N/4)
+ int step = 0;
+ for(int k = (seedlen+3)/4; k < seedlen - (seedlen/4); k++) {
+ is.zones[step].first = is.zones[step].second = 0;
+ is.steps[step++] = k+1;
+ }
+ // Zone 1 from N-floor(N/4) up
+ for(int k = seedlen - (seedlen/4); k < seedlen; k++) {
+ is.zones[step].first = is.zones[step].second = 1;
+ is.steps[step++] = k+1;
+ }
+ // No Zone 1 if seedlen is short (like 2)
+ //assert_eq(1, is.zones[step-1].first);
+ is.zones[step-1].first = is.zones[step-1].second = -1;
+ // Zone 2 from ((seedlen+3)/4)-1 down to 0
+ for(int k = ((seedlen+3)/4)-1; k >= 0; k--) {
+ is.zones[step].first = is.zones[step].second = 2;
+ is.steps[step++] = -(k+1);
+ }
+ assert_eq(2, is.zones[step-1].first);
+ is.zones[step-1].first = is.zones[step-1].second = -2;
+ assert_eq(seedlen, step);
+ break;
+ }
+ default:
+ throw 1; // 'type' is none of the four seed types handled above
+ }
+ // Instantiate constraints
+ for(int i = 0; i < 3; i++) {
+ is.cons[i] = zones[i];
+ is.cons[i].instantiate(read.length());
+ }
+ is.overall = *overall;
+ is.overall.instantiate(read.length());
+ // Take a sweep through the seed sequence. Consider where the Ns
+ // occur and how zones are laid out. Calculate the maximum number
+ // of positions we can jump over initially (e.g. with the ftab) and
+ // perhaps set this function's return value to false, indicating
+ // that the arrangements of Ns prevents the seed from aligning.
+ bool streak = true;
+ is.maxjump = 0;
+ bool ret = true; // never modified below; failure is reported by the early 'return false'
+ bool ltr = (is.steps[0] > 0); // true -> left-to-right
+ for(size_t i = 0; i < is.steps.size(); i++) {
+ assert_neq(0, is.steps[i]);
+ int off = is.steps[i];
+ off = abs(off)-1;
+ Constraint& cons = is.cons[abs(is.zones[i].first)];
+ int c = seq[off]; assert_range(0, 4, c);
+ int q = qual[off];
+ if(ltr != (is.steps[i] > 0) || // changed direction
+ is.zones[i].first != 0 || // changed zone
+ is.zones[i].second != 0) // changed zone
+ {
+ streak = false;
+ }
+ if(c == 4) {
+ // Induced mismatch
+ if(cons.canN(q, pens)) {
+ cons.chargeN(q, pens);
+ } else {
+ // Seed disqualified due to arrangement of Ns
+ return false;
+ }
+ }
+ if(streak) is.maxjump++;
+ }
+ is.seedoff = depth;
+ is.seedoffidx = seedoffidx;
+ is.fw = fw;
+ is.s = *this;
+ return ret;
+}
+
+/**
+ * Append to 'pols' a set consisting of 1 seed of length 'ln' encapsulating
+ * an exact matching strategy; 'oall' becomes its overall constraint.
+ */
+void
+Seed::zeroMmSeeds(int ln, EList& pols, Constraint& oall) {
+    oall.init();
+    // Only policy: exact match, so every zone forbids edits
+    pols.expand();
+    pols.back().overall = &oall;
+    pols.back().type = SEED_TYPE_EXACT;
+    pols.back().len = ln;
+    for(int z = 0; z < 3; z++) {
+        pols.back().zones[z] = Constraint::exact();
+    }
+}
+
+/**
+ * Append to 'pols' a set of 2 seeds encapsulating a half-and-half 1mm strategy.
+ */
+void
+Seed::oneMmSeeds(int ln, EList& pols, Constraint& oall) {
+    oall.init();
+    // Policy 1 of 2: left-to-right search
+    pols.expand();
+    pols.back().type = SEED_TYPE_LEFT_TO_RIGHT;
+    pols.back().len = ln;
+    pols.back().overall = &oall;
+    pols.back().zones[0] = Constraint::exact();
+    pols.back().zones[1] = Constraint::mmBased(1);
+    pols.back().zones[2] = Constraint::exact(); // not used
+    // Policy 2 of 2: right-to-left search
+    pols.expand();
+    pols.back().type = SEED_TYPE_RIGHT_TO_LEFT;
+    pols.back().len = ln;
+    pols.back().overall = &oall;
+    pols.back().zones[0] = Constraint::exact();
+    pols.back().zones[1] = Constraint::mmBased(1);
+    pols.back().zones[1].mmsCeil = 0;
+    pols.back().zones[2] = Constraint::exact(); // not used
+}
+
+/**
+ * Append to 'pols' a set of 3 seeds encapsulating search roots for:
+ *
+ * 1. Starting from the left-hand side and searching toward the
+ *    right-hand side allowing 2 mismatches in the right half.
+ * 2. Starting from the right-hand side and searching toward the
+ *    left-hand side allowing 2 mismatches in the left half.
+ * 3. Starting (effectively) from the center and searching out toward
+ *    both the left and right-hand sides, allowing one mismatch on
+ *    either side.
+ *
+ * This is not exhaustive. There are 2 mismatch cases missed; if you
+ * imagine the seed as divided into four successive quarters A, B, C
+ * and D, the cases we miss are when mismatches occur in A and C or B
+ * and D.
+ */
+void
+Seed::twoMmSeeds(int ln, EList& pols, Constraint& oall) {
+    oall.init();
+    // Policy 1 of 3: left-to-right search
+    pols.expand();
+    pols.back().type = SEED_TYPE_LEFT_TO_RIGHT;
+    pols.back().len = ln;
+    pols.back().overall = &oall;
+    pols.back().zones[0] = Constraint::exact();
+    pols.back().zones[1] = Constraint::mmBased(2);
+    pols.back().zones[2] = Constraint::exact(); // not used
+    // Policy 2 of 3: right-to-left search
+    pols.expand();
+    pols.back().type = SEED_TYPE_RIGHT_TO_LEFT;
+    pols.back().len = ln;
+    pols.back().overall = &oall;
+    pols.back().zones[0] = Constraint::exact();
+    pols.back().zones[1] = Constraint::mmBased(2);
+    pols.back().zones[1].mmsCeil = 1; // Must have used at least 1 mismatch
+    pols.back().zones[2] = Constraint::exact(); // not used
+    // Policy 3 of 3: inside-out search
+    pols.expand();
+    pols.back().type = SEED_TYPE_INSIDE_OUT;
+    pols.back().len = ln;
+    pols.back().overall = &oall;
+    pols.back().zones[0] = Constraint::exact();
+    pols.back().zones[1] = Constraint::mmBased(1);
+    pols.back().zones[1].mmsCeil = 0; // Must have used at least 1 mismatch
+    pols.back().zones[2] = Constraint::mmBased(1);
+    pols.back().zones[2].mmsCeil = 0; // Must have used at least 1 mismatch
+}
+
+/**
+ * Types of actions that can be taken by the SeedAligner (values 1 through 6).
+ */
+enum {
+    SA_ACTION_TYPE_RESET       = 1,
+    SA_ACTION_TYPE_SEARCH_SEED = 2,
+    SA_ACTION_TYPE_FTAB        = 3,
+    SA_ACTION_TYPE_FCHR        = 4,
+    SA_ACTION_TYPE_MATCH       = 5,
+    SA_ACTION_TYPE_EDIT        = 6
+};
+
+#define MIN(x, y) (((x) < (y)) ? (x) : (y)) // arguments parenthesized; note one argument is still evaluated twice
+
+#ifdef ALIGNER_SEED_MAIN
+
+#include
+#include
+
+/**
+ * Parse 'arg' as a base-10 integer. If 'arg' is NULL, contains no digits
+ * or has trailing characters after the number, print 'errmsg' to cerr and
+ * throw 1. The parsed long is narrowed to int32_t before being returned.
+ */
+static int parseInt(const char *errmsg, const char *arg) {
+    char *endPtr = NULL;
+    long l = (arg != NULL) ? strtol(arg, &endPtr, 10) : 0;
+    // strtol() always sets endPtr, so the old "endPtr != NULL" test never
+    // failed; bad input leaves endPtr == arg or pointing at leftover text.
+    if(endPtr != NULL && endPtr != arg && *endPtr == '\0') {
+        return (int32_t)l;
+    }
+    cerr << errmsg << endl;
+    throw 1;
+}
+
+enum { // codes returned by getopt_long() for the long-only options below
+    ARG_NOFW         = 256,
+    ARG_NORC         = 257,
+    ARG_MM           = 258,
+    ARG_SHMEM        = 259,
+    ARG_TESTS        = 260,
+    ARG_RANDOM_TESTS = 261,
+    ARG_SEED         = 262
+};
+
+static const char *short_opts = "vCt"; // -v, -C, -t; none takes an argument
+static struct option long_opts[] = {
+    {(char*)"verbose", no_argument,       0, 'v'},
+    {(char*)"color",   no_argument,       0, 'C'},
+    {(char*)"timing",  no_argument,       0, 't'},
+    {(char*)"nofw",    no_argument,       0, ARG_NOFW},
+    {(char*)"norc",    no_argument,       0, ARG_NORC},
+    {(char*)"mm",      no_argument,       0, ARG_MM},
+    {(char*)"shmem",   no_argument,       0, ARG_SHMEM},
+    {(char*)"tests",   no_argument,       0, ARG_TESTS},
+    {(char*)"random",  required_argument, 0, ARG_RANDOM_TESTS},
+    {(char*)"seed",    required_argument, 0, ARG_SEED},
+    {(char*)0, 0, 0, 0} // terminator: getopt_long() requires the last element to be all zeros
+};
+static void printUsage(ostream& os) { // write the option summary to 'os'
+    os << "Usage: ac [options]* " << endl
+       << "Options:" << endl
+       << " --mm memory-mapped mode" << endl
+       << " --shmem shared memory mode" << endl
+       << " --nofw don't align forward-oriented read" << endl
+       << " --norc don't align reverse-complemented read" << endl
+       << " -t/--timing show timing information" << endl
+       << " -C/--color colorspace mode" << endl
+       << " -v/--verbose talkative mode" << endl;
+}
+
+bool gNorc = false; // set to true by --norc (don't align reverse-complemented read)
+bool gNofw = false; // set to true by --nofw (don't align forward-oriented read)
+bool gColor = false; // set to true by -C/--color (colorspace mode)
+int gVerbose = 0; // assigned 'true' by -v/--verbose; also passed as the verbosity flags in main()
+int gGapBarrier = 1; // not read or written by main() below
+bool gColorExEnds = true; // not read or written by main() below
+int gSnpPhred = 30; // not read or written by main() below
+bool gReportOverhangs = true; // not read or written by main() below
+
+extern void aligner_seed_tests(); // defined elsewhere; called by main() for --tests
+extern void aligner_random_seed_tests( // defined elsewhere; called by main() for --tests and --random
+ int num_tests,
+ uint32_t qslo,
+ uint32_t qshi,
+ bool color,
+ uint32_t seed);
+
+/**
+ * A way of feeding simple tests to the seed alignment infrastructure.
+ */
+int main(int argc, char **argv) {
+ bool useMm = false;
+ bool useShmem = false;
+ bool mmSweep = false;
+ bool noRefNames = false;
+ bool sanity = false;
+ bool timing = false;
+ int option_index = 0;
+ int seed = 777;
+ int next_option;
+ do {
+ next_option = getopt_long(
+ argc, argv, short_opts, long_opts, &option_index);
+ switch (next_option) {
+ case 'v': gVerbose = true; break;
+ case 'C': gColor = true; break;
+ case 't': timing = true; break;
+ case ARG_NOFW: gNofw = true; break;
+ case ARG_NORC: gNorc = true; break;
+ case ARG_MM: useMm = true; break;
+ case ARG_SHMEM: useShmem = true; break;
+ case ARG_SEED: seed = parseInt("", optarg); break; // note: error message passed to parseInt is empty
+ case ARG_TESTS: {
+ aligner_seed_tests();
+ aligner_random_seed_tests(
+ 100, // num_tests (num references)
+ 100, // qslo (queries per reference lo)
+ 400, // qshi (queries per reference hi)
+ false, // true -> generate colorspace reference/reads
+ 18); // pseudo-random seed
+ return 0;
+ }
+ case ARG_RANDOM_TESTS: {
+ seed = parseInt("", optarg);
+ aligner_random_seed_tests(
+ 100, // num_tests (num references)
+ 100, // qslo (queries per reference lo)
+ 400, // qshi (queries per reference hi)
+ false, // true -> generate colorspace reference/reads
+ seed); // pseudo-random seed
+ return 0;
+ }
+ case -1: break;
+ default: {
+ cerr << "Unknown option: " << (char)next_option << endl;
+ printUsage(cerr);
+ exit(1);
+ }
+ }
+ } while(next_option != -1);
+ char *reffn;
+ if(optind >= argc) {
+ cerr << "No reference; quitting..." << endl;
+ return 1;
+ }
+ reffn = argv[optind++];
+ if(optind >= argc) {
+ cerr << "No reads; quitting..." << endl;
+ return 1;
+ }
+ string gfmBase(reffn);
+ BitPairReference ref(
+ gfmBase, // base path
+ gColor, // whether we expect it to be colorspace
+ sanity, // whether to sanity-check reference as it's loaded
+ NULL, // fasta files to sanity check reference against
+ NULL, // another way of specifying original sequences
+ false, // true -> infiles (2 args ago) contains raw seqs
+ useMm, // use memory mapping to load index?
+ useShmem, // use shared memory (not memory mapping)
+ mmSweep, // touch all the pages after memory-mapping the index
+ gVerbose, // verbose
+ gVerbose); // verbose but just for startup messages
+ Timer *t = new Timer(cerr, "Time loading fw index: ", timing); // deleted right after gfmFw is constructed
+ GFM gfmFw(
+ gfmBase,
+ 0, // don't need entireReverse for fw index
+ true, // index is for the forward direction
+ -1, // offrate (irrelevant)
+ useMm, // whether to use memory-mapped files
+ useShmem, // whether to use shared memory
+ mmSweep, // sweep memory-mapped files
+ !noRefNames, // load names?
+ false, // load SA sample?
+ true, // load ftab?
+ true, // load rstarts?
+ NULL, // reference map, or NULL if none is needed
+ gVerbose, // whether to be talkative
+ gVerbose, // talkative during initialization
+ false, // handle memory exceptions, don't pass them up
+ sanity);
+ delete t;
+ t = new Timer(cerr, "Time loading bw index: ", timing);
+ GFM gfmBw(
+ gfmBase + ".rev",
+ 1, // need entireReverse
+ false, // index is for the backward direction
+ -1, // offrate (irrelevant)
+ useMm, // whether to use memory-mapped files
+ useShmem, // whether to use shared memory
+ mmSweep, // sweep memory-mapped files
+ !noRefNames, // load names?
+ false, // load SA sample?
+ true, // load ftab?
+ false, // load rstarts?
+ NULL, // reference map, or NULL if none is needed
+ gVerbose, // whether to be talkative
+ gVerbose, // talkative during initialization
+ false, // handle memory exceptions, don't pass them up
+ sanity);
+ delete t;
+ for(int i = optind; i < argc; i++) { // placeholder: the remaining arguments are iterated but not used
+ }
+}
+#endif
diff --git a/aligner_seed.h b/aligner_seed.h
new file mode 100644
index 0000000..a832fd4
--- /dev/null
+++ b/aligner_seed.h
@@ -0,0 +1,2922 @@
+/*
+ * Copyright 2011, Ben Langmead
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef ALIGNER_SEED_H_
+#define ALIGNER_SEED_H_
+
+#include
+#include