From 8c80d760637f8df39262683cd2570f0589423d36 Mon Sep 17 00:00:00 2001 From: derek Date: Wed, 23 Mar 2011 02:37:39 -0400 Subject: [PATCH] Major update to BamTools version 1.0 --- CMakeLists.txt | 6 +- README | 811 +-------- docs/Doxyfile | 1601 +++++++++++++++++ src/api/BGZF.cpp | 398 ---- src/api/BGZF.h | 322 ---- src/api/BamAlignment.cpp | 1200 ++++++++---- src/api/BamAlignment.h | 211 +-- src/api/BamAux.h | 444 +++-- src/api/BamConstants.h | 66 +- src/api/BamIndex.cpp | 118 +- src/api/BamIndex.h | 83 +- src/api/BamMultiReader.cpp | 330 +++- src/api/BamMultiReader.h | 247 +-- src/api/BamReader.cpp | 375 +++- src/api/BamReader.h | 107 +- src/api/BamWriter.cpp | 129 +- src/api/BamWriter.h | 34 +- src/api/CMakeLists.txt | 7 +- src/api/SamHeader.cpp | 149 +- src/api/SamHeader.h | 89 +- src/api/SamReadGroup.cpp | 141 +- src/api/SamReadGroup.h | 76 +- src/api/SamReadGroupDictionary.cpp | 209 ++- src/api/SamReadGroupDictionary.h | 45 +- src/api/SamSequence.cpp | 123 +- src/api/SamSequence.h | 63 +- src/api/SamSequenceDictionary.cpp | 251 ++- src/api/SamSequenceDictionary.h | 47 +- src/api/internal/BamHeader_p.cpp | 123 +- src/api/internal/BamHeader_p.h | 34 +- src/api/internal/BamIndexFactory_p.cpp | 110 ++ src/api/internal/BamIndexFactory_p.h | 48 + src/api/internal/BamMultiMerger_p.h | 157 +- src/api/internal/BamMultiReader_p.cpp | 539 ++++-- src/api/internal/BamMultiReader_p.h | 42 +- .../internal/BamRandomAccessController_p.cpp | 274 +++ .../internal/BamRandomAccessController_p.h | 94 + src/api/internal/BamReader_p.cpp | 570 ++---- src/api/internal/BamReader_p.h | 107 +- src/api/internal/BamStandardIndex_p.cpp | 111 +- src/api/internal/BamStandardIndex_p.h | 242 +-- src/api/internal/BamToolsIndex_p.cpp | 122 +- src/api/internal/BamToolsIndex_p.h | 205 +-- src/api/internal/BamWriter_p.cpp | 308 ++-- src/api/internal/BamWriter_p.h | 43 +- src/api/internal/BgzfStream_p.cpp | 444 +++++ src/api/internal/BgzfStream_p.h | 109 ++ src/api/internal/SamFormatParser_p.cpp | 26 
+- src/api/internal/SamFormatPrinter_p.cpp | 2 +- src/api/internal/SamHeaderValidator_p.cpp | 20 +- src/shared/bamtools_global.h | 55 +- src/toolkit/CMakeLists.txt | 4 +- src/toolkit/bamtools.cpp | 20 +- src/toolkit/bamtools_convert.cpp | 327 ++-- src/toolkit/bamtools_count.cpp | 51 +- src/toolkit/bamtools_coverage.cpp | 26 +- src/toolkit/bamtools_filter.cpp | 105 +- src/toolkit/bamtools_header.cpp | 28 +- src/toolkit/bamtools_index.cpp | 26 +- src/toolkit/bamtools_index.h | 4 +- src/toolkit/bamtools_merge.cpp | 66 +- src/toolkit/bamtools_random.cpp | 52 +- src/toolkit/bamtools_revert.cpp | 37 +- src/toolkit/bamtools_sort.cpp | 61 +- src/toolkit/bamtools_split.cpp | 112 +- src/toolkit/bamtools_split.h | 7 +- src/toolkit/bamtools_stats.cpp | 33 +- src/toolkit/bamtools_tool.h | 6 +- src/utils/CMakeLists.txt | 2 +- src/utils/bamtools_utilities.cpp | 29 +- src/utils/bamtools_utilities.h | 10 +- 71 files changed, 7921 insertions(+), 4552 deletions(-) create mode 100644 docs/Doxyfile delete mode 100644 src/api/BGZF.cpp delete mode 100644 src/api/BGZF.h create mode 100644 src/api/internal/BamIndexFactory_p.cpp create mode 100644 src/api/internal/BamIndexFactory_p.h create mode 100644 src/api/internal/BamRandomAccessController_p.cpp create mode 100644 src/api/internal/BamRandomAccessController_p.h create mode 100644 src/api/internal/BgzfStream_p.cpp create mode 100644 src/api/internal/BgzfStream_p.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 5af858e..8fce335 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -30,8 +30,8 @@ ensure_out_of_source_build (" (or the Windows equivalent)\n") # set BamTools version information -set (BamTools_VERSION_MAJOR 0) -set (BamTools_VERSION_MINOR 9) +set (BamTools_VERSION_MAJOR 1) +set (BamTools_VERSION_MINOR 0) set (BamTools_VERSION_BUILD 0) # set our library and executable destination dirs @@ -39,7 +39,7 @@ set (EXECUTABLE_OUTPUT_PATH "${CMAKE_SOURCE_DIR}/bin") set (LIBRARY_OUTPUT_PATH "${CMAKE_SOURCE_DIR}/lib") # define compiler 
flags for all code -add_definitions (-Wall -O3 -D_FILE_OFFSET_BITS=64) +add_definitions (-Wall -O2 -D_FILE_OFFSET_BITS=64) # add our includes root path include_directories (src) diff --git a/README b/README index c471dc5..1780ede 100644 --- a/README +++ b/README @@ -2,816 +2,33 @@ README : BAMTOOLS -------------------------------------------------------------------------------- -BamTools: a C++ API & toolkit for reading/writing/manipulating BAM files. - -I. Introduction - a. The API - b. The Toolkit - -II. Installation - -III. Usage - a. The API - b. The Toolkit - -IV. License - -V. Acknowledgements - -VI. Contact - --------------------------------------------------------------------------------- -I. Introduction: --------------------------------------------------------------------------------- - BamTools provides both a programmer's API and an end-user's toolkit for handling BAM files. ----------------------------------------- -Ia. The API: ----------------------------------------- - -The API consists of 2 main modules: BamReader and BamWriter. As you would -expect, BamReader provides read-access to BAM files, while BamWriter handles -writing data to BAM files. BamReader provides the interface for random-access -(jumping) in a BAM file, as well as generating BAM index files. - -BamMultiReader is an extra module that allows you to manage multiple open BAM -files for reading. It provides some validation & bookkeeping under the hood to -keep all files sync'ed up for you. - -Additional files used by the API: - - - BamAlignment.* : implements the BamAlignment data structure - - - BamAux.h : contains various constants, data structures and utility - methods used throught the API. - - - BamIndex.* : implements both the standard BAM format index (".bai") as - well as a new BamTools-specific index (".bti"). - - - BGZF.* : contains our implementation of the Broad Institute's BGZF - compression format. - ----------------------------------------- -Ib. 
The Toolkit: ----------------------------------------- - -If you've been using the BamTools since the early days, you'll notice that our -'toy' API examples (BamConversion, BamDump, BamTrim,...) are now gone. We have -dumped these in favor of a suite of small utilities that we hope both -developers and end-users find useful: - -usage: bamtools [--help] COMMAND [ARGS] - -Available bamtools commands: - - convert Converts between BAM and a number of other formats - count Prints number of alignments in BAM file(s) - coverage Prints coverage statistics from the input BAM file - filter Filters BAM file(s) by user-specified criteria - header Prints BAM header information - index Generates index for BAM file - merge Merge multiple BAM files into single file - random Select random alignments from existing BAM file(s) - sort Sorts the BAM file according to some criteria - split Splits a BAM file on user-specifed property, creating a - new BAM output file for each value found - stats Prints some basic statistics from input BAM file(s) - -See 'bamtools help COMMAND' for more information on a specific command. - --------------------------------------------------------------------------------- -II. Installation : --------------------------------------------------------------------------------- - ----------------------------------------- -IIa. Get CMake ----------------------------------------- - -BamTools has been migrated to a CMake-based build system. We believe that this -should simplify the build process across all platforms, especially as the -BamTools API moves into a shared library (that you link to instead of compiling -lots of source files directly into your application). CMake is available on all -major platforms, and indeed comes *out-of-the-box* with many Linux distributions. - -To see if you have CMake (and which version), try this command: - - $ cmake --version - -BamTools requires CMake version >= 2.6.4. 
If you are missing CMake or have an -older version, check your OS package manager (for Linux) or download it here: -http://www.cmake.org/cmake/resources/software.html . - ----------------------------------------- -IIb. Build BamTools ----------------------------------------- - -Ok, now that you have CMake ready to go, let's build BamTools. A good -practice in building applications is to do an out-of-source build, meaning -that we're going to set up an isolated place to hold all the intermediate -installation steps. - -In the top-level directory of BamTools, type the following commands: - - $ mkdir build - $ cd build - $ cmake .. - -Windows users: -This creates a Visual Studio solution file, which can then be built to create -the toolkit executable and API DLL's. - -Everybody else: -After running cmake, just run: - - $ make +I. Learn More -Then go back up to the BamTools root directory. +II. License - $ cd .. +III. Acknowledgements ----------------------------------------- -IIIb. Check It ----------------------------------------- - -Assuming the build process finished correctly, you should be able to find the -toolkit executable here: - - ./bin/ - -The BamTools-associated libraries will be found here: - - ./lib/ - -The BamTools API headers will be found here: - - ./include/* +IV. Contact -------------------------------------------------------------------------------- -III. Usage : +I. Learn More: -------------------------------------------------------------------------------- -** General usage information - perhaps explain common terms, point to SAM/BAM -spec, etc ** - ----------------------------------------- -IIIa. The API ----------------------------------------- - -The API, as noted above, contains 2 main modules - BamReader & BamWriter - for -dealing with BAM files. Alignment data is made available through the -BamAlignment data structure. 
- -A simple (read-only) scenario for accessing BAM data would look like the -following: - - // open our BamReader - BamReader reader; - reader.Open("someData.bam", "someData.bam.bai"); - - // define our region of interest - // in this example: bases 0-500 on the reference "chrX" - int id = reader.GetReferenceID("chrX"); - BamRegion region(id, 0, id, 500); - reader.SetRegion(region); - - // iterate through alignments in this region, - // ignoring alignments with a MQ below some cutoff - BamAlignment al; - while ( reader.GetNextAlignment(al) ) { - if ( al.MapQuality >= 50 ) - // do something - } - - // close the reader - reader.Close(); - -To use this API in your application, you simply need to do the following: - - 1 - Build the BamTools library (see Installation steps above). - - 2 - Import BamTools API functionality as needed, for example: - - #include "api/BamReader.h" - #include "api/BamWriter.h" - using namespace BamTools; // all BamTools classes/methods live within - // this namespace - - 3 - In your own build step, point your include path to the - (BAMTOOLS_ROOT)/include directory. Link your app with '-lbamtools' ('l' - as in Lima). - -* You may need to modify the -L flag (library path) as well to help your linker -find the (BAMTOOLS_ROOT)/lib directory. - -* Depending on your platform and where you install the BamTools API library, you -may also need to adjust how your app locates the shared library at runtime. For -Windows users, this can be as simple as dropping the DLL in the same folder as -your executable. For *nix users (using gcc at least), you can add the following -to your app's CXXFLAGS: - - -Wl,-rpath,$(BAMTOOLS_LIB_DIR) - -where BAMTOOLS_LIB_DIR is, as you would guess, the directory containing the libs. -An alternative is to set your local LD_LIBRARY_PATH environment variable. - -Another alternative is to use the newly provided static library libbamtools.a and -resolve this issue at compile/link time, instead of runtime. 
- -See any included programs for more detailed usage examples. See comments in the -header files for more detailed API documentation. - -Note - For users that don't want to bother with the new BamTools shared library -scheme: you are certainly free to just compile the API source code directly into -your application, but be aware that the files involved are subject to change. -Meaning that filenames, number of files, etc. are not fixed. You will also need -to be sure to link with '-lz' for ZLIB functionality (linking with '-lbamtools' -gives you this automatically). - ----------------------------------------- -IIIb. The Toolkit ----------------------------------------- - -BamTools provides a small, but powerful suite of command-line utility programs -for manipulating and querying BAM files for data. - --------------------- -Input/Output --------------------- - -All BamTools utilities handle I/O operations using a common set of arguments. -These include: - - -in - -The input BAM files(s). - - If a tool accepts multiple BAM files as input, each file gets its own "-in" - option on the command line. If no "-in" is provided, the tool will attempt - to read BAM data from stdin. - - To read a single BAM file, use a single "-in" option: - > bamtools *tool* -in myData1.bam ...ARGS... - - To read multiple BAM files, use multiple "-in" options: - > bamtools *tool* -in myData1.bam -in myData2.bam ...ARGS... - - To read from stdin (if supported), omit the "-in" option: - > bamtools *tool* ...ARGS... - - -out - -The output BAM file. - - If a tool outputs a result BAM file, specify the filename using this option. - If none is provided, the tool will typically write to stdout. - - *Note: Not all tools output BAM data (e.g. count, header, etc.) - - -region - -A region of interest. See below for accepted 'REGION string' formats. 
- - Many of the tools accept this option, which allows a user to only consider - alignments that overlap this region (whether counting, filtering, merging, - etc.). - - An alignment is considered to overlap a region if any part of the alignments - intersects the left/right boundaries. Thus, a 50bp alignment at position 70 - will overlap a region beginning at position 100. - - REGION string format - ---------------------- - A proper REGION string can be formatted like any of the following examples: - where 'chr1' is the name of a reference (not its ID)and '' is any valid - integer position within that reference. - - To read - chr1 - only alignments on (entire) reference 'chr1' - chr1:500 - only alignments overlapping the region starting at - chr1:500 and continuing to the end of chr1 - chr1:500..1000 - only alignments overlapping the region starting at - chr1:500 and continuing to chr1:1000 - chr1:500..chr3:750 - only alignments overlapping the region starting at - chr1:500 and continuing to chr3:750. This 'spanning' - region assumes that the reference specified as the - right boundary will occur somewhere in the file after - the left boundary. On a sorted BAM, a REGION of - 'chr4:500..chr2:1500' will produce undefined - (incorrect) results. So don't do it. :) - - *Note: Most of the tools that accept a REGION string will perform without an - index file, but typically at great cost to performance (having to - plow through the entire file until the region of interest is found). - For optimum speed, be sure that index files are available for your - data. - - -forceCompression - -Force compression of BAM output. - - When tools are piped together (see details below), the default behavior is - to turn off compression. This can greatly increase performance when the data - does not have to be constantly decompressed and recompressed. This is - ignored any time an output BAM file is specified using "-out". 
- --------------------- -Piping --------------------- - -Many of the tools in BamTools can be chained together by piping. Any tool that -accepts stdin can be piped into, and any that can output stdout can be piped -from. For example: - -> bamtools filter -in data1.bam -in data2.bam -mapQuality ">50" | bamtools count - -will give a count of all alignments in your 2 BAM files with a mapQuality of -greater than 50. And of course, any tool writing to stdout can be piped into -other utilities. - --------------------- -The Tools --------------------- - - convert Converts between BAM and a number of other formats - count Prints number of alignments in BAM file(s) - coverage Prints coverage statistics from the input BAM file - filter Filters BAM file(s) by user-specified criteria - header Prints BAM header information - index Generates index for BAM file - merge Merge multiple BAM files into single file - random Select random alignments from existing BAM file(s) - sort Sorts the BAM file according to some criteria - split Splits a BAM file on user-specifed property, creating a new - BAM output file for each value found - stats Prints some basic statistics from input BAM file(s) - ----------- -convert ----------- - -Description: converts BAM to a number of other formats - -Usage: bamtools convert -format [-in -in ...] - [-out ] [other options] - -Input & Output: - -in the input BAM file(s) [stdin] - -out the output BAM file [stdout] - -format the output file format - see below for - supported formats - -Filters: - -region genomic region. Index file is recommended for - better performance, and is read - automatically if it exists. See 'bamtools - help index' for more details on creating - one. 
- -Pileup Options: - -fasta FASTA reference file - -mapqual print the mapping qualities - -SAM Options: - -noheader omit the SAM header from output - -Help: - --help, -h shows this help text - -** Notes ** - - - Currently supported output formats ( BAM -> X ) - - Format type FORMAT (command-line argument) - ------------ ------------------------------- - BED bed - FASTA fasta - FASTQ fastq - JSON json - Pileup pileup - SAM sam - YAML yaml - - Usage example: - > bamtools convert -format json -in myData.bam -out myData.json - - - Pileup Options have no effect on formats other than "pileup" - SAM Options have no effect on formats other than "sam" - ----------- -count ----------- - -Description: prints number of alignments in BAM file(s). - -Usage: bamtools count [-in -in ...] [-region ] - -Input & Output: - -in the input BAM file(s) [stdin] - -region genomic region. Index file is recommended - for better performance, and is used - automatically if it exists. See - 'bamtools help index' for more details - on creating one - -Help: - --help, -h shows this help text - ----------- -coverage ----------- - -Description: prints coverage data for a single BAM file. - -Usage: bamtools coverage [-in ] [-out ] - -Input & Output: - -in the input BAM file [stdin] - -out the output file [stdout] - -Help: - --help, -h shows this help text - ----------- -filter ----------- - -Description: filters BAM file(s). - -Usage: bamtools filter [-in -in ...] - [-out | [-forceCompression]] - [-region ] - [ [-script the input BAM file(s) [stdin] - -out the output BAM file [stdout] - -region only read data from this genomic region (see - README for more details) - -script the filter script file (see README for more - details) - -forceCompression if results are sent to stdout (like when - piping to another tool), default behavior - is to leave output uncompressed. 
Use this - flag to override and force compression - -General Filters: - -alignmentFlag keep reads with this *exact* alignment flag - (for more detailed queries, see below) - -insertSize keep reads with insert size that matches - pattern - -mapQuality <[0-255]> keep reads with map quality that matches - pattern - -name keep reads with name that matches pattern - -queryBases keep reads with motif that matches pattern - -tag keep reads with this key=>value pair - -Alignment Flag Filters: - -isDuplicate keep only alignments that are marked as - duplicate [true] - -isFailedQC keep only alignments that failed QC [true] - -isFirstMate keep only alignments marked as first mate - [true] - -isMapped keep only alignments that were mapped [true] - -isMateMapped keep only alignments with mates that mapped - [true] - -isMateReverseStrand keep only alignments with mate on reverse - strand [true] - -isPaired keep only alignments that were sequenced as - paired [true] - -isPrimaryAlignment keep only alignments marked as primary - [true] - -isProperPair keep only alignments that passed paired-end - resolution [true] - -isReverseStrand keep only alignments on reverse strand - [true] - -isSecondMate keep only alignments marked as second mate - [true] - -Help: - --help, -h shows this help text - - ***************** - * Filter Script * - ***************** - -The BamTools filter tool allows you to use an external filter script to define -complex filtering behavior. This script uses what I'm calling properties, -filters, and a rule - all implemented in a JSON syntax. 
- - ** Properties ** - -A 'property' is a typical JSON entry of the form: - - "propertyName" : "value" - -Here are the property names that BamTools will recognize: - - alignmentFlag - cigar - insertSize - isDuplicate - isFailedQC - isFirstMate - isMapped - isMateMapped - isMateReverseStrand - isPaired - isPrimaryAlignment - isProperPair - isReverseStrand - isSecondMate - mapQuality - matePosition - mateReference - name - position - queryBases - reference - tag - -For properties with boolean values, use the words "true" or "false". -For example, - - "isMapped" : "true" - -will keep only alignments that are flagged as 'mapped'. - -For properties with numeric values, use the desired number with optional -comparison operators ( >, >=, <, <=, !). For example, - - "mapQuality" : ">=75" - -will keep only alignments with mapQuality greater than or equal to 75. - -If you're familiar with JSON, you know that integers can be bare (without -quotes). However, if you a comparison operator, be sure to enclose in quotes. - -For string-based properties, the above operators are available. In addition, - you can also use some basic pattern-matching operators. For example, - - "reference" : "ALU*" // reference starts with 'ALU' - "name" : "*foo" // name ends with 'foo' - "cigar" : "*D*" // cigar contains a 'D' anywhere - -Notes - -The reference property refers to the reference name, not the BAM reference -numeric ID. - -The tag property has an extra layer, so that the syntax will look like this: - - "tag" : "XX:value" - -where XX is the 2-letter SAM/BAM tag and value is, well, the value. -Comparison operators can still apply to values, so tag properties of: - - "tag" : "AS:>60" - "tag" : "RG:foo*" - -are perfectly valid. - - ** Filters ** - -A 'filter' is a JSON container of properties that will be AND-ed together. 
For -example, - -{ - "reference" : "chr1", - "mapQuality" : ">50", - "tag" : "NM:<4" -} - -would result in an output BAM file containing only alignments from chr1 with a -mapQuality >50 and edit distance of less than 4. - -A single, unnamed filter like this is the minimum necessary for a complete -filter script. Save this file and use as the -script parameter and you should -be all set. - -Moving on to more potent filtering... - -You can also define multiple filters. -To do so, you just need to use the "filters" keyword along with JSON array -syntax, like this: - -{ - "filters" : - [ - { - "reference" : "chr1", - "mapQuality" : ">50" - }, - { - "reference" : "chr1", - "isReverseStrand" : "true" - } - ] -} - -These filters will be (inclusive) OR-ed together by default. So you'd get a -resulting BAM with only alignments from chr1 that had either mapQuality >50 or -on the reverse strand (or both). - - ** Rule ** - -Alternatively to anonymous OR-ed filters, you can also provide what I've called -a "rule". By giving each filter an "id", using this "rule" keyword you can -describe boolean relationships between your filter sets. - -Available rule operators: - - & // and - | // or - ! // not - -This might sound a little fuzzy at this point, so let's get back to an example: - -{ - "filters" : - [ - { - "id" : "filter1", - "reference" : "chr1", - "mapQuality" : ">50" - }, - { - "id" : "filter2", - "reference" : "chr1", - "isReverseStrand" : "true" - }, - { - "id" : "filter3", - "reference" : "chr1", - "queryBases" : "AGCT*" - } - ], - - "rule" : " (filter1 | filter2) & !filter3 " -} - -In this case, we would only retain aligments that passed filter 1 OR filter 2, -AND also NOT filter 3. - -These are dummy examples, and don't make much sense as an actual query case. But -hopefully this serves an adequate primer to get you started and discover the -potential flexibility here. - ----------- -header ----------- - -Description: prints header from BAM file(s). 
- -Usage: bamtools header [-in -in ...] - -Input & Output: - -in the input BAM file(s) [stdin] - -Help: - --help, -h shows this help text - ----------- -index ----------- - -Description: creates index for BAM file. - -Usage: bamtools index [-in ] [-bti] - -Input & Output: - -in the input BAM file [stdin] - -bti create (non-standard) BamTools index file - (*.bti). Default behavior is to create - standard BAM index (*.bai) - -Help: - --help, -h shows this help tex - ----------- -merge ----------- - -Description: merges multiple BAM files into one. - -Usage: bamtools merge [-in -in ...] - [-out | [-forceCompression]] [-region ] - -Input & Output: - -in the input BAM file(s) - -out the output BAM file - -forceCompression if results are sent to stdout (like when - piping to another tool), default behavior - is to leave output uncompressed. Use this - flag to override and force compression - -region genomic region. See README for more details - -Help: - --help, -h shows this help text - ----------- -random ----------- - -Description: grab a random subset of alignments. - -Usage: bamtools random [-in -in ...] - [-out ] [-forceCompression] [-n] - [-region ] - -Input & Output: - -in the input BAM file [stdin] - -out the output BAM file [stdout] - -forceCompression if results are sent to stdout (like when - piping to another tool), default behavior - is to leave output uncompressed. Use this - flag to override and force compression - -region only pull random alignments from within this - genomic region. Index file is - recommended for better performance, and - is used automatically if it exists. See - 'bamtools help index' for more details - on creating one - -Settings: - -n number of alignments to grab. Note that no - duplicate checking is performed [10000] - -Help: - --help, -h shows this help text - ----------- -sort ----------- - -Description: sorts a BAM file. 
- -Usage: bamtools sort [-in ] [-out ] [sortOptions] - -Input & Output: - -in the input BAM file [stdin] - -out the output BAM file [stdout] - -Sorting Methods: - -byname sort by alignment name - -Memory Settings: - -n max number of alignments per tempfile - [10000] - -mem max memory to use [1024] - -Help: - --help, -h shows this help text - ----------- -split ----------- - -Description: splits a BAM file on user-specified property, creating a new BAM -output file for each value found. - -Usage: bamtools split [-in ] [-stub ] - < -mapped | -paired | -reference | -tag > - -Input & Output: - -in the input BAM file [stdin] - -stub prefix stub for output BAM files (default - behavior is to use input filename, - without .bam extension, as stub). If - input is stdin and no stub provided, a - timestamp is generated as the stub. - -Split Options: - -mapped split mapped/unmapped alignments - -paired split single-end/paired-end alignments - -reference split alignments by reference - -tag splits alignments based on all values of TAG - encountered (i.e. -tag RG creates a BAM - file for each read group in original - BAM file) - -Help: - --help, -h shows this help text - ----------- -stats ----------- - -Description: prints general alignment statistics. - -Usage: bamtools stats [-in -in ...] [statsOptions] +Installation steps, tutorial, API documentation, etc. are all now available +through the BamTools project wiki: -Input & Output: - -in the input BAM file [stdin] +https://github.com/pezmaster31/bamtools/wiki -Additional Stats: - -insert summarize insert size data +Join the mailing list(s) to stay informed of updates or get involved with +contributing: -Help: - --help, -h shows this help text +https://github.com/pezmaster31/bamtools/wiki/Mailing-lists -------------------------------------------------------------------------------- -IV. License : +II. 
License : -------------------------------------------------------------------------------- Both the BamTools API and toolkit are released under the MIT License. @@ -821,7 +38,7 @@ Copyright (c) 2009-2010 Derek Barnett, Erik Garrison, Gabor Marth, See included file LICENSE for details. -------------------------------------------------------------------------------- -V. Acknowledgements : +III. Acknowledgements : -------------------------------------------------------------------------------- * Aaron Quinlan for several key feature ideas and bug fix contributions @@ -829,7 +46,7 @@ V. Acknowledgements : * Heng Li, author of SAMtools - the original C-language BAM API/toolkit. -------------------------------------------------------------------------------- -VI. Contact : +IV. Contact : -------------------------------------------------------------------------------- Feel free to contact me with any questions, comments, suggestions, bug reports, diff --git a/docs/Doxyfile b/docs/Doxyfile new file mode 100644 index 0000000..9a27f67 --- /dev/null +++ b/docs/Doxyfile @@ -0,0 +1,1601 @@ +# Doxyfile 1.6.3 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project +# +# All text after a hash (#) is considered a comment and will be ignored +# The format is: +# TAG = value [value, ...] +# For lists items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (" ") + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the config file +# that follow. The default is UTF-8 which is also the encoding used for all +# text before the first occurrence of this tag. Doxygen uses libiconv (or the +# iconv built into libc) for the transcoding. 
See +# http://www.gnu.org/software/libiconv for the list of possible encodings. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded +# by quotes) that should identify the project. + +PROJECT_NAME = BamTools + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. +# This could be handy for archiving the generated documentation or +# if some version control system is used. + +PROJECT_NUMBER = 1.0.0 + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) +# base path where the generated documentation will be put. +# If a relative path is entered, it will be relative to the location +# where doxygen was started. If left blank the current directory will be used. + +OUTPUT_DIRECTORY = + +# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create +# 4096 sub-directories (in 2 levels) under the output directory of each output +# format and will distribute the generated files over these directories. +# Enabling this option can be useful when feeding doxygen a huge amount of +# source files, where putting all generated files in the same directory would +# otherwise cause performance problems for the file system. + +CREATE_SUBDIRS = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. +# The default language is English, other supported languages are: +# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional, +# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German, +# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English +# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian, +# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrilic, Slovak, +# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese. 
+ +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will +# include brief member descriptions after the members that are listed in +# the file and class documentation (similar to JavaDoc). +# Set to NO to disable this. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend +# the brief description of a member or function before the detailed description. +# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator +# that is used to form the text in various listings. Each string +# in this list, if found as the leading text of the brief description, will be +# stripped from the text and the result after processing the whole list, is +# used as the annotated text. Otherwise, the brief description is used as-is. +# If left blank, the following values are used ("$name" is automatically +# replaced with the name of the entity): "The $name class" "The $name widget" +# "The $name file" "is" "provides" "specifies" "contains" +# "represents" "a" "an" "the" + +ABBREVIATE_BRIEF = "The $name class" \ + "The $name widget" \ + "The $name file" \ + is \ + provides \ + specifies \ + contains \ + represents \ + a \ + an \ + the + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# Doxygen will generate a detailed section even if there is only a brief +# description. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. 
+ +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full +# path before file names in the file list and in the header files. If set +# to NO the shortest path that makes the file name unique will be used. + +FULL_PATH_NAMES = YES + +# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag +# can be used to strip a user-defined part of the path. Stripping is +# only done if one of the specified strings matches the left-hand part of +# the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the +# path to strip. + +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of +# the path mentioned in the documentation of a class, which tells +# the reader which header file to include in order to use a class. +# If left blank only the name of the header file containing the class +# definition is used. Otherwise one should specify the include paths that +# are normally passed to the compiler using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter +# (but less readable) file names. This can be useful if your file system +# doesn't support long names like on DOS, Mac, or CD-ROM. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen +# will interpret the first line (until the first dot) of a JavaDoc-style +# comment as the brief description. If set to NO, the JavaDoc +# comments will behave just like regular Qt-style comments +# (thus requiring an explicit @brief command for a brief description.) + +JAVADOC_AUTOBRIEF = NO + +# If the QT_AUTOBRIEF tag is set to YES then Doxygen will +# interpret the first line (until the first dot) of a Qt-style +# comment as the brief description. 
If set to NO, the comments +# will behave just like regular Qt-style comments (thus requiring +# an explicit \brief command for a brief description.) + +QT_AUTOBRIEF = NO + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen +# treat a multi-line C++ special comment block (i.e. a block of //! or /// +# comments) as a brief description. This used to be the default behaviour. +# The new default is to treat a multi-line C++ comment block as a detailed +# description. Set this tag to YES if you prefer the old behaviour instead. + +MULTILINE_CPP_IS_BRIEF = NO + +# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented +# member inherits the documentation from any documented member that it +# re-implements. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce +# a new page for each member. If set to NO, the documentation of a member will +# be part of the file/class/namespace that contains it. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. +# Doxygen uses this value to replace tabs by spaces in code fragments. + +TAB_SIZE = 1 + +# This tag can be used to specify a number of aliases that acts +# as commands in the documentation. An alias has the form "name=value". +# For example adding "sideeffect=\par Side Effects:\n" will allow you to +# put the command \sideeffect (or @sideeffect) in the documentation, which +# will result in a user-defined paragraph with heading "Side Effects:". +# You can put \n's in the value part of an alias to insert newlines. + +ALIASES = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C +# sources only. Doxygen will then generate output that is more tailored for C. +# For instance, some of the names that are used will be different. The list +# of all members will be omitted, etc. 
+ +OPTIMIZE_OUTPUT_FOR_C = NO + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java +# sources only. Doxygen will then generate output that is more tailored for +# Java. For instance, namespaces will be presented as packages, qualified +# scopes will look different, etc. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources only. Doxygen will then generate output that is more tailored for +# Fortran. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for +# VHDL. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Doxygen selects the parser to use depending on the extension of the files it parses. +# With this tag you can assign which parser to use for a given extension. +# Doxygen has a built-in mapping, but you can override or extend it using this tag. +# The format is ext=language, where ext is a file extension, and language is one of +# the parsers supported by doxygen: IDL, Java, Javascript, C#, C, C++, D, PHP, +# Objective-C, Python, Fortran, VHDL, C, C++. For instance to make doxygen treat +# .inc files as Fortran files (default is PHP), and .f files as C (default is Fortran), +# use: inc=Fortran f=C. Note that for custom extensions you also need to set +# FILE_PATTERNS otherwise the files are not read by doxygen. + +EXTENSION_MAPPING = + +# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want +# to include (a tag file for) the STL sources as input, then you should +# set this tag to YES in order to let doxygen match functions declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); v.s. +# func(std::string) {}). This also make the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. 
+ +BUILTIN_STL_SUPPORT = NO + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only. +# Doxygen will parse them like normal C++ but will assume all classes use public +# instead of private inheritance when no explicit protection keyword is present. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate getter +# and setter methods for a property. Setting this option to YES (the default) +# will make doxygen replace the get and set methods by a property in the +# documentation. This will only work if the methods are indeed getting or +# setting a simple type. If this is not the case, or you want to show the +# methods anyway, you should set this option to NO. + +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES, then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. + +DISTRIBUTE_GROUP_DOC = NO + +# Set the SUBGROUPING tag to YES (the default) to allow class member groups of +# the same type (for instance a group of public functions) to be put as a +# subgroup of that type (e.g. under the Public Functions section). Set it to +# NO to prevent subgrouping. Alternatively, this can be done per class using +# the \nosubgrouping command. + +SUBGROUPING = YES + +# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum +# is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. 
This can typically +# be useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. + +TYPEDEF_HIDES_STRUCT = NO + +# The SYMBOL_CACHE_SIZE determines the size of the internal cache used to +# determine which symbols to keep in memory and which to flush to disk. +# When the cache is full, less often used symbols will be written to disk. +# For small to medium size projects (<1000 input files) the default value is +# probably good enough. For larger projects a too small cache size can cause +# doxygen to be busy swapping symbols to and from disk most of the time +# causing a significant performance penalty. +# If the system has enough physical memory increasing the cache will improve the +# performance by keeping more symbols in memory. Note that the value works on +# a logarithmic scale so increasing the size by one will roughly double the +# memory usage. The cache size is given by this formula: +# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0, +# corresponding to a cache size of 2^16 = 65536 symbols + +SYMBOL_CACHE_SIZE = 0 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in +# documentation are documented, even if no documentation was available. +# Private class members and static file members will be hidden unless +# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES + +EXTRACT_ALL = YES + +# If the EXTRACT_PRIVATE tag is set to YES all private members of a class +# will be included in the documentation. + +EXTRACT_PRIVATE = NO + +# If the EXTRACT_STATIC tag is set to YES all static members of a file +# will be included in the documentation. 
+ +EXTRACT_STATIC = NO + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) +# defined locally in source files will be included in the documentation. +# If set to NO only classes defined in header files are included. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. When set to YES local +# methods, which are defined in the implementation section but not in +# the interface are included in the documentation. +# If set to NO (the default) only methods in the interface are included. + +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base +# name of the file that contains the anonymous namespace. By default +# anonymous namespace are hidden. + +EXTRACT_ANON_NSPACES = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all +# undocumented members of documented classes, files or namespaces. +# If set to NO (the default) these members will be included in the +# various overviews, but no documentation section is generated. +# This option has no effect if EXTRACT_ALL is enabled. + +HIDE_UNDOC_MEMBERS = NO + +# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. +# If set to NO (the default) these classes will be included in the various +# overviews. This option has no effect if EXTRACT_ALL is enabled. + +HIDE_UNDOC_CLASSES = NO + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all +# friend (class|struct|union) declarations. +# If set to NO (the default) these declarations will be included in the +# documentation. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any +# documentation blocks found inside the body of a function. 
+# If set to NO (the default) these blocks will be appended to the +# function's detailed documentation block. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation +# that is typed after a \internal command is included. If the tag is set +# to NO (the default) then the documentation will be excluded. +# Set it to YES to include the internal documentation. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate +# file names in lower-case letters. If set to YES upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# and Mac users are advised to set this option to NO. + +CASE_SENSE_NAMES = NO + +# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen +# will show members with their full class and namespace scopes in the +# documentation. If set to YES the scope will be hidden. + +HIDE_SCOPE_NAMES = NO + +# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen +# will put a list of the files that are included by a file in the documentation +# of that file. + +SHOW_INCLUDE_FILES = YES + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen +# will list include files with double quotes in the documentation +# rather than with sharp brackets. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES (the default) then a tag [inline] +# is inserted in the documentation for inline members. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen +# will sort the (detailed) documentation of file and class members +# alphabetically by member name. If set to NO the members will appear in +# declaration order. 
+ +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the +# brief documentation of file, namespace and class members alphabetically +# by member name. If set to NO (the default) the members will appear in +# declaration order. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen +# will sort the (brief and detailed) documentation of class members so that +# constructors and destructors are listed first. If set to NO (the default) +# the constructors will appear in the respective orders defined by +# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS. +# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO +# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the +# hierarchy of group names into alphabetical order. If set to NO (the default) +# the group names will appear in their defined order. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be +# sorted by fully-qualified names, including namespaces. If set to +# NO (the default), the class list will be sorted only by class name, +# not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the +# alphabetical list. + +SORT_BY_SCOPE_NAME = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or +# disable (NO) the todo list. This list is created by putting \todo +# commands in the documentation. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or +# disable (NO) the test list. This list is created by putting \test +# commands in the documentation. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or +# disable (NO) the bug list. 
This list is created by putting \bug +# commands in the documentation. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or +# disable (NO) the deprecated list. This list is created by putting +# \deprecated commands in the documentation. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional +# documentation sections, marked by \if sectionname ... \endif. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines +# the initial value of a variable or define consists of for it to appear in +# the documentation. If the initializer consists of more lines than specified +# here it will be hidden. Use a value of 0 to hide initializers completely. +# The appearance of the initializer of individual variables and defines in the +# documentation can be controlled using \showinitializer or \hideinitializer +# command in the documentation regardless of this setting. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated +# at the bottom of the documentation of classes and structs. If set to YES the +# list will mention the files that were used to generate the documentation. + +SHOW_USED_FILES = YES + +# If the sources in your project are distributed over multiple directories +# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy +# in the documentation. The default is NO. + +SHOW_DIRECTORIES = YES + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. +# This will remove the Files entry from the Quick Index and from the +# Folder Tree View (if specified). The default is YES. + +SHOW_FILES = YES + +# Set the SHOW_NAMESPACES tag to NO to disable the generation of the +# Namespaces page. This will remove the Namespaces entry from the Quick Index +# and from the Folder Tree View (if specified). The default is YES. 
+ +SHOW_NAMESPACES = YES + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from +# the version control system). Doxygen will invoke the program by executing (via +# popen()) the command <command> <input-file>, where <command> is the value of +# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file +# provided by doxygen. Whatever the program writes to standard output +# is used as the file version. See the manual for examples. + +FILE_VERSION_FILTER = + +# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed by +# doxygen. The layout file controls the global structure of the generated output files +# in an output format independent way. To create the layout file that represents +# doxygen's defaults, run doxygen with the -l option. You can optionally specify a +# file name after the option, if omitted DoxygenLayout.xml will be used as the name +# of the layout file. + +LAYOUT_FILE = + +#--------------------------------------------------------------------------- +# configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated +# by doxygen. Possible values are YES and NO. If left blank NO is used. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated by doxygen. Possible values are YES and NO. If left blank +# NO is used. + +WARNINGS = YES + +# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings +# for undocumented members. If EXTRACT_ALL is set to YES then this flag will +# automatically be disabled. 
+ +WARN_IF_UNDOCUMENTED = YES + +# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some +# parameters in a documented function, or documenting parameters that +# don't exist or using markup commands wrongly. + +WARN_IF_DOC_ERROR = YES + +# This WARN_NO_PARAMDOC option can be enabled to get warnings for +# functions that are documented, but have no documentation for their parameters +# or return value. If set to NO (the default) doxygen will only warn about +# wrong or incomplete parameter documentation, but not about the absence of +# documentation. + +WARN_NO_PARAMDOC = NO + +# The WARN_FORMAT tag determines the format of the warning messages that +# doxygen can produce. The string should contain the $file, $line, and $text +# tags, which will be replaced by the file and line number from which the +# warning originated and the warning text. Optionally the format may contain +# $version, which will be replaced by the version of the file (if it could +# be obtained via FILE_VERSION_FILTER) + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning +# and error messages should be written. If left blank the output is written +# to stderr. + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag can be used to specify the files and/or directories that contain +# documented source files. You may enter file names like "myfile.cpp" or +# directories like "/usr/src/myproject". Separate the files or directories +# with spaces. + +INPUT = /home/derek/development/bamtools/src/api + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. 
Internally doxygen uses the UTF-8 encoding, which is +# also the default input encoding. Doxygen uses libiconv (or the iconv built +# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for +# the list of possible encodings. + +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp +# and *.h) to filter out the source-files in the directories. If left +# blank the following patterns are tested: +# *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx +# *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.py *.f90 + +FILE_PATTERNS = *.c \ + *.cc \ + *.cxx \ + *.cpp \ + *.c++ \ + *.d \ + *.java \ + *.ii \ + *.ixx \ + *.ipp \ + *.i++ \ + *.inl \ + *.h \ + *.hh \ + *.hxx \ + *.hpp \ + *.h++ \ + *.idl \ + *.odl \ + *.cs \ + *.php \ + *.php3 \ + *.inc \ + *.m \ + *.mm \ + *.dox \ + *.py \ + *.f90 \ + *.f \ + *.vhd \ + *.vhdl + +# The RECURSIVE tag can be used to specify whether or not subdirectories +# should be searched for input files as well. Possible values are YES and NO. +# If left blank NO is used. + +RECURSIVE = YES + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. + +EXCLUDE = /home/derek/development/bamtools/src/api/internal + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix filesystem feature) are excluded +# from the input. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. 
Note that the wildcards are matched +# against the file with absolute path, so to exclude all test directories +# for example use the pattern */test/* + +EXCLUDE_PATTERNS = + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring. Examples: ANamespace, AClass, +# AClass::ANamespace, ANamespace::*Test + +EXCLUDE_SYMBOLS = BamTools::Internal \ + BamTools::BamAlignment::BamAlignmentSupportData + +# The EXAMPLE_PATH tag can be used to specify one or more files or +# directories that contain example code fragments that are included (see +# the \include command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp +# and *.h) to filter out the source-files in the directories. If left +# blank all files are included. + +EXAMPLE_PATTERNS = * + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude +# commands irrespective of the value of the RECURSIVE tag. +# Possible values are YES and NO. If left blank NO is used. + +EXAMPLE_RECURSIVE = NO + +# The IMAGE_PATH tag can be used to specify one or more files or +# directories that contain images that are included in the documentation (see +# the \image command). + +IMAGE_PATH = + +# The INPUT_FILTER tag can be used to specify a program that doxygen should +# invoke to filter for each input file. Doxygen will invoke the filter program +# by executing (via popen()) the command <filter> <input-file>, where <filter> +# is the value of the INPUT_FILTER tag, and <input-file> is the name of an +# input file. Doxygen will then use the output that the filter program writes +# to standard output. If FILTER_PATTERNS is specified, this tag will be +# ignored. 
+ +INPUT_FILTER = + +# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern +# basis. Doxygen will compare the file name with each pattern and apply the +# filter if there is a match. The filters are a list of the form: +# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further +# info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER +# is applied to all files. + +FILTER_PATTERNS = + +# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using +# INPUT_FILTER) will be used to filter the input files when producing source +# files to browse (i.e. when SOURCE_BROWSER is set to YES). + +FILTER_SOURCE_FILES = NO + +#--------------------------------------------------------------------------- +# configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will +# be generated. Documented entities will be cross-referenced with these sources. +# Note: To get rid of all source code in the generated output, make sure also +# VERBATIM_HEADERS is set to NO. + +SOURCE_BROWSER = NO + +# Setting the INLINE_SOURCES tag to YES will include the body +# of functions and classes directly in the documentation. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct +# doxygen to hide any special comment blocks from generated source code +# fragments. Normal C and C++ comments will always remain visible. + +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES +# then for each documented function all documented +# functions referencing it will be listed. + +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES +# then for each documented function all documented entities +# called/used by that function will be listed. 
+ +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES (the default) +# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from +# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will +# link to the source code. Otherwise they will link to the documentation. + +REFERENCES_LINK_SOURCE = YES + +# If the USE_HTAGS tag is set to YES then the references to source code +# will point to the HTML generated by the htags(1) tool instead of doxygen +# built-in source browser. The htags tool is part of GNU's global source +# tagging system (see http://www.gnu.org/software/global/global.html). You +# will need version 4.8.6 or higher. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen +# will generate a verbatim copy of the header file for each class for +# which an include is specified. Set to NO to disable this. + +VERBATIM_HEADERS = YES + +#--------------------------------------------------------------------------- +# configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index +# of all compounds will be generated. Enable this if the project +# contains a lot of classes, structs, unions or interfaces. + +ALPHABETICAL_INDEX = NO + +# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then +# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns +# in which this list will be split (can be a number in the range [1..20]) + +COLS_IN_ALPHA_INDEX = 5 + +# In case all classes in a project start with a common prefix, all +# classes will be put under the same header in the alphabetical index. +# The IGNORE_PREFIX tag can be used to specify one or more prefixes that +# should be ignored while generating the index headers. 
+ +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES (the default) Doxygen will +# generate HTML output. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `html' will be used as the default path. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for +# each generated HTML page (for example: .htm,.php,.asp). If it is left blank +# doxygen will generate files with .html extension. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a personal HTML header for +# each generated HTML page. If it is left blank doxygen will generate a +# standard header. + +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a personal HTML footer for +# each generated HTML page. If it is left blank doxygen will generate a +# standard footer. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading +# style sheet that is used by each HTML page. It can be used to +# fine-tune the look of the HTML output. If the tag is left blank doxygen +# will generate a default style sheet. Note that doxygen will try to copy +# the style sheet file to the HTML output directory, so don't put your own +# stylesheet in the HTML output directory as well, or it will be erased! + +HTML_STYLESHEET = + +# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML +# page will contain the date and time when the page was generated. Setting +# this to NO can help when comparing the output of multiple runs. 
+ +HTML_TIMESTAMP = YES + +# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes, +# files or namespaces will be aligned in HTML using tables. If set to +# NO a bullet list will be used. + +HTML_ALIGN_MEMBERS = YES + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. For this to work a browser that supports +# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox +# Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari). + +HTML_DYNAMIC_SECTIONS = YES + +# If the GENERATE_DOCSET tag is set to YES, additional index files +# will be generated that can be used as input for Apple's Xcode 3 +# integrated development environment, introduced with OSX 10.5 (Leopard). +# To create a documentation set, doxygen will generate a Makefile in the +# HTML output directory. Running make will produce the docset in that +# directory and running "make install" will install the docset in +# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find +# it at startup. +# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html for more information. + +GENERATE_DOCSET = NO + +# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the +# feed. A documentation feed provides an umbrella under which multiple +# documentation sets from a single provider (such as a company or product suite) +# can be grouped. + +DOCSET_FEEDNAME = "Doxygen generated docs" + +# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that +# should uniquely identify the documentation set bundle. This should be a +# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen +# will append .docset to the name. 
+ +DOCSET_BUNDLE_ID = org.doxygen.Project + +# If the GENERATE_HTMLHELP tag is set to YES, additional index files +# will be generated that can be used as input for tools like the +# Microsoft HTML help workshop to generate a compiled HTML help file (.chm) +# of the generated HTML documentation. + +GENERATE_HTMLHELP = NO + +# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can +# be used to specify the file name of the resulting .chm file. You +# can add a path in front of the file if the result should not be +# written to the html output directory. + +CHM_FILE = + +# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can +# be used to specify the location (absolute path including file name) of +# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run +# the HTML help compiler on the generated index.hhp. + +HHC_LOCATION = + +# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag +# controls if a separate .chi index file is generated (YES) or that +# it should be included in the master .chm file (NO). + +GENERATE_CHI = NO + +# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING +# is used to encode HtmlHelp index (hhk), content (hhc) and project file +# content. + +CHM_INDEX_ENCODING = + +# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag +# controls whether a binary table of contents is generated (YES) or a +# normal table of contents (NO) in the .chm file. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members +# to the contents of the HTML help documentation and to the tree view. + +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and QHP_VIRTUAL_FOLDER +# are set, an additional index file will be generated that can be used as input for +# Qt's qhelpgenerator to generate a Qt Compressed Help (.qch) of the generated +# HTML documentation. 
+ +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can +# be used to specify the file name of the resulting .qch file. +# The path specified is relative to the HTML output folder. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating +# Qt Help Project output. For more information please see +# http://doc.trolltech.com/qthelpproject.html#namespace + +QHP_NAMESPACE = org.doxygen.Project + +# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating +# Qt Help Project output. For more information please see +# http://doc.trolltech.com/qthelpproject.html#virtual-folders + +QHP_VIRTUAL_FOLDER = doc + +# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to add. +# For more information please see +# http://doc.trolltech.com/qthelpproject.html#custom-filters + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the custom filter to add. For more information please see +# Qt Help Project / Custom Filters. + +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this project's +# filter section matches. +# Qt Help Project / Filter Attributes. + +QHP_SECT_FILTER_ATTRS = + +# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can +# be used to specify the location of Qt's qhelpgenerator. +# If non-empty doxygen will try to run qhelpgenerator on the generated +# .qhp file. + +QHG_LOCATION = + +# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files +# will be generated, which together with the HTML files, form an Eclipse help +# plugin. To install this plugin and make it available under the help contents +# menu in Eclipse, the contents of the directory containing the HTML and XML +# files needs to be copied into the plugins directory of eclipse. The name of +# the directory within the plugins directory should be the same as +# the ECLIPSE_DOC_ID value. 
After copying Eclipse needs to be restarted before +# the help appears. + +GENERATE_ECLIPSEHELP = NO + +# A unique identifier for the eclipse help plugin. When installing the plugin +# the directory name containing the HTML and XML files should also have +# this name. + +ECLIPSE_DOC_ID = org.doxygen.Project + +# The DISABLE_INDEX tag can be used to turn on/off the condensed index at +# top of each HTML page. The value NO (the default) enables the index and +# the value YES disables it. + +DISABLE_INDEX = NO + +# This tag can be used to set the number of enum values (range [1..20]) +# that doxygen will group on one line in the generated HTML documentation. + +ENUM_VALUES_PER_LINE = 4 + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. +# If the tag value is set to YES, a side panel will be generated +# containing a tree-like index structure (just like the one that +# is generated for HTML Help). For this to work a browser that supports +# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser). +# Windows users are probably better off using the HTML help feature. + +GENERATE_TREEVIEW = NO + +# By enabling USE_INLINE_TREES, doxygen will generate the Groups, Directories, +# and Class Hierarchy pages using a tree view instead of an ordered list. + +USE_INLINE_TREES = NO + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be +# used to set the initial width (in pixels) of the frame in which the tree +# is shown. + +TREEVIEW_WIDTH = 250 + +# Use this tag to change the font size of Latex formulas included +# as images in the HTML documentation. The default is 10. Note that +# when you change the font size after a successful doxygen run you need +# to manually remove any form_*.png images from the HTML output directory +# to force them to be regenerated. 
+ +FORMULA_FONTSIZE = 10 + +# When the SEARCHENGINE tag is enabled doxygen will generate a search box +# for the HTML output. The underlying search engine uses javascript +# and DHTML and should work on any modern browser. Note that when using +# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets +# (GENERATE_DOCSET) there is already a search function so this one should +# typically be disabled. For large projects the javascript based search engine +# can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution. + +SEARCHENGINE = YES + +# When the SERVER_BASED_SEARCH tag is enabled the search engine will be +# implemented using a PHP enabled web server instead of at the web client +# using Javascript. Doxygen will generate the search PHP script and index +# file to put on the web server. The advantage of the server +# based approach is that it scales better to large projects and allows +# full text search. The disadvantage is that it is more difficult to set up +# and does not have live searching capabilities. + +SERVER_BASED_SEARCH = NO + +#--------------------------------------------------------------------------- +# configuration options related to the LaTeX output +#--------------------------------------------------------------------------- + +# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will +# generate Latex output. + +GENERATE_LATEX = NO + +# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `latex' will be used as the default path. + +LATEX_OUTPUT = latex + +# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be +# invoked. If left blank `latex' will be used as the default command name. 
+# Note that when enabling USE_PDFLATEX this option is only used for +# generating bitmaps for formulas in the HTML output, but not in the +# Makefile that is written to the output directory. + +LATEX_CMD_NAME = latex + +# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to +# generate index for LaTeX. If left blank `makeindex' will be used as the +# default command name. + +MAKEINDEX_CMD_NAME = makeindex + +# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact +# LaTeX documents. This may be useful for small projects and may help to +# save some trees in general. + +COMPACT_LATEX = NO + +# The PAPER_TYPE tag can be used to set the paper type that is used +# by the printer. Possible values are: a4, a4wide, letter, legal and +# executive. If left blank a4wide will be used. + +PAPER_TYPE = a4wide + +# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX +# packages that should be included in the LaTeX output. + +EXTRA_PACKAGES = + +# The LATEX_HEADER tag can be used to specify a personal LaTeX header for +# the generated latex document. The header should contain everything until +# the first chapter. If it is left blank doxygen will generate a +# standard header. Notice: only use this tag if you know what you are doing! + +LATEX_HEADER = + +# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated +# is prepared for conversion to pdf (using ps2pdf). The pdf file will +# contain links (just like the HTML output) instead of page references. +# This makes the output suitable for online browsing using a pdf viewer. + +PDF_HYPERLINKS = YES + +# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of +# plain latex in the generated Makefile. Set this option to YES to get a +# higher quality PDF documentation. + +USE_PDFLATEX = YES + +# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode +# command to the generated LaTeX files. 
This will instruct LaTeX to keep +# running if errors occur, instead of asking the user for help. +# This option is also used when generating formulas in HTML. + +LATEX_BATCHMODE = NO + +# If LATEX_HIDE_INDICES is set to YES then doxygen will not +# include the index chapters (such as File Index, Compound Index, etc.) +# in the output. + +LATEX_HIDE_INDICES = NO + +# If LATEX_SOURCE_CODE is set to YES then doxygen will include +# source code with syntax highlighting in the LaTeX output. +# Note that which sources are shown also depends on other settings +# such as SOURCE_BROWSER. + +LATEX_SOURCE_CODE = NO + +#--------------------------------------------------------------------------- +# configuration options related to the RTF output +#--------------------------------------------------------------------------- + +# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output +# The RTF output is optimized for Word 97 and may not look very pretty with +# other RTF readers or editors. + +GENERATE_RTF = NO + +# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `rtf' will be used as the default path. + +RTF_OUTPUT = rtf + +# If the COMPACT_RTF tag is set to YES Doxygen generates more compact +# RTF documents. This may be useful for small projects and may help to +# save some trees in general. + +COMPACT_RTF = NO + +# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated +# will contain hyperlink fields. The RTF file will +# contain links (just like the HTML output) instead of page references. +# This makes the output suitable for online browsing using WORD or other +# programs which support those fields. +# Note: wordpad (write) and others do not support links. + +RTF_HYPERLINKS = NO + +# Load stylesheet definitions from file. Syntax is similar to doxygen's +# config file, i.e. a series of assignments. 
You only have to provide +# replacements, missing definitions are set to their default value. + +RTF_STYLESHEET_FILE = + +# Set optional variables used in the generation of an rtf document. +# Syntax is similar to doxygen's config file. + +RTF_EXTENSIONS_FILE = + +#--------------------------------------------------------------------------- +# configuration options related to the man page output +#--------------------------------------------------------------------------- + +# If the GENERATE_MAN tag is set to YES (the default) Doxygen will +# generate man pages + +GENERATE_MAN = NO + +# The MAN_OUTPUT tag is used to specify where the man pages will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `man' will be used as the default path. + +MAN_OUTPUT = man + +# The MAN_EXTENSION tag determines the extension that is added to +# the generated man pages (default is the subroutine's section .3) + +MAN_EXTENSION = .3 + +# If the MAN_LINKS tag is set to YES and Doxygen generates man output, +# then it will generate one additional man file for each entity +# documented in the real man page(s). These additional files +# only source the real man page, but without them the man command +# would be unable to find the correct page. The default is NO. + +MAN_LINKS = NO + +#--------------------------------------------------------------------------- +# configuration options related to the XML output +#--------------------------------------------------------------------------- + +# If the GENERATE_XML tag is set to YES Doxygen will +# generate an XML file that captures the structure of +# the code including all documentation. + +GENERATE_XML = NO + +# The XML_OUTPUT tag is used to specify where the XML pages will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `xml' will be used as the default path. 
+ +XML_OUTPUT = xml + +# The XML_SCHEMA tag can be used to specify an XML schema, +# which can be used by a validating XML parser to check the +# syntax of the XML files. + +XML_SCHEMA = + +# The XML_DTD tag can be used to specify an XML DTD, +# which can be used by a validating XML parser to check the +# syntax of the XML files. + +XML_DTD = + +# If the XML_PROGRAMLISTING tag is set to YES Doxygen will +# dump the program listings (including syntax highlighting +# and cross-referencing information) to the XML output. Note that +# enabling this will significantly increase the size of the XML output. + +XML_PROGRAMLISTING = YES + +#--------------------------------------------------------------------------- +# configuration options for the AutoGen Definitions output +#--------------------------------------------------------------------------- + +# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will +# generate an AutoGen Definitions (see autogen.sf.net) file +# that captures the structure of the code including all +# documentation. Note that this feature is still experimental +# and incomplete at the moment. + +GENERATE_AUTOGEN_DEF = NO + +#--------------------------------------------------------------------------- +# configuration options related to the Perl module output +#--------------------------------------------------------------------------- + +# If the GENERATE_PERLMOD tag is set to YES Doxygen will +# generate a Perl module file that captures the structure of +# the code including all documentation. Note that this +# feature is still experimental and incomplete at the +# moment. + +GENERATE_PERLMOD = NO + +# If the PERLMOD_LATEX tag is set to YES Doxygen will generate +# the necessary Makefile rules, Perl scripts and LaTeX code to be able +# to generate PDF and DVI output from the Perl module output. 
+ +PERLMOD_LATEX = NO + +# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be +# nicely formatted so it can be parsed by a human reader. This is useful +# if you want to understand what is going on. On the other hand, if this +# tag is set to NO the size of the Perl module output will be much smaller +# and Perl will parse it just the same. + +PERLMOD_PRETTY = YES + +# The names of the make variables in the generated doxyrules.make file +# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. +# This is useful so different doxyrules.make files included by the same +# Makefile don't overwrite each other's variables. + +PERLMOD_MAKEVAR_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the preprocessor +#--------------------------------------------------------------------------- + +# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will +# evaluate all C-preprocessor directives found in the sources and include +# files. + +ENABLE_PREPROCESSING = YES + +# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro +# names in the source code. If set to NO (the default) only conditional +# compilation will be performed. Macro expansion can be done in a controlled +# way by setting EXPAND_ONLY_PREDEF to YES. + +MACRO_EXPANSION = NO + +# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES +# then the macro expansion is limited to the macros specified with the +# PREDEFINED and EXPAND_AS_DEFINED tags. + +EXPAND_ONLY_PREDEF = NO + +# If the SEARCH_INCLUDES tag is set to YES (the default) the include files +# in the INCLUDE_PATH (see below) will be searched if a #include is found. + +SEARCH_INCLUDES = YES + +# The INCLUDE_PATH tag can be used to specify one or more directories that +# contain include files that are not input files but should be processed by +# the preprocessor. 
+ +INCLUDE_PATH = + +# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard +# patterns (like *.h and *.hpp) to filter out the header-files in the +# directories. If left blank, the patterns specified with FILE_PATTERNS will +# be used. + +INCLUDE_FILE_PATTERNS = + +# The PREDEFINED tag can be used to specify one or more macro names that +# are defined before the preprocessor is started (similar to the -D option of +# gcc). The argument of the tag is a list of macros of the form: name +# or name=definition (no spaces). If the definition and the = are +# omitted =1 is assumed. To prevent a macro definition from being +# undefined via #undef or recursively expanded use the := operator +# instead of the = operator. + +PREDEFINED = + +# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then +# this tag can be used to specify a list of macro names that should be expanded. +# The macro definition that is found in the sources will be used. +# Use the PREDEFINED tag if you want to use a different macro definition. + +EXPAND_AS_DEFINED = + +# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then +# doxygen's preprocessor will remove all function-like macros that are alone +# on a line, have an all uppercase name, and do not end with a semicolon. Such +# function macros are typically used for boiler-plate code, and will confuse +# the parser if not removed. + +SKIP_FUNCTION_MACROS = YES + +#--------------------------------------------------------------------------- +# Configuration::additions related to external references +#--------------------------------------------------------------------------- + +# The TAGFILES option can be used to specify one or more tagfiles. +# Optionally an initial location of the external documentation +# can be added for each tagfile. The format of a tag file without +# this location is as follows: +# TAGFILES = file1 file2 ... 
+# Adding location for the tag files is done as follows: +# TAGFILES = file1=loc1 "file2 = loc2" ... +# where "loc1" and "loc2" can be relative or absolute paths or +# URLs. If a location is present for each tag, the installdox tool +# does not have to be run to correct the links. +# Note that each tag file must have a unique name +# (where the name does NOT include the path) +# If a tag file is not located in the directory in which doxygen +# is run, you must also specify the path to the tagfile here. + +TAGFILES = + +# When a file name is specified after GENERATE_TAGFILE, doxygen will create +# a tag file that is based on the input files it reads. + +GENERATE_TAGFILE = + +# If the ALLEXTERNALS tag is set to YES all external classes will be listed +# in the class index. If set to NO only the inherited external classes +# will be listed. + +ALLEXTERNALS = NO + +# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed +# in the modules index. If set to NO, only the current project's groups will +# be listed. + +EXTERNAL_GROUPS = NO + +# The PERL_PATH should be the absolute path and name of the perl script +# interpreter (i.e. the result of `which perl'). + +PERL_PATH = /usr/bin/perl + +#--------------------------------------------------------------------------- +# Configuration options related to the dot tool +#--------------------------------------------------------------------------- + +# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will +# generate a inheritance diagram (in HTML, RTF and LaTeX) for classes with base +# or super classes. Setting the tag to NO turns the diagrams off. Note that +# this option is superseded by the HAVE_DOT option below. This is only a +# fallback. It is recommended to install and use dot, since it yields more +# powerful graphs. + +CLASS_DIAGRAMS = NO + +# You can define message sequence charts within doxygen comments using the \msc +# command. 
Doxygen will then run the mscgen tool (see +# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the +# documentation. The MSCGEN_PATH tag allows you to specify the directory where +# the mscgen tool resides. If left empty the tool is assumed to be found in the +# default search path. + +MSCGEN_PATH = + +# If set to YES, the inheritance and collaboration graphs will hide +# inheritance and usage relations if the target is undocumented +# or is not a class. + +HIDE_UNDOC_RELATIONS = YES + +# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is +# available from the path. This tool is part of Graphviz, a graph visualization +# toolkit from AT&T and Lucent Bell Labs. The other options in this section +# have no effect if this option is set to NO (the default) + +HAVE_DOT = NO + +# By default doxygen will write a font called FreeSans.ttf to the output +# directory and reference it in all dot files that doxygen generates. This +# font does not include all possible unicode characters however, so when you need +# these (or just want a differently looking font) you can specify the font name +# using DOT_FONTNAME. You need to make sure dot is able to find the font, +# which can be done by putting it in a standard location or by setting the +# DOTFONTPATH environment variable or by setting DOT_FONTPATH to the directory +# containing the font. + +DOT_FONTNAME = FreeSans + +# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs. +# The default size is 10pt. + +DOT_FONTSIZE = 10 + +# By default doxygen will tell dot to use the output directory to look for the +# FreeSans.ttf font (which doxygen will put there itself). If you specify a +# different font using DOT_FONTNAME you can set the path where dot +# can find it using this tag. 
+ +DOT_FONTPATH = + +# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for each documented class showing the direct and +# indirect inheritance relations. Setting this tag to YES will force the +# CLASS_DIAGRAMS tag to NO. + +CLASS_GRAPH = YES + +# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for each documented class showing the direct and +# indirect implementation dependencies (inheritance, containment, and +# class references variables) of the class with other documented classes. + +COLLABORATION_GRAPH = YES + +# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for groups, showing the direct groups dependencies + +GROUP_GRAPHS = YES + +# If the UML_LOOK tag is set to YES doxygen will generate inheritance and +# collaboration diagrams in a style similar to the OMG's Unified Modeling +# Language. + +UML_LOOK = NO + +# If set to YES, the inheritance and collaboration graphs will show the +# relations between templates and their instances. + +TEMPLATE_RELATIONS = NO + +# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT +# tags are set to YES then doxygen will generate a graph for each documented +# file showing the direct and indirect include dependencies of the file with +# other documented files. + +INCLUDE_GRAPH = YES + +# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and +# HAVE_DOT tags are set to YES then doxygen will generate a graph for each +# documented header file showing the documented files that directly or +# indirectly include this file. + +INCLUDED_BY_GRAPH = YES + +# If the CALL_GRAPH and HAVE_DOT options are set to YES then +# doxygen will generate a call dependency graph for every global function +# or class method. Note that enabling this option will significantly increase +# the time of a run. 
So in most cases it will be better to enable call graphs +# for selected functions only using the \callgraph command. + +CALL_GRAPH = NO + +# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then +# doxygen will generate a caller dependency graph for every global function +# or class method. Note that enabling this option will significantly increase +# the time of a run. So in most cases it will be better to enable caller +# graphs for selected functions only using the \callergraph command. + +CALLER_GRAPH = NO + +# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen +# will generate a graphical hierarchy of all classes instead of a textual one. + +GRAPHICAL_HIERARCHY = YES + +# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES +# then doxygen will show the dependencies a directory has on other directories +# in a graphical way. The dependency relations are determined by the #include +# relations between the files in the directories. + +DIRECTORY_GRAPH = YES + +# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images +# generated by dot. Possible values are png, jpg, or gif. +# If left blank png will be used. + +DOT_IMAGE_FORMAT = png + +# The tag DOT_PATH can be used to specify the path where the dot tool can be +# found. If left blank, it is assumed the dot tool can be found in the path. + +DOT_PATH = + +# The DOTFILE_DIRS tag can be used to specify one or more directories that +# contain dot files that are included in the documentation (see the +# \dotfile command). + +DOTFILE_DIRS = + +# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of +# nodes that will be shown in the graph. If the number of nodes in a graph +# becomes larger than this value, doxygen will truncate the graph, which is +# visualized by representing a node as a red box. 
Note that if the +# number of direct children of the root node in a graph is already larger than +# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note +# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH. + +DOT_GRAPH_MAX_NODES = 50 + +# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the +# graphs generated by dot. A depth value of 3 means that only nodes reachable +# from the root by following a path via at most 3 edges will be shown. Nodes +# that lie further from the root node will be omitted. Note that setting this +# option to 1 or 2 may greatly reduce the computation time needed for large +# code bases. Also note that the size of a graph can be further restricted by +# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction. + +MAX_DOT_GRAPH_DEPTH = 0 + +# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent +# background. This is disabled by default, because dot on Windows does not +# seem to support this out of the box. Warning: Depending on the platform used, +# enabling this option may lead to badly anti-aliased labels on the edges of +# a graph (i.e. they become hard to read). + +DOT_TRANSPARENT = NO + +# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output +# files in one run (i.e. multiple -o and -T options on the command line). This +# makes dot run faster, but since only newer versions of dot (>1.8.10) +# support this, this feature is disabled by default. + +DOT_MULTI_TARGETS = NO + +# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will +# generate a legend page explaining the meaning of the various boxes and +# arrows in the dot generated graphs. + +GENERATE_LEGEND = YES + +# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will +# remove the intermediate dot files that are used to generate +# the various graphs. 
+ +DOT_CLEANUP = YES diff --git a/src/api/BGZF.cpp b/src/api/BGZF.cpp deleted file mode 100644 index 701fa7f..0000000 --- a/src/api/BGZF.cpp +++ /dev/null @@ -1,398 +0,0 @@ -// *************************************************************************** -// BGZF.cpp (c) 2009 Derek Barnett, Michael Str�mberg -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 19 November 2010 (DB) -// --------------------------------------------------------------------------- -// BGZF routines were adapted from the bgzf.c code developed at the Broad -// Institute. -// --------------------------------------------------------------------------- -// Provides the basic functionality for reading & writing BGZF files -// *************************************************************************** - -#include -using namespace BamTools; - -#include -using namespace std; - -BgzfData::BgzfData(void) - : UncompressedBlockSize(DEFAULT_BLOCK_SIZE) - , CompressedBlockSize(MAX_BLOCK_SIZE) - , BlockLength(0) - , BlockOffset(0) - , BlockAddress(0) - , IsOpen(false) - , IsWriteOnly(false) - , IsWriteUncompressed(false) - , Stream(NULL) - , UncompressedBlock(NULL) - , CompressedBlock(NULL) -{ - try { - CompressedBlock = new char[CompressedBlockSize]; - UncompressedBlock = new char[UncompressedBlockSize]; - } catch( std::bad_alloc& ba ) { - fprintf(stderr, "BGZF ERROR: unable to allocate memory for our BGZF object.\n"); - exit(1); - } -} - -// destructor -BgzfData::~BgzfData(void) { - if( CompressedBlock ) delete[] CompressedBlock; - if( UncompressedBlock ) delete[] UncompressedBlock; -} - -// closes BGZF file -void BgzfData::Close(void) { - - // skip if file not open, otherwise set flag - if ( !IsOpen ) return; - - // if writing to file, flush the current BGZF block, - // then write an empty block (as EOF marker) - if ( IsWriteOnly ) { - FlushBlock(); - int blockLength = 
DeflateBlock(); - fwrite(CompressedBlock, 1, blockLength, Stream); - } - - // flush and close - fflush(Stream); - fclose(Stream); - IsWriteUncompressed = false; - IsOpen = false; -} - -// compresses the current block -int BgzfData::DeflateBlock(void) { - - // initialize the gzip header - char* buffer = CompressedBlock; - memset(buffer, 0, 18); - buffer[0] = GZIP_ID1; - buffer[1] = (char)GZIP_ID2; - buffer[2] = CM_DEFLATE; - buffer[3] = FLG_FEXTRA; - buffer[9] = (char)OS_UNKNOWN; - buffer[10] = BGZF_XLEN; - buffer[12] = BGZF_ID1; - buffer[13] = BGZF_ID2; - buffer[14] = BGZF_LEN; - - // set compression level - const int compressionLevel = ( IsWriteUncompressed ? 0 : Z_DEFAULT_COMPRESSION ); - - // loop to retry for blocks that do not compress enough - int inputLength = BlockOffset; - int compressedLength = 0; - unsigned int bufferSize = CompressedBlockSize; - - while ( true ) { - - // initialize zstream values - z_stream zs; - zs.zalloc = NULL; - zs.zfree = NULL; - zs.next_in = (Bytef*)UncompressedBlock; - zs.avail_in = inputLength; - zs.next_out = (Bytef*)&buffer[BLOCK_HEADER_LENGTH]; - zs.avail_out = bufferSize - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH; - - // initialize the zlib compression algorithm - if ( deflateInit2(&zs, compressionLevel, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK ) { - fprintf(stderr, "BGZF ERROR: zlib deflate initialization failed.\n"); - exit(1); - } - - // compress the data - int status = deflate(&zs, Z_FINISH); - if ( status != Z_STREAM_END ) { - - deflateEnd(&zs); - - // reduce the input length and try again - if ( status == Z_OK ) { - inputLength -= 1024; - if( inputLength < 0 ) { - fprintf(stderr, "BGZF ERROR: input reduction failed.\n"); - exit(1); - } - continue; - } - - fprintf(stderr, "BGZF ERROR: zlib::deflateEnd() failed.\n"); - exit(1); - } - - // finalize the compression routine - if ( deflateEnd(&zs) != Z_OK ) { - fprintf(stderr, "BGZF ERROR: zlib::deflateEnd() failed.\n"); - exit(1); - } 
- - compressedLength = zs.total_out; - compressedLength += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH; - if ( compressedLength > MAX_BLOCK_SIZE ) { - fprintf(stderr, "BGZF ERROR: deflate overflow.\n"); - exit(1); - } - - break; - } - - // store the compressed length - BgzfData::PackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1)); - - // store the CRC32 checksum - unsigned int crc = crc32(0, NULL, 0); - crc = crc32(crc, (Bytef*)UncompressedBlock, inputLength); - BgzfData::PackUnsignedInt(&buffer[compressedLength - 8], crc); - BgzfData::PackUnsignedInt(&buffer[compressedLength - 4], inputLength); - - // ensure that we have less than a block of data left - int remaining = BlockOffset - inputLength; - if ( remaining > 0 ) { - if ( remaining > inputLength ) { - fprintf(stderr, "BGZF ERROR: after deflate, remainder too large.\n"); - exit(1); - } - memcpy(UncompressedBlock, UncompressedBlock + inputLength, remaining); - } - - BlockOffset = remaining; - return compressedLength; -} - -// flushes the data in the BGZF block -void BgzfData::FlushBlock(void) { - - // flush all of the remaining blocks - while ( BlockOffset > 0 ) { - - // compress the data block - int blockLength = DeflateBlock(); - - // flush the data to our output stream - int numBytesWritten = fwrite(CompressedBlock, 1, blockLength, Stream); - - if ( numBytesWritten != blockLength ) { - fprintf(stderr, "BGZF ERROR: expected to write %u bytes during flushing, but wrote %u bytes.\n", blockLength, numBytesWritten); - exit(1); - } - - BlockAddress += blockLength; - } -} - -// de-compresses the current block -int BgzfData::InflateBlock(const int& blockLength) { - - // Inflate the block in m_BGZF.CompressedBlock into m_BGZF.UncompressedBlock - z_stream zs; - zs.zalloc = NULL; - zs.zfree = NULL; - zs.next_in = (Bytef*)CompressedBlock + 18; - zs.avail_in = blockLength - 16; - zs.next_out = (Bytef*)UncompressedBlock; - zs.avail_out = UncompressedBlockSize; - - int status = inflateInit2(&zs, 
GZIP_WINDOW_BITS); - if ( status != Z_OK ) { - fprintf(stderr, "BGZF ERROR: could not decompress block - zlib::inflateInit() failed\n"); - return -1; - } - - status = inflate(&zs, Z_FINISH); - if ( status != Z_STREAM_END ) { - inflateEnd(&zs); - fprintf(stderr, "BGZF ERROR: could not decompress block - zlib::inflate() failed\n"); - return -1; - } - - status = inflateEnd(&zs); - if ( status != Z_OK ) { - fprintf(stderr, "BGZF ERROR: could not decompress block - zlib::inflateEnd() failed\n"); - return -1; - } - - return zs.total_out; -} - -// opens the BGZF file for reading (mode is either "rb" for reading, or "wb" for writing) -bool BgzfData::Open(const string& filename, const char* mode, bool isWriteUncompressed ) { - - // determine open mode - if ( strcmp(mode, "rb") == 0 ) - IsWriteOnly = false; - else if ( strcmp(mode, "wb") == 0) - IsWriteOnly = true; - else { - fprintf(stderr, "BGZF ERROR: unknown file mode: %s\n", mode); - return false; - } - - // ---------------------------------------------------------------- - // open Stream to read to/write from file, stdin, or stdout - // stdin/stdout option contributed by Aaron Quinlan (2010-Jan-03) - - // read/write BGZF data to/from a file - if ( (filename != "stdin") && (filename != "stdout") ) - Stream = fopen(filename.c_str(), mode); - - // read BGZF data from stdin - else if ( (filename == "stdin") && (strcmp(mode, "rb") == 0 ) ) - Stream = freopen(NULL, mode, stdin); - - // write BGZF data to stdout - else if ( (filename == "stdout") && (strcmp(mode, "wb") == 0) ) - Stream = freopen(NULL, mode, stdout); - - if ( !Stream ) { - fprintf(stderr, "BGZF ERROR: unable to open file %s\n", filename.c_str() ); - return false; - } - - // set flags, return success - IsOpen = true; - IsWriteUncompressed = isWriteUncompressed; - return true; -} - -// reads BGZF data into a byte buffer -int BgzfData::Read(char* data, const unsigned int dataLength) { - - if ( !IsOpen || IsWriteOnly || dataLength == 0 ) return 0; - - char* output 
= data; - unsigned int numBytesRead = 0; - while ( numBytesRead < dataLength ) { - - int bytesAvailable = BlockLength - BlockOffset; - if ( bytesAvailable <= 0 ) { - if ( !ReadBlock() ) return -1; - bytesAvailable = BlockLength - BlockOffset; - if ( bytesAvailable <= 0 ) break; - } - - char* buffer = UncompressedBlock; - int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable ); - memcpy(output, buffer + BlockOffset, copyLength); - - BlockOffset += copyLength; - output += copyLength; - numBytesRead += copyLength; - } - - if ( BlockOffset == BlockLength ) { - BlockAddress = ftell64(Stream); - BlockOffset = 0; - BlockLength = 0; - } - - return numBytesRead; -} - -// reads a BGZF block -bool BgzfData::ReadBlock(void) { - - char header[BLOCK_HEADER_LENGTH]; - int64_t blockAddress = ftell64(Stream); - - int count = fread(header, 1, sizeof(header), Stream); - if ( count == 0 ) { - BlockLength = 0; - return true; - } - - if ( count != sizeof(header) ) { - fprintf(stderr, "BGZF ERROR: read block failed - could not read block header\n"); - return false; - } - - if ( !BgzfData::CheckBlockHeader(header) ) { - fprintf(stderr, "BGZF ERROR: read block failed - invalid block header\n"); - return false; - } - - int blockLength = BgzfData::UnpackUnsignedShort(&header[16]) + 1; - char* compressedBlock = CompressedBlock; - memcpy(compressedBlock, header, BLOCK_HEADER_LENGTH); - int remaining = blockLength - BLOCK_HEADER_LENGTH; - - count = fread(&compressedBlock[BLOCK_HEADER_LENGTH], 1, remaining, Stream); - if ( count != remaining ) { - fprintf(stderr, "BGZF ERROR: read block failed - could not read data from block\n"); - return false; - } - - count = InflateBlock(blockLength); - if ( count < 0 ) { - fprintf(stderr, "BGZF ERROR: read block failed - could not decompress block data\n"); - return false; - } - - if ( BlockLength != 0 ) - BlockOffset = 0; - - BlockAddress = blockAddress; - BlockLength = count; - return true; -} - -// seek to position in BGZF file -bool 
BgzfData::Seek(int64_t position) { - - if ( !IsOpen ) return false; - - int blockOffset = (position & 0xFFFF); - int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL; - - if ( fseek64(Stream, blockAddress, SEEK_SET) != 0 ) { - fprintf(stderr, "BGZF ERROR: unable to seek in file\n"); - return false; - } - - BlockLength = 0; - BlockAddress = blockAddress; - BlockOffset = blockOffset; - return true; -} - -// get file position in BGZF file -int64_t BgzfData::Tell(void) { - if ( !IsOpen ) - return false; - else - return ( (BlockAddress << 16) | (BlockOffset & 0xFFFF) ); -} - -// writes the supplied data into the BGZF buffer -unsigned int BgzfData::Write(const char* data, const unsigned int dataLen) { - - if ( !IsOpen || !IsWriteOnly ) return false; - - // initialize - unsigned int numBytesWritten = 0; - const char* input = data; - unsigned int blockLength = UncompressedBlockSize; - - // copy the data to the buffer - while ( numBytesWritten < dataLen ) { - - unsigned int copyLength = min(blockLength - BlockOffset, dataLen - numBytesWritten); - char* buffer = UncompressedBlock; - memcpy(buffer + BlockOffset, input, copyLength); - - BlockOffset += copyLength; - input += copyLength; - numBytesWritten += copyLength; - - if ( BlockOffset == blockLength ) - FlushBlock(); - } - - return numBytesWritten; -} diff --git a/src/api/BGZF.h b/src/api/BGZF.h deleted file mode 100644 index 0ee0286..0000000 --- a/src/api/BGZF.h +++ /dev/null @@ -1,322 +0,0 @@ -// *************************************************************************** -// BGZF.h (c) 2009 Derek Barnett, Michael Str�mberg -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 19 November 2010 (DB) -// --------------------------------------------------------------------------- -// BGZF routines were adapted from the bgzf.c code developed at the Broad -// Institute. 
-// --------------------------------------------------------------------------- -// Provides the basic functionality for reading & writing BGZF files -// *************************************************************************** - -#ifndef BGZF_H -#define BGZF_H - -#include -#include "zlib.h" - -#include -#include -#include -#include - -// Platform-specific large-file support -#ifndef BAMTOOLS_LFS -#define BAMTOOLS_LFS - #ifdef WIN32 - #define ftell64(a) _ftelli64(a) - #define fseek64(a,b,c) _fseeki64(a,b,c) - #else - #define ftell64(a) ftello(a) - #define fseek64(a,b,c) fseeko(a,b,c) - #endif -#endif // BAMTOOLS_LFS - -// Platform-specific type definitions -#ifndef BAMTOOLS_TYPES -#define BAMTOOLS_TYPES - #ifdef _MSC_VER - typedef char int8_t; - typedef unsigned char uint8_t; - typedef short int16_t; - typedef unsigned short uint16_t; - typedef int int32_t; - typedef unsigned int uint32_t; - typedef long long int64_t; - typedef unsigned long long uint64_t; - #else - #include - #endif -#endif // BAMTOOLS_TYPES - -namespace BamTools { - -// zlib constants -const int GZIP_ID1 = 31; -const int GZIP_ID2 = 139; -const int CM_DEFLATE = 8; -const int FLG_FEXTRA = 4; -const int OS_UNKNOWN = 255; -const int BGZF_XLEN = 6; -const int BGZF_ID1 = 66; -const int BGZF_ID2 = 67; -const int BGZF_LEN = 2; -const int GZIP_WINDOW_BITS = -15; -const int Z_DEFAULT_MEM_LEVEL = 8; - -// BZGF constants -const int BLOCK_HEADER_LENGTH = 18; -const int BLOCK_FOOTER_LENGTH = 8; -const int MAX_BLOCK_SIZE = 65536; -const int DEFAULT_BLOCK_SIZE = 65536; - -struct API_EXPORT BgzfData { - - // data members - public: - unsigned int UncompressedBlockSize; - unsigned int CompressedBlockSize; - unsigned int BlockLength; - unsigned int BlockOffset; - uint64_t BlockAddress; - bool IsOpen; - bool IsWriteOnly; - bool IsWriteUncompressed; - FILE* Stream; - char* UncompressedBlock; - char* CompressedBlock; - - // constructor & destructor - public: - BgzfData(void); - ~BgzfData(void); - - // main interface 
methods - public: - // closes BGZF file - void Close(void); - // opens the BGZF file (mode is either "rb" for reading, or "wb" for writing) - bool Open(const std::string& filename, const char* mode, bool isWriteUncompressed = false); - // reads BGZF data into a byte buffer - int Read(char* data, const unsigned int dataLength); - // seek to position in BGZF file - bool Seek(int64_t position); - // get file position in BGZF file - int64_t Tell(void); - // writes the supplied data into the BGZF buffer - unsigned int Write(const char* data, const unsigned int dataLen); - - // internal methods - private: - // compresses the current block - int DeflateBlock(void); - // flushes the data in the BGZF block - void FlushBlock(void); - // de-compresses the current block - int InflateBlock(const int& blockLength); - // reads a BGZF block - bool ReadBlock(void); - - // static 'utility' methods - public: - // checks BGZF block header - static inline bool CheckBlockHeader(char* header); - // packs an unsigned integer into the specified buffer - static inline void PackUnsignedInt(char* buffer, unsigned int value); - // packs an unsigned short into the specified buffer - static inline void PackUnsignedShort(char* buffer, unsigned short value); - // unpacks a buffer into a double - static inline double UnpackDouble(char* buffer); - static inline double UnpackDouble(const char* buffer); - // unpacks a buffer into a float - static inline float UnpackFloat(char* buffer); - static inline float UnpackFloat(const char* buffer); - // unpacks a buffer into a signed int - static inline signed int UnpackSignedInt(char* buffer); - static inline signed int UnpackSignedInt(const char* buffer); - // unpacks a buffer into a signed short - static inline signed short UnpackSignedShort(char* buffer); - static inline signed short UnpackSignedShort(const char* buffer); - // unpacks a buffer into an unsigned int - static inline unsigned int UnpackUnsignedInt(char* buffer); - static inline unsigned int 
UnpackUnsignedInt(const char* buffer); - // unpacks a buffer into an unsigned short - static inline unsigned short UnpackUnsignedShort(char* buffer); - static inline unsigned short UnpackUnsignedShort(const char* buffer); -}; - -// ------------------------------------------------------------- -// static 'utility' method implementations - -// checks BGZF block header -inline -bool BgzfData::CheckBlockHeader(char* header) { - return (header[0] == GZIP_ID1 && - header[1] == (char)GZIP_ID2 && - header[2] == Z_DEFLATED && - (header[3] & FLG_FEXTRA) != 0 && - BgzfData::UnpackUnsignedShort(&header[10]) == BGZF_XLEN && - header[12] == BGZF_ID1 && - header[13] == BGZF_ID2 && - BgzfData::UnpackUnsignedShort(&header[14]) == BGZF_LEN ); -} - -// 'packs' an unsigned integer into the specified buffer -inline -void BgzfData::PackUnsignedInt(char* buffer, unsigned int value) { - buffer[0] = (char)value; - buffer[1] = (char)(value >> 8); - buffer[2] = (char)(value >> 16); - buffer[3] = (char)(value >> 24); -} - -// 'packs' an unsigned short into the specified buffer -inline -void BgzfData::PackUnsignedShort(char* buffer, unsigned short value) { - buffer[0] = (char)value; - buffer[1] = (char)(value >> 8); -} - -// 'unpacks' a buffer into a double (includes both non-const & const char* flavors) -inline -double BgzfData::UnpackDouble(char* buffer) { - union { double value; unsigned char valueBuffer[sizeof(double)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - un.valueBuffer[2] = buffer[2]; - un.valueBuffer[3] = buffer[3]; - un.valueBuffer[4] = buffer[4]; - un.valueBuffer[5] = buffer[5]; - un.valueBuffer[6] = buffer[6]; - un.valueBuffer[7] = buffer[7]; - return un.value; -} - -inline -double BgzfData::UnpackDouble(const char* buffer) { - union { double value; unsigned char valueBuffer[sizeof(double)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - un.valueBuffer[2] = buffer[2]; - un.valueBuffer[3] = 
buffer[3]; - un.valueBuffer[4] = buffer[4]; - un.valueBuffer[5] = buffer[5]; - un.valueBuffer[6] = buffer[6]; - un.valueBuffer[7] = buffer[7]; - return un.value; -} - -// 'unpacks' a buffer into a float (includes both non-const & const char* flavors) -inline -float BgzfData::UnpackFloat(char* buffer) { - union { float value; unsigned char valueBuffer[sizeof(float)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - un.valueBuffer[2] = buffer[2]; - un.valueBuffer[3] = buffer[3]; - return un.value; -} - -inline -float BgzfData::UnpackFloat(const char* buffer) { - union { float value; unsigned char valueBuffer[sizeof(float)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - un.valueBuffer[2] = buffer[2]; - un.valueBuffer[3] = buffer[3]; - return un.value; -} - -// 'unpacks' a buffer into a signed int (includes both non-const & const char* flavors) -inline -signed int BgzfData::UnpackSignedInt(char* buffer) { - union { signed int value; unsigned char valueBuffer[sizeof(signed int)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - un.valueBuffer[2] = buffer[2]; - un.valueBuffer[3] = buffer[3]; - return un.value; -} - -inline -signed int BgzfData::UnpackSignedInt(const char* buffer) { - union { signed int value; unsigned char valueBuffer[sizeof(signed int)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - un.valueBuffer[2] = buffer[2]; - un.valueBuffer[3] = buffer[3]; - return un.value; -} - -// 'unpacks' a buffer into a signed short (includes both non-const & const char* flavors) -inline -signed short BgzfData::UnpackSignedShort(char* buffer) { - union { signed short value; unsigned char valueBuffer[sizeof(signed short)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - return un.value; -} - -inline -signed short BgzfData::UnpackSignedShort(const char* buffer) { - 
union { signed short value; unsigned char valueBuffer[sizeof(signed short)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - return un.value; -} - -// 'unpacks' a buffer into an unsigned int (includes both non-const & const char* flavors) -inline -unsigned int BgzfData::UnpackUnsignedInt(char* buffer) { - union { unsigned int value; unsigned char valueBuffer[sizeof(unsigned int)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - un.valueBuffer[2] = buffer[2]; - un.valueBuffer[3] = buffer[3]; - return un.value; -} - -inline -unsigned int BgzfData::UnpackUnsignedInt(const char* buffer) { - union { unsigned int value; unsigned char valueBuffer[sizeof(unsigned int)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - un.valueBuffer[2] = buffer[2]; - un.valueBuffer[3] = buffer[3]; - return un.value; -} - -// 'unpacks' a buffer into an unsigned short (includes both non-const & const char* flavors) -inline -unsigned short BgzfData::UnpackUnsignedShort(char* buffer) { - union { unsigned short value; unsigned char valueBuffer[sizeof(unsigned short)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - return un.value; -} - -inline -unsigned short BgzfData::UnpackUnsignedShort(const char* buffer) { - union { unsigned short value; unsigned char valueBuffer[sizeof(unsigned short)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - return un.value; -} - -} // namespace BamTools - -#endif // BGZF_H diff --git a/src/api/BamAlignment.cpp b/src/api/BamAlignment.cpp index 5538bda..162e195 100644 --- a/src/api/BamAlignment.cpp +++ b/src/api/BamAlignment.cpp @@ -3,12 +3,13 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. 
// --------------------------------------------------------------------------- -// Last modified: 22 December 2010 (DB) +// Last modified: 21 March 2011 (DB) // --------------------------------------------------------------------------- // Provides the BamAlignment data structure // *************************************************************************** #include +#include using namespace BamTools; #include @@ -20,10 +21,189 @@ using namespace BamTools; #include using namespace std; -const char* DNA_LOOKUP = "=ACMGRSVTWYHKDBN"; +// internal utility methods +namespace BamTools { +namespace Internal { -// default ctor -BamAlignment::BamAlignment(void) +/*! \fn bool IsValidSize(const string& tag, const string& type) + \internal + + Checks that tag name & type strings are expected sizes. + \a tag should have length + \a type should have length 1 + + \param tag BAM tag name + \param type BAM tag type-code + + \return \c true if both \a tag and \a type are correct sizes +*/ +bool IsValidSize(const string& tag, const string& type) { + return (tag.size() == Constants::BAM_TAG_TAGSIZE) && + (type.size() == Constants::BAM_TAG_TYPESIZE); +} + +/*! \fn bool SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) + \internal + + Moves to next available tag in tag data string + + \param storageType BAM tag type-code that determines how far to move cursor + \param pTagData pointer to current position (cursor) in tag string + \param numBytesParsed report of how many bytes were parsed (cumulatively) + + \return \c if storageType was a recognized BAM tag type + \post \a pTagData will point to the byte where the next tag data begins. + \a numBytesParsed will correspond to the cursor's position in the full TagData string. 
+*/ +bool SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) { + + switch (storageType) { + + case (Constants::BAM_TAG_TYPE_ASCII) : + case (Constants::BAM_TAG_TYPE_INT8) : + case (Constants::BAM_TAG_TYPE_UINT8) : + ++numBytesParsed; + ++pTagData; + break; + + case (Constants::BAM_TAG_TYPE_INT16) : + case (Constants::BAM_TAG_TYPE_UINT16) : + numBytesParsed += 2; + pTagData += 2; + break; + + case (Constants::BAM_TAG_TYPE_FLOAT) : + case (Constants::BAM_TAG_TYPE_INT32) : + case (Constants::BAM_TAG_TYPE_UINT32) : + numBytesParsed += 4; + pTagData += 4; + break; + + case (Constants::BAM_TAG_TYPE_STRING) : + case (Constants::BAM_TAG_TYPE_HEX) : + while(*pTagData) { + ++numBytesParsed; + ++pTagData; + } + // increment for null-terminator + ++numBytesParsed; + ++pTagData; + break; + + default: + // error case + fprintf(stderr, "BamAlignment ERROR: unknown tag type encountered: [%c]\n", storageType); + return false; + } + + // return success + return true; +} + +/*! \fn bool FindTag(const std::string& tag, char* &pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed) + \internal + + Searches for requested tag in BAM tag data. + + \param tag requested 2-character tag name + \param pTagData pointer to current position in BamAlignment::TagData + \param tagDataLength length of BamAlignment::TagData + \param numBytesParsed number of bytes parsed so far + + \return \c true if found + + \post If \a tag is found, \a pTagData will point to the byte where the tag data begins. + \a numBytesParsed will correspond to the position in the full TagData string. 
+ +*/ +bool FindTag(const std::string& tag, + char* &pTagData, + const unsigned int& tagDataLength, + unsigned int& numBytesParsed) +{ + + while ( numBytesParsed < tagDataLength ) { + + const char* pTagType = pTagData; + const char* pTagStorageType = pTagData + 2; + pTagData += 3; + numBytesParsed += 3; + + // check the current tag, return true on match + if ( strncmp(pTagType, tag.c_str(), 2) == 0 ) + return true; + + // get the storage class and find the next tag + if ( *pTagStorageType == '\0' ) return false; + if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return false; + if ( *pTagData == '\0' ) return false; + } + + // checked all tags, none match + return false; +} + +} // namespace Internal +} // namespace BamTools + +/*! \class BamTools::BamAlignment + \brief The main BAM alignment data structure. + + Provides methods to query/modify BAM alignment data fields. +*/ +/*! \var BamAlignment::Name + \brief read name +*/ +/*! \var BamAlignment::Length + \brief length of query sequence +*/ +/*! \var BamAlignment::QueryBases + \brief 'original' sequence (as reported from sequencing machine) +*/ +/*! \var BamAlignment::AlignedBases + \brief 'aligned' sequence (includes any indels, padding, clipping) +*/ +/*! \var BamAlignment::Qualities + \brief FASTQ qualities (ASCII characters, not numeric values) +*/ +/*! \var BamAlignment::TagData + \brief tag data (use the provided methods to query/modify) +*/ +/*! \var BamAlignment::RefID + \brief ID number for reference sequence +*/ +/*! \var BamAlignment::Position + \brief position (0-based) where alignment starts +*/ +/*! \var BamAlignment::Bin + \brief BAM (standard) index bin number for this alignment +*/ +/*! \var BamAlignment::MapQuality + \brief mapping quality score +*/ +/*! \var BamAlignment::AlignmentFlag + \brief alignment bit-flag (use the provided methods to query/modify) +*/ +/*! \var BamAlignment::CigarData + \brief CIGAR operations for this alignment +*/ +/*! 
\var BamAlignment::MateRefID + \brief ID number for reference sequence where alignment's mate was aligned +*/ +/*! \var BamAlignment::MatePosition + \brief position (0-based) where alignment's mate starts +*/ +/*! \var BamAlignment::InsertSize + \brief mate-pair insert size +*/ +/*! \var BamAlignment::Filename + \brief name of BAM file which this alignment comes from +*/ + +/*! \fn BamAlignment::BamAlignment(void) + \brief constructor +*/ +BamAlignment::BamAlignment(void) : RefID(-1) , Position(-1) , MateRefID(-1) @@ -31,7 +211,9 @@ BamAlignment::BamAlignment(void) , InsertSize(0) { } -// copy ctor +/*! \fn BamAlignment::BamAlignment(const BamAlignment& other) + \brief copy constructor +*/ BamAlignment::BamAlignment(const BamAlignment& other) : Name(other.Name) , Length(other.Length) @@ -48,52 +230,222 @@ BamAlignment::BamAlignment(const BamAlignment& other) , MateRefID(other.MateRefID) , MatePosition(other.MatePosition) , InsertSize(other.InsertSize) + , Filename(other.Filename) , SupportData(other.SupportData) { } -// dtor +/*! 
\fn BamAlignment::~BamAlignment(void) + \brief destructor +*/ BamAlignment::~BamAlignment(void) { } -// Queries against alignment flags -bool BamAlignment::IsDuplicate(void) const { return ( (AlignmentFlag & DUPLICATE) != 0 ); } -bool BamAlignment::IsFailedQC(void) const { return ( (AlignmentFlag & QC_FAILED) != 0 ); } -bool BamAlignment::IsFirstMate(void) const { return ( (AlignmentFlag & READ_1) != 0 ); } -bool BamAlignment::IsMapped(void) const { return ( (AlignmentFlag & UNMAPPED) == 0 ); } -bool BamAlignment::IsMateMapped(void) const { return ( (AlignmentFlag & MATE_UNMAPPED) == 0 ); } -bool BamAlignment::IsMateReverseStrand(void) const { return ( (AlignmentFlag & MATE_REVERSE) != 0 ); } -bool BamAlignment::IsPaired(void) const { return ( (AlignmentFlag & PAIRED) != 0 ); } -bool BamAlignment::IsPrimaryAlignment(void) const { return ( (AlignmentFlag & SECONDARY) == 0 ); } -bool BamAlignment::IsProperPair(void) const { return ( (AlignmentFlag & PROPER_PAIR) != 0 ); } -bool BamAlignment::IsReverseStrand(void) const { return ( (AlignmentFlag & REVERSE) != 0 ); } -bool BamAlignment::IsSecondMate(void) const { return ( (AlignmentFlag & READ_2) != 0 ); } - -// Manipulate alignment flags -void BamAlignment::SetIsDuplicate(bool ok) { if (ok) AlignmentFlag |= DUPLICATE; else AlignmentFlag &= ~DUPLICATE; } -void BamAlignment::SetIsFailedQC(bool ok) { if (ok) AlignmentFlag |= QC_FAILED; else AlignmentFlag &= ~QC_FAILED; } -void BamAlignment::SetIsFirstMate(bool ok) { if (ok) AlignmentFlag |= READ_1; else AlignmentFlag &= ~READ_1; } -void BamAlignment::SetIsMapped(bool ok) { SetIsUnmapped(!ok); } -void BamAlignment::SetIsMateMapped(bool ok) { SetIsMateUnmapped(!ok); } -void BamAlignment::SetIsMateUnmapped(bool ok) { if (ok) AlignmentFlag |= MATE_UNMAPPED; else AlignmentFlag &= ~MATE_UNMAPPED; } -void BamAlignment::SetIsMateReverseStrand(bool ok) { if (ok) AlignmentFlag |= MATE_REVERSE; else AlignmentFlag &= ~MATE_REVERSE; } -void BamAlignment::SetIsPaired(bool ok) { if 
(ok) AlignmentFlag |= PAIRED; else AlignmentFlag &= ~PAIRED; } -void BamAlignment::SetIsPrimaryAlignment(bool ok) { SetIsSecondaryAlignment(!ok); } -void BamAlignment::SetIsProperPair(bool ok) { if (ok) AlignmentFlag |= PROPER_PAIR; else AlignmentFlag &= ~PROPER_PAIR; } -void BamAlignment::SetIsReverseStrand(bool ok) { if (ok) AlignmentFlag |= REVERSE; else AlignmentFlag &= ~REVERSE; } -void BamAlignment::SetIsSecondaryAlignment(bool ok) { if (ok) AlignmentFlag |= SECONDARY; else AlignmentFlag &= ~SECONDARY; } -void BamAlignment::SetIsSecondMate(bool ok) { if (ok) AlignmentFlag |= READ_2; else AlignmentFlag &= ~READ_2; } -void BamAlignment::SetIsUnmapped(bool ok) { if (ok) AlignmentFlag |= UNMAPPED; else AlignmentFlag &= ~UNMAPPED; } - -// fills out character data +/*! \fn bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const std::string& value) + \brief Adds a field with string data to the BAM tags. + + Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead. + + \param tag 2-character tag name + \param type 1-character tag type (must be "Z" or "H") + \param value string data to store + + \return \c true if the \b new tag was added successfully + + \sa http://samtools.sourceforge.net/SAM-1.3.pdf + for more details on reserved tag names, supported tag types, etc. 
+*/ +bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const std::string& value) { + + // skip if core data not parsed + if ( SupportData.HasCoreOnly ) return false; + + // validate tag/type size & that type is OK for string value + if ( !Internal::IsValidSize(tag, type) ) return false; + if ( type.at(0) != Constants::BAM_TAG_TYPE_STRING && + type.at(0) != Constants::BAM_TAG_TYPE_HEX ) + return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if tag already exists, return false + // use EditTag explicitly instead + if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) + return false; + + // otherwise, copy tag data to temp buffer + string newTag = tag + type + value; + const int newTagDataLength = tagDataLength + newTag.size() + 1; // leave room for null-term + char originalTagData[newTagDataLength]; + memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term + + // append newTag + strcat(originalTagData + tagDataLength, newTag.data()); // removes original null-term, appends newTag + null-term + + // store temp buffer back in TagData + const char* newTagData = (const char*)originalTagData; + TagData.assign(newTagData, newTagDataLength); + + // return success + return true; +} + +/*! \fn bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const uint32_t& value) + \brief Adds a field with unsigned integer data to the BAM tags. + + Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead. + + \param tag 2-character tag name + \param type 1-character tag type (must NOT be "f", "Z", or "H") + \param value unsigned int data to store + + \return \c true if the \b new tag was added successfully + \sa http://samtools.sourceforge.net/SAM-1.3.pdf + for more details on reserved tag names, supported tag types, etc. 
+*/ +bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const uint32_t& value) { + + // skip if core data not parsed + if ( SupportData.HasCoreOnly ) return false; + + // validate tag/type size & that type is OK for uint32_t value + if ( !Internal::IsValidSize(tag, type) ) return false; + if ( type.at(0) == Constants::BAM_TAG_TYPE_FLOAT || + type.at(0) == Constants::BAM_TAG_TYPE_STRING || + type.at(0) == Constants::BAM_TAG_TYPE_HEX ) + return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if tag already exists, return false + // use EditTag explicitly instead + if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) + return false; + + // otherwise, convert value to string + union { uint32_t value; char valueBuffer[sizeof(uint32_t)]; } un; + un.value = value; + + // copy original tag data to temp buffer + string newTag = tag + type; + const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new integer + char originalTagData[newTagDataLength]; + memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term + + // append newTag + strcat(originalTagData + tagDataLength, newTag.data()); + memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(uint32_t)); + + // store temp buffer back in TagData + const char* newTagData = (const char*)originalTagData; + TagData.assign(newTagData, newTagDataLength); + + // return success + return true; +} + +/*! \fn bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const int32_t& value) + \brief Adds a field with signed integer data to the BAM tags. + + Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead. 
+ + \param tag 2-character tag name + \param type 1-character tag type (must NOT be "f", "Z", or "H") + \param value signed int data to store + + \return \c true if the \b new tag was added successfully + + \sa http://samtools.sourceforge.net/SAM-1.3.pdf + for more details on reserved tag names, supported tag types, etc. +*/ +bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const int32_t& value) { + return AddTag(tag, type, (const uint32_t&)value); +} + +/*! \fn bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const float& value) + \brief Adds a field with floating-point data to the BAM tags. + + Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead. + + \param tag 2-character tag name + \param type 1-character tag type (must NOT be "Z" or "H") + \param value float data to store + + \return \c true if the \b new tag was added successfully + + \sa http://samtools.sourceforge.net/SAM-1.3.pdf + for more details on reserved tag names, supported tag types, etc. 
+*/ +bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const float& value) { + + // skip if core data not parsed + if ( SupportData.HasCoreOnly ) return false; + + // validate tag/type size & that type is OK for float value + if ( !Internal::IsValidSize(tag, type) ) return false; + if ( type.at(0) == Constants::BAM_TAG_TYPE_STRING || + type.at(0) == Constants::BAM_TAG_TYPE_HEX ) + return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if tag already exists, return false + // use EditTag explicitly instead + if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) + return false; + + // otherwise, convert value to string + union { float value; char valueBuffer[sizeof(float)]; } un; + un.value = value; + + // copy original tag data to temp buffer + string newTag = tag + type; + const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new float + char originalTagData[newTagDataLength]; + memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term + + // append newTag + strcat(originalTagData + tagDataLength, newTag.data()); + memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(float)); + + // store temp buffer back in TagData + const char* newTagData = (const char*)originalTagData; + TagData.assign(newTagData, newTagDataLength); + + // return success + return true; +} + +/*! \fn bool BamAlignment::BuildCharData(void) + \brief Populates alignment string fields (read name, bases, qualities, tag data). + + An alignment retrieved using BamReader::GetNextAlignmentCore() lacks this data. + Using that method makes parsing much quicker when only positional data is required. + + However, if you later want to access the character data fields from such an alignment, + use this method to populate those fields. 
Provides ability to do 'lazy evaluation' of + alignment parsing. + + \return \c true if character data populated successfully (or was already available to begin with) +*/ bool BamAlignment::BuildCharData(void) { // skip if char data already parsed - if ( !SupportData.HasCoreOnly ) return true; + if ( !SupportData.HasCoreOnly ) + return true; // check system endianness bool IsBigEndian = BamTools::SystemIsBigEndian(); // calculate character lengths/offsets - const unsigned int dataLength = SupportData.BlockLength - BAM_CORE_SIZE; + const unsigned int dataLength = SupportData.BlockLength - Constants::BAM_CORE_SIZE; const unsigned int seqDataOffset = SupportData.QueryNameLength + (SupportData.NumCigarOperations * 4); const unsigned int qualDataOffset = seqDataOffset + (SupportData.QuerySequenceLength+1)/2; const unsigned int tagDataOffset = qualDataOffset + SupportData.QuerySequenceLength; @@ -118,7 +470,7 @@ bool BamAlignment::BuildCharData(void) { if ( hasSeqData ) { QueryBases.reserve(SupportData.QuerySequenceLength); for (unsigned int i = 0; i < SupportData.QuerySequenceLength; ++i) { - char singleBase = DNA_LOOKUP[ ( (seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ]; + char singleBase = Constants::BAM_DNA_LOOKUP[ ( (seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ]; QueryBases.append(1, singleBase); } } @@ -148,44 +500,44 @@ bool BamAlignment::BuildCharData(void) { vector<CigarOp>::const_iterator cigarIter = CigarData.begin(); vector<CigarOp>::const_iterator cigarEnd = CigarData.end(); for ( ; cigarIter != cigarEnd; ++cigarIter ) { - const CigarOp& op = (*cigarIter); - switch(op.Type) { + + switch (op.Type) { // for 'M', 'I' - write bases - case ('M') : - case ('I') : + case (Constants::BAM_CIGAR_MATCH_CHAR) : + case (Constants::BAM_CIGAR_INS_CHAR) : AlignedBases.append(QueryBases.substr(k, op.Length)); // fall through // for 'S' - soft clip, do not write bases // but increment placeholder 'k' - case ('S') : + case (Constants::BAM_CIGAR_SOFTCLIP_CHAR) : k += op.Length; break; // for 'D' - 
write gap character - case ('D') : - AlignedBases.append(op.Length, '-'); + case (Constants::BAM_CIGAR_DEL_CHAR) : + AlignedBases.append(op.Length, Constants::BAM_DNA_DEL); break; // for 'P' - write padding character - case ('P') : - AlignedBases.append( op.Length, '*' ); + case (Constants::BAM_CIGAR_PAD_CHAR) : + AlignedBases.append( op.Length, Constants::BAM_DNA_PAD ); break; // for 'N' - write N's, skip bases in original query sequence - case ('N') : - AlignedBases.append( op.Length, 'N' ); + case (Constants::BAM_CIGAR_REFSKIP_CHAR) : + AlignedBases.append( op.Length, Constants::BAM_DNA_N ); break; // for 'H' - hard clip, do nothing to AlignedBases, move to next op - case ('H') : + case (Constants::BAM_CIGAR_HARDCLIP_CHAR) : break; // shouldn't get here default: - fprintf(stderr, "ERROR: Invalid Cigar op type\n"); + fprintf(stderr, "BamAlignment ERROR: invalid CIGAR operation type: %c\n", op.Type); exit(1); } } @@ -198,42 +550,42 @@ bool BamAlignment::BuildCharData(void) { int i = 0; while ( (unsigned int)i < tagDataLength ) { - i += 2; // skip tagType chars (e.g. "RG", "NM", etc.) - uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning - ++i; // skip valueType char (e.g. 'A', 'I', 'Z', etc.) + i += Constants::BAM_TAG_TAGSIZE; // skip tag chars (e.g. "RG", "NM", etc.) 
+ const char type = tagData[i]; // get tag type at position i + ++i; // move i past tag type switch (type) { - case('A') : - case('C') : + case(Constants::BAM_TAG_TYPE_ASCII) : + case(Constants::BAM_TAG_TYPE_INT8) : + case(Constants::BAM_TAG_TYPE_UINT8) : ++i; break; - case('S') : - SwapEndian_16p(&tagData[i]); + case(Constants::BAM_TAG_TYPE_INT16) : + case(Constants::BAM_TAG_TYPE_UINT16) : + BamTools::SwapEndian_16p(&tagData[i]); i += sizeof(uint16_t); break; - case('F') : - case('I') : - SwapEndian_32p(&tagData[i]); + case(Constants::BAM_TAG_TYPE_FLOAT) : + case(Constants::BAM_TAG_TYPE_INT32) : + case(Constants::BAM_TAG_TYPE_UINT32) : + BamTools::SwapEndian_32p(&tagData[i]); i += sizeof(uint32_t); break; - case('D') : - SwapEndian_64p(&tagData[i]); - i += sizeof(uint64_t); - break; - - case('H') : - case('Z') : + case(Constants::BAM_TAG_TYPE_HEX) : + case(Constants::BAM_TAG_TYPE_STRING) : + // no endian swapping necessary for hex-string/string data while (tagData[i]) { ++i; } - ++i; // increment one more for null terminator + // increment one more for null terminator + ++i; break; // shouldn't get here default : - fprintf(stderr, "ERROR: Invalid tag value type\n"); + fprintf(stderr, "BamAlignment ERROR: invalid tag value type: %c\n", type); exit(1); } } @@ -251,145 +603,31 @@ bool BamAlignment::BuildCharData(void) { return true; } -// calculates alignment end position, based on starting position and CIGAR operations -int BamAlignment::GetEndPosition(bool usePadded, bool zeroBased) const { - - // initialize alignment end to starting position - int alignEnd = Position; - - // iterate over cigar operations - vector<CigarOp>::const_iterator cigarIter = CigarData.begin(); - vector<CigarOp>::const_iterator cigarEnd = CigarData.end(); - for ( ; cigarIter != cigarEnd; ++cigarIter) { - const char cigarType = (*cigarIter).Type; - if ( cigarType == 'M' || cigarType == 'D' || cigarType == 'N' ) - alignEnd += (*cigarIter).Length; - else if ( usePadded && cigarType == 'I' ) - alignEnd += 
(*cigarIter).Length; - } - - // adjust for zeroBased, if necessary - if (zeroBased) - return alignEnd - 1; - else - return alignEnd; -} - -bool BamAlignment::AddTag(const string& tag, const string& type, const string& value) { - - if ( SupportData.HasCoreOnly ) return false; - if ( tag.size() != 2 || type.size() != 1 ) return false; - if ( type != "Z" && type != "H" ) return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag already exists, return false - // use EditTag explicitly instead - if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false; - - // otherwise, copy tag data to temp buffer - string newTag = tag + type + value; - const int newTagDataLength = tagDataLength + newTag.size() + 1; // leave room for null-term - char originalTagData[newTagDataLength]; - memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term - - // append newTag - strcat(originalTagData + tagDataLength, newTag.data()); // removes original null-term, appends newTag + null-term - - // store temp buffer back in TagData - const char* newTagData = (const char*)originalTagData; - TagData.assign(newTagData, newTagDataLength); - - // return success - return true; -} +/*! \fn bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const std::string& value) + \brief Edits a BAM tag field containing string data. 
-bool BamAlignment::AddTag(const string& tag, const string& type, const uint32_t& value) { - - if ( SupportData.HasCoreOnly ) return false; - if ( tag.size() != 2 || type.size() != 1 ) return false; - if ( type == "f" || type == "Z" || type == "H" ) return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag already exists, return false - // use EditTag explicitly instead - if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false; - - // otherwise, convert value to string - union { unsigned int value; char valueBuffer[sizeof(unsigned int)]; } un; - un.value = value; + If \a tag does not exist, a new entry is created. - // copy original tag data to temp buffer - string newTag = tag + type; - const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new integer - char originalTagData[newTagDataLength]; - memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term - - // append newTag - strcat(originalTagData + tagDataLength, newTag.data()); - memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(unsigned int)); - - // store temp buffer back in TagData - const char* newTagData = (const char*)originalTagData; - TagData.assign(newTagData, newTagDataLength); - - // return success - return true; -} + \param tag 2-character tag name + \param type 1-character tag type (must be "Z" or "H") + \param value string data to store -bool BamAlignment::AddTag(const string& tag, const string& type, const int32_t& value) { - return AddTag(tag, type, (const uint32_t&)value); -} + \return \c true if the tag was modified/created successfully -bool BamAlignment::AddTag(const string& tag, const string& type, const float& value) { + \sa BamAlignment::RemoveTag() + \sa http://samtools.sourceforge.net/SAM-1.3.pdf + for more details on reserved tag names, supported tag 
types, etc. +*/ +bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const std::string& value) { + // skip if core data not parsed if ( SupportData.HasCoreOnly ) return false; - if ( tag.size() != 2 || type.size() != 1 ) return false; - if ( type == "Z" || type == "H" ) return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag already exists, return false - // use EditTag explicitly instead - if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false; - - // otherwise, convert value to string - union { float value; char valueBuffer[sizeof(float)]; } un; - un.value = value; - - // copy original tag data to temp buffer - string newTag = tag + type; - const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new float - char originalTagData[newTagDataLength]; - memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term - - // append newTag - strcat(originalTagData + tagDataLength, newTag.data()); - memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(float)); - - // store temp buffer back in TagData - const char* newTagData = (const char*)originalTagData; - TagData.assign(newTagData, newTagDataLength); - - // return success - return true; -} -bool BamAlignment::EditTag(const string& tag, const string& type, const string& value) { - - if ( SupportData.HasCoreOnly ) return false; - if ( tag.size() != 2 || type.size() != 1 ) return false; - if ( type != "Z" && type != "H" ) return false; + // validate tag/type size & that type is OK for string value + if ( !Internal::IsValidSize(tag, type) ) return false; + if ( type.at(0) != Constants::BAM_TAG_TYPE_STRING && + type.at(0) != Constants::BAM_TAG_TYPE_HEX ) + return false; // localize the tag data char* pOriginalTagData = (char*)TagData.data(); @@ -400,7 +638,7 @@ bool 
BamAlignment::EditTag(const string& tag, const string& type, const string& unsigned int numBytesParsed = 0; // if tag found, store data in readGroup, return success - if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { + if ( Internal::FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { // make sure array is more than big enough char newTagData[originalTagDataLength + value.size()]; @@ -416,7 +654,8 @@ bool BamAlignment::EditTag(const string& tag, const string& type, const string& // skip to next tag (if tag for removal is last, return true) const char* pTagStorageType = pTagData - 1; - if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true; + if ( !Internal::SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) + return true; // copy everything from current tag (the next one after tag for removal) to end const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); @@ -436,13 +675,34 @@ bool BamAlignment::EditTag(const string& tag, const string& type, const string& else return AddTag(tag, type, value); } -bool BamAlignment::EditTag(const string& tag, const string& type, const uint32_t& value) { +/*! \fn bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const uint32_t& value) + \brief Edits a BAM tag field containing unsigned integer data. + + If \a tag does not exist, a new entry is created. + + \param tag 2-character tag name + \param type 1-character tag type (must NOT be "f", "Z", or "H") + \param value unsigned integer data to store + + \return \c true if the tag was modified/created successfully + + \sa BamAlignment::RemoveTag() + \sa http://samtools.sourceforge.net/SAM-1.3.pdf + for more details on reserved tag names, supported tag types, etc. 
+*/ +bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const uint32_t& value) { + // skip if core data not parsed if ( SupportData.HasCoreOnly ) return false; - if ( tag.size() != 2 || type.size() != 1 ) return false; - if ( type == "f" || type == "Z" || type == "H" ) return false; - - // localize the tag data + + // validate tag/type size & that type is OK for uint32_t value + if ( !Internal::IsValidSize(tag, type) ) return false; + if ( type.at(0) == Constants::BAM_TAG_TYPE_FLOAT || + type.at(0) == Constants::BAM_TAG_TYPE_STRING || + type.at(0) == Constants::BAM_TAG_TYPE_HEX ) + return false; + + // localize the tag data char* pOriginalTagData = (char*)TagData.data(); char* pTagData = pOriginalTagData; const unsigned int originalTagDataLength = TagData.size(); @@ -451,7 +711,7 @@ bool BamAlignment::EditTag(const string& tag, const string& type, const uint32_t unsigned int numBytesParsed = 0; // if tag found, store data in readGroup, return success - if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { + if ( Internal::FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { // make sure array is more than big enough char newTagData[originalTagDataLength + sizeof(value)]; @@ -462,17 +722,18 @@ bool BamAlignment::EditTag(const string& tag, const string& type, const uint32_t memcpy(newTagData, pOriginalTagData, numBytesParsed); // copy new VALUE in place of current tag data - union { unsigned int value; char valueBuffer[sizeof(unsigned int)]; } un; + union { uint32_t value; char valueBuffer[sizeof(uint32_t)]; } un; un.value = value; - memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(unsigned int)); + memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(uint32_t)); // skip to next tag (if tag for removal is last, return true) const char* pTagStorageType = pTagData - 1; - if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true; + if ( 
!Internal::SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) + return true; // copy everything from current tag (the next one after tag for removal) to end const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); - const unsigned int endTagOffset = beginningTagDataLength + sizeof(unsigned int); + const unsigned int endTagOffset = beginningTagDataLength + sizeof(uint32_t); const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; memcpy(newTagData + endTagOffset, pTagData, endTagDataLength); @@ -488,16 +749,51 @@ bool BamAlignment::EditTag(const string& tag, const string& type, const uint32_t else return AddTag(tag, type, value); } -bool BamAlignment::EditTag(const string& tag, const string& type, const int32_t& value) { +/*! \fn bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const int32_t& value) + \brief Edits a BAM tag field containing signed integer data. + + If \a tag does not exist, a new entry is created. + + \param tag 2-character tag name + \param type 1-character tag type (must NOT be "f", "Z", or "H") + \param value signed integer data to store + + \return \c true if the tag was modified/created successfully + + \sa BamAlignment::RemoveTag() + \sa http://samtools.sourceforge.net/SAM-1.3.pdf + for more details on reserved tag names, supported tag types, etc. +*/ +bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const int32_t& value) { return EditTag(tag, type, (const uint32_t&)value); } -bool BamAlignment::EditTag(const string& tag, const string& type, const float& value) { +/*! \fn bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const float& value) + \brief Edits a BAM tag field containing floating-point data. + + If \a tag does not exist, a new entry is created. 
+ + \param tag 2-character tag name + \param type 1-character tag type (must NOT be "Z" or "H") + \param value float data to store + + \return \c true if the tag was modified/created successfully + + \sa BamAlignment::RemoveTag() + \sa http://samtools.sourceforge.net/SAM-1.3.pdf + for more details on reserved tag names, supported tag types, etc. +*/ +bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const float& value) { + // skip if core data not parsed if ( SupportData.HasCoreOnly ) return false; - if ( tag.size() != 2 || type.size() != 1 ) return false; - if ( type == "Z" || type == "H" ) return false; - + + // validate tag/type size & that type is OK for float value + if ( !Internal::IsValidSize(tag, type) ) return false; + if ( type.at(0) == Constants::BAM_TAG_TYPE_STRING || + type.at(0) == Constants::BAM_TAG_TYPE_HEX ) + return false; + // localize the tag data char* pOriginalTagData = (char*)TagData.data(); char* pTagData = pOriginalTagData; @@ -507,7 +803,7 @@ bool BamAlignment::EditTag(const string& tag, const string& type, const float& v unsigned int numBytesParsed = 0; // if tag found, store data in readGroup, return success - if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { + if ( Internal::FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { // make sure array is more than big enough char newTagData[originalTagDataLength + sizeof(value)]; @@ -524,7 +820,8 @@ bool BamAlignment::EditTag(const string& tag, const string& type, const float& v // skip to next tag (if tag for removal is last, return true) const char* pTagStorageType = pTagData - 1; - if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true; + if ( !Internal::SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) + return true; // copy everything from current tag (the next one after tag for removal) to end const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); @@ -544,19 +841,84 @@ 
bool BamAlignment::EditTag(const string& tag, const string& type, const float& v else return AddTag(tag, type, value); } -// get "NM" tag data - originally contributed by Aaron Quinlan -// stores data in 'editDistance', returns success/fail -bool BamAlignment::GetEditDistance(uint32_t& editDistance) const { +/*! \fn bool BamAlignment::GetEditDistance(uint32_t& editDistance) const + \brief Retrieves value of edit distance tag ("NM"). + + \deprecated Instead use BamAlignment::GetTag() + \code + BamAlignment::GetTag("NM", editDistance); + \endcode + + \param editDistance destination for retrieved value + + \return \c true if found +*/ +bool BamAlignment::GetEditDistance(uint32_t& editDistance) const { return GetTag("NM", (uint32_t&)editDistance); } -// get "RG" tag data -// stores data in 'readGroup', returns success/fail -bool BamAlignment::GetReadGroup(string& readGroup) const { +/*! \fn int BamAlignment::GetEndPosition(bool usePadded = false, bool zeroBased = true) const + \brief Calculates alignment end position, based on starting position and CIGAR data. + + \param usePadded Inserted bases affect reported position. Default is false, so that reported + position stays 'sync-ed' with reference coordinates. + \param zeroBased Return (BAM standard) 0-based coordinate. Setting this to false can be useful + when using BAM data with half-open formats (e.g. BED). 
+ + \return alignment end position +*/ +int BamAlignment::GetEndPosition(bool usePadded, bool zeroBased) const { + + // initialize alignment end to starting position + int alignEnd = Position; + + // iterate over cigar operations + vector<CigarOp>::const_iterator cigarIter = CigarData.begin(); + vector<CigarOp>::const_iterator cigarEnd = CigarData.end(); + for ( ; cigarIter != cigarEnd; ++cigarIter) { + const char cigarType = (*cigarIter).Type; + const uint32_t& cigarLength = (*cigarIter).Length; + + if ( cigarType == Constants::BAM_CIGAR_MATCH_CHAR || + cigarType == Constants::BAM_CIGAR_DEL_CHAR || + cigarType == Constants::BAM_CIGAR_REFSKIP_CHAR ) + alignEnd += cigarLength; + else if ( usePadded && cigarType == Constants::BAM_CIGAR_INS_CHAR ) + alignEnd += cigarLength; + } + + // adjust for zero-based coordinates, if requested + if ( zeroBased ) alignEnd -= 1; + + // return result + return alignEnd; +} + +/*! \fn bool BamAlignment::GetReadGroup(std::string& readGroup) const + \brief Retrieves value of read group tag ("RG"). + + \deprecated Instead use BamAlignment::GetTag() + \code + BamAlignment::GetTag("RG", readGroup); + \endcode + + \param readGroup destination for retrieved value + + \return \c true if found +*/ +bool BamAlignment::GetReadGroup(std::string& readGroup) const { return GetTag("RG", readGroup); } -bool BamAlignment::GetTag(const string& tag, string& destination) const { +/*! \fn bool BamAlignment::GetTag(const std::string& tag, std::string& destination) const + \brief Retrieves the string value associated with a BAM tag. 
+ + \param tag 2-character tag name + \param destination destination for retrieved value + + \return \c true if found +*/ +bool BamAlignment::GetTag(const std::string& tag, std::string& destination) const { // make sure tag data exists if ( SupportData.HasCoreOnly || TagData.empty() ) @@ -568,7 +930,7 @@ bool BamAlignment::GetTag(const string& tag, string& destination) const { unsigned int numBytesParsed = 0; // if tag found, store data in readGroup, return success - if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { + if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { const unsigned int dataLength = strlen(pTagData); destination.clear(); destination.resize(dataLength); @@ -580,7 +942,15 @@ bool BamAlignment::GetTag(const string& tag, string& destination) const { return false; } -bool BamAlignment::GetTag(const string& tag, uint32_t& destination) const { +/*! \fn bool BamAlignment::GetTag(const std::string& tag, uint32_t& destination) const + \brief Retrieves the unsigned integer value associated with a BAM tag. 
+ + \param tag 2-character tag name + \param destination destination for retrieved value + + \return \c true if found +*/ +bool BamAlignment::GetTag(const std::string& tag, uint32_t& destination) const { // make sure tag data exists if ( SupportData.HasCoreOnly || TagData.empty() ) @@ -592,7 +962,7 @@ bool BamAlignment::GetTag(const string& tag, uint32_t& destination) const { unsigned int numBytesParsed = 0; // if tag found, determine data byte-length, store data in readGroup, return success - if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { + if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { // determine data byte-length const char type = *(pTagData - 1); @@ -600,34 +970,34 @@ bool BamAlignment::GetTag(const string& tag, uint32_t& destination) const { switch (type) { // 1 byte data - case 'A': - case 'c': - case 'C': + case (Constants::BAM_TAG_TYPE_ASCII) : + case (Constants::BAM_TAG_TYPE_INT8) : + case (Constants::BAM_TAG_TYPE_UINT8) : destinationLength = 1; break; // 2 byte data - case 's': - case 'S': + case (Constants::BAM_TAG_TYPE_INT16) : + case (Constants::BAM_TAG_TYPE_UINT16) : destinationLength = 2; break; // 4 byte data - case 'i': - case 'I': + case (Constants::BAM_TAG_TYPE_INT32) : + case (Constants::BAM_TAG_TYPE_UINT32) : destinationLength = 4; break; // unsupported type for integer destination (float or var-length strings) - case 'f': - case 'Z': - case 'H': - fprintf(stderr, "ERROR: Cannot store tag of type %c in integer destination\n", type); + case (Constants::BAM_TAG_TYPE_FLOAT) : + case (Constants::BAM_TAG_TYPE_STRING) : + case (Constants::BAM_TAG_TYPE_HEX) : + fprintf(stderr, "BamAlignment ERROR: cannot store tag of type %c in integer destination\n", type); return false; // unknown tag type default: - fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", type); + fprintf(stderr, "BamAlignment ERROR: unknown tag type encountered: [%c]\n", type); return false; } @@ -641,11 +1011,27 @@ bool 
BamAlignment::GetTag(const string& tag, uint32_t& destination) const { return false; } -bool BamAlignment::GetTag(const string& tag, int32_t& destination) const { +/*! \fn bool BamAlignment::GetTag(const std::string& tag, int32_t& destination) const + \brief Retrieves the signed integer value associated with a BAM tag. + + \param tag 2-character tag name + \param destination destination for retrieved value + + \return \c true if found +*/ +bool BamAlignment::GetTag(const std::string& tag, int32_t& destination) const { return GetTag(tag, (uint32_t&)destination); } -bool BamAlignment::GetTag(const string& tag, float& destination) const { +/*! \fn bool BamAlignment::GetTag(const std::string& tag, float& destination) const + \brief Retrieves the floating-point value associated with a BAM tag. + + \param tag 2-character tag name + \param destination destination for retrieved value + + \return \c true if found +*/ +bool BamAlignment::GetTag(const std::string& tag, float& destination) const { // make sure tag data exists if ( SupportData.HasCoreOnly || TagData.empty() ) @@ -657,42 +1043,42 @@ bool BamAlignment::GetTag(const string& tag, float& destination) const { unsigned int numBytesParsed = 0; // if tag found, determine data byte-length, store data in readGroup, return success - if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { + if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { // determine data byte-length const char type = *(pTagData - 1); int destinationLength = 0; - switch(type) { + switch (type) { // 1 byte data - case 'A': - case 'c': - case 'C': + case (Constants::BAM_TAG_TYPE_ASCII) : + case (Constants::BAM_TAG_TYPE_INT8) : + case (Constants::BAM_TAG_TYPE_UINT8) : destinationLength = 1; break; // 2 byte data - case 's': - case 'S': + case (Constants::BAM_TAG_TYPE_INT16) : + case (Constants::BAM_TAG_TYPE_UINT16) : destinationLength = 2; break; // 4 byte data - case 'f': - case 'i': - case 'I': + case 
(Constants::BAM_TAG_TYPE_FLOAT) : + case (Constants::BAM_TAG_TYPE_INT32) : + case (Constants::BAM_TAG_TYPE_UINT32) : destinationLength = 4; break; // unsupported type (var-length strings) - case 'Z': - case 'H': - fprintf(stderr, "ERROR: Cannot store tag of type %c in integer destination\n", type); + case (Constants::BAM_TAG_TYPE_STRING) : + case (Constants::BAM_TAG_TYPE_HEX) : + fprintf(stderr, "BamAlignment ERROR: cannot store tag of type %c in float destination\n", type); return false; // unknown tag type default: - fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", type); + fprintf(stderr, "BamAlignment ERROR: unknown tag type encountered: [%c]\n", type); return false; } @@ -706,7 +1092,18 @@ bool BamAlignment::GetTag(const string& tag, float& destination) const { return false; } -bool BamAlignment::GetTagType(const string& tag, char& type) const { +/*! \fn bool BamAlignment::GetTagType(const std::string& tag, char& type) const + \brief Retrieves the BAM tag type-code associated with requested tag name. + + \param tag 2-character tag name + \param type destination for the retrieved (1-character) tag type + + \return \c true if found + + \sa http://samtools.sourceforge.net/SAM-1.3.pdf + for more details on reserved tag names, supported tag types, etc. 
+*/ +bool BamAlignment::GetTagType(const std::string& tag, char& type) const { // make sure tag data exists if ( SupportData.HasCoreOnly || TagData.empty() ) @@ -718,28 +1115,28 @@ bool BamAlignment::GetTagType(const string& tag, char& type) const { unsigned int numBytesParsed = 0; // lookup tag - if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { + if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { // retrieve tag type code type = *(pTagData - 1); // validate that type is a proper BAM tag type - switch(type) { - case 'A': - case 'c': - case 'C': - case 's': - case 'S': - case 'f': - case 'i': - case 'I': - case 'Z': - case 'H': + switch (type) { + case (Constants::BAM_TAG_TYPE_ASCII) : + case (Constants::BAM_TAG_TYPE_INT8) : + case (Constants::BAM_TAG_TYPE_UINT8) : + case (Constants::BAM_TAG_TYPE_INT16) : + case (Constants::BAM_TAG_TYPE_UINT16) : + case (Constants::BAM_TAG_TYPE_INT32) : + case (Constants::BAM_TAG_TYPE_UINT32) : + case (Constants::BAM_TAG_TYPE_FLOAT) : + case (Constants::BAM_TAG_TYPE_STRING) : + case (Constants::BAM_TAG_TYPE_HEX) : return true; // unknown tag type default: - fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", type); + fprintf(stderr, "BamAlignment ERROR: unknown tag type encountered: [%c]\n", type); return false; } } @@ -748,11 +1145,94 @@ bool BamAlignment::GetTagType(const string& tag, char& type) const { return false; } -bool BamAlignment::RemoveTag(const string& tag) { +/*! \fn bool BamAlignment::IsDuplicate(void) const + \return \c true if this read is a PCR duplicate +*/ +bool BamAlignment::IsDuplicate(void) const { + return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_DUPLICATE) != 0 ); +} + +/*! \fn bool BamAlignment::IsFailedQC(void) const + \return \c true if this read failed quality control +*/ +bool BamAlignment::IsFailedQC(void) const { + return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_QC_FAILED) != 0 ); +} + +/*! 
\fn bool BamAlignment::IsFirstMate(void) const + \return \c true if alignment is first mate on paired-end read +*/ +bool BamAlignment::IsFirstMate(void) const { + return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_READ_1) != 0 ); +} + +/*! \fn bool BamAlignment::IsMapped(void) const + \return \c true if alignment is mapped +*/ +bool BamAlignment::IsMapped(void) const { + return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_UNMAPPED) == 0 ); +} + +/*! \fn bool BamAlignment::IsMateMapped(void) const + \return \c true if alignment's mate is mapped +*/ +bool BamAlignment::IsMateMapped(void) const { + return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_MATE_UNMAPPED) == 0 ); +} + +/*! \fn bool BamAlignment::IsMateReverseStrand(void) const + \return \c true if alignment's mate mapped to reverse strand +*/ +bool BamAlignment::IsMateReverseStrand(void) const { + return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_MATE_REVERSE_STRAND) != 0 ); +} + +/*! \fn bool BamAlignment::IsPaired(void) const + \return \c true if alignment part of paired-end read +*/ +bool BamAlignment::IsPaired(void) const { + return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_PAIRED) != 0 ); +} + +/*! \fn bool BamAlignment::IsPrimaryAlignment(void) const + \return \c true if reported position is primary alignment +*/ +bool BamAlignment::IsPrimaryAlignment(void) const { + return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_SECONDARY) == 0 ); +} + +/*! \fn bool BamAlignment::IsProperPair(void) const + \return \c true if alignment is part of read that satisfied paired-end resolution +*/ +bool BamAlignment::IsProperPair(void) const { + return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_PROPER_PAIR) != 0 ); +} + +/*! \fn bool BamAlignment::IsReverseStrand(void) const + \return \c true if alignment mapped to reverse strand +*/ +bool BamAlignment::IsReverseStrand(void) const { + return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_REVERSE_STRAND) != 0 ); +} + +/*! 
\fn bool BamAlignment::IsSecondMate(void) const + \return \c true if alignment is second mate on read +*/ +bool BamAlignment::IsSecondMate(void) const { + return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_READ_2) != 0 ); +} + +/*! \fn bool BamAlignment::RemoveTag(const std::string& tag) + \brief Removes field from BAM tags. + + \return \c true if tag was removed successfully; \c false if the tag was not found or the alignment has no tag data +*/ +bool BamAlignment::RemoveTag(const std::string& tag) { // BamAlignments fetched using BamReader::GetNextAlignmentCore() are not allowed // also, return false if no data present to remove - if ( SupportData.HasCoreOnly || TagData.empty() ) return false; + if ( SupportData.HasCoreOnly || TagData.empty() ) + return false; // localize the tag data char* pOriginalTagData = (char*)TagData.data(); @@ -762,12 +1242,12 @@ bool BamAlignment::RemoveTag(const string& tag) { unsigned int numBytesParsed = 0; // if tag found, store data in readGroup, return success - if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { + if ( Internal::FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { char newTagData[originalTagDataLength]; // copy original tag data up til desired tag - pTagData -= 3; + pTagData -= 3; numBytesParsed -= 3; const unsigned int beginningTagDataLength = numBytesParsed; newTagDataLength += beginningTagDataLength; @@ -777,7 +1257,8 @@ bool BamAlignment::RemoveTag(const string& tag) { const char* pTagStorageType = pTagData + 2; pTagData += 3; numBytesParsed += 3; - if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true; + if ( !Internal::SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) + return true; // copy everything from current tag (the next one after tag for removal) to end const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); @@ -793,74 +1274,117 @@ bool BamAlignment::RemoveTag(const string& tag) { return false; } -bool BamAlignment::FindTag(const string& tag, - char* 
&pTagData, - const unsigned int& tagDataLength, - unsigned int& numBytesParsed) -{ +/*! \fn void BamAlignment::SetIsDuplicate(bool ok) + \brief Sets value of "PCR duplicate" flag to \a ok. +*/ +void BamAlignment::SetIsDuplicate(bool ok) { + if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_DUPLICATE; + else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_DUPLICATE; +} - while ( numBytesParsed < tagDataLength ) { +/*! \fn void BamAlignment::SetIsFailedQC(bool ok) + \brief Sets "failed quality control" flag to \a ok. +*/ +void BamAlignment::SetIsFailedQC(bool ok) { + if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_QC_FAILED; + else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_QC_FAILED; +} - const char* pTagType = pTagData; - const char* pTagStorageType = pTagData + 2; - pTagData += 3; - numBytesParsed += 3; +/*! \fn void BamAlignment::SetIsFirstMate(bool ok) + \brief Sets "alignment is first mate" flag to \a ok. +*/ +void BamAlignment::SetIsFirstMate(bool ok) { + if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_READ_1; + else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_READ_1; +} - // check the current tag, return true on match - if ( strncmp(pTagType, tag.c_str(), 2) == 0 ) - return true; +/*! \fn void BamAlignment::SetIsMapped(bool ok) + \brief Sets "alignment is mapped" flag to \a ok. +*/ +void BamAlignment::SetIsMapped(bool ok) { + if (ok) AlignmentFlag &= ~Constants::BAM_ALIGNMENT_UNMAPPED; + else AlignmentFlag |= Constants::BAM_ALIGNMENT_UNMAPPED; +} - // get the storage class and find the next tag - if ( *pTagStorageType == '\0' ) return false; - if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return false; - if ( *pTagData == '\0' ) return false; - } - - // checked all tags, none match - return false; +/*! \fn void BamAlignment::SetIsMateMapped(bool ok) + \brief Sets "alignment's mate is mapped" flag to \a ok. 
+*/ +void BamAlignment::SetIsMateMapped(bool ok) { + if (ok) AlignmentFlag &= ~Constants::BAM_ALIGNMENT_MATE_UNMAPPED; + else AlignmentFlag |= Constants::BAM_ALIGNMENT_MATE_UNMAPPED; } -bool BamAlignment::SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) { - - switch(storageType) { +/*! \fn void BamAlignment::SetIsMateUnmapped(bool ok) + \brief Complement of using SetIsMateMapped(). + \deprecated For sake of symmetry with the query methods + \sa IsMateMapped(), SetIsMateMapped() +*/ +void BamAlignment::SetIsMateUnmapped(bool ok) { + SetIsMateMapped(!ok); +} - case 'A': - case 'c': - case 'C': - ++numBytesParsed; - ++pTagData; - break; +/*! \fn void BamAlignment::SetIsMateReverseStrand(bool ok) + \brief Sets "alignment's mate mapped to reverse strand" flag to \a ok. +*/ +void BamAlignment::SetIsMateReverseStrand(bool ok) { + if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_MATE_REVERSE_STRAND; + else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_MATE_REVERSE_STRAND; +} - case 's': - case 'S': - numBytesParsed += 2; - pTagData += 2; - break; +/*! \fn void BamAlignment::SetIsPaired(bool ok) + \brief Sets "alignment part of paired-end read" flag to \a ok. +*/ +void BamAlignment::SetIsPaired(bool ok) { + if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_PAIRED; + else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_PAIRED; +} - case 'f': - case 'i': - case 'I': - numBytesParsed += 4; - pTagData += 4; - break; +/*! \fn void BamAlignment::SetIsPrimaryAlignment(bool ok) + \brief Sets "position is primary alignment" flag to \a ok. +*/ +void BamAlignment::SetIsPrimaryAlignment(bool ok) { + if (ok) AlignmentFlag &= ~Constants::BAM_ALIGNMENT_SECONDARY; + else AlignmentFlag |= Constants::BAM_ALIGNMENT_SECONDARY; +} - case 'Z': - case 'H': - while(*pTagData) { - ++numBytesParsed; - ++pTagData; - } - // increment for null-terminator - ++numBytesParsed; - ++pTagData; - break; +/*! 
\fn void BamAlignment::SetIsProperPair(bool ok) + \brief Sets "alignment is part of read that satisfied paired-end resolution" flag to \a ok. +*/ +void BamAlignment::SetIsProperPair(bool ok) { + if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_PROPER_PAIR; + else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_PROPER_PAIR; +} - default: - // error case - fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", storageType); - return false; - } - - // return success - return true; +/*! \fn void BamAlignment::SetIsReverseStrand(bool ok) + \brief Sets "alignment mapped to reverse strand" flag to \a ok. +*/ +void BamAlignment::SetIsReverseStrand(bool ok) { + if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_REVERSE_STRAND; + else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_REVERSE_STRAND; +} + +/*! \fn void BamAlignment::SetIsSecondaryAlignment(bool ok) + \brief Complement of using SetIsPrimaryAlignment(). + \deprecated For sake of symmetry with the query methods + \sa IsPrimaryAlignment(), SetIsPrimaryAlignment() +*/ +void BamAlignment::SetIsSecondaryAlignment(bool ok) { + SetIsPrimaryAlignment(!ok); +} + +/*! \fn void BamAlignment::SetIsSecondMate(bool ok) + \brief Sets "alignment is second mate on read" flag to \a ok. +*/ +void BamAlignment::SetIsSecondMate(bool ok) { + if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_READ_2; + else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_READ_2; +} + +/*! \fn void BamAlignment::SetIsUnmapped(bool ok) + \brief Complement of using SetIsMapped(). + \deprecated For sake of symmetry with the query methods + \sa IsMapped(), SetIsMapped() +*/ +void BamAlignment::SetIsUnmapped(bool ok) { + SetIsMapped(!ok); } diff --git a/src/api/BamAlignment.h b/src/api/BamAlignment.h index 6eb7618..fb54b1a 100644 --- a/src/api/BamAlignment.h +++ b/src/api/BamAlignment.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. 
// --------------------------------------------------------------------------- -// Last modified: 22 December 2010 (DB) +// Last modified: 21 March 2011 (DB) // --------------------------------------------------------------------------- // Provides the BamAlignment data structure // *************************************************************************** @@ -18,7 +18,7 @@ namespace BamTools { -// forward declare BamAlignment's friend classes +// forward declaration of BamAlignment's friend classes namespace Internal { class BamReaderPrivate; class BamWriterPrivate; @@ -33,132 +33,109 @@ struct API_EXPORT BamAlignment { BamAlignment(const BamAlignment& other); ~BamAlignment(void); - // Queries against alignment flags + // queries against alignment flags public: - bool IsDuplicate(void) const; // Returns true if this read is a PCR duplicate - bool IsFailedQC(void) const; // Returns true if this read failed quality control - bool IsFirstMate(void) const; // Returns true if alignment is first mate on read - bool IsMapped(void) const; // Returns true if alignment is mapped - bool IsMateMapped(void) const; // Returns true if alignment's mate is mapped - bool IsMateReverseStrand(void) const; // Returns true if alignment's mate mapped to reverse strand - bool IsPaired(void) const; // Returns true if alignment part of paired-end read - bool IsPrimaryAlignment(void) const; // Returns true if reported position is primary alignment - bool IsProperPair(void) const; // Returns true if alignment is part of read that satisfied paired-end resolution - bool IsReverseStrand(void) const; // Returns true if alignment mapped to reverse strand - bool IsSecondMate(void) const; // Returns true if alignment is second mate on read - - // Manipulate alignment flags + bool IsDuplicate(void) const; // returns true if this read is a PCR duplicate + bool IsFailedQC(void) const; // returns true if this read failed quality control + bool IsFirstMate(void) const; // returns true if alignment is 
first mate on read + bool IsMapped(void) const; // returns true if alignment is mapped + bool IsMateMapped(void) const; // returns true if alignment's mate is mapped + bool IsMateReverseStrand(void) const; // returns true if alignment's mate mapped to reverse strand + bool IsPaired(void) const; // returns true if alignment part of paired-end read + bool IsPrimaryAlignment(void) const; // returns true if reported position is primary alignment + bool IsProperPair(void) const; // returns true if alignment is part of read that satisfied paired-end resolution + bool IsReverseStrand(void) const; // returns true if alignment mapped to reverse strand + bool IsSecondMate(void) const; // returns true if alignment is second mate on read + + // manipulate alignment flags public: - void SetIsDuplicate(bool ok); // Sets "PCR duplicate" flag - void SetIsFailedQC(bool ok); // Sets "failed quality control" flag - void SetIsFirstMate(bool ok); // Sets "alignment is first mate" flag - void SetIsMapped(bool ok); // Sets "alignment is mapped" flag - void SetIsMateMapped(bool ok); // Sets "alignment's mate is mapped" flag - void SetIsMateReverseStrand(bool ok); // Sets "alignment's mate mapped to reverse strand" flag - void SetIsPaired(bool ok); // Sets "alignment part of paired-end read" flag - void SetIsPrimaryAlignment(bool ok); // Sets "position is primary alignment" flag - void SetIsProperPair(bool ok); // Sets "alignment is part of read that satisfied paired-end resolution" flag - void SetIsReverseStrand(bool ok); // Sets "alignment mapped to reverse strand" flag - void SetIsSecondMate(bool ok); // Sets "alignment is second mate on read" flag - - // legacy methods (deprecated, but available) - void SetIsMateUnmapped(bool ok); // Complement of IsMateMapped() flag - void SetIsSecondaryAlignment(bool ok); // Complement of IsPrimaryAlignment() flag - void SetIsUnmapped(bool ok); // Complement of IsMapped() flag - - // Tag data access methods + void SetIsDuplicate(bool ok); // sets 
value of "PCR duplicate" flag + void SetIsFailedQC(bool ok); // sets value of "failed quality control" flag + void SetIsFirstMate(bool ok); // sets value of "alignment is first mate" flag + void SetIsMapped(bool ok); // sets value of "alignment is mapped" flag + void SetIsMateMapped(bool ok); // sets value of "alignment's mate is mapped" flag + void SetIsMateReverseStrand(bool ok); // sets value of "alignment's mate mapped to reverse strand" flag + void SetIsPaired(bool ok); // sets value of "alignment part of paired-end read" flag + void SetIsPrimaryAlignment(bool ok); // sets value of "position is primary alignment" flag + void SetIsProperPair(bool ok); // sets value of "alignment is part of read that satisfied paired-end resolution" flag + void SetIsReverseStrand(bool ok); // sets value of "alignment mapped to reverse strand" flag + void SetIsSecondMate(bool ok); // sets value of "alignment is second mate on read" flag + + // legacy methods (consider deprecated, but still available) + void SetIsMateUnmapped(bool ok); // complement of using SetIsMateMapped() + void SetIsSecondaryAlignment(bool ok); // complement of using SetIsPrimaryAlignment() + void SetIsUnmapped(bool ok); // complement of using SetIsMapped() + + // tag data access methods public: + // ------------------------------------------------------------------------------------- // N.B. - The following tag access methods may not be used on BamAlignments fetched // using BamReader::GetNextAlignmentCore(). Attempting to use them will not result in // error message (to keep output clean) but will ALWAYS return false. Only user-created // BamAlignments or those retrieved using BamReader::GetNextAlignment() are valid here. + // + // You can call BuildCharData() on such an alignment retrieved by GetNextAlignmentCore(). + // This populates all the character data, and will enable subsequent queries on tag data. 
+ // ------------------------------------------------------------------------------------- - // add tag data (create new TAG entry with TYPE and VALUE) - // TYPE is one of {A, i, f, Z, H} depending on VALUE - see SAM/BAM spec for details - // returns true if new data added, false if error or TAG already exists - // N.B. - will NOT modify existing tag. Use EditTag() instead - // @tag - two character tag name - // @type - single character tag type (see SAM/BAM spec for details) - // @value - value to associate with tag - bool AddTag(const std::string& tag, const std::string& type, const std::string& value); // type must be Z or H - bool AddTag(const std::string& tag, const std::string& type, const uint32_t& value); // type must be A or i - bool AddTag(const std::string& tag, const std::string& type, const int32_t& value); // type must be A or i - bool AddTag(const std::string& tag, const std::string& type, const float& value); // type must be A, i, or f - - // edit tag data (sets existing TAG with TYPE to VALUE or adds new TAG if not already present) - // TYPE is one of {A, i, f, Z, H} depending on VALUE - see SAM/BAM spec for details - // returns true if edit was successfaul, false if error - // @tag - two character tag name - // @type - single character tag type (see SAM/BAM spec for details) - // @value - new value for tag - bool EditTag(const std::string& tag, const std::string& type, const std::string& value); // type must be Z or H - bool EditTag(const std::string& tag, const std::string& type, const uint32_t& value); // type must be A or i - bool EditTag(const std::string& tag, const std::string& type, const int32_t& value); // type must be A or i - bool EditTag(const std::string& tag, const std::string& type, const float& value); // type must be A, i, or f - - // specific tag data access methods - these only remain for legacy support - // returns whether specific tag could be retrieved - bool GetEditDistance(uint32_t& editDistance) const; // get "NM" tag data 
(equivalent to GetTag("NM", editDistance)) - bool GetReadGroup(std::string& readGroup) const; // get "RG" tag data (equivalent to GetTag("RG", readGroup)) + // adds a tag + bool AddTag(const std::string& tag, const std::string& type, const std::string& value); + bool AddTag(const std::string& tag, const std::string& type, const uint32_t& value); + bool AddTag(const std::string& tag, const std::string& type, const int32_t& value); + bool AddTag(const std::string& tag, const std::string& type, const float& value); - // generic tag data access methods - // returns whether tag is found & tag type is compatible with DESTINATION - // @tag - two character tag name - // @destination - if found, tag value is stored here - bool GetTag(const std::string& tag, std::string& destination) const; // access variable-length char or hex strings - bool GetTag(const std::string& tag, uint32_t& destination) const; // access unsigned integer data - bool GetTag(const std::string& tag, int32_t& destination) const; // access signed integer data - bool GetTag(const std::string& tag, float& destination) const; // access floating point data - - // retrieve the tag type code for TAG - // returns true if tag could be found and type determined + // edits a tag + bool EditTag(const std::string& tag, const std::string& type, const std::string& value); + bool EditTag(const std::string& tag, const std::string& type, const uint32_t& value); + bool EditTag(const std::string& tag, const std::string& type, const int32_t& value); + bool EditTag(const std::string& tag, const std::string& type, const float& value); + + // retrieves data for a tag + bool GetTag(const std::string& tag, std::string& destination) const; + bool GetTag(const std::string& tag, uint32_t& destination) const; + bool GetTag(const std::string& tag, int32_t& destination) const; + bool GetTag(const std::string& tag, float& destination) const; + + // retrieves the BAM tag-type character for a tag bool GetTagType(const std::string& tag, 
char& type) const; + + // legacy methods (consider deprecated, but still available) + bool GetEditDistance(uint32_t& editDistance) const; // retrieves value of "NM" tag + bool GetReadGroup(std::string& readGroup) const; // retrieves value of "RG" tag - // remove tag data - // returns true if removal was successful, false if error - // N.B. - returns false if TAG does not exist (no removal can occur) - // @tag - two character tag name + // removes a tag bool RemoveTag(const std::string& tag); - // Populate an alignment retrieved by BamAlignment::GetNextAlignmentCore() with full character data - // (read name, bases, qualities, tag data) + // additional methods public: + // populates alignment string fields bool BuildCharData(void); - - // Additional data access methods - public: - // calculates & returns alignment end position, based on starting position and CIGAR operations - // @usePadded - if true, counts inserted bases. Default is false, so that alignment end position matches the last base's position in reference - // @zeroBased - if true, returns 0-based coordinate; else returns 1-based. Setting this to false is useful when using BAM data along with other, half-open formats. 
+ // calculates alignment end position int GetEndPosition(bool usePadded = false, bool zeroBased = true) const; - // 'internal' utility methods - private: - static bool FindTag(const std::string& tag, char* &pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed); - static bool SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed); - - // Data members + // public data fields public: - std::string Name; // Read name - int32_t Length; // Query length - std::string QueryBases; // 'Original' sequence (as reported from sequencing machine) - std::string AlignedBases; // 'Aligned' sequence (includes any indels, padding, clipping) + std::string Name; // read name + int32_t Length; // length of query sequence + std::string QueryBases; // 'original' sequence (as reported from sequencing machine) + std::string AlignedBases; // 'aligned' sequence (includes any indels, padding, clipping) std::string Qualities; // FASTQ qualities (ASCII characters, not numeric values) - std::string TagData; // Tag data (accessor methods will pull the requested information out) + std::string TagData; // tag data (use provided methods to query/modify) int32_t RefID; // ID number for reference sequence - int32_t Position; // Position (0-based) where alignment starts - uint16_t Bin; // Bin in BAM file where this alignment resides - uint16_t MapQuality; // Mapping quality score - uint32_t AlignmentFlag; // Alignment bit-flag - see Is() methods to query this value, SetIs() methods to manipulate - std::vector CigarData; // CIGAR operations for this alignment + int32_t Position; // position (0-based) where alignment starts + uint16_t Bin; // BAM (standard) index bin number for this alignment + uint16_t MapQuality; // mapping quality score + uint32_t AlignmentFlag; // alignment bit-flag (use provided methods to query/modify) + std::vector CigarData; // CIGAR operations for this alignment int32_t MateRefID; // ID number for reference sequence where 
alignment's mate was aligned - int32_t MatePosition; // Position (0-based) where alignment's mate starts - int32_t InsertSize; // Mate-pair insert size - - // Internal data, inaccessible to client code - // but available BamReaderPrivate & BamWriterPrivate + int32_t MatePosition; // position (0-based) where alignment's mate starts + int32_t InsertSize; // mate-pair insert size + std::string Filename; // name of BAM file which this alignment comes from + + // internal data private: + //! \cond struct BamAlignmentSupportData { // data members @@ -178,28 +155,12 @@ struct API_EXPORT BamAlignment { , HasCoreOnly(false) { } }; - BamAlignmentSupportData SupportData; - friend class Internal::BamReaderPrivate; - friend class Internal::BamWriterPrivate; - - // Alignment flag query constants - // Use the get/set methods above instead - private: - enum { PAIRED = 1 - , PROPER_PAIR = 2 - , UNMAPPED = 4 - , MATE_UNMAPPED = 8 - , REVERSE = 16 - , MATE_REVERSE = 32 - , READ_1 = 64 - , READ_2 = 128 - , SECONDARY = 256 - , QC_FAILED = 512 - , DUPLICATE = 1024 - }; + BamAlignmentSupportData SupportData; + friend class Internal::BamReaderPrivate; + friend class Internal::BamWriterPrivate; + //! \endcond }; -// convenience typedef(s) typedef std::vector BamAlignmentVector; } // namespace BamTools diff --git a/src/api/BamAux.h b/src/api/BamAux.h index bc99cb7..d171e70 100644 --- a/src/api/BamAux.h +++ b/src/api/BamAux.h @@ -3,115 +3,91 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 19 November 2010 (DB) +// Last modified: 4 March 2011 (DB) // --------------------------------------------------------------------------- -// Provides the basic constants, data structures, utilities etc. -// used throughout the API for handling BAM files +// Provides data structures & utility methods that are used throughout the API. 
// *************************************************************************** #ifndef BAMAUX_H #define BAMAUX_H #include - #include #include #include #include -// Platform-specific large-file support -#ifndef BAMTOOLS_LFS -#define BAMTOOLS_LFS - #ifdef WIN32 - #define ftell64(a) _ftelli64(a) - #define fseek64(a,b,c) _fseeki64(a,b,c) - #else - #define ftell64(a) ftello(a) - #define fseek64(a,b,c) fseeko(a,b,c) - #endif -#endif // BAMTOOLS_LFS - -// Platform-specific type definitions -#ifndef BAMTOOLS_TYPES -#define BAMTOOLS_TYPES - #ifdef _MSC_VER - typedef char int8_t; - typedef unsigned char uint8_t; - typedef short int16_t; - typedef unsigned short uint16_t; - typedef int int32_t; - typedef unsigned int uint32_t; - typedef long long int64_t; - typedef unsigned long long uint64_t; - #else - #include - #endif -#endif // BAMTOOLS_TYPES +/*! \file BamAux.h + + Provides data structures & utility methods that are used throughout the API. +*/ +/*! \namespace BamTools + \brief Contains all BamTools classes & methods. + The BamTools API contained in this namespace contains classes and methods + for reading, writing, and manipulating BAM alignment files. +*/ namespace BamTools { // ---------------------------------------------------------------- -// ---------------------------------------------------------------- -// BAM constants - -const int BAM_CMATCH = 0; -const int BAM_CINS = 1; -const int BAM_CDEL = 2; -const int BAM_CREF_SKIP = 3; -const int BAM_CSOFT_CLIP = 4; -const int BAM_CHARD_CLIP = 5; -const int BAM_CPAD = 6; -const int BAM_CIGAR_SHIFT = 4; -const int BAM_CIGAR_MASK = ((1 << BAM_CIGAR_SHIFT) - 1); -const int BAM_CORE_SIZE = 32; -const int BT_SIZEOF_INT = 4; +// CigarOp -// ---------------------------------------------------------------- -// ---------------------------------------------------------------- -// Data structs & typedefs +/*! \struct BamTools::CigarOp + \brief Represents a CIGAR alignment operation. 
-// CIGAR operation data structure + \sa http://samtools.sourceforge.net/SAM-1.3.pdf for more details on using CIGAR operations. +*/ struct API_EXPORT CigarOp { - // data members - char Type; // Operation type (MIDNSHP) - uint32_t Length; // Operation length (number of bases) + char Type; //!< CIGAR operation type (MIDNSHP) + uint32_t Length; //!< CIGAR operation length (number of bases) - // constructor + //! constructor CigarOp(const char type = '\0', - const uint32_t length = 0) + const uint32_t& length = 0) : Type(type) , Length(length) { } }; -// Reference data entry +// ---------------------------------------------------------------- +// RefData + +/*! \struct BamTools::RefData + \brief Represents a reference sequence entry +*/ struct API_EXPORT RefData { - // data members - std::string RefName; // Name of reference sequence - int32_t RefLength; // Length of reference sequence - bool RefHasAlignments; // True if BAM file contains alignments mapped to reference sequence + std::string RefName; //!< name of reference sequence + int32_t RefLength; //!< length of reference sequence - // constructor - RefData(const int32_t& length = 0, - bool ok = false) - : RefLength(length) - , RefHasAlignments(ok) + //! constructor + RefData(const std::string& name = "", + const int32_t& length = 0) + : RefName(name) + , RefLength(length) { } }; + +//! convenience typedef for vector of RefData entries typedef std::vector RefVector; -// General (sequential) genome region +// ---------------------------------------------------------------- +// BamRegion + +/*! \struct BamTools::BamRegion + \brief Represents a sequential genomic region + + Allowed to span multiple (sequential) references. 
+*/ struct API_EXPORT BamRegion { - // data members - int LeftRefID; - int LeftPosition; - int RightRefID; - int RightPosition; + int LeftRefID; //!< reference ID for region's left boundary + int LeftPosition; //!< position for region's left boundary + int RightRefID; //!< reference ID for region's right boundary + int RightPosition; //!< position for region's right boundary - // constructor + //! constructor BamRegion(const int& leftID = -1, const int& leftPos = -1, const int& rightID = -1, @@ -122,42 +98,75 @@ struct API_EXPORT BamRegion { , RightPosition(rightPos) { } - // copy constructor + //! copy constructor BamRegion(const BamRegion& other) - : LeftRefID(other.LeftRefID) - , LeftPosition(other.LeftPosition) - , RightRefID(other.RightRefID) - , RightPosition(other.RightPosition) + : LeftRefID(other.LeftRefID) + , LeftPosition(other.LeftPosition) + , RightRefID(other.RightRefID) + , RightPosition(other.RightPosition) { } - // member functions - void clear(void) { LeftRefID = -1; LeftPosition = -1; RightRefID = -1; RightPosition = -1; } - bool isLeftBoundSpecified(void) const { return ( LeftRefID >= 0 && LeftPosition >= 0 ); } - bool isNull(void) const { return ( !isLeftBoundSpecified() && !isRightBoundSpecified() ); } - bool isRightBoundSpecified(void) const { return ( RightRefID >= 0 && RightPosition >= 0 ); } + //! Clears region boundaries + void clear(void) { + LeftRefID = -1; LeftPosition = -1; + RightRefID = -1; RightPosition = -1; + } + + //! Returns true if region has a left boundary + bool isLeftBoundSpecified(void) const { + return ( LeftRefID >= 0 && LeftPosition >= 0 ); + } + + //! Returns true if region boundaries are not defined + bool isNull(void) const { + return ( !isLeftBoundSpecified() && !isRightBoundSpecified() ); + } + + //! 
Returns true if region has a right boundary + bool isRightBoundSpecified(void) const { + return ( RightRefID >= 0 && RightPosition >= 0 ); + } }; // ---------------------------------------------------------------- -// ---------------------------------------------------------------- -// General utilities +// General utility methods -// returns true if system is big endian -inline bool SystemIsBigEndian(void) { - const uint16_t one = 0x0001; - return ((*(char*) &one) == 0 ); +/*! \fn bool FileExists(const std::string& filename) + \brief checks if file exists + + Attempts to open file in a read-only mode. + + \return \c true if file can be opened successfully +*/ +API_EXPORT inline bool FileExists(const std::string& filename) { + std::ifstream f(filename.c_str(), std::ifstream::in); + return !f.fail(); } -// swaps endianness of 16-bit value 'in place' -inline void SwapEndian_16(int16_t& x) { +/*! \fn void SwapEndian_16(int16_t& x) + \brief swaps endianness of signed 16-bit integer, in place + + Swaps endian representation of value in \a x. +*/ +API_EXPORT inline void SwapEndian_16(int16_t& x) { x = ((x >> 8) | (x << 8)); } -inline void SwapEndian_16(uint16_t& x) { +/*! \fn void SwapEndian_16(uint16_t& x) + \brief swaps endianness of unsigned 16-bit integer, in place + + Swaps endian representation of value in \a x. +*/ +API_EXPORT inline void SwapEndian_16(uint16_t& x) { x = ((x >> 8) | (x << 8)); } -// swaps endianness of 32-bit value 'in-place' -inline void SwapEndian_32(int32_t& x) { +/*! \fn void SwapEndian_32(int32_t& x) + \brief swaps endianness of signed 32-bit integer, in place + + Swaps endian representation of value in \a x. +*/ +API_EXPORT inline void SwapEndian_32(int32_t& x) { x = ( (x >> 24) | ((x << 8) & 0x00FF0000) | ((x >> 8) & 0x0000FF00) | @@ -165,7 +174,12 @@ inline void SwapEndian_32(int32_t& x) { ); } -inline void SwapEndian_32(uint32_t& x) { +/*! 
\fn void SwapEndian_32(uint32_t& x) + \brief swaps endianness of unsigned 32-bit integer, in place + + Swaps endian representation of value in \a x. +*/ +API_EXPORT inline void SwapEndian_32(uint32_t& x) { x = ( (x >> 24) | ((x << 8) & 0x00FF0000) | ((x >> 8) & 0x0000FF00) | @@ -173,8 +187,12 @@ inline void SwapEndian_32(uint32_t& x) { ); } -// swaps endianness of 64-bit value 'in-place' -inline void SwapEndian_64(int64_t& x) { +/*! \fn void SwapEndian_64(int64_t& x) + \brief swaps endianness of signed 64-bit integer, in place + + Swaps endian representation of value in \a x. +*/ +API_EXPORT inline void SwapEndian_64(int64_t& x) { x = ( (x >> 56) | ((x << 40) & 0x00FF000000000000ll) | ((x << 24) & 0x0000FF0000000000ll) | @@ -186,7 +204,12 @@ inline void SwapEndian_64(int64_t& x) { ); } -inline void SwapEndian_64(uint64_t& x) { +/*! \fn void SwapEndian_64(uint64_t& x) + \brief swaps endianness of unsigned 64-bit integer, in place + + Swaps endian representation of value in \a x. +*/ +API_EXPORT inline void SwapEndian_64(uint64_t& x) { x = ( (x >> 56) | ((x << 40) & 0x00FF000000000000ll) | ((x << 24) & 0x0000FF0000000000ll) | @@ -198,28 +221,235 @@ inline void SwapEndian_64(uint64_t& x) { ); } -// swaps endianness of 'next 2 bytes' in a char buffer (in-place) -inline void SwapEndian_16p(char* data) { +/*! \fn void SwapEndian_16p(char* data) + \brief swaps endianness of the next 2 bytes in a buffer, in place + + Swaps endian representation the next 2 bytes in \a data. +*/ +API_EXPORT inline void SwapEndian_16p(char* data) { uint16_t& value = (uint16_t&)*data; SwapEndian_16(value); } -// swaps endianness of 'next 4 bytes' in a char buffer (in-place) -inline void SwapEndian_32p(char* data) { +/*! \fn void SwapEndian_32p(char* data) + \brief swaps endianness of the next 4 bytes in a buffer, in place + + Swaps endian representation the next 4 bytes in \a data. 
+*/ +API_EXPORT inline void SwapEndian_32p(char* data) { uint32_t& value = (uint32_t&)*data; SwapEndian_32(value); } -// swaps endianness of 'next 8 bytes' in a char buffer (in-place) -inline void SwapEndian_64p(char* data) { +/*! \fn void SwapEndian_64p(char* data) + \brief swaps endianness of the next 8 bytes in a buffer, in place + + Swaps endian representation the next 8 bytes in \a data. +*/ +API_EXPORT inline void SwapEndian_64p(char* data) { uint64_t& value = (uint64_t&)*data; SwapEndian_64(value); } -// returns whether file exists (can be opened OK) -inline bool FileExists(const std::string& filename) { - std::ifstream f(filename.c_str(), std::ifstream::in); - return !f.fail(); +/*! \fn bool SystemIsBigEndian(void) + \brief checks host architecture's byte order + \return \c true if system uses big-endian ordering +*/ +API_EXPORT inline bool SystemIsBigEndian(void) { + const uint16_t one = 0x0001; + return ((*(char*) &one) == 0 ); +} + +/*! \fn void PackUnsignedInt(char* buffer, unsigned int value) + \brief stores unsigned integer value in a byte buffer + + \param buffer destination buffer + \param value unsigned integer to 'pack' in buffer +*/ +API_EXPORT inline void PackUnsignedInt(char* buffer, unsigned int value) { + buffer[0] = (char)value; + buffer[1] = (char)(value >> 8); + buffer[2] = (char)(value >> 16); + buffer[3] = (char)(value >> 24); +} + +/*! \fn void PackUnsignedShort(char* buffer, unsigned short value) + \brief stores unsigned short integer value in a byte buffer + + \param buffer destination buffer + \param value unsigned short integer to 'pack' in buffer +*/ +API_EXPORT inline void PackUnsignedShort(char* buffer, unsigned short value) { + buffer[0] = (char)value; + buffer[1] = (char)(value >> 8); +} + +/*! 
\fn double UnpackDouble(const char* buffer) + \brief reads a double value from byte buffer + + \param buffer source byte buffer + \return the (double) value read from the buffer +*/ +API_EXPORT inline double UnpackDouble(const char* buffer) { + union { double value; unsigned char valueBuffer[sizeof(double)]; } un; + un.value = 0; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + un.valueBuffer[2] = buffer[2]; + un.valueBuffer[3] = buffer[3]; + un.valueBuffer[4] = buffer[4]; + un.valueBuffer[5] = buffer[5]; + un.valueBuffer[6] = buffer[6]; + un.valueBuffer[7] = buffer[7]; + return un.value; +} + +/*! \fn double UnpackDouble(char* buffer) + \brief reads a double value from byte buffer + + This is an overloaded function. + + \param buffer source byte buffer + \return the (double) value read from the buffer +*/ +API_EXPORT inline double UnpackDouble(char* buffer) { + return UnpackDouble( (const char*)buffer ); +} + +/*! \fn float UnpackFloat(const char* buffer) + \brief reads a float value from byte buffer + + \param buffer source byte buffer + \return the (float) value read from the buffer +*/ +API_EXPORT inline float UnpackFloat(const char* buffer) { + union { float value; unsigned char valueBuffer[sizeof(float)]; } un; + un.value = 0; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + un.valueBuffer[2] = buffer[2]; + un.valueBuffer[3] = buffer[3]; + return un.value; +} + +/*! \fn float UnpackFloat(char* buffer) + \brief reads a float value from byte buffer + + This is an overloaded function. + + \param buffer source byte buffer + \return the (float) value read from the buffer +*/ +API_EXPORT inline float UnpackFloat(char* buffer) { + return UnpackFloat( (const char*)buffer ); +} + +/*! 
\fn signed int UnpackSignedInt(const char* buffer) + \brief reads a signed integer value from byte buffer + + \param buffer source byte buffer + \return the (signed int) value read from the buffer +*/ +API_EXPORT inline signed int UnpackSignedInt(const char* buffer) { + union { signed int value; unsigned char valueBuffer[sizeof(signed int)]; } un; + un.value = 0; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + un.valueBuffer[2] = buffer[2]; + un.valueBuffer[3] = buffer[3]; + return un.value; +} + +/*! \fn signed int UnpackSignedInt(char* buffer) + \brief reads a signed integer value from byte buffer + + This is an overloaded function. + + \param buffer source byte buffer + \return the (signed int) value read from the buffer +*/ +API_EXPORT inline signed int UnpackSignedInt(char* buffer) { + return UnpackSignedInt( (const char*) buffer ); +} + +/*! \fn signed short UnpackSignedShort(const char* buffer) + \brief reads a signed short integer value from byte buffer + + \param buffer source byte buffer + \return the (signed short) value read from the buffer +*/ +API_EXPORT inline signed short UnpackSignedShort(const char* buffer) { + union { signed short value; unsigned char valueBuffer[sizeof(signed short)]; } un; + un.value = 0; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + return un.value; +} + +/*! \fn signed short UnpackSignedShort(char* buffer) + \brief reads a signed short integer value from byte buffer + + This is an overloaded function. + + \param buffer source byte buffer + \return the (signed short) value read from the buffer +*/ +API_EXPORT inline signed short UnpackSignedShort(char* buffer) { + return UnpackSignedShort( (const char*)buffer ); +} + +/*! 
\fn unsigned int UnpackUnsignedInt(const char* buffer) + \brief reads an unsigned integer value from byte buffer + + \param buffer source byte buffer + \return the (unsigned int) value read from the buffer +*/ +API_EXPORT inline unsigned int UnpackUnsignedInt(const char* buffer) { + union { unsigned int value; unsigned char valueBuffer[sizeof(unsigned int)]; } un; + un.value = 0; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + un.valueBuffer[2] = buffer[2]; + un.valueBuffer[3] = buffer[3]; + return un.value; +} + +/*! \fn unsigned int UnpackUnsignedInt(char* buffer) + \brief reads an unsigned integer value from byte buffer + + This is an overloaded function. + + \param buffer source byte buffer + \return the (unsigned int) value read from the buffer +*/ +API_EXPORT inline unsigned int UnpackUnsignedInt(char* buffer) { + return UnpackUnsignedInt( (const char*)buffer ); +} + +/*! \fn unsigned short UnpackUnsignedShort(const char* buffer) + \brief reads an unsigned short integer value from byte buffer + + \param buffer source byte buffer + \return the (unsigned short) value read from the buffer +*/ +API_EXPORT inline unsigned short UnpackUnsignedShort(const char* buffer) { + union { unsigned short value; unsigned char valueBuffer[sizeof(unsigned short)]; } un; + un.value = 0; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + return un.value; +} + +/*! \fn unsigned short UnpackUnsignedShort(char* buffer) + \brief reads an unsigned short integer value from byte buffer + + This is an overloaded function. + + \param buffer source byte buffer + \return the (unsigned short) value read from the buffer +*/ +API_EXPORT inline unsigned short UnpackUnsignedShort(char* buffer) { + return UnpackUnsignedShort( (const char*)buffer ); } } // namespace BamTools diff --git a/src/api/BamConstants.h b/src/api/BamConstants.h index 5bf03f9..6a97980 100644 --- a/src/api/BamConstants.h +++ b/src/api/BamConstants.h @@ -3,17 +3,22 @@ #include +/*! 
\namespace BamTools::Constants + \brief Contains most of the constants used throughout BamTools. +*/ + namespace BamTools { namespace Constants { const int BAM_SIZEOF_INT = 4; // header magic number -const char* const BAM_HEADER_MAGIC = "BAM\001"; -const unsigned int BAM_HEADER_MAGIC_SIZE = 4; +const char* const BAM_HEADER_MAGIC = "BAM\1"; +const unsigned int BAM_HEADER_MAGIC_LENGTH = 4; // BAM alignment core size const int BAM_CORE_SIZE = 32; +const int BAM_CORE_BUFFER_SIZE = 8; // BAM alignment flags const int BAM_ALIGNMENT_PAIRED = 1; @@ -38,11 +43,68 @@ const int BAM_CIGAR_SOFTCLIP = 4; const int BAM_CIGAR_HARDCLIP = 5; const int BAM_CIGAR_PAD = 6; +const char BAM_CIGAR_MATCH_CHAR = 'M'; +const char BAM_CIGAR_INS_CHAR = 'I'; +const char BAM_CIGAR_DEL_CHAR = 'D'; +const char BAM_CIGAR_REFSKIP_CHAR = 'N'; +const char BAM_CIGAR_SOFTCLIP_CHAR = 'S'; +const char BAM_CIGAR_HARDCLIP_CHAR = 'H'; +const char BAM_CIGAR_PAD_CHAR = 'P'; + const int BAM_CIGAR_SHIFT = 4; const int BAM_CIGAR_MASK = ((1 << BAM_CIGAR_SHIFT) - 1); +// BAM tag types +const char BAM_TAG_TYPE_ASCII = 'A'; +const char BAM_TAG_TYPE_UINT8 = 'c'; +const char BAM_TAG_TYPE_INT8 = 'C'; +const char BAM_TAG_TYPE_UINT16 = 's'; +const char BAM_TAG_TYPE_INT16 = 'S'; +const char BAM_TAG_TYPE_UINT32 = 'i'; +const char BAM_TAG_TYPE_INT32 = 'I'; +const char BAM_TAG_TYPE_FLOAT = 'f'; +const char BAM_TAG_TYPE_STRING = 'Z'; +const char BAM_TAG_TYPE_HEX = 'H'; + +const size_t BAM_TAG_TAGSIZE = 2; +const size_t BAM_TAG_TYPESIZE = 1; + // DNA bases const char* const BAM_DNA_LOOKUP = "=ACMGRSVTWYHKDBN"; +const unsigned char BAM_BASECODE_EQUAL = 0; +const unsigned char BAM_BASECODE_A = 1; +const unsigned char BAM_BASECODE_C = 2; +const unsigned char BAM_BASECODE_G = 4; +const unsigned char BAM_BASECODE_T = 8; +const unsigned char BAM_BASECODE_N = 15; + +const char BAM_DNA_EQUAL = '='; +const char BAM_DNA_A = 'A'; +const char BAM_DNA_C = 'C'; +const char BAM_DNA_G = 'G'; +const char BAM_DNA_T = 'T'; +const char BAM_DNA_N = 
'N'; +const char BAM_DNA_DEL = '-'; +const char BAM_DNA_PAD = '*'; + +// zlib constants +const int GZIP_ID1 = 31; +const int GZIP_ID2 = 139; +const int CM_DEFLATE = 8; +const int FLG_FEXTRA = 4; +const int OS_UNKNOWN = 255; +const int BGZF_XLEN = 6; +const int BGZF_ID1 = 66; +const int BGZF_ID2 = 67; +const int BGZF_LEN = 2; +const int GZIP_WINDOW_BITS = -15; +const int Z_DEFAULT_MEM_LEVEL = 8; + +// BZGF constants +const int BGZF_BLOCK_HEADER_LENGTH = 18; +const int BGZF_BLOCK_FOOTER_LENGTH = 8; +const int BGZF_MAX_BLOCK_SIZE = 65536; +const int BGZF_DEFAULT_BLOCK_SIZE = 65536; } // namespace Constants } // namespace BamTools diff --git a/src/api/BamIndex.cpp b/src/api/BamIndex.cpp index dbfe1c9..3e5f86e 100644 --- a/src/api/BamIndex.cpp +++ b/src/api/BamIndex.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 22 November 2010 (DB) +// Last modified: 23 March 2011 (DB) // --------------------------------------------------------------------------- // Provides index functionality - both for the default (standardized) BAM // index format (.bai) as well as a BamTools-specific (nonstandard) index @@ -12,7 +12,6 @@ #include #include -#include #include #include using namespace BamTools; @@ -25,83 +24,35 @@ using namespace BamTools::Internal; #include using namespace std; -// -------------------------------------------------- -// BamIndex factory methods - -// returns index based on BAM filename 'stub' -// checks first for preferred type, returns that type if found -// (if not found, attmempts to load other type(s), returns 0 if NONE found) -// -// ** default preferred type is BamToolsIndex ** use this anytime it exists -BamIndex* BamIndex::FromBamFilename(const std::string& bamFilename, - BamTools::BgzfData* bgzf, - BamTools::BamReader* reader, - const BamIndex::PreferredIndexType& type) -{ - // 
--------------------------------------------------- - // attempt to load preferred type first - - const std::string bamtoolsIndexFilename = bamFilename + ".bti"; - const bool bamtoolsIndexExists = BamTools::FileExists(bamtoolsIndexFilename); - if ( (type == BamIndex::BAMTOOLS) && bamtoolsIndexExists ) - return new BamToolsIndex(bgzf, reader); - - const std::string standardIndexFilename = bamFilename + ".bai"; - const bool standardIndexExists = BamTools::FileExists(standardIndexFilename); - if ( (type == BamIndex::STANDARD) && standardIndexExists ) - return new BamStandardIndex(bgzf, reader); - - // ---------------------------------------------------- - // preferred type could not be found, try other (non-preferred) types - // if none found, return 0 - - if ( bamtoolsIndexExists ) return new BamToolsIndex(bgzf, reader); - if ( standardIndexExists ) return new BamStandardIndex(bgzf, reader); - return 0; -} - -// returns index based on explicitly named index file (or 0 if not found) -BamIndex* BamIndex::FromIndexFilename(const std::string& indexFilename, - BamTools::BgzfData* bgzf, - BamTools::BamReader* reader) -{ - // see if specified file exists - const bool indexExists = BamTools::FileExists(indexFilename); - if ( !indexExists ) return 0; +/*! \class BamTools::BamIndex + \brief Provides methods for generating & loading BAM index files. - const std::string bamtoolsIndexExtension(".bti"); - const std::string standardIndexExtension(".bai"); + This class straddles the line between public API and internal + implementation detail. Most client code should never have to use this + class directly. - // if has bamtoolsIndexExtension - if ( indexFilename.find(bamtoolsIndexExtension) == (indexFilename.length() - bamtoolsIndexExtension.length()) ) - return new BamToolsIndex(bgzf, reader); + It is exposed to the public API to allow advanced users to implement + their own custom indexing schemes. 
- // if has standardIndexExtension - if ( indexFilename.find(standardIndexExtension) == (indexFilename.length() - standardIndexExtension.length()) ) - return new BamStandardIndex(bgzf, reader); - - // otherwise, unsupported file type - return 0; -} - -// ------------------------------- -// BamIndex implementation + N.B. - Please note that if you wish to derive your own subclass, you are + entering waters that are not well-documented at the moment and are + likely to be changing soon anyway. Implementing a custom index is + technically do-able at the moment, but the learning curve is (at the + moment) overly steep. Changes will be coming soon to alleviate some + of this headache. +*/ // ctor -BamIndex::BamIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader) - : m_BGZF(bgzf) - , m_reader(reader) +BamIndex::BamIndex(void) + : m_indexStream(0) + , m_indexFilename("") , m_cacheMode(BamIndex::LimitedIndexCaching) - , m_indexStream(0) -{ - if ( m_reader && m_reader->IsOpen() ) - m_references = m_reader->GetReferenceData(); -} +{ } // dtor BamIndex::~BamIndex(void) { - if ( IsOpen() ) - fclose(m_indexStream); + if ( IsOpen() ) fclose(m_indexStream); + m_indexFilename = ""; } // return true if FILE* is open @@ -114,7 +65,7 @@ bool BamIndex::Load(const string& filename) { // open index file, abort on error if ( !OpenIndexFile(filename, "rb") ) { - fprintf(stderr, "ERROR: Unable to open the BAM index file %s for reading.\n", filename.c_str()); + fprintf(stderr, "BamIndex ERROR: unable to open the BAM index file %s for reading.\n", filename.c_str()); return false; } @@ -140,8 +91,14 @@ bool BamIndex::Load(const string& filename) { // opens index file for reading/writing, return true if opened OK bool BamIndex::OpenIndexFile(const string& filename, const string& mode) { + + // attempt to open file, return false if error m_indexStream = fopen(filename.c_str(), mode.c_str()); - return ( m_indexStream != 0 ); + if ( m_indexStream == 0 ) return false; + + // otherwise save 
filename & return sucess + m_indexFilename = filename; + return true; } // rewind index file to beginning of index data, return true if rewound OK @@ -150,7 +107,7 @@ bool BamIndex::Rewind(void) { } // change the index caching behavior -void BamIndex::SetCacheMode(const BamIndexCacheMode mode) { +void BamIndex::SetCacheMode(const BamIndex::IndexCacheMode& mode) { if ( mode != m_cacheMode ) { m_cacheMode = mode; UpdateCache(); @@ -180,9 +137,11 @@ void BamIndex::UpdateCache(void) { LoadFirstReference(true); } break; + case(BamIndex::NoIndexCaching) : ClearAllData(); break; + default : // unreachable ; @@ -195,13 +154,13 @@ bool BamIndex::Write(const string& bamFilename) { // open index file for writing string indexFilename = bamFilename + Extension(); if ( !OpenIndexFile(indexFilename, "wb") ) { - fprintf(stderr, "ERROR: Could not open file to save index.\n"); + fprintf(stderr, "BamIndex ERROR: could not open file to save index data.\n"); return false; } // write index header data if ( !WriteHeader() ) { - fprintf(stderr, "ERROR: There was a problem writing index metadata to new index file.\n"); + fprintf(stderr, "BamIndex ERROR: there was a problem writing index metadata to the new index file.\n"); fflush(m_indexStream); fclose(m_indexStream); exit(1); @@ -209,22 +168,23 @@ bool BamIndex::Write(const string& bamFilename) { // write main index data if ( !WriteAllReferences() ) { - fprintf(stderr, "ERROR: There was a problem writing index data to new index file.\n"); + fprintf(stderr, "BamIndex ERROR: there was a problem writing index data to the new index file.\n"); fflush(m_indexStream); fclose(m_indexStream); exit(1); } - // flush any remaining output, rewind file, and return success + // flush any remaining output fflush(m_indexStream); fclose(m_indexStream); // re-open index file for later reading if ( !OpenIndexFile(indexFilename, "rb") ) { - fprintf(stderr, "ERROR: Could not open newly created index file for reading.\n"); + fprintf(stderr, "BamIndex ERROR: 
could not open newly created index file for reading.\n"); return false; } - // return success/failure of write + // save index filename & return success + m_indexFilename = indexFilename; return true; } diff --git a/src/api/BamIndex.h b/src/api/BamIndex.h index d09106a..5ba1469 100644 --- a/src/api/BamIndex.h +++ b/src/api/BamIndex.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 19 November 2010 (DB) +// Last modified: 24 February 2011 (DB) // --------------------------------------------------------------------------- // Provides basic BAM index interface // *************************************************************************** @@ -20,61 +20,62 @@ namespace BamTools { class BamReader; -class BgzfData; namespace Internal { - class BamStandardIndex; - class BamToolsIndex; + class BamReaderPrivate; } // namespace Internal // -------------------------------------------------- // BamIndex base class class API_EXPORT BamIndex { - // specify index-caching behavior - // - // @FullIndexCaching - store entire index file contents in memory - // @LimitedIndexCaching - store only index data for current reference - // being processed - // @NoIndexCaching - do not store any index data. 
Load as needed to - // calculate jump offset - public: enum BamIndexCacheMode { FullIndexCaching = 0 - , LimitedIndexCaching - , NoIndexCaching - }; + // enums + public: + // specify index-caching behavior + enum IndexCacheMode { FullIndexCaching = 0 // store entire index file contents in memory + , LimitedIndexCaching // store only index data for current reference + , NoIndexCaching // do not store any index data between jumps + }; + + // list of supported BamIndex types + enum IndexType { BAMTOOLS = 0 + , STANDARD + }; // ctor & dtor public: - BamIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader); + BamIndex(void); virtual ~BamIndex(void); // index interface public: - // creates index data (in-memory) from current reader data - virtual bool Build(void) =0; + // creates index data (in-memory) from @reader data + virtual bool Build(Internal::BamReaderPrivate* reader) =0; // returns supported file extension - virtual const std::string Extension(void) const =0; + virtual const std::string Extension(void) =0; // returns whether reference has alignments or no virtual bool HasAlignments(const int& referenceID) const =0; - // attempts to use index to jump to region; returns success/fail + // attempts to use index data to jump to @region in @reader; returns success/fail // a "successful" jump indicates no error, but not whether this region has data - // * thus, the method sets a flag to indicate whether there are alignments + // * thus, the method sets a flag to indicate whether there are alignments // available after the jump position - virtual bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion) =0; + virtual bool Jump(Internal::BamReaderPrivate* reader, + const BamTools::BamRegion& region, + bool* hasAlignmentsInRegion) =0; // loads existing data from file into memory virtual bool Load(const std::string& filename); // change the index caching behavior - virtual void SetCacheMode(const BamIndexCacheMode mode); + virtual void 
SetCacheMode(const BamIndex::IndexCacheMode& mode); // writes in-memory index data out to file // N.B. - (this is the original BAM filename, method will modify it to use applicable extension) virtual bool Write(const std::string& bamFilename); - + // derived-classes MUST provide implementation protected: // clear all current index offset data in memory virtual void ClearAllData(void) =0; // return file position after header metadata - virtual const off_t DataBeginOffset(void) const =0; + virtual off_t DataBeginOffset(void) const =0; // return true if all index data is cached virtual bool HasFullDataCache(void) const =0; // clears index data from all references except the first @@ -94,7 +95,7 @@ class API_EXPORT BamIndex { // write index header data virtual bool WriteHeader(void) =0; - // internal methods + // internal methods (but available to derived classes) protected: // rewind index file to beginning of index data, return true if rewound OK bool Rewind(void); @@ -107,37 +108,11 @@ class API_EXPORT BamIndex { // updates in-memory cache of index data, depending on current cache mode void UpdateCache(void); - // factory methods for returning proper BamIndex-derived type based on available index files - public: - - // returns index based on BAM filename 'stub' - // checks first for preferred type, returns that type if found - // (if not found, attmempts to load other type(s), returns 0 if NONE found) - // - // ** default preferred type is BamToolsIndex ** use this anytime it exists - enum PreferredIndexType { BAMTOOLS = 0, STANDARD }; - static BamIndex* FromBamFilename(const std::string& bamFilename, - BamTools::BgzfData* bgzf, - BamTools::BamReader* reader, - const BamIndex::PreferredIndexType& type = BamIndex::BAMTOOLS); - - // returns index based on explicitly named index file (or 0 if not found) - static BamIndex* FromIndexFilename(const std::string& indexFilename, - BamTools::BgzfData* bgzf, - BamTools::BamReader* reader); - // data members protected: - 
BamTools::BgzfData* m_BGZF; - BamTools::BamReader* m_reader; - BamTools::RefVector m_references; - BamIndex::BamIndexCacheMode m_cacheMode; FILE* m_indexStream; - - - // friends - friend class Internal::BamStandardIndex; - friend class Internal::BamToolsIndex; + std::string m_indexFilename; + BamIndex::IndexCacheMode m_cacheMode; }; } // namespace BamTools diff --git a/src/api/BamMultiReader.cpp b/src/api/BamMultiReader.cpp index e5af282..06055df 100644 --- a/src/api/BamMultiReader.cpp +++ b/src/api/BamMultiReader.cpp @@ -3,12 +3,9 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 23 December 2010 (DB) +// Last modified: 15 March 2011 (DB) // --------------------------------------------------------------------------- -// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad -// Institute. -// --------------------------------------------------------------------------- -// Functionality for simultaneously reading multiple BAM files. +// Convenience class for reading multiple BAM files. // // This functionality allows applications to work on very large sets of files // without requiring intermediate merge, sort, and index steps for each file @@ -17,7 +14,6 @@ // *************************************************************************** #include -#include #include using namespace BamTools; @@ -25,83 +21,357 @@ using namespace BamTools; #include using namespace std; -// ----------------------------------------------------- -// BamMultiReader implementation -// ----------------------------------------------------- +/*! \class BamTools::BamMultiReader + \brief Convenience class for reading multiple BAM files. +*/ +/*! \fn BamMultiReader::BamMultiReader(void) + \brief constructor +*/ BamMultiReader::BamMultiReader(void) : d(new Internal::BamMultiReaderPrivate) { } +/*! 
\fn BamMultiReader::~BamMultiReader(void) + \brief destructor +*/ BamMultiReader::~BamMultiReader(void) { delete d; d = 0; } +/*! \fn void BamMultiReader::Close(void) + \brief Closes all open BAM files. + + Also clears out all header and reference data. + + \sa CloseFile(), IsOpen(), Open(), BamReader::Close() +*/ void BamMultiReader::Close(void) { d->Close(); } -bool BamMultiReader::CreateIndexes(bool useStandardIndex) { - return d->CreateIndexes(useStandardIndex); +/*! \fn void BamMultiReader::CloseFile(const std::string& filename) + \brief Closes requested BAM file. + + Leaves any other file(s) open, along with header and reference data. + + \sa Close(), IsOpen(), Open(), BamReader::Close() +*/ +void BamMultiReader::CloseFile(const std::string& filename) { + d->CloseFile(filename); +} + +/*! \fn bool BamMultiReader::CreateIndexes(const BamIndex::IndexType& type) + \brief Creates index files for the current BAM files. + + \param type file format to create, see BamIndex::IndexType for available formats + \return \c true if index files created OK + \sa LocateIndexes(), OpenIndexes(), BamReader::CreateIndex() +*/ +bool BamMultiReader::CreateIndexes(const BamIndex::IndexType& type) { + return d->CreateIndexes(type); } -void BamMultiReader::SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode) { - d->SetIndexCacheMode(mode); +/*! \fn const std::vector BamMultiReader::Filenames(void) const + \brief Returns list of filenames for all open BAM files. + + Retrieved filenames will contain whatever was passed via Open(). + If you need full directory paths here, be sure to include them + when you open the BAM files. + + \returns names of open BAM files. If no files are open, returns an empty vector. + \sa IsOpen(), BamReader::GetFilename() +*/ +const std::vector BamMultiReader::Filenames(void) const { + return d->Filenames(); +} + +/*! \fn SamHeader BamMultiReader::GetHeader(void) const + \brief Returns unified SAM-format header for all files + + N.B. 
- Modifying the retrieved text does NOT affect the current + BAM files. These files have been opened in a read-only mode. However, + your modified header text can be used in conjunction with BamWriter + to generate a new BAM file with the appropriate header information. + + \returns header data wrapped in SamHeader object + \sa GetHeaderText(), BamReader::GetHeader() +*/ +SamHeader BamMultiReader::GetHeader(void) const { + return d->GetHeader(); } -const string BamMultiReader::GetHeaderText(void) const { +/*! \fn std::string BamMultiReader::GetHeaderText(void) const + \brief Returns unified SAM-format header text for all files + + N.B. - Modifying the retrieved text does NOT affect the current + BAM files. These files have been opened in a read-only mode. However, + your modified header text can be used in conjunction with BamWriter + to generate a new BAM file with the appropriate header information. + + \returns SAM-formatted header text + \sa GetHeader(), BamReader::GetHeaderText() +*/ +std::string BamMultiReader::GetHeaderText(void) const { return d->GetHeaderText(); } +/*! \fn bool BamMultiReader::GetNextAlignment(BamAlignment& alignment) + \brief Retrieves next available alignment. + + Equivalent to BamReader::GetNextAlignment() with respect to what is a valid + overlapping alignment and what data gets populated. + + This method takes care of determining which alignment actually is 'next' + across multiple files, depending on current SortOrder. + + \param alignment destination for alignment record data + \returns \c true if a valid alignment was found + \sa GetNextAlignmentCore(), SetRegion(), SetSortOrder(), BamReader::GetNextAlignment() +*/ bool BamMultiReader::GetNextAlignment(BamAlignment& nextAlignment) { return d->GetNextAlignment(nextAlignment); } +/*! \fn bool BamMultiReader::GetNextAlignmentCore(BamAlignment& alignment) + \brief Retrieves next available alignment. 
+ + Equivalent to BamReader::GetNextAlignmentCore() with respect to what is a valid + overlapping alignment and what data gets populated. + + This method takes care of determining which alignment actually is 'next' + across multiple files, depending on current SortOrder. + + \param alignment destination for alignment record data + \returns \c true if a valid alignment was found + \sa GetNextAlignment(), SetRegion(), SetSortOrder(), BamReader::GetNextAlignmentCore() +*/ bool BamMultiReader::GetNextAlignmentCore(BamAlignment& nextAlignment) { return d->GetNextAlignmentCore(nextAlignment); } -const int BamMultiReader::GetReferenceCount(void) const { +/*! \fn int BamMultiReader::GetReferenceCount(void) const + \brief Returns number of reference sequences. + \sa BamReader::GetReferenceCount() +*/ +int BamMultiReader::GetReferenceCount(void) const { return d->GetReferenceCount(); } +/*! \fn const RefVector& BamMultiReader::GetReferenceData(void) const + \brief Returns all reference sequence entries. + \sa RefData, BamReader::GetReferenceData() +*/ const BamTools::RefVector BamMultiReader::GetReferenceData(void) const { return d->GetReferenceData(); } -const int BamMultiReader::GetReferenceID(const string& refName) const { +/*! \fn int BamMultiReader::GetReferenceID(const std::string& refName) const + \brief Returns the ID of the reference with this name. + + If \a refName is not found, returns -1. + + \sa BamReader::GetReferenceID() +*/ +int BamMultiReader::GetReferenceID(const std::string& refName) const { return d->GetReferenceID(refName); } -bool BamMultiReader::HasOpenReaders() { +/*! \fn bool BamMultiReader::HasIndexes(void) const + \brief Returns \c true if all BAM files have index data available. + \sa BamReader::HasIndex() +*/ +bool BamMultiReader::HasIndexes(void) const { + return d->HasIndexes(); +} + +/*! \fn bool BamMultiReader::HasOpenReaders(void) const + \brief Returns \c true if there are any open BAM files. 
+*/ +bool BamMultiReader::HasOpenReaders(void) const { return d->HasOpenReaders(); } +/*! \fn bool BamMultiReader::IsIndexLoaded(void) const + \brief Returns \c true if all BAM files have index data available. + + \deprecated Instead use HasIndexes() + \cond + See explanation in BamReader.cpp for more details on the deprecation decision. + \endcond +*/ + bool BamMultiReader::IsIndexLoaded(void) const { - return d->IsIndexLoaded(); + return d->HasIndexes(); } +/*! \fn bool BamMultiReader::Jump(int refID, int position) + \brief Performs a random-access jump within current BAM files. + + This is a convenience method, equivalent to calling SetRegion() + with only a left boundary specified. + + \returns \c true if jump was successful + \sa HasIndexes(), BamReader::Jump() +*/ + bool BamMultiReader::Jump(int refID, int position) { return d->Jump(refID, position); } -bool BamMultiReader::Open(const vector& filenames, - bool openIndexes, - bool coreMode, - bool preferStandardIndex) -{ - return d->Open(filenames, openIndexes, coreMode, preferStandardIndex); +/*! \fn bool BamMultiReader::LocateIndexes(const BamIndex::IndexType& preferredType) + \brief Looks for index files that match current BAM files. + + Use this function when you need index files, and perhaps have a + preferred index format, but do not depend heavily on which indexes + actually get loaded at runtime. + + For each BAM file, this function will defer to your \a preferredType + whenever possible. However, if an index file of \a preferredType can + not be found, then it will look for any other index file that matches + that BAM file. + + An example case would look like this: + \code + + BamMultiReader reader; + // do setup + + // ensure that all files have an index + if ( !reader.LocateIndexes() ) // opens any existing index files that match our BAM files + reader.CreateIndexes(); // creates index files for BAM files that still lack one + + // do interesting stuff + // ... 
+ + \endcode + + If you want precise control over which index files are loaded, use OpenIndexes() + with the desired index filenames. If that function returns false, you can use + CreateIndexes() to then build index files of the exact requested format. + + \param preferredType desired index file format, see BamIndex::IndexType for available formats + \returns \c true if index files could be found for \b ALL open BAM files + \sa BamReader::LocateIndex() +*/ +bool BamMultiReader::LocateIndexes(const BamIndex::IndexType& preferredType) { + return d->LocateIndexes(preferredType); +} + +/*! \fn bool BamMultiReader::Open(const std::vector& filenames) + \brief Opens BAM files. + + N.B. - Opening BAM files will invalidate any current region set on the multireader. + All file pointers will be returned to the beginning of the alignment data. + Follow this with Jump() or SetRegion() to establish a region of interest. + + \param filenames list of BAM filenames to open + \returns \c true if BAM files were opened successfully + \sa Close(), HasOpenReaders(), OpenFile(), OpenIndexes(), BamReader::Open() +*/ +bool BamMultiReader::Open(const std::vector& filenames) { + return d->Open(filenames); } +/*! \fn bool BamMultiReader::OpenFile(const std::string& filename) + \brief Opens a single BAM file. + + Adds another BAM file to multireader "on-the-fly". + + N.B. - Opening a BAM file invalidates any current region set on the multireader. + All file pointers will be returned to the beginning of the alignment data. + Follow this with Jump() or SetRegion() to establish a region of interest. + + \param filename BAM filename to open + \returns \c true if BAM file was opened successfully + \sa Close(), HasOpenReaders(), Open(), OpenIndexes(), BamReader::Open() +*/ +bool BamMultiReader::OpenFile(const std::string& filename) { + return d->OpenFile(filename); +} + +/*! \fn bool BamMultiReader::OpenIndexes(const std::vector& indexFilenames) + \brief Opens index files for current BAM files. 
+ + N.B. - Currently assumes that index filenames match the order (and number) of + BAM files passed to Open(). + + \param indexFilenames list of BAM index file names + \returns \c true if BAM index files were opened & data loaded successfully + \sa LocateIndexes(), Open(), BamReader::SetIndex(), BamReader::OpenIndex() +*/ +bool BamMultiReader::OpenIndexes(const std::vector& indexFilenames) { + return d->OpenIndexes(indexFilenames); +} + +/*! \fn void BamMultiReader::PrintFilenames(void) const + \brief Convenience method for printing filenames to stdout. + \deprecated Doesn't really belong as an API function. Clients should + determine how the data is reported. + \sa Filenames(), BamReader::GetFilename() +*/ void BamMultiReader::PrintFilenames(void) const { d->PrintFilenames(); } +/*! \fn bool BamMultiReader::Rewind(void) + \brief Returns the internal file pointers to the beginning of alignment records. + + Useful for performing multiple sequential passes through BAM files. + Calling this function clears any prior region that may have been set. + + \returns \c true if rewind operation was successful + \sa Jump(), SetRegion(), BamReader::Rewind() +*/ bool BamMultiReader::Rewind(void) { return d->Rewind(); } +/*! \fn void BamMultiReader::SetIndexCacheMode(const BamIndex::IndexCacheMode& mode) + \brief Changes the caching behavior of the index data. + + Default mode is BamIndex::LimitedIndexCaching. + + \param mode desired cache mode for index, see BamIndex::IndexCacheMode for + description of the available cache modes + \sa HasIndexes(), BamReader::SetIndexCacheMode() +*/ +void BamMultiReader::SetIndexCacheMode(const BamIndex::IndexCacheMode& mode) { + d->SetIndexCacheMode(mode); +} + +/*! \fn bool BamMultiReader::SetRegion(const BamRegion& region) + \brief Sets a target region of interest + + Equivalent to calling BamReader::SetRegion() on all open BAM files. 
+ + \param region desired region-of-interest to activate + \returns \c true if ALL readers set the region successfully + \sa HasIndexes(), Jump(), BamReader::SetRegion() +*/ +bool BamMultiReader::SetRegion(const BamRegion& region) { + return d->SetRegion(region); +} + +/*! \fn bool BamMultiReader::SetRegion(const int& leftRefID, + const int& leftPosition, + const int& rightRefID, + const int& rightPosition) + \brief Sets a target region of interest + + This is an overloaded function. + + Equivalent to calling BamReader::SetRegion() on all open BAM files. + + \param leftRefID referenceID of region's left boundary + \param leftPosition position of region's left boundary + \param rightRefID reference ID of region's right boundary + \param rightPosition position of region's right boundary + + \returns \c true if ALL readers set the region successfully + \sa HasIndexes(), Jump(), BamReader::SetRegion() +*/ bool BamMultiReader::SetRegion(const int& leftRefID, const int& leftPosition, const int& rightRefID, @@ -111,10 +381,16 @@ bool BamMultiReader::SetRegion(const int& leftRefID, return d->SetRegion(region); } -bool BamMultiReader::SetRegion(const BamRegion& region) { - return d->SetRegion(region); -} +/*! \fn void BamMultiReader::SetSortOrder(const SortOrder& order) + \brief Sets the expected sorting order for reading across multiple BAM files. + + Default is BamMultiReader::SortedByPosition. + + The SortOrder determines how the reader determines which alignment is "next" + from among its open readers. 
+ \param order expected sort order +*/ void BamMultiReader::SetSortOrder(const SortOrder& order) { d->SetSortOrder(order); } diff --git a/src/api/BamMultiReader.h b/src/api/BamMultiReader.h index cc36efc..cc49ec8 100644 --- a/src/api/BamMultiReader.h +++ b/src/api/BamMultiReader.h @@ -1,120 +1,127 @@ -// *************************************************************************** -// BamMultiReader.h (c) 2010 Erik Garrison, Derek Barnett -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 17 January 2011 (DB) -// --------------------------------------------------------------------------- -// Functionality for simultaneously reading multiple BAM files -// *************************************************************************** - -#ifndef BAMMULTIREADER_H -#define BAMMULTIREADER_H - -#include -#include -#include -#include -#include -#include - -namespace BamTools { - -namespace Internal { - class BamMultiReaderPrivate; -} // namespace Internal - -class API_EXPORT BamMultiReader { - - // constructor / destructor - public: - BamMultiReader(void); - ~BamMultiReader(void); - - // public interface - public: - - // ---------------------- - // BAM file operations - // ---------------------- - - // close BAM files - void Close(void); - // opens BAM files (and optional BAM index files, if provided) - // @openIndexes - triggers index opening, useful for suppressing - // error messages during merging of files in which we may not have - // indexes. - // @coreMode - setup our first alignments using GetNextAlignmentCore(); - // also useful for merging - // @preferStandardIndex - look for standard BAM index ".bai" first. If false, - // will look for BamTools index ".bti". 
- bool Open(const std::vector& filenames, - bool openIndexes = true, - bool coreMode = false, - bool preferStandardIndex = false); - // returns whether underlying BAM readers ALL have an index loaded - // this is useful to indicate whether Jump() or SetRegion() are possible - bool IsIndexLoaded(void) const; - // performs random-access jump to reference, position - bool Jump(int refID, int position = 0); - // list files associated with this multireader - void PrintFilenames(void) const; - // sets the target region - bool SetRegion(const BamRegion& region); - bool SetRegion(const int& leftRefID, - const int& leftBound, - const int& rightRefID, - const int& rightBound); - // returns file pointers to beginning of alignments - bool Rewind(void); - - // ---------------------- - // access alignment data - // ---------------------- - - // retrieves next available alignment (returns success/fail) from all files - bool GetNextAlignment(BamAlignment& alignment); - // retrieves next available alignment (returns success/fail) from all files - // and populates the support data with information about the alignment - // *** BUT DOES NOT PARSE CHARACTER DATA FROM THE ALIGNMENT - bool GetNextAlignmentCore(BamAlignment& alignment); - // ... should this be private? - bool HasOpenReaders(void); - // set sort order for merging BAM files (i.e. which alignment from the files is 'next'?) 
- // default behavior is to sort by position, use this method to handle BAMs sorted by read name - enum SortOrder { SortedByPosition = 0 - , SortedByReadName - , Unsorted - }; - void SetSortOrder(const SortOrder& order); - - // ---------------------- - // access auxiliary data - // ---------------------- - - // returns unified SAM header text for all files - const std::string GetHeaderText(void) const; - // returns number of reference sequences - const int GetReferenceCount(void) const; - // returns vector of reference objects - const BamTools::RefVector GetReferenceData(void) const; - // returns reference id (used for BamMultiReader::Jump()) for the given reference name - const int GetReferenceID(const std::string& refName) const; - - // ---------------------- - // BAM index operations - // ---------------------- - - // creates index for BAM files which lack them, saves to files (default = bamFilename + ".bai") - bool CreateIndexes(bool useStandardIndex = true); - // sets the index caching mode for the readers - void SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode); - - // private implementation - private: - Internal::BamMultiReaderPrivate* d; -}; - -} // namespace BamTools - -#endif // BAMMULTIREADER_H +// *************************************************************************** +// BamMultiReader.h (c) 2010 Erik Garrison, Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 15 March 2011 (DB) +// --------------------------------------------------------------------------- +// Convenience class for reading multiple BAM files. 
+// *************************************************************************** + +#ifndef BAMMULTIREADER_H +#define BAMMULTIREADER_H + +#include +#include +#include +#include +#include +#include + +namespace BamTools { + +namespace Internal { + class BamMultiReaderPrivate; +} // namespace Internal + +class API_EXPORT BamMultiReader { + + public: + enum SortOrder { SortedByPosition = 0 + , SortedByReadName + , Unsorted + }; + + // constructor / destructor + public: + BamMultiReader(void); + ~BamMultiReader(void); + + // public interface + public: + + // ---------------------- + // BAM file operations + // ---------------------- + + // closes all open BAM files + void Close(void); + // close only the requested BAM file + void CloseFile(const std::string& filename); + // returns list of filenames for all open BAM files + const std::vector Filenames(void) const; + // returns true if multireader has any open BAM files + bool HasOpenReaders(void) const; + // performs random-access jump within current BAM files + bool Jump(int refID, int position = 0); + // opens BAM files + bool Open(const std::vector& filenames); + // opens a single BAM file, adding to any other current BAM files + bool OpenFile(const std::string& filename); + // returns file pointers to beginning of alignments + bool Rewind(void); + // sets the target region of interest + bool SetRegion(const BamRegion& region); + // sets the target region of interest + bool SetRegion(const int& leftRefID, + const int& leftPosition, + const int& rightRefID, + const int& rightPosition); + + // ---------------------- + // access alignment data + // ---------------------- + + // retrieves next available alignment + bool GetNextAlignment(BamAlignment& alignment); + // retrieves next available alignment (without populating the alignment's string data fields) + bool GetNextAlignmentCore(BamAlignment& alignment); + + // sets the expected sorting order for reading across multiple BAM files + void SetSortOrder(const SortOrder& 
order); + + // ---------------------- + // access auxiliary data + // ---------------------- + + // returns unified SAM header for all files + SamHeader GetHeader(void) const; + // returns unified SAM header text for all files + std::string GetHeaderText(void) const; + // returns number of reference sequences + int GetReferenceCount(void) const; + // returns all reference sequence entries. + const BamTools::RefVector GetReferenceData(void) const; + // returns the ID of the reference with this name. + int GetReferenceID(const std::string& refName) const; + + // ---------------------- + // BAM index operations + // ---------------------- + + // creates index files for current BAM files + bool CreateIndexes(const BamIndex::IndexType& type = BamIndex::STANDARD); + // returns true if all BAM files have index data available + bool HasIndexes(void) const; + // looks for index files that match current BAM files + bool LocateIndexes(const BamIndex::IndexType& preferredType = BamIndex::STANDARD); + // opens index files for current BAM files. + bool OpenIndexes(const std::vector& indexFilenames); + // changes the caching behavior of the index data + void SetIndexCacheMode(const BamIndex::IndexCacheMode& mode); + + // deprecated methods + public: + // returns \c true if all BAM files have index data available. + bool IsIndexLoaded(void) const; + // convenience method for printing filenames to stdout + void PrintFilenames(void) const; + + // private implementation + private: + Internal::BamMultiReaderPrivate* d; +}; + +} // namespace BamTools + +#endif // BAMMULTIREADER_H diff --git a/src/api/BamReader.cpp b/src/api/BamReader.cpp index 473dc89..eaa6882 100644 --- a/src/api/BamReader.cpp +++ b/src/api/BamReader.cpp @@ -3,9 +3,9 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. 
// --------------------------------------------------------------------------- -// Last modified: 11 January 2011 (DB) +// Last modified: 4 March 2011 (DB) // --------------------------------------------------------------------------- -// Provides the basic functionality for reading BAM files +// Provides read access to BAM files. // *************************************************************************** #include @@ -20,48 +20,351 @@ using namespace BamTools::Internal; #include using namespace std; -// constructor -BamReader::BamReader(void) { - d = new BamReaderPrivate(this); -} +/*! \class BamTools::BamReader + \brief Provides read access to BAM files. +*/ + +/*! \fn BamReader::BamReader(void) + \brief constructor +*/ +BamReader::BamReader(void) + : d(new BamReaderPrivate(this)) +{ } -// destructor +/*! \fn BamReader::~BamReader(void) + \brief destructor +*/ BamReader::~BamReader(void) { delete d; d = 0; } -// file operations -void BamReader::Close(void) { d->Close(); } -bool BamReader::HasIndex(void) const { return d->HasIndex; } -bool BamReader::IsIndexLoaded(void) const { return HasIndex(); } -bool BamReader::IsOpen(void) const { return d->mBGZF.IsOpen; } -bool BamReader::Jump(int refID, int position) { return d->SetRegion( BamRegion(refID, position) ); } -bool BamReader::Open(const std::string& filename, - const std::string& indexFilename, - const bool lookForIndex, - const bool preferStandardIndex) -{ - return d->Open(filename, indexFilename, lookForIndex, preferStandardIndex); +/*! \fn void BamReader::Close(void) + \brief Closes the current BAM file. + + Also clears out all header and reference data. 
+ + \sa IsOpen(), Open() +*/ +void BamReader::Close(void) { + d->Close(); } -bool BamReader::Rewind(void) { return d->Rewind(); } -bool BamReader::SetRegion(const BamRegion& region) { return d->SetRegion(region); } -bool BamReader::SetRegion(const int& leftRefID, const int& leftBound, const int& rightRefID, const int& rightBound) { - return d->SetRegion( BamRegion(leftRefID, leftBound, rightRefID, rightBound) ); + +/*! \fn bool BamReader::CreateIndex(const BamIndex::IndexType& type) + \brief Creates an index file for current BAM file. + + \param type file format to create, see BamIndex::IndexType for available formats + \return \c true if index created OK + \sa LocateIndex(), OpenIndex() +*/ +bool BamReader::CreateIndex(const BamIndex::IndexType& type) { + return d->CreateIndex(type); +} + +/*! \fn const std::string BamReader::GetFilename(void) const + \brief Returns name of current BAM file. + + Retrieved filename will contain whatever was passed via Open(). + If you need full directory paths here, be sure to include them + when you open the BAM file. + + \returns name of open BAM file. If no file is open, returns an empty string. + \sa IsOpen() +*/ +const std::string BamReader::GetFilename(void) const { + return d->Filename(); +} + +/*! \fn SamHeader BamReader::GetHeader(void) const + \brief Returns SAM header data. + + Header data is wrapped in a SamHeader object that can be conveniently queried & modified. + + N.B. - Modifying the retrieved SamHeader object does NOT affect the + current BAM file. This file has been opened in a read-only mode. + However, your modified SamHeader object can be used in conjunction with + BamWriter to generate a new BAM file with the appropriate header information. + + \returns header data object + \sa GetHeaderText() +*/ +SamHeader BamReader::GetHeader(void) const { + return d->GetSamHeader(); +} + +/*! \fn std::string BamReader::GetHeaderText(void) const + \brief Returns SAM header data, as SAM-formatted text. + + N.B. 
- Modifying the retrieved text does NOT affect the current + BAM file. This file has been opened in a read-only mode. However, + your modified header text can be used in conjunction with BamWriter + to generate a new BAM file with the appropriate header information. + + \returns SAM-formatted header text + \sa GetHeader() +*/ +std::string BamReader::GetHeaderText(void) const { + return d->GetHeaderText(); +} + +/*! \fn bool BamReader::GetNextAlignment(BamAlignment& alignment) + \brief Retrieves next available alignment. + + Attempts to read the next alignment record from BAM file, and checks to see + if it overlaps the current region. If no region is currently set, then the + next alignment available is always considered valid. + + If a region has been set, via Jump() or SetRegion(), an alignment is only + considered valid if it overlaps the region. If the actual 'next' alignment record + in the BAM file does not overlap this region, then this function will read sequentially + through the file until the next alignment that overlaps this region is found. + Once the region has been exhausted (i.e. the next alignment loaded is beyond the region), + the function aborts and returns \c false. In this case, there is no point to continue + reading, assuming properly sorted alignments. + + This function fully populates all of the alignment's available data fields, + including the string data fields (read name, bases, qualities, tags, filename). + If only positional data (refID, position, CIGAR ops, alignment flags, etc.) + are required, consider using GetNextAlignmentCore() for a significant + performance boost. + + \param alignment destination for alignment record data + \returns \c true if a valid alignment was found +*/ +bool BamReader::GetNextAlignment(BamAlignment& alignment) { + return d->GetNextAlignment(alignment); +} + +/*! 
\fn bool BamReader::GetNextAlignmentCore(BamAlignment& alignment) + \brief Retrieves next available alignment, without populating the alignment's string data fields. + + Equivalent to GetNextAlignment() with respect to what is a valid overlapping alignment. + + However, this method does NOT populate the alignment's string data fields + (read name, bases, qualities, tags, filename). This provides a boost in speed + when these fields are not required for every alignment. These fields can be + populated 'lazily' (as needed) by calling BamAlignment::BuildCharData() later. + + \param alignment destination for alignment record data + \returns \c true if a valid alignment was found + \sa SetRegion() +*/ +bool BamReader::GetNextAlignmentCore(BamAlignment& alignment) { + return d->GetNextAlignmentCore(alignment); +} + +/*! \fn int BamReader::GetReferenceCount(void) const + \brief Returns number of reference sequences. +*/ +int BamReader::GetReferenceCount(void) const { + return d->GetReferenceCount(); +} + +/*! \fn const RefVector& BamReader::GetReferenceData(void) const + \brief Returns all reference sequence entries. + \sa RefData +*/ +const RefVector& BamReader::GetReferenceData(void) const { + return d->GetReferenceData(); +} + +/*! \fn int BamReader::GetReferenceID(const std::string& refName) const + \brief Returns the ID of the reference with this name. + + If \a refName is not found, returns -1. +*/ +int BamReader::GetReferenceID(const std::string& refName) const { + return d->GetReferenceID(refName); +} + +/*! \fn bool BamReader::HasIndex(void) const + \brief Returns \c true if index data is available. +*/ +bool BamReader::HasIndex(void) const { + return d->HasIndex(); +} + +/*! \fn bool BamReader::IsIndexLoaded(void) const + \brief Returns \c true if index data is available. 
+ + \deprecated Instead use HasIndex() + \cond + Deprecated purely for API semantic clarity - HasIndex() should be clearer + than IsIndexLoaded() in light of the new caching modes that may clear the + index data from memory, but leave the index file open for later random access + seeks. + + For example, what would (IsIndexLoaded() == true) mean when cacheMode has been + explicitly set to NoIndexCaching? This is confusing at best, misleading about + current memory behavior at worst. + \endcond +*/ +bool BamReader::IsIndexLoaded(void) const { + return d->HasIndex(); } -// access alignment data -bool BamReader::GetNextAlignment(BamAlignment& bAlignment) { return d->GetNextAlignment(bAlignment); } -bool BamReader::GetNextAlignmentCore(BamAlignment& bAlignment) { return d->GetNextAlignmentCore(bAlignment); } +/*! \fn bool BamReader::IsOpen(void) const + \brief Returns \c true if a BAM file is open for reading. +*/ +bool BamReader::IsOpen(void) const { + return d->IsOpen(); +} + +/*! \fn bool BamReader::Jump(int refID, int position) + \brief Performs a random-access jump within BAM file. + + This is a convenience method, equivalent to calling SetRegion() + with only a left boundary specified. + + \returns \c true if jump was successful + \sa HasIndex() +*/ +bool BamReader::Jump(int refID, int position) { + return d->SetRegion( BamRegion(refID, position) ); +} + +/*! \fn bool BamReader::LocateIndex(const BamIndex::IndexType& preferredType) + \brief Looks in BAM file's directory for a matching index file. + + Use this function when you need an index file, and perhaps have a + preferred index format, but do not depend heavily on which format + actually gets loaded at runtime. + + This function will defer to your \a preferredType whenever possible. + However, if an index file of \a preferredType can not be found, then + it will look for any other index file that corresponds to this BAM file. 
+ + If you want precise control over which index file is loaded, use OpenIndex() + with the desired index filename. If that function returns false, you can use + CreateIndex() to then build an index of the exact requested format. + + \param preferredType desired index file format, see BamIndex::IndexType for available formats + \returns \c true if (any) index file could be found +*/ +bool BamReader::LocateIndex(const BamIndex::IndexType& preferredType) { + return d->LocateIndex(preferredType); +} + +/*! \fn bool BamReader::Open(const std::string& filename) + \brief Opens a BAM file. -// access auxiliary data -SamHeader BamReader::GetHeader(void) const { return d->GetSamHeader(); } -const string BamReader::GetHeaderText(void) const { return d->GetHeaderText(); } -int BamReader::GetReferenceCount(void) const { return d->References.size(); } -const RefVector& BamReader::GetReferenceData(void) const { return d->References; } -int BamReader::GetReferenceID(const string& refName) const { return d->GetReferenceID(refName); } -const std::string BamReader::GetFilename(void) const { return d->Filename; } + If BamReader is already opened on another file, this function closes + that file, then attempts to open requested \a filename. + + \param filename name of BAM file to open + \returns \c true if BAM file was opened successfully + \sa Close(), IsOpen(), OpenIndex() +*/ +bool BamReader::Open(const std::string& filename) { + return d->Open(filename); +} + +/*! \fn bool BamReader::OpenIndex(const std::string& indexFilename) + \brief Opens a BAM index file. + + \param indexFilename name of BAM index file + + \returns \c true if BAM index file was opened & data loaded successfully + \sa LocateIndex(), Open(), SetIndex() +*/ +bool BamReader::OpenIndex(const std::string& indexFilename) { + return d->OpenIndex(indexFilename); +} + +/*! \fn bool BamReader::Rewind(void) + \brief Returns the internal file pointer to the first alignment record. 
+ + Useful for performing multiple sequential passes through a BAM file. + Calling this function clears any prior region that may have been set. + + N.B. - Note that this function sets the file pointer to first alignment record + in the BAM file, NOT the beginning of the file. + + \returns \c true if rewind operation was successful + \sa Jump(), SetRegion() +*/ +bool BamReader::Rewind(void) { + return d->Rewind(); +} -// index operations -bool BamReader::CreateIndex(bool useStandardIndex) { return d->CreateIndex(useStandardIndex); } -void BamReader::SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode) { d->SetIndexCacheMode(mode); } +/*! \fn void BamReader::SetIndex(BamIndex* index) + \brief Sets a custom BamIndex on this reader. + + Only necessary for custom BamIndex subclasses. Most clients should + never have to use this function. + + Example: + \code + BamReader reader; + reader.SetIndex(new MyCustomBamIndex); + \endcode + + N.B. - BamReader takes ownership of \a index - i.e. BamReader will + take care of deleting the pointer when the reader is destructed, + when the current BAM file is closed, or when a new index is requested. + + \param index custom BamIndex subclass created by client + \sa CreateIndex(), LocateIndex(), OpenIndex() +*/ +void BamReader::SetIndex(BamIndex* index) { + d->SetIndex(index); +} + +/*! \fn void BamReader::SetIndexCacheMode(const BamIndex::IndexCacheMode& mode) + \brief Changes the caching behavior of the index data. + + Default mode is BamIndex::LimitedIndexCaching. + + \param mode desired cache mode for index, see BamIndex::IndexCacheMode for + description of the available cache modes + \sa HasIndex() +*/ +void BamReader::SetIndexCacheMode(const BamIndex::IndexCacheMode& mode) { + d->SetIndexCacheMode(mode); +} + +/*! \fn bool BamReader::SetRegion(const BamRegion& region) + \brief Sets a target region of interest + + Requires that index data be available. 
Attempts a random-access + jump in the BAM file, near \a region left boundary position. + + Subsequent calls to GetNextAlignment() or GetNextAlignmentCore() + will only return \c true when alignments can be found that overlap + this \a region. + + A \a region with no right boundary is considered open-ended, meaning + that all alignments that lie downstream of the left boundary are + considered valid, continuing to the end of the BAM file. + + \param region desired region-of-interest to activate + \returns \c true if reader was able to jump successfully to the region's left boundary + \sa HasIndex(), Jump() +*/ +bool BamReader::SetRegion(const BamRegion& region) { + return d->SetRegion(region); +} + +/*! \fn bool BamReader::SetRegion(const int& leftRefID, + const int& leftPosition, + const int& rightRefID, + const int& rightPosition) + \brief Sets a target region of interest. + + This is an overloaded function. + + \param leftRefID referenceID of region's left boundary + \param leftPosition position of region's left boundary + \param rightRefID reference ID of region's right boundary + \param rightPosition position of region's right boundary + + \returns \c true if reader was able to jump successfully to the region's left boundary + \sa HasIndex(), Jump() +*/ +bool BamReader::SetRegion(const int& leftRefID, + const int& leftBound, + const int& rightRefID, + const int& rightBound) +{ + return d->SetRegion( BamRegion(leftRefID, leftBound, rightRefID, rightBound) ); +} diff --git a/src/api/BamReader.h b/src/api/BamReader.h index d68ab6c..85b0c0d 100644 --- a/src/api/BamReader.h +++ b/src/api/BamReader.h @@ -3,9 +3,9 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. 
// --------------------------------------------------------------------------- -// Last modified: 11 January 2011 (DB) +// Last modified: 4 March 2011 (DB) // --------------------------------------------------------------------------- -// Provides the basic functionality for reading BAM files +// Provides read access to BAM files. // *************************************************************************** #ifndef BAMREADER_H @@ -37,90 +37,75 @@ class API_EXPORT BamReader { // BAM file operations // ---------------------- - // close BAM file + // closes the current BAM file void Close(void); - // returns whether reader is open for reading or not + // returns filename of current BAM file + const std::string GetFilename(void) const; + // returns true if a BAM file is open for reading bool IsOpen(void) const; - // performs random-access jump using (reference, position) as a left-bound + // performs random-access jump within BAM file bool Jump(int refID, int position = 0); - // opens BAM file (and optional BAM index file, if provided) - // @lookForIndex - if no indexFilename provided, look in BAM file's directory for an existing index file - // default behavior is to skip index file search if no index filename given - // @preferStandardIndex - if true, give priority in index file searching to standard BAM index (*.bai) - // default behavior is to prefer the BamToolsIndex (*.bti) if both are available - bool Open(const std::string& filename, - const std::string& indexFilename = "", - const bool lookForIndex = false, - const bool preferStandardIndex = false); - // returns file pointer to beginning of alignments + // opens a BAM file + bool Open(const std::string& filename); + // returns internal file pointer to beginning of alignment data bool Rewind(void); - // sets a region of interest (with left & right bound reference/position) - // returns success/failure of seeking to left bound of region + // sets the target region of interest bool SetRegion(const BamRegion& 
region); - bool SetRegion(const int& leftRefID, const int& leftBound, const int& rightRefID, const int& rightBound); + // sets the target region of interest + bool SetRegion(const int& leftRefID, + const int& leftPosition, + const int& rightRefID, + const int& rightPosition); // ---------------------- // access alignment data // ---------------------- - // retrieves next available alignment (returns success/fail) - bool GetNextAlignment(BamAlignment& bAlignment); - // retrieves next available alignment core data (returns success/fail) - // ** DOES NOT parse any character data (read name, bases, qualities, tag data) ** - // useful for operations requiring ONLY aligner-related information - // (refId/position, alignment flags, CIGAR, mapQuality, etc) - bool GetNextAlignmentCore(BamAlignment& bAlignment); + // retrieves next available alignment + bool GetNextAlignment(BamAlignment& alignment); + // retrieves next available alignment (without populating the alignment's string data fields) + bool GetNextAlignmentCore(BamAlignment& alignment); // ---------------------- - // access auxiliary data + // access header data // ---------------------- - // returns SamHeader object - see SamHeader.h for more info + // returns SAM header data SamHeader GetHeader(void) const; - // returns SAM header text - const std::string GetHeaderText(void) const; - // returns number of reference sequences + // returns SAM header data, as SAM-formatted text + std::string GetHeaderText(void) const; + + // ---------------------- + // access reference data + // ---------------------- + + // returns the number of reference sequences int GetReferenceCount(void) const; - // returns vector of reference objects - const BamTools::RefVector& GetReferenceData(void) const; - // returns reference id (used for BamReader::Jump()) for the given reference name + // returns all reference sequence entries + const RefVector& GetReferenceData(void) const; + // returns the ID of the reference with this name int
GetReferenceID(const std::string& refName) const; - // returns the name of the file associated with this BamReader - const std::string GetFilename(void) const; // ---------------------- // BAM index operations // ---------------------- - // creates index for BAM file, saves to file - // default behavior is to create the BAM standard index (".bai") - // set flag to false to create the BamTools-specific index (".bti") - bool CreateIndex(bool useStandardIndex = true); - // returns whether index data is available for reading - // (e.g. if true, BamReader should be able to seek to a region) + // creates an index file for current BAM file, using the requested index type + bool CreateIndex(const BamIndex::IndexType& type = BamIndex::STANDARD); + // returns true if index data is available bool HasIndex(void) const; - // change the index caching behavior - // default BamReader/Index mode is LimitedIndexCaching - // @mode - can be either FullIndexCaching, LimitedIndexCaching, - // or NoIndexCaching. See BamIndex.h for more details - void SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode); - + // looks in BAM file's directory for a matching index file + bool LocateIndex(const BamIndex::IndexType& preferredType = BamIndex::STANDARD); + // opens a BAM index file + bool OpenIndex(const std::string& indexFilename); + // sets a custom BamIndex on this reader + void SetIndex(BamIndex* index); + // changes the caching behavior of the index data + void SetIndexCacheMode(const BamIndex::IndexCacheMode& mode); + // deprecated methods public: - - // deprecated (but still available): prefer HasIndex() instead - // - // Deprecated purely for API semantic clarity - HasIndex() should be clearer - // than IsIndexLoaded() in light of the new caching modes that may clear the - // index data from memory, but leave the index file open for later random access - // seeks. 
- // - // For example, what would (IsIndexLoaded() == true) mean when cacheMode has been - // explicitly set to NoIndexCaching? This is confusing at best, misleading about - // current memory behavior at worst. - // - // returns whether index data is available - // (e.g. if true, BamReader should be able to seek to a region) + // returns true if index data is available bool IsIndexLoaded(void) const; // private implementation diff --git a/src/api/BamWriter.cpp b/src/api/BamWriter.cpp index 386755d..8582f34 100644 --- a/src/api/BamWriter.cpp +++ b/src/api/BamWriter.cpp @@ -3,12 +3,14 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 11 January 2011 (DB) +// Last modified: 4 March 2011 (DB) // --------------------------------------------------------------------------- // Provides the basic functionality for producing BAM files // *************************************************************************** +#include #include +#include #include using namespace BamTools; using namespace BamTools::Internal; @@ -16,41 +18,126 @@ using namespace BamTools::Internal; #include using namespace std; -// constructor -BamWriter::BamWriter(void) { - d = new BamWriterPrivate; -} +/*! \class BamTools::BamWriter + \brief Provides write access for generating BAM files. +*/ +/*! \enum BamTools::BamWriter::CompressionMode + \brief This enum describes the compression behaviors for output BAM files. +*/ +/*! \var BamWriter::CompressionMode BamWriter::Compressed + \brief Use normal BAM compression +*/ +/*! \var BamWriter::CompressionMode BamWriter::Uncompressed + \brief Disable BAM compression + + Useful in situations where the BAM data is streamed (e.g. piping). + It would be wasteful to compress, and then immediately decompress + the data. +*/ + +/*! 
\fn BamWriter::BamWriter(void) + \brief constructor +*/ +BamWriter::BamWriter(void) + : d(new BamWriterPrivate) +{ } -// destructor +/*! \fn BamWriter::~BamWriter(void) + \brief destructor +*/ BamWriter::~BamWriter(void) { delete d; d = 0; } -// closes the alignment archive +/*! \fn void BamWriter::Close(void) + \brief Closes the current BAM file. + \sa Open() +*/ void BamWriter::Close(void) { d->Close(); } -// opens the alignment archive (using std::string SAM header) -bool BamWriter::Open(const string& filename, - const string& samHeader, - const RefVector& referenceSequences, - bool isWriteUncompressed) +/*! \fn bool BamWriter::IsOpen(void) const + \brief Returns \c true if BAM file is open for writing. + \sa Open() +*/ +bool BamWriter::IsOpen(void) const { + return d->IsOpen(); +} + +/*! \fn bool BamWriter::Open(const std::string& filename, + const std::string& samHeaderText, + const RefVector& referenceSequences) + \brief Opens a BAM file for writing. + + Will overwrite the BAM file if it already exists. + + \param filename name of output BAM file + \param samHeaderText header data, as SAM-formatted string + \param referenceSequences list of reference entries + + \return \c true if opened successfully + \sa Close(), IsOpen(), BamReader::GetHeaderText(), BamReader::GetReferenceData() +*/ +bool BamWriter::Open(const std::string& filename, + const std::string& samHeaderText, + const RefVector& referenceSequences) { - return d->Open(filename, samHeader, referenceSequences, isWriteUncompressed); + return d->Open(filename, samHeaderText, referenceSequences); } -// opens the alignment archive (using SamHeader object) -bool BamWriter::Open(const string& filename, +/*! \fn bool BamWriter::Open(const std::string& filename, + const SamHeader& samHeader, + const RefVector& referenceSequences) + \brief Opens a BAM file for writing. + + This is an overloaded function. + + Will overwrite the BAM file if it already exists.
+ + \param filename name of output BAM file + \param samHeader header data, wrapped in SamHeader object + \param referenceSequences list of reference entries + + \return \c true if opened successfully + \sa Close(), IsOpen(), BamReader::GetHeader(), BamReader::GetReferenceData() +*/ +bool BamWriter::Open(const std::string& filename, const SamHeader& samHeader, - const RefVector& referenceSequences, - bool isWriteUncompressed) + const RefVector& referenceSequences) { - return d->Open(filename, samHeader.ToString(), referenceSequences, isWriteUncompressed); + return d->Open(filename, samHeader.ToString(), referenceSequences); +} + +/*! \fn void BamWriter::SaveAlignment(const BamAlignment& alignment) + \brief Saves an alignment to the BAM file. + + \param alignment BamAlignment record to save + \sa BamReader::GetNextAlignment(), BamReader::GetNextAlignmentCore() +*/ +void BamWriter::SaveAlignment(const BamAlignment& alignment) { + d->SaveAlignment(alignment); } -// saves the alignment to the alignment archive -void BamWriter::SaveAlignment(const BamAlignment& al) { - d->SaveAlignment(al); +/*! \fn void BamWriter::SetCompressionMode(const CompressionMode& compressionMode) + \brief Sets the output compression mode. + + Default mode is BamWriter::Compressed. + + N.B. - Changing the compression mode is disabled on open files (i.e. the request will be ignored). + Be sure to call this function before opening the BAM file. + + \code + BamWriter writer; + writer.SetCompressionMode(BamWriter::Uncompressed); + writer.Open( ... ); + // ... 
+ \endcode + + \param compressionMode desired output compression behavior + \sa IsOpen(), Open() +*/ +void BamWriter::SetCompressionMode(const CompressionMode& compressionMode) { + d->SetWriteCompressed( compressionMode == BamWriter::Compressed ); } diff --git a/src/api/BamWriter.h b/src/api/BamWriter.h index 2d8b528..476dbec 100644 --- a/src/api/BamWriter.h +++ b/src/api/BamWriter.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 11 January 2011 (DB) +// Last modified: 4 March 2011 (DB) // --------------------------------------------------------------------------- // Provides the basic functionality for producing BAM files // *************************************************************************** @@ -12,39 +12,47 @@ #define BAMWRITER_H #include -#include -#include +#include #include namespace BamTools { +class BamAlignment; +class SamHeader; + namespace Internal { class BamWriterPrivate; } // namespace Internal class API_EXPORT BamWriter { - // constructor/destructor + public: enum CompressionMode { Compressed = 0 + , Uncompressed + }; + + // ctor & dtor public: BamWriter(void); ~BamWriter(void); // public interface public: - // closes the alignment archive + // closes the current BAM file void Close(void); - // opens the alignment archive (using std::string SAM header) + // returns true if BAM file is open for writing + bool IsOpen(void) const; + // opens a BAM file for writing bool Open(const std::string& filename, - const std::string& samHeader, - const BamTools::RefVector& referenceSequences, - bool writeUncompressed = false); - // opens the alignment archive (using SamHeader object) + const std::string& samHeaderText, + const RefVector& referenceSequences); + // opens a BAM file for writing bool Open(const std::string& filename, const SamHeader& samHeader, - const BamTools::RefVector& referenceSequences, - bool 
writeUncompressed = false); + const RefVector& referenceSequences); // saves the alignment to the alignment archive - void SaveAlignment(const BamTools::BamAlignment& al); + void SaveAlignment(const BamAlignment& alignment); + // sets the output compression mode + void SetCompressionMode(const CompressionMode& compressionMode); // private implementation private: diff --git a/src/api/CMakeLists.txt b/src/api/CMakeLists.txt index 7f9b344..57efba2 100644 --- a/src/api/CMakeLists.txt +++ b/src/api/CMakeLists.txt @@ -18,18 +18,20 @@ set( BamToolsAPISources BamMultiReader.cpp BamReader.cpp BamWriter.cpp - BGZF.cpp SamHeader.cpp SamReadGroup.cpp SamReadGroupDictionary.cpp SamSequence.cpp SamSequenceDictionary.cpp internal/BamHeader_p.cpp + internal/BamIndexFactory_p.cpp internal/BamMultiReader_p.cpp + internal/BamRandomAccessController_p.cpp internal/BamReader_p.cpp internal/BamStandardIndex_p.cpp internal/BamToolsIndex_p.cpp internal/BamWriter_p.cpp + internal/BgzfStream_p.cpp internal/SamFormatParser_p.cpp internal/SamFormatPrinter_p.cpp internal/SamHeaderValidator_p.cpp @@ -37,7 +39,7 @@ set( BamToolsAPISources # create main BamTools API shared library add_library( BamTools SHARED ${BamToolsAPISources} ) -set_target_properties( BamTools PROPERTIES SOVERSION "0.9.3" ) +set_target_properties( BamTools PROPERTIES SOVERSION "1.0.0" ) set_target_properties( BamTools PROPERTIES OUTPUT_NAME "bamtools" ) # create main BamTools API static library @@ -64,7 +66,6 @@ ExportHeader(APIHeaders BamIndex.h ${ApiIncludeDir}) ExportHeader(APIHeaders BamMultiReader.h ${ApiIncludeDir}) ExportHeader(APIHeaders BamReader.h ${ApiIncludeDir}) ExportHeader(APIHeaders BamWriter.h ${ApiIncludeDir}) -ExportHeader(APIHeaders BGZF.h ${ApiIncludeDir}) ExportHeader(APIHeaders SamConstants.h ${ApiIncludeDir}) ExportHeader(APIHeaders SamHeader.h ${ApiIncludeDir}) ExportHeader(APIHeaders SamReadGroup.h ${ApiIncludeDir}) diff --git a/src/api/SamHeader.cpp b/src/api/SamHeader.cpp index 5134630..7a69162 
100644 --- a/src/api/SamHeader.cpp +++ b/src/api/SamHeader.cpp @@ -3,10 +3,10 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 23 December 2010 (DB) +// Last modified: 4 March 2011 (DB) // --------------------------------------------------------------------------- -// Provides functionality for querying/manipulating SAM header data -// ************************************************************************** +// Provides direct read/write access to the SAM header data fields. +// *************************************************************************** #include #include @@ -16,8 +16,47 @@ using namespace BamTools; using namespace BamTools::Internal; using namespace std; -// ctor -SamHeader::SamHeader(const string& headerText) +/*! \struct BamTools::SamHeader + \brief Represents the SAM-formatted text header that is part of the BAM file header. + + Provides direct read/write access to the SAM header data fields. + + \sa http://samtools.sourceforge.net/SAM-1.3.pdf +*/ +/*! \var SamHeader::Version + \brief corresponds to \@HD VN:\ +*/ +/*! \var SamHeader::SortOrder + \brief corresponds to \@HD SO:\ +*/ +/*! \var SamHeader::GroupOrder + \brief corresponds to \@HD GO:\ +*/ +/*! \var SamHeader::Sequences + \brief corresponds to \@SQ entries + \sa SamSequence, SamSequenceDictionary +*/ +/*! \var SamHeader::ReadGroups + \brief corresponds to \@RG entries + \sa SamReadGroup, SamReadGroupDictionary +*/ +/*! \var SamHeader::ProgramName + \brief corresponds to \@PG ID:\ +*/ +/*! \var SamHeader::ProgramVersion + \brief corresponds to \@PG VN:\ +*/ +/*! \var SamHeader::ProgramCommandLine + \brief corresponds to \@PG CL:\ +*/ +/*! \var SamHeader::Comments + \brief corresponds to \@CO entries +*/ + +/*! 
\fn SamHeader::SamHeader(const std::string& headerText = "") + \brief constructor +*/ +SamHeader::SamHeader(const std::string& headerText) : Version("") , SortOrder("") , GroupOrder("") @@ -29,7 +68,9 @@ SamHeader::SamHeader(const string& headerText) parser.Parse(headerText); } -// copy ctor +/*! \fn SamHeader::SamHeader(const SamHeader& other) + \brief copy constructor +*/ SamHeader::SamHeader(const SamHeader& other) : Version(other.Version) , SortOrder(other.SortOrder) @@ -41,11 +82,14 @@ SamHeader::SamHeader(const SamHeader& other) , ProgramCommandLine(other.ProgramCommandLine) { } -// dtor -SamHeader::~SamHeader(void) { - Clear(); -} +/*! \fn SamHeader::~SamHeader(void) + \brief destructor +*/ +SamHeader::~SamHeader(void) { } +/*! \fn void SamHeader::Clear(void) + \brief Clears all header contents. +*/ void SamHeader::Clear(void) { Version.clear(); SortOrder.clear(); @@ -58,69 +102,102 @@ void SamHeader::Clear(void) { Comments.clear(); } -void SamHeader::SetHeaderText(const std::string& headerText) { - - // clear prior data - Clear(); - - // parse header text into data - SamFormatParser parser(*this); - parser.Parse(headerText); -} - -// retrieve the SAM header, with any local modifications -string SamHeader::ToString(void) const { - SamFormatPrinter printer(*this); - return printer.ToString(); -} - -// query if header contains @HD ID: +/*! \fn bool SamHeader::HasVersion(void) const + \brief Returns \c true if header contains \@HD VN:\ +*/ bool SamHeader::HasVersion(void) const { return (!Version.empty()); } -// query if header contains @HD SO: +/*! \fn bool SamHeader::HasSortOrder(void) const + \brief Returns \c true if header contains \@HD SO:\ +*/ bool SamHeader::HasSortOrder(void) const { return (!SortOrder.empty()); } -// query if header contains @HD GO: +/*!
\fn bool SamHeader::HasGroupOrder(void) const + \brief Returns \c true if header contains \@HD GO:\ +*/ bool SamHeader::HasGroupOrder(void) const { return (!GroupOrder.empty()); } -// query if header contains @SQ entries +/*! \fn bool SamHeader::HasSequences(void) const + \brief Returns \c true if header contains any \@SQ entries +*/ bool SamHeader::HasSequences(void) const { return (!Sequences.IsEmpty()); } -// query if header contains @RG entries +/*! \fn bool SamHeader::HasReadGroups(void) const + \brief Returns \c true if header contains any \@RG entries +*/ bool SamHeader::HasReadGroups(void) const { return (!ReadGroups.IsEmpty()); } -// query if header contains @PG ID: +/*! \fn bool SamHeader::HasProgramName(void) const + \brief Returns \c true if header contains \@PG ID:\ +*/ bool SamHeader::HasProgramName(void) const { return (!ProgramName.empty()); } -// query if header contains @HD VN: +/*! \fn bool SamHeader::HasProgramVersion(void) const + \brief Returns \c true if header contains \@PG VN:\ +*/ bool SamHeader::HasProgramVersion(void) const { return (!ProgramVersion.empty()); } -// query if header contains @HD CL: +/*! \fn bool SamHeader::HasProgramCommandLine(void) const + \brief Returns \c true if header contains \@PG CL:\ +*/ bool SamHeader::HasProgramCommandLine(void) const { return (!ProgramCommandLine.empty()); } -// query if header contains @CO entries +/*! \fn bool SamHeader::HasComments(void) const + \brief Returns \c true if header contains any \@CO entries +*/ bool SamHeader::HasComments(void) const { return (!Comments.empty()); } -// validation +/*! \fn bool SamHeader::IsValid(bool verbose = false) const + \brief Checks header contents for required data and proper formatting. + \param verbose If set to true, validation errors & warnings will be printed to stderr. + Otherwise, output is suppressed and only validation check occurs. 
+ \return \c true if SAM header is well-formed +*/ bool SamHeader::IsValid(bool verbose) const { SamHeaderValidator validator(*this); return validator.Validate(verbose); } + +/*! \fn void SamHeader::SetHeaderText(const std::string& headerText) + \brief Replaces header contents with \a headerText. + \param headerText SAM formatted-text that will be parsed into data fields +*/ +void SamHeader::SetHeaderText(const std::string& headerText) { + + // clear prior data + Clear(); + + // parse header text into data + SamFormatParser parser(*this); + parser.Parse(headerText); +} + +/*! \fn std::string SamHeader::ToString(void) const + \brief Converts data fields to SAM-formatted text. + + Applies any local modifications made since creating this object or calling SetHeaderText(). + + \return SAM-formatted header text +*/ +string SamHeader::ToString(void) const { + SamFormatPrinter printer(*this); + return printer.ToString(); +} diff --git a/src/api/SamHeader.h b/src/api/SamHeader.h index 5de1560..3ff4946 100644 --- a/src/api/SamHeader.h +++ b/src/api/SamHeader.h @@ -3,10 +3,10 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 12 January 2011 (DB) +// Last modified: 4 March 2011 (DB) // --------------------------------------------------------------------------- -// Provides functionality for querying/manipulating SAM header data -// ************************************************************************** +// Provides direct read/write access to the SAM header data fields. 
+// *************************************************************************** #ifndef SAM_HEADER_H #define SAM_HEADER_H @@ -22,62 +22,47 @@ namespace BamTools { struct API_EXPORT SamHeader { // ctor & dtor - public: - SamHeader(const std::string& headerText = ""); - SamHeader(const SamHeader& other); - ~SamHeader(void); - - // query/modify entire SamHeader at once - public: - - // clear all header contents - void Clear(void); - - // checks if SAM header is well-formed - // @verbose - if true, validation errors & warnings will be printed to stderr - // otherwise, output is suppressed and only validation check occurs - bool IsValid(bool verbose = false) const; - - // replaces SamHeader contents with headerText - void SetHeaderText(const std::string& headerText); - - // retrieves the printable, SAM-formatted header - // (with any local modifications since construction) - std::string ToString(void) const; - - // query if header contains data elements - public: - bool HasVersion(void) const; - bool HasSortOrder(void) const; - bool HasGroupOrder(void) const; - bool HasSequences(void) const; - bool HasReadGroups(void) const; - bool HasProgramName(void) const; - bool HasProgramVersion(void) const; - bool HasProgramCommandLine(void) const; - bool HasComments(void) const; + SamHeader(const std::string& headerText = ""); + SamHeader(const SamHeader& other); + ~SamHeader(void); + + // query/modify entire SamHeader + void Clear(void); // clears all header contents + bool IsValid(bool verbose = false) const; // returns true if SAM header is well-formed + void SetHeaderText(const std::string& headerText); // replaces data fields with contents of SAM-formatted text + std::string ToString(void) const; // returns the printable, SAM-formatted header text + + // convenience query methods + bool HasVersion(void) const; // returns true if header contains format version entry + bool HasSortOrder(void) const; // returns true if header contains sort order entry + bool HasGroupOrder(void) 
const; // returns true if header contains group order entry + bool HasSequences(void) const; // returns true if header contains any sequence entries + bool HasReadGroups(void) const; // returns true if header contains any read group entries + bool HasProgramName(void) const; // returns true if header contains program name + bool HasProgramVersion(void) const; // returns true if header contains program version + bool HasProgramCommandLine(void) const; // returns true if header contains program command line + bool HasComments(void) const; // returns true if header contains comments // data members - public: - // header metadata (@HD line) - std::string Version; // VN: - std::string SortOrder; // SO: - std::string GroupOrder; // GO: + // header metadata (@HD line) + std::string Version; // VN: + std::string SortOrder; // SO: + std::string GroupOrder; // GO: - // header sequences (@SQ entries) - SamSequenceDictionary Sequences; + // header sequences (@SQ entries) + SamSequenceDictionary Sequences; - // header read groups (@RG entries) - SamReadGroupDictionary ReadGroups; + // header read groups (@RG entries) + SamReadGroupDictionary ReadGroups; - // header program data (@PG entries) - std::string ProgramName; // ID: - std::string ProgramVersion; // VN: - std::string ProgramCommandLine; // CL: + // header program data (@PG entries) + std::string ProgramName; // ID: + std::string ProgramVersion; // VN: + std::string ProgramCommandLine; // CL: - // header comments (@CO entries) - std::vector Comments; + // header comments (@CO entries) + std::vector Comments; }; } // namespace BamTools diff --git a/src/api/SamReadGroup.cpp b/src/api/SamReadGroup.cpp index 8debc58..da50d08 100644 --- a/src/api/SamReadGroup.cpp +++ b/src/api/SamReadGroup.cpp @@ -3,16 +3,53 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. 
// --------------------------------------------------------------------------- -// Last modified: 23 December 2010 (DB) +// Last modified: 4 March 2011 (DB) // --------------------------------------------------------------------------- -// Provides functionality for querying/manipulating read group data -// ************************************************************************** +// Provides direct read/write access to the SAM read group data fields. +// *************************************************************************** #include using namespace BamTools; using namespace std; -// default ctor +/*! \struct BamTools::SamReadGroup + \brief Represents a SAM read group entry. + + Provides direct read/write access to the SAM read group data fields. + + \sa http://samtools.sourceforge.net/SAM-1.3.pdf +*/ +/*! \var SamReadGroup::ID + \brief corresponds to \@RG ID:\ +*/ +/*! \var SamReadGroup::Sample + \brief corresponds to \@RG SM:\ +*/ +/*! \var SamReadGroup::Library + \brief corresponds to \@RG LB:\ +*/ +/*! \var SamReadGroup::Description + \brief corresponds to \@RG DS:\ +*/ +/*! \var SamReadGroup::PlatformUnit + \brief corresponds to \@RG PU:\ +*/ +/*! \var SamReadGroup::PredictedInsertSize + \brief corresponds to \@RG PI:\ +*/ +/*! \var SamReadGroup::SequencingCenter + \brief corresponds to \@RG CN:\ +*/ +/*! \var SamReadGroup::ProductionDate + \brief corresponds to \@RG DT:\ +*/ +/*! \var SamReadGroup::SequencingTechnology + \brief corresponds to \@RG PL:\ +*/ + +/*! \fn SamReadGroup::SamReadGroup(void) + \brief default constructor +*/ SamReadGroup::SamReadGroup(void) : ID("") , Sample("") @@ -25,8 +62,12 @@ SamReadGroup::SamReadGroup(void) , SequencingTechnology("") { } -// ctor with provided ID -SamReadGroup::SamReadGroup(const string& id) +/*! 
\fn SamReadGroup::SamReadGroup(const std::string& id) + \brief constructs read group with \a id + + \param id desired read group ID +*/ +SamReadGroup::SamReadGroup(const std::string& id) : ID(id) , Sample("") , Library("") @@ -38,7 +79,9 @@ SamReadGroup::SamReadGroup(const string& id) , SequencingTechnology("") { } -// copy ctor +/*! \fn SamReadGroup::SamReadGroup(const SamReadGroup& other) + \brief copy constructor +*/ SamReadGroup::SamReadGroup(const SamReadGroup& other) : ID(other.ID) , Sample(other.Sample) @@ -51,12 +94,14 @@ SamReadGroup::SamReadGroup(const SamReadGroup& other) , SequencingTechnology(other.SequencingTechnology) { } -// dtor -SamReadGroup::~SamReadGroup(void) { - Clear(); -} +/*! \fn SamReadGroup::~SamReadGroup(void) + \brief destructor +*/ +SamReadGroup::~SamReadGroup(void) { } -// clear all contents +/*! \fn void SamReadGroup::Clear(void) + \brief Clears all data fields. +*/ void SamReadGroup::Clear(void) { ID.clear(); Sample.clear(); @@ -69,13 +114,65 @@ void SamReadGroup::Clear(void) { SequencingTechnology.clear(); } -// convenience methods to check if SamReadGroup contains these values: -bool SamReadGroup::HasID(void) const { return (!ID.empty()); } -bool SamReadGroup::HasSample(void) const { return (!Sample.empty()); } -bool SamReadGroup::HasLibrary(void) const { return (!Library.empty()); } -bool SamReadGroup::HasDescription(void) const { return (!Description.empty()); } -bool SamReadGroup::HasPlatformUnit(void) const { return (!PlatformUnit.empty()); } -bool SamReadGroup::HasPredictedInsertSize(void) const { return (!PredictedInsertSize.empty()); } -bool SamReadGroup::HasSequencingCenter(void) const { return (!SequencingCenter.empty()); } -bool SamReadGroup::HasProductionDate(void) const { return (!ProductionDate.empty()); } -bool SamReadGroup::HasSequencingTechnology(void) const { return (!SequencingTechnology.empty()); } +/*! 
\fn bool SamReadGroup::HasID(void) const + \brief Returns \c true if read group contains \@RG ID:\ +*/ +bool SamReadGroup::HasID(void) const { + return (!ID.empty()); +} + +/*! \fn bool SamReadGroup::HasSample(void) const + \brief Returns \c true if read group contains \@RG SM:\ +*/ +bool SamReadGroup::HasSample(void) const { + return (!Sample.empty()); +} + +/*! \fn bool SamReadGroup::HasLibrary(void) const + \brief Returns \c true if read group contains \@RG LB:\ +*/ +bool SamReadGroup::HasLibrary(void) const { + return (!Library.empty()); +} + +/*! \fn bool SamReadGroup::HasDescription(void) const + \brief Returns \c true if read group contains \@RG DS:\ +*/ +bool SamReadGroup::HasDescription(void) const { + return (!Description.empty()); +} + +/*! \fn bool SamReadGroup::HasPlatformUnit(void) const + \brief Returns \c true if read group contains \@RG PU:\ +*/ +bool SamReadGroup::HasPlatformUnit(void) const { + return (!PlatformUnit.empty()); +} + +/*! \fn bool SamReadGroup::HasPredictedInsertSize(void) const + \brief Returns \c true if read group contains \@RG PI:\ +*/ +bool SamReadGroup::HasPredictedInsertSize(void) const { + return (!PredictedInsertSize.empty()); +} + +/*! \fn bool SamReadGroup::HasSequencingCenter(void) const + \brief Returns \c true if read group contains \@RG CN:\ +*/ +bool SamReadGroup::HasSequencingCenter(void) const { + return (!SequencingCenter.empty()); +} + +/*! \fn bool SamReadGroup::HasProductionDate(void) const + \brief Returns \c true if read group contains \@RG DT:\ +*/ +bool SamReadGroup::HasProductionDate(void) const { + return (!ProductionDate.empty()); +} + +/*!
\fn bool SamReadGroup::HasSequencingTechnology(void) const + \brief Returns \c true if read group contains \@RG PL:\ +*/ +bool SamReadGroup::HasSequencingTechnology(void) const { + return (!SequencingTechnology.empty()); +} diff --git a/src/api/SamReadGroup.h b/src/api/SamReadGroup.h index c53274a..538617d 100644 --- a/src/api/SamReadGroup.h +++ b/src/api/SamReadGroup.h @@ -3,10 +3,10 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 23 December 2010 (DB) +// Last modified: 4 March 2011 (DB) // --------------------------------------------------------------------------- -// Provides functionality for querying/manipulating read group data -// ************************************************************************** +// Provides direct read/write access to the SAM read group data fields. +// *************************************************************************** #ifndef SAM_READGROUP_H #define SAM_READGROUP_H @@ -16,50 +16,44 @@ namespace BamTools { -class API_EXPORT SamReadGroup { +struct API_EXPORT SamReadGroup { // ctor & dtor - public: - SamReadGroup(void); - SamReadGroup(const std::string& id); - SamReadGroup(const SamReadGroup& other); - ~SamReadGroup(void); - - // public methods - public: - - // clear all contents - void Clear(void); - - // convenience methods to check if SamReadGroup contains these values: - bool HasID(void) const; - bool HasSample(void) const; - bool HasLibrary(void) const; - bool HasDescription(void) const; - bool HasPlatformUnit(void) const; - bool HasPredictedInsertSize(void) const; - bool HasSequencingCenter(void) const; - bool HasProductionDate(void) const; - bool HasSequencingTechnology(void) const; + SamReadGroup(void); + SamReadGroup(const std::string& id); + SamReadGroup(const SamReadGroup& other); + ~SamReadGroup(void); + + // query/modify entire read group + void Clear(void); // clears all data fields 
+ + // convenience query methods + bool HasID(void) const; // returns true if read group has a group ID + bool HasSample(void) const; // returns true if read group has a sample name + bool HasLibrary(void) const; // returns true if read group has a library name + bool HasDescription(void) const; // returns true if read group has a description + bool HasPlatformUnit(void) const; // returns true if read group has a platform unit ID + bool HasPredictedInsertSize(void) const; // returns true if read group has a predicted insert size + bool HasSequencingCenter(void) const; // returns true if read group has a sequencing center ID + bool HasProductionDate(void) const; // returns true if read group has a production date + bool HasSequencingTechnology(void) const; // returns true if read group has a sequencing technology ID // data members - public: - std::string ID; // ID: - std::string Sample; // SM: - std::string Library; // LB: - std::string Description; // DS: - std::string PlatformUnit; // PU: - std::string PredictedInsertSize; // PI: - std::string SequencingCenter; // CN: - std::string ProductionDate; // DT: - std::string SequencingTechnology; // PL: + std::string ID; // ID: + std::string Sample; // SM: + std::string Library; // LB: + std::string Description; // DS: + std::string PlatformUnit; // PU: + std::string PredictedInsertSize; // PI: + std::string SequencingCenter; // CN: + std::string ProductionDate; // DT: + std::string SequencingTechnology; // PL: }; -// --------------------------------------------------- -// comparison operators - -// for equality: compare IDs -inline bool operator==(const SamReadGroup& lhs, const SamReadGroup& rhs) { +/*! 
\fn bool operator==(const SamReadGroup& lhs, const SamReadGroup& rhs) + \brief tests equality by comparing read group IDs +*/ +API_EXPORT inline bool operator==(const SamReadGroup& lhs, const SamReadGroup& rhs) { return lhs.ID == rhs.ID; } diff --git a/src/api/SamReadGroupDictionary.cpp b/src/api/SamReadGroupDictionary.cpp index 2f0534e..e6f8a05 100644 --- a/src/api/SamReadGroupDictionary.cpp +++ b/src/api/SamReadGroupDictionary.cpp @@ -3,10 +3,10 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 23 December 2010 (DB) +// Last modified: 4 March 2011 (DB) // --------------------------------------------------------------------------- -// Provides container operations for collection of read group entries -// ************************************************************************* +// Provides methods for operating on a collection of SamReadGroup entries. +// *************************************************************************** #include using namespace BamTools; @@ -15,150 +15,263 @@ using namespace BamTools; #include using namespace std; -// ctor +/*! \class BamTools::SamReadGroupDictionary + \brief Container of SamReadGroup entries. + + Provides methods for operating on a collection of SamReadGroup entries. +*/ + +/*! \fn SamReadGroupDictionary::SamReadGroupDictionary(void) + \brief constructor +*/ SamReadGroupDictionary::SamReadGroupDictionary(void) { } -// copy ctor +/*! \fn SamReadGroupDictionary::SamReadGroupDictionary(const SamReadGroupDictionary& other) + \brief copy constructor +*/ SamReadGroupDictionary::SamReadGroupDictionary(const SamReadGroupDictionary& other) : m_data(other.m_data) { } -// dtor -SamReadGroupDictionary::~SamReadGroupDictionary(void) { - m_data.clear(); -} +/*! \fn SamReadGroupDictionary::~SamReadGroupDictionary(void) + \brief destructor +*/ +SamReadGroupDictionary::~SamReadGroupDictionary(void) { } + +/*! 
\fn void SamReadGroupDictionary::Add(const SamReadGroup& readGroup) + \brief Adds a read group to the dictionary. -// adds read group if not already in container + Duplicate entries are discarded. + + \param readGroup entry to be added +*/ void SamReadGroupDictionary::Add(const SamReadGroup& readGroup) { if ( IsEmpty() || !Contains(readGroup) ) m_data.push_back(readGroup); } -// overload to support std::string -void SamReadGroupDictionary::Add(const string& readGroupId) { +/*! \fn void SamReadGroupDictionary::Add(const std::string& readGroupId) + \brief Adds a read group to the dictionary. + + This is an overloaded function. + + \param readGroupId ID of read group to be added + \sa Add() +*/ +void SamReadGroupDictionary::Add(const std::string& readGroupId) { Add( SamReadGroup(readGroupId) ); } -// add multiple read groups -void SamReadGroupDictionary::Add(const vector& readGroups) { +/*! \fn void SamReadGroupDictionary::Add(const std::vector& readGroups) + \brief Adds multiple read groups to the dictionary. + + This is an overloaded function. + + \param readGroups entries to be added + \sa Add() +*/ +void SamReadGroupDictionary::Add(const std::vector& readGroups) { vector::const_iterator rgIter = readGroups.begin(); vector::const_iterator rgEnd = readGroups.end(); for ( ; rgIter!= rgEnd; ++rgIter ) Add(*rgIter); } -// overload to support std::string -void SamReadGroupDictionary::Add(const vector& readGroupIds) { +/*! \fn void SamReadGroupDictionary::Add(const std::vector& readGroupIds) + \brief Adds multiple read groups to the dictionary. + + This is an overloaded function. + + \param readGroupIds IDs of read groups to be added + \sa Add() +*/ +void SamReadGroupDictionary::Add(const std::vector& readGroupIds) { vector::const_iterator rgIter = readGroupIds.begin(); vector::const_iterator rgEnd = readGroupIds.end(); for ( ; rgIter!= rgEnd; ++rgIter ) Add(*rgIter); } -// returns iterator to container begin +/*! 
\fn SamReadGroupIterator SamReadGroupDictionary::Begin(void) + \return an STL iterator pointing to the first read group + \sa ConstBegin(), End() +*/ SamReadGroupIterator SamReadGroupDictionary::Begin(void) { return m_data.begin(); } -// returns const_iterator to container begin +/*! \fn SamReadGroupConstIterator SamReadGroupDictionary::Begin(void) const + + This is an overloaded function. + + \return a const STL iterator pointing to the first read group + \sa ConstBegin(), End() +*/ SamReadGroupConstIterator SamReadGroupDictionary::Begin(void) const { return m_data.begin(); } -// clear read group container +/*! \fn void SamReadGroupDictionary::Clear(void) + \brief Clears all read group entries. +*/ void SamReadGroupDictionary::Clear(void) { m_data.clear(); } -// explicit request for const_iterator to container begin +/*! \fn SamReadGroupConstIterator SamReadGroupDictionary::ConstBegin(void) const + \return a const STL iterator pointing to the first read group + \sa Begin(), ConstEnd() +*/ SamReadGroupConstIterator SamReadGroupDictionary::ConstBegin(void) const { return m_data.begin(); } -// explicit request for const_iterator to container end +/*! \fn SamReadGroupConstIterator SamReadGroupDictionary::ConstEnd(void) const + \return a const STL iterator pointing to the imaginary entry after the last read group + \sa ConstBegin(), End() +*/ SamReadGroupConstIterator SamReadGroupDictionary::ConstEnd(void) const { return m_data.end(); } -// returns true if container contains a read group with this ID tag -bool SamReadGroupDictionary::Contains(const string& readGroupId) const { - return ( IndexOf(readGroupId) != (int)m_data.size() ); +/*! \fn bool SamReadGroupDictionary::Contains(const std::string& readGroupId) const + \brief Returns true if dictionary contains read group. + + This is an overloaded function. 
+ + \param readGroupId search for read group matching this ID + \return \c true if dictionary contains a read group with this ID +*/ +bool SamReadGroupDictionary::Contains(const std::string& readGroupId) const { + return Contains( SamReadGroup(readGroupId) ); } +/*! \fn bool SamReadGroupDictionary::Contains(const SamReadGroup& readGroup) const + \brief Returns true if dictionary contains read group. + \param readGroup search for this read group + \return \c true if dictionary contains read group +*/ bool SamReadGroupDictionary::Contains(const SamReadGroup& readGroup) const { return ( IndexOf(readGroup) != (int)m_data.size() ); } -// returns iterator to container end +/*! \fn SamReadGroupIterator SamReadGroupDictionary::End(void) + \return an STL iterator pointing to the imaginary entry after the last read group + \sa Begin(), ConstEnd() +*/ SamReadGroupIterator SamReadGroupDictionary::End(void) { return m_data.end(); } -// returns const_iterator to container begin +/*! \fn SamReadGroupConstIterator SamReadGroupDictionary::End(void) const + + This is an overloaded function. + + \return a const STL iterator pointing to the imaginary entry after the last read group + \sa Begin(), ConstEnd() +*/ SamReadGroupConstIterator SamReadGroupDictionary::End(void) const { return m_data.end(); } -// returns vector index of read group if found -// returns vector::size() (invalid index) if not found +/*! \fn int SamReadGroupDictionary::IndexOf(const SamReadGroup& readGroup) const + \internal + \return index of read group if found. Otherwise, returns vector::size() (invalid index). 
+*/ int SamReadGroupDictionary::IndexOf(const SamReadGroup& readGroup) const { SamReadGroupConstIterator begin = ConstBegin(); SamReadGroupConstIterator iter = begin; SamReadGroupConstIterator end = ConstEnd(); - for ( ; iter != end; ++iter ) - if ( *iter == readGroup ) break; + for ( ; iter != end; ++iter ) { + if ( *iter == readGroup ) + break; + } return distance( begin, iter ); } -// overload to support std::string -int SamReadGroupDictionary::IndexOf(const string& readGroupId) const { - return IndexOf( SamReadGroup(readGroupId) ); -} - -// returns true if container is empty +/*! \fn bool SamReadGroupDictionary::IsEmpty(void) const + \brief Returns \c true if dictionary contains no read groups + \sa Size() +*/ bool SamReadGroupDictionary::IsEmpty(void) const { return m_data.empty(); } -// removes read group (if it exists) +/*! \fn void SamReadGroupDictionary::Remove(const SamReadGroup& readGroup) + \brief Removes read group from dictionary, if found. + \param readGroup read group to remove +*/ void SamReadGroupDictionary::Remove(const SamReadGroup& readGroup) { if ( Contains(readGroup) ) m_data.erase( m_data.begin() + IndexOf(readGroup) ); } -// overlaod to support std::string -void SamReadGroupDictionary::Remove(const string& readGroupId) { +/*! \fn void SamReadGroupDictionary::Remove(const std::string& readGroupId) + \brief Removes read group from dictionary, if found. + + This is an overloaded function. + + \param readGroupId ID of read group to remove + \sa Remove() +*/ +void SamReadGroupDictionary::Remove(const std::string& readGroupId) { Remove( SamReadGroup(readGroupId) ); } -// remove multiple read groups -void SamReadGroupDictionary::Remove(const vector& readGroups) { +/*! \fn void SamReadGroupDictionary::Remove(const std::vector& readGroups) + \brief Removes multiple read groups from dictionary. + + This is an overloaded function. 
+ + \param readGroups read groups to remove + \sa Remove() +*/ +void SamReadGroupDictionary::Remove(const std::vector& readGroups) { vector::const_iterator rgIter = readGroups.begin(); vector::const_iterator rgEnd = readGroups.end(); for ( ; rgIter!= rgEnd; ++rgIter ) Remove(*rgIter); } -// overload to support std::string -void SamReadGroupDictionary::Remove(const vector& readGroupIds) { +/*! \fn void SamReadGroupDictionary::Remove(const std::vector& readGroupIds) + \brief Removes multiple read groups from dictionary. + + This is an overloaded function. + + \param readGroupIds IDs of the read groups to remove + \sa Remove() +*/ +void SamReadGroupDictionary::Remove(const std::vector& readGroupIds) { vector::const_iterator rgIter = readGroupIds.begin(); vector::const_iterator rgEnd = readGroupIds.end(); for ( ; rgIter!= rgEnd; ++rgIter ) Remove(*rgIter); } -// returns size of container (number of current read groups) +/*! \fn int SamReadGroupDictionary::Size(void) const + \brief Returns number of read groups in dictionary. + \sa IsEmpty() +*/ int SamReadGroupDictionary::Size(void) const { return m_data.size(); } -// retrieves the SamReadGroup object associated with this ID -// if readGroupId is unknown, a new SamReadGroup is created with this ID -// and a reference to this new read group entry is returned (like std::map) +/*! \fn SamReadGroup& SamReadGroupDictionary::operator[](const std::string& readGroupId) + \brief Retrieves the modifiable SamReadGroup that matches \a readGroupId. + + NOTE - If the dictionary contains no read group matching this ID, this function inserts + a new one with this ID, and returns a reference to it. + + If you want to avoid this insertion behavior, check the result of Contains() before + using this operator. 
+ + \param readGroupId ID of read group to retrieve + \return a modifiable reference to the SamReadGroup associated with the ID +*/ SamReadGroup& SamReadGroupDictionary::operator[](const std::string& readGroupId) { // look up read group ID - int index = IndexOf(readGroupId); + int index = IndexOf( SamReadGroup(readGroupId) ); // if found, return read group at index if ( index != (int)m_data.size() ) diff --git a/src/api/SamReadGroupDictionary.h b/src/api/SamReadGroupDictionary.h index 8a2bb5b..75df199 100644 --- a/src/api/SamReadGroupDictionary.h +++ b/src/api/SamReadGroupDictionary.h @@ -3,10 +3,10 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 23 December 2010 (DB) +// Last modified: 4 March 2011 (DB) // --------------------------------------------------------------------------- -// Provides container operations for collection of read group entries -// ************************************************************************* +// Provides methods for operating on a collection of SamReadGroup entries. 
+// *************************************************************************** #ifndef SAM_READGROUP_DICTIONARY_H #define SAM_READGROUP_DICTIONARY_H @@ -22,8 +22,6 @@ typedef std::vector SamReadGroupContainer; typedef SamReadGroupContainer::iterator SamReadGroupIterator; typedef SamReadGroupContainer::const_iterator SamReadGroupConstIterator; -// stores read groups -// can access read groups using SamReadGroup object or (std::string) read group ID tag class API_EXPORT SamReadGroupDictionary { // ctor & dtor @@ -34,15 +32,15 @@ class API_EXPORT SamReadGroupDictionary { // query/modify read group data public: - // add a read group + // adds a read group void Add(const SamReadGroup& readGroup); - void Add(const std::string& readGroupIds); + void Add(const std::string& readGroupId); - // add multiple read groups + // adds multiple read groups void Add(const std::vector& readGroups); void Add(const std::vector& readGroupIds); - // clear all read groups records + // clears all read group entries void Clear(void); // returns true if dictionary contains this read group @@ -52,39 +50,32 @@ class API_EXPORT SamReadGroupDictionary { // returns true if dictionary is empty bool IsEmpty(void) const; - // remove a single read group (does nothing if read group not found) + // removes read group, if found void Remove(const SamReadGroup& readGroup); void Remove(const std::string& readGroupId); - // remove multiple read groups + // removes multiple read groups void Remove(const std::vector& readGroups); void Remove(const std::vector& readGroupIds); - // returns size of dictionary (number of current elements) + // returns number of read groups in dictionary int Size(void) const; - // retrieves the SamReadGroup object associated with this ID - // if readGroupId is unknown, a new SamReadGroup is created with this ID (and no other data) - // and a reference to this new read group entry is returned (like std::map) - // - // * To avoid these partial entries being created, it is 
recommended to check - // for existence first using Contains() + // retrieves a modifiable reference to the SamReadGroup object associated with this ID SamReadGroup& operator[](const std::string& readGroupId); - // retrieve read group iterators - // these are typedefs for STL iterators and thus are compatible with STL containers/algorithms + // retrieve STL-compatible iterators public: - SamReadGroupIterator Begin(void); - SamReadGroupConstIterator Begin(void) const; - SamReadGroupConstIterator ConstBegin(void) const; - SamReadGroupIterator End(void); - SamReadGroupConstIterator End(void) const; - SamReadGroupConstIterator ConstEnd(void) const; + SamReadGroupIterator Begin(void); // returns iterator to begin() + SamReadGroupConstIterator Begin(void) const; // returns const_iterator to begin() + SamReadGroupConstIterator ConstBegin(void) const; // returns const_iterator to begin() + SamReadGroupIterator End(void); // returns iterator to end() + SamReadGroupConstIterator End(void) const; // returns const_iterator to end() + SamReadGroupConstIterator ConstEnd(void) const; // returns const_iterator to end() // internal methods private: int IndexOf(const SamReadGroup& readGroup) const; - int IndexOf(const std::string& readGroupId) const; // data members private: diff --git a/src/api/SamSequence.cpp b/src/api/SamSequence.cpp index e323f7a..869c24d 100644 --- a/src/api/SamSequence.cpp +++ b/src/api/SamSequence.cpp @@ -3,18 +3,47 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 23 December 2010 (DB) +// Last modified: 20 March 2011 (DB) // --------------------------------------------------------------------------- -// Provides functionality for querying/manipulating sequence data -// ************************************************************************* +// Provides direct read/write access to the SAM sequence data fields. 
+// *************************************************************************** #include +#include using namespace BamTools; using namespace std; -// ctor -SamSequence::SamSequence(const string& name) - : Name(name) +/*! \struct BamTools::SamSequence + \brief Represents a SAM sequence entry. + + Provides direct read/write access to the SAM sequence data fields. + + \sa http://samtools.sourceforge.net/SAM-1.3.pdf +*/ +/*! \var SamSequence::Name + \brief corresponds to \@SQ SN:\<Name\> +*/ +/*! \var SamSequence::Length + \brief corresponds to \@SQ LN:\<Length\> +*/ +/*! \var SamSequence::AssemblyID + \brief corresponds to \@SQ AS:\<AssemblyID\> +*/ +/*! \var SamSequence::Checksum + \brief corresponds to \@SQ M5:\<Checksum\> +*/ +/*! \var SamSequence::URI + \brief corresponds to \@SQ UR:\<URI\> +*/ +/*! \var SamSequence::Species + \brief corresponds to \@SQ SP:\<Species\> +*/ + +/*! \fn SamSequence::SamSequence(void) + \brief default constructor +*/ +SamSequence::SamSequence(void) + : Name("") , Length("") , AssemblyID("") , Checksum("") @@ -22,7 +51,27 @@ SamSequence::SamSequence(const string& name) , Species("") { } -// copy ctor +/*! \fn SamSequence::SamSequence(const std::string& name, const int& length) + \brief constructs sequence with \a name and \a length + + \param name desired sequence name + \param length desired sequence length (numeric value) +*/ +SamSequence::SamSequence(const std::string& name, const int& length) + : Name(name) + , AssemblyID("") + , Checksum("") + , URI("") + , Species("") +{ + stringstream s(""); + s << length; + Length = s.str(); +} + +/*! \fn SamSequence::SamSequence(const SamSequence& other) + \brief copy constructor +*/ SamSequence::SamSequence(const SamSequence& other) : Name(other.Name) , Length(other.Length) @@ -32,12 +81,14 @@ SamSequence::SamSequence(const SamSequence& other) , Species(other.Species) { } -// dtor -SamSequence::~SamSequence(void) { - Clear(); -} +/*! 
\fn SamSequence::~SamSequence(void) + \brief destructor +*/ +SamSequence::~SamSequence(void) { } -// clear all contents +/*! \fn void SamSequence::Clear(void) + \brief Clears all data fields. +*/ void SamSequence::Clear(void) { Name.clear(); Length.clear(); @@ -47,10 +98,44 @@ void SamSequence::Clear(void) { Species.clear(); } -// convenience methods to check if SamSequence contains these values: -bool SamSequence::HasName(void) const { return (!Name.empty()); } -bool SamSequence::HasLength(void) const { return (!Length.empty()); } -bool SamSequence::HasAssemblyID(void) const { return (!AssemblyID.empty()); } -bool SamSequence::HasChecksum(void) const { return (!Checksum.empty()); } -bool SamSequence::HasURI(void) const { return (!URI.empty()); } -bool SamSequence::HasSpecies(void) const { return (!Species.empty()); } +/*! \fn bool SamSequence::HasName(void) const + \brief Returns \c true if sequence contains \@SQ SN:\<Name\> +*/ +bool SamSequence::HasName(void) const { + return (!Name.empty()); +} + +/*! \fn bool SamSequence::HasLength(void) const + \brief Returns \c true if sequence contains \@SQ LN:\<Length\> +*/ +bool SamSequence::HasLength(void) const { + return (!Length.empty()); +} + +/*! \fn bool SamSequence::HasAssemblyID(void) const + \brief Returns \c true if sequence contains \@SQ AS:\<AssemblyID\> +*/ +bool SamSequence::HasAssemblyID(void) const { + return (!AssemblyID.empty()); +} + +/*! \fn bool SamSequence::HasChecksum(void) const + \brief Returns \c true if sequence contains \@SQ M5:\<Checksum\> +*/ +bool SamSequence::HasChecksum(void) const { + return (!Checksum.empty()); +} + +/*! \fn bool SamSequence::HasURI(void) const + \brief Returns \c true if sequence contains \@SQ UR:\<URI\> +*/ +bool SamSequence::HasURI(void) const { + return (!URI.empty()); +} + +/*! 
\fn bool SamSequence::HasSpecies(void) const + \brief Returns \c true if sequence contains \@SQ SP:\<Species\> +*/ +bool SamSequence::HasSpecies(void) const { + return (!Species.empty()); +} diff --git a/src/api/SamSequence.h b/src/api/SamSequence.h index 4dedcaa..fea09d3 100644 --- a/src/api/SamSequence.h +++ b/src/api/SamSequence.h @@ -3,10 +3,10 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 23 December 2010 (DB) +// Last modified: 4 March 2011 (DB) // --------------------------------------------------------------------------- -// Provides functionality for querying/manipulating sequence data -// ************************************************************************** +// Provides direct read/write access to the SAM sequence data fields. +// *************************************************************************** #ifndef SAM_SEQUENCE_H #define SAM_SEQUENCE_H @@ -16,43 +16,38 @@ namespace BamTools { -class API_EXPORT SamSequence { +struct API_EXPORT SamSequence { // ctor & dtor - public: - SamSequence(const std::string& name = ""); - SamSequence(const SamSequence& other); - ~SamSequence(void); - - // public methods - public: - - // clear all contents - void Clear(void); - - // convenience methods to check if SamSequence contains these values: - bool HasName(void) const; - bool HasLength(void) const; - bool HasAssemblyID(void) const; - bool HasChecksum(void) const; - bool HasURI(void) const; - bool HasSpecies(void) const; + SamSequence(void); + SamSequence(const std::string& name, const int& length); + SamSequence(const SamSequence& other); + ~SamSequence(void); + + // query/modify entire sequence + void Clear(void); // clears all contents + + // convenience query methods + bool HasName(void) const; // returns true if sequence has a name + bool HasLength(void) const; // returns true if sequence has a length + bool HasAssemblyID(void) const; 
// returns true if sequence has an assembly ID + bool HasChecksum(void) const; // returns true if sequence has an MD5 checksum + bool HasURI(void) const; // returns true if sequence has a URI + bool HasSpecies(void) const; // returns true if sequence has a species ID // data members - public: - std::string Name; // SN: - std::string Length; // LN: - std::string AssemblyID; // AS: - std::string Checksum; // M5: - std::string URI; // UR: - std::string Species; // SP: + std::string Name; // SN: + std::string Length; // LN: + std::string AssemblyID; // AS: + std::string Checksum; // M5: + std::string URI; // UR: + std::string Species; // SP: }; -// --------------------------------------------------- -// comparison operators - -// for equality: compare Name, Length, & Checksum (if it exists for both) -inline bool operator==(const SamSequence& lhs, const SamSequence& rhs) { +/*! \fn bool operator==(const SamSequence& lhs, const SamSequence& rhs) + \brief tests equality by comparing sequence names, lengths, & checksums (if available) +*/ +API_EXPORT inline bool operator==(const SamSequence& lhs, const SamSequence& rhs) { if ( lhs.Name != rhs.Name ) return false; if ( lhs.Length != rhs.Length ) return false; if ( lhs.HasChecksum() && rhs.HasChecksum() ) diff --git a/src/api/SamSequenceDictionary.cpp b/src/api/SamSequenceDictionary.cpp index 2f9bf28..3249bd4 100644 --- a/src/api/SamSequenceDictionary.cpp +++ b/src/api/SamSequenceDictionary.cpp @@ -3,9 +3,9 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 23 December 2010 (DB) +// Last modified: 20 March 2011 (DB) // --------------------------------------------------------------------------- -// Provides container operations for collection of sequence entries +// Provides methods for operating on a collection of SamSequence entries. 
// ************************************************************************* #include @@ -14,146 +14,282 @@ using namespace BamTools; #include using namespace std; -// ctor +/*! \class BamTools::SamSequenceDictionary + \brief Container of SamSequence entries. + + Provides methods for operating on a collection of SamSequence entries. +*/ + +/*! \fn SamSequenceDictionary::SamSequenceDictionary(void) + \brief constructor +*/ SamSequenceDictionary::SamSequenceDictionary(void) { } -// copy ctor +/*! \fn SamSequenceDictionary::SamSequenceDictionary(const SamSequenceDictionary& other) + \brief copy constructor +*/ SamSequenceDictionary::SamSequenceDictionary(const SamSequenceDictionary& other) : m_data(other.m_data) { } -// dtor -SamSequenceDictionary::~SamSequenceDictionary(void) { - m_data.clear(); -} +/*! \fn SamSequenceDictionary::~SamSequenceDictionary(void) + \brief destructor +*/ +SamSequenceDictionary::~SamSequenceDictionary(void) { } + +/*! \fn void SamSequenceDictionary::Add(const SamSequence& sequence) + \brief Adds a sequence to the dictionary. + + Duplicate entries are discarded. -// adds sequence if not already in container + \param sequence entry to be added +*/ void SamSequenceDictionary::Add(const SamSequence& sequence) { if ( IsEmpty() || !Contains(sequence) ) m_data.push_back(sequence); } -// overload to support std::string -void SamSequenceDictionary::Add(const string& sequenceName) { - Add( SamSequence(sequenceName) ); +/*! \fn void SamSequenceDictionary::Add(const std::string& name, const int& length) + \brief Adds a sequence to the dictionary. + + This is an overloaded function. 
+ + \param name name of sequence entry to be added + \param length length of sequence entry to be added + \sa Add() +*/ +void SamSequenceDictionary::Add(const std::string& name, const int& length) { + Add( SamSequence(name, length) ); } -// add multiple sequences -void SamSequenceDictionary::Add(const vector& sequences) { - vector::const_iterator rgIter = sequences.begin(); - vector::const_iterator rgEnd = sequences.end(); - for ( ; rgIter!= rgEnd; ++rgIter ) - Add(*rgIter); +/*! \fn void SamSequenceDictionary::Add(const std::vector& sequences) + \brief Adds multiple sequences to the dictionary. + + This is an overloaded function. + + \param sequences entries to be added + \sa Add() +*/ +void SamSequenceDictionary::Add(const std::vector& sequences) { + vector::const_iterator seqIter = sequences.begin(); + vector::const_iterator seqEnd = sequences.end(); + for ( ; seqIter!= seqEnd; ++seqIter ) + Add(*seqIter); } -// overload to support std::string -void SamSequenceDictionary::Add(const vector& sequenceNames) { - vector::const_iterator rgIter = sequenceNames.begin(); - vector::const_iterator rgEnd = sequenceNames.end(); - for ( ; rgIter!= rgEnd; ++rgIter ) - Add(*rgIter); +/*! \fn void SamSequenceDictionary::Add(const std::map& sequenceMap) + \brief Adds multiple sequences to the dictionary. + + This is an overloaded function. + + \param sequenceMap map of sequence entries (name => length) to be added + \sa Add() +*/ +void SamSequenceDictionary::Add(const std::map& sequenceMap) { + map::const_iterator seqIter = sequenceMap.begin(); + map::const_iterator seqEnd = sequenceMap.end(); + for ( ; seqIter != seqEnd; ++seqIter ) { + const string& name = (*seqIter).first; + const int& length = (*seqIter).second; + Add( SamSequence(name, length) ); + } } -// returns iterator to container begin +/*! 
\fn SamSequenceIterator SamSequenceDictionary::Begin(void) + \return an STL iterator pointing to the first sequence + \sa ConstBegin(), End() +*/ SamSequenceIterator SamSequenceDictionary::Begin(void) { return m_data.begin(); } -// returns const_iterator to container begin +/*! \fn SamSequenceConstIterator SamSequenceDictionary::Begin(void) const + + This is an overloaded function. + + \return a const STL iterator pointing to the first sequence + \sa ConstBegin(), End() +*/ SamSequenceConstIterator SamSequenceDictionary::Begin(void) const { return m_data.begin(); } -// clear sequence container +/*! \fn void SamSequenceDictionary::Clear(void) + \brief Clears all sequence entries. +*/ void SamSequenceDictionary::Clear(void) { m_data.clear(); } -// explicit request for const_iterator to container begin +/*! \fn SamSequenceConstIterator SamSequenceDictionary::ConstBegin(void) const + \return a const STL iterator pointing to the first sequence + \sa Begin(), ConstEnd() +*/ SamSequenceConstIterator SamSequenceDictionary::ConstBegin(void) const { return m_data.begin(); } -// explicit request for const_iterator to container end +/*! \fn SamSequenceConstIterator SamSequenceDictionary::ConstEnd(void) const + \return a const STL iterator pointing to the imaginary entry after the last sequence + \sa End(), ConstBegin() +*/ SamSequenceConstIterator SamSequenceDictionary::ConstEnd(void) const { return m_data.end(); } -// returns true if container contains a sequence with this ID tag -bool SamSequenceDictionary::Contains(const string& sequenceName) const { +/*! \fn bool SamSequenceDictionary::Contains(const std::string& sequenceName) const + \brief Returns true if dictionary contains sequence. 
+ \param sequenceName search for sequence matching this name + \return \c true if dictionary contains a sequence with this name +*/ +bool SamSequenceDictionary::Contains(const std::string& sequenceName) const { return ( IndexOf(sequenceName) != (int)m_data.size() ); } -bool SamSequenceDictionary::Contains(const SamSequence& seq) const { - return ( IndexOf(seq) != (int)m_data.size() ); +/*! \fn bool SamSequenceDictionary::Contains(const SamSequence& sequence) const + \brief Returns true if dictionary contains sequence. + \param sequence search for this sequence + \return \c true if dictionary contains sequence +*/ +bool SamSequenceDictionary::Contains(const SamSequence& sequence) const { + return ( IndexOf(sequence) != (int)m_data.size() ); } -// returns iterator to container end +/*! \fn SamSequenceIterator SamSequenceDictionary::End(void) + \return an STL iterator pointing to the imaginary entry after the last sequence + \sa Begin(), ConstEnd() +*/ SamSequenceIterator SamSequenceDictionary::End(void) { return m_data.end(); } -// returns const_iterator to container begin +/*! \fn SamSequenceConstIterator SamSequenceDictionary::End(void) const + + This is an overloaded function. + + \return a const STL iterator pointing to the imaginary entry after the last sequence + \sa Begin(), ConstEnd() +*/ SamSequenceConstIterator SamSequenceDictionary::End(void) const { return m_data.end(); } -// returns vector index of sequence if found -// returns vector::size() (invalid index) if not found +/*! \fn int SamSequenceDictionary::IndexOf(const SamSequence& sequence) const + \internal + + Uses operator==(SamSequence, SamSequence) + + \return index of sequence if found. Otherwise, returns vector::size() (invalid index). 
+*/ int SamSequenceDictionary::IndexOf(const SamSequence& sequence) const { SamSequenceConstIterator begin = ConstBegin(); SamSequenceConstIterator iter = begin; SamSequenceConstIterator end = ConstEnd(); - for ( ; iter != end; ++iter ) - if ( *iter == sequence ) break; + for ( ; iter != end; ++iter ) { + const SamSequence& currentSeq = (*iter); + if ( currentSeq == sequence ) + break; + } return distance( begin, iter ); } -// overload to support std::string -int SamSequenceDictionary::IndexOf(const string& sequenceName) const { - return IndexOf( SamSequence(sequenceName) ); +/*! \fn int SamSequenceDictionary::IndexOf(const std::string& name) const + \internal + + Use comparison of SamSequence::Name to \a name + + \return index of sequence if found. Otherwise, returns vector::size() (invalid index). +*/ +int SamSequenceDictionary::IndexOf(const std::string& name) const { + SamSequenceConstIterator begin = ConstBegin(); + SamSequenceConstIterator iter = begin; + SamSequenceConstIterator end = ConstEnd(); + for ( ; iter != end; ++iter ) { + const SamSequence& currentSeq = (*iter); + if ( currentSeq.Name == name ) + break; + } + return distance( begin, iter ); } -// returns true if container is empty +/*! \fn bool SamSequenceDictionary::IsEmpty(void) const + \brief Returns \c true if dictionary contains no sequences + \sa Size() +*/ bool SamSequenceDictionary::IsEmpty(void) const { return m_data.empty(); } -// removes sequence (if it exists) +/*! \fn void SamSequenceDictionary::Remove(const SamSequence& sequence) + \brief Removes sequence from dictionary, if found. + \param sequence sequence to remove +*/ void SamSequenceDictionary::Remove(const SamSequence& sequence) { if ( Contains(sequence) ) m_data.erase( m_data.begin() + IndexOf(sequence) ); } -// overlaod to support std::string -void SamSequenceDictionary::Remove(const string& sequenceName) { - Remove( SamSequence(sequenceName) ); +/*! 
\fn void SamSequenceDictionary::Remove(const std::string& sequenceName) + \brief Removes sequence from dictionary, if found. + + \param sequenceName name of sequence to remove + \sa Remove() +*/ +void SamSequenceDictionary::Remove(const std::string& sequenceName) { + if ( Contains(sequenceName) ) + m_data.erase( m_data.begin() + IndexOf(sequenceName) ); } -// remove multiple sequences -void SamSequenceDictionary::Remove(const vector& sequences) { +/*! \fn void SamSequenceDictionary::Remove(const std::vector& sequences) + \brief Removes multiple sequences from dictionary. + + This is an overloaded function. + + \param sequences sequences to remove + \sa Remove() +*/ +void SamSequenceDictionary::Remove(const std::vector& sequences) { vector::const_iterator rgIter = sequences.begin(); vector::const_iterator rgEnd = sequences.end(); for ( ; rgIter!= rgEnd; ++rgIter ) Remove(*rgIter); } -// overload to support std::string -void SamSequenceDictionary::Remove(const vector& sequenceNames) { +/*! \fn void SamSequenceDictionary::Remove(const std::vector& sequenceNames) + \brief Removes multiple sequences from dictionary. + + This is an overloaded function. + + \param sequenceNames names of the sequences to remove + \sa Remove() +*/ +void SamSequenceDictionary::Remove(const std::vector& sequenceNames) { vector::const_iterator rgIter = sequenceNames.begin(); vector::const_iterator rgEnd = sequenceNames.end(); for ( ; rgIter!= rgEnd; ++rgIter ) Remove(*rgIter); } -// returns size of container (number of current sequences) +/*! \fn int SamSequenceDictionary::Size(void) const + \brief Returns number of sequences in dictionary. + \sa IsEmpty() +*/ int SamSequenceDictionary::Size(void) const { return m_data.size(); } -// retrieves the SamSequence object associated with this name -// if sequenceName is unknown, a new SamSequence is created with this name (and invalid length 0) -// and a reference to this new sequence entry is returned (like std::map) +/*! 
\fn SamSequence& SamSequenceDictionary::operator[](const std::string& sequenceName) + \brief Retrieves the modifiable SamSequence that matches \a sequenceName. + + NOTE - If the dictionary contains no sequence matching this name, this function inserts + a new one with this name, and returns a reference to it. + + If you want to avoid this insertion behavior, check the result of Contains() before + using this operator. + + \param sequenceName name of sequence to retrieve + \return a modifiable reference to the SamSequence associated with the name +*/ SamSequence& SamSequenceDictionary::operator[](const std::string& sequenceName) { // look up sequence ID @@ -165,10 +301,7 @@ SamSequence& SamSequenceDictionary::operator[](const std::string& sequenceName) // otherwise, append new sequence and return reference else { - SamSequence seq(sequenceName); - seq.Length = "0"; - m_data.push_back(seq); + m_data.push_back( SamSequence(sequenceName, 0) ); return m_data.back(); } } - diff --git a/src/api/SamSequenceDictionary.h b/src/api/SamSequenceDictionary.h index ce80756..fca8b22 100644 --- a/src/api/SamSequenceDictionary.h +++ b/src/api/SamSequenceDictionary.h @@ -3,10 +3,10 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 23 December 2010 (DB) +// Last modified: 20 March 2011 // --------------------------------------------------------------------------- -// Provides container operations for collection of sequence entries -// ************************************************************************* +// Provides methods for operating on a collection of SamSequence entries. 
+// *************************************************************************** #ifndef SAM_SEQUENCE_DICTIONARY_H #define SAM_SEQUENCE_DICTIONARY_H @@ -14,6 +14,7 @@ #include #include #include +#include #include namespace BamTools { @@ -32,15 +33,15 @@ class API_EXPORT SamSequenceDictionary { // query/modify sequence data public: - // add a sequence + // adds a sequence void Add(const SamSequence& sequence); - void Add(const std::string& sequenceNames); + void Add(const std::string& name, const int& length); - // add multiple sequences + // adds multiple sequences void Add(const std::vector& sequences); - void Add(const std::vector& sequenceNames); + void Add(const std::map& sequenceMap); - // clear all sequence records + // clears all sequence entries void Clear(void); // returns true if dictionary contains this sequence @@ -50,39 +51,33 @@ class API_EXPORT SamSequenceDictionary { // returns true if dictionary is empty bool IsEmpty(void) const; - // remove a single sequence (does nothing if sequence not found) + // removes sequence, if found void Remove(const SamSequence& sequence); void Remove(const std::string& sequenceName); - // remove multiple sequences + // removes multiple sequences void Remove(const std::vector& sequences); void Remove(const std::vector& sequenceNames); - // returns size of dictionary (number of current elements) + // returns number of sequences in dictionary int Size(void) const; - // retrieves the SamSequence object associated with this name - // if sequenceName is unknown, a new SamSequence is created with this name (and invalid length 0) - // and a reference to this new sequence entry is returned (like std::map) - // - // * To avoid these partial entries being created, it is recommended to check - // for existence first using Contains() + // retrieves a modifiable reference to the SamSequence object associated with this name SamSequence& operator[](const std::string& sequenceName); - // retrieve sequence iterators - // these are 
typedefs for STL iterators and thus are compatible with STL containers/algorithms + // retrieve STL-compatible iterators public: - SamSequenceIterator Begin(void); - SamSequenceConstIterator Begin(void) const; - SamSequenceConstIterator ConstBegin(void) const; - SamSequenceIterator End(void); - SamSequenceConstIterator End(void) const; - SamSequenceConstIterator ConstEnd(void) const; + SamSequenceIterator Begin(void); // returns iterator to begin() + SamSequenceConstIterator Begin(void) const; // returns const_iterator to begin() + SamSequenceConstIterator ConstBegin(void) const; // returns const_iterator to begin() + SamSequenceIterator End(void); // returns iterator to end() + SamSequenceConstIterator End(void) const; // returns const_iterator to end() + SamSequenceConstIterator ConstEnd(void) const; // returns const_iterator to end() // internal methods private: int IndexOf(const SamSequence& sequence) const; - int IndexOf(const std::string& sequenceName) const; + int IndexOf(const std::string& name) const; // data members private: diff --git a/src/api/internal/BamHeader_p.cpp b/src/api/internal/BamHeader_p.cpp index 2fb8257..0eaf7bc 100644 --- a/src/api/internal/BamHeader_p.cpp +++ b/src/api/internal/BamHeader_p.cpp @@ -3,15 +3,15 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 25 December 2010 (DB) +// Last modified: 21 March 2011 (DB) // --------------------------------------------------------------------------- // Provides the basic functionality for handling BAM headers. 
// *************************************************************************** #include #include -#include #include +#include using namespace BamTools; using namespace BamTools::Internal; @@ -21,35 +21,45 @@ using namespace BamTools::Internal; #include using namespace std; -// --------------------------------- -// BamHeaderPrivate implementation +// ctor +BamHeader::BamHeader(void) { } -struct BamHeader::BamHeaderPrivate { +// dtor +BamHeader::~BamHeader(void) { } - // data members - SamHeader* m_samHeader; +// reads magic number from BGZF stream, returns true if valid +bool BamHeader::CheckMagicNumber(BgzfStream* stream) { - // ctor - BamHeaderPrivate(void) - : m_samHeader(new SamHeader("")) - { } + // try to read magic number + char buffer[Constants::BAM_HEADER_MAGIC_LENGTH]; + if ( stream->Read(buffer, Constants::BAM_HEADER_MAGIC_LENGTH) != (int)Constants::BAM_HEADER_MAGIC_LENGTH ) { + fprintf(stderr, "BamHeader ERROR: could not read magic number\n"); + return false; + } - // dtor - ~BamHeaderPrivate(void) { - delete m_samHeader; - m_samHeader = 0; + // validate magic number + if ( strncmp(buffer, Constants::BAM_HEADER_MAGIC, Constants::BAM_HEADER_MAGIC_LENGTH) != 0 ) { + fprintf(stderr, "BamHeader ERROR: invalid magic number\n"); + return false; } - // 'public' interface - bool Load(BgzfData* stream); + // all checks out + return true; +} - // internal methods - bool CheckMagicNumber(BgzfData* stream); - bool ReadHeaderLength(BgzfData* stream, uint32_t& length); - bool ReadHeaderText(BgzfData* stream, const uint32_t& length); -}; +// clear SamHeader data +void BamHeader::Clear(void) { + m_header.Clear(); +} -bool BamHeader::BamHeaderPrivate::Load(BgzfData* stream) { +// return true if SamHeader data is valid +bool BamHeader::IsValid(void) const { + return m_header.IsValid(); +} + +// load BAM header ('magic number' and SAM header text) from BGZF stream +// returns true if all OK +bool BamHeader::Load(BgzfStream* stream) { // cannot load if invalid stream if ( 
stream == 0 ) @@ -72,42 +82,27 @@ bool BamHeader::BamHeaderPrivate::Load(BgzfData* stream) { return true; } -bool BamHeader::BamHeaderPrivate::CheckMagicNumber(BgzfData* stream) { - - // try to read magic number - char buffer[Constants::BAM_HEADER_MAGIC_SIZE]; - if ( stream->Read(buffer, Constants::BAM_HEADER_MAGIC_SIZE) != (int)Constants::BAM_HEADER_MAGIC_SIZE ) { - fprintf(stderr, "BAM header error - could not read magic number\n"); - return false; - } - - // validate magic number - if ( strncmp(buffer, Constants::BAM_HEADER_MAGIC, Constants::BAM_HEADER_MAGIC_SIZE) != 0 ) { - fprintf(stderr, "BAM header error - invalid magic number\n"); - return false; - } - - // all checks out - return true; -} - -bool BamHeader::BamHeaderPrivate::ReadHeaderLength(BgzfData* stream, uint32_t& length) { +// reads SAM header text length from BGZF stream, stores it in @length +// returns read success/fail status +bool BamHeader::ReadHeaderLength(BgzfStream* stream, uint32_t& length) { // attempt to read BAM header text length char buffer[sizeof(uint32_t)]; if ( stream->Read(buffer, sizeof(uint32_t)) != sizeof(uint32_t) ) { - fprintf(stderr, "BAM header error - could not read header length\n"); + fprintf(stderr, "BamHeader ERROR: could not read header length\n"); return false; } // convert char buffer to length, return success - length = BgzfData::UnpackUnsignedInt(buffer); + length = BamTools::UnpackUnsignedInt(buffer); if ( BamTools::SystemIsBigEndian() ) - SwapEndian_32(length); + BamTools::SwapEndian_32(length); return true; } -bool BamHeader::BamHeaderPrivate::ReadHeaderText(BgzfData* stream, const uint32_t& length) { +// reads SAM header text from BGZF stream, stores in SamHeader object +// returns read success/fail status +bool BamHeader::ReadHeaderText(BgzfStream* stream, const uint32_t& length) { // set up destination buffer char* headerText = (char*)calloc(length + 1, 1); @@ -116,9 +111,9 @@ bool BamHeader::BamHeaderPrivate::ReadHeaderText(BgzfData* stream, const uint32_ 
const unsigned bytesRead = stream->Read(headerText, length); const bool readOk = ( bytesRead == length ); if ( readOk ) - m_samHeader->SetHeaderText( (string)((const char*)headerText) ); + m_header.SetHeaderText( (string)((const char*)headerText) ); else - fprintf(stderr, "BAM header error - could not read header text\n"); + fprintf(stderr, "BamHeader ERROR: could not read header text\n"); // clean up calloc-ed temp variable (on success or fail) free(headerText); @@ -127,34 +122,12 @@ bool BamHeader::BamHeaderPrivate::ReadHeaderText(BgzfData* stream, const uint32_ return readOk; } -// -------------------------- -// BamHeader implementation - -BamHeader::BamHeader(void) - : d(new BamHeaderPrivate) -{ } - -BamHeader::~BamHeader(void) { - delete d; - d = 0; -} - -void BamHeader::Clear(void) { - d->m_samHeader->Clear(); -} - -bool BamHeader::IsValid(void) const { - return d->m_samHeader->IsValid(); -} - -bool BamHeader::Load(BgzfData* stream) { - return d->Load(stream); -} - +// returns *copy* of SamHeader data object SamHeader BamHeader::ToSamHeader(void) const { - return *(d->m_samHeader); + return m_header; } +// returns SAM-formatted string of header data string BamHeader::ToString(void) const { - return d->m_samHeader->ToString(); + return m_header.ToString(); } diff --git a/src/api/internal/BamHeader_p.h b/src/api/internal/BamHeader_p.h index 352ce88..1f1a31c 100644 --- a/src/api/internal/BamHeader_p.h +++ b/src/api/internal/BamHeader_p.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 25 December 2010 (DB) +// Last modified: 26 January 2011 (DB) // --------------------------------------------------------------------------- // Provides the basic functionality for handling BAM headers. 
// *************************************************************************** @@ -25,29 +25,45 @@ #include namespace BamTools { - -class BgzfData; - namespace Internal { +class BgzfStream; + class BamHeader { + // ctor & dtor public: BamHeader(void); ~BamHeader(void); + // BamHeader interface public: + // clear SamHeader data void Clear(void); + // return true if SamHeader data is valid bool IsValid(void) const; - bool Load(BgzfData* stream); - - public: + // load BAM header ('magic number' and SAM header text) from BGZF stream + // returns true if all OK + bool Load(BgzfStream* stream); + // returns (editable) copy of SamHeader data object SamHeader ToSamHeader(void) const; + // returns SAM-formatted string of header data std::string ToString(void) const; + // internal methods + private: + // reads magic number from BGZF stream, returns true if valid + bool CheckMagicNumber(BgzfStream* stream); + // reads SAM header length from BGZF stream, stores it in @length + // returns read success/fail status + bool ReadHeaderLength(BgzfStream* stream, uint32_t& length); + // reads SAM header text from BGZF stream, stores in SamHeader object + // returns read success/fail status + bool ReadHeaderText(BgzfStream* stream, const uint32_t& length); + + // data members private: - struct BamHeaderPrivate; - BamHeaderPrivate* d; + SamHeader m_header; }; } // namespace Internal diff --git a/src/api/internal/BamIndexFactory_p.cpp b/src/api/internal/BamIndexFactory_p.cpp new file mode 100644 index 0000000..dcc11ce --- /dev/null +++ b/src/api/internal/BamIndexFactory_p.cpp @@ -0,0 +1,110 @@ +// *************************************************************************** +// BamIndexFactory_p.cpp (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. 
+// --------------------------------------------------------------------------- +// Last modified: 21 March 2011 (DB) +// --------------------------------------------------------------------------- +// Provides interface for generating BamIndex implementations +// *************************************************************************** + +#include +#include +#include +#include +using namespace BamTools; +using namespace BamTools::Internal; + +#include +using namespace std; + +// generates index filename from BAM filename (depending on requested type) +// if type is unknown, returns empty string +const string BamIndexFactory::CreateIndexFilename(const string& bamFilename, + const BamIndex::IndexType& type) +{ + switch ( type ) { + case ( BamIndex::STANDARD ) : return ( bamFilename + BAI_EXTENSION ); + case ( BamIndex::BAMTOOLS ) : return ( bamFilename + BTI_EXTENSION ); + default : + fprintf(stderr, "BamIndexFactory ERROR: unknown index type %u\n", type); + return string(); + } +} + +// creates a new BamIndex object, depending on extension of @indexFilename +BamIndex* BamIndexFactory::CreateIndexFromFilename(const string& indexFilename) { + + // if file doesn't exist, return null index + if ( !BamTools::FileExists(indexFilename) ) + return 0; + + // get file extension from index filename, including dot (".EXT") + // if can't get file extension, return null index + const string extension = FileExtension(indexFilename); + if ( extension.empty() ) + return 0; + + // create index based on extension + if ( extension == BAI_EXTENSION ) return new BamStandardIndex; + else if ( extension == BTI_EXTENSION ) return new BamToolsIndex; + else return 0; +} + +// creates a new BamIndex, object of requested @type +BamIndex* BamIndexFactory::CreateIndexOfType(const BamIndex::IndexType& type) { + switch ( type ) { + case ( BamIndex::STANDARD ) : return new BamStandardIndex; + case ( BamIndex::BAMTOOLS ) : return new BamToolsIndex; + default : + fprintf(stderr, "BamIndexFactory 
ERROR: unknown index type %u\n", type); + return 0; + } +} + +// retrieves file extension (including '.') +const string BamIndexFactory::FileExtension(const string& filename) { + + // if filename cannot contain valid path + extension, return empty string + if ( filename.empty() || filename.length() <= 4 ) + return string(); + + // look for last dot in filename + size_t lastDotPosition = filename.find_last_of('.'); + + // if none found, return empty string + if ( lastDotPosition == string::npos ) + return string(); + + // return substring from last dot position + return filename.substr(lastDotPosition); +} + +// returns name of existing index file that corresponds to @bamFilename +// will defer to @preferredType if possible, if not will attempt to load any supported type +// returns empty string if not found +const string BamIndexFactory::FindIndexFilename(const string& bamFilename, + const BamIndex::IndexType& preferredType) +{ + // try to find index of preferred type first + // return index filename if found + string indexFilename = CreateIndexFilename(bamFilename, preferredType); + if ( !indexFilename.empty() && BamTools::FileExists(indexFilename) ) + return indexFilename; + + // couldn't find preferred type, try the other supported types + // return index filename if found + if ( preferredType != BamIndex::STANDARD ) { + indexFilename = CreateIndexFilename(bamFilename, BamIndex::STANDARD); + if ( !indexFilename.empty() && BamTools::FileExists(indexFilename) ) + return indexFilename; + } + if ( preferredType != BamIndex::BAMTOOLS ) { + indexFilename = CreateIndexFilename(bamFilename, BamIndex::BAMTOOLS); + if ( !indexFilename.empty() && BamTools::FileExists(indexFilename) ) + return indexFilename; + } + + // otherwise couldn't find any index matching this filename + return string(); +} diff --git a/src/api/internal/BamIndexFactory_p.h b/src/api/internal/BamIndexFactory_p.h new file mode 100644 index 0000000..4ef9585 --- /dev/null +++ 
b/src/api/internal/BamIndexFactory_p.h @@ -0,0 +1,48 @@ +// *************************************************************************** +// BamIndexFactory_p.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 26 January 2011 (DB) +// --------------------------------------------------------------------------- +// Provides interface for generating BamIndex implementations +// *************************************************************************** + +#ifndef BAMINDEX_FACTORY_P_H +#define BAMINDEX_FACTORY_P_H + +#include +#include + +namespace BamTools { +namespace Internal { + +class BamIndexFactory { + + // static interface methods + public: + // creates a new BamIndex object, depending on extension of @indexFilename + static BamIndex* CreateIndexFromFilename(const std::string& indexFilename); + // creates a new BamIndex object, of requested @type + static BamIndex* CreateIndexOfType(const BamIndex::IndexType& type); + // returns name of existing index file that corresponds to @bamFilename + // will defer to @preferredType if possible + // if @preferredType not found, will attempt to load any supported index type + // returns empty string if no index file (of any type) is found + static const std::string FindIndexFilename(const std::string& bamFilename, + const BamIndex::IndexType& preferredType); + + // internal methods + public: + // generates index filename from BAM filename (depending on requested type) + // if type is unknown, returns empty string + static const std::string CreateIndexFilename(const std::string& bamFilename, + const BamIndex::IndexType& type); + // retrieves file extension (including '.') + static const std::string FileExtension(const std::string& filename); +}; + +} // namespace Internal +} // namespace BamTools + +#endif // BAMINDEX_FACTORY_P_H diff --git 
a/src/api/internal/BamMultiMerger_p.h b/src/api/internal/BamMultiMerger_p.h index b21fe31..ae67eea 100644 --- a/src/api/internal/BamMultiMerger_p.h +++ b/src/api/internal/BamMultiMerger_p.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 17 January 2011 (DB) +// Last modified: 18 March 2011 (DB) // --------------------------------------------------------------------------- // Provides merging functionality for BamMultiReader. At this point, supports // sorting results by (refId, position) or by read name. @@ -45,7 +45,9 @@ class IBamMultiMerger { virtual void Add(const ReaderAlignment& value) =0; virtual void Clear(void) =0; virtual const ReaderAlignment& First(void) const =0; - virtual const int Size(void) const =0; + virtual bool IsEmpty(void) const =0; + virtual void Remove(BamReader* reader) =0; + virtual int Size(void) const =0; virtual ReaderAlignment TakeFirst(void) =0; }; @@ -60,17 +62,21 @@ class PositionMultiMerger : public IBamMultiMerger { void Add(const ReaderAlignment& value); void Clear(void); const ReaderAlignment& First(void) const; - const int Size(void) const; + bool IsEmpty(void) const; + void Remove(BamReader* reader); + int Size(void) const; ReaderAlignment TakeFirst(void); private: - typedef std::pair KeyType; - typedef std::multimap IndexType; - typedef std::pair KeyValueType; - typedef IndexType::iterator IndexIterator; - typedef IndexType::const_iterator IndexConstIterator; + typedef std::pair KeyType; + typedef ReaderAlignment ValueType; + typedef std::pair ElementType; - IndexType m_data; + typedef std::multimap ContainerType; + typedef ContainerType::iterator DataIterator; + typedef ContainerType::const_iterator DataConstIterator; + + ContainerType m_data; }; // IBamMultiMerger implementation - sorted on BamAlignment: Name @@ -84,17 +90,21 @@ class ReadNameMultiMerger : public IBamMultiMerger { 
void Add(const ReaderAlignment& value); void Clear(void); const ReaderAlignment& First(void) const; - const int Size(void) const; + bool IsEmpty(void) const; + void Remove(BamReader* reader); + int Size(void) const; ReaderAlignment TakeFirst(void); private: - typedef std::string KeyType; - typedef std::multimap IndexType; - typedef std::pair KeyValueType; - typedef IndexType::iterator IndexIterator; - typedef IndexType::const_iterator IndexConstIterator; + typedef std::string KeyType; + typedef ReaderAlignment ValueType; + typedef std::pair ElementType; + + typedef std::multimap ContainerType; + typedef ContainerType::iterator DataIterator; + typedef ContainerType::const_iterator DataConstIterator; - IndexType m_data; + ContainerType m_data; }; // IBamMultiMerger implementation - unsorted BAM file(s) @@ -108,20 +118,26 @@ class UnsortedMultiMerger : public IBamMultiMerger { void Add(const ReaderAlignment& value); void Clear(void); const ReaderAlignment& First(void) const; - const int Size(void) const; + bool IsEmpty(void) const; + void Remove(BamReader* reader); + int Size(void) const; ReaderAlignment TakeFirst(void); private: - typedef std::queue IndexType; - IndexType m_data; + typedef ReaderAlignment ElementType; + typedef std::vector ContainerType; + typedef ContainerType::iterator DataIterator; + typedef ContainerType::const_iterator DataConstIterator; + + ContainerType m_data; }; // ------------------------------------------ // PositionMultiMerger implementation inline void PositionMultiMerger::Add(const ReaderAlignment& value) { - const KeyType key = std::make_pair( value.second->RefID, value.second->Position ); - m_data.insert( KeyValueType(key, value) ); + const KeyType key( value.second->RefID, value.second->Position ); + m_data.insert( ElementType(key, value) ); } inline void PositionMultiMerger::Clear(void) { @@ -129,16 +145,41 @@ inline void PositionMultiMerger::Clear(void) { } inline const ReaderAlignment& PositionMultiMerger::First(void) const { - 
const KeyValueType& entry = (*m_data.begin()); + const ElementType& entry = (*m_data.begin()); return entry.second; } -inline const int PositionMultiMerger::Size(void) const { +inline bool PositionMultiMerger::IsEmpty(void) const { + return m_data.empty(); +} + +inline void PositionMultiMerger::Remove(BamReader* reader) { + + if ( reader == 0 ) return; + const std::string filenameToRemove = reader->GetFilename(); + + // iterate over readers in cache + DataIterator dataIter = m_data.begin(); + DataIterator dataEnd = m_data.end(); + for ( ; dataIter != dataEnd; ++dataIter ) { + const ValueType& entry = (*dataIter).second; + const BamReader* entryReader = entry.first; + if ( entryReader == 0 ) continue; + + // remove iterator on match + if ( entryReader->GetFilename() == filenameToRemove ) { + m_data.erase(dataIter); + return; + } + } +} + +inline int PositionMultiMerger::Size(void) const { return m_data.size(); } inline ReaderAlignment PositionMultiMerger::TakeFirst(void) { - IndexIterator first = m_data.begin(); + DataIterator first = m_data.begin(); ReaderAlignment next = (*first).second; m_data.erase(first); return next; @@ -148,8 +189,8 @@ inline ReaderAlignment PositionMultiMerger::TakeFirst(void) { // ReadNameMultiMerger implementation inline void ReadNameMultiMerger::Add(const ReaderAlignment& value) { - const KeyType key = value.second->Name; - m_data.insert( KeyValueType(key, value) ); + const KeyType key(value.second->Name); + m_data.insert( ElementType(key, value) ); } inline void ReadNameMultiMerger::Clear(void) { @@ -157,16 +198,42 @@ inline void ReadNameMultiMerger::Clear(void) { } inline const ReaderAlignment& ReadNameMultiMerger::First(void) const { - const KeyValueType& entry = (*m_data.begin()); + const ElementType& entry = (*m_data.begin()); return entry.second; } -inline const int ReadNameMultiMerger::Size(void) const { +inline bool ReadNameMultiMerger::IsEmpty(void) const { + return m_data.empty(); +} + +inline void 
ReadNameMultiMerger::Remove(BamReader* reader) { + + if ( reader == 0 ) return; + const std::string filenameToRemove = reader->GetFilename(); + + // iterate over readers in cache + DataIterator dataIter = m_data.begin(); + DataIterator dataEnd = m_data.end(); + for ( ; dataIter != dataEnd; ++dataIter ) { + const ValueType& entry = (*dataIter).second; + const BamReader* entryReader = entry.first; + if ( entryReader == 0 ) continue; + + // remove iterator on match + if ( entryReader->GetFilename() == filenameToRemove ) { + m_data.erase(dataIter); + return; + } + } + +} + +inline int ReadNameMultiMerger::Size(void) const { return m_data.size(); } inline ReaderAlignment ReadNameMultiMerger::TakeFirst(void) { - IndexIterator first = m_data.begin(); + DataIterator first = m_data.begin(); ReaderAlignment next = (*first).second; m_data.erase(first); return next; @@ -176,25 +243,49 @@ inline ReaderAlignment ReadNameMultiMerger::TakeFirst(void) { // UnsortedMultiMerger implementation inline void UnsortedMultiMerger::Add(const ReaderAlignment& value) { - m_data.push(value); + m_data.push_back(value); } inline void UnsortedMultiMerger::Clear(void) { for (size_t i = 0; i < m_data.size(); ++i ) - m_data.pop(); + m_data.pop_back(); } inline const ReaderAlignment& UnsortedMultiMerger::First(void) const { return m_data.front(); } -inline const int UnsortedMultiMerger::Size(void) const { +inline bool UnsortedMultiMerger::IsEmpty(void) const { + return m_data.empty(); +} + +inline void UnsortedMultiMerger::Remove(BamReader* reader) { + + if ( reader == 0 ) return; + const std::string filenameToRemove = reader->GetFilename(); + + // iterate over readers in cache + DataIterator dataIter = m_data.begin(); + DataIterator dataEnd = m_data.end(); + for ( ; dataIter != dataEnd; ++dataIter ) { + const BamReader* entryReader = (*dataIter).first; + if ( entryReader == 0 ) continue; + + // remove iterator on match + if ( entryReader->GetFilename() == filenameToRemove ) { + 
m_data.erase(dataIter); + return; + } + } +} + +inline int UnsortedMultiMerger::Size(void) const { return m_data.size(); } inline ReaderAlignment UnsortedMultiMerger::TakeFirst(void) { ReaderAlignment first = m_data.front(); - m_data.pop(); + m_data.erase( m_data.begin() ); return first; } diff --git a/src/api/internal/BamMultiReader_p.cpp b/src/api/internal/BamMultiReader_p.cpp index bfec6bb..583085c 100644 --- a/src/api/internal/BamMultiReader_p.cpp +++ b/src/api/internal/BamMultiReader_p.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 17 January 2011 (DB) +// Last modified: 21 March 2011 (DB) // --------------------------------------------------------------------------- // Functionality for simultaneously reading multiple BAM files // ************************************************************************* @@ -40,47 +40,82 @@ BamMultiReaderPrivate::~BamMultiReaderPrivate(void) { m_alignments = 0; } -// close the BAM files +// close all BAM files void BamMultiReaderPrivate::Close(void) { + CloseFiles( Filenames() ); +} - // clear out alignment cache - m_alignments->Clear(); - - // iterate over readers - vector::iterator readerIter = m_readers.begin(); - vector::iterator readerEnd = m_readers.end(); - for ( ; readerIter != readerEnd; ++readerIter ) { +// close requested BAM file +void BamMultiReaderPrivate::CloseFile(const string& filename) { + vector filenames(1, filename); + CloseFiles(filenames); +} - // close reader - BamReader* reader = (*readerIter).first; - BamAlignment* alignment = (*readerIter).second; - if ( reader ) reader->Close(); +// close requested BAM files +void BamMultiReaderPrivate::CloseFiles(const vector& filenames) { - // delete pointers - delete reader; - reader = 0; - delete alignment; - alignment = 0; + // iterate over filenames + vector::const_iterator filesIter = filenames.begin(); + 
vector::const_iterator filesEnd = filenames.end(); + for ( ; filesIter != filesEnd; ++filesIter ) { + const string& filename = (*filesIter); + if ( filename.empty() ) continue; + + // iterate over readers + vector::iterator readerIter = m_readers.begin(); + vector::iterator readerEnd = m_readers.end(); + for ( ; readerIter != readerEnd; ++readerIter ) { + BamReader* reader = (*readerIter).first; + if ( reader == 0 ) continue; + + // if reader matches requested filename + if ( reader->GetFilename() == filename ) { + + // remove reader/alignment from alignment cache + m_alignments->Remove(reader); + + // close & delete reader + reader->Close(); + delete reader; + reader = 0; + + // delete reader's alignment entry + BamAlignment* alignment = (*readerIter).second; + delete alignment; + alignment = 0; + + // remove reader from container + m_readers.erase(readerIter); + + // on match, just go on to next filename + // (no need to keep looking and iterator is invalid now anyway) + break; + } + } } - // clear out readers - m_readers.clear(); - - // reset default flags - m_isCoreMode = false; - m_sortOrder = BamMultiReader::SortedByPosition; + // make sure alignment cache is cleared if all readers are now closed + if ( m_readers.empty() && m_alignments != 0 ) + m_alignments->Clear(); } -// saves index data to BAM index files (".bai"/".bti") where necessary, returns success/fail -bool BamMultiReaderPrivate::CreateIndexes(bool useStandardIndex) { +// creates index files for BAM files that don't have them +bool BamMultiReaderPrivate::CreateIndexes(const BamIndex::IndexType& type) { bool result = true; + + // iterate over readers vector::iterator readerIter = m_readers.begin(); vector::iterator readerEnd = m_readers.end(); for ( ; readerIter != readerEnd; ++readerIter ) { BamReader* reader = (*readerIter).first; - result &= reader->CreateIndex(useStandardIndex); + if ( reader == 0 ) continue; + + // if reader doesn't have an index, create one + if ( !reader->HasIndex() ) + 
result &= reader->CreateIndex(type); } + return result; } @@ -89,7 +124,8 @@ IBamMultiMerger* BamMultiReaderPrivate::CreateMergerForCurrentSortOrder(void) co case ( BamMultiReader::SortedByPosition ) : return new PositionMultiMerger; case ( BamMultiReader::SortedByReadName ) : return new ReadNameMultiMerger; case ( BamMultiReader::Unsorted ) : return new UnsortedMultiMerger; - default : //print error + default : + cerr << "BamMultiReader ERROR: requested sort order is unknown" << endl; return 0; } } @@ -113,21 +149,50 @@ const string BamMultiReaderPrivate::ExtractReadGroup(const string& headerLine) c return readGroup; } +const vector BamMultiReaderPrivate::Filenames(void) const { + + // init filename container + vector filenames; + filenames.reserve( m_readers.size() ); + + // iterate over readers + vector::const_iterator readerIter = m_readers.begin(); + vector::const_iterator readerEnd = m_readers.end(); + for ( ; readerIter != readerEnd; ++readerIter ) { + const BamReader* reader = (*readerIter).first; + if ( reader == 0 ) continue; + + // store filename if not empty + const string filename = reader->GetFilename(); + if ( !filename.empty() ) + filenames.push_back( reader->GetFilename() ); + } + + // return result + return filenames; +} + +SamHeader BamMultiReaderPrivate::GetHeader(void) const { + string text = GetHeaderText(); + return SamHeader(text); +} + // makes a virtual, unified header for all the bam files in the multireader -const string BamMultiReaderPrivate::GetHeaderText(void) const { +string BamMultiReaderPrivate::GetHeaderText(void) const { - // just spit single header out if only have one reader open + // if only one reader is open if ( m_readers.size() == 1 ) { + // just return reader's header text + const ReaderAlignment& ra = m_readers.front(); + const BamReader* reader = ra.first; + if ( reader ) return reader->GetHeaderText(); - vector::const_iterator readerBegin = m_readers.begin(); - const ReaderAlignment& entry = (*readerBegin); - const 
BamReader* reader = entry.first; - if ( reader == 0 ) return ""; - return reader->GetHeaderText(); + // invalid reader + return string(); } - string mergedHeader = ""; + string mergedHeader(""); map readGroups; // foreach extraction entry (each BAM file) @@ -135,10 +200,10 @@ const string BamMultiReaderPrivate::GetHeaderText(void) const { vector::const_iterator readerIter = readerBegin; vector::const_iterator readerEnd = m_readers.end(); for ( ; readerIter != readerEnd; ++readerIter ) { - - // get header from reader const BamReader* reader = (*readerIter).first; if ( reader == 0 ) continue; + + // get header from reader string headerText = reader->GetHeaderText(); if ( headerText.empty() ) continue; @@ -153,9 +218,10 @@ const string BamMultiReaderPrivate::GetHeaderText(void) const { // get next line from header, skip if empty const string headerLine = (*linesIter); - if ( headerLine.empty() ) { continue; } + if ( headerLine.empty() ) continue; // if first file, save HD & SQ entries + // TODO: what if first file has empty header, should just check for empty 'mergedHeader' instead ? 
if ( readerIter == readerBegin ) { if ( headerLine.find("@HD") == 0 || headerLine.find("@SQ") == 0) { mergedHeader.append(headerLine.c_str()); @@ -179,8 +245,8 @@ const string BamMultiReaderPrivate::GetHeaderText(void) const { // warn iff we are reading one file and discover duplicated @RG tags in the header // otherwise, we emit no warning, as we might be merging multiple BAM files with identical @RG tags if ( currentFileReadGroups.find(readGroup) != currentFileReadGroups.end() ) { - cerr << "WARNING: duplicate @RG tag " << readGroup - << " entry in header of " << reader->GetFilename() << endl; + cerr << "BamMultiReader WARNING: duplicate @RG tag " << readGroup + << " entry in header of " << reader->GetFilename() << endl; } } } @@ -193,198 +259,344 @@ const string BamMultiReaderPrivate::GetHeaderText(void) const { // get next alignment among all files bool BamMultiReaderPrivate::GetNextAlignment(BamAlignment& al) { - return LoadNextAlignment(al, false); + m_isCoreMode = false; + return LoadNextAlignment(al); } // get next alignment among all files without parsing character data from alignments bool BamMultiReaderPrivate::GetNextAlignmentCore(BamAlignment& al) { - return LoadNextAlignment(al, true); + m_isCoreMode = true; + return LoadNextAlignment(al); } // --------------------------------------------------------------------------------------- // // NB: The following GetReferenceX() functions assume that we have identical -// references for all BAM files. We enforce this by invoking the above -// validation function (ValidateReaders) to verify that our reference data -// is the same across all files on Open, so we will not encounter a situation -// in which there is a mismatch and we are still live. +// references for all BAM files. We enforce this by invoking the +// ValidateReaders() method to verify that our reference data is the same +// across all files on Open - so we will not encounter a situation in which +// there is a mismatch and we are still live. 
// // --------------------------------------------------------------------------------------- // returns the number of reference sequences -const int BamMultiReaderPrivate::GetReferenceCount(void) const { - const ReaderAlignment& firstReader = m_readers.front(); - const BamReader* reader = firstReader.first; +int BamMultiReaderPrivate::GetReferenceCount(void) const { + + // handle empty multireader + if ( m_readers.empty() ) + return 0; + + // return reference count from first reader + const ReaderAlignment& ra = m_readers.front(); + const BamReader* reader = ra.first; if ( reader ) return reader->GetReferenceCount(); - else return 0; + + // invalid reader + return 0; } // returns vector of reference objects const RefVector BamMultiReaderPrivate::GetReferenceData(void) const { - const ReaderAlignment& firstReader = m_readers.front(); - const BamReader* reader = firstReader.first; + + // handle empty multireader + if ( m_readers.empty() ) + return RefVector(); + + // return reference data from first BamReader + const ReaderAlignment& ra = m_readers.front(); + const BamReader* reader = ra.first; if ( reader ) return reader->GetReferenceData(); - else return RefVector(); + + // invalid reader + return RefVector(); } // returns refID from reference name -const int BamMultiReaderPrivate::GetReferenceID(const string& refName) const { - const ReaderAlignment& firstReader = m_readers.front(); - const BamReader* reader = firstReader.first; +int BamMultiReaderPrivate::GetReferenceID(const string& refName) const { + + // handle empty multireader + if ( m_readers.empty() ) + return -1; + + // return reference ID from first BamReader + const ReaderAlignment& ra = m_readers.front(); + const BamReader* reader = ra.first; if ( reader ) return reader->GetReferenceID(refName); - else return -1; // ERROR case - how to report -} + // invalid reader + return -1; +} // --------------------------------------------------------------------------------------- // checks if any readers still 
have alignments -bool BamMultiReaderPrivate::HasOpenReaders(void) { - return ( m_alignments->Size() > 0 ); +bool BamMultiReaderPrivate::HasAlignmentData(void) const { + if ( m_alignments == 0 ) + return false; + return !m_alignments->IsEmpty(); } -// returns whether underlying BAM readers ALL have an index loaded +// returns true if all readers have index data available // this is useful to indicate whether Jump() or SetRegion() are possible -bool BamMultiReaderPrivate::IsIndexLoaded(void) const { - bool ok = true; +bool BamMultiReaderPrivate::HasIndexes(void) const { + + // handle empty multireader + if ( m_readers.empty() ) + return false; + + bool result = true; + + // iterate over readers + vector::const_iterator readerIter = m_readers.begin(); + vector::const_iterator readerEnd = m_readers.end(); + for ( ; readerIter != readerEnd; ++readerIter ) { + const BamReader* reader = (*readerIter).first; + if ( reader == 0 ) continue; + + // see if current reader has index data + result &= reader->HasIndex(); + } + + return result; +} + +// returns true if multireader has open readers +bool BamMultiReaderPrivate::HasOpenReaders(void) { + + // iterate over readers vector::const_iterator readerIter = m_readers.begin(); vector::const_iterator readerEnd = m_readers.end(); for ( ; readerIter != readerEnd; ++readerIter ) { const BamReader* reader = (*readerIter).first; - if ( reader ) ok &= reader->IsIndexLoaded(); + if ( reader == 0 ) continue; + + // return true whenever an open reader is found + if ( reader->IsOpen() ) return true; } - return ok; + + // no readers open + return false; } -// jumps to specified region(refID, leftBound) in BAM files, returns success/fail +// performs random-access jump using (refID, position) as a left-bound bool BamMultiReaderPrivate::Jump(int refID, int position) { - bool ok = true; + // NB: While it may make sense to track readers in which we can + // successfully Jump, in practice a failure of Jump means "no + // alignments here." 
It makes sense to simply accept the failure, + // UpdateAlignments(), and continue. + + // iterate over readers vector::iterator readerIter = m_readers.begin(); vector::iterator readerEnd = m_readers.end(); for ( ; readerIter != readerEnd; ++readerIter ) { BamReader* reader = (*readerIter).first; if ( reader == 0 ) continue; - ok &= reader->Jump(refID, position); - if ( !ok ) { - cerr << "ERROR: could not jump " << reader->GetFilename() + // attempt jump() on each + if ( !reader->Jump(refID, position) ) { + cerr << "BamMultiReader ERROR: could not jump " << reader->GetFilename() << " to " << refID << ":" << position << endl; - exit(1); } } - if (ok) UpdateAlignments(); - return ok; + // update alignment cache & return success + UpdateAlignmentCache(); + return true; } -bool BamMultiReaderPrivate::LoadNextAlignment(BamAlignment& al, bool coreMode) { +bool BamMultiReaderPrivate::LoadNextAlignment(BamAlignment& al) { // bail out if no more data to process - if ( !HasOpenReaders() ) return false; + if ( !HasAlignmentData() ) + return false; // "pop" next alignment and reader ReaderAlignment nextReaderAlignment = m_alignments->TakeFirst(); - BamReader* reader = nextReaderAlignment.first; + BamReader* reader = nextReaderAlignment.first; BamAlignment* alignment = nextReaderAlignment.second; - // save it by copy to our argument - al = BamAlignment(*alignment); + // store cached alignment into destination parameter (by copy) + al = *alignment; // peek to next alignment & store in cache - m_isCoreMode = coreMode; - SaveNextAlignment(reader,alignment); + SaveNextAlignment(reader, alignment); // return success return true; } -// opens BAM files -bool BamMultiReaderPrivate::Open(const vector& filenames, - bool openIndexes, - bool coreMode, - bool preferStandardIndex) -{ - // store core mode flag - m_isCoreMode = coreMode; - - // first clear out any prior alignment cache prior data - if ( m_alignments ) { - m_alignments->Clear(); - delete m_alignments; - m_alignments = 0; +// 
locate (& load) index files for BAM readers that don't already have one loaded +bool BamMultiReaderPrivate::LocateIndexes(const BamIndex::IndexType& preferredType) { + + bool result = true; + + // iterate over readers + vector::iterator readerIter = m_readers.begin(); + vector::iterator readerEnd = m_readers.end(); + for ( ; readerIter != readerEnd; ++readerIter ) { + BamReader* reader = (*readerIter).first; + if ( reader == 0 ) continue; + + // if reader has no index, try to locate one + if ( !reader->HasIndex() ) + result &= reader->LocateIndex(preferredType); } - // create alignment cache based on sorting mode - m_alignments = CreateMergerForCurrentSortOrder(); - if ( m_alignments == 0 ) return false; + return result; +} + +// opens BAM files +bool BamMultiReaderPrivate::Open(const vector& filenames) { + + // create alignment cache if neccessary + if ( m_alignments == 0 ) { + m_alignments = CreateMergerForCurrentSortOrder(); + if ( m_alignments == 0 ) return false; + } // iterate over filenames vector::const_iterator filenameIter = filenames.begin(); vector::const_iterator filenameEnd = filenames.end(); for ( ; filenameIter != filenameEnd; ++filenameIter ) { - const string filename = (*filenameIter); + const string& filename = (*filenameIter); + if ( filename.empty() ) continue; - bool openedOk = true; - BamReader* reader = new BamReader; - openedOk = reader->Open(filename, "", openIndexes, preferStandardIndex); + // attempt to open BamReader on filename + BamReader* reader = OpenReader(filename); + if ( reader == 0 ) continue; - // if file opened ok - if ( openedOk ) { + // store reader with new alignment + m_readers.push_back( make_pair(reader, new BamAlignment) ); + } - // try to read first alignment - bool fileOk = true; - BamAlignment* alignment = new BamAlignment; - fileOk &= ( coreMode ? 
reader->GetNextAlignmentCore(*alignment) - : reader->GetNextAlignment(*alignment) ); + // validate & rewind any opened readers, also refreshes alignment cache + if ( !m_readers.empty() ) { + ValidateReaders(); + Rewind(); + } - if ( fileOk ) { + // return success + return true; +} - m_readers.push_back( make_pair(reader, alignment) ); - m_alignments->Add( make_pair(reader, alignment) ); +bool BamMultiReaderPrivate::OpenFile(const std::string& filename) { + vector filenames(1, filename); + return Open(filenames); +} - } else { - cerr << "WARNING: could not read first alignment in " - << filename << ", ignoring file" << endl; +bool BamMultiReaderPrivate::OpenIndexes(const vector& indexFilenames) { - // if only file available & could not be read, return failure - if ( filenames.size() == 1 ) - return false; - } + // TODO: This needs to be cleaner - should not assume same order. + // And either way, shouldn't start at first reader. Should start at + // first reader without an index? + + // make sure same number of index filenames as readers + if ( m_readers.size() != indexFilenames.size() || !indexFilenames.empty() ) + return false; + + // init result flag + bool result = true; + + // iterate over BamReaders + vector::const_iterator indexFilenameIter = indexFilenames.begin(); + vector::const_iterator indexFilenameEnd = indexFilenames.end(); + vector::iterator readerIter = m_readers.begin(); + vector::iterator readerEnd = m_readers.end(); + for ( ; readerIter != readerEnd; ++readerIter ) { + BamReader* reader = (*readerIter).first; + + // open index filename on reader + if ( reader ) { + const string& indexFilename = (*indexFilenameIter); + result &= reader->OpenIndex(indexFilename); + } + + // increment filename iterator, skip if no more index files to open + if ( ++indexFilenameIter == indexFilenameEnd ) + break; + } + + // TODO: validation ?? 
+ + // return success/fail + return result; +} + +BamReader* BamMultiReaderPrivate::OpenReader(const std::string& filename) { + + // create new BamReader + BamReader* reader = new BamReader; + + // if reader opens OK + if ( reader->Open(filename) ) { + + // attempt to read first alignment (sanity check) + // if ok, then return BamReader pointer + BamAlignment al; + if ( reader->GetNextAlignmentCore(al) ) + return reader; + + // could not read alignment + else { + cerr << "BamMultiReader WARNING: Could not read first alignment from " + << filename << ", ignoring file" << endl; } + } - // TODO; any further error handling when openedOK is false ?? - else return false; + // reader could not open + else { + cerr << "BamMultiReader WARNING: Could not open: " + << filename << ", ignoring file" << endl; } - // files opened ok, at least one alignment could be read, - // now need to check that all files use same reference data - ValidateReaders(); - return true; + // if we get here, there was a problem with this BAM file (opening or reading) + // clean up memory allocation & return null pointer + delete reader; + return 0; } // print associated filenames to stdout void BamMultiReaderPrivate::PrintFilenames(void) const { + const vector& filenames = Filenames(); + vector::const_iterator filenameIter = filenames.begin(); + vector::const_iterator filenameEnd = filenames.end(); + for ( ; filenameIter != filenameEnd; ++filenameIter ) + cout << (*filenameIter) << endl; +} - vector::const_iterator readerIter = m_readers.begin(); - vector::const_iterator readerEnd = m_readers.end(); - for ( ; readerIter != readerEnd; ++readerIter ) { - const BamReader* reader = (*readerIter).first; - if ( reader == 0 ) continue; - cout << reader->GetFilename() << endl; +// returns BAM file pointers to beginning of alignment data & resets alignment cache +bool BamMultiReaderPrivate::Rewind(void) { + + // clear out alignment cache + m_alignments->Clear(); + + // attempt to rewind files + if ( 
!RewindReaders() ) { + cerr << "BamMultiReader ERROR: could not rewind file(s) successfully"; + return false; } + + // reset cache & return success + UpdateAlignmentCache(); + return true; } // returns BAM file pointers to beginning of alignment data -bool BamMultiReaderPrivate::Rewind(void) { +bool BamMultiReaderPrivate::RewindReaders(void) { bool result = true; + + // iterate over readers vector::iterator readerIter = m_readers.begin(); vector::iterator readerEnd = m_readers.end(); for ( ; readerIter != readerEnd; ++readerIter ) { BamReader* reader = (*readerIter).first; if ( reader == 0 ) continue; + + // attempt rewind on BamReader result &= reader->Rewind(); } + return result; } @@ -404,13 +616,16 @@ void BamMultiReaderPrivate::SaveNextAlignment(BamReader* reader, BamAlignment* a } // sets the index caching mode on the readers -void BamMultiReaderPrivate::SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode) { +void BamMultiReaderPrivate::SetIndexCacheMode(const BamIndex::IndexCacheMode mode) { + // iterate over readers vector::iterator readerIter = m_readers.begin(); vector::iterator readerEnd = m_readers.end(); for ( ; readerIter != readerEnd; ++readerIter ) { BamReader* reader = (*readerIter).first; if ( reader == 0 ) continue; + + // set reader's index cache mode reader->SetIndexCacheMode(mode); } } @@ -422,19 +637,23 @@ bool BamMultiReaderPrivate::SetRegion(const BamRegion& region) { // alignments here." It makes sense to simply accept the failure, // UpdateAlignments(), and continue. 
+ // iterate over alignments vector::iterator readerIter = m_readers.begin(); vector::iterator readerEnd = m_readers.end(); for ( ; readerIter != readerEnd; ++readerIter ) { BamReader* reader = (*readerIter).first; if ( reader == 0 ) continue; + + // attempt to set BamReader's region of interest if ( !reader->SetRegion(region) ) { - cerr << "ERROR: could not jump " << reader->GetFilename() << " to " + cerr << "BamMultiReader ERROR: could not jump " << reader->GetFilename() << " to " << region.LeftRefID << ":" << region.LeftPosition << ".." << region.RightRefID << ":" << region.RightPosition << endl; } } - UpdateAlignments(); + // update alignment cache & return success + UpdateAlignmentCache(); return true; } @@ -452,7 +671,7 @@ void BamMultiReaderPrivate::SetSortOrder(const BamMultiReader::SortOrder& order) // copy old cache contents to new cache while ( m_alignments->Size() > 0 ) { - ReaderAlignment value = m_alignments->TakeFirst(); + ReaderAlignment value = m_alignments->TakeFirst(); // retrieves & 'pops' newAlignmentCache->Add(value); } @@ -461,33 +680,44 @@ void BamMultiReaderPrivate::SetSortOrder(const BamMultiReader::SortOrder& order) m_alignments = newAlignmentCache; } +// splits the entire header into a list of strings +const vector BamMultiReaderPrivate::SplitHeaderText(const string& headerText) const { + + stringstream header(headerText); + string item; + + vector lines; + while ( getline(header, item) ) + lines.push_back(item); + return lines; +} + // updates our alignment cache -void BamMultiReaderPrivate::UpdateAlignments(void) { +void BamMultiReaderPrivate::UpdateAlignmentCache(void) { + + // skip if invalid alignment cache + if ( m_alignments == 0 ) return; // clear the cache m_alignments->Clear(); + // seed cache with fully-populated alignments + // further updates will fill with full/core-only as requested + m_isCoreMode = false; + // iterate over readers vector::iterator readerIter = m_readers.begin(); vector::iterator readerEnd = 
m_readers.end(); for ( ; readerIter != readerEnd; ++readerIter ) { BamReader* reader = (*readerIter).first; BamAlignment* alignment = (*readerIter).second; - if ( reader == 0 ) continue; + if ( reader == 0 || alignment == 0 ) continue; + + // save next alignment from each reader in cache SaveNextAlignment(reader, alignment); } } -// splits the entire header into a list of strings -const vector BamMultiReaderPrivate::SplitHeaderText(const string& headerText) const { - stringstream header(headerText); - vector lines; - string item; - while ( getline(header, item) ) - lines.push_back(item); - return lines; -} - // ValidateReaders checks that all the readers point to BAM files representing // alignments against the same set of reference sequences, and that the // sequences are identically ordered. If these checks fail the operation of @@ -496,7 +726,7 @@ void BamMultiReaderPrivate::ValidateReaders(void) const { // retrieve first reader data const BamReader* firstReader = m_readers.front().first; - if ( firstReader == 0 ) return; // signal error? + if ( firstReader == 0 ) return; const RefVector firstReaderRefData = firstReader->GetReferenceData(); const int firstReaderRefCount = firstReader->GetReferenceCount(); const int firstReaderRefSize = firstReaderRefData.size(); @@ -508,7 +738,7 @@ void BamMultiReaderPrivate::ValidateReaders(void) const { // get current reader data BamReader* reader = (*readerIter).first; - if ( reader == 0 ) continue; // error? 
+ if ( reader == 0 ) continue; const RefVector currentReaderRefData = reader->GetReferenceData(); const int currentReaderRefCount = reader->GetReferenceCount(); const int currentReaderRefSize = currentReaderRefData.size(); @@ -522,7 +752,7 @@ void BamMultiReaderPrivate::ValidateReaders(void) const { if ( (currentReaderRefCount != firstReaderRefCount) || (firstReaderRefSize != currentReaderRefSize) ) { - cerr << "ERROR: mismatched number of references in " << reader->GetFilename() + cerr << "BamMultiReader ERROR: mismatched number of references in " << reader->GetFilename() << " expected " << firstReaderRefCount << " reference sequences but only found " << currentReaderRefCount << endl; exit(1); @@ -531,7 +761,6 @@ void BamMultiReaderPrivate::ValidateReaders(void) const { // this will be ok; we just checked above that we have identically-sized sets of references // here we simply check if they are all, in fact, equal in content while ( firstRefIter != firstRefEnd ) { - const RefData& firstRef = (*firstRefIter); const RefData& currentRef = (*currentRefIter); @@ -539,8 +768,10 @@ void BamMultiReaderPrivate::ValidateReaders(void) const { if ( (firstRef.RefName != currentRef.RefName) || (firstRef.RefLength != currentRef.RefLength) ) { - cerr << "ERROR: mismatched references found in " << reader->GetFilename() + cerr << "BamMultiReader ERROR: mismatched references found in " << reader->GetFilename() << " expected: " << endl; + + // print first reader's reference data RefVector::const_iterator refIter = firstReaderRefData.begin(); RefVector::const_iterator refEnd = firstReaderRefData.end(); for ( ; refIter != refEnd; ++refIter ) { @@ -549,6 +780,8 @@ void BamMultiReaderPrivate::ValidateReaders(void) const { } cerr << "but found: " << endl; + + // print current reader's reference data refIter = currentReaderRefData.begin(); refEnd = currentReaderRefData.end(); for ( ; refIter != refEnd; ++refIter ) { diff --git a/src/api/internal/BamMultiReader_p.h 
b/src/api/internal/BamMultiReader_p.h index 942f60b..b34fb0c 100644 --- a/src/api/internal/BamMultiReader_p.h +++ b/src/api/internal/BamMultiReader_p.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 17 January 2011 (DB) +// Last modified: 13 March 2011 (DB) // --------------------------------------------------------------------------- // Functionality for simultaneously reading multiple BAM files // ************************************************************************* @@ -21,6 +21,7 @@ // // We mean it. +#include #include #include #include @@ -42,15 +43,15 @@ class BamMultiReaderPrivate { // file operations void Close(void); - bool Open(const std::vector& filenames, - bool openIndexes = true, - bool coreMode = false, - bool preferStandardIndex = false); - bool IsIndexLoaded(void) const; + void CloseFile(const std::string& filename); + void CloseFiles(const std::vector& filenames); + const std::vector Filenames(void) const; bool Jump(int refID, int position = 0); + bool Open(const std::vector& filenames); + bool OpenFile(const std::string& filename); void PrintFilenames(void) const; - bool SetRegion(const BamRegion& region); bool Rewind(void); + bool SetRegion(const BamRegion& region); // access alignment data bool GetNextAlignment(BamAlignment& al); @@ -59,25 +60,30 @@ class BamMultiReaderPrivate { void SetSortOrder(const BamMultiReader::SortOrder& order); // access auxiliary data - const std::string GetHeaderText(void) const; - const int GetReferenceCount(void) const; + SamHeader GetHeader(void) const; + std::string GetHeaderText(void) const; + int GetReferenceCount(void) const; const BamTools::RefVector GetReferenceData(void) const; - const int GetReferenceID(const std::string& refName) const; + int GetReferenceID(const std::string& refName) const; // BAM index operations - bool CreateIndexes(bool useStandardIndex = 
true); - void SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode); + bool CreateIndexes(const BamIndex::IndexType& type = BamIndex::STANDARD); + bool HasIndexes(void) const; + bool LocateIndexes(const BamIndex::IndexType& preferredType = BamIndex::STANDARD); + bool OpenIndexes(const std::vector& indexFilenames); + void SetIndexCacheMode(const BamIndex::IndexCacheMode mode); - // internal methods - private: + // 'internal' methods + public: IBamMultiMerger* CreateMergerForCurrentSortOrder(void) const; const std::string ExtractReadGroup(const std::string& headerLine) const; - bool LoadNextAlignment(BamAlignment& al, bool coreMode); + bool HasAlignmentData(void) const; + bool LoadNextAlignment(BamAlignment& al); + BamTools::BamReader* OpenReader(const std::string& filename); + bool RewindReaders(void); void SaveNextAlignment(BamTools::BamReader* reader, BamTools::BamAlignment* alignment); const std::vector SplitHeaderText(const std::string& headerText) const; - // updates our alignment cache - void UpdateAlignments(void); - // validates that we have a congruent set of BAM files that are aligned against the same reference sequences + void UpdateAlignmentCache(void); void ValidateReaders(void) const; // data members diff --git a/src/api/internal/BamRandomAccessController_p.cpp b/src/api/internal/BamRandomAccessController_p.cpp new file mode 100644 index 0000000..a785610 --- /dev/null +++ b/src/api/internal/BamRandomAccessController_p.cpp @@ -0,0 +1,274 @@ +// *************************************************************************** +// BamRandomAccessController_p.cpp (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. 
+// ---------------------------------------------------------------------------
+// Last modified: 21 March 2011(DB)
+// ---------------------------------------------------------------------------
+// Manages random access operations in a BAM file
+// **************************************************************************
+
+// NOTE(review): the include targets were missing from this copy of the patch.
+// The five below are reconstructed from the names this file uses - confirm.
+#include <api/BamIndex.h>
+#include <api/internal/BamIndexFactory_p.h>
+#include <api/internal/BamRandomAccessController_p.h>
+#include <api/internal/BamReader_p.h>
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <iostream>
+using namespace std;
+
+// ctor - no index yet, limited index caching, 'has alignments' flag set
+BamRandomAccessController::BamRandomAccessController(void)
+    : m_index(0)
+    , m_indexCacheMode(BamIndex::LimitedIndexCaching)
+    , m_hasAlignmentsInRegion(true)
+{ }
+
+// dtor - releases the owned index and clears the region (see Close())
+BamRandomAccessController::~BamRandomAccessController(void) {
+    Close();
+}
+
+// adjusts the stored region so that it starts on the first reference (within
+// the region) that the index reports as having alignments
+//
+// referenceCount : number of references in the BAM file; (referenceCount - 1)
+//                  is used as the right-most reference ID when the region
+//                  has no right bound
+//
+// does nothing if no index is set; otherwise updates m_hasAlignmentsInRegion
+void BamRandomAccessController::AdjustRegion(const int& referenceCount) {
+
+    // skip if no index available
+    if ( m_index == 0 )
+        return;
+
+    // see if any references in region have alignments
+    m_hasAlignmentsInRegion = false;
+    int currentId = m_region.LeftRefID;
+    const int rightBoundRefId = ( m_region.isRightBoundSpecified() ? m_region.RightRefID : referenceCount - 1 );
+    while ( currentId <= rightBoundRefId ) {
+        m_hasAlignmentsInRegion = m_index->HasAlignments(currentId);
+        if ( m_hasAlignmentsInRegion ) break;
+        ++currentId;
+    }
+
+    // if no data found on any reference in region
+    if ( !m_hasAlignmentsInRegion )
+        return;
+
+    // if left bound of desired region had no data, use first reference that had data
+    // otherwise, leave requested region as-is
+    if ( currentId != m_region.LeftRefID ) {
+        m_region.LeftRefID = currentId;
+        m_region.LeftPosition = 0;
+    }
+}
+
+// returns alignments' "RegionState": { Before|Overlaps|After } current region
+//
+// - no left bound on the region    : always OverlapsRegion
+// - unmapped alignment (RefID -1)  : AfterRegion, so that callers stop reading
+// - otherwise the alignment's RefID/Position (and end position, for reads that
+//   start before the left bound) are compared against the region's bounds
+BamRandomAccessController::RegionState
+BamRandomAccessController::AlignmentState(const BamAlignment& alignment) const {
+
+    // if region has no left bound at all
+    if ( !m_region.isLeftBoundSpecified() )
+        return OverlapsRegion;
+
+    // handle unmapped reads - return AFTER region to halt processing
+    if ( alignment.RefID == -1 )
+        return AfterRegion;
+
+    // if alignment is on any reference before left bound reference
+    if ( alignment.RefID < m_region.LeftRefID )
+        return BeforeRegion;
+
+    // if alignment is on left bound reference
+    else if ( alignment.RefID == m_region.LeftRefID ) {
+
+        // if alignment starts at or after left bound position
+        if ( alignment.Position >= m_region.LeftPosition) {
+
+            if ( m_region.isRightBoundSpecified() &&             // right bound is specified AND
+                 m_region.LeftRefID == m_region.RightRefID &&    // left & right bounds on same reference AND
+                 alignment.Position > m_region.RightPosition )   // alignment starts after right bound position
+                return AfterRegion;
+
+            // otherwise, alignment overlaps region
+            else return OverlapsRegion;
+        }
+
+        // alignment starts before left bound position
+        else {
+
+            // if alignment overlaps left bound position
+            if ( alignment.GetEndPosition() >= m_region.LeftPosition )
+                return OverlapsRegion;
+            else
+                return BeforeRegion;
+        }
+    }
+
+    // otherwise alignment is on a reference after left bound reference
+    else {
+
+        // if region has a right bound
+        if ( m_region.isRightBoundSpecified() ) {
+
+            // alignment is on any reference between boundaries
+            if ( alignment.RefID < m_region.RightRefID )
+                return OverlapsRegion;
+
+            // alignment is on any reference after right boundary
+            else if ( alignment.RefID > m_region.RightRefID )
+                return AfterRegion;
+
+            // alignment is on right bound reference
+            else {
+
+                // if alignment starts on or before right bound position
+                if ( alignment.Position <= m_region.RightPosition )
+                    return OverlapsRegion;
+                else
+                    return AfterRegion;
+            }
+        }
+
+        // otherwise, alignment starts after left bound and there is no right bound
+        else return OverlapsRegion;
+    }
+}
+
+// releases the index and resets the region
+void BamRandomAccessController::Close(void) {
+    ClearIndex();
+    ClearRegion();
+}
+
+// deletes the owned index (if any) and nulls the pointer
+void BamRandomAccessController::ClearIndex(void) {
+    delete m_index;
+    m_index = 0;
+}
+
+// clears the stored region and resets the 'has alignments' flag to true
+void BamRandomAccessController::ClearRegion(void) {
+    m_region.clear();
+    m_hasAlignmentsInRegion = true;
+}
+
+// builds an index of the requested type from 'reader', installs it as the
+// current index, and writes it to disk using reader->Filename()
+//
+// returns true only if the index was created, built AND written
+// - on a create/build failure the previously installed index is left untouched
+// - on a write failure the freshly built index stays installed (it is still
+//   usable in memory), but false is returned
+bool BamRandomAccessController::CreateIndex(BamReaderPrivate* reader,
+                                            const BamIndex::IndexType& type) {
+
+    // skip if reader is invalid
+    if ( reader == 0 )
+        return false;
+
+    // create new index of requested type
+    BamIndex* newIndex = BamIndexFactory::CreateIndexOfType(type);
+    if ( newIndex == 0 ) {
+        cerr << "BamRandomAccessController ERROR: could not create index of type " << type << endl;
+        return false;
+    }
+
+    // attempt to build index from current BamReader file
+    if ( !newIndex->Build(reader) ) {
+        cerr << "BamRandomAccessController ERROR: could not build index on BAM file: " << reader->Filename() << endl;
+
+        // we still own the half-built index here - release it
+        delete newIndex;
+        return false;
+    }
+
+    // save new index (controller takes ownership from here on)
+    SetIndex(newIndex);
+
+    // attempt to write new index file
+    // (failure is a 'false' return, matching Build() above & Load() in OpenIndex())
+    if ( !newIndex->Write(reader->Filename()) ) {
+        cerr << "BamRandomAccessController ERROR: could not save new index for BAM file: " << reader->Filename() << endl;
+        return false;
+    }
+
+    // set new index's cache mode & return success
+    newIndex->SetCacheMode(m_indexCacheMode);
+    return true;
+}
+
+// returns true if an index is currently installed
+bool BamRandomAccessController::HasIndex(void) const {
+    return ( m_index != 0 );
+}
+
+// returns true if the stored region is not null
+bool BamRandomAccessController::HasRegion(void) const {
+    return ( !m_region.isNull() );
+}
+
+// asks the index whether reference 'refId' has alignments
+// N.B. - dereferences the index without a null check;
+//        callers must make sure HasIndex() is true first
+bool BamRandomAccessController::IndexHasAlignmentsForReference(const int& refId) {
+    return m_index->HasAlignments(refId);
+}
+
+// looks for an index file belonging to 'bamFilename' and opens it if found
+// returns false if no index file was found, else the result of OpenIndex()
+bool BamRandomAccessController::LocateIndex(const string& bamFilename,
+                                            const BamIndex::IndexType& preferredType)
+{
+    // look up index filename, deferring to preferredType if possible
+    const string& indexFilename = BamIndexFactory::FindIndexFilename(bamFilename, preferredType);
+
+    // if no index file found (of any type)
+    if ( indexFilename.empty() )
+        return false;
+
+    // otherwise open & use index file that was found
+    return OpenIndex(indexFilename);
+}
+
+// creates an index object for 'indexFilename', loads its data and installs it
+// returns true on success; on failure the previously installed index (if any)
+// is left untouched
+bool BamRandomAccessController::OpenIndex(const string& indexFilename) {
+
+    // attempt create new index of type based on filename
+    BamIndex* index = BamIndexFactory::CreateIndexFromFilename(indexFilename);
+    if ( index == 0 ) {
+        cerr << "BamRandomAccessController ERROR: could not create index for file: " << indexFilename << endl;
+        return false;
+    }
+
+    // set cache mode
+    index->SetCacheMode(m_indexCacheMode);
+
+    // attempt to load data from index file
+    if ( !index->Load(indexFilename) ) {
+        cerr << "BamRandomAccessController ERROR: could not load index data from file: " << indexFilename << endl;
+
+        // index was never handed to SetIndex() - release it here
+        delete index;
+        return false;
+    }
+
+    // save new index & return success
+    SetIndex(index);
+    return true;
+}
+
+// returns the 'has alignments' flag for the current region
+bool BamRandomAccessController::RegionHasAlignments(void) const {
+    return m_hasAlignmentsInRegion;
+}
+
+// installs 'index' as the current index, taking ownership of it
+// any previously installed index is deleted
+void BamRandomAccessController::SetIndex(BamIndex* index) {
+
+    // re-installing the index we already own must not delete it
+    if ( index == m_index )
+        return;
+
+    if ( m_index )
+        ClearIndex();
+    m_index = index;
+}
+
+// stores the cache mode & applies it to the current index (if any)
+void BamRandomAccessController::SetIndexCacheMode(const BamIndex::IndexCacheMode& mode) {
+    m_indexCacheMode = mode;
+    if ( m_index )
+        m_index->SetCacheMode(mode);
+}
+
+// stores 'region' and, if an index is installed, jumps 'reader' to it
+//
+// returns false if no index is installed (the region is still stored)
+// returns true  if the index reports no alignments anywhere in the region
+// otherwise returns the result of the index's Jump()
+bool BamRandomAccessController::SetRegion(BamReaderPrivate* reader,
+                                          const BamRegion& region,
+                                          const int& referenceCount)
+{
+    // store region
+    m_region = region;
+
+    // cannot jump when no index is available
+    if ( !HasIndex() )
+        return false;
+
+    // adjust region as necessary to reflect where data actually begins
+    AdjustRegion(referenceCount);
+
+    // if no data present, return true
+    //   * Not an error, but future attempts to access alignments in this region will not return data
+    //     Returning true is useful in a BamMultiReader setting where some BAM files may
+    //     lack alignments in regions where other BAMs do have data.
+    if ( !m_hasAlignmentsInRegion )
+        return true;
+
+    // return success/failure of jump to specified region,
+    //
+    //  * Index::Jump() is allowed to modify the m_hasAlignmentsInRegion flag
+    //    This covers 'corner case' where a region is requested that lies beyond the last
+    //    alignment on a reference. If this occurs, any subsequent calls to GetNextAlignment[Core]
+    //    will not return data. BamMultiReader will still be able to successfully pull alignments
+    //    from a region from multiple files even if one or more have no data.
+    return m_index->Jump(reader, m_region, &m_hasAlignmentsInRegion);
+}
diff --git a/src/api/internal/BamRandomAccessController_p.h b/src/api/internal/BamRandomAccessController_p.h
new file mode 100644
index 0000000..e541cdf
--- /dev/null
+++ b/src/api/internal/BamRandomAccessController_p.h
@@ -0,0 +1,94 @@
+// ***************************************************************************
+// BamRandomAccessController_p.h (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// --------------------------------------------------------------------------- +// Last modified: 24 February 2011(DB) +// --------------------------------------------------------------------------- +// Manages random access operations in a BAM file +// *************************************************************************** + +#ifndef BAMRACONTROLLER_P_H +#define BAMRACONTROLLER_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. + +#include +#include + +namespace BamTools { + +class BamAlignment; + +namespace Internal { + +class BamReaderPrivate; + +class BamRandomAccessController { + + // enums + public: enum RegionState { BeforeRegion = 0 + , OverlapsRegion + , AfterRegion + }; + + // ctor & dtor + public: + BamRandomAccessController(void); + ~BamRandomAccessController(void); + + // general interface + public: + void Close(void); + + // index operations + public: + // + void ClearIndex(void); + bool CreateIndex(BamReaderPrivate* reader, const BamIndex::IndexType& type); + bool HasIndex(void) const; + bool IndexHasAlignmentsForReference(const int& refId); + bool LocateIndex(const std::string& bamFilename, const BamIndex::IndexType& preferredType); + bool OpenIndex(const std::string& indexFilename); + void SetIndex(BamIndex* index); + void SetIndexCacheMode(const BamIndex::IndexCacheMode& mode); + + // region operations + public: + void ClearRegion(void); + bool HasRegion(void) const; + RegionState AlignmentState(const BamAlignment& alignment) const; + bool RegionHasAlignments(void) const; + bool SetRegion(BamReaderPrivate* reader, + const BamRegion& region, + const int& referenceCount); + + // 'internal' methods + public: + // adjusts requested region if necessary (depending on where data actually begins) + void AdjustRegion(const int& 
referenceCount); + + // data members + private: + + // index data + BamIndex* m_index; // owns index, not a copy - responsible for deleting + BamIndex::IndexCacheMode m_indexCacheMode; + + // region data + BamRegion m_region; + bool m_hasAlignmentsInRegion; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // BAMRACONTROLLER_P_H diff --git a/src/api/internal/BamReader_p.cpp b/src/api/internal/BamReader_p.cpp index e0a3d90..441e8c0 100644 --- a/src/api/internal/BamReader_p.cpp +++ b/src/api/internal/BamReader_p.cpp @@ -3,17 +3,19 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 11 January 2011 (DB) +// Last modified: 21 March 2011 (DB) // --------------------------------------------------------------------------- // Provides the basic functionality for reading BAM files // *************************************************************************** +#include #include -#include #include +#include #include #include #include +#include using namespace BamTools; using namespace BamTools::Internal; @@ -25,172 +27,121 @@ using namespace std; // constructor BamReaderPrivate::BamReaderPrivate(BamReader* parent) - : Index(0) - , HasIndex(false) - , AlignmentsBeginOffset(0) - , IndexCacheMode(BamIndex::LimitedIndexCaching) - , HasAlignmentsInRegion(true) - , Parent(parent) - , m_header(new BamHeader) - , DNA_LOOKUP("=ACMGRSVTWYHKDBN") - , CIGAR_LOOKUP("MIDNSHP") + : m_alignmentsBeginOffset(0) + , m_parent(parent) { - IsBigEndian = SystemIsBigEndian(); + m_isBigEndian = BamTools::SystemIsBigEndian(); } // destructor BamReaderPrivate::~BamReaderPrivate(void) { - Close(); - - delete m_header; - m_header = 0; -} - -// adjusts requested region if necessary (depending on where data actually begins) -void BamReaderPrivate::AdjustRegion(BamRegion& region) { - - // check for valid index first - if ( Index == 0 ) return; - - // see if any 
references in region have alignments - HasAlignmentsInRegion = false; - int currentId = region.LeftRefID; - - const int rightBoundRefId = ( region.isRightBoundSpecified() ? region.RightRefID : References.size() - 1 ); - while ( currentId <= rightBoundRefId ) { - HasAlignmentsInRegion = Index->HasAlignments(currentId); - if ( HasAlignmentsInRegion ) break; - ++currentId; - } - - // if no data found on any reference in region - if ( !HasAlignmentsInRegion ) return; - - // if left bound of desired region had no data, use first reference that had data - // otherwise, leave requested region as-is - if ( currentId != region.LeftRefID ) { - region.LeftRefID = currentId; - region.LeftPosition = 0; - } -} - -// clear index data structure -void BamReaderPrivate::ClearIndex(void) { - delete Index; - Index = 0; - HasIndex = false; } // closes the BAM file void BamReaderPrivate::Close(void) { - // close BGZF file stream - mBGZF.Close(); - - // clear out index data - ClearIndex(); + // clear header & reference data + m_references.clear(); + m_header.Clear(); - // clear out header data - m_header->Clear(); + // close internal + m_randomAccessController.Close(); + m_stream.Close(); - // clear out region flags - Region.clear(); + // clear filename + m_filename.clear(); } -// creates index for BAM file, saves to file -// default behavior is to create the BAM standard index (".bai") -// set flag to false to create the BamTools-specific index (".bti") -bool BamReaderPrivate::CreateIndex(bool useStandardIndex) { - - // clear out prior index data - ClearIndex(); - - // create index based on type requested - if ( useStandardIndex ) - Index = new BamStandardIndex(&mBGZF, Parent); - else - Index = new BamToolsIndex(&mBGZF, Parent); - - // set index cache mode to full for writing - Index->SetCacheMode(BamIndex::FullIndexCaching); - - // build new index - bool ok = true; - ok &= Index->Build(); - HasIndex = ok; - - // mark empty references - MarkReferences(); - - // attempt to save index 
data to file - ok &= Index->Write(Filename); - - // set client's desired index cache mode - Index->SetCacheMode(IndexCacheMode); +// creates an index file of requested type on current BAM file +bool BamReaderPrivate::CreateIndex(const BamIndex::IndexType& type) { + if ( !IsOpen() ) return false; + return m_randomAccessController.CreateIndex(this, type); +} - // return success/fail of both building & writing index - return ok; +// return path & filename of current BAM file +const string BamReaderPrivate::Filename(void) const { + return m_filename; } -const string BamReaderPrivate::GetHeaderText(void) const { - return m_header->ToString(); +// return header data as std::string +string BamReaderPrivate::GetHeaderText(void) const { + return m_header.ToString(); } -const SamHeader BamReaderPrivate::GetSamHeader(void) const { - return m_header->ToSamHeader(); +// return header data as SamHeader object +SamHeader BamReaderPrivate::GetSamHeader(void) const { + return m_header.ToSamHeader(); } -// get next alignment (from specified region, if given) -bool BamReaderPrivate::GetNextAlignment(BamAlignment& bAlignment) { +// get next alignment (with character data fully parsed) +bool BamReaderPrivate::GetNextAlignment(BamAlignment& alignment) { - // if valid alignment found, attempt to parse char data, and return success/failure - if ( GetNextAlignmentCore(bAlignment) ) - return bAlignment.BuildCharData(); + // if valid alignment found + if ( GetNextAlignmentCore(alignment) ) { + + // store alignment's "source" filename + alignment.Filename = m_filename; + + // return success/failure of parsing char data + return alignment.BuildCharData(); + } // no valid alignment found - else return false; + return false; } // retrieves next available alignment core data (returns success/fail) -// ** DOES NOT parse any character data (read name, bases, qualities, tag data) +// ** DOES NOT populate any character data fields (read name, bases, qualities, tag data, filename) // these can be 
accessed, if necessary, from the supportData // useful for operations requiring ONLY positional or other alignment-related information -bool BamReaderPrivate::GetNextAlignmentCore(BamAlignment& bAlignment) { +bool BamReaderPrivate::GetNextAlignmentCore(BamAlignment& alignment) { - // if region is set but has no alignments - if ( !Region.isNull() && !HasAlignmentsInRegion ) + // skip if region is set but has no alignments + if ( m_randomAccessController.HasRegion() && + !m_randomAccessController.RegionHasAlignments() ) + { return false; + } - // if valid alignment available - if ( LoadNextAlignment(bAlignment) ) { + // if can't read next alignment + if ( !LoadNextAlignment(alignment) ) + return false; - // set core-only flag - bAlignment.SupportData.HasCoreOnly = true; + // check alignment's region-overlap state + BamRandomAccessController::RegionState state = m_randomAccessController.AlignmentState(alignment); - // if region not specified with at least a left boundary, return success - if ( !Region.isLeftBoundSpecified() ) return true; + // if alignment starts after region, no need to keep reading + if ( state == BamRandomAccessController::AfterRegion ) + return false; - // determine region state (before, within, after) - BamReaderPrivate::RegionState state = IsOverlap(bAlignment); + // read until overlap is found + while ( state != BamRandomAccessController::OverlapsRegion ) { - // if alignment lies after region, return false - if ( state == AFTER_REGION ) return false; + // if can't read next alignment + if ( !LoadNextAlignment(alignment) ) + return false; - while ( state != WITHIN_REGION ) { - // if no valid alignment available (likely EOF) return failure - if ( !LoadNextAlignment(bAlignment) ) return false; - // if alignment lies after region, return false (no available read within region) - state = IsOverlap(bAlignment); - if ( state == AFTER_REGION ) return false; - } + // check alignment's region-overlap state + state = 
m_randomAccessController.AlignmentState(alignment); - // return success (alignment found that overlaps region) - return true; + // if alignment starts after region, no need to keep reading + if ( state == BamRandomAccessController::AfterRegion ) + return false; } - // no valid alignment - else return false; + // if we get here, we found the next 'valid' alignment + // (e.g. overlaps current region if one was set, simply the next alignment if not) + alignment.SupportData.HasCoreOnly = true; + return true; +} + +int BamReaderPrivate::GetReferenceCount(void) const { + return m_references.size(); +} + +const RefVector& BamReaderPrivate::GetReferenceData(void) const { + return m_references; } // returns RefID for given RefName (returns References.size() if not found) @@ -198,178 +149,81 @@ int BamReaderPrivate::GetReferenceID(const string& refName) const { // retrieve names from reference data vector refNames; - RefVector::const_iterator refIter = References.begin(); - RefVector::const_iterator refEnd = References.end(); + RefVector::const_iterator refIter = m_references.begin(); + RefVector::const_iterator refEnd = m_references.end(); for ( ; refIter != refEnd; ++refIter) refNames.push_back( (*refIter).RefName ); - // return 'index-of' refName ( if not found, returns refNames.size() ) - return distance(refNames.begin(), find(refNames.begin(), refNames.end(), refName)); + // return 'index-of' refName (or -1 if not found) + int index = distance(refNames.begin(), find(refNames.begin(), refNames.end(), refName)); + if ( index == (int)m_references.size() ) return -1; + else return index; } -// returns region state - whether alignment ends before, overlaps, or starts after currently specified region -// this *internal* method should ONLY called when (at least) IsLeftBoundSpecified == true -BamReaderPrivate::RegionState BamReaderPrivate::IsOverlap(BamAlignment& bAlignment) { - - // if alignment is on any reference sequence before left bound - if ( bAlignment.RefID < 
Region.LeftRefID ) - return BEFORE_REGION; - - // if alignment starts on left bound reference - else if ( bAlignment.RefID == Region.LeftRefID ) { - - // if alignment starts at or after left boundary - if ( bAlignment.Position >= Region.LeftPosition) { - - // if right boundary is specified AND - // left/right boundaries are on same reference AND - // alignment starts past right boundary - if ( Region.isRightBoundSpecified() && - Region.LeftRefID == Region.RightRefID && - bAlignment.Position > Region.RightPosition ) - return AFTER_REGION; - - // otherwise, alignment is within region - else - return WITHIN_REGION; - } - - // alignment starts before left boundary - else { - // check if alignment overlaps left boundary - if ( bAlignment.GetEndPosition() >= Region.LeftPosition ) - return WITHIN_REGION; - else - return BEFORE_REGION; - } - } - - // alignment starts on a reference after the left bound - else { - - // if region has a right boundary - if ( Region.isRightBoundSpecified() ) { - - // alignment is on reference between boundaries - if ( bAlignment.RefID < Region.RightRefID ) - return WITHIN_REGION; - - // alignment is on reference after right boundary - else if ( bAlignment.RefID > Region.RightRefID ) - return AFTER_REGION; - - // alignment is on right bound reference - else { - // check if alignment starts before or at right boundary - if ( bAlignment.Position <= Region.RightPosition ) - return WITHIN_REGION; - else - return AFTER_REGION; - } - } - - // otherwise, alignment is after left bound reference, but there is no right boundary - else return WITHIN_REGION; - } +bool BamReaderPrivate::HasIndex(void) const { + return m_randomAccessController.HasIndex(); } -// load BAM header data -void BamReaderPrivate::LoadHeaderData(void) { - m_header->Load(&mBGZF); +bool BamReaderPrivate::IsOpen(void) const { + return m_stream.IsOpen; } -// load existing index data from BAM index file (".bti" OR ".bai"), return success/fail -bool BamReaderPrivate::LoadIndex(const bool 
lookForIndex, const bool preferStandardIndex) { - - // clear out any existing index data - ClearIndex(); - - // if no index filename provided, so we need to look for available index files - if ( IndexFilename.empty() ) { - - // attempt to load BamIndex based on current Filename provided & preferStandardIndex flag - const BamIndex::PreferredIndexType type = (preferStandardIndex ? BamIndex::STANDARD : BamIndex::BAMTOOLS); - Index = BamIndex::FromBamFilename(Filename, &mBGZF, Parent, type); - - // if null, return failure - if ( Index == 0 ) return false; - - // generate proper IndexFilename based on type of index created - IndexFilename = Filename + Index->Extension(); - } - - else { - - // attempt to load BamIndex based on IndexFilename provided by client - Index = BamIndex::FromIndexFilename(IndexFilename, &mBGZF, Parent); - - // if null, return failure - if ( Index == 0 ) return false; - } - - // set cache mode for BamIndex - Index->SetCacheMode(IndexCacheMode); - - // loading the index data from file - HasIndex = Index->Load(IndexFilename); - - // mark empty references - MarkReferences(); - - // return index status - return HasIndex; +// load BAM header data +bool BamReaderPrivate::LoadHeaderData(void) { + return m_header.Load(&m_stream); } // populates BamAlignment with alignment data under file pointer, returns success/fail -bool BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment) { +bool BamReaderPrivate::LoadNextAlignment(BamAlignment& alignment) { // read in the 'block length' value, make sure it's not zero - char buffer[4]; - mBGZF.Read(buffer, 4); - bAlignment.SupportData.BlockLength = BgzfData::UnpackUnsignedInt(buffer); - if ( IsBigEndian ) { SwapEndian_32(bAlignment.SupportData.BlockLength); } - if ( bAlignment.SupportData.BlockLength == 0 ) return false; + char buffer[sizeof(uint32_t)]; + m_stream.Read(buffer, sizeof(uint32_t)); + alignment.SupportData.BlockLength = BamTools::UnpackUnsignedInt(buffer); + if ( m_isBigEndian ) 
BamTools::SwapEndian_32(alignment.SupportData.BlockLength); + if ( alignment.SupportData.BlockLength == 0 ) return false; // read in core alignment data, make sure the right size of data was read - char x[BAM_CORE_SIZE]; - if ( mBGZF.Read(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) + char x[Constants::BAM_CORE_SIZE]; + if ( m_stream.Read(x, Constants::BAM_CORE_SIZE) != Constants::BAM_CORE_SIZE ) return false; - if ( IsBigEndian ) { - for ( int i = 0; i < BAM_CORE_SIZE; i+=sizeof(uint32_t) ) - SwapEndian_32p(&x[i]); + // swap core endian-ness if necessary + if ( m_isBigEndian ) { + for ( int i = 0; i < Constants::BAM_CORE_SIZE; i+=sizeof(uint32_t) ) + BamTools::SwapEndian_32p(&x[i]); } // set BamAlignment 'core' and 'support' data - bAlignment.RefID = BgzfData::UnpackSignedInt(&x[0]); - bAlignment.Position = BgzfData::UnpackSignedInt(&x[4]); + alignment.RefID = BamTools::UnpackSignedInt(&x[0]); + alignment.Position = BamTools::UnpackSignedInt(&x[4]); - unsigned int tempValue = BgzfData::UnpackUnsignedInt(&x[8]); - bAlignment.Bin = tempValue >> 16; - bAlignment.MapQuality = tempValue >> 8 & 0xff; - bAlignment.SupportData.QueryNameLength = tempValue & 0xff; + unsigned int tempValue = BamTools::UnpackUnsignedInt(&x[8]); + alignment.Bin = tempValue >> 16; + alignment.MapQuality = tempValue >> 8 & 0xff; + alignment.SupportData.QueryNameLength = tempValue & 0xff; - tempValue = BgzfData::UnpackUnsignedInt(&x[12]); - bAlignment.AlignmentFlag = tempValue >> 16; - bAlignment.SupportData.NumCigarOperations = tempValue & 0xffff; + tempValue = BamTools::UnpackUnsignedInt(&x[12]); + alignment.AlignmentFlag = tempValue >> 16; + alignment.SupportData.NumCigarOperations = tempValue & 0xffff; - bAlignment.SupportData.QuerySequenceLength = BgzfData::UnpackUnsignedInt(&x[16]); - bAlignment.MateRefID = BgzfData::UnpackSignedInt(&x[20]); - bAlignment.MatePosition = BgzfData::UnpackSignedInt(&x[24]); - bAlignment.InsertSize = BgzfData::UnpackSignedInt(&x[28]); + 
alignment.SupportData.QuerySequenceLength = BamTools::UnpackUnsignedInt(&x[16]); + alignment.MateRefID = BamTools::UnpackSignedInt(&x[20]); + alignment.MatePosition = BamTools::UnpackSignedInt(&x[24]); + alignment.InsertSize = BamTools::UnpackSignedInt(&x[28]); // set BamAlignment length - bAlignment.Length = bAlignment.SupportData.QuerySequenceLength; + alignment.Length = alignment.SupportData.QuerySequenceLength; // read in character data - make sure proper data size was read bool readCharDataOK = false; - const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE; + const unsigned int dataLength = alignment.SupportData.BlockLength - Constants::BAM_CORE_SIZE; char* allCharData = (char*)calloc(sizeof(char), dataLength); - if ( mBGZF.Read(allCharData, dataLength) == (signed int)dataLength) { + if ( m_stream.Read(allCharData, dataLength) == (signed int)dataLength ) { // store 'allCharData' in supportData structure - bAlignment.SupportData.AllCharData.assign((const char*)allCharData, dataLength); + alignment.SupportData.AllCharData.assign((const char*)allCharData, dataLength); // set success flag readCharDataOK = true; @@ -377,180 +231,140 @@ bool BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment) { // save CIGAR ops // need to calculate this here so that BamAlignment::GetEndPosition() performs correctly, // even when GetNextAlignmentCore() is called - const unsigned int cigarDataOffset = bAlignment.SupportData.QueryNameLength; + const unsigned int cigarDataOffset = alignment.SupportData.QueryNameLength; uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset); CigarOp op; - bAlignment.CigarData.clear(); - bAlignment.CigarData.reserve(bAlignment.SupportData.NumCigarOperations); - for (unsigned int i = 0; i < bAlignment.SupportData.NumCigarOperations; ++i) { + alignment.CigarData.clear(); + alignment.CigarData.reserve(alignment.SupportData.NumCigarOperations); + for ( unsigned int i = 0; i < 
alignment.SupportData.NumCigarOperations; ++i ) { - // swap if necessary - if ( IsBigEndian ) SwapEndian_32(cigarData[i]); + // swap endian-ness if necessary + if ( m_isBigEndian ) BamTools::SwapEndian_32(cigarData[i]); // build CigarOp structure - op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT); - op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ]; + op.Length = (cigarData[i] >> Constants::BAM_CIGAR_SHIFT); + op.Type = Constants::BAM_CIGAR_LOOKUP[ (cigarData[i] & Constants::BAM_CIGAR_MASK) ]; // save CigarOp - bAlignment.CigarData.push_back(op); + alignment.CigarData.push_back(op); } } + // clean up & return parsing success/failure free(allCharData); return readCharDataOK; } // loads reference data from BAM file -void BamReaderPrivate::LoadReferenceData(void) { +bool BamReaderPrivate::LoadReferenceData(void) { // get number of reference sequences - char buffer[4]; - mBGZF.Read(buffer, 4); - unsigned int numberRefSeqs = BgzfData::UnpackUnsignedInt(buffer); - if ( IsBigEndian ) SwapEndian_32(numberRefSeqs); - if ( numberRefSeqs == 0 ) return; - References.reserve((int)numberRefSeqs); + char buffer[sizeof(uint32_t)]; + m_stream.Read(buffer, sizeof(uint32_t)); + uint32_t numberRefSeqs = BamTools::UnpackUnsignedInt(buffer); + if ( m_isBigEndian ) BamTools::SwapEndian_32(numberRefSeqs); + m_references.reserve((int)numberRefSeqs); // iterate over all references in header - for (unsigned int i = 0; i != numberRefSeqs; ++i) { + for ( unsigned int i = 0; i != numberRefSeqs; ++i ) { // get length of reference name - mBGZF.Read(buffer, 4); - unsigned int refNameLength = BgzfData::UnpackUnsignedInt(buffer); - if ( IsBigEndian ) SwapEndian_32(refNameLength); + m_stream.Read(buffer, sizeof(uint32_t)); + uint32_t refNameLength = BamTools::UnpackUnsignedInt(buffer); + if ( m_isBigEndian ) BamTools::SwapEndian_32(refNameLength); char* refName = (char*)calloc(refNameLength, 1); // get reference name and reference sequence length - mBGZF.Read(refName, refNameLength); - 
mBGZF.Read(buffer, 4); - int refLength = BgzfData::UnpackSignedInt(buffer); - if ( IsBigEndian ) SwapEndian_32(refLength); + m_stream.Read(refName, refNameLength); + m_stream.Read(buffer, sizeof(int32_t)); + int32_t refLength = BamTools::UnpackSignedInt(buffer); + if ( m_isBigEndian ) BamTools::SwapEndian_32(refLength); // store data for reference RefData aReference; aReference.RefName = (string)((const char*)refName); aReference.RefLength = refLength; - References.push_back(aReference); + m_references.push_back(aReference); // clean up calloc-ed temp variable free(refName); } -} -// mark references with no alignment data -void BamReaderPrivate::MarkReferences(void) { - - // ensure index is available - if ( !HasIndex ) return; + // return success + return true; +} - // mark empty references - for ( int i = 0; i < (int)References.size(); ++i ) - References.at(i).RefHasAlignments = Index->HasAlignments(i); +bool BamReaderPrivate::LocateIndex(const BamIndex::IndexType& preferredType) { + return m_randomAccessController.LocateIndex(m_filename, preferredType); } // opens BAM file (and index) -bool BamReaderPrivate::Open(const string& filename, - const string& indexFilename, - const bool lookForIndex, - const bool preferStandardIndex) -{ - // store filenames - Filename = filename; - IndexFilename = indexFilename; +bool BamReaderPrivate::Open(const string& filename) { - // open the BGZF file for reading, return false on failure - if ( !mBGZF.Open(filename, "rb") ) return false; + // close current BAM file if open + if ( m_stream.IsOpen ) + Close(); - // retrieve header text & reference data - LoadHeaderData(); - LoadReferenceData(); + // attempt to open BgzfStream for reading + if ( !m_stream.Open(filename, "rb") ) + return false; - // store file offset of first alignment - AlignmentsBeginOffset = mBGZF.Tell(); + // attempt to load header data + if ( !LoadHeaderData() ) + return false; - // if no index filename provided - if ( IndexFilename.empty() ) { + // attempt to 
load reference data + if ( !LoadReferenceData() ) + return false; - // client did not specify that index SHOULD be found - // useful for cases where sequential access is all that is required - if ( !lookForIndex ) return true; + // if all OK, store filename & offset of first alignment + m_filename = filename; + m_alignmentsBeginOffset = m_stream.Tell(); - // otherwise, look for index file, return success/fail - else return LoadIndex(lookForIndex, preferStandardIndex) ; - } + // return success + return true; +} - // client supplied an index filename - // attempt to load index data, return success/fail - return LoadIndex(lookForIndex, preferStandardIndex); +bool BamReaderPrivate::OpenIndex(const std::string& indexFilename) { + return m_randomAccessController.OpenIndex(indexFilename); } // returns BAM file pointer to beginning of alignment data bool BamReaderPrivate::Rewind(void) { - // rewind to first alignment, return false if unable to seek - if ( !mBGZF.Seek(AlignmentsBeginOffset) ) return false; + // attempt rewind to first alignment + if ( !m_stream.Seek(m_alignmentsBeginOffset) ) + return false; - // retrieve first alignment data, return false if unable to read + // verify that we can read first alignment BamAlignment al; - if ( !LoadNextAlignment(al) ) return false; + if ( !LoadNextAlignment(al) ) + return false; - // reset default region info using first alignment in file - Region.clear(); - HasAlignmentsInRegion = true; + // reset region + m_randomAccessController.ClearRegion(); // rewind back to beginning of first alignment // return success/fail of seek - return mBGZF.Seek(AlignmentsBeginOffset); + return m_stream.Seek(m_alignmentsBeginOffset); +} + +void BamReaderPrivate::SetIndex(BamIndex* index) { + m_randomAccessController.SetIndex(index); } // change the index caching behavior -void BamReaderPrivate::SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode) { - IndexCacheMode = mode; - if ( Index == 0 ) return; - Index->SetCacheMode(mode); +void 
BamReaderPrivate::SetIndexCacheMode(const BamIndex::IndexCacheMode& mode) { + m_randomAccessController.SetIndexCacheMode(mode); } -// asks Index to attempt a Jump() to specified region +// sets current region & attempts to jump to it // returns success/failure bool BamReaderPrivate::SetRegion(const BamRegion& region) { + return m_randomAccessController.SetRegion(this, region, m_references.size()); +} - // clear out any prior BamReader region data - // - // N.B. - this is cleared so that BamIndex now has free reign to call - // GetNextAlignmentCore() and do overlap checking without worrying about BamReader - // performing any overlap checking of its own and moving on to the next read... Calls - // to GetNextAlignmentCore() with no Region set, simply return the next alignment. - // This ensures that the Index is able to do just that. (All without exposing - // LoadNextAlignment() to the public API, and potentially confusing clients with the nomenclature) - Region.clear(); - - // check for existing index - if ( !HasIndex ) return false; - - // adjust region if necessary to reflect where data actually begins - BamRegion adjustedRegion(region); - AdjustRegion(adjustedRegion); - - // if no data present, return true - // not an error, but BamReader knows that no data is there for future alignment access - // (this is useful in a MultiBamReader setting where some BAM files may lack data in regions - // that other BAMs have data) - if ( !HasAlignmentsInRegion ) { - Region = adjustedRegion; - return true; - } - - // attempt jump to user-specified region return false if jump could not be performed at all - // (invalid index, unknown reference, etc) - // - // Index::Jump() is allowed to modify the HasAlignmentsInRegion flag - // * This covers case where a region is requested that lies beyond the last alignment on a reference - // If this occurs, any subsequent calls to GetNexAlignment[Core] simply return false - // BamMultiReader is then able to successfully pull alignments 
from a region from multiple files - // even if one or more have no data. - if ( !Index->Jump(adjustedRegion, &HasAlignmentsInRegion) ) return false; - - // save region and return success - Region = adjustedRegion; - return true; +// returns handle to internal BgzfStream +BgzfStream* BamReaderPrivate::Stream(void) { + return &m_stream; } diff --git a/src/api/internal/BamReader_p.h b/src/api/internal/BamReader_p.h index 3d49a63..7dda67f 100644 --- a/src/api/internal/BamReader_p.h +++ b/src/api/internal/BamReader_p.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 11 January 2011 (DB) +// Last modified: 24 February 2011 (DB) // --------------------------------------------------------------------------- // Provides the basic functionality for reading BAM files // *************************************************************************** @@ -23,114 +23,85 @@ #include #include -#include +#include #include +#include +#include +#include #include namespace BamTools { - -class BamReader; -class SamHeader; - namespace Internal { -class BamHeader; - class BamReaderPrivate { - // enums - public: enum RegionState { BEFORE_REGION = 0 - , WITHIN_REGION - , AFTER_REGION - }; - // ctor & dtor public: BamReaderPrivate(BamReader* parent); ~BamReaderPrivate(void); - // 'public' interface to BamReader + // BamReader interface public: // file operations void Close(void); - bool Open(const std::string& filename, - const std::string& indexFilename, - const bool lookForIndex, - const bool preferStandardIndex); + const std::string Filename(void) const; + bool IsOpen(void) const; + bool Open(const std::string& filename); bool Rewind(void); bool SetRegion(const BamRegion& region); // access alignment data - bool GetNextAlignment(BamAlignment& bAlignment); - bool GetNextAlignmentCore(BamAlignment& bAlignment); + bool GetNextAlignment(BamAlignment& 
alignment); + bool GetNextAlignmentCore(BamAlignment& alignment); // access auxiliary data - const std::string GetHeaderText(void) const; - const SamHeader GetSamHeader(void) const; + std::string GetHeaderText(void) const; + SamHeader GetSamHeader(void) const; + int GetReferenceCount(void) const; + const RefVector& GetReferenceData(void) const; int GetReferenceID(const std::string& refName) const; // index operations - bool CreateIndex(bool useStandardIndex); - void SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode); + bool CreateIndex(const BamIndex::IndexType& type); + bool HasIndex(void) const; + bool LocateIndex(const BamIndex::IndexType& preferredType); + bool OpenIndex(const std::string& indexFilename); + void SetIndex(BamIndex* index); + void SetIndexCacheMode(const BamIndex::IndexCacheMode& mode); + + // BamReaderPrivate interface + public: + BgzfStream* Stream(void); // 'internal' methods public: - - // --------------------------------------- - // reading alignments and auxiliary data - - // adjusts requested region if necessary (depending on where data actually begins) - void AdjustRegion(BamRegion& region); - // checks to see if alignment overlaps current region - RegionState IsOverlap(BamAlignment& bAlignment); // retrieves header text from BAM file - void LoadHeaderData(void); + bool LoadHeaderData(void); // retrieves BAM alignment under file pointer - bool LoadNextAlignment(BamAlignment& bAlignment); + // (does no overlap checking or character data parsing) + bool LoadNextAlignment(BamAlignment& alignment); // builds reference data structure from BAM file - void LoadReferenceData(void); - // mark references with 'HasAlignments' status - void MarkReferences(void); - - // --------------------------------- - // index file handling - - // clear out inernal index data structure - void ClearIndex(void); - // loads index from BAM index file - bool LoadIndex(const bool lookForIndex, const bool preferStandardIndex); + bool LoadReferenceData(void); // 
data members public: - // general file data - BgzfData mBGZF; - BamIndex* Index; - RefVector References; - bool HasIndex; - int64_t AlignmentsBeginOffset; - std::string Filename; - std::string IndexFilename; - - - // index caching mode - BamIndex::BamIndexCacheMode IndexCacheMode; + // general BAM file data + int64_t m_alignmentsBeginOffset; + std::string m_filename; + RefVector m_references; // system data - bool IsBigEndian; - - // user-specified region values - BamRegion Region; - bool HasAlignmentsInRegion; + bool m_isBigEndian; // parent BamReader - BamReader* Parent; - BamHeader* m_header; + BamReader* m_parent; - // BAM character constants - const char* DNA_LOOKUP; - const char* CIGAR_LOOKUP; + // BamReaderPrivate components + BamHeader m_header; + BamRandomAccessController m_randomAccessController; + BgzfStream m_stream; }; } // namespace Internal diff --git a/src/api/internal/BamStandardIndex_p.cpp b/src/api/internal/BamStandardIndex_p.cpp index f243dbc..cf0e2c1 100644 --- a/src/api/internal/BamStandardIndex_p.cpp +++ b/src/api/internal/BamStandardIndex_p.cpp @@ -3,27 +3,29 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. 
// --------------------------------------------------------------------------- -// Last modified: 13 January 2011 (DB) +// Last modified: 21 March 2011 (DB) // --------------------------------------------------------------------------- // Provides index operations for the standardized BAM index format (".bai") // *************************************************************************** #include #include -#include +#include #include +#include using namespace BamTools; using namespace BamTools::Internal; #include #include +#include #include #include #include using namespace std; -BamStandardIndex::BamStandardIndex(BgzfData* bgzf, BamReader* reader) - : BamIndex(bgzf, reader) +BamStandardIndex::BamStandardIndex(void) + : BamIndex() , m_dataBeginOffset(0) , m_hasFullDataCache(false) { @@ -36,8 +38,9 @@ BamStandardIndex::~BamStandardIndex(void) { // calculate bins that overlap region int BamStandardIndex::BinsFromRegion(const BamRegion& region, - const bool isRightBoundSpecified, - uint16_t bins[MAX_BIN]) + const RefVector& references, + const bool isRightBoundSpecified, + uint16_t bins[MAX_BIN]) { // get region boundaries uint32_t begin = (unsigned int)region.LeftPosition; @@ -50,7 +53,7 @@ int BamStandardIndex::BinsFromRegion(const BamRegion& region, // otherwise, use end of left bound reference as cutoff else - end = (unsigned int)m_references.at(region.LeftRefID).RefLength - 1; + end = (unsigned int)references.at(region.LeftRefID).RefLength - 1; // initialize list, bin '0' always a valid bin int i = 0; @@ -68,18 +71,23 @@ int BamStandardIndex::BinsFromRegion(const BamRegion& region, return i; } -// creates index data (in-memory) from current reader data -bool BamStandardIndex::Build(void) { +// creates index data (in-memory) from @reader data +bool BamStandardIndex::Build(Internal::BamReaderPrivate* reader) { - // be sure reader & BGZF file are valid & open for reading - if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen ) + // skip if invalid reader + if ( 
reader == 0 ) return false; - // move file pointer to beginning of alignments - m_reader->Rewind(); + // skip if reader BgzfStream is invalid or not open + BgzfStream* bgzfStream = reader->Stream(); + if ( bgzfStream == 0 || !bgzfStream->IsOpen ) + return false; + + // move reader's file pointer to beginning of alignments + reader->Rewind(); // get reference count, reserve index space - const int numReferences = (int)m_references.size(); + const int numReferences = reader->GetReferenceCount(); m_indexData.clear(); m_hasFullDataCache = false; SetReferenceCount(numReferences); @@ -96,14 +104,14 @@ bool BamStandardIndex::Build(void) { int32_t lastRefID(defaultValue); // offset data - uint64_t saveOffset = m_BGZF->Tell(); + uint64_t saveOffset = bgzfStream->Tell(); uint64_t lastOffset = saveOffset; // coordinate data int32_t lastCoordinate = defaultValue; BamAlignment bAlignment; - while ( m_reader->GetNextAlignmentCore(bAlignment) ) { + while ( reader->LoadNextAlignment(bAlignment) ) { // change of chromosome, save ID, reset bin if ( lastRefID != bAlignment.RefID ) { @@ -113,9 +121,9 @@ bool BamStandardIndex::Build(void) { // if lastCoordinate greater than BAM position - file not sorted properly else if ( lastCoordinate > bAlignment.Position ) { - fprintf(stderr, "BAM file not properly sorted:\n"); - fprintf(stderr, "Alignment %s : %d > %d on reference (id = %d)", bAlignment.Name.c_str(), - lastCoordinate, bAlignment.Position, bAlignment.RefID); + fprintf(stderr, "BamStandardIndex ERROR: file not properly sorted:\n"); + fprintf(stderr, "Alignment %s : %d > %d on reference (id = %d)", + bAlignment.Name.c_str(), lastCoordinate, bAlignment.Position, bAlignment.RefID); exit(1); } @@ -159,13 +167,13 @@ bool BamStandardIndex::Build(void) { } // make sure that current file pointer is beyond lastOffset - if ( m_BGZF->Tell() <= (int64_t)lastOffset ) { - fprintf(stderr, "Error in BGZF offsets.\n"); + if ( bgzfStream->Tell() <= (int64_t)lastOffset ) { + fprintf(stderr, 
"BamStandardIndex ERROR: could not build index - calculating offsets failed.\n"); exit(1); } // update lastOffset - lastOffset = m_BGZF->Tell(); + lastOffset = bgzfStream->Tell(); // update lastCoordinate lastCoordinate = bAlignment.Position; @@ -198,8 +206,8 @@ bool BamStandardIndex::Build(void) { sort(offsets.begin(), offsets.end()); } - // rewind file pointer to beginning of alignments, return success/fail - return m_reader->Rewind(); + // rewind reader's file pointer to beginning of alignments, return success/fail + return reader->Rewind(); } // check index file magic number, return true if OK @@ -211,7 +219,7 @@ bool BamStandardIndex::CheckMagicNumber(void) { // compare to expected value if ( strncmp(magic, "BAI\1", 4) != 0 ) { - fprintf(stderr, "Problem with index file - invalid format.\n"); + fprintf(stderr, "BamStandardIndex ERROR: could not load index file - invalid magic number.\n"); fclose(m_indexStream); return false; } @@ -247,15 +255,16 @@ void BamStandardIndex::ClearReferenceOffsets(const int& refId) { } // return file position after header metadata -const off_t BamStandardIndex::DataBeginOffset(void) const { +off_t BamStandardIndex::DataBeginOffset(void) const { return m_dataBeginOffset; } // calculates offset(s) for a given region bool BamStandardIndex::GetOffsets(const BamRegion& region, - const bool isRightBoundSpecified, - vector& offsets, - bool* hasAlignmentsInRegion) + const RefVector& references, + const bool isRightBoundSpecified, + vector& offsets, + bool* hasAlignmentsInRegion) { // return false if leftBound refID is not found in index data if ( m_indexData.find(region.LeftRefID) == m_indexData.end() ) @@ -271,7 +280,7 @@ bool BamStandardIndex::GetOffsets(const BamRegion& region, // calculate which bins overlap this region uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2); - int numBins = BinsFromRegion(region, isRightBoundSpecified, bins); + int numBins = BinsFromRegion(region, references, isRightBoundSpecified, bins); // get bins for this 
reference BamStandardIndexData::const_iterator indexIter = m_indexData.find(region.LeftRefID); @@ -315,8 +324,8 @@ bool BamStandardIndex::GetOffsets(const BamRegion& region, *hasAlignmentsInRegion = (offsets.size() != 0 ); // if cache mode set to none, dump the data we just loaded - if (m_cacheMode == BamIndex::NoIndexCaching ) - ClearReferenceOffsets(region.LeftRefID); + if ( m_cacheMode == BamIndex::NoIndexCaching ) + ClearReferenceOffsets(region.LeftRefID); // return succes return true; @@ -352,49 +361,59 @@ bool BamStandardIndex::IsDataLoaded(const int& refId) const { } // attempts to use index to jump to region; returns success/fail -bool BamStandardIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) { +bool BamStandardIndex::Jump(Internal::BamReaderPrivate* reader, + const BamTools::BamRegion& region, + bool *hasAlignmentsInRegion) +{ + // skip if invalid reader + if ( reader == 0 ) + return false; - // be sure reader & BGZF file are valid & open for reading - if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen ) + // skip if reader BgzfStream is invalid or not open + BgzfStream* bgzfStream = reader->Stream(); + if ( bgzfStream == 0 || !bgzfStream->IsOpen ) return false; + // retrieve references from reader + const RefVector references = reader->GetReferenceData(); + // make sure left-bound position is valid - if ( region.LeftPosition > m_references.at(region.LeftRefID).RefLength ) + if ( region.LeftPosition > references.at(region.LeftRefID).RefLength ) return false; // calculate offsets for this region // if failed, print message, set flag, and return failure vector offsets; - if ( !GetOffsets(region, region.isRightBoundSpecified(), offsets, hasAlignmentsInRegion) ) { - fprintf(stderr, "ERROR: Could not jump: unable to calculate offset(s) for specified region.\n"); + if ( !GetOffsets(region, references, region.isRightBoundSpecified(), offsets, hasAlignmentsInRegion) ) { + fprintf(stderr, "BamStandardIndex ERROR: could not jump - unable to 
calculate offset candidates for specified region.\n"); *hasAlignmentsInRegion = false; return false; } // iterate through offsets - BamAlignment bAlignment; + BamAlignment alignment; bool result = true; for ( vector::const_iterator o = offsets.begin(); o != offsets.end(); ++o) { // attempt seek & load first available alignment // set flag to true if data exists - result &= m_BGZF->Seek(*o); - *hasAlignmentsInRegion = m_reader->GetNextAlignmentCore(bAlignment); + result &= bgzfStream->Seek(*o); + *hasAlignmentsInRegion = reader->GetNextAlignmentCore(alignment); // if this alignment corresponds to desired position // return success of seeking back to the offset before the 'current offset' (to cover overlaps) - if ( ((bAlignment.RefID == region.LeftRefID) && - ((bAlignment.Position + bAlignment.Length) > region.LeftPosition)) || - (bAlignment.RefID > region.LeftRefID) ) + if ( ((alignment.RefID == region.LeftRefID) && + ((alignment.Position + alignment.Length) > region.LeftPosition)) || + (alignment.RefID > region.LeftRefID) ) { if ( o != offsets.begin() ) --o; - return m_BGZF->Seek(*o); + return bgzfStream->Seek(*o); } } // if error in jumping, print message & set flag if ( !result ) { - fprintf(stderr, "ERROR: Could not jump: unable to determine correct offset for specified region.\n"); + fprintf(stderr, "BamStandardIndex ERROR: could not jump - unable to determine correct offset for specified region.\n"); *hasAlignmentsInRegion = false; } diff --git a/src/api/internal/BamStandardIndex_p.h b/src/api/internal/BamStandardIndex_p.h index da179f4..767606e 100644 --- a/src/api/internal/BamStandardIndex_p.h +++ b/src/api/internal/BamStandardIndex_p.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. 
// --------------------------------------------------------------------------- -// Last modified: 19 November 2010 (DB) +// Last modified: 19 January 2011 (DB) // --------------------------------------------------------------------------- // Provides index operations for the standardized BAM index format (".bai") // *************************************************************************** @@ -36,6 +36,7 @@ namespace Internal { // BAM index constants const int MAX_BIN = 37450; // =(8^6-1)/7+1 const int BAM_LIDX_SHIFT = 14; +const std::string BAI_EXTENSION = ".bai"; // -------------------------------------------------- // BamStandardIndex data structures & typedefs @@ -47,9 +48,9 @@ struct Chunk { // constructor Chunk(const uint64_t& start = 0, - const uint64_t& stop = 0) - : Start(start) - , Stop(stop) + const uint64_t& stop = 0) + : Start(start) + , Stop(stop) { } }; @@ -70,12 +71,12 @@ struct ReferenceIndex { bool HasAlignments; // constructor - ReferenceIndex(const BamBinMap& binMap = BamBinMap(), - const LinearOffsetVector& offsets = LinearOffsetVector(), - const bool hasAlignments = false) - : Bins(binMap) - , Offsets(offsets) - , HasAlignments(hasAlignments) + ReferenceIndex(const BamBinMap& binMap = BamBinMap(), + const LinearOffsetVector& offsets = LinearOffsetVector(), + const bool hasAlignments = false) + : Bins(binMap) + , Offsets(offsets) + , HasAlignments(hasAlignments) { } }; @@ -85,126 +86,131 @@ class BamStandardIndex : public BamIndex { // ctor & dtor public: - BamStandardIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader); - ~BamStandardIndex(void); + BamStandardIndex(void); + ~BamStandardIndex(void); // interface (implements BamIndex virtual methods) public: - // creates index data (in-memory) from current reader data - bool Build(void); - // returns supported file extension - const std::string Extension(void) const { return std::string(".bai"); } - // returns whether reference has alignments or no - bool HasAlignments(const int& 
referenceID) const; - // attempts to use index to jump to region; returns success/fail - // a "successful" jump indicates no error, but not whether this region has data - // * thus, the method sets a flag to indicate whether there are alignments - // available after the jump position - bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion); + // creates index data (in-memory) from @reader data + bool Build(Internal::BamReaderPrivate* reader); + // returns supported file extension + const std::string Extension(void) { return BAI_EXTENSION; } + // returns whether reference has alignments or no + bool HasAlignments(const int& referenceID) const; + // attempts to use index to jump to @region in @reader; returns success/fail + // a "successful" jump indicates no error, but not whether this region has data + // * thus, the method sets a flag to indicate whether there are alignments + // available after the jump position + bool Jump(Internal::BamReaderPrivate* reader, + const BamTools::BamRegion& region, + bool* hasAlignmentsInRegion); + public: - // clear all current index offset data in memory - void ClearAllData(void); - // return file position after header metadata - const off_t DataBeginOffset(void) const; - // return true if all index data is cached - bool HasFullDataCache(void) const; - // clears index data from all references except the first - void KeepOnlyFirstReferenceOffsets(void); - // load index data for all references, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - bool LoadAllReferences(bool saveData = true); - // load first reference from file, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - bool LoadFirstReference(bool saveData = true); - // load header data from index file, return true if loaded OK - bool LoadHeader(void); - // position file pointer to first reference begin, return true if skipped OK - bool 
SkipToFirstReference(void); - // write index reference data - bool WriteAllReferences(void); - // write index header data - bool WriteHeader(void); + // clear all current index offset data in memory + void ClearAllData(void); + // return file position after header metadata + off_t DataBeginOffset(void) const; + // return true if all index data is cached + bool HasFullDataCache(void) const; + // clears index data from all references except the first + void KeepOnlyFirstReferenceOffsets(void); + // load index data for all references, return true if loaded OK + // @saveData - save data in memory if true, just read & discard if false + bool LoadAllReferences(bool saveData = true); + // load first reference from file, return true if loaded OK + // @saveData - save data in memory if true, just read & discard if false + bool LoadFirstReference(bool saveData = true); + // load header data from index file, return true if loaded OK + bool LoadHeader(void); + // position file pointer to first reference begin, return true if skipped OK + bool SkipToFirstReference(void); + // write index reference data + bool WriteAllReferences(void); + // write index header data + bool WriteHeader(void); // 'internal' methods public: - // ----------------------- - // index file operations - - // check index file magic number, return true if OK - bool CheckMagicNumber(void); - // check index file version, return true if OK - bool CheckVersion(void); - // load a single index bin entry from file, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - bool LoadBin(ReferenceIndex& refEntry, bool saveData = true); - bool LoadBins(ReferenceIndex& refEntry, bool saveData = true); - // load a single index bin entry from file, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - bool LoadChunk(ChunkVector& chunks, bool saveData = true); - bool LoadChunks(ChunkVector& chunks, bool saveData = true); - // load a 
single index linear offset entry from file, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - bool LoadLinearOffsets(ReferenceIndex& refEntry, bool saveData = true); - // load a single reference from file, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - bool LoadReference(const int& refId, bool saveData = true); - // loads number of references, return true if loaded OK - bool LoadReferenceCount(int& numReferences); - // position file pointer to desired reference begin, return true if skipped OK - bool SkipToReference(const int& refId); - // write index data for bin to new index file - bool WriteBin(const uint32_t& binId, const ChunkVector& chunks); - // write index data for bins to new index file - bool WriteBins(const BamBinMap& bins); - // write index data for chunk entry to new index file - bool WriteChunk(const Chunk& chunk); - // write index data for chunk entry to new index file - bool WriteChunks(const ChunkVector& chunks); - // write index data for linear offsets entry to new index file - bool WriteLinearOffsets(const LinearOffsetVector& offsets); - // write index data single reference to new index file - bool WriteReference(const ReferenceIndex& refEntry); - - // ----------------------- - // index data operations - - // calculate bins that overlap region - int BinsFromRegion(const BamRegion& region, - const bool isRightBoundSpecified, - uint16_t bins[MAX_BIN]); - // clear all index offset data for desired reference - void ClearReferenceOffsets(const int& refId); - // calculates offset(s) for a given region - bool GetOffsets(const BamRegion& region, - const bool isRightBoundSpecified, - std::vector& offsets, - bool* hasAlignmentsInRegion); - // returns true if index cache has data for desired reference - bool IsDataLoaded(const int& refId) const; - // clears index data from all references except the one specified - void KeepOnlyReferenceOffsets(const 
int& refId); - // simplifies index by merging 'chunks' - void MergeChunks(void); - // saves BAM bin entry for index - void SaveBinEntry(BamBinMap& binMap, - const uint32_t& saveBin, - const uint64_t& saveOffset, - const uint64_t& lastOffset); - // saves linear offset entry for index - void SaveLinearOffset(LinearOffsetVector& offsets, - const BamAlignment& bAlignment, - const uint64_t& lastOffset); - // initializes index data structure to hold @count references - void SetReferenceCount(const int& count); + // ----------------------- + // index file operations + + // check index file magic number, return true if OK + bool CheckMagicNumber(void); + // check index file version, return true if OK + bool CheckVersion(void); + // load a single index bin entry from file, return true if loaded OK + // @saveData - save data in memory if true, just read & discard if false + bool LoadBin(ReferenceIndex& refEntry, bool saveData = true); + bool LoadBins(ReferenceIndex& refEntry, bool saveData = true); + // load a single index bin entry from file, return true if loaded OK + // @saveData - save data in memory if true, just read & discard if false + bool LoadChunk(ChunkVector& chunks, bool saveData = true); + bool LoadChunks(ChunkVector& chunks, bool saveData = true); + // load a single index linear offset entry from file, return true if loaded OK + // @saveData - save data in memory if true, just read & discard if false + bool LoadLinearOffsets(ReferenceIndex& refEntry, bool saveData = true); + // load a single reference from file, return true if loaded OK + // @saveData - save data in memory if true, just read & discard if false + bool LoadReference(const int& refId, bool saveData = true); + // loads number of references, return true if loaded OK + bool LoadReferenceCount(int& numReferences); + // position file pointer to desired reference begin, return true if skipped OK + bool SkipToReference(const int& refId); + // write index data for bin to new index file + bool 
WriteBin(const uint32_t& binId, const ChunkVector& chunks); + // write index data for bins to new index file + bool WriteBins(const BamBinMap& bins); + // write index data for chunk entry to new index file + bool WriteChunk(const Chunk& chunk); + // write index data for chunk entry to new index file + bool WriteChunks(const ChunkVector& chunks); + // write index data for linear offsets entry to new index file + bool WriteLinearOffsets(const LinearOffsetVector& offsets); + // write index data single reference to new index file + bool WriteReference(const ReferenceIndex& refEntry); + + // ----------------------- + // index data operations + + // calculate bins that overlap region + int BinsFromRegion(const BamRegion& region, + const RefVector& references, + const bool isRightBoundSpecified, + uint16_t bins[MAX_BIN]); + // clear all index offset data for desired reference + void ClearReferenceOffsets(const int& refId); + // calculates offset(s) for a given region + bool GetOffsets(const BamRegion& region, + const RefVector& references, + const bool isRightBoundSpecified, + std::vector& offsets, + bool* hasAlignmentsInRegion); + // returns true if index cache has data for desired reference + bool IsDataLoaded(const int& refId) const; + // clears index data from all references except the one specified + void KeepOnlyReferenceOffsets(const int& refId); + // simplifies index by merging 'chunks' + void MergeChunks(void); + // saves BAM bin entry for index + void SaveBinEntry(BamBinMap& binMap, + const uint32_t& saveBin, + const uint64_t& saveOffset, + const uint64_t& lastOffset); + // saves linear offset entry for index + void SaveLinearOffset(LinearOffsetVector& offsets, + const BamAlignment& bAlignment, + const uint64_t& lastOffset); + // initializes index data structure to hold @count references + void SetReferenceCount(const int& count); // data members private: - BamStandardIndexData m_indexData; - off_t m_dataBeginOffset; - bool m_hasFullDataCache; - bool 
m_isBigEndian; + BamStandardIndexData m_indexData; + off_t m_dataBeginOffset; + bool m_hasFullDataCache; + bool m_isBigEndian; }; } // namespace Internal diff --git a/src/api/internal/BamToolsIndex_p.cpp b/src/api/internal/BamToolsIndex_p.cpp index cccb484..954400e 100644 --- a/src/api/internal/BamToolsIndex_p.cpp +++ b/src/api/internal/BamToolsIndex_p.cpp @@ -3,27 +3,29 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 13 January 2011 (DB) +// Last modified: 19 January 2011 (DB) // --------------------------------------------------------------------------- // Provides index operations for the BamTools index format (".bti") // *************************************************************************** #include #include -#include +#include #include +#include using namespace BamTools; using namespace BamTools::Internal; #include #include +#include #include #include #include using namespace std; -BamToolsIndex::BamToolsIndex(BgzfData* bgzf, BamReader* reader) - : BamIndex(bgzf, reader) +BamToolsIndex::BamToolsIndex(void) + : BamIndex() , m_blockSize(1000) , m_dataBeginOffset(0) , m_hasFullDataCache(false) @@ -38,25 +40,30 @@ BamToolsIndex::~BamToolsIndex(void) { ClearAllData(); } -// creates index data (in-memory) from current reader data -bool BamToolsIndex::Build(void) { +// creates index data (in-memory) from @reader data +bool BamToolsIndex::Build(Internal::BamReaderPrivate* reader) { - // be sure reader & BGZF file are valid & open for reading - if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen ) + // skip if invalid reader + if ( reader == 0 ) return false; - // move file pointer to beginning of alignments - if ( !m_reader->Rewind() ) return false; + // skip if reader's BgzfStream is invalid or not open + BgzfStream* bgzfStream = reader->Stream(); + if ( bgzfStream == 0 || !bgzfStream->IsOpen ) + return false; + + // move reader's 
file pointer to beginning of alignments + if ( !reader->Rewind() ) return false; // initialize index data structure with space for all references - const int numReferences = (int)m_references.size(); + const int numReferences = reader->GetReferenceCount(); m_indexData.clear(); m_hasFullDataCache = false; SetReferenceCount(numReferences); // set up counters and markers int32_t currentBlockCount = 0; - int64_t currentAlignmentOffset = m_BGZF->Tell(); + int64_t currentAlignmentOffset = bgzfStream->Tell(); int32_t blockRefId = 0; int32_t blockMaxEndPosition = 0; int64_t blockStartOffset = currentAlignmentOffset; @@ -64,7 +71,7 @@ bool BamToolsIndex::Build(void) { // plow through alignments, storing index entries BamAlignment al; - while ( m_reader->GetNextAlignmentCore(al) ) { + while ( reader->LoadNextAlignment(al) ) { // if block contains data (not the first time through) AND alignment is on a new reference if ( currentBlockCount > 0 && al.RefID != blockRefId ) { @@ -97,13 +104,13 @@ bool BamToolsIndex::Build(void) { if ( currentBlockCount == m_blockSize ) { BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition); SaveOffsetEntry(blockRefId, entry); - blockStartOffset = m_BGZF->Tell(); + blockStartOffset = bgzfStream->Tell(); currentBlockCount = 0; } // not the best name, but for the next iteration, this value will be the offset of the *current* alignment // necessary because we won't know if this next alignment is on a new reference until we actually read it - currentAlignmentOffset = m_BGZF->Tell(); + currentAlignmentOffset = bgzfStream->Tell(); } // store final block with data @@ -114,7 +121,7 @@ bool BamToolsIndex::Build(void) { m_hasFullDataCache = true; // return success/failure of rewind - return m_reader->Rewind(); + return reader->Rewind(); } // check index file magic number, return true if OK @@ -125,7 +132,7 @@ bool BamToolsIndex::CheckMagicNumber(void) { size_t elementsRead = fread(magic, 1, 4, m_indexStream); if ( elementsRead 
!= 4 ) return false; if ( strncmp(magic, "BTI\1", 4) != 0 ) { - fprintf(stderr, "Problem with index file - invalid format.\n"); + fprintf(stderr, "BamToolsIndex ERROR: could not load index file - invalid magic number.\n"); return false; } @@ -143,13 +150,13 @@ bool BamToolsIndex::CheckVersion(void) { // if version is negative, or zero if ( m_inputVersion <= 0 ) { - fprintf(stderr, "Problem with index file - invalid version.\n"); + fprintf(stderr, "BamToolsIndex ERROR: could not load index file - invalid version.\n"); return false; } // if version is newer than can be supported by this version of bamtools else if ( m_inputVersion > m_outputVersion ) { - fprintf(stderr, "Problem with index file - attempting to use an outdated version of BamTools with a newer index file.\n"); + fprintf(stderr, "BamToolsIndex ERROR: could not load index file - this version of BamTools does not recognize new index file version.\n"); fprintf(stderr, "Please update BamTools to a more recent version to support this index file.\n"); return false; } @@ -159,14 +166,14 @@ bool BamToolsIndex::CheckVersion(void) { // (typically whose format did not accomodate a particular bug fix) else if ( (Version)m_inputVersion == BTI_1_0 ) { - fprintf(stderr, "\nProblem with index file - this version of the index contains a bug related to accessing data near reference ends.\n"); - fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date BamToolsIndex.\n\n"); + fprintf(stderr, "BamToolsIndex ERROR: could not load index file - this version of the index contains a bug related to accessing data near reference ends.\n"); + fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date, fixed BTI file.\n\n"); return false; } else if ( (Version)m_inputVersion == BTI_1_1 ) { - fprintf(stderr, "\nProblem with index file - this version of the index contains a bug related to handling empty references.\n"); - fprintf(stderr, "\nPlease run 
\'bamtools index -bti -in yourData.bam\' to generate an up-to-date BamToolsIndex.\n\n"); + fprintf(stderr, "BamToolsIndex ERROR: could not load index file - this version of the index contains a bug related to handling empty references.\n"); + fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date, fixed BTI file.\n\n"); return false; } @@ -193,7 +200,7 @@ void BamToolsIndex::ClearReferenceOffsets(const int& refId) { } // return file position after header metadata -const off_t BamToolsIndex::DataBeginOffset(void) const { +off_t BamToolsIndex::DataBeginOffset(void) const { return m_dataBeginOffset; } @@ -208,10 +215,12 @@ bool BamToolsIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* ha // return false if leftBound refID is not found in index data BamToolsIndexData::const_iterator indexIter = m_indexData.find(region.LeftRefID); - if ( indexIter == m_indexData.end()) return false; + if ( indexIter == m_indexData.end() ) + return false; // load index data for region if not already cached if ( !IsDataLoaded(region.LeftRefID) ) { + bool loadedOk = true; loadedOk &= SkipToReference(region.LeftRefID); loadedOk &= LoadReference(region.LeftRefID); @@ -220,9 +229,12 @@ bool BamToolsIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* ha // localize index data for this reference (& sanity check that data actually exists) indexIter = m_indexData.find(region.LeftRefID); - if ( indexIter == m_indexData.end()) return false; + if ( indexIter == m_indexData.end() ) + return false; + const vector& referenceOffsets = (*indexIter).second.Offsets; - if ( referenceOffsets.empty() ) return false; + if ( referenceOffsets.empty() ) + return false; // ------------------------------------------------------- // calculate nearest index to jump to @@ -235,6 +247,7 @@ bool BamToolsIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* ha vector::const_iterator offsetEnd = referenceOffsets.end(); for ( ; offsetIter != 
offsetEnd; ++offsetIter ) { const BamToolsIndexEntry& entry = (*offsetIter); + // break if alignment 'entry' overlaps region if ( entry.MaxEndPosition >= region.LeftPosition ) break; offset = (*offsetIter).StartOffset; @@ -244,7 +257,7 @@ bool BamToolsIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* ha *hasAlignmentsInRegion = ( offsetIter != offsetEnd ); // if cache mode set to none, dump the data we just loaded - if (m_cacheMode == BamIndex::NoIndexCaching ) + if ( m_cacheMode == BamIndex::NoIndexCaching ) ClearReferenceOffsets(region.LeftRefID); // return success @@ -277,31 +290,36 @@ bool BamToolsIndex::IsDataLoaded(const int& refId) const { return !refEntry.Offsets.empty(); } -// attempts to use index to jump to region; returns success/fail -bool BamToolsIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) { - +// attempts to use index to jump to @region in @reader; returns success/fail +bool BamToolsIndex::Jump(Internal::BamReaderPrivate* reader, + const BamTools::BamRegion& region, + bool* hasAlignmentsInRegion) +{ // clear flag *hasAlignmentsInRegion = false; - // check valid BamReader state - if ( m_reader == 0 || m_BGZF == 0 || !m_reader->IsOpen() ) { - fprintf(stderr, "ERROR: Could not jump: invalid BamReader state.\n"); + // skip if invalid reader + if ( reader == 0 ) return false; + + // skip if reader's BgzfStream is invalid or not open + BgzfStream* bgzfStream = reader->Stream(); + if ( bgzfStream == 0 || !bgzfStream->IsOpen ) return false; - } // make sure left-bound position is valid - if ( region.LeftPosition > m_references.at(region.LeftRefID).RefLength ) + const RefVector& references = reader->GetReferenceData(); + if ( region.LeftPosition > references.at(region.LeftRefID).RefLength ) return false; // calculate nearest offset to jump to int64_t offset; if ( !GetOffset(region, offset, hasAlignmentsInRegion) ) { - fprintf(stderr, "ERROR: Could not jump - unable to calculate offset for specified region.\n"); + 
fprintf(stderr, "BamToolsIndex ERROR: could not jump - unable to calculate offset for specified region.\n"); return false; } // return success/failure of seek - return m_BGZF->Seek(offset); + return bgzfStream->Seek(offset); } // clears index data from all references except the first @@ -357,13 +375,15 @@ bool BamToolsIndex::LoadHeader(void) { // read in block size size_t elementsRead = fread(&m_blockSize, sizeof(m_blockSize), 1, m_indexStream); if ( elementsRead != 1 ) return false; + + // swap endian-ness if necessary if ( m_isBigEndian ) SwapEndian_32(m_blockSize); // store offset of beginning of data m_dataBeginOffset = ftell64(m_indexStream); - // return success/failure of load - return (elementsRead == 1); + // return success + return true; } // load a single index entry from file, return true if loaded OK @@ -376,10 +396,7 @@ bool BamToolsIndex::LoadIndexEntry(const int& refId, bool saveData) { elementsRead += fread(&entry.MaxEndPosition, sizeof(entry.MaxEndPosition), 1, m_indexStream); elementsRead += fread(&entry.StartOffset, sizeof(entry.StartOffset), 1, m_indexStream); elementsRead += fread(&entry.StartPosition, sizeof(entry.StartPosition), 1, m_indexStream); - if ( elementsRead != 3 ) { - cerr << "Error reading index entry. 
Expected 3 elements, read in: " << elementsRead << endl; - return false; - } + if ( elementsRead != 3 ) return false; // swap endian-ness if necessary if ( m_isBigEndian ) { @@ -411,6 +428,8 @@ bool BamToolsIndex::LoadReference(const int& refId, bool saveData) { uint32_t numOffsets; size_t elementsRead = fread(&numOffsets, sizeof(numOffsets), 1, m_indexStream); if ( elementsRead != 1 ) return false; + + // swap endian-ness if necessary if ( m_isBigEndian ) SwapEndian_32(numOffsets); // initialize offsets container for this reference @@ -431,10 +450,13 @@ bool BamToolsIndex::LoadReferenceCount(int& numReferences) { // read reference count elementsRead += fread(&numReferences, sizeof(numReferences), 1, m_indexStream); + if ( elementsRead != 1 ) return false; + + // swap endian-ness if necessary if ( m_isBigEndian ) SwapEndian_32(numReferences); - // return success/failure of load - return ( elementsRead == 1 ); + // return success + return true; } // saves an index offset entry in memory @@ -521,6 +543,7 @@ bool BamToolsIndex::WriteAllReferences(void) { int32_t numReferences = (int32_t)m_indexData.size(); if ( m_isBigEndian ) SwapEndian_32(numReferences); elementsWritten += fwrite(&numReferences, sizeof(numReferences), 1, m_indexStream); + if ( elementsWritten != 1 ) return false; // iterate through references in index bool refOk = true; @@ -529,7 +552,8 @@ bool BamToolsIndex::WriteAllReferences(void) { for ( ; refIter != refEnd; ++refIter ) refOk &= WriteReferenceEntry( (*refIter).second ); - return ( (elementsWritten == 1) && refOk ); + // return success/fail + return refOk; } // write current reference index data to new index file @@ -541,6 +565,7 @@ bool BamToolsIndex::WriteReferenceEntry(const BamToolsReferenceEntry& refEntry) uint32_t numOffsets = refEntry.Offsets.size(); if ( m_isBigEndian ) SwapEndian_32(numOffsets); elementsWritten += fwrite(&numOffsets, sizeof(numOffsets), 1, m_indexStream); + if ( elementsWritten != 1 ) return false; // iterate over offset 
entries bool entriesOk = true; @@ -549,7 +574,8 @@ bool BamToolsIndex::WriteReferenceEntry(const BamToolsReferenceEntry& refEntry) for ( ; offsetIter != offsetEnd; ++offsetIter ) entriesOk &= WriteIndexEntry( (*offsetIter) ); - return ( (elementsWritten == 1) && entriesOk ); + // return success/fail + return entriesOk; } // write current index offset entry to new index file diff --git a/src/api/internal/BamToolsIndex_p.h b/src/api/internal/BamToolsIndex_p.h index c99834d..ee5abbc 100644 --- a/src/api/internal/BamToolsIndex_p.h +++ b/src/api/internal/BamToolsIndex_p.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 19 November 2010 (DB) +// Last modified: 19 January 2011 (DB) // --------------------------------------------------------------------------- // Provides index operations for the BamTools index format (".bti") // *************************************************************************** @@ -28,9 +28,11 @@ #include namespace BamTools { - namespace Internal { +// BTI constants +const std::string BTI_EXTENSION = ".bti"; + // individual index offset entry struct BamToolsIndexEntry { @@ -41,11 +43,11 @@ struct BamToolsIndexEntry { // ctor BamToolsIndexEntry(const int32_t& maxEndPosition = 0, - const int64_t& startOffset = 0, - const int32_t& startPosition = 0) - : MaxEndPosition(maxEndPosition) - , StartOffset(startOffset) - , StartPosition(startPosition) + const int64_t& startOffset = 0, + const int32_t& startPosition = 0) + : MaxEndPosition(maxEndPosition) + , StartOffset(startOffset) + , StartPosition(startPosition) { } }; @@ -58,7 +60,7 @@ struct BamToolsReferenceEntry { // ctor BamToolsReferenceEntry(void) - : HasAlignments(false) + : HasAlignments(false) { } }; @@ -78,112 +80,113 @@ class BamToolsIndex : public BamIndex { // else // do the old thing enum Version { BTI_1_0 = 1 - , BTI_1_1 - , BTI_1_2 - }; + , 
BTI_1_1 + , BTI_1_2 + }; // ctor & dtor public: - BamToolsIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader); - ~BamToolsIndex(void); + BamToolsIndex(void); + ~BamToolsIndex(void); // interface (implements BamIndex virtual methods) public: - // creates index data (in-memory) from current reader data - bool Build(void); - // returns supported file extension - const std::string Extension(void) const { return std::string(".bti"); } - // returns whether reference has alignments or no - bool HasAlignments(const int& referenceID) const; - // attempts to use index to jump to region; returns success/fail - // a "successful" jump indicates no error, but not whether this region has data - // * thus, the method sets a flag to indicate whether there are alignments - // available after the jump position - bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion); + // creates index data (in-memory) from @reader data + bool Build(Internal::BamReaderPrivate* reader); + // returns supported file extension + const std::string Extension(void) { return BTI_EXTENSION; } + // returns whether reference has alignments or no + bool HasAlignments(const int& referenceID) const; + // attempts to use index to jump to @region in @reader; returns success/fail + // a "successful" jump indicates no error, but not whether this region has data + // * thus, the method sets a flag to indicate whether there are alignments + // available after the jump position + bool Jump(Internal::BamReaderPrivate* reader, + const BamTools::BamRegion& region, + bool *hasAlignmentsInRegion); + public: - // clear all current index offset data in memory - void ClearAllData(void); - // return file position after header metadata - const off_t DataBeginOffset(void) const; - // return true if all index data is cached - bool HasFullDataCache(void) const; - // clears index data from all references except the first - void KeepOnlyFirstReferenceOffsets(void); - // load index data for all references, 
return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - bool LoadAllReferences(bool saveData = true); - // load first reference from file, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - bool LoadFirstReference(bool saveData = true); - // load header data from index file, return true if loaded OK - bool LoadHeader(void); - // position file pointer to first reference begin, return true if skipped OK - bool SkipToFirstReference(void); - // write index reference data - bool WriteAllReferences(void); - // write index header data - bool WriteHeader(void); - - // 'internal' methods + // clear all current index offset data in memory + void ClearAllData(void); + // return file position after header metadata + off_t DataBeginOffset(void) const; + // return true if all index data is cached + bool HasFullDataCache(void) const; + // clears index data from all references except the first + void KeepOnlyFirstReferenceOffsets(void); + // load index data for all references, return true if loaded OK + // @saveData - save data in memory if true, just read & discard if false + bool LoadAllReferences(bool saveData = true); + // load first reference from file, return true if loaded OK + // @saveData - save data in memory if true, just read & discard if false + bool LoadFirstReference(bool saveData = true); + // load header data from index file, return true if loaded OK + bool LoadHeader(void); + // position file pointer to first reference begin, return true if skipped OK + bool SkipToFirstReference(void); + // write index reference data + bool WriteAllReferences(void); + // write index header data + bool WriteHeader(void); + + // internal methods public: - // ----------------------- - // index file operations - - // check index file magic number, return true if OK - bool CheckMagicNumber(void); - // check index file version, return true if OK - bool CheckVersion(void); - // return true 
if FILE* is open - bool IsOpen(void) const; - // load a single index entry from file, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - bool LoadIndexEntry(const int& refId, bool saveData = true); - // load a single reference from file, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - bool LoadReference(const int& refId, bool saveData = true); - // loads number of references, return true if loaded OK - bool LoadReferenceCount(int& numReferences); - // position file pointer to desired reference begin, return true if skipped OK - bool SkipToReference(const int& refId); - // write current reference index data to new index file - bool WriteReferenceEntry(const BamToolsReferenceEntry& refEntry); - // write current index offset entry to new index file - bool WriteIndexEntry(const BamToolsIndexEntry& entry); - - // ----------------------- - // index data operations - - // clear all index offset data for desired reference - void ClearReferenceOffsets(const int& refId); - // calculate BAM file offset for desired region - // return true if no error (*NOT* equivalent to "has alignments or valid offset") - // check @hasAlignmentsInRegion to determine this status - // @region - target region - // @offset - resulting seek target - // @hasAlignmentsInRegion - sometimes a file just lacks data in region, this flag indicates that status - bool GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion); - // returns true if index cache has data for desired reference - bool IsDataLoaded(const int& refId) const; - // clears index data from all references except the one specified - void KeepOnlyReferenceOffsets(const int& refId); - // saves an index offset entry in memory - void SaveOffsetEntry(const int& refId, const BamToolsIndexEntry& entry); - // pre-allocates size for offset vector - void SetOffsetCount(const int& refId, const int& offsetCount); - // 
initializes index data structure to hold @count references - void SetReferenceCount(const int& count); + // ----------------------- + // index file operations + + // check index file magic number, return true if OK + bool CheckMagicNumber(void); + // check index file version, return true if OK + bool CheckVersion(void); + // load a single index entry from file, return true if loaded OK + // @saveData - save data in memory if true, just read & discard if false + bool LoadIndexEntry(const int& refId, bool saveData = true); + // load a single reference from file, return true if loaded OK + // @saveData - save data in memory if true, just read & discard if false + bool LoadReference(const int& refId, bool saveData = true); + // loads number of references, return true if loaded OK + bool LoadReferenceCount(int& numReferences); + // position file pointer to desired reference begin, return true if skipped OK + bool SkipToReference(const int& refId); + // write current reference index data to new index file + bool WriteReferenceEntry(const BamToolsReferenceEntry& refEntry); + // write current index offset entry to new index file + bool WriteIndexEntry(const BamToolsIndexEntry& entry); + + // ----------------------- + // index data operations + + // clear all index offset data for desired reference + void ClearReferenceOffsets(const int& refId); + // calculate BAM file offset for desired region + // return true if no error (*NOT* equivalent to "has alignments or valid offset") + // check @hasAlignmentsInRegion to determine this status + // @region - target region + // @offset - resulting seek target + // @hasAlignmentsInRegion - sometimes a file just lacks data in region, this flag indicates that status + bool GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion); + // returns true if index cache has data for desired reference + bool IsDataLoaded(const int& refId) const; + // clears index data from all references except the one specified + void 
KeepOnlyReferenceOffsets(const int& refId); + // saves an index offset entry in memory + void SaveOffsetEntry(const int& refId, const BamToolsIndexEntry& entry); + // pre-allocates size for offset vector + void SetOffsetCount(const int& refId, const int& offsetCount); + // initializes index data structure to hold @count references + void SetReferenceCount(const int& count); // data members private: - int32_t m_blockSize; - BamToolsIndexData m_indexData; - off_t m_dataBeginOffset; - bool m_hasFullDataCache; - bool m_isBigEndian; - int32_t m_inputVersion; // Version is serialized as int - Version m_outputVersion; + int32_t m_blockSize; + BamToolsIndexData m_indexData; + off_t m_dataBeginOffset; + bool m_hasFullDataCache; + bool m_isBigEndian; + int32_t m_inputVersion; // Version is serialized as int + Version m_outputVersion; }; } // namespace Internal diff --git a/src/api/internal/BamWriter_p.cpp b/src/api/internal/BamWriter_p.cpp index 90959b6..7147a33 100644 --- a/src/api/internal/BamWriter_p.cpp +++ b/src/api/internal/BamWriter_p.cpp @@ -3,41 +3,46 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. 
// --------------------------------------------------------------------------- -// Last modified: 11 January 2011 (DB) +// Last modified: 21 March 2011 (DB) // --------------------------------------------------------------------------- // Provides the basic functionality for producing BAM files // *************************************************************************** #include +#include #include using namespace BamTools; using namespace BamTools::Internal; + +#include +#include +#include using namespace std; // ctor -BamWriterPrivate::BamWriterPrivate(void) { - IsBigEndian = SystemIsBigEndian(); -} +BamWriterPrivate::BamWriterPrivate(void) + : m_isBigEndian( BamTools::SystemIsBigEndian() ) +{ } // dtor BamWriterPrivate::~BamWriterPrivate(void) { - mBGZF.Close(); + m_stream.Close(); } // calculates minimum bin for a BAM alignment interval -const unsigned int BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const { +unsigned int BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const { --end; - if( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14); - if( (begin >> 17) == (end >> 17) ) return 585 + (begin >> 17); - if( (begin >> 20) == (end >> 20) ) return 73 + (begin >> 20); - if( (begin >> 23) == (end >> 23) ) return 9 + (begin >> 23); - if( (begin >> 26) == (end >> 26) ) return 1 + (begin >> 26); + if ( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14); + if ( (begin >> 17) == (end >> 17) ) return 585 + (begin >> 17); + if ( (begin >> 20) == (end >> 20) ) return 73 + (begin >> 20); + if ( (begin >> 23) == (end >> 23) ) return 9 + (begin >> 23); + if ( (begin >> 26) == (end >> 26) ) return 1 + (begin >> 26); return 0; } // closes the alignment archive void BamWriterPrivate::Close(void) { - mBGZF.Close(); + m_stream.Close(); } // creates a cigar string from the supplied alignment @@ -45,29 +50,32 @@ void BamWriterPrivate::CreatePackedCigar(const vector& cigarOperations, // initialize const unsigned int 
numCigarOperations = cigarOperations.size(); - packedCigar.resize(numCigarOperations * BT_SIZEOF_INT); + packedCigar.resize(numCigarOperations * Constants::BAM_SIZEOF_INT); // pack the cigar data into the string unsigned int* pPackedCigar = (unsigned int*)packedCigar.data(); - unsigned int cigarOp; - vector::const_iterator coIter; - for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); ++coIter) { - - switch(coIter->Type) { - case 'M': cigarOp = BAM_CMATCH; break; - case 'I': cigarOp = BAM_CINS; break; - case 'D': cigarOp = BAM_CDEL; break; - case 'N': cigarOp = BAM_CREF_SKIP; break; - case 'S': cigarOp = BAM_CSOFT_CLIP; break; - case 'H': cigarOp = BAM_CHARD_CLIP; break; - case 'P': cigarOp = BAM_CPAD; break; + // iterate over cigar operations + vector::const_iterator coIter = cigarOperations.begin(); + vector::const_iterator coEnd = cigarOperations.end(); + for ( ; coIter != coEnd; ++coIter ) { + + // store op in packedCigar + unsigned int cigarOp; + switch ( coIter->Type ) { + case (Constants::BAM_CIGAR_MATCH_CHAR) : cigarOp = Constants::BAM_CIGAR_MATCH; break; + case (Constants::BAM_CIGAR_INS_CHAR) : cigarOp = Constants::BAM_CIGAR_INS; break; + case (Constants::BAM_CIGAR_DEL_CHAR) : cigarOp = Constants::BAM_CIGAR_DEL; break; + case (Constants::BAM_CIGAR_REFSKIP_CHAR) : cigarOp = Constants::BAM_CIGAR_REFSKIP; break; + case (Constants::BAM_CIGAR_SOFTCLIP_CHAR) : cigarOp = Constants::BAM_CIGAR_SOFTCLIP; break; + case (Constants::BAM_CIGAR_HARDCLIP_CHAR) : cigarOp = Constants::BAM_CIGAR_HARDCLIP; break; + case (Constants::BAM_CIGAR_PAD_CHAR) : cigarOp = Constants::BAM_CIGAR_PAD; break; default: - fprintf(stderr, "ERROR: Unknown cigar operation found: %c\n", coIter->Type); + fprintf(stderr, "BamWriter ERROR: unknown cigar operation found: %c\n", coIter->Type); exit(1); } - *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp; + *pPackedCigar = coIter->Length << Constants::BAM_CIGAR_SHIFT | cigarOp; pPackedCigar++; } } @@ -85,91 +93,52 @@ void 
BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQ unsigned char nucleotideCode; bool useHighWord = true; - while(*pQuery) { - - switch(*pQuery) { - case '=': nucleotideCode = 0; break; - case 'A': nucleotideCode = 1; break; - case 'C': nucleotideCode = 2; break; - case 'G': nucleotideCode = 4; break; - case 'T': nucleotideCode = 8; break; - case 'N': nucleotideCode = 15; break; + while ( *pQuery ) { + switch ( *pQuery ) { + case (Constants::BAM_DNA_EQUAL) : nucleotideCode = Constants::BAM_BASECODE_EQUAL; break; + case (Constants::BAM_DNA_A) : nucleotideCode = Constants::BAM_BASECODE_A; break; + case (Constants::BAM_DNA_C) : nucleotideCode = Constants::BAM_BASECODE_C; break; + case (Constants::BAM_DNA_G) : nucleotideCode = Constants::BAM_BASECODE_G; break; + case (Constants::BAM_DNA_T) : nucleotideCode = Constants::BAM_BASECODE_T; break; + case (Constants::BAM_DNA_N) : nucleotideCode = Constants::BAM_BASECODE_N; break; default: - fprintf(stderr, "ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery); + fprintf(stderr, "BamWriter ERROR: only the following bases are supported in the BAM format: {=, A, C, G, T, N}. 
Found [%c]\n", *pQuery); exit(1); } // pack the nucleotide code - if(useHighWord) { + if ( useHighWord ) { *pEncodedQuery = nucleotideCode << 4; useHighWord = false; } else { *pEncodedQuery |= nucleotideCode; - pEncodedQuery++; + ++pEncodedQuery; useHighWord = true; } // increment the query position - pQuery++; + ++pQuery; } } +// returns whether BAM file is open for writing or not +bool BamWriterPrivate::IsOpen(void) const { + return m_stream.IsOpen; +} + // opens the alignment archive bool BamWriterPrivate::Open(const string& filename, - const string& samHeader, - const RefVector& referenceSequences, - bool isWriteUncompressed) + const string& samHeaderText, + const RefVector& referenceSequences) { // open the BGZF file for writing, return failure if error - if ( !mBGZF.Open(filename, "wb", isWriteUncompressed) ) + if ( !m_stream.Open(filename, "wb") ) return false; - // ================ - // write the header - // ================ - - // write the BAM signature - const unsigned char SIGNATURE_LENGTH = 4; - const char* BAM_SIGNATURE = "BAM\1"; - mBGZF.Write(BAM_SIGNATURE, SIGNATURE_LENGTH); - - // write the SAM header text length - uint32_t samHeaderLen = samHeader.size(); - if (IsBigEndian) SwapEndian_32(samHeaderLen); - mBGZF.Write((char*)&samHeaderLen, BT_SIZEOF_INT); - - // write the SAM header text - if(samHeaderLen > 0) - mBGZF.Write(samHeader.data(), samHeaderLen); - - // write the number of reference sequences - uint32_t numReferenceSequences = referenceSequences.size(); - if (IsBigEndian) SwapEndian_32(numReferenceSequences); - mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT); - - // ============================= - // write the sequence dictionary - // ============================= - - RefVector::const_iterator rsIter = referenceSequences.begin(); - RefVector::const_iterator rsEnd = referenceSequences.end(); - for( ; rsIter != rsEnd; ++rsIter ) { - - // write the reference sequence name length - uint32_t referenceSequenceNameLen = 
rsIter->RefName.size() + 1; - if (IsBigEndian) SwapEndian_32(referenceSequenceNameLen); - mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT); - - // write the reference sequence name - mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen); - - // write the reference sequence length - int32_t referenceLength = rsIter->RefLength; - if (IsBigEndian) SwapEndian_32(referenceLength); - mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT); - } - - // return success + // write BAM file 'metadata' components + WriteMagicNumber(); + WriteSamHeaderText(samHeaderText); + WriteReferences(referenceSequences); return true; } @@ -182,11 +151,11 @@ void BamWriterPrivate::SaveAlignment(const BamAlignment& al) { // write the block size unsigned int blockSize = al.SupportData.BlockLength; - if (IsBigEndian) SwapEndian_32(blockSize); - mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT); + if ( m_isBigEndian ) BamTools::SwapEndian_32(blockSize); + m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT); // assign the BAM core data - uint32_t buffer[8]; + uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE]; buffer[0] = al.RefID; buffer[1] = al.Position; buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength; @@ -197,16 +166,17 @@ void BamWriterPrivate::SaveAlignment(const BamAlignment& al) { buffer[7] = al.InsertSize; // swap BAM core endian-ness, if necessary - if ( IsBigEndian ) { + if ( m_isBigEndian ) { for ( int i = 0; i < 8; ++i ) - SwapEndian_32(buffer[i]); + BamTools::SwapEndian_32(buffer[i]); } // write the BAM core - mBGZF.Write((char*)&buffer, BAM_CORE_SIZE); + m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE); // write the raw char data - mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE); + m_stream.Write((char*)al.SupportData.AllCharData.data(), + al.SupportData.BlockLength-Constants::BAM_CORE_SIZE); } // otherwise, BamAlignment should contain character in the standard fields: 
Name, QueryBases, etc @@ -240,12 +210,12 @@ void BamWriterPrivate::SaveAlignment(const BamAlignment& al) { encodedQueryLength + queryLength + tagDataLength; - unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize; - if (IsBigEndian) SwapEndian_32(blockSize); - mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT); + unsigned int blockSize = Constants::BAM_CORE_SIZE + dataBlockSize; + if ( m_isBigEndian ) BamTools::SwapEndian_32(blockSize); + m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT); // assign the BAM core data - uint32_t buffer[8]; + uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE]; buffer[0] = al.RefID; buffer[1] = al.Position; buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength; @@ -256,45 +226,40 @@ void BamWriterPrivate::SaveAlignment(const BamAlignment& al) { buffer[7] = al.InsertSize; // swap BAM core endian-ness, if necessary - if ( IsBigEndian ) { + if ( m_isBigEndian ) { for ( int i = 0; i < 8; ++i ) - SwapEndian_32(buffer[i]); + BamTools::SwapEndian_32(buffer[i]); } // write the BAM core - mBGZF.Write((char*)&buffer, BAM_CORE_SIZE); + m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE); // write the query name - mBGZF.Write(al.Name.c_str(), nameLength); + m_stream.Write(al.Name.c_str(), nameLength); // write the packed cigar - if ( IsBigEndian ) { - + if ( m_isBigEndian ) { char* cigarData = (char*)calloc(sizeof(char), packedCigarLength); memcpy(cigarData, packedCigar.data(), packedCigarLength); - - for (unsigned int i = 0; i < packedCigarLength; ++i) { - if ( IsBigEndian ) - SwapEndian_32p(&cigarData[i]); - } - - mBGZF.Write(cigarData, packedCigarLength); + for (unsigned int i = 0; i < packedCigarLength; ++i) + if (m_isBigEndian) BamTools::SwapEndian_32p(&cigarData[i]); + m_stream.Write(cigarData, packedCigarLength); free(cigarData); } else - mBGZF.Write(packedCigar.data(), packedCigarLength); + m_stream.Write(packedCigar.data(), packedCigarLength); // write the encoded query sequence - mBGZF.Write(encodedQuery.data(), 
encodedQueryLength); + m_stream.Write(encodedQuery.data(), encodedQueryLength); // write the base qualities char* pBaseQualities = (char*)al.Qualities.data(); for(unsigned int i = 0; i < queryLength; i++) pBaseQualities[i] -= 33; // FASTQ conversion - mBGZF.Write(pBaseQualities, queryLength); + m_stream.Write(pBaseQualities, queryLength); // write the read group tag - if ( IsBigEndian ) { + if ( m_isBigEndian ) { char* tagData = (char*)calloc(sizeof(char), tagDataLength); memcpy(tagData, al.TagData.data(), tagDataLength); @@ -302,50 +267,107 @@ void BamWriterPrivate::SaveAlignment(const BamAlignment& al) { int i = 0; while ( (unsigned int)i < tagDataLength ) { - i += 2; // skip tag type (e.g. "RG", "NM", etc) - uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning - ++i; // skip value type + i += Constants::BAM_TAG_TAGSIZE; // skip tag chars (e.g. "RG", "NM", etc.) + const char type = tagData[i]; // get tag type at position i + ++i; - switch (type) { + switch ( type ) { - case('A') : - case('C') : + case(Constants::BAM_TAG_TYPE_ASCII) : + case(Constants::BAM_TAG_TYPE_INT8) : + case(Constants::BAM_TAG_TYPE_UINT8) : ++i; break; - case('S') : - SwapEndian_16p(&tagData[i]); - i+=2; // sizeof(uint16_t) - break; - - case('F') : - case('I') : - SwapEndian_32p(&tagData[i]); - i+=4; // sizeof(uint32_t) + case(Constants::BAM_TAG_TYPE_INT16) : + case(Constants::BAM_TAG_TYPE_UINT16) : + BamTools::SwapEndian_16p(&tagData[i]); + i += sizeof(uint16_t); break; - case('D') : - SwapEndian_64p(&tagData[i]); - i+=8; // sizeof(uint64_t) + case(Constants::BAM_TAG_TYPE_FLOAT) : + case(Constants::BAM_TAG_TYPE_INT32) : + case(Constants::BAM_TAG_TYPE_UINT32) : + BamTools::SwapEndian_32p(&tagData[i]); + i += sizeof(uint32_t); break; - case('H') : - case('Z') : + case(Constants::BAM_TAG_TYPE_HEX) : + case(Constants::BAM_TAG_TYPE_STRING) : + // no endian swapping necessary for hex-string/string data while (tagData[i]) { ++i; } - ++i; // increment one more for null 
terminator + // increment one more for null terminator + ++i; break; default : - fprintf(stderr, "ERROR: Invalid tag value type\n"); // shouldn't get here + fprintf(stderr, "BamWriter ERROR: invalid tag value type\n"); // shouldn't get here free(tagData); exit(1); } } - - mBGZF.Write(tagData, tagDataLength); + m_stream.Write(tagData, tagDataLength); free(tagData); } else - mBGZF.Write(al.TagData.data(), tagDataLength); + m_stream.Write(al.TagData.data(), tagDataLength); + } +} + +void BamWriterPrivate::SetWriteCompressed(bool ok) { + + // warn if BAM file is already open + // modifying compression is not allowed in this case + if ( IsOpen() ) { + cerr << "BamWriter WARNING: attempting to change compression mode on an open BAM file is not allowed. " + << "Ignoring request." << endl; + return; } + + // set BgzfStream compression mode + m_stream.SetWriteCompressed(ok); +} + +void BamWriterPrivate::WriteMagicNumber(void) { + // write BAM file 'magic number' + m_stream.Write(Constants::BAM_HEADER_MAGIC, Constants::BAM_HEADER_MAGIC_LENGTH); +} + +void BamWriterPrivate::WriteReferences(const BamTools::RefVector& referenceSequences) { + + // write the number of reference sequences + uint32_t numReferenceSequences = referenceSequences.size(); + if ( m_isBigEndian ) BamTools::SwapEndian_32(numReferenceSequences); + m_stream.Write((char*)&numReferenceSequences, Constants::BAM_SIZEOF_INT); + + // foreach reference sequence + RefVector::const_iterator rsIter = referenceSequences.begin(); + RefVector::const_iterator rsEnd = referenceSequences.end(); + for ( ; rsIter != rsEnd; ++rsIter ) { + + // write the reference sequence name length + uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1; + if ( m_isBigEndian ) BamTools::SwapEndian_32(referenceSequenceNameLen); + m_stream.Write((char*)&referenceSequenceNameLen, Constants::BAM_SIZEOF_INT); + + // write the reference sequence name + m_stream.Write(rsIter->RefName.c_str(), referenceSequenceNameLen); + + // write the 
reference sequence length + int32_t referenceLength = rsIter->RefLength; + if ( m_isBigEndian ) BamTools::SwapEndian_32(referenceLength); + m_stream.Write((char*)&referenceLength, Constants::BAM_SIZEOF_INT); + } +} + +void BamWriterPrivate::WriteSamHeaderText(const std::string& samHeaderText) { + + // write the SAM header text length + uint32_t samHeaderLen = samHeaderText.size(); + if ( m_isBigEndian ) BamTools::SwapEndian_32(samHeaderLen); + m_stream.Write((char*)&samHeaderLen, Constants::BAM_SIZEOF_INT); + + // write the SAM header text + if (samHeaderLen > 0) + m_stream.Write(samHeaderText.data(), samHeaderLen); } diff --git a/src/api/internal/BamWriter_p.h b/src/api/internal/BamWriter_p.h index d931405..dd2b0fe 100644 --- a/src/api/internal/BamWriter_p.h +++ b/src/api/internal/BamWriter_p.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 19 November 2010 (DB) +// Last modified: 24 February 2011 (DB) // --------------------------------------------------------------------------- // Provides the basic functionality for producing BAM files // *************************************************************************** @@ -22,42 +22,43 @@ // We mean it. 
#include -#include +#include #include #include namespace BamTools { - -class SamHeader; - namespace Internal { class BamWriterPrivate { // ctor & dtor public: - BamWriterPrivate(void); - ~BamWriterPrivate(void); + BamWriterPrivate(void); + ~BamWriterPrivate(void); - // "public" interface to BamWriter + // interface methods public: - void Close(void); - bool Open(const std::string& filename, - const std::string& samHeader, - const BamTools::RefVector& referenceSequences, - bool isWriteUncompressed); - void SaveAlignment(const BamAlignment& al); + void Close(void); + bool IsOpen(void) const; + bool Open(const std::string& filename, + const std::string& samHeaderText, + const BamTools::RefVector& referenceSequences); + void SaveAlignment(const BamAlignment& al); + void SetWriteCompressed(bool ok); - // internal methods + // 'internal' methods public: - const unsigned int CalculateMinimumBin(const int begin, int end) const; - void CreatePackedCigar(const std::vector& cigarOperations, std::string& packedCigar); - void EncodeQuerySequence(const std::string& query, std::string& encodedQuery); + unsigned int CalculateMinimumBin(const int begin, int end) const; + void CreatePackedCigar(const std::vector& cigarOperations, std::string& packedCigar); + void EncodeQuerySequence(const std::string& query, std::string& encodedQuery); + void WriteMagicNumber(void); + void WriteReferences(const BamTools::RefVector& referenceSequences); + void WriteSamHeaderText(const std::string& samHeaderText); // data members - public: - BgzfData mBGZF; - bool IsBigEndian; + private: + BgzfStream m_stream; + bool m_isBigEndian; }; } // namespace Internal diff --git a/src/api/internal/BgzfStream_p.cpp b/src/api/internal/BgzfStream_p.cpp new file mode 100644 index 0000000..fb67799 --- /dev/null +++ b/src/api/internal/BgzfStream_p.cpp @@ -0,0 +1,444 @@ +// *************************************************************************** +// BgzfStream_p.cpp (c) 2011 Derek Barnett +// Marth Lab, Department 
of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 21 March 2011(DB) +// --------------------------------------------------------------------------- +// Based on BGZF routines developed at the Broad Institute. +// Provides the basic functionality for reading & writing BGZF files +// Replaces the old BGZF.* files to avoid clashing with other toolkits +// *************************************************************************** + +#include +using namespace BamTools; +using namespace BamTools::Internal; + +#include +#include +using namespace std; + +// constructor +BgzfStream::BgzfStream(void) + : UncompressedBlockSize(Constants::BGZF_DEFAULT_BLOCK_SIZE) + , CompressedBlockSize(Constants::BGZF_MAX_BLOCK_SIZE) + , BlockLength(0) + , BlockOffset(0) + , BlockAddress(0) + , IsOpen(false) + , IsWriteOnly(false) + , IsWriteCompressed(true) + , Stream(NULL) + , UncompressedBlock(NULL) + , CompressedBlock(NULL) +{ + try { + CompressedBlock = new char[CompressedBlockSize]; + UncompressedBlock = new char[UncompressedBlockSize]; + } catch( std::bad_alloc& ba ) { + fprintf(stderr, "BgzfStream ERROR: unable to allocate memory\n"); + exit(1); + } +} + +// destructor +BgzfStream::~BgzfStream(void) { + if( CompressedBlock ) delete[] CompressedBlock; + if( UncompressedBlock ) delete[] UncompressedBlock; +} + +// closes BGZF file +void BgzfStream::Close(void) { + + // skip if file not open + if ( !IsOpen ) return; + + // if writing to file, flush the current BGZF block, + // then write an empty block (as EOF marker) + if ( IsWriteOnly ) { + FlushBlock(); + int blockLength = DeflateBlock(); + fwrite(CompressedBlock, 1, blockLength, Stream); + } + + // flush and close stream + fflush(Stream); + fclose(Stream); + + // reset flags + IsWriteCompressed = true; + IsOpen = false; +} + +// compresses the current block +int BgzfStream::DeflateBlock(void) { + + // initialize the gzip header + 
char* buffer = CompressedBlock; + memset(buffer, 0, 18); + buffer[0] = Constants::GZIP_ID1; + buffer[1] = (char)Constants::GZIP_ID2; + buffer[2] = Constants::CM_DEFLATE; + buffer[3] = Constants::FLG_FEXTRA; + buffer[9] = (char)Constants::OS_UNKNOWN; + buffer[10] = Constants::BGZF_XLEN; + buffer[12] = Constants::BGZF_ID1; + buffer[13] = Constants::BGZF_ID2; + buffer[14] = Constants::BGZF_LEN; + + // set compression level + const int compressionLevel = ( IsWriteCompressed ? Z_DEFAULT_COMPRESSION : 0 ); + + // loop to retry for blocks that do not compress enough + int inputLength = BlockOffset; + int compressedLength = 0; + unsigned int bufferSize = CompressedBlockSize; + + while ( true ) { + + // initialize zstream values + z_stream zs; + zs.zalloc = NULL; + zs.zfree = NULL; + zs.next_in = (Bytef*)UncompressedBlock; + zs.avail_in = inputLength; + zs.next_out = (Bytef*)&buffer[Constants::BGZF_BLOCK_HEADER_LENGTH]; + zs.avail_out = bufferSize - Constants::BGZF_BLOCK_HEADER_LENGTH - Constants::BGZF_BLOCK_FOOTER_LENGTH; + + // initialize the zlib compression algorithm + if ( deflateInit2(&zs, + compressionLevel, + Z_DEFLATED, + Constants::GZIP_WINDOW_BITS, + Constants::Z_DEFAULT_MEM_LEVEL, + Z_DEFAULT_STRATEGY) != Z_OK ) + { + fprintf(stderr, "BgzfStream ERROR: zlib deflate initialization failed\n"); + exit(1); + } + + // compress the data + int status = deflate(&zs, Z_FINISH); + if ( status != Z_STREAM_END ) { + + deflateEnd(&zs); + + // reduce the input length and try again + if ( status == Z_OK ) { + inputLength -= 1024; + if ( inputLength < 0 ) { + fprintf(stderr, "BgzfStream ERROR: input reduction failed\n"); + exit(1); + } + continue; + } + + fprintf(stderr, "BgzfStream ERROR: zlib::deflateEnd() failed\n"); + exit(1); + } + + // finalize the compression routine + if ( deflateEnd(&zs) != Z_OK ) { + fprintf(stderr, "BgzfStream ERROR: zlib::deflateEnd() failed\n"); + exit(1); + } + + compressedLength = zs.total_out; + compressedLength += 
Constants::BGZF_BLOCK_HEADER_LENGTH + Constants::BGZF_BLOCK_FOOTER_LENGTH; + if ( compressedLength > Constants::BGZF_MAX_BLOCK_SIZE ) { + fprintf(stderr, "BgzfStream ERROR: deflate overflow\n"); + exit(1); + } + + break; + } + + // store the compressed length + BamTools::PackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1)); + + // store the CRC32 checksum + unsigned int crc = crc32(0, NULL, 0); + crc = crc32(crc, (Bytef*)UncompressedBlock, inputLength); + BamTools::PackUnsignedInt(&buffer[compressedLength - 8], crc); + BamTools::PackUnsignedInt(&buffer[compressedLength - 4], inputLength); + + // ensure that we have less than a block of data left + int remaining = BlockOffset - inputLength; + if ( remaining > 0 ) { + if ( remaining > inputLength ) { + fprintf(stderr, "BgzfStream ERROR: after deflate, remainder too large\n"); + exit(1); + } + memcpy(UncompressedBlock, UncompressedBlock + inputLength, remaining); + } + + // update block data + BlockOffset = remaining; + + // return result + return compressedLength; +} + +// flushes the data in the BGZF block +void BgzfStream::FlushBlock(void) { + + // flush all of the remaining blocks + while ( BlockOffset > 0 ) { + + // compress the data block + int blockLength = DeflateBlock(); + + // flush the data to our output stream + int numBytesWritten = fwrite(CompressedBlock, 1, blockLength, Stream); + if ( numBytesWritten != blockLength ) { + fprintf(stderr, "BgzfStream ERROR: expected to write %u bytes during flushing, but wrote %u bytes\n", + blockLength, numBytesWritten); + exit(1); + } + + // update block data + BlockAddress += blockLength; + } +} + +// decompresses the current block +int BgzfStream::InflateBlock(const int& blockLength) { + + // inflate the data from compressed buffer into uncompressed buffer + z_stream zs; + zs.zalloc = NULL; + zs.zfree = NULL; + zs.next_in = (Bytef*)CompressedBlock + 18; + zs.avail_in = blockLength - 16; + zs.next_out = (Bytef*)UncompressedBlock; + zs.avail_out = 
UncompressedBlockSize; + + int status = inflateInit2(&zs, Constants::GZIP_WINDOW_BITS); + if ( status != Z_OK ) { + fprintf(stderr, "BgzfStream ERROR: could not decompress block - zlib::inflateInit() failed\n"); + return -1; + } + + status = inflate(&zs, Z_FINISH); + if ( status != Z_STREAM_END ) { + inflateEnd(&zs); + fprintf(stderr, "BgzfStream ERROR: could not decompress block - zlib::inflate() failed\n"); + return -1; + } + + status = inflateEnd(&zs); + if ( status != Z_OK ) { + fprintf(stderr, "BgzfStream ERROR: could not decompress block - zlib::inflateEnd() failed\n"); + return -1; + } + + // return result + return zs.total_out; +} + +// opens the BGZF file for reading (mode is either "rb" for reading, or "wb" for writing) +bool BgzfStream::Open(const string& filename, const char* mode) { + + // close current stream, if necessary, before opening next + if ( IsOpen ) Close(); + + // determine open mode + if ( strcmp(mode, "rb") == 0 ) + IsWriteOnly = false; + else if ( strcmp(mode, "wb") == 0) + IsWriteOnly = true; + else { + fprintf(stderr, "BgzfStream ERROR: unknown file mode: %s\n", mode); + return false; + } + + // open BGZF stream on a file + if ( (filename != "stdin") && (filename != "stdout") ) + Stream = fopen(filename.c_str(), mode); + + // open BGZF stream on stdin + else if ( (filename == "stdin") && (strcmp(mode, "rb") == 0 ) ) + Stream = freopen(NULL, mode, stdin); + + // open BGZF stream on stdout + else if ( (filename == "stdout") && (strcmp(mode, "wb") == 0) ) + Stream = freopen(NULL, mode, stdout); + + if ( !Stream ) { + fprintf(stderr, "BgzfStream ERROR: unable to open file %s\n", filename.c_str() ); + return false; + } + + // set flag & return success + IsOpen = true; + return true; +} + +// reads BGZF data into a byte buffer +int BgzfStream::Read(char* data, const unsigned int dataLength) { + + // if stream not open for reading (or empty request) + if ( !IsOpen || IsWriteOnly || dataLength == 0 ) + return 0; + + // read blocks as needed 
until desired data length is retrieved + char* output = data; + unsigned int numBytesRead = 0; + while ( numBytesRead < dataLength ) { + + // determine bytes available in current block + int bytesAvailable = BlockLength - BlockOffset; + + // read (and decompress) next block if needed + if ( bytesAvailable <= 0 ) { + if ( !ReadBlock() ) return -1; + bytesAvailable = BlockLength - BlockOffset; + if ( bytesAvailable <= 0 ) break; + } + + // copy data from uncompressed source buffer into data destination buffer + char* buffer = UncompressedBlock; + int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable ); + memcpy(output, buffer + BlockOffset, copyLength); + + // update counters + BlockOffset += copyLength; + output += copyLength; + numBytesRead += copyLength; + } + + // update block data + if ( BlockOffset == BlockLength ) { + BlockAddress = ftell64(Stream); + BlockOffset = 0; + BlockLength = 0; + } + + return numBytesRead; +} + +// reads a BGZF block +bool BgzfStream::ReadBlock(void) { + + char header[Constants::BGZF_BLOCK_HEADER_LENGTH]; + int64_t blockAddress = ftell64(Stream); + + // read block header from file + int count = fread(header, 1, sizeof(header), Stream); + + // if block header empty + if ( count == 0 ) { + BlockLength = 0; + return true; + } + + // if block header invalid size + if ( count != sizeof(header) ) { + fprintf(stderr, "BgzfStream ERROR: read block failed - could not read block header\n"); + return false; + } + + // validate block header contents + if ( !BgzfStream::CheckBlockHeader(header) ) { + fprintf(stderr, "BgzfStream ERROR: read block failed - invalid block header\n"); + return false; + } + + // copy header contents to compressed buffer + int blockLength = BamTools::UnpackUnsignedShort(&header[16]) + 1; + char* compressedBlock = CompressedBlock; + memcpy(compressedBlock, header, Constants::BGZF_BLOCK_HEADER_LENGTH); + int remaining = blockLength - Constants::BGZF_BLOCK_HEADER_LENGTH; + + // read remainder of block + count 
= fread(&compressedBlock[Constants::BGZF_BLOCK_HEADER_LENGTH], 1, remaining, Stream); + if ( count != remaining ) { + fprintf(stderr, "BgzfStream ERROR: read block failed - could not read data from block\n"); + return false; + } + + // decompress block data + count = InflateBlock(blockLength); + if ( count < 0 ) { + fprintf(stderr, "BgzfStream ERROR: read block failed - could not decompress block data\n"); + return false; + } + + // update block data + if ( BlockLength != 0 ) + BlockOffset = 0; + BlockAddress = blockAddress; + BlockLength = count; + + // return success + return true; +} + +// seek to position in BGZF file +bool BgzfStream::Seek(int64_t position) { + + // skip if not open + if ( !IsOpen ) return false; + + // determine adjusted offset & address + int blockOffset = (position & 0xFFFF); + int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL; + + // attempt seek in file + if ( fseek64(Stream, blockAddress, SEEK_SET) != 0 ) { + fprintf(stderr, "BgzfStream ERROR: unable to seek in file\n"); + return false; + } + + // update block data + BlockLength = 0; + BlockAddress = blockAddress; + BlockOffset = blockOffset; + + // return success + return true; +} + +void BgzfStream::SetWriteCompressed(bool ok) { + IsWriteCompressed = ok; +} + +// get file position in BGZF file +int64_t BgzfStream::Tell(void) { + + // skip if file not open + if ( !IsOpen ) return false; + + // otherwise return file pointer position + return ( (BlockAddress << 16) | (BlockOffset & 0xFFFF) ); +} + +// writes the supplied data into the BGZF buffer +unsigned int BgzfStream::Write(const char* data, const unsigned int dataLen) { + + // skip if file not open for writing + if ( !IsOpen || !IsWriteOnly ) return false; + + // write blocks as needed til all data is written + unsigned int numBytesWritten = 0; + const char* input = data; + unsigned int blockLength = UncompressedBlockSize; + while ( numBytesWritten < dataLen ) { + + // copy data contents to uncompressed output buffer + 
unsigned int copyLength = min(blockLength - BlockOffset, dataLen - numBytesWritten); + char* buffer = UncompressedBlock; + memcpy(buffer + BlockOffset, input, copyLength); + + // update counter + BlockOffset += copyLength; + input += copyLength; + numBytesWritten += copyLength; + + // flush (& compress) output buffer when full + if ( BlockOffset == blockLength ) FlushBlock(); + } + + // return result + return numBytesWritten; +} diff --git a/src/api/internal/BgzfStream_p.h b/src/api/internal/BgzfStream_p.h new file mode 100644 index 0000000..69473e6 --- /dev/null +++ b/src/api/internal/BgzfStream_p.h @@ -0,0 +1,109 @@ +// *************************************************************************** +// BgzfStream_p.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 24 February 2011(DB) +// --------------------------------------------------------------------------- +// Based on BGZF routines developed at the Broad Institute. +// Provides the basic functionality for reading & writing BGZF files +// Replaces the old BGZF.* files to avoid clashing with other toolkits +// *************************************************************************** + +#ifndef BGZFSTREAM_P_H +#define BGZFSTREAM_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. 
+ +#include +#include +#include "zlib.h" +#include +#include + +namespace BamTools { +namespace Internal { + +class BgzfStream { + + // constructor & destructor + public: + BgzfStream(void); + ~BgzfStream(void); + + // main interface methods + public: + // closes BGZF file + void Close(void); + // opens the BGZF file (mode is either "rb" for reading, or "wb" for writing) + bool Open(const std::string& filename, const char* mode); + // reads BGZF data into a byte buffer + int Read(char* data, const unsigned int dataLength); + // seek to position in BGZF file + bool Seek(int64_t position); + // enable/disable compressed output + void SetWriteCompressed(bool ok); + // get file position in BGZF file + int64_t Tell(void); + // writes the supplied data into the BGZF buffer + unsigned int Write(const char* data, const unsigned int dataLen); + + // internal methods + private: + // compresses the current block + int DeflateBlock(void); + // flushes the data in the BGZF block + void FlushBlock(void); + // de-compresses the current block + int InflateBlock(const int& blockLength); + // reads a BGZF block + bool ReadBlock(void); + + // static 'utility' methods + public: + // checks BGZF block header + static inline bool CheckBlockHeader(char* header); + + // data members + public: + unsigned int UncompressedBlockSize; + unsigned int CompressedBlockSize; + unsigned int BlockLength; + unsigned int BlockOffset; + uint64_t BlockAddress; + bool IsOpen; + bool IsWriteOnly; + bool IsWriteCompressed; + FILE* Stream; + char* UncompressedBlock; + char* CompressedBlock; +}; + +// ------------------------------------------------------------- +// static 'utility' method implementations + +// checks BGZF block header +inline +bool BgzfStream::CheckBlockHeader(char* header) { + return (header[0] == Constants::GZIP_ID1 && + header[1] == (char)Constants::GZIP_ID2 && + header[2] == Z_DEFLATED && + (header[3] & Constants::FLG_FEXTRA) != 0 && + BamTools::UnpackUnsignedShort(&header[10]) == 
Constants::BGZF_XLEN && + header[12] == Constants::BGZF_ID1 && + header[13] == Constants::BGZF_ID2 && + BamTools::UnpackUnsignedShort(&header[14]) == Constants::BGZF_LEN ); +} + +} // namespace Internal +} // namespace BamTools + +#endif // BGZFSTREAM_P_H diff --git a/src/api/internal/SamFormatParser_p.cpp b/src/api/internal/SamFormatParser_p.cpp index aa690b8..02e9889 100644 --- a/src/api/internal/SamFormatParser_p.cpp +++ b/src/api/internal/SamFormatParser_p.cpp @@ -36,7 +36,7 @@ void SamFormatParser::Parse(const string& headerText) { // other wise parse SAM lines istringstream headerStream(headerText); - string headerLine = ""; + string headerLine(""); while ( getline(headerStream, headerLine) ) ParseSamLine(headerLine); return; @@ -55,7 +55,9 @@ void SamFormatParser::ParseSamLine(const string& line) { else if ( firstToken == Constants::SAM_RG_BEGIN_TOKEN) ParseRGLine(restOfLine); else if ( firstToken == Constants::SAM_PG_BEGIN_TOKEN) ParsePGLine(restOfLine); else if ( firstToken == Constants::SAM_CO_BEGIN_TOKEN) ParseCOLine(restOfLine); - else cerr << "SAM Format Error - unknown token: " << firstToken << endl; + else + cerr << "SamFormatParser ERROR: unknown token: " << firstToken << endl; + return; } @@ -78,12 +80,12 @@ void SamFormatParser::ParseHDLine(const string& line) { else if ( tokenTag == Constants::SAM_HD_GROUPORDER_TAG ) m_header.GroupOrder = tokenValue; else if ( tokenTag == Constants::SAM_HD_SORTORDER_TAG ) m_header.SortOrder = tokenValue; else - cerr << "SAM Format Error - unknown HD tag: " << tokenTag << endl; + cerr << "SamFormatParser ERROR: unknown HD tag: " << tokenTag << endl; } // if @HD line exists, VN must be provided if ( !m_header.HasVersion() ) { - cerr << "SAM Format Error - @HD line is missing VN tag!" 
<< endl; + cerr << "SamFormatParser ERROR: @HD line is missing VN tag" << endl; return; } } @@ -112,18 +114,18 @@ void SamFormatParser::ParseSQLine(const string& line) { else if ( tokenTag == Constants::SAM_SQ_CHECKSUM_TAG ) seq.Checksum = tokenValue; else if ( tokenTag == Constants::SAM_SQ_SPECIES_TAG ) seq.Species = tokenValue; else - cerr << "SAM Format Error - unknown SQ tag: " << tokenTag << endl; + cerr << "SamFormatParser ERROR: unknown SQ tag: " << tokenTag << endl; } // if @SQ line exists, SN must be provided if ( !seq.HasName() ) { - cerr << "SAM Format Error - @SQ line is missing SN tag!" << endl; + cerr << "SamFormatParser ERROR: @SQ line is missing SN tag" << endl; return; } // if @SQ line exists, LN must be provided if ( !seq.HasLength() ) { - cerr << "SAM Format Error - @SQ line is missing LN tag!" << endl; + cerr << "SamFormatParser ERROR: @SQ line is missing LN tag" << endl; return; } @@ -158,18 +160,18 @@ void SamFormatParser::ParseRGLine(const string& line) { else if ( tokenTag == Constants::SAM_RG_PRODUCTIONDATE_TAG ) rg.ProductionDate = tokenValue; else if ( tokenTag == Constants::SAM_RG_SEQTECHNOLOGY_TAG ) rg.SequencingTechnology = tokenValue; else - cerr << "SAM Format Error - unknown RG tag: " << tokenTag << endl; + cerr << "SamFormatParser ERROR: unknown RG tag: " << tokenTag << endl; } // if @RG line exists, ID must be provided if ( !rg.HasID() ) { - cerr << "SAM Format Error - @RG line is missing ID tag!" << endl; + cerr << "SamFormatParser ERROR: @RG line is missing ID tag" << endl; return; } // if @RG line exists, SM must be provided if ( !rg.HasSample() ) { - cerr << "SAM Format Error - @RG line is missing SM tag!" 
<< endl; + cerr << "SamFormatParser ERROR: @RG line is missing SM tag" << endl; return; } @@ -196,12 +198,12 @@ void SamFormatParser::ParsePGLine(const string& line) { else if ( tokenTag == Constants::SAM_PG_VERSION_TAG ) m_header.ProgramVersion = tokenValue; else if ( tokenTag == Constants::SAM_PG_COMMANDLINE_TAG ) m_header.ProgramCommandLine = tokenValue; else - cerr << "SAM Format Error - unknown PG tag: " << tokenTag << endl; + cerr << "SamFormatParser ERROR: unknown PG tag: " << tokenTag << endl; } // if @PG line exists, ID must be provided if ( !m_header.HasProgramName() ) { - cerr << "SAM Format Error - @PG line is missing ID tag!" << endl; + cerr << "SamFormatParser ERROR:- @PG line is missing ID tag" << endl; return; } } diff --git a/src/api/internal/SamFormatPrinter_p.cpp b/src/api/internal/SamFormatPrinter_p.cpp index dcde46e..69c78df 100644 --- a/src/api/internal/SamFormatPrinter_p.cpp +++ b/src/api/internal/SamFormatPrinter_p.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 23 December 2010 (DB) +// Last modified: 21 March 2011 (DB) // --------------------------------------------------------------------------- // Provides functionality for printing formatted SAM header to string // *************************************************************************** diff --git a/src/api/internal/SamHeaderValidator_p.cpp b/src/api/internal/SamHeaderValidator_p.cpp index 131fc3d..4aa6395 100644 --- a/src/api/internal/SamHeaderValidator_p.cpp +++ b/src/api/internal/SamHeaderValidator_p.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. 
// --------------------------------------------------------------------------- -// Last modified: 13 January 2011 (DB) +// Last modified: 21 March 2011 (DB) // --------------------------------------------------------------------------- // Provides functionality for validating SamHeader data // *************************************************************************** @@ -135,7 +135,9 @@ bool SamHeaderValidator::ValidateSortOrder(void) { sortOrder == Constants::SAM_HD_SORTORDER_QUERYNAME || sortOrder == Constants::SAM_HD_SORTORDER_UNSORTED ) - { return true; } + { + return true; + } // otherwise AddError("Invalid sort order (SO): " + sortOrder); @@ -147,14 +149,17 @@ bool SamHeaderValidator::ValidateGroupOrder(void) { const string& groupOrder = m_header.GroupOrder; // if no group order, no problem, just return OK - if ( groupOrder.empty() ) return true; + if ( groupOrder.empty() ) + return true; // if group order is valid keyword if ( groupOrder == Constants::SAM_HD_GROUPORDER_NONE || groupOrder == Constants::SAM_HD_GROUPORDER_QUERY || groupOrder == Constants::SAM_HD_GROUPORDER_REFERENCE ) - { return true; } + { + return true; + } // otherwise AddError("Invalid group order (GO): " + groupOrder); @@ -358,7 +363,8 @@ bool SamHeaderValidator::CheckReadGroupID(const string& id) { bool SamHeaderValidator::CheckSequencingTechnology(const string& technology) { // if no technology provided, no problem, just return OK - if ( technology.empty() ) return true; + if ( technology.empty() ) + return true; // if technology is valid keyword if ( Is454(technology) || @@ -367,7 +373,9 @@ bool SamHeaderValidator::CheckSequencingTechnology(const string& technology) { IsPacBio(technology) || IsSolid(technology) ) - { return true; } + { + return true; + } // otherwise AddError("Invalid read group sequencing platform (PL): " + technology); diff --git a/src/shared/bamtools_global.h b/src/shared/bamtools_global.h index be7e034..6e3cb39 100644 --- a/src/shared/bamtools_global.h +++ 
b/src/shared/bamtools_global.h @@ -3,15 +3,18 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 19 November 2010 (DB) +// Last modified: 3 March 2011 (DB) // --------------------------------------------------------------------------- -// Provides the basic definitions for exporting & importing library symbols +// Provides the basic definitions for exporting & importing library symbols. +// Also provides some platform-specific rules for definitions. // *************************************************************************** #ifndef BAMTOOLS_GLOBAL_H #define BAMTOOLS_GLOBAL_H -// BAMTOOLS_LIBRARY_EXPORT +/*! \brief Library export macro + \internal +*/ #ifndef BAMTOOLS_LIBRARY_EXPORT # if defined(WIN32) # define BAMTOOLS_LIBRARY_EXPORT __declspec(dllexport) @@ -20,7 +23,9 @@ # endif #endif // BAMTOOLS_LIBRARY_EXPORT -// BAMTOOLS_LIBRARY_IMPORT +/*! \brief Library import macro + \internal +*/ #ifndef BAMTOOLS_LIBRARY_IMPORT # if defined(WIN32) # define BAMTOOLS_LIBRARY_IMPORT __declspec(dllimport) @@ -29,4 +34,46 @@ # endif #endif // BAMTOOLS_LIBRARY_IMPORT +/*! \brief Platform-specific file tell/seek macro definitions (ftell64, fseek64) + \internal +*/ +#ifndef BAMTOOLS_LFS +#define BAMTOOLS_LFS + #ifdef WIN32 + #define ftell64(a) _ftelli64(a) + #define fseek64(a,b,c) _fseeki64(a,b,c) + #else + #define ftell64(a) ftello(a) + #define fseek64(a,b,c) fseeko(a,b,c) + #endif +#endif // BAMTOOLS_LFS + +/*! \def ftell64(a) + \brief Platform-independent tell() operation. + \internal +*/ +/*! \def fseek64(a,b,c) + \brief Platform-independent seek() operation. + \internal +*/ + +/*! 
\brief Platform-specific type definitions + \internal +*/ +#ifndef BAMTOOLS_TYPES +#define BAMTOOLS_TYPES + #ifdef _MSC_VER + typedef char int8_t; + typedef unsigned char uint8_t; + typedef short int16_t; + typedef unsigned short uint16_t; + typedef int int32_t; + typedef unsigned int uint32_t; + typedef long long int64_t; + typedef unsigned long long uint64_t; + #else + #include + #endif +#endif // BAMTOOLS_TYPES + #endif // BAMTOOLS_GLOBAL_H diff --git a/src/toolkit/CMakeLists.txt b/src/toolkit/CMakeLists.txt index 3923bc9..d0e75a8 100644 --- a/src/toolkit/CMakeLists.txt +++ b/src/toolkit/CMakeLists.txt @@ -21,7 +21,7 @@ add_executable ( bamtools bamtools_index.cpp bamtools_merge.cpp bamtools_random.cpp - bamtools_revert.cpp + bamtools_revert.cpp bamtools_sort.cpp bamtools_split.cpp bamtools_stats.cpp @@ -30,7 +30,7 @@ add_executable ( bamtools # set BamTools application properties set_target_properties( BamTools PROPERTIES - VERSION 0.9.0 + VERSION 1.0.0 ) # make version info available in application configure_file(bamtools_version.h.in ${BamTools_SOURCE_DIR}/src/toolkit/bamtools_version.h) diff --git a/src/toolkit/bamtools.cpp b/src/toolkit/bamtools.cpp index 2f91e2d..4875a9a 100644 --- a/src/toolkit/bamtools.cpp +++ b/src/toolkit/bamtools.cpp @@ -3,12 +3,11 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 13 December 2010 +// Last modified: 21 March 2011 (DB) // --------------------------------------------------------------------------- // Integrates a number of BamTools functionalities into a single executable. 
// *************************************************************************** -// includes #include "bamtools_convert.h" #include "bamtools_count.h" #include "bamtools_coverage.h" @@ -22,12 +21,11 @@ #include "bamtools_split.h" #include "bamtools_stats.h" #include "bamtools_version.h" - #include +#include #include #include #include - using namespace BamTools; using namespace std; @@ -121,7 +119,7 @@ int Help(int argc, char* argv[]) { cerr << endl; cerr << "See 'bamtools help COMMAND' for more information on a specific command." << endl; cerr << endl; - return 0; + return EXIT_SUCCESS; } // print version info @@ -136,9 +134,9 @@ int Version(void) { cout << "bamtools " << versionStream.str() << endl; cout << "Part of BamTools API and toolkit" << endl; cout << "Primary authors: Derek Barnett, Erik Garrison, Michael Stromberg" << endl; - cout << "(c) 2009-2010 Marth Lab, Biology Dept., Boston College" << endl; + cout << "(c) 2009-2011 Marth Lab, Biology Dept., Boston College" << endl; cout << endl; - return 0; + return EXIT_SUCCESS; } // toolkit entry point @@ -153,10 +151,10 @@ int main(int argc, char* argv[]) { // 'bamtools version', 'bamtools --version', or 'bamtools -v' if ( IsVersion(argv[1]) ) return Version(); - // determine desired sub-tool + // determine desired sub-tool, run if found AbstractTool* tool = CreateTool( argv[1] ); - - // if found, run tool... otherwise show help if ( tool ) return tool->Run(argc, argv); - else return Help(argc, argv); + + // no tool matched, show help + return Help(argc, argv); } diff --git a/src/toolkit/bamtools_convert.cpp b/src/toolkit/bamtools_convert.cpp index 5bd957b..cc32e2c 100644 --- a/src/toolkit/bamtools_convert.cpp +++ b/src/toolkit/bamtools_convert.cpp @@ -3,25 +3,27 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. 
// --------------------------------------------------------------------------- -// Last modified: 4 October 2010 +// Last modified: 21 March 2011 // --------------------------------------------------------------------------- // Converts between BAM and a number of other formats // *************************************************************************** +#include "bamtools_convert.h" + +#include +#include +#include +#include +#include +#include +using namespace BamTools; + #include #include #include #include #include -#include "bamtools_convert.h" -#include "bamtools_fasta.h" -#include "bamtools_options.h" -#include "bamtools_pileup_engine.h" -#include "bamtools_utilities.h" -#include "BGZF.h" -#include "BamMultiReader.h" using namespace std; -using namespace BamTools; namespace BamTools { @@ -29,15 +31,13 @@ namespace BamTools { // ConvertTool constants // supported conversion format command-line names - static const string FORMAT_BED = "bed"; - static const string FORMAT_BEDGRAPH = "bedgraph"; - static const string FORMAT_FASTA = "fasta"; - static const string FORMAT_FASTQ = "fastq"; - static const string FORMAT_JSON = "json"; - static const string FORMAT_SAM = "sam"; - static const string FORMAT_PILEUP = "pileup"; - static const string FORMAT_WIGGLE = "wig"; - static const string FORMAT_YAML = "yaml"; + static const string FORMAT_BED = "bed"; + static const string FORMAT_FASTA = "fasta"; + static const string FORMAT_FASTQ = "fastq"; + static const string FORMAT_JSON = "json"; + static const string FORMAT_SAM = "sam"; + static const string FORMAT_PILEUP = "pileup"; + static const string FORMAT_YAML = "yaml"; // other constants static const unsigned int FASTA_LINE_MAX = 50; @@ -126,12 +126,10 @@ struct ConvertTool::ConvertToolPrivate { // internal methods private: void PrintBed(const BamAlignment& a); - void PrintBedGraph(const BamAlignment& a); void PrintFasta(const BamAlignment& a); void PrintFastq(const BamAlignment& a); void PrintJson(const BamAlignment& a); 
void PrintSam(const BamAlignment& a); - void PrintWiggle(const BamAlignment& a); void PrintYaml(const BamAlignment& a); // special case - uses the PileupEngine @@ -162,33 +160,40 @@ bool ConvertTool::ConvertToolPrivate::Run(void) { // open input files BamMultiReader reader; - if ( !m_settings->HasInput ) { // don't attempt to open index for stdin - if ( !reader.Open(m_settings->InputFiles, false) ) { - cerr << "Could not open input files" << endl; + if ( !reader.Open(m_settings->InputFiles) ) { + cerr << "bamtools convert ERROR: could not open input BAM file(s)... Aborting." << endl; + return false; + } + + // if input is not stdin & a region is provided, look for index files + if ( m_settings->HasInput && m_settings->HasRegion ) { + if ( !reader.LocateIndexes() ) { + cerr << "bamtools convert ERROR: could not locate index file(s)... Aborting." << endl; return false; } - } else { - if ( !reader.Open(m_settings->InputFiles, true) ) { - if ( !reader.Open(m_settings->InputFiles, false) ) { - cerr << "Could not open input files" << endl; - return false; - } else { - cerr << "Opened reader without index file, jumping is disabled." << endl; - } - } } + + // retrieve reference data m_references = reader.GetReferenceData(); // set region if specified BamRegion region; if ( m_settings->HasRegion ) { if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) { - if ( !reader.SetRegion(region) ) { - cerr << "Could not set BamReader region to REGION: " << m_settings->Region << endl; - return false; + + if ( reader.HasIndexes() ) { + if ( !reader.SetRegion(region) ) { + cerr << "bamtools convert ERROR: set region failed. 
Check that REGION describes a valid range" << endl; + reader.Close(); + return false; + } } + } else { - cerr << "Could not parse REGION: " << m_settings->Region << endl; + cerr << "bamtools convert ERROR: could not parse REGION: " << m_settings->Region << endl; + cerr << "Check that REGION is in valid format (see documentation) and that the coordinates are valid" + << endl; + reader.Close(); return false; } } @@ -200,7 +205,8 @@ bool ConvertTool::ConvertToolPrivate::Run(void) { // open output file stream outFile.open(m_settings->OutputFilename.c_str()); if ( !outFile ) { - cerr << "Could not open " << m_settings->OutputFilename << " for output" << endl; + cerr << "bamtools convert ERROR: could not open " << m_settings->OutputFilename + << " for output" << endl; return false; } @@ -225,17 +231,15 @@ bool ConvertTool::ConvertToolPrivate::Run(void) { // set function pointer to proper conversion method void (BamTools::ConvertTool::ConvertToolPrivate::*pFunction)(const BamAlignment&) = 0; - if ( m_settings->Format == FORMAT_BED ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintBed; - else if ( m_settings->Format == FORMAT_BEDGRAPH ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintBedGraph; - else if ( m_settings->Format == FORMAT_FASTA ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintFasta; - else if ( m_settings->Format == FORMAT_FASTQ ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintFastq; - else if ( m_settings->Format == FORMAT_JSON ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintJson; - else if ( m_settings->Format == FORMAT_SAM ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintSam; - else if ( m_settings->Format == FORMAT_WIGGLE ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintWiggle; - else if ( m_settings->Format == FORMAT_YAML ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintYaml; + if ( m_settings->Format == FORMAT_BED ) pFunction = 
&BamTools::ConvertTool::ConvertToolPrivate::PrintBed; + else if ( m_settings->Format == FORMAT_FASTA ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintFasta; + else if ( m_settings->Format == FORMAT_FASTQ ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintFastq; + else if ( m_settings->Format == FORMAT_JSON ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintJson; + else if ( m_settings->Format == FORMAT_SAM ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintSam; + else if ( m_settings->Format == FORMAT_YAML ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintYaml; else { - cerr << "Unrecognized format: " << m_settings->Format << endl; - cerr << "Please see help|README (?) for details on supported formats " << endl; + cerr << "bamtools convert ERROR: unrecognized format: " << m_settings->Format << endl; + cerr << "Please see documentation for list of supported formats " << endl; formatError = true; convertedOk = false; } @@ -250,7 +254,7 @@ bool ConvertTool::ConvertToolPrivate::Run(void) { // iterate through file, doing conversion BamAlignment a; while ( reader.GetNextAlignment(a) ) - (this->*pFunction)(a); + (this->*pFunction)(a); // set flag for successful conversion convertedOk = true; @@ -259,9 +263,9 @@ bool ConvertTool::ConvertToolPrivate::Run(void) { // ------------------------ // clean up & exit - reader.Close(); - if ( m_settings->HasOutput ) outFile.close(); + if ( m_settings->HasOutput ) + outFile.close(); return convertedOk; } @@ -283,10 +287,6 @@ void ConvertTool::ConvertToolPrivate::PrintBed(const BamAlignment& a) { << (a.IsReverseStrand() ? "-" : "+") << endl; } -void ConvertTool::ConvertToolPrivate::PrintBedGraph(const BamAlignment& a) { - ; -} - // print BamAlignment in FASTA format // N.B. 
- uses QueryBases NOT AlignedBases void ConvertTool::ConvertToolPrivate::PrintFasta(const BamAlignment& a) { @@ -376,11 +376,12 @@ void ConvertTool::ConvertToolPrivate::PrintJson(const BamAlignment& a) { if ( !cigarData.empty() ) { m_out << "\"cigar\":["; vector::const_iterator cigarBegin = cigarData.begin(); - vector::const_iterator cigarIter = cigarBegin; - vector::const_iterator cigarEnd = cigarData.end(); + vector::const_iterator cigarIter = cigarBegin; + vector::const_iterator cigarEnd = cigarData.end(); for ( ; cigarIter != cigarEnd; ++cigarIter ) { const CigarOp& op = (*cigarIter); - if (cigarIter != cigarBegin) m_out << ","; + if (cigarIter != cigarBegin) + m_out << ","; m_out << "\"" << op.Length << op.Type << "\""; } m_out << "],"; @@ -403,23 +404,25 @@ void ConvertTool::ConvertToolPrivate::PrintJson(const BamAlignment& a) { string::const_iterator s = a.Qualities.begin(); m_out << "\"qualities\":[" << static_cast(*s) - 33; ++s; - for (; s != a.Qualities.end(); ++s) { + for ( ; s != a.Qualities.end(); ++s ) m_out << "," << static_cast(*s) - 33; - } m_out << "],"; } + // write alignment's source BAM file + m_out << "\"filename\":" << a.Filename << ","; + // write tag data const char* tagData = a.TagData.c_str(); const size_t tagDataLength = a.TagData.length(); size_t index = 0; - if (index < tagDataLength) { + if ( index < tagDataLength ) { m_out << "\"tags\":{"; while ( index < tagDataLength ) { - if (index > 0) + if ( index > 0 ) m_out << ","; // write tag name @@ -429,55 +432,45 @@ void ConvertTool::ConvertToolPrivate::PrintJson(const BamAlignment& a) { // get data type char type = a.TagData.at(index); ++index; - - switch (type) { - case('A') : + switch ( type ) { + case (Constants::BAM_TAG_TYPE_ASCII) : m_out << "\"" << tagData[index] << "\""; ++index; break; - case('C') : + case (Constants::BAM_TAG_TYPE_INT8) : + case (Constants::BAM_TAG_TYPE_UINT8) : m_out << (int)tagData[index]; ++index; break; - case('c') : - m_out << (int)tagData[index]; - ++index; 
+ case (Constants::BAM_TAG_TYPE_INT16) : + m_out << BamTools::UnpackSignedShort(&tagData[index]); + index += sizeof(int16_t); break; - - case('S') : - m_out << BgzfData::UnpackUnsignedShort(&tagData[index]); - index += 2; + + case (Constants::BAM_TAG_TYPE_UINT16) : + m_out << BamTools::UnpackUnsignedShort(&tagData[index]); + index += sizeof(uint16_t); break; - case('s') : - m_out << BgzfData::UnpackSignedShort(&tagData[index]); - index += 2; - break; - - case('I') : - m_out << BgzfData::UnpackUnsignedInt(&tagData[index]); - index += 4; - break; - - case('i') : - m_out << BgzfData::UnpackSignedInt(&tagData[index]); - index += 4; + case (Constants::BAM_TAG_TYPE_INT32) : + m_out << BamTools::UnpackSignedInt(&tagData[index]); + index += sizeof(int32_t); break; - - case('f') : - m_out << BgzfData::UnpackFloat(&tagData[index]); - index += 4; + + case (Constants::BAM_TAG_TYPE_UINT32) : + m_out << BamTools::UnpackUnsignedInt(&tagData[index]); + index += sizeof(uint32_t); break; - - case('d') : - m_out << BgzfData::UnpackDouble(&tagData[index]); - index += 8; + + case (Constants::BAM_TAG_TYPE_FLOAT) : + m_out << BamTools::UnpackFloat(&tagData[index]); + index += sizeof(float); break; - case('Z') : - case('H') : + case (Constants::BAM_TAG_TYPE_HEX) : + case (Constants::BAM_TAG_TYPE_STRING) : m_out << "\""; while (tagData[index]) { if (tagData[index] == '\"') @@ -534,19 +527,26 @@ void ConvertTool::ConvertToolPrivate::PrintSam(const BamAlignment& a) { // write mate reference name, mate position, & insert size if ( a.IsPaired() && (a.MateRefID >= 0) && (a.MateRefID < (int)m_references.size()) ) { - if ( a.MateRefID == a.RefID ) m_out << "=\t"; - else m_out << m_references[a.MateRefID].RefName << "\t"; + if ( a.MateRefID == a.RefID ) + m_out << "=\t"; + else + m_out << m_references[a.MateRefID].RefName << "\t"; m_out << a.MatePosition+1 << "\t" << a.InsertSize << "\t"; } - else m_out << "*\t0\t0\t"; + else + m_out << "*\t0\t0\t"; // write sequence - if ( a.QueryBases.empty() ) 
m_out << "*\t"; - else m_out << a.QueryBases << "\t"; + if ( a.QueryBases.empty() ) + m_out << "*\t"; + else + m_out << a.QueryBases << "\t"; // write qualities - if ( a.Qualities.empty() ) m_out << "*"; - else m_out << a.Qualities; + if ( a.Qualities.empty() ) + m_out << "*"; + else + m_out << a.Qualities; // write tag data const char* tagData = a.TagData.c_str(); @@ -563,61 +563,52 @@ void ConvertTool::ConvertToolPrivate::PrintSam(const BamAlignment& a) { // get data type char type = a.TagData.at(index); ++index; - switch (type) { - case('A') : - m_out << "A:" << tagData[index]; - ++index; - break; - - case('C') : - m_out << "i:" << (int)tagData[index]; - ++index; + switch ( type ) { + case (Constants::BAM_TAG_TYPE_ASCII) : + m_out << "A:" << tagData[index]; + ++index; break; - - case('c') : + + case (Constants::BAM_TAG_TYPE_INT8) : + case (Constants::BAM_TAG_TYPE_UINT8) : m_out << "i:" << (int)tagData[index]; - ++index; + ++index; break; - - case('S') : - m_out << "i:" << BgzfData::UnpackUnsignedShort(&tagData[index]); - index += 2; - break; - - case('s') : - m_out << "i:" << BgzfData::UnpackSignedShort(&tagData[index]); - index += 2; + + case (Constants::BAM_TAG_TYPE_INT16) : + m_out << "i:" << BamTools::UnpackSignedShort(&tagData[index]); + index += sizeof(int16_t); break; - - case('I') : - m_out << "i:" << BgzfData::UnpackUnsignedInt(&tagData[index]); - index += 4; + + case (Constants::BAM_TAG_TYPE_UINT16) : + m_out << "i:" << BamTools::UnpackUnsignedShort(&tagData[index]); + index += sizeof(uint16_t); break; - - case('i') : - m_out << "i:" << BgzfData::UnpackSignedInt(&tagData[index]); - index += 4; + + case (Constants::BAM_TAG_TYPE_INT32) : + m_out << "i:" << BamTools::UnpackSignedInt(&tagData[index]); + index += sizeof(int32_t); break; - - case('f') : - m_out << "f:" << BgzfData::UnpackFloat(&tagData[index]); - index += 4; + + case (Constants::BAM_TAG_TYPE_UINT32) : + m_out << "i:" << BamTools::UnpackUnsignedInt(&tagData[index]); + index += 
sizeof(uint32_t); break; - - case('d') : - m_out << "d:" << BgzfData::UnpackDouble(&tagData[index]); - index += 8; + + case (Constants::BAM_TAG_TYPE_FLOAT) : + m_out << "f:" << BamTools::UnpackFloat(&tagData[index]); + index += sizeof(float); break; - - case('Z') : - case('H') : + + case (Constants::BAM_TAG_TYPE_HEX) : + case (Constants::BAM_TAG_TYPE_STRING) : m_out << type << ":"; while (tagData[index]) { m_out << tagData[index]; ++index; } - ++index; - break; + ++index; + break; } if ( tagData[index] == '\0') @@ -627,10 +618,6 @@ void ConvertTool::ConvertToolPrivate::PrintSam(const BamAlignment& a) { m_out << endl; } -void ConvertTool::ConvertToolPrivate::PrintWiggle(const BamAlignment& a) { - ; -} - // Print BamAlignment in YAML format void ConvertTool::ConvertToolPrivate::PrintYaml(const BamAlignment& a) { @@ -639,28 +626,29 @@ void ConvertTool::ConvertToolPrivate::PrintYaml(const BamAlignment& a) { m_out << a.Name << ":" << endl; // write alignment data - m_out << " " << "AlndBases: " << a.AlignedBases << endl; - m_out << " " << "Qualities: " << a.Qualities << endl; - m_out << " " << "Name: " << a.Name << endl; - m_out << " " << "Length: " << a.Length << endl; - m_out << " " << "TagData: " << a.TagData << endl; - m_out << " " << "RefID: " << a.RefID << endl; - m_out << " " << "RefName: " << m_references[a.RefID].RefName << endl; - m_out << " " << "Position: " << a.Position << endl; - m_out << " " << "Bin: " << a.Bin << endl; - m_out << " " << "MapQuality: " << a.MapQuality << endl; - m_out << " " << "AlignmentFlag: " << a.AlignmentFlag << endl; - m_out << " " << "MateRefID: " << a.MateRefID << endl; - m_out << " " << "MatePosition: " << a.MatePosition << endl; - m_out << " " << "InsertSize: " << a.InsertSize << endl; + m_out << " " << "AlndBases: " << a.AlignedBases << endl; + m_out << " " << "Qualities: " << a.Qualities << endl; + m_out << " " << "Name: " << a.Name << endl; + m_out << " " << "Length: " << a.Length << endl; + m_out << " " << "TagData: " << 
a.TagData << endl; + m_out << " " << "RefID: " << a.RefID << endl; + m_out << " " << "RefName: " << m_references[a.RefID].RefName << endl; + m_out << " " << "Position: " << a.Position << endl; + m_out << " " << "Bin: " << a.Bin << endl; + m_out << " " << "MapQuality: " << a.MapQuality << endl; + m_out << " " << "AlignmentFlag: " << a.AlignmentFlag << endl; + m_out << " " << "MateRefID: " << a.MateRefID << endl; + m_out << " " << "MatePosition: " << a.MatePosition << endl; + m_out << " " << "InsertSize: " << a.InsertSize << endl; + m_out << " " << "Filename: " << a.Filename << endl; // write Cigar data const vector& cigarData = a.CigarData; if ( !cigarData.empty() ) { m_out << " " << "Cigar: "; vector::const_iterator cigarBegin = cigarData.begin(); - vector::const_iterator cigarIter = cigarBegin; - vector::const_iterator cigarEnd = cigarData.end(); + vector::const_iterator cigarIter = cigarBegin; + vector::const_iterator cigarEnd = cigarData.end(); for ( ; cigarIter != cigarEnd; ++cigarIter ) { const CigarOp& op = (*cigarIter); m_out << op.Length << op.Type; @@ -799,7 +787,7 @@ void ConvertPileupFormatVisitor::Visit(const PileupPosition& pileupData ) { char referenceBase('N'); if ( m_hasFasta && (pileupData.Position < m_references[pileupData.RefId].RefLength) ) { if ( !m_fasta.GetBase(pileupData.RefId, pileupData.Position, referenceBase ) ) { - cerr << "Pileup error : Could not read reference base from FASTA file" << endl; + cerr << "bamtools convert ERROR: pileup conversion - could not read reference base from FASTA file" << endl; return; } } @@ -836,11 +824,11 @@ void ConvertPileupFormatVisitor::Visit(const PileupPosition& pileupData ) { toupper(base) == toupper(referenceBase) || tolower(base) == tolower(referenceBase) ) { - base = (ba.IsReverseStrand() ? ',' : '.' ); + base = ( ba.IsReverseStrand() ? ',' : '.' ); } // mismatches reference - else base = (ba.IsReverseStrand() ? tolower(base) : toupper(base) ); + else base = ( ba.IsReverseStrand() ? 
tolower(base) : toupper(base) ); // store base bases << base; @@ -861,7 +849,7 @@ void ConvertPileupFormatVisitor::Visit(const PileupPosition& pileupData ) { char deletedBase('N'); if ( m_hasFasta && (pileupData.Position+i < m_references[pileupData.RefId].RefLength) ) { if ( !m_fasta.GetBase(pileupData.RefId, pileupData.Position+i, deletedBase ) ) { - cerr << "Pileup error : Could not read reference base from FASTA file" << endl; + cerr << "bamtools convert ERROR: pileup conversion - could not read reference base from FASTA file" << endl; return; } } @@ -874,7 +862,8 @@ void ConvertPileupFormatVisitor::Visit(const PileupPosition& pileupData ) { else bases << '*'; // if end of read segment - if ( pa.IsSegmentEnd ) bases << '$'; + if ( pa.IsSegmentEnd ) + bases << '$'; // store current base quality baseQualities << ba.Qualities.at(pa.PositionInAlignment); diff --git a/src/toolkit/bamtools_count.cpp b/src/toolkit/bamtools_count.cpp index 0a0620a..40e7c5d 100644 --- a/src/toolkit/bamtools_count.cpp +++ b/src/toolkit/bamtools_count.cpp @@ -3,24 +3,23 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. 
// --------------------------------------------------------------------------- -// Last modified: 3 September 2010 +// Last modified: 23 March 2011 // --------------------------------------------------------------------------- // Prints alignment count for BAM file(s) // *************************************************************************** +#include "bamtools_count.h" + +#include +#include +#include +using namespace BamTools; + #include #include #include - -#include "bamtools_count.h" -#include "bamtools_options.h" -#include "bamtools_utilities.h" -#include "BamReader.h" -#include "BamMultiReader.h" - using namespace std; -using namespace BamTools; - + // --------------------------------------------- // CountSettings implementation @@ -78,8 +77,8 @@ int CountTool::Run(int argc, char* argv[]) { // open reader without index BamMultiReader reader; - if (!reader.Open(m_settings->InputFiles, false, true)) { - cerr << "ERROR: Could not open input BAM file(s)... Aborting." << endl; + if ( !reader.Open(m_settings->InputFiles) ) { + cerr << "bamtools count ERROR: could not open input BAM file(s)... Aborting." << endl; return 1; } @@ -100,22 +99,15 @@ int CountTool::Run(int argc, char* argv[]) { BamRegion region; if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) { - // attempt to re-open reader with index files - reader.Close(); - bool openedOK = reader.Open(m_settings->InputFiles, true, true ); - - // if error - if ( !openedOK ) { - cerr << "ERROR: Could not open input BAM file(s)... Aborting." 
<< endl; - return 1; - } - - // if index data available, we can use SetRegion + // attempt to find index files + reader.LocateIndexes(); + + // if index data available for all BAM files, we can use SetRegion if ( reader.IsIndexLoaded() ) { - // attempt to use SetRegion(), if failed report error + // attempt to set region on reader if ( !reader.SetRegion(region.LeftRefID, region.LeftPosition, region.RightRefID, region.RightPosition) ) { - cerr << "ERROR: Region requested, but could not set BamReader region to REGION: " << m_settings->Region << " Aborting." << endl; + cerr << "bamtools count ERROR: set region failed. Check that REGION describes a valid range" << endl; reader.Close(); return 1; } @@ -128,7 +120,7 @@ int CountTool::Run(int argc, char* argv[]) { // no index data available, we have to iterate through until we // find overlapping alignments else { - while( reader.GetNextAlignmentCore(al) ) { + while ( reader.GetNextAlignmentCore(al) ) { if ( (al.RefID >= region.LeftRefID) && ( (al.Position + al.Length) >= region.LeftPosition ) && (al.RefID <= region.RightRefID) && ( al.Position <= region.RightPosition) ) { @@ -140,8 +132,9 @@ int CountTool::Run(int argc, char* argv[]) { // error parsing REGION string else { - cerr << "ERROR: Could not parse REGION - " << m_settings->Region << endl; - cerr << "Be sure REGION is in valid format (see README) and that coordinates are valid for selected references" << endl; + cerr << "bamtools count ERROR: could not parse REGION - " << m_settings->Region << endl; + cerr << "Check that REGION is in valid format (see documentation) and that the coordinates are valid" + << endl; reader.Close(); return 1; } @@ -150,7 +143,7 @@ int CountTool::Run(int argc, char* argv[]) { // print results cout << alignmentCount << endl; - // clean & exit + // clean up & exit reader.Close(); return 0; } diff --git a/src/toolkit/bamtools_coverage.cpp b/src/toolkit/bamtools_coverage.cpp index 8a7fce2..748f513 100644 --- 
a/src/toolkit/bamtools_coverage.cpp +++ b/src/toolkit/bamtools_coverage.cpp @@ -3,21 +3,23 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 16 September 2010 +// Last modified: 21 March 2011 // --------------------------------------------------------------------------- // Prints coverage data for a single BAM file // *************************************************************************** +#include "bamtools_coverage.h" + +#include +#include +#include +using namespace BamTools; + #include #include #include #include -#include "bamtools_coverage.h" -#include "bamtools_options.h" -#include "bamtools_pileup_engine.h" -#include "BamReader.h" using namespace std; -using namespace BamTools; namespace BamTools { @@ -109,7 +111,8 @@ bool CoverageTool::CoverageToolPrivate::Run(void) { // open output file stream outFile.open(m_settings->OutputFilename.c_str()); if ( !outFile ) { - cerr << "Could not open " << m_settings->OutputFilename << " for output." << endl; + cerr << "bamtools coverage ERROR: could not open " << m_settings->OutputFilename + << " for output" << endl; return false; } @@ -119,10 +122,12 @@ bool CoverageTool::CoverageToolPrivate::Run(void) { //open our BAM reader BamReader reader; - if (!reader.Open(m_settings->InputBamFilename)) { - cerr << "Could not open " << m_settings->InputBamFilename << " for reading." 
<< endl; + if ( !reader.Open(m_settings->InputBamFilename) ) { + cerr << "bamtools coverage ERROR: could not open input BAM file: " << m_settings->InputBamFilename << endl; return false; } + + // retrieve references m_references = reader.GetReferenceData(); // set up our output 'visitor' @@ -139,7 +144,8 @@ bool CoverageTool::CoverageToolPrivate::Run(void) { // clean up reader.Close(); - if ( m_settings->HasOutputFile ) outFile.close(); + if ( m_settings->HasOutputFile ) + outFile.close(); delete cv; cv = 0; diff --git a/src/toolkit/bamtools_filter.cpp b/src/toolkit/bamtools_filter.cpp index 8a816c3..023fbc9 100644 --- a/src/toolkit/bamtools_filter.cpp +++ b/src/toolkit/bamtools_filter.cpp @@ -3,27 +3,29 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 17 November 2010 +// Last modified: 21 March 2011 // --------------------------------------------------------------------------- // Filters BAM file(s) according to some user-specified criteria. 
// *************************************************************************** +#include "bamtools_filter.h" + +#include +#include +#include +#include +#include +using namespace BamTools; + +#include +using namespace Json; + #include #include #include #include #include -#include "bamtools_filter.h" -#include "bamtools_filter_engine.h" -#include "bamtools_options.h" -#include "bamtools_utilities.h" -#include "BamReader.h" -#include "BamMultiReader.h" -#include "BamWriter.h" -#include "jsoncpp/json.h" using namespace std; -using namespace BamTools; -using namespace Json; namespace BamTools { @@ -377,8 +379,8 @@ FilterTool::FilterTool(void) OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", m_settings->HasInputBamFilename, m_settings->InputFiles, IO_Opts, Options::StandardIn()); Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutputBamFilename, m_settings->OutputFilename, IO_Opts, Options::StandardOut()); - Options::AddValueOption("-region", "REGION", "only read data from this genomic region (see README for more details)", "", m_settings->HasRegion, m_settings->Region, IO_Opts); - Options::AddValueOption("-script", "filename", "the filter script file (see README for more details)", "", m_settings->HasScriptFilename, m_settings->ScriptFilename, IO_Opts); + Options::AddValueOption("-region", "REGION", "only read data from this genomic region (see documentation for more details)", "", m_settings->HasRegion, m_settings->Region, IO_Opts); + Options::AddValueOption("-script", "filename", "the filter script file (see documentation for more details)", "", m_settings->HasScriptFilename, m_settings->ScriptFilename, IO_Opts); Options::AddOption("-forceCompression", "if results are sent to stdout (like when piping to another tool), default behavior is to leave output uncompressed. 
Use this flag to override and force compression", m_settings->IsForceCompression, IO_Opts); OptionGroup* FilterOpts = Options::CreateOptionGroup("General Filters"); @@ -387,7 +389,7 @@ FilterTool::FilterTool(void) Options::AddValueOption("-mapQuality", "[0-255]", "keep reads with map quality that matches pattern", "", m_settings->HasMapQualityFilter, m_settings->MapQualityFilter, FilterOpts); Options::AddValueOption("-name", "string", "keep reads with name that matches pattern", "", m_settings->HasNameFilter, m_settings->NameFilter, FilterOpts); Options::AddValueOption("-queryBases", "string", "keep reads with motif that mathces pattern", "", m_settings->HasQueryBasesFilter, m_settings->QueryBasesFilter, FilterOpts); - Options::AddValueOption("-tag", "TAG:VALUE", "keep reads with this key=>value pair", "", m_settings->HasTagFilter, m_settings->TagFilter, FilterOpts); + Options::AddValueOption("-tag", "TAG:VALUE", "keep reads with this key=>value pair", "", m_settings->HasTagFilter, m_settings->TagFilter, FilterOpts); OptionGroup* AlignmentFlagOpts = Options::CreateOptionGroup("Alignment Flag Filters"); Options::AddValueOption("-isDuplicate", "true/false", "keep only alignments that are marked as duplicate?", "", m_settings->HasIsDuplicateFilter, m_settings->IsDuplicateFilter, AlignmentFlagOpts, TRUE_STR); @@ -439,8 +441,10 @@ FilterTool::FilterToolPrivate::FilterToolPrivate(FilterTool::FilterSettings* set // destructor FilterTool::FilterToolPrivate::~FilterToolPrivate(void) { } -bool FilterTool::FilterToolPrivate::AddPropertyTokensToFilter(const string& filterName, const map& propertyTokens) { - +bool FilterTool::FilterToolPrivate::AddPropertyTokensToFilter(const string& filterName, + const map& propertyTokens) +{ // dummy temp values for token parsing bool boolValue; int32_t int32Value; @@ -515,7 +519,7 @@ bool FilterTool::FilterToolPrivate::AddPropertyTokensToFilter(const string& filt m_filterEngine.setProperty(filterName, propertyName, stringValue, type); } - 
else if (propertyName == TAG_PROPERTY ) { + else if ( propertyName == TAG_PROPERTY ) { // this will be stored directly as the TAG:VALUE token // (VALUE may contain compare ops, will be parsed out later) m_filterEngine.setProperty(filterName, propertyName, token, PropertyFilterValue::EXACT); @@ -523,7 +527,7 @@ bool FilterTool::FilterToolPrivate::AddPropertyTokensToFilter(const string& filt // else unknown property else { - cerr << "Unknown property: " << propertyName << "!" << endl; + cerr << "bamtools filter ERROR: unknown property - " << propertyName << endl; return false; } } @@ -539,7 +543,8 @@ const string FilterTool::FilterToolPrivate::GetScriptContents(void) { // open script for reading FILE* inFile = fopen(m_settings->ScriptFilename.c_str(), "rb"); if ( !inFile ) { - cerr << "FilterTool error: Could not open script: " << m_settings->ScriptFilename << " for reading" << endl; + cerr << "bamtools filter ERROR: could not open script: " + << m_settings->ScriptFilename << " for reading" << endl; return false; } @@ -555,7 +560,7 @@ const string FilterTool::FilterToolPrivate::GetScriptContents(void) { // read next block of data if ( fgets(buffer, 1024, inFile) == 0 ) { - cerr << "FilterTool error : could not read from script" << endl; + cerr << "bamtools filter ERROR: could not read script contents" << endl; return false; } @@ -671,7 +676,8 @@ bool FilterTool::FilterToolPrivate::ParseScript(void) { Json::Reader reader; if ( !reader.parse(document, root) ) { // use built-in error reporting mechanism to alert user what was wrong with the script - cerr << "Failed to parse configuration\n" << reader.getFormatedErrorMessages(); + cerr << "bamtools filter ERROR: failed to parse script - see error message(s) below" << endl + << reader.getFormatedErrorMessages(); return false; } @@ -740,24 +746,32 @@ bool FilterTool::FilterToolPrivate::Run(void) { // open reader without index BamMultiReader reader; - if ( !reader.Open(m_settings->InputFiles, false, false) ) { - cerr << 
"Could not open input files for reading." << endl; + if ( !reader.Open(m_settings->InputFiles) ) { + cerr << "bamtools filter ERROR: could not open input files for reading." << endl; return false; } + + // retrieve reader header & reference data const string headerText = reader.GetHeaderText(); filterToolReferences = reader.GetReferenceData(); - // open writer + // determine compression mode for BamWriter + bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() && + !m_settings->IsForceCompression ); + BamWriter::CompressionMode compressionMode = BamWriter::Compressed; + if ( writeUncompressed ) compressionMode = BamWriter::Uncompressed; + + // open BamWriter BamWriter writer; - bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() && !m_settings->IsForceCompression ); - if ( !writer.Open(m_settings->OutputFilename, headerText, filterToolReferences, writeUncompressed) ) { - cerr << "Could not open " << m_settings->OutputFilename << " for writing." << endl; + writer.SetCompressionMode(compressionMode); + if ( !writer.Open(m_settings->OutputFilename, headerText, filterToolReferences) ) { + cerr << "bamtools filter ERROR: could not open " << m_settings->OutputFilename << " for writing." << endl; + reader.Close(); return false; } - - BamAlignment al; - + // if no region specified, filter entire file + BamAlignment al; if ( !m_settings->HasRegion ) { while ( reader.GetNextAlignment(al) ) { if ( CheckAlignment(al) ) @@ -772,22 +786,15 @@ bool FilterTool::FilterToolPrivate::Run(void) { BamRegion region; if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) { - // attempt to re-open reader with index files - reader.Close(); - bool openedOK = reader.Open(m_settings->InputFiles, true, false ); - - // if error - if ( !openedOK ) { - cerr << "ERROR: Could not open input BAM file(s)... Aborting." 
<< endl; - return false; - } - - // if index data available, we can use SetRegion - if ( reader.IsIndexLoaded() ) { - + // attempt to find index files + reader.LocateIndexes(); + + // if index data available for all BAM files, we can use SetRegion + if ( reader.HasIndexes() ) { + // attempt to use SetRegion(), if failed report error if ( !reader.SetRegion(region.LeftRefID, region.LeftPosition, region.RightRefID, region.RightPosition) ) { - cerr << "ERROR: Region requested, but could not set BamReader region to REGION: " << m_settings->Region << " Aborting." << endl; + cerr << "bamtools filter ERROR: set region failed. Check that REGION describes a valid range" << endl; reader.Close(); return false; } @@ -814,8 +821,9 @@ bool FilterTool::FilterToolPrivate::Run(void) { // error parsing REGION string else { - cerr << "ERROR: Could not parse REGION - " << m_settings->Region << endl; - cerr << "Be sure REGION is in valid format (see README) and that coordinates are valid for selected references" << endl; + cerr << "bamtools filter ERROR: could not parse REGION: " << m_settings->Region << endl; + cerr << "Check that REGION is in valid format (see documentation) and that the coordinates are valid" + << endl; reader.Close(); return false; } @@ -829,11 +837,12 @@ bool FilterTool::FilterToolPrivate::Run(void) { bool FilterTool::FilterToolPrivate::SetupFilters(void) { - // add known properties to FilterEngine + // set up filter engine with supported properties InitProperties(); // parse script for filter rules, if given - if ( m_settings->HasScriptFilename ) return ParseScript(); + if ( m_settings->HasScriptFilename ) + return ParseScript(); // otherwise check command line for filters else return ParseCommandLine(); diff --git a/src/toolkit/bamtools_header.cpp b/src/toolkit/bamtools_header.cpp index 180ef9e..aad413f 100644 --- a/src/toolkit/bamtools_header.cpp +++ b/src/toolkit/bamtools_header.cpp @@ -3,21 +3,22 @@ // Marth Lab, Department of Biology, Boston College // All 
rights reserved. // --------------------------------------------------------------------------- -// Last modified: 1 June 2010 +// Last modified: 21 March 2011 // --------------------------------------------------------------------------- // Prints the SAM-style header from a single BAM file ( or merged header from // multiple BAM files) to stdout // *************************************************************************** +#include "bamtools_header.h" + +#include +#include +using namespace BamTools; + #include #include #include -#include "bamtools_header.h" -#include "bamtools_options.h" -#include "BamReader.h" -#include "BamMultiReader.h" using namespace std; -using namespace BamTools; // --------------------------------------------- // HeaderSettings implementation @@ -70,12 +71,17 @@ int HeaderTool::Run(int argc, char* argv[]) { if ( !m_settings->HasInputBamFilename ) m_settings->InputFiles.push_back(Options::StandardIn()); - // if able to open files, dump (merged) header contents to stdout + // attemp to open BAM files BamMultiReader reader; - if ( reader.Open(m_settings->InputFiles, false) ) - cout << reader.GetHeaderText() << endl; - + if ( !reader.Open(m_settings->InputFiles) ) { + cerr << "bamtools header ERROR: could not open BAM file(s) for reading... Aborting." << endl; + return 1; + } + + // dump (merged) header contents to stdout + cout << reader.GetHeaderText() << endl; + // clean up & exit reader.Close(); return 0; -} \ No newline at end of file +} diff --git a/src/toolkit/bamtools_index.cpp b/src/toolkit/bamtools_index.cpp index 90339c4..6e5a86d 100644 --- a/src/toolkit/bamtools_index.cpp +++ b/src/toolkit/bamtools_index.cpp @@ -3,21 +3,21 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. 
// --------------------------------------------------------------------------- -// Last modified: 2 September 2010 +// Last modified: 21 March 2011 // --------------------------------------------------------------------------- -// Creates a BAM index (".bai") file for the provided BAM file. +// Creates a BAM index file. // *************************************************************************** -#include -#include - #include "bamtools_index.h" -#include "bamtools_options.h" -#include "BamReader.h" -using namespace std; +#include +#include using namespace BamTools; +#include +#include +using namespace std; + // --------------------------------------------- // IndexSettings implementation @@ -71,11 +71,15 @@ int IndexTool::Run(int argc, char* argv[]) { // open our BAM reader BamReader reader; - reader.Open(m_settings->InputBamFilename); + if ( !reader.Open(m_settings->InputBamFilename) ) { + cerr << "bamtools index ERROR: could not open BAM file: " << m_settings->InputBamFilename << endl; + return 1; + } // create index for BAM file - bool useDefaultIndex = !m_settings->IsUsingBamtoolsIndex; - reader.CreateIndex(useDefaultIndex); + const BamIndex::IndexType type = ( m_settings->IsUsingBamtoolsIndex ? BamIndex::BAMTOOLS + : BamIndex::STANDARD ); + reader.CreateIndex(type); // clean & exit reader.Close(); diff --git a/src/toolkit/bamtools_index.h b/src/toolkit/bamtools_index.h index bb6d893..0c6c58f 100644 --- a/src/toolkit/bamtools_index.h +++ b/src/toolkit/bamtools_index.h @@ -3,9 +3,9 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 1 June 2010 +// Last modified: 18 January 2011 // --------------------------------------------------------------------------- -// Creates a BAM index (".bai") file for the provided BAM file +// Creates a BAM index file. 
// *************************************************************************** #ifndef BAMTOOLS_INDEX_H diff --git a/src/toolkit/bamtools_merge.cpp b/src/toolkit/bamtools_merge.cpp index e2cc4a1..fc3675e 100644 --- a/src/toolkit/bamtools_merge.cpp +++ b/src/toolkit/bamtools_merge.cpp @@ -3,21 +3,23 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 13 October 2010 +// Last modified: 21 March 2011 // --------------------------------------------------------------------------- // Merges multiple BAM files into one. // *************************************************************************** +#include "bamtools_merge.h" + +#include +#include +#include +#include +using namespace BamTools; + #include #include #include -#include "bamtools_merge.h" -#include "bamtools_options.h" -#include "bamtools_utilities.h" -#include "BamMultiReader.h" -#include "BamWriter.h" using namespace std; -using namespace BamTools; // --------------------------------------------- // MergeSettings implementation @@ -72,7 +74,7 @@ MergeTool::~MergeTool(void) { int MergeTool::Help(void) { Options::DisplayHelp(); - return 0; + return 0; } int MergeTool::Run(int argc, char* argv[]) { @@ -86,8 +88,8 @@ int MergeTool::Run(int argc, char* argv[]) { // opens the BAM files (by default without checking for indexes) BamMultiReader reader; - if ( !reader.Open(m_settings->InputFiles, false, true) ) { - cerr << "ERROR: Could not open input BAM file(s)... Aborting." << endl; + if ( !reader.Open(m_settings->InputFiles) ) { + cerr << "bamtools merge ERROR: could not open input BAM file(s)... Aborting." 
<< endl; return 1; } @@ -95,15 +97,21 @@ int MergeTool::Run(int argc, char* argv[]) { std::string mergedHeader = reader.GetHeaderText(); RefVector references = reader.GetReferenceData(); - // open writer + // determine compression mode for BamWriter + bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() && + !m_settings->IsForceCompression ); + BamWriter::CompressionMode compressionMode = BamWriter::Compressed; + if ( writeUncompressed ) compressionMode = BamWriter::Uncompressed; + + // open BamWriter BamWriter writer; - bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() && !m_settings->IsForceCompression ); - if ( !writer.Open(m_settings->OutputFilename, mergedHeader, references, writeUncompressed) ) { - cerr << "ERROR: Could not open BAM file " << m_settings->OutputFilename << " for writing... Aborting." << endl; + writer.SetCompressionMode(compressionMode); + if ( !writer.Open(m_settings->OutputFilename, mergedHeader, references) ) { + cerr << "bamtools merge ERROR: could not open " << m_settings->OutputFilename << " for writing." << endl; reader.Close(); - return 1; + return false; } - + // if no region specified, store entire contents of file(s) if ( !m_settings->HasRegion ) { BamAlignment al; @@ -118,22 +126,15 @@ int MergeTool::Run(int argc, char* argv[]) { BamRegion region; if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) { - // attempt to re-open reader with index files - reader.Close(); - bool openedOK = reader.Open(m_settings->InputFiles, true, true ); - - // if error - if ( !openedOK ) { - cerr << "ERROR: Could not open input BAM file(s)... Aborting." 
<< endl; - return 1; - } - - // if index data available, we can use SetRegion - if ( reader.IsIndexLoaded() ) { - + // attempt to find index files + reader.LocateIndexes(); + + // if index data available for all BAM files, we can use SetRegion + if ( reader.HasIndexes() ) { + // attempt to use SetRegion(), if failed report error if ( !reader.SetRegion(region.LeftRefID, region.LeftPosition, region.RightRefID, region.RightPosition) ) { - cerr << "ERROR: Region requested, but could not set BamReader region to REGION: " << m_settings->Region << " Aborting." << endl; + cerr << "bamtools merge ERROR: set region failed. Check that REGION describes a valid range" << endl; reader.Close(); return 1; } @@ -160,8 +161,9 @@ int MergeTool::Run(int argc, char* argv[]) { // error parsing REGION string else { - cerr << "ERROR: Could not parse REGION - " << m_settings->Region << endl; - cerr << "Be sure REGION is in valid format (see README) and that coordinates are valid for selected references" << endl; + cerr << "bamtools merge ERROR: could not parse REGION - " << m_settings->Region << endl; + cerr << "Check that REGION is in valid format (see documentation) and that the coordinates are valid" + << endl; reader.Close(); writer.Close(); return 1; diff --git a/src/toolkit/bamtools_random.cpp b/src/toolkit/bamtools_random.cpp index ae6d63a..bca9861 100644 --- a/src/toolkit/bamtools_random.cpp +++ b/src/toolkit/bamtools_random.cpp @@ -3,23 +3,25 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 3 September 2010 (DB) +// Last modified: 21 March 2011 (DB) // --------------------------------------------------------------------------- // Grab a random subset of alignments. 
// *************************************************************************** +#include "bamtools_random.h" + +#include +#include +#include +#include +using namespace BamTools; + #include #include #include #include #include -#include "bamtools_random.h" -#include "bamtools_options.h" -#include "bamtools_utilities.h" -#include "BamMultiReader.h" -#include "BamWriter.h" using namespace std; -using namespace BamTools; namespace BamTools { @@ -27,7 +29,7 @@ namespace BamTools { const unsigned int RANDOM_MAX_ALIGNMENT_COUNT = 10000; // utility methods for RandomTool -const int getRandomInt(const int& lowerBound, const int& upperBound) { +int getRandomInt(const int& lowerBound, const int& upperBound) { const int range = (upperBound - lowerBound) + 1; return ( lowerBound + (int)(range * (double)rand()/((double)RAND_MAX + 1)) ); } @@ -107,33 +109,40 @@ int RandomTool::Run(int argc, char* argv[]) { // open our reader BamMultiReader reader; if ( !reader.Open(m_settings->InputFiles) ) { - cerr << "ERROR: Could not open input BAM file(s)." << endl; + cerr << "bamtools random ERROR: could not open input BAM file(s)... Aborting." << endl; return 1; } + // look up index files for all BAM files + reader.LocateIndexes(); + // make sure index data is available - if ( !reader.IsIndexLoaded() ) { - cerr << "ERROR: Could not load index data for all input BAM file(s)." << endl; - cerr << "\'bamtools random\' requires valid index files to provide efficient performance." << endl; + if ( !reader.HasIndexes() ) { + cerr << "bamtools random ERROR: could not load index data for all input BAM file(s)... Aborting." << endl; reader.Close(); return 1; - } // get BamReader metadata const string headerText = reader.GetHeaderText(); const RefVector references = reader.GetReferenceData(); if ( references.empty() ) { - cerr << "ERROR: No reference data available - required to perform random access throughtout input file(s)." << endl; + cerr << "bamtools random ERROR: no reference data available... 
Aborting." << endl; reader.Close(); return 1; } - // open our writer + // determine compression mode for BamWriter + bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() && + !m_settings->IsForceCompression ); + BamWriter::CompressionMode compressionMode = BamWriter::Compressed; + if ( writeUncompressed ) compressionMode = BamWriter::Uncompressed; + + // open BamWriter BamWriter writer; - bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() && !m_settings->IsForceCompression ); - if ( !writer.Open(m_settings->OutputFilename, headerText, references, writeUncompressed) ) { - cerr << "ERROR: Could not open BamWriter." << endl; + writer.SetCompressionMode(compressionMode); + if ( !writer.Open(m_settings->OutputFilename, headerText, references) ) { + cerr << "bamtools random ERROR: could not open " << m_settings->OutputFilename << " for writing... Aborting." << endl; reader.Close(); return 1; } @@ -141,8 +150,9 @@ int RandomTool::Run(int argc, char* argv[]) { // if user specified a REGION constraint, attempt to parse REGION string BamRegion region; if ( m_settings->HasRegion && !Utilities::ParseRegionString(m_settings->Region, reader, region) ) { - cerr << "ERROR: Could not parse REGION: " << m_settings->Region << endl; - cerr << "Be sure REGION is in valid format (see README) and that coordinates are valid for selected references" << endl; + cerr << "bamtools random ERROR: could not parse REGION: " << m_settings->Region << endl; + cerr << "Check that REGION is in valid format (see documentation) and that the coordinates are valid" + << endl; reader.Close(); writer.Close(); return 1; @@ -199,4 +209,4 @@ int RandomTool::Run(int argc, char* argv[]) { reader.Close(); writer.Close(); return 0; -} \ No newline at end of file +} diff --git a/src/toolkit/bamtools_revert.cpp b/src/toolkit/bamtools_revert.cpp index 2c1ae96..a9da67e 100644 --- a/src/toolkit/bamtools_revert.cpp +++ b/src/toolkit/bamtools_revert.cpp @@ -3,20 
+3,22 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 13 December 2010 +// Last modified: 21 March 2011 // --------------------------------------------------------------------------- // Prints general alignment statistics for BAM file(s). // *************************************************************************** +#include "bamtools_revert.h" + +#include +#include +#include +#include +using namespace BamTools; + #include #include -#include "bamtools_revert.h" -#include "bamtools_options.h" -#include "bamtools_utilities.h" -#include "BamReader.h" -#include "BamWriter.h" using namespace std; -using namespace BamTools; // --------------------------------------------- // RevertSettings implementation @@ -101,7 +103,8 @@ bool RevertTool::RevertToolPrivate::Run(void) { // opens the BAM file without checking for indexes BamReader reader; if ( !reader.Open(m_settings->InputFilename) ) { - cerr << "Could not open input BAM file... quitting." << endl; + cerr << "bamtools revert ERROR: could not open " << m_settings->InputFilename + << " for reading... Aborting." 
<< endl; return false; } @@ -109,11 +112,19 @@ bool RevertTool::RevertToolPrivate::Run(void) { const string& headerText = reader.GetHeaderText(); const RefVector& references = reader.GetReferenceData(); - // open writer + // determine compression mode for BamWriter + bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() && + !m_settings->IsForceCompression ); + BamWriter::CompressionMode compressionMode = BamWriter::Compressed; + if ( writeUncompressed ) compressionMode = BamWriter::Uncompressed; + + // open BamWriter BamWriter writer; - bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() && !m_settings->IsForceCompression ); - if ( !writer.Open(m_settings->OutputFilename, headerText, references, writeUncompressed) ) { - cerr << "Could not open " << m_settings->OutputFilename << " for writing." << endl; + writer.SetCompressionMode(compressionMode); + if ( !writer.Open(m_settings->OutputFilename, headerText, references) ) { + cerr << "bamtools revert ERROR: could not open " << m_settings->OutputFilename + << " for writing... Aborting." << endl; + reader.Close(); return false; } @@ -139,7 +150,7 @@ RevertTool::RevertTool(void) , m_impl(0) { // set program details - Options::SetProgramInfo("bamtools revert", "removes duplicate marks and restores original (non-recalibrated) base qualities", "[-in ... ]"); + Options::SetProgramInfo("bamtools revert", "removes duplicate marks and restores original (non-recalibrated) base qualities", "[-in -in ...] [-out | [-forceCompression]] [revertOptions]"); // set up options OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); diff --git a/src/toolkit/bamtools_sort.cpp b/src/toolkit/bamtools_sort.cpp index 30de067..8d18f67 100644 --- a/src/toolkit/bamtools_sort.cpp +++ b/src/toolkit/bamtools_sort.cpp @@ -3,37 +3,38 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. 
// --------------------------------------------------------------------------- -// Last modified: 16 December 2010 (DB) +// Last modified: 21 March 2011 (DB) // --------------------------------------------------------------------------- // Sorts an input BAM file (default by position) and stores in a new BAM file. // *************************************************************************** +#include "bamtools_sort.h" + +#include +#include +#include +#include +using namespace BamTools; + #include #include #include #include #include #include - -#include "bamtools_sort.h" -#include "bamtools_options.h" -#include "BamReader.h" -#include "BamMultiReader.h" -#include "BamWriter.h" - using namespace std; -using namespace BamTools; namespace BamTools { // defaults // // ** These defaults should be tweaked & 'optimized' per testing ** // + // // I say 'optimized' because each system will naturally perform // differently. We will attempt to determine a sensible // compromise that should perform well on average. - const unsigned int SORT_DEFAULT_MAX_BUFFER_COUNT = 10000; // max numberOfAlignments for buffer - const unsigned int SORT_DEFAULT_MAX_BUFFER_MEMORY = 1024; // Mb + const unsigned int SORT_DEFAULT_MAX_BUFFER_COUNT = 500000; // max numberOfAlignments for buffer + const unsigned int SORT_DEFAULT_MAX_BUFFER_MEMORY = 1024; // Mb // ----------------------------------- // comparison objects (for sorting) @@ -197,13 +198,18 @@ bool SortTool::SortToolPrivate::GenerateSortedRuns(void) { // open input BAM file BamReader inputReader; - if (!inputReader.Open(m_settings->InputBamFilename)) { - cerr << "Could not open " << m_settings->InputBamFilename << " for reading." << endl; + if ( !inputReader.Open(m_settings->InputBamFilename) ) { + cerr << "bamtools sort ERROR: could not open " << m_settings->InputBamFilename + << " for reading... Aborting." 
<< endl; return false; } // get basic data that will be shared by all temp/output files - m_headerText = inputReader.GetHeaderText(); + SamHeader header = inputReader.GetHeader(); + header.SortOrder = ( m_settings->IsSortingByName + ? Constants::SAM_HD_SORTORDER_QUERYNAME + : Constants::SAM_HD_SORTORDER_COORDINATE ); + m_headerText = header.ToString(); m_references = inputReader.GetReferenceData(); // set up alignments buffer @@ -232,7 +238,7 @@ bool SortTool::SortToolPrivate::GenerateSortedRuns(void) { else { // iterate through file - while ( inputReader.GetNextAlignmentCore(al)) { + while ( inputReader.GetNextAlignmentCore(al) ) { // store alignments in buffer buffer.push_back(al); @@ -270,6 +276,7 @@ bool SortTool::SortToolPrivate::HandleBufferContents(vector& buffe ++m_numberOfRuns; // return success/fail of writing to temp file + // TODO: a failure returned here is not actually caught and handled anywhere return success; } @@ -279,11 +286,25 @@ bool SortTool::SortToolPrivate::MergeSortedRuns(void) { // open up multi reader for all of our temp files // this might get broken up if we do a multi-pass system later ?? BamMultiReader multiReader; - multiReader.Open(m_tempFilenames, false, true); + if ( !multiReader.Open(m_tempFilenames) ) { + cerr << "bamtools sort ERROR: could not open BamMultiReader for merging temp files... Aborting." << endl; + return false; + } + + // set sort order for merge + if ( m_settings->IsSortingByName ) + multiReader.SetSortOrder(BamMultiReader::SortedByReadName); + else + multiReader.SetSortOrder(BamMultiReader::SortedByPosition); // open writer for our completely sorted output BAM file BamWriter mergedWriter; - mergedWriter.Open(m_settings->OutputBamFilename, m_headerText, m_references); + if ( !mergedWriter.Open(m_settings->OutputBamFilename, m_headerText, m_references) ) { + cerr << "bamtools sort ERROR: could not open " << m_settings->OutputBamFilename + << " for writing... Aborting." 
<< endl; + multiReader.Close(); + return false; + } // while data available in temp files BamAlignment al; @@ -332,7 +353,11 @@ bool SortTool::SortToolPrivate::WriteTempFile(const vector& buffer // open temp file for writing BamWriter tempWriter; - tempWriter.Open(tempFilename, m_headerText, m_references); + if ( !tempWriter.Open(tempFilename, m_headerText, m_references) ) { + cerr << "bamtools sort ERROR: could not open " << tempFilename + << " for writing." << endl; + return false; + } // write data vector::const_iterator buffIter = buffer.begin(); diff --git a/src/toolkit/bamtools_split.cpp b/src/toolkit/bamtools_split.cpp index 748127f..f4d3db8 100644 --- a/src/toolkit/bamtools_split.cpp +++ b/src/toolkit/bamtools_split.cpp @@ -3,24 +3,28 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 20 September 2010 (DB) +// Last modified: 21 March 2011 (DB) // --------------------------------------------------------------------------- -// +// Splits a BAM file on user-specified property, creating a new BAM output +// file for each value found. 
// *************************************************************************** +#include "bamtools_split.h" + +#include +#include +#include +#include +#include +using namespace BamTools; + #include #include #include #include #include #include -#include "bamtools_split.h" -#include "bamtools_options.h" -#include "bamtools_variant.h" -#include "BamReader.h" -#include "BamWriter.h" using namespace std; -using namespace BamTools; namespace BamTools { @@ -162,10 +166,14 @@ void SplitTool::SplitToolPrivate::DetermineOutputFilenameStub(void) { } bool SplitTool::SplitToolPrivate::OpenReader(void) { + + // attempt to open BAM file if ( !m_reader.Open(m_settings->InputFilename) ) { - cerr << "ERROR: SplitTool could not open BAM file: " << m_settings->InputFilename << endl; + cerr << "bamtools split ERROR: could not open BAM file: " << m_settings->InputFilename << endl; return false; } + + // save file 'metadata' & return success m_header = m_reader.GetHeaderText(); m_references = m_reader.GetReferenceData(); return true; @@ -177,7 +185,8 @@ bool SplitTool::SplitToolPrivate::Run(void) { DetermineOutputFilenameStub(); // open up BamReader - if ( !OpenReader() ) return false; + if ( !OpenReader() ) + return false; // determine split type from settings if ( m_settings->IsSplittingMapped ) return SplitMapped(); @@ -186,7 +195,8 @@ bool SplitTool::SplitToolPrivate::Run(void) { if ( m_settings->IsSplittingTag ) return SplitTag(); // if we get here, no property was specified - cerr << "No property given to split on... Please use -mapped, -paired, -reference, or -tag TAG to specifiy split behavior." << endl; + cerr << "bamtools split ERROR: no property given to split on... " << endl + << "Please use -mapped, -paired, -reference, or -tag TAG to specifiy desired split behavior." 
<< endl; return false; } @@ -210,9 +220,15 @@ bool SplitTool::SplitToolPrivate::SplitMapped(void) { if ( writerIter == outputFiles.end() ) { // open new BamWriter + const string outputFilename = m_outputFilenameStub + ( isCurrentAlignmentMapped + ? SPLIT_MAPPED_TOKEN + : SPLIT_UNMAPPED_TOKEN ) + ".bam"; writer = new BamWriter; - const string outputFilename = m_outputFilenameStub + ( isCurrentAlignmentMapped ? SPLIT_MAPPED_TOKEN : SPLIT_UNMAPPED_TOKEN ) + ".bam"; - writer->Open(outputFilename, m_header, m_references); + if ( !writer->Open(outputFilename, m_header, m_references) ) { + cerr << "bamtools split ERROR: could not open " << outputFilename + << " for writing." << endl; + return false; + } // store in map outputFiles.insert( make_pair(isCurrentAlignmentMapped, writer) ); @@ -222,7 +238,7 @@ bool SplitTool::SplitToolPrivate::SplitMapped(void) { else writer = (*writerIter).second; // store alignment in proper BAM output file - if ( writer ) + if ( writer ) writer->SaveAlignment(al); } @@ -253,9 +269,15 @@ bool SplitTool::SplitToolPrivate::SplitPaired(void) { if ( writerIter == outputFiles.end() ) { // open new BamWriter + const string outputFilename = m_outputFilenameStub + ( isCurrentAlignmentPaired + ? SPLIT_PAIRED_TOKEN + : SPLIT_SINGLE_TOKEN ) + ".bam"; writer = new BamWriter; - const string outputFilename = m_outputFilenameStub + ( isCurrentAlignmentPaired ? SPLIT_PAIRED_TOKEN : SPLIT_SINGLE_TOKEN ) + ".bam"; - writer->Open(outputFilename, m_header, m_references); + if ( !writer->Open(outputFilename, m_header, m_references) ) { + cerr << "bamtool split ERROR: could not open " << outputFilename + << " for writing." 
<< endl; + return false; + } // store in map outputFiles.insert( make_pair(isCurrentAlignmentPaired, writer) ); @@ -296,11 +318,15 @@ bool SplitTool::SplitToolPrivate::SplitReference(void) { if ( writerIter == outputFiles.end() ) { // open new BamWriter - writer = new BamWriter; const string refName = m_references.at(currentRefId).RefName; const string outputFilename = m_outputFilenameStub + SPLIT_REFERENCE_TOKEN + refName + ".bam"; - writer->Open(outputFilename, m_header, m_references); - + writer = new BamWriter; + if ( !writer->Open(outputFilename, m_header, m_references) ) { + cerr << "bamtools split ERROR: could not open " << outputFilename + << " for writing." << endl; + return false; + } + // store in map outputFiles.insert( make_pair(currentRefId, writer) ); } @@ -329,32 +355,33 @@ bool SplitTool::SplitToolPrivate::SplitTag(void) { // look for tag in this alignment and get tag type char tagType(0); - if ( !al.GetTagType(m_settings->TagToSplit, tagType) ) continue; + if ( !al.GetTagType(m_settings->TagToSplit, tagType) ) + continue; // request split method based on tag type // pass it the current alignment found - switch (tagType) { + switch ( tagType ) { - case 'c' : - case 's' : - case 'i' : + case (Constants::BAM_TAG_TYPE_INT8) : + case (Constants::BAM_TAG_TYPE_INT16) : + case (Constants::BAM_TAG_TYPE_INT32) : return SplitTagImpl(al); - case 'C' : - case 'S' : - case 'I' : + case (Constants::BAM_TAG_TYPE_UINT8) : + case (Constants::BAM_TAG_TYPE_UINT16) : + case (Constants::BAM_TAG_TYPE_UINT32) : return SplitTagImpl(al); - case 'f' : + case (Constants::BAM_TAG_TYPE_FLOAT) : return SplitTagImpl(al); - case 'A': - case 'Z': - case 'H': + case (Constants::BAM_TAG_TYPE_ASCII) : + case (Constants::BAM_TAG_TYPE_STRING) : + case (Constants::BAM_TAG_TYPE_HEX) : return SplitTagImpl(al); default: - fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", tagType); + fprintf(stderr, "bamtools split ERROR: unknown tag type encountered: [%c]\n", 
tagType); return false; } } @@ -375,11 +402,14 @@ void SplitTool::SplitToolPrivate::CloseWriters(map& writers) { typedef map WriterMap; typedef typename WriterMap::iterator WriterMapIterator; + // iterate over writers WriterMapIterator writerIter = writers.begin(); WriterMapIterator writerEnd = writers.end(); for ( ; writerIter != writerEnd; ++writerIter ) { BamWriter* writer = (*writerIter).second; if (writer == 0 ) continue; + + // close & delete writer writer->Close(); delete writer; writer = 0; @@ -409,9 +439,13 @@ bool SplitTool::SplitToolPrivate::SplitTagImpl(BamAlignment& al) { if ( al.GetTag(tag, currentValue) ) { // open new BamWriter, save first alignment - writer = new BamWriter; outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam"; - writer->Open(outputFilenameStream.str(), m_header, m_references); + writer = new BamWriter; + if ( !writer->Open(outputFilenameStream.str(), m_header, m_references) ) { + cerr << "bamtools split ERROR: could not open " << outputFilenameStream.str() + << " for writing." << endl; + return false; + } writer->SaveAlignment(al); // store in map @@ -434,10 +468,14 @@ bool SplitTool::SplitToolPrivate::SplitTagImpl(BamAlignment& al) { if ( writerIter == outputFiles.end() ) { // open new BamWriter - writer = new BamWriter; outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam"; - writer->Open(outputFilenameStream.str(), m_header, m_references); - + writer = new BamWriter; + if ( !writer->Open(outputFilenameStream.str(), m_header, m_references) ) { + cerr << "bamtool split ERROR: could not open " << outputFilenameStream.str() + << " for writing." 
<< endl; + return false; + } + // store in map outputFiles.insert( make_pair(currentValue, writer) ); diff --git a/src/toolkit/bamtools_split.h b/src/toolkit/bamtools_split.h index dd825f1..3cb85dd 100644 --- a/src/toolkit/bamtools_split.h +++ b/src/toolkit/bamtools_split.h @@ -3,9 +3,10 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 18 September 2010 (DB) +// Last modified: 21 March 2011 (DB) // --------------------------------------------------------------------------- -// +// Splits a BAM file on user-specified property, creating a new BAM output +// file for each value found. // *************************************************************************** #ifndef BAMTOOLS_SPLIT_H @@ -35,4 +36,4 @@ class SplitTool : public AbstractTool { } // namespace BamTools -#endif // BAMTOOLS_SPLIT_H \ No newline at end of file +#endif // BAMTOOLS_SPLIT_H diff --git a/src/toolkit/bamtools_stats.cpp b/src/toolkit/bamtools_stats.cpp index 2764499..42e7cbc 100644 --- a/src/toolkit/bamtools_stats.cpp +++ b/src/toolkit/bamtools_stats.cpp @@ -3,11 +3,17 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 12 October 2010 +// Last modified: 21 March 2011 // --------------------------------------------------------------------------- // Prints general alignment statistics for BAM file(s). 
// *************************************************************************** +#include "bamtools_stats.h" + +#include +#include +using namespace BamTools; + #include #include #include @@ -15,11 +21,7 @@ #include #include #include -#include "bamtools_stats.h" -#include "bamtools_options.h" -#include "BamMultiReader.h" using namespace std; -using namespace BamTools; // --------------------------------------------- // StatsSettings implementation @@ -98,10 +100,11 @@ StatsTool::StatsToolPrivate::StatsToolPrivate(StatsTool::StatsSettings* _setting StatsTool::StatsToolPrivate::~StatsToolPrivate(void) { } -// median is of type double because in the case of even number of data elements, we need to return the average of middle 2 elements +// median is of type double because in the case of even number of data elements, +// we need to return the average of middle 2 elements bool StatsTool::StatsToolPrivate::CalculateMedian(vector& data, double& median) { - // check that data exists + // skip if data empty if ( data.empty() ) return false; // find middle element @@ -214,24 +217,22 @@ void StatsTool::StatsToolPrivate::ProcessAlignment(const BamAlignment& al) { bool StatsTool::StatsToolPrivate::Run() { - // opens the BAM files without checking for indexes + // open the BAM files BamMultiReader reader; - if ( !reader.Open(settings->InputFiles, false, true) ) { - cerr << "Could not open input BAM file(s)... quitting." << endl; + if ( !reader.Open(settings->InputFiles) ) { + cerr << "bamtools stats ERROR: could not open input BAM file(s)... Aborting." 
<< endl; reader.Close(); return false; } - // plow through file, keeping track of stats + // plow through alignments, keeping track of stats BamAlignment al; while ( reader.GetNextAlignmentCore(al) ) ProcessAlignment(al); + reader.Close(); - // print stats + // print stats & exit PrintStats(); - - // clean and exit - reader.Close(); return true; } @@ -271,7 +272,7 @@ int StatsTool::Run(int argc, char* argv[]) { // parse command line arguments Options::Parse(argc, argv, 1); - + // set to default input if none provided if ( !m_settings->HasInput ) m_settings->InputFiles.push_back(Options::StandardIn()); diff --git a/src/toolkit/bamtools_tool.h b/src/toolkit/bamtools_tool.h index bcd5cfe..816c8c5 100644 --- a/src/toolkit/bamtools_tool.h +++ b/src/toolkit/bamtools_tool.h @@ -25,8 +25,12 @@ class AbstractTool { public: virtual int Help(void) =0; virtual int Run(int argc, char* argv[]) =0; + + // derived classes should also provide: + // static std::string Description(void); + // static std::string Name(void); }; } // namespace BamTools -#endif // BAMTOOLS_ABSTRACTTOOL_H \ No newline at end of file +#endif // BAMTOOLS_ABSTRACTTOOL_H diff --git a/src/utils/CMakeLists.txt b/src/utils/CMakeLists.txt index b5ac697..d1e26ff 100644 --- a/src/utils/CMakeLists.txt +++ b/src/utils/CMakeLists.txt @@ -24,6 +24,6 @@ target_link_libraries ( BamTools-utils BamTools ) # set BamTools library properties set_target_properties( BamTools-utils PROPERTIES - SOVERSION 0.9.0 + SOVERSION 1.0.0 OUTPUT_NAME bamtools-utils ) diff --git a/src/utils/bamtools_utilities.cpp b/src/utils/bamtools_utilities.cpp index 6f3c0cb..bb65c7b 100644 --- a/src/utils/bamtools_utilities.cpp +++ b/src/utils/bamtools_utilities.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. 
// --------------------------------------------------------------------------- -// Last modified: 19 November 2010 +// Last modified: 26 January 2011 // --------------------------------------------------------------------------- // Provides general utilities used by BamTools sub-tools. // *************************************************************************** @@ -21,20 +21,28 @@ using namespace std; namespace BamTools { -const char REVCOMP_LOOKUP[] = {'T', 0, 'G', 'H', 0, 0, 'C', 'D', 0, 0, 0, 0, 'K', 'N', 0, 0, 0, 'Y', 'W', 'A', 'A', 'B', 'S', 'X', 'R', 0 }; +const char REVCOMP_LOOKUP[] = {'T', 0, 'G', 'H', + 0, 0, 'C', 'D', + 0, 0, 0, 0, + 'K', 'N', 0, 0, + 0, 'Y', 'W', 'A', + 'A', 'B', 'S', 'X', + 'R', 0 }; } // namespace BamTools // check if a file exists -bool Utilities::FileExists(const std::string& filename) { +bool Utilities::FileExists(const string& filename) { ifstream f(filename.c_str(), ifstream::in); return !f.fail(); } // Parses a region string, does validation (valid ID's, positions), stores in Region struct // Returns success (true/false) -bool Utilities::ParseRegionString(const std::string& regionString, const BamReader& reader, BamRegion& region) { - +bool Utilities::ParseRegionString(const string& regionString, + const BamReader& reader, + BamRegion& region) +{ // ------------------------------- // parse region string @@ -140,8 +148,10 @@ bool Utilities::ParseRegionString(const std::string& regionString, const BamRead } // Same as ParseRegionString() above, but accepts a BamMultiReader -bool Utilities::ParseRegionString(const std::string& regionString, const BamMultiReader& reader, BamRegion& region) { - +bool Utilities::ParseRegionString(const string& regionString, + const BamMultiReader& reader, + BamRegion& region) +{ // ------------------------------- // parse region string @@ -243,7 +253,6 @@ bool Utilities::ParseRegionString(const std::string& regionString, const BamMult region.LeftPosition = startPos; region.RightRefID = stopRefID;; 
region.RightPosition = stopPos; - return true; } @@ -251,9 +260,9 @@ void Utilities::Reverse(string& sequence) { reverse(sequence.begin(), sequence.end()); } -void Utilities::ReverseComplement(std::string& sequence) { +void Utilities::ReverseComplement(string& sequence) { - // do complement + // do complement, in-place size_t seqLength = sequence.length(); for ( size_t i = 0; i < seqLength; ++i ) sequence.replace(i, 1, 1, REVCOMP_LOOKUP[(int)sequence.at(i) - 65]); diff --git a/src/utils/bamtools_utilities.h b/src/utils/bamtools_utilities.h index 73604fb..3701690 100644 --- a/src/utils/bamtools_utilities.h +++ b/src/utils/bamtools_utilities.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 19 November 2010 +// Last modified: 26 January 2011 // --------------------------------------------------------------------------- // Provides general utilities used by BamTools sub-tools. // *************************************************************************** @@ -33,9 +33,13 @@ class UTILS_EXPORT Utilities { // Parses a region string, uses reader to do validation (valid ID's, positions), stores in Region struct // Returns success (true/false) - static bool ParseRegionString(const std::string& regionString, const BamReader& reader, BamRegion& region); + static bool ParseRegionString(const std::string& regionString, + const BamReader& reader, + BamRegion& region); // Same as above, but accepts a BamMultiReader - static bool ParseRegionString(const std::string& regionString, const BamMultiReader& reader, BamRegion& region); + static bool ParseRegionString(const std::string& regionString, + const BamMultiReader& reader, + BamRegion& region); // sequence utilities static void Reverse(std::string& sequence); -- 2.39.2