-// BamReader.h\r
-\r
-/* The MIT License\r
-\r
- Copyright (c) 2008 Genome Research Ltd (GRL).\r
-\r
- Permission is hereby granted, free of charge, to any person obtaining\r
- a copy of this software and associated documentation files (the\r
- "Software"), to deal in the Software without restriction, including\r
- without limitation the rights to use, copy, modify, merge, publish,\r
- distribute, sublicense, and/or sell copies of the Software, and to\r
- permit persons to whom the Software is furnished to do so, subject to\r
- the following conditions:\r
-\r
- The above copyright notice and this permission notice shall be\r
- included in all copies or substantial portions of the Software.\r
-\r
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,\r
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\r
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\r
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS\r
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN\r
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN\r
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\r
- SOFTWARE.\r
-*/\r
-\r
-/*\r
- Implementation of BAM-parsing was translated to C++ directly from Heng Li's SAMtools package \r
- (thus the carryover of above MIT license)\r
- Contact: Derek Barnett <barnetde@bc.edu>\r
-*/\r
-\r
-// Derek Barnett\r
-// Marth Lab, Boston College\r
-// Last modified: 23 April 2009\r
+// ***************************************************************************\r
+// BamReader.h (c) 2009 Derek Barnett, Michael Strömberg\r
+// Marth Lab, Department of Biology, Boston College\r
+// All rights reserved.\r
+// ---------------------------------------------------------------------------\r
+// Last modified: 8 June 2010 (DB)\r
+// ---------------------------------------------------------------------------\r
+// Uses BGZF routines that were adapted from the bgzf.c code developed at the\r
+// Broad Institute.\r
+// ---------------------------------------------------------------------------\r
+// Provides the basic functionality for reading BAM files\r
+// ***************************************************************************\r
\r
#ifndef BAMREADER_H\r
#define BAMREADER_H\r
\r
-// custom includes\r
-#include "BamAlignment.h"\r
-#include "STLUtilities.h"\r
-\r
-// C library includes\r
-#include <assert.h>\r
-#include <stdio.h>\r
-#include <stdlib.h>\r
-#include <string.h>\r
-\r
-#ifdef WIN32\r
-typedef char int8_t;\r
-typedef unsigned char uint8_t;\r
-typedef short int16_t;\r
-typedef unsigned short uint16_t;\r
-typedef int int32_t;\r
-typedef unsigned int uint32_t;\r
-typedef long long int64_t;\r
-typedef unsigned long long uint64_t;\r
-#else\r
-#include <stdint.h>\r
-#endif\r
-\r
-// BGZF library includes/defines\r
-#include "bgzf.h"\r
-typedef BGZF* BamFile;\r
-#define bam_open(f_name, mode) bgzf_open(f_name, mode)\r
-#define bam_close(f_ptr) bgzf_close(f_ptr)\r
-#define bam_read(f_ptr, buf, size) bgzf_read(f_ptr, buf, size)\r
-#define bam_write(f_ptr, buf, size) bgzf_write(f_ptr, buf, size)\r
-#define bam_tell(f_ptr) bgzf_tell(f_ptr)\r
-#define bam_seek(f_ptr, pos, dir) bgzf_seek(f_ptr, pos, dir)\r
-\r
-// size of alignment data block in BAM file (bytes)\r
-#define BAM_CORE_SIZE 32\r
-\r
-// BAM indexing constants\r
-#define MAX_BIN 37450 // =(8^6-1)/7+1\r
-#define BAM_MIN_CHUNK_GAP 32768 \r
-#define BAM_LIDX_SHIFT 14\r
-\r
-// CIGAR-retrieval mask/shift constants\r
-#define BAM_CIGAR_SHIFT 4\r
-#define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1)\r
-\r
-// CIGAR-operation types\r
-#define BAM_CMATCH 0\r
-#define BAM_CINS 1\r
-#define BAM_CDEL 2\r
-#define BAM_CREF_SKIP 3\r
-#define BAM_CSOFT_CLIP 4\r
-#define BAM_CHARD_CLIP 5\r
-#define BAM_CPAD 6\r
-\r
-// --------------------------- //\r
-// Bam header info\r
-// --------------------------- //\r
-\r
-// --------------------------- //\r
-// BamIndex-related typedefs\r
-// --------------------------- //\r
-\r
-// offset for linear indexing\r
-typedef vector<uint64_t> LinearOffsetVector;\r
-\r
-// chunk boundaries\r
-typedef pair<uint64_t, uint64_t> ChunkPair;\r
-// list of chunks in a BAM bin\r
-typedef vector<ChunkPair> ChunkVector;\r
-\r
-// BAM bins for a reference sequence\r
-// replaces khash - uint32_t is key, ChunkVector is value\r
-typedef pair<uint32_t, ChunkVector*> BamBin;\r
-typedef vector<BamBin> BinVector;\r
-\r
-// each reference sequence has a BinVector and LinearOffsetVector\r
-typedef pair<BinVector*, LinearOffsetVector*> RefIndex;\r
-\r
-// full BamIndex defined as: \r
-typedef vector<RefIndex*> BamIndex;\r
-\r
-// ---------------------------------------------------------------------------//\r
+// C++ includes\r
+#include <string>\r
+\r
+// BamTools includes\r
+#include "BamAux.h"\r
+\r
+namespace BamTools {\r
\r
class BamReader {\r
- \r
- public:\r
- // constructors\r
- BamReader(const char* fileName = NULL, const char* indexFilename = NULL);\r
-\r
- public:\r
- // destructor\r
- ~BamReader(void);\r
- \r
- // BAM interface methods\r
- public:\r
-\r
- // ----------------------- //\r
- // File manipulation\r
- // ----------------------- //\r
- \r
- // open BAM file (automatically opens index if provided)\r
- bool Open(void);\r
- \r
- // open BAM index (allows index to be opened separately - i.e. sometime after the BAM file is opened)\r
- bool OpenIndex(void);\r
- \r
- // close BAM file\r
- bool Close(void);\r
- \r
- // get BAM filename\r
- const char* Filename(void) const;\r
- \r
- // set BAM filename\r
- void SetFilename(const char*);\r
-\r
- // get BAM Index filename\r
- const char* IndexFilename(void) const;\r
- \r
- // set BAM Index filename\r
- void SetIndexFilename(const char*);\r
-\r
- // ----------------------- //\r
- // Access BAM header\r
- // ----------------------- //\r
- \r
- // return full header text\r
- const string GetHeaderText(void) const;\r
- \r
- // --------------------------------- //\r
- // Access reference sequence info\r
- // --------------------------------- //\r
- \r
- // return number of reference sequences in BAM file\r
- const int GetReferenceCount(void) const;\r
-\r
- // return vector of RefData entries\r
- const RefVector GetReferenceData(void) const;\r
-\r
- // get refID from reference name\r
- const int GetRefID(string refName) const; \r
- \r
- // ----------------------------------------- //\r
- // File position moving\r
- // ----------------------------------------- //\r
-\r
- // jumps to 'left' position on refID\r
- // actually jumps before position, so reads that overlap 'left' are included as well\r
- // 'left' defaults to reference begin if not specified\r
- bool Jump(int refID, unsigned int left = 0);\r
-\r
- // Jump to beginning of BAM file, clears any region previously set by Jump()\r
- bool Rewind(void);\r
- \r
- // ------------------------------ //\r
- // Access alignments\r
- // ------------------------------ //\r
- \r
- // get next alignment\r
- bool GetNextAlignment(BamAlignment& read);\r
-\r
- // allow user to specifiy whether 'AlignedBases' string is calculated when alignment is loaded\r
- void SetCalculateAlignedBases(bool);\r
-\r
- // internal utility methods\r
- private:\r
- int BinsFromRegion(int, unsigned int, uint16_t[MAX_BIN]);\r
- uint32_t CalculateAlignmentEnd(const unsigned int&, const vector<CigarOp>&);\r
- int64_t GetOffset(int, unsigned int);\r
- bool IsOverlap(BamAlignment&);\r
- bool LoadHeader(void);\r
- bool LoadIndex(void);\r
- bool LoadNextAlignment(BamAlignment&);\r
-\r
- private: \r
- // main BAM reader components\r
- char* m_filename;\r
- char* m_indexFilename;\r
- BamFile m_file;\r
- BamIndex* m_index;\r
- RefVector m_references;\r
- string m_headerText;\r
-\r
- // state flags\r
- bool m_isOpen; // BAM file is open for processing\r
- bool m_isIndexLoaded; // BAM Index data is loaded and available for processing\r
- bool m_isRegionSpecified; // a region has been specified - specifically, a user has called Jump()\r
- bool m_isCalculateAlignedBases; // build 'AlignedBases' string when getting an alignment, otherwise skip (default = true)\r
-\r
- // region values\r
- int m_currentRefID;\r
- unsigned int m_currentLeft;\r
-\r
- // file offset of 1st read in BAM file\r
- int64_t m_alignmentsBeginOffset;\r
-\r
- private:\r
- // BAM character constants\r
- static const char* DNA_LOOKUP;\r
- static const char* CIGAR_LOOKUP;\r
+ // BamReader: public interface for opening a BAM file, iterating over its alignments, and querying header/reference data\r
+ // constructor / destructor\r
+ public:\r
+ BamReader(void);\r
+ ~BamReader(void);\r
+\r
+ // public interface\r
+ public:\r
+\r
+ // ----------------------\r
+ // BAM file operations\r
+ // ----------------------\r
+\r
+ // closes the BAM file (returns nothing, so no status is reported to the caller)\r
+ void Close(void);\r
+ // performs random-access jump to 'position' on reference 'refID' ('position' defaults to 0); bool result presumably signals success/fail - confirm in implementation\r
+ bool Jump(int refID, int position = 0);\r
+ // opens BAM file (and optional BAM index file, if provided); 'indexFilename' defaults to "" and no status is returned\r
+ void Open(const std::string& filename, const std::string& indexFilename = "");\r
+ // moves the file pointer back to the beginning of the alignments; bool result presumably signals success/fail - confirm in implementation\r
+ bool Rewind(void);\r
+\r
+ // ----------------------\r
+ // access alignment data\r
+ // ----------------------\r
+\r
+ // retrieves next available alignment into 'bAlignment' (returns success/fail)\r
+ bool GetNextAlignment(BamAlignment& bAlignment);\r
+ // retrieves next available alignment core data (returns success/fail)\r
+ // ** DOES NOT parse any character data (bases, qualities, tag data)\r
+ // these can be accessed, if necessary, from the supportData \r
+ // useful for operations requiring ONLY positional or other alignment-related information\r
+ bool GetNextAlignmentCore(BamAlignment& bAlignment, BamAlignmentSupportData& supportData);\r
+\r
+ // ----------------------\r
+ // access auxiliary data\r
+ // ----------------------\r
+\r
+ // returns SAM header text (returned by value)\r
+ const std::string GetHeaderText(void) const;\r
+ // returns number of reference sequences\r
+ int GetReferenceCount(void) const;\r
+ // returns vector of reference objects (a RefVector, returned by value)\r
+ const BamTools::RefVector GetReferenceData(void) const;\r
+ // returns reference id (used for BamReader::Jump()) for the given reference name\r
+ int GetReferenceID(const std::string& refName) const;\r
+ // returns the name of the file associated with this BamReader (returned by value)\r
+ const std::string GetFilename(void) const;\r
+\r
+ // ----------------------\r
+ // BAM index operations\r
+ // ----------------------\r
+\r
+ // creates index for BAM file, saves to file (default = bamFilename + ".bai"); bool result presumably signals success/fail - confirm in implementation\r
+ bool CreateIndex(void);\r
+ // private implementation: BamReaderPrivate is only forward-declared here, and 'd' is the sole data member of this class\r
+ // NOTE(review): no copy constructor/assignment is declared, so a copied BamReader would share the same 'd' pointer - confirm copying is never intended\r
+ private:\r
+ struct BamReaderPrivate;\r
+ BamReaderPrivate* d;\r
};\r
\r
-#endif /* BAMREADER_H */\r
+} // namespace BamTools\r
+\r
+#endif // BAMREADER_H\r