--- /dev/null
+// BamAlignment.h\r
+\r
+// Derek Barnett\r
+// Marth Lab, Boston College\r
+// Last modified: 20 March 2009\r
+\r
+#ifndef BAMALIGNMENT_H\r
+#define BAMALIGNMENT_H\r
+\r
+#include <stdint.h>\r
+\r
+// C++ includes\r
+#include <string>\r
+using std::string;\r
+\r
+#include <vector>\r
+using std::vector;\r
+\r
+struct CigarOp {\r
+ uint32_t Length;\r
+ char Type;\r
+};\r
+\r
+struct RefData {\r
+ string RefName;\r
+ unsigned int RefLength;\r
+ bool RefHasAlignments;\r
+\r
+ // constructor\r
+ RefData(void)\r
+ : RefLength(0)\r
+ , RefHasAlignments(false)\r
+ { }\r
+};\r
+\r
+typedef vector<RefData> RefVector;\r
+\r
+struct BamAlignment {\r
+\r
+ // queries against alignment flag - see below for further detail\r
+ public:\r
+ bool IsPaired(void) const { return ( (AlignmentFlag & PAIRED) != 0 ); }\r
+ bool IsProperPair(void) const { return ( (AlignmentFlag & PROPER_PAIR) != 0 ); }\r
+ bool IsMapped(void) const { return ( (AlignmentFlag & UNMAPPED) == 0 ); }\r
+ bool IsMateMapped(void) const { return ( (AlignmentFlag & MATE_UNMAPPED) == 0 ); }\r
+ bool IsReverseStrand(void) const { return ( (AlignmentFlag & REVERSE) != 0 ); }\r
+ bool IsMateReverseStrand(void) const { return ( (AlignmentFlag & MATE_REVERSE) != 0 ); }\r
+ bool IsFirstMate(void) const { return ( (AlignmentFlag & READ_1) != 0 ); }\r
+ bool IsSecondMate(void) const { return ( (AlignmentFlag & READ_2) != 0 ); }\r
+ bool IsPrimaryAlignment(void) const { return ( (AlignmentFlag & SECONDARY) == 0 ); }\r
+ bool IsFailedQC(void) const { return ( (AlignmentFlag & QC_FAILED) != 0 ); }\r
+ bool IsDuplicate(void) const { return ( (AlignmentFlag & DUPLICATE) != 0 ); }\r
+\r
+ // data members\r
+ public:\r
+ string Name; // read name\r
+ unsigned int Length; // query length\r
+ string QueryBases; // original sequence ( produced from machine )\r
+ string AlignedBases; // aligned sequence ( with indels ) \r
+ string Qualities; // FASTQ qualities ( still in ASCII characters )\r
+ vector<string> Tags;\r
+ unsigned int RefID; // ID for reference sequence\r
+ unsigned int Position; // position on reference sequence where alignment starts\r
+ unsigned int Bin; // bin in BAM file where this alignment resides\r
+ unsigned int MapQuality; // mapping quality \r
+ unsigned int AlignmentFlag; // see above for available queries\r
+ vector<CigarOp> CigarData; // vector of CIGAR operations (length & type) )\r
+ unsigned int MateRefID; // ID for reference sequence that mate was aligned to\r
+ unsigned int MatePosition; // position that mate was aligned to\r
+ unsigned int InsertSize; // mate pair insert size\r
+ \r
+\r
+ // alignment flag query constants\r
+ private:\r
+ enum { PAIRED = 1, // Alignment comes from paired-end data\r
+ PROPER_PAIR = 2, // Alignment passed paired-end resolution\r
+ UNMAPPED = 4, // Read is unmapped\r
+ MATE_UNMAPPED = 8, // Mate is unmapped\r
+ REVERSE = 16, // Read is on reverse strand\r
+ MATE_REVERSE = 32, // Mate is on reverse strand\r
+ READ_1 = 64, // This alignment is mate 1 of pair\r
+ READ_2 = 128, // This alignment is mate 2 of pair\r
+ SECONDARY = 256, // This alignment is not the primary (best) alignment for read\r
+ QC_FAILED = 512, // Read did not pass prior quality control steps\r
+ DUPLICATE = 1024 // Read is PCR duplicate\r
+ };\r
+};\r
+\r
+// commonly used vector in this library\r
+typedef vector< BamAlignment > BamAlignmentVector;\r
+\r
+#endif /* BAMALIGNMENT_H */\r
--- /dev/null
+#include <iostream>
+#include "BamReader.h"
+#include "BamWriter.h"
+
+using namespace std;
+
+int main(int argc, char* argv[]) {
+
+ if(argc != 3) {
+ cout << "USAGE: " << argv[0] << " <input BAM file> <output BAM file>" << endl;
+ exit(1);
+ }
+
+ // localize our arguments
+ const char* inputFilename = argv[1];
+ const char* outputFilename = argv[2];
+
+ // open our BAM reader
+ BamReader reader;
+ reader.SetFilename(inputFilename);
+
+ if(!reader.Open()) {
+ cout << "ERROR: Unable to open the BAM file (" << inputFilename << ")." << endl;
+ exit(1);
+ }
+
+ // retrieve the SAM header text
+ string samHeader = reader.GetHeaderText();
+
+ // retrieve the reference sequence vector
+ RefVector referenceSequences = reader.GetReferenceData();
+
+ // open the BAM writer
+ BamWriter writer;
+ writer.Open(outputFilename, samHeader, referenceSequences);
+
+ // copy all of the reads from the input file to the output file
+ BamAlignment al;
+ while(reader.GetNextAlignment(al)) writer.SaveAlignment(al);
+
+ // close our files
+ reader.Close();
+ writer.Close();
+
+ return 0;
+}
\ No newline at end of file
--- /dev/null
+// BamReader.cpp\r
+\r
+// Derek Barnett\r
+// Marth Lab, Boston College\r
+// Last modified: 6 April 2009\r
+\r
+#include "BamReader.h"\r
+#include <iostream>\r
+using std::cerr;\r
+using std::endl;\r
+\r
+// static character constants\r
+const char* BamReader::DNA_LOOKUP = "=ACMGRSVTWYHKDBN";\r
+const char* BamReader::CIGAR_LOOKUP = "MIDNSHP";\r
+\r
+BamReader::BamReader(const char* filename, const char* indexFilename) \r
+ : m_filename( (char*)filename )\r
+ , m_indexFilename( (char*)indexFilename )\r
+ , m_file(0)\r
+ , m_index(NULL)\r
+ , m_headerText("")\r
+ , m_isOpen(false)\r
+ , m_isIndexLoaded(false)\r
+ , m_isRegionSpecified(false)\r
+ , m_isCalculateAlignedBases(true)\r
+ , m_currentRefID(0)\r
+ , m_currentLeft(0)\r
+ , m_alignmentsBeginOffset(0)\r
+{\r
+ Open();\r
+}\r
+\r
+BamReader::~BamReader(void) {\r
+\r
+ // close file\r
+ if ( m_isOpen ) { Close(); }\r
+}\r
+\r
+// open BAM file\r
+bool BamReader::Open(void) {\r
+\r
+ if (!m_isOpen && m_filename != NULL ) {\r
+\r
+ // open file\r
+ m_file = bam_open(m_filename, "r"); \r
+ \r
+ // get header info && index info\r
+ if ( (m_file != NULL) && LoadHeader() ) {\r
+\r
+ // save file offset where alignments start\r
+ m_alignmentsBeginOffset = bam_tell(m_file);\r
+ \r
+ // set open flag\r
+ m_isOpen = true;\r
+ }\r
+\r
+ // try to open (and load) index data, if index file given\r
+ if ( m_indexFilename != NULL ) {\r
+ OpenIndex();\r
+ }\r
+ }\r
+\r
+ return m_isOpen;\r
+}\r
+\r
+bool BamReader::OpenIndex(void) {\r
+\r
+ if ( m_indexFilename && !m_isIndexLoaded ) {\r
+ m_isIndexLoaded = LoadIndex();\r
+ }\r
+ return m_isIndexLoaded;\r
+}\r
+\r
+// close BAM file\r
+bool BamReader::Close(void) {\r
+ \r
+ if (m_isOpen) {\r
+\r
+ // close file\r
+ int ret = bam_close(m_file);\r
+ \r
+ // delete index info\r
+ if ( m_index != NULL) { delete m_index; }\r
+\r
+ // clear open flag\r
+ m_isOpen = false;\r
+\r
+ // clear index flag\r
+ m_isIndexLoaded = false;\r
+\r
+ // clear region flag\r
+ m_isRegionSpecified = false;\r
+\r
+ // return success/fail of bam_close\r
+ return (ret == 0);\r
+ } \r
+\r
+ return true;\r
+}\r
+\r
+// get BAM filename\r
+const char* BamReader::Filename(void) const { \r
+ return (const char*)m_filename; \r
+}\r
+\r
+// set BAM filename\r
+void BamReader::SetFilename(const char* filename) {\r
+ m_filename = (char*)filename;\r
+}\r
+\r
+// get BAM Index filename\r
+const char* BamReader::IndexFilename(void) const { \r
+ return (const char*)m_indexFilename; \r
+}\r
+\r
+// set BAM Index filename\r
+void BamReader::SetIndexFilename(const char* indexFilename) {\r
+ m_indexFilename = (char*)indexFilename;\r
+}\r
+\r
+// return full header text\r
+const string BamReader::GetHeaderText(void) const { \r
+ return m_headerText; \r
+}\r
+\r
+// return number of reference sequences in BAM file\r
+const int BamReader::GetReferenceCount(void) const { \r
+ return m_references.size();\r
+}\r
+\r
+// get RefID from reference name\r
+const int BamReader::GetRefID(string refName) const { \r
+ \r
+ vector<string> refNames;\r
+ RefVector::const_iterator refIter = m_references.begin();\r
+ RefVector::const_iterator refEnd = m_references.end();\r
+ for ( ; refIter != refEnd; ++refIter) {\r
+ refNames.push_back( (*refIter).RefName );\r
+ }\r
+\r
+ // return 'index-of' refName (if not found, returns refNames.size())\r
+ return Index( refNames.begin(), refNames.end(), refName );\r
+}\r
+\r
+const RefVector BamReader::GetReferenceData(void) const {\r
+ return m_references;\r
+}\r
+\r
+bool BamReader::Jump(int refID, unsigned int left) {\r
+\r
+ // if index available, and region is valid\r
+ if ( m_isIndexLoaded && m_references.at(refID).RefHasAlignments && (left <= m_references.at(refID).RefLength) ) { \r
+ m_currentRefID = refID;\r
+ m_currentLeft = left;\r
+ m_isRegionSpecified = true;\r
+ return ( bam_seek(m_file, GetOffset(m_currentRefID, m_currentLeft), SEEK_SET) == 0 );\r
+ }\r
+ return false;\r
+}\r
+\r
+bool BamReader::Rewind(void) {\r
+\r
+ int refID = 0;\r
+ int refCount = m_references.size();\r
+ for ( ; refID < refCount; ++refID ) {\r
+ if ( m_references.at(refID).RefHasAlignments ) { break; } \r
+ }\r
+\r
+ m_currentRefID = refID;\r
+ m_currentLeft = 0;\r
+ m_isRegionSpecified = false;\r
+\r
+ return ( bam_seek(m_file, m_alignmentsBeginOffset, SEEK_SET) == 0 );\r
+} \r
+\r
+// get next alignment from specified region\r
+bool BamReader::GetNextAlignment(BamAlignment& bAlignment) {\r
+\r
+ // try to load 'next' read\r
+ if ( LoadNextAlignment(bAlignment) ) {\r
+\r
+ // if specified region, check for region overlap\r
+ if ( m_isRegionSpecified ) {\r
+\r
+ // if overlap, return true\r
+ if ( IsOverlap(bAlignment) ) { return true; }\r
+ // if not, try the next alignment\r
+ else { return GetNextAlignment(bAlignment); }\r
+ } \r
+\r
+ // not using region, valid read detected, return success\r
+ else { return true; }\r
+ }\r
+\r
+ // no valid alignment to load\r
+ return false;\r
+}\r
+\r
+void BamReader::SetCalculateAlignedBases(bool flag) {\r
+ m_isCalculateAlignedBases = flag;\r
+}\r
+\r
+int BamReader::BinsFromRegion(int refID, unsigned int left, uint16_t list[MAX_BIN]) {\r
+\r
+ // get region boundaries\r
+ uint32_t begin = left;\r
+ uint32_t end = m_references.at(refID).RefLength - 1;\r
+\r
+ // initialize list, bin '0' always a valid bin\r
+ int i = 0;\r
+ list[i++] = 0;\r
+\r
+ // get rest of bins that contain this region\r
+ unsigned int k;\r
+ for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { list[i++] = k; }\r
+ for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { list[i++] = k; }\r
+ for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { list[i++] = k; }\r
+ for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { list[i++] = k; }\r
+ for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { list[i++] = k; }\r
+ \r
+ // return number of bins stored\r
+ return i;\r
+}\r
+\r
+uint32_t BamReader::CalculateAlignmentEnd(const unsigned int& position, const vector<CigarOp>& cigarData) {\r
+\r
+ // initialize alignment end to starting position\r
+ uint32_t alignEnd = position;\r
+\r
+ // iterate over cigar operations\r
+ vector<CigarOp>::const_iterator cigarIter = cigarData.begin();\r
+ vector<CigarOp>::const_iterator cigarEnd = cigarData.end();\r
+ for ( ; cigarIter != cigarEnd; ++cigarIter) {\r
+ if ( (*cigarIter).Type == 'M' || (*cigarIter).Type == 'D' || (*cigarIter).Type == 'N') {\r
+ alignEnd += (*cigarIter).Length;\r
+ }\r
+ }\r
+ return alignEnd;\r
+}\r
+\r
+uint64_t BamReader::GetOffset(int refID, unsigned int left) {\r
+\r
+ // make space for bins\r
+ uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2); \r
+ \r
+ // returns number of bins overlapping (left, right)\r
+ // stores indices of those bins in 'bins'\r
+ int numBins = BinsFromRegion(refID, left, bins); \r
+\r
+ // access index data for refID\r
+ RefIndex* refIndex = m_index->at(refID);\r
+\r
+ // get list of BAM bins for this reference sequence\r
+ BinVector* refBins = refIndex->first;\r
+\r
+ sort( refBins->begin(), refBins->end(), LookupKeyCompare<uint32_t, ChunkVector*>() );\r
+\r
+ // declare ChunkVector\r
+ ChunkVector regionChunks;\r
+\r
+ // declaure LinearOffsetVector\r
+ LinearOffsetVector* linearOffsets = refIndex->second;\r
+\r
+ // some sort of linear offset vs bin index voodoo... not completely sure what's going here\r
+ uint64_t minOffset = ((left>>BAM_LIDX_SHIFT) >= linearOffsets->size()) ? 0 : linearOffsets->at(left>>BAM_LIDX_SHIFT);\r
+\r
+ BinVector::iterator binBegin = refBins->begin();\r
+ BinVector::iterator binEnd = refBins->end();\r
+\r
+ // iterate over bins overlapping region, count chunks\r
+ for (int i = 0; i < numBins; ++i) {\r
+ \r
+ // look for bin with ID=bin[i]\r
+ BinVector::iterator binIter = binBegin;\r
+\r
+ for ( ; binIter != binEnd; ++binIter ) {\r
+ \r
+ // if found, increment n_off by number of chunks for each bin\r
+ if ( (*binIter).first == (uint32_t)bins[i] ) { \r
+ \r
+ // iterate over chunks in that bin\r
+ ChunkVector* binChunks = (*binIter).second;\r
+ \r
+ ChunkVector::iterator chunkIter = binChunks->begin();\r
+ ChunkVector::iterator chunkEnd = binChunks->end();\r
+ for ( ; chunkIter != chunkEnd; ++chunkIter) {\r
+ \r
+ // if right bound of pair is greater than min_off (linear offset value), store pair\r
+ if ( (*chunkIter).second > minOffset) { \r
+ regionChunks.push_back( (*chunkIter) ); \r
+ }\r
+ }\r
+ }\r
+ }\r
+ }\r
+\r
+ // clean up temp array\r
+ free(bins);\r
+\r
+ // there should be at least 1\r
+ assert(regionChunks.size() > 0);\r
+\r
+ // sort chunks by start position\r
+ sort ( regionChunks.begin(), regionChunks.end(), LookupKeyCompare<uint64_t, uint64_t>() );\r
+\r
+ // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing\r
+ int numOffsets = regionChunks.size(); \r
+ for (int i = 1; i < numOffsets; ++i) {\r
+ if ( regionChunks.at(i-1).second >= regionChunks.at(i).first ) {\r
+ regionChunks.at(i-1).second = regionChunks.at(i).first;\r
+ }\r
+ }\r
+ \r
+ // merge adjacent chunks\r
+ int l = 0;\r
+ for (int i = 1; i < numOffsets; ++i) {\r
+ // if adjacent, expand boundaries of (merged) chunk\r
+ if ( (regionChunks.at(l).second>>16) == (regionChunks.at(i).first>>16) ) {\r
+ regionChunks.at(l).second = regionChunks.at(i).second;\r
+ }\r
+ // else, move on to next (merged) chunk index\r
+ else { regionChunks.at(++l) = regionChunks.at(i); }\r
+ }\r
+\r
+ // return beginning file offset of first chunk for region\r
+ return regionChunks.at(0).first;\r
+}\r
+\r
+bool BamReader::IsOverlap(BamAlignment& bAlignment) {\r
+\r
+ // if on different reference sequence, quit\r
+ if ( bAlignment.RefID != (unsigned int)m_currentRefID ) { return false; }\r
+\r
+ // read starts after left boundary\r
+ if ( bAlignment.Position >= m_currentLeft) { return true; }\r
+\r
+ // get alignment end\r
+ uint32_t alignEnd = CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData);\r
+\r
+ // return whether alignment end overlaps left boundary\r
+ return ( alignEnd >= m_currentLeft );\r
+}\r
+\r
+bool BamReader::LoadHeader(void) {\r
+\r
+ // check to see if proper BAM header\r
+ char buf[4];\r
+ if (bam_read(m_file, buf, 4) != 4) { return false; }\r
+ if (strncmp(buf, "BAM\001", 4)) {\r
+ fprintf(stderr, "wrong header type!\n");\r
+ return false;\r
+ }\r
+ \r
+ // get BAM header text length\r
+ int32_t headerTextLength;\r
+ bam_read(m_file, &headerTextLength, 4);\r
+\r
+ // get BAM header text\r
+ char* headerText = (char*)calloc(headerTextLength + 1, 1);\r
+ bam_read(m_file, headerText, headerTextLength);\r
+ m_headerText = (string)((const char*)headerText);\r
+ \r
+ // clean up calloc-ed temp variable\r
+ free(headerText);\r
+\r
+ // get number of reference sequences\r
+ int32_t numberRefSeqs;\r
+ bam_read(m_file, &numberRefSeqs, 4);\r
+ if (numberRefSeqs == 0) { return false; }\r
+\r
+ m_references.reserve((int)numberRefSeqs);\r
+ \r
+ // reference variables\r
+ int32_t refNameLength;\r
+ char* refName;\r
+ uint32_t refLength;\r
+\r
+ // iterate over all references in header\r
+ for (int i = 0; i != numberRefSeqs; ++i) {\r
+\r
+ // get length of reference name\r
+ bam_read(m_file, &refNameLength, 4);\r
+ refName = (char*)calloc(refNameLength, 1);\r
+\r
+ // get reference name and reference sequence length\r
+ bam_read(m_file, refName, refNameLength);\r
+ bam_read(m_file, &refLength, 4);\r
+\r
+ // store data for reference\r
+ RefData aReference;\r
+ aReference.RefName = (string)((const char*)refName);\r
+ aReference.RefLength = refLength;\r
+ m_references.push_back(aReference);\r
+\r
+ // clean up calloc-ed temp variable\r
+ free(refName);\r
+ }\r
+ \r
+ return true;\r
+}\r
+\r
+bool BamReader::LoadIndex(void) {\r
+\r
+ // check to see if index file exists\r
+ FILE* indexFile;\r
+ if ( ( indexFile = fopen(m_indexFilename, "r") ) == 0 ) {\r
+ fprintf(stderr, "The alignment is not indexed. Please run SAMtools \'index\' command first.\n");\r
+ return false;\r
+ }\r
+\r
+ // see if index is valid BAM index\r
+ char magic[4];\r
+ fread(magic, 1, 4, indexFile);\r
+ if (strncmp(magic, "BAI\1", 4)) {\r
+ fprintf(stderr, "Problem with index - wrong \'magic\' number.\n");\r
+ fclose(indexFile);\r
+ return false;\r
+ }\r
+\r
+ // get number of reference sequences\r
+ uint32_t numRefSeqs;\r
+ fread(&numRefSeqs, 4, 1, indexFile);\r
+ \r
+ // intialize BamIndex data structure\r
+ m_index = new BamIndex;\r
+ m_index->reserve(numRefSeqs);\r
+\r
+ // iterate over reference sequences\r
+ for (unsigned int i = 0; i < numRefSeqs; ++i) {\r
+ \r
+ // get number of bins for this reference sequence\r
+ int32_t numBins;\r
+ fread(&numBins, 4, 1, indexFile);\r
+ \r
+ if (numBins > 0) { m_references.at(i).RefHasAlignments = true; }\r
+\r
+ // intialize BinVector\r
+ BinVector* bins = new BinVector;\r
+ bins->reserve(numBins);\r
+ \r
+ // iterate over bins for that reference sequence\r
+ for (int j = 0; j < numBins; ++j) {\r
+ \r
+ // get binID \r
+ uint32_t binID;\r
+ fread(&binID, 4, 1, indexFile);\r
+ \r
+ // get number of regionChunks in this bin\r
+ uint32_t numChunks;\r
+ fread(&numChunks, 4, 1, indexFile);\r
+ \r
+ // intialize ChunkVector\r
+ ChunkVector* regionChunks = new ChunkVector;\r
+ regionChunks->reserve(numChunks);\r
+ \r
+ // iterate over regionChunks in this bin\r
+ for (unsigned int k = 0; k < numChunks; ++k) {\r
+ \r
+ // get chunk boundaries (left, right) \r
+ uint64_t left;\r
+ uint64_t right;\r
+ fread(&left, 8, 1, indexFile);\r
+ fread(&right, 8, 1, indexFile);\r
+ \r
+ // save ChunkPair\r
+ regionChunks->push_back( ChunkPair(left, right) );\r
+ }\r
+ \r
+ // save binID, chunkVector for this bin\r
+ bins->push_back( BamBin(binID, regionChunks) );\r
+ }\r
+ \r
+ // load linear index for this reference sequence\r
+ \r
+ // get number of linear offsets\r
+ int32_t numLinearOffsets;\r
+ fread(&numLinearOffsets, 4, 1, indexFile);\r
+ \r
+ // intialize LinearOffsetVector\r
+ LinearOffsetVector* linearOffsets = new LinearOffsetVector;\r
+ linearOffsets->reserve(numLinearOffsets);\r
+ \r
+ // iterate over linear offsets for this reference sequeence\r
+ for (int j = 0; j < numLinearOffsets; ++j) {\r
+ // get a linear offset\r
+ uint64_t linearOffset;\r
+ fread(&linearOffset, 8, 1, indexFile);\r
+ // store linear offset\r
+ linearOffsets->push_back(linearOffset);\r
+ }\r
+ \r
+ // store index data for that reference sequence\r
+ m_index->push_back( new RefIndex(bins, linearOffsets) );\r
+ }\r
+ \r
+ // close index file (.bai) and return\r
+ fclose(indexFile);\r
+ return true;\r
+}\r
+\r
+bool BamReader::LoadNextAlignment(BamAlignment& bAlignment) {\r
+\r
+ // check valid alignment block header data\r
+ int32_t block_len;\r
+ int32_t ret;\r
+ uint32_t x[8];\r
+\r
+ int32_t bytesRead = 0;\r
+\r
+ // read in the 'block length' value, make sure it's not zero\r
+ if ( (ret = bam_read(m_file, &block_len, 4)) == 0 ) { return false; }\r
+ bytesRead += 4;\r
+\r
+ // read in core alignment data, make the right size of data was read \r
+ if ( bam_read(m_file, x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; }\r
+ bytesRead += BAM_CORE_SIZE;\r
+\r
+ // set BamAlignment 'core' data\r
+ bAlignment.RefID = x[0]; \r
+ bAlignment.Position = x[1];\r
+ bAlignment.Bin = x[2]>>16; \r
+ bAlignment.MapQuality = x[2]>>8&0xff; \r
+ bAlignment.AlignmentFlag = x[3]>>16; \r
+ bAlignment.MateRefID = x[5]; \r
+ bAlignment.MatePosition = x[6]; \r
+ bAlignment.InsertSize = x[7];\r
+\r
+ // fetch & store often-used lengths for character data parsing\r
+ unsigned int queryNameLength = x[2]&0xff;\r
+ unsigned int numCigarOperations = x[3]&0xffff;\r
+ unsigned int querySequenceLength = x[4];\r
+ \r
+ // get length of character data\r
+ int dataLength = block_len - BAM_CORE_SIZE;\r
+\r
+ // set up destination buffers for character data\r
+ uint8_t* allCharData = (uint8_t*)calloc(sizeof(uint8_t), dataLength);\r
+ uint32_t* cigarData = (uint32_t*)(allCharData+queryNameLength);\r
+ \r
+ // get character data - make sure proper data size was read\r
+ if (bam_read(m_file, allCharData, dataLength) != dataLength) { return false; }\r
+ else {\r
+\r
+ bytesRead += dataLength;\r
+\r
+ // clear out bases, qualities, aligned bases, and CIGAR\r
+ bAlignment.QueryBases.clear();\r
+ bAlignment.Qualities.clear();\r
+ bAlignment.AlignedBases.clear();\r
+ bAlignment.CigarData.clear();\r
+\r
+ // save name\r
+ bAlignment.Name = (string)((const char*)(allCharData));\r
+ \r
+ // save bases\r
+ char singleBase[2];\r
+ uint8_t* s = ( allCharData + (numCigarOperations*4) + queryNameLength);\r
+ for (unsigned int i = 0; i < querySequenceLength; ++i) { \r
+ // numeric to char conversion\r
+ sprintf( singleBase, "%c", DNA_LOOKUP[ ( ( s[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ] );\r
+ // append character data to Bases\r
+ bAlignment.QueryBases.append( (const char*)singleBase );\r
+ }\r
+ \r
+ // save sequence length\r
+ bAlignment.Length = bAlignment.QueryBases.length();\r
+ \r
+ // save qualities\r
+ char singleQuality[4];\r
+ uint8_t* t = ( allCharData + (numCigarOperations*4) + queryNameLength + (querySequenceLength + 1)/2);\r
+ for (unsigned int i = 0; i < querySequenceLength; ++i) { \r
+ // numeric to char conversion\r
+ sprintf( singleQuality, "%c", ( t[i]+33 ) ); \r
+ // append character data to Qualities\r
+ bAlignment.Qualities.append( (const char*)singleQuality );\r
+ }\r
+ \r
+ // save CIGAR-related data;\r
+ int k = 0;\r
+ for (unsigned int i = 0; i < numCigarOperations; ++i) {\r
+\r
+ // build CigarOp struct\r
+ CigarOp op;\r
+ op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);\r
+ op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ];\r
+\r
+ // save CigarOp\r
+ bAlignment.CigarData.push_back(op);\r
+\r
+ // can skip this step if user wants to ignore\r
+ if (m_isCalculateAlignedBases) {\r
+\r
+ // build AlignedBases string\r
+ switch (op.Type) {\r
+ \r
+ case ('M') : \r
+ case ('I') : bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) ); // for 'M', 'I' - write bases\r
+ case ('S') : k += op.Length; // for 'S' - skip over query bases\r
+ break;\r
+ \r
+ case ('D') : \r
+ case ('P') : bAlignment.AlignedBases.append( op.Length, '*' ); // for 'D', 'P' - write padding;\r
+ break;\r
+ \r
+ case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in query sequence\r
+ k += op.Length;\r
+ break;\r
+\r
+ case ('H') : break; // for 'H' - do nothing, move to next op\r
+ \r
+ default : assert(false); // shouldn't get here\r
+ break;\r
+ }\r
+ }\r
+ } \r
+ }\r
+ free(allCharData);\r
+\r
+/*\r
+ // (optional) read tag parsing data\r
+ string tag;\r
+ char data;\r
+ int i = 0;\r
+\r
+ // still data to read (auxiliary tags)\r
+ while ( bytesRead < block_len ) {\r
+\r
+ if ( bam_read(m_file, &data, 1) == 1 ) { \r
+ \r
+ ++bytesRead;\r
+\r
+ if (bytesRead == block_len && data != '\0') {\r
+ fprintf(stderr, "ERROR: Invalid termination of tag info at end of alignment block.\n");\r
+ exit(1);\r
+ }\r
+\r
+ tag.append(1, data);\r
+ if ( data == '\0' ) {\r
+ bAlignment.Tags.push_back(tag);\r
+ tag = "";\r
+ i = 0;\r
+ } else {\r
+ if ( (i == 1) && (i == 2) ) { tag.append(1, ':'); }\r
+ ++i;\r
+ }\r
+ } else {\r
+ fprintf(stderr, "ERROR: Could not read tag info.\n");\r
+ exit(1);\r
+ }\r
+ }\r
+*/\r
+ return true;\r
+}\r
--- /dev/null
+// BamReader.h\r
+\r
+/* The MIT License\r
+\r
+ Copyright (c) 2008 Genome Research Ltd (GRL).\r
+\r
+ Permission is hereby granted, free of charge, to any person obtaining\r
+ a copy of this software and associated documentation files (the\r
+ "Software"), to deal in the Software without restriction, including\r
+ without limitation the rights to use, copy, modify, merge, publish,\r
+ distribute, sublicense, and/or sell copies of the Software, and to\r
+ permit persons to whom the Software is furnished to do so, subject to\r
+ the following conditions:\r
+\r
+ The above copyright notice and this permission notice shall be\r
+ included in all copies or substantial portions of the Software.\r
+\r
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,\r
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\r
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\r
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS\r
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN\r
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN\r
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\r
+ SOFTWARE.\r
+*/\r
+\r
+/*\r
+ Implementation of BAM-parsing was translated to C++ directly from Heng Li's SAMtools package \r
+ (thus the carryover of above MIT license)\r
+ Contact: Derek Barnett <barnetde@bc.edu>\r
+*/\r
+\r
+// Derek Barnett\r
+// Marth Lab, Boston College\r
+// Last modified: 6 April 2009\r
+\r
+#ifndef BAMREADER_H\r
+#define BAMREADER_H\r
+\r
+// custom includes\r
+#include "BamAlignment.h"\r
+#include "STLUtilities.h"\r
+\r
+// C library includes\r
+#include <assert.h>\r
+#include <stdint.h>\r
+#include <stdio.h>\r
+#include <stdlib.h>\r
+#include <string.h>\r
+\r
+// BGZF library includes/defines\r
+#include "bgzf.h"\r
+typedef BGZF* BamFile;\r
+#define bam_open(f_name, mode) bgzf_open(f_name, mode)\r
+#define bam_close(f_ptr) bgzf_close(f_ptr)\r
+#define bam_read(f_ptr, buf, size) bgzf_read(f_ptr, buf, size)\r
+#define bam_write(f_ptr, buf, size) bgzf_write(f_ptr, buf, size)\r
+#define bam_tell(f_ptr) bgzf_tell(f_ptr)\r
+#define bam_seek(f_ptr, pos, dir) bgzf_seek(f_ptr, pos, dir)\r
+\r
+// size of alignment data block in BAM file (bytes)\r
+#define BAM_CORE_SIZE 32\r
+\r
+// BAM indexing constants\r
+#define MAX_BIN 37450 // =(8^6-1)/7+1\r
+#define BAM_MIN_CHUNK_GAP 32768 \r
+#define BAM_LIDX_SHIFT 14\r
+\r
+// CIGAR-retrieval mask/shift constants\r
+#define BAM_CIGAR_SHIFT 4\r
+#define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1)\r
+\r
+// CIGAR-operation types\r
+#define BAM_CMATCH 0\r
+#define BAM_CINS 1\r
+#define BAM_CDEL 2\r
+#define BAM_CREF_SKIP 3\r
+#define BAM_CSOFT_CLIP 4\r
+#define BAM_CHARD_CLIP 5\r
+#define BAM_CPAD 6\r
+\r
+// --------------------------- //\r
+// Bam header info\r
+// --------------------------- //\r
+\r
+// --------------------------- //\r
+// BamIndex-related typedefs\r
+// --------------------------- //\r
+\r
+// offset for linear indexing\r
+typedef vector<uint64_t> LinearOffsetVector;\r
+\r
+// chunk boundaries\r
+typedef pair<uint64_t, uint64_t> ChunkPair;\r
+// list of chunks in a BAM bin\r
+typedef vector<ChunkPair> ChunkVector;\r
+\r
+// BAM bins for a reference sequence\r
+// replaces khash - uint32_t is key, ChunkVector is value\r
+typedef pair<uint32_t, ChunkVector*> BamBin;\r
+typedef vector<BamBin> BinVector;\r
+\r
+// each reference sequence has a BinVector and LinearOffsetVector\r
+typedef pair<BinVector*, LinearOffsetVector*> RefIndex;\r
+\r
+// full BamIndex defined as: \r
+typedef vector<RefIndex*> BamIndex;\r
+\r
+// ---------------------------------------------------------------------------//\r
+\r
+class BamReader {\r
+ \r
+ public:\r
+ // constructors\r
+ BamReader(const char* fileName = NULL, const char* indexFilename = NULL);\r
+\r
+ public:\r
+ // destructor\r
+ ~BamReader(void);\r
+ \r
+ // BAM interface methods\r
+ public:\r
+\r
+ // ----------------------- //\r
+ // File manipulation\r
+ // ----------------------- //\r
+ \r
+ // open BAM file (automatically opens index if provided)\r
+ bool Open(void);\r
+ \r
+ // open BAM index (allows index to be opened separately - i.e. sometime after the BAM file is opened)\r
+ bool OpenIndex(void);\r
+ \r
+ // close BAM file\r
+ bool Close(void);\r
+ \r
+ // get BAM filename\r
+ const char* Filename(void) const;\r
+ \r
+ // set BAM filename\r
+ void SetFilename(const char*);\r
+\r
+ // get BAM Index filename\r
+ const char* IndexFilename(void) const;\r
+ \r
+ // set BAM Index filename\r
+ void SetIndexFilename(const char*);\r
+\r
+ // ----------------------- //\r
+ // Access BAM header\r
+ // ----------------------- //\r
+ \r
+ // return full header text\r
+ const string GetHeaderText(void) const;\r
+ \r
+ // --------------------------------- //\r
+ // Access reference sequence info\r
+ // --------------------------------- //\r
+ \r
+ // return number of reference sequences in BAM file\r
+ const int GetReferenceCount(void) const;\r
+\r
+ // return vector of RefData entries\r
+ const RefVector GetReferenceData(void) const;\r
+\r
+ // get refID from reference name\r
+ const int GetRefID(string refName) const; \r
+ \r
+ // ----------------------------------------- //\r
+ // File position moving\r
+ // ----------------------------------------- //\r
+\r
+ // jumps to 'left' position on refID\r
+ // actually jumps before position, so reads that overlap 'left' are included as well\r
+ // 'left' defaults to reference begin if not specified\r
+ bool Jump(int refID, unsigned int left = 0);\r
+\r
+ // Jump to beginning of BAM file, clears any region previously set by Jump()\r
+ bool Rewind(void);\r
+ \r
+ // ------------------------------ //\r
+ // Access alignments\r
+ // ------------------------------ //\r
+ \r
+ // get next alignment\r
+ bool GetNextAlignment(BamAlignment& read);\r
+\r
+ // allow user to specifiy whether 'AlignedBases' string is calculated when alignment is loaded\r
+ void SetCalculateAlignedBases(bool);\r
+\r
+ // internal utility methods\r
+ private:\r
+ int BinsFromRegion(int, unsigned int, uint16_t[MAX_BIN]);\r
+ uint32_t CalculateAlignmentEnd(const unsigned int&, const vector<CigarOp>&);\r
+ uint64_t GetOffset(int, unsigned int);\r
+ bool IsOverlap(BamAlignment&);\r
+ bool LoadHeader(void);\r
+ bool LoadIndex(void);\r
+ bool LoadNextAlignment(BamAlignment&);\r
+\r
+ private: \r
+ // main BAM reader components\r
+ char* m_filename;\r
+ char* m_indexFilename;\r
+ BamFile m_file;\r
+ BamIndex* m_index;\r
+ RefVector m_references;\r
+ string m_headerText;\r
+\r
+ // state flags\r
+ bool m_isOpen; // BAM file is open for processing\r
+ bool m_isIndexLoaded; // BAM Index data is loaded and available for processing\r
+ bool m_isRegionSpecified; // a region has been specified - specifically, a user has called Jump()\r
+ bool m_isCalculateAlignedBases; // build 'AlignedBases' string when getting an alignment, otherwise skip (default = true)\r
+\r
+ // region values\r
+ int m_currentRefID;\r
+ unsigned int m_currentLeft;\r
+\r
+ // file offset of 1st read in BAM file\r
+ int64_t m_alignmentsBeginOffset;\r
+\r
+ private:\r
+ // BAM character constants\r
+ static const char* DNA_LOOKUP;\r
+ static const char* CIGAR_LOOKUP;\r
+};\r
+\r
+#endif /* BAMREADER_H */\r
--- /dev/null
+// BamReaderMain.cpp\r
+\r
+// Derek Barnett\r
+// Marth Lab, Boston College\r
+// Last modified: 6 April 2009\r
+\r
+#include "BamReader.h"\r
+\r
+// C++ includes\r
+#include <iostream>\r
+using std::cerr;\r
+using std::cout;\r
+using std::endl;\r
+\r
+#include <vector>\r
+using std::vector;\r
+\r
+#include <string>\r
+using std::string;\r
+\r
+int main(int argc, char* argv[]) {\r
+\r
+ // check command line parameters\r
+ if (argc != 3) {\r
+ cerr << endl;\r
+ cerr << "Usage: BamReaderTest <bamFile> <bamIndexFile>" << endl;\r
+ cerr << endl;\r
+ exit(1);\r
+ }\r
+\r
+ // get filenames from command line\r
+ const char* bamFilename = argv[1];\r
+ const char* bamIndexFilename = argv[2];\r
+\r
+ string refName;\r
+ int refID;\r
+ unsigned int leftBound;\r
+ unsigned int rightBound;\r
+\r
+ int alignmentCount;\r
+\r
+ vector<string> refNames;\r
+\r
+ BamAlignment bAlignment;\r
+ BamAlignmentVector alignments;\r
+\r
+ RefVector references;\r
+\r
+ // ------------------------------------------------------------------------------------------------------ //\r
+ // Declare BamReader & open BAM file - automatically loads up header and index file (.bai) information\r
+ // ------------------------------------------------------------------------------------------------------ //\r
+ \r
+ BamReader bReader(bamFilename, bamIndexFilename);\r
+\r
+ cerr << endl;\r
+ cerr << "Opening BamReader for BAM file: " << bamFilename << " ..... ";\r
+ \r
+ if ( bReader.Open() ) { \r
+ cerr << "ok" << endl; \r
+ }\r
+ else {\r
+ cerr << "error" << endl;\r
+ exit(1);\r
+ }\r
+ \r
+ // ------------------------------------------------------------ //\r
+ // Find out how many reference sequences are in BAM file\r
+ // ------------------------------------------------------------ //\r
+ \r
+ references = bReader.GetReferenceData();\r
+\r
+ cerr << endl;\r
+ cerr << "Total number of ref seqs: " << references.size() << endl;\r
+ \r
+ // ---------------------------------------------------------------------------- //\r
+ // Get the names/lengths of all the reference sequences that have alignments\r
+ // ---------------------------------------------------------------------------- //\r
+ \r
+ cerr << endl;\r
+ cerr << "All ref sequences with alignments:" << endl;\r
+ cerr << endl;\r
+\r
+\r
+ if ( !references.empty() ) {\r
+ \r
+ RefVector::iterator refIter = references.begin();\r
+ RefVector::iterator refEnd = references.end();\r
+ \r
+ refID = 0;\r
+ // iterate over reference names, print to STDERR\r
+ for( ; refIter != refEnd; ++refIter) {\r
+\r
+ if ( (*refIter).RefHasAlignments ) {\r
+ cerr << "ID: " << refID << endl;\r
+ cerr << "Name: " << (*refIter).RefName << endl;\r
+ cerr << "Length: " << (*refIter).RefLength << endl;\r
+ cerr << endl;\r
+ }\r
+\r
+ ++refID;\r
+ }\r
+ }\r
+\r
+ // --------------------------------------------------------------------------------------------- //\r
+ // Get the SAM-style header text, if available (not available in the example file shown here)\r
+ // --------------------------------------------------------------------------------------------- // \r
+ \r
+ cerr << "------------------------------------" << endl;\r
+ cerr << "SAM header text: " << endl;\r
+ \r
+ // get (SAM) header text\r
+ string header = bReader.GetHeaderText();\r
+ \r
+ cerr << ( (header.empty()) ? "no header data" : header ) << endl;\r
+ cerr << "------------------------------------" << endl;\r
+\r
+ // --------------------------------------------------------------------------------------------- //\r
+ // Here we start accessing alignments\r
+ // The first method shows how to iterate over all reads in a BamFile\r
+ // This method will work on any BAM file (sorted/non-sorted, with/without an index)\r
+ // --------------------------------------------------------------------------------------------- //\r
+\r
+ // Call Rewind() to make sure you're at the 1st alignment in the file\r
+ // Please note, however, it's not necessary in this case, since file pointer initially set to 1st alignment\r
+ // but this is probably a good habit to develop to ensure correctness and make your intent obvious in the code\r
+\r
+ cerr << "Getting (up to) first 1000 alignments" << endl;\r
+\r
+ // start at 1st alignment\r
+ if ( bReader.Rewind() ) {\r
+ \r
+ alignmentCount = 0;\r
+ while ( bReader.GetNextAlignment(bAlignment) && (alignmentCount < 1000) ) {\r
+ \r
+ // disregard unmapped alignments\r
+ if ( bAlignment.IsMapped() ) {\r
+\r
+ ++alignmentCount;\r
+ \r
+ cout << "----------------------------" << endl;\r
+ cout << "Alignment " << alignmentCount << endl;\r
+ cout << bAlignment.Name << endl;\r
+ cout << bAlignment.AlignedBases << endl;\r
+ cout << "Aligned to " << references.at(bAlignment.RefID).RefName << ":" << bAlignment.Position << endl;\r
+ \r
+ cout << "Cigar Data: " << endl;\r
+\r
+ vector<CigarOp>::const_iterator cigarIter = bAlignment.CigarData.begin();\r
+ vector<CigarOp>::const_iterator cigarEnd = bAlignment.CigarData.end();\r
+ for ( ; cigarIter != cigarEnd; ++cigarIter ) {\r
+ cout << "Type: " << (*cigarIter).Type << "\tLength: " << (*cigarIter).Length << endl;\r
+ }\r
+\r
+ cout << "Tags: " << endl;\r
+ vector<string>::const_iterator tagIter = bAlignment.Tags.begin();\r
+ vector<string>::const_iterator tagEnd = bAlignment.Tags.end();\r
+ for ( ; tagIter != tagEnd; ++tagIter ) {\r
+ cout << (*tagIter) << endl;\r
+ }\r
+ }\r
+ }\r
+\r
+ cerr << "Found " << alignmentCount << " alignments." << endl;\r
+ } else { cerr << "Could not rewind" << endl; }\r
+\r
+ // ---------------------------------------------------------------------------------------------------------- //\r
+ // You can iterate over each individual alignment that overlaps a specified region\r
+ // Set the refID & left boundary parameters using Jump(refID, left)\r
+ // Jump() actually positions the file pointer actually before the left boundary\r
+ // This ensures that reads beginning before, but overlapping 'left' are included as well \r
+ // Client is responsible for setting/checking right boundary - see below\r
+ // ---------------------------------------------------------------------------------------------------------- //\r
+/*\r
+ cerr << "Jumping to region" << endl;\r
+\r
+ // get refID using a known refName\r
+ refName = "seq1";\r
+ refID = bReader.GetRefID(refName);\r
+ if (refID == (int)references.size()) { \r
+ cerr << "Reference " << refName << " not found." << endl;\r
+ exit(1);\r
+ }\r
+\r
+ // set left boundary\r
+ leftBound = 500;\r
+\r
+ // set right boundary - either user-specified number to set a discrete region\r
+ // OR you can query the BamReader for the end of the reference\r
+ rightBound = references.at(refID).RefLength;\r
+\r
+ cerr << endl;\r
+ cerr << "Iterating over alignments on reference: " << refName << " from " << leftBound << " to ref end (" << rightBound << ")" << endl;\r
+\r
+ // set region - specific region on reference\r
+ if ( bReader.Jump(refID, leftBound) ) { \r
+\r
+ alignmentCount = 0;\r
+ while ( bReader.GetNextAlignment(bAlignment) && (bAlignment.Position <= rightBound) ) {\r
+ \r
+ if ( bAlignment.IsMapped() ) {\r
+\r
+ ++alignmentCount;\r
+ \r
+ if ( (alignmentCount % 100000) == 0) { cerr << "Retrieved " << alignmentCount << " so far..." << endl; }\r
+ \r
+ cout << "----------------------------" << endl;\r
+ cout << "Alignment " << alignmentCount << endl;\r
+ cout << bAlignment.Name << endl;\r
+ cout << bAlignment.AlignedBases << endl;\r
+ cout << "Aligned to " << references.at(bAlignment.RefID).RefName << ":" << bAlignment.Position << endl;\r
+ }\r
+ }\r
+\r
+ cerr << "Found " << alignmentCount << " alignments." << endl; \r
+ } else { cerr << "Could not jump to region specified" << endl; }\r
+\r
+ // ----------------------------------------------------------------------------------------------------- //\r
+ // You can 'rewind' back to beginning of BAM file at any time \r
+ // ----------------------------------------------------------------------------------------------------- //\r
+\r
+ cerr << endl;\r
+ cerr << "Rewinding BAM file... then getting first 1000 alignments" << endl;\r
+\r
+ alignmentCount = 0;\r
+ if (bReader.Rewind() ) {\r
+ while ( bReader.GetNextAlignment(bAlignment) && (alignmentCount < 1000) ) {\r
+ \r
+ // disregard unmapped alignments\r
+ if ( bAlignment.IsMapped() ) {\r
+\r
+ ++alignmentCount;\r
+ \r
+ cout << "----------------------------" << endl;\r
+ cout << "Alignment " << alignmentCount << endl;\r
+ cout << bAlignment.Name << endl;\r
+ cout << bAlignment.AlignedBases << endl;\r
+ cout << "Aligned to " << references.at(bAlignment.RefID).RefName << ":" << bAlignment.Position << endl;\r
+ }\r
+ }\r
+\r
+ cerr << "Found " << alignmentCount << " alignments." << endl;\r
+ } else { cerr << "Could not rewind" << endl; }\r
+*/\r
+ // ---------------------------------------------------------------------- //\r
+ // Close BamReader object (releases internal header/index data) and exit\r
+ // ---------------------------------------------------------------------- //\r
+ \r
+ cerr << endl;\r
+ cerr << "Closing BAM file: " << bamFilename << endl;\r
+ \r
+ bReader.Close();\r
+\r
+ cerr << "Exiting..." << endl << endl;\r
+ return 0;\r
+}\r
--- /dev/null
+// ***************************************************************************
+// BamWriter (c) 2009 Michael Strömberg
+// Marth Lab, Deptartment of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for producing BAM files
+// ***************************************************************************
+
+#include "BamWriter.h"
+
+// constructor
+BamWriter::BamWriter(void)
+{}
+
+// destructor
+BamWriter::~BamWriter(void) {
+ if(mBGZF.IsOpen) BgzfClose();
+}
+
+// closes the BAM file
+void BamWriter::BgzfClose(void) {
+
+ mBGZF.IsOpen = false;
+
+ // flush the BGZF block
+ BgzfFlushBlock();
+
+ // flush and close
+ fflush(mBGZF.Stream);
+ fclose(mBGZF.Stream);
+}
+
+// compresses the current block
+int BamWriter::BgzfDeflateBlock(void) {
+
+ // initialize the gzip header
+ char* buffer = mBGZF.CompressedBlock;
+ unsigned int bufferSize = mBGZF.CompressedBlockSize;
+
+ memset(buffer, 0, 18);
+ buffer[0] = GZIP_ID1;
+ buffer[1] = (char)GZIP_ID2;
+ buffer[2] = CM_DEFLATE;
+ buffer[3] = FLG_FEXTRA;
+ buffer[9] = (char)OS_UNKNOWN;
+ buffer[10] = BGZF_XLEN;
+ buffer[12] = BGZF_ID1;
+ buffer[13] = BGZF_ID2;
+ buffer[14] = BGZF_LEN;
+
+ // loop to retry for blocks that do not compress enough
+ int inputLength = mBGZF.BlockOffset;
+ int compressedLength = 0;
+
+ while(true) {
+
+ z_stream zs;
+ zs.zalloc = NULL;
+ zs.zfree = NULL;
+ zs.next_in = (Bytef*)mBGZF.UncompressedBlock;
+ zs.avail_in = inputLength;
+ zs.next_out = (Bytef*)&buffer[BLOCK_HEADER_LENGTH];
+ zs.avail_out = bufferSize - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
+
+ // initialize the zlib compression algorithm
+ if(deflateInit2(&zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK) {
+ printf("ERROR: zlib deflate initialization failed.\n");
+ exit(1);
+ }
+
+ // compress the data
+ int status = deflate(&zs, Z_FINISH);
+ if(status != Z_STREAM_END) {
+ deflateEnd(&zs);
+
+ // reduce the input length and try again
+ if(status == Z_OK) {
+ inputLength -= 1024;
+ if(inputLength < 0) {
+ printf("ERROR: input reduction failed.\n");
+ exit(1);
+ }
+ continue;
+ }
+
+ printf("ERROR: zlib deflate failed.\n");
+ exit(1);
+ }
+
+ // finalize the compression routine
+ if(deflateEnd(&zs) != Z_OK) {
+ printf("ERROR: deflate end failed.\n");
+ exit(1);
+ }
+
+ compressedLength = zs.total_out;
+ compressedLength += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
+
+ if(compressedLength > MAX_BLOCK_SIZE) {
+ printf("ERROR: deflate overflow.\n");
+ exit(1);
+ }
+
+ break;
+ }
+
+ // store the compressed length
+ BgzfPackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1));
+
+ // store the CRC32 checksum
+ unsigned int crc = crc32(0, NULL, 0);
+ crc = crc32(crc, (Bytef*)mBGZF.UncompressedBlock, inputLength);
+ BgzfPackUnsignedInt(&buffer[compressedLength - 8], crc);
+ BgzfPackUnsignedInt(&buffer[compressedLength - 4], inputLength);
+
+ // ensure that we have less than a block of data left
+ int remaining = mBGZF.BlockOffset - inputLength;
+ if(remaining > 0) {
+ if(remaining > inputLength) {
+ printf("ERROR: remainder too large.\n");
+ exit(1);
+ }
+
+ memcpy(mBGZF.UncompressedBlock, mBGZF.UncompressedBlock + inputLength, remaining);
+ }
+
+ mBGZF.BlockOffset = remaining;
+ return compressedLength;
+}
+
+// flushes the data in the BGZF block
+void BamWriter::BgzfFlushBlock(void) {
+
+ // flush all of the remaining blocks
+ while(mBGZF.BlockOffset > 0) {
+
+ // compress the data block
+ int blockLength = BgzfDeflateBlock();
+
+ // flush the data to our output stream
+ int numBytesWritten = fwrite(mBGZF.CompressedBlock, 1, blockLength, mBGZF.Stream);
+
+ if(numBytesWritten != blockLength) {
+ printf("ERROR: Expected to write %u bytes during flushing, but wrote %u bytes.\n", blockLength, numBytesWritten);
+ exit(1);
+ }
+
+ mBGZF.BlockAddress += blockLength;
+ }
+}
+
+// opens the BAM file for writing
+void BamWriter::BgzfOpen(const string& filename) {
+
+ mBGZF.Stream = fopen(filename.c_str(), "wb");
+
+ if(!mBGZF.Stream) {
+ printf("ERROR: Unable to open the BAM file (%s) for writing.\n", filename.c_str());
+ exit(1);
+ }
+
+ mBGZF.IsOpen = true;
+}
+
+// writes the supplied data into the BGZF buffer
+unsigned int BamWriter::BgzfWrite(const char* data, const unsigned int dataLen) {
+
+ // initialize
+ unsigned int numBytesWritten = 0;
+ const char* input = data;
+ unsigned int blockLength = mBGZF.UncompressedBlockSize;
+
+ // copy the data to the buffer
+ while(numBytesWritten < dataLen) {
+ unsigned int copyLength = min(blockLength - mBGZF.BlockOffset, dataLen - numBytesWritten);
+ char* buffer = mBGZF.UncompressedBlock;
+ memcpy(buffer + mBGZF.BlockOffset, input, copyLength);
+
+ mBGZF.BlockOffset += copyLength;
+ input += copyLength;
+ numBytesWritten += copyLength;
+
+ if(mBGZF.BlockOffset == blockLength) BgzfFlushBlock();
+ }
+
+ return numBytesWritten;
+}
+
+// closes the alignment archive
+void BamWriter::Close(void) {
+ if(mBGZF.IsOpen) BgzfClose();
+}
+
+// creates a cigar string from the supplied alignment
+void BamWriter::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {
+
+ // initialize
+ const unsigned int numCigarOperations = cigarOperations.size();
+ packedCigar.resize(numCigarOperations * SIZEOF_INT);
+
+ // pack the cigar data into the string
+ unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();
+
+ unsigned int cigarOp;
+ vector<CigarOp>::const_iterator coIter;
+ for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); coIter++) {
+
+ switch(coIter->Type) {
+ case 'M':
+ cigarOp = BAM_CMATCH;
+ break;
+ case 'I':
+ cigarOp = BAM_CINS;
+ break;
+ case 'D':
+ cigarOp = BAM_CDEL;
+ break;
+ case 'N':
+ cigarOp = BAM_CREF_SKIP;
+ break;
+ case 'S':
+ cigarOp = BAM_CSOFT_CLIP;
+ break;
+ case 'H':
+ cigarOp = BAM_CHARD_CLIP;
+ break;
+ case 'P':
+ cigarOp = BAM_CPAD;
+ break;
+ default:
+ printf("ERROR: Unknown cigar operation found: %c\n", coIter->Type);
+ exit(1);
+ }
+
+ *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp;
+ pPackedCigar++;
+ }
+}
+
+// encodes the supplied query sequence into 4-bit notation
+void BamWriter::EncodeQuerySequence(const string& query, string& encodedQuery) {
+
+ // prepare the encoded query string
+ const unsigned int queryLen = query.size();
+ const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5);
+ encodedQuery.resize(encodedQueryLen);
+ char* pEncodedQuery = (char*)encodedQuery.data();
+ const char* pQuery = (const char*)query.data();
+
+ unsigned char nucleotideCode;
+ bool useHighWord = true;
+
+ while(*pQuery) {
+
+ switch(*pQuery) {
+ case '=':
+ nucleotideCode = 0;
+ break;
+ case 'A':
+ nucleotideCode = 1;
+ break;
+ case 'C':
+ nucleotideCode = 2;
+ break;
+ case 'G':
+ nucleotideCode = 4;
+ break;
+ case 'T':
+ nucleotideCode = 8;
+ break;
+ case 'N':
+ nucleotideCode = 15;
+ break;
+ default:
+ printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);
+ exit(1);
+ }
+
+ // pack the nucleotide code
+ if(useHighWord) {
+ *pEncodedQuery = nucleotideCode << 4;
+ useHighWord = false;
+ } else {
+ *pEncodedQuery |= nucleotideCode;
+ pEncodedQuery++;
+ useHighWord = true;
+ }
+
+ // increment the query position
+ pQuery++;
+ }
+}
+
+// opens the alignment archive
+void BamWriter::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences) {
+
+ // open the BGZF file for writing
+ BgzfOpen(filename);
+
+ // ================
+ // write the header
+ // ================
+
+ // write the BAM signature
+ const unsigned char SIGNATURE_LENGTH = 4;
+ const char* BAM_SIGNATURE = "BAM\1";
+ BgzfWrite(BAM_SIGNATURE, SIGNATURE_LENGTH);
+
+ // write the SAM header text length
+ const unsigned int samHeaderLen = samHeader.size();
+ BgzfWrite((char*)&samHeaderLen, SIZEOF_INT);
+
+ // write the SAM header text
+ if(samHeaderLen > 0) BgzfWrite(samHeader.data(), samHeaderLen);
+
+ // write the number of reference sequences
+ const unsigned int numReferenceSequences = referenceSequences.size();
+ BgzfWrite((char*)&numReferenceSequences, SIZEOF_INT);
+
+ // =============================
+ // write the sequence dictionary
+ // =============================
+
+ RefVector::const_iterator rsIter;
+ for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) {
+
+ // write the reference sequence name length
+ const unsigned int referenceSequenceNameLen = rsIter->RefName.size() + 1;
+ BgzfWrite((char*)&referenceSequenceNameLen, SIZEOF_INT);
+
+ // write the reference sequence name
+ BgzfWrite(rsIter->RefName.c_str(), referenceSequenceNameLen);
+
+ // write the reference sequence length
+ BgzfWrite((char*)&rsIter->RefLength, SIZEOF_INT);
+ }
+}
+
+// saves the alignment to the alignment archive
+void BamWriter::SaveAlignment(const BamAlignment& al) {
+
+ // initialize
+ const unsigned int nameLen = al.Name.size() + 1;
+ const unsigned int queryLen = al.QueryBases.size();
+ const unsigned int numCigarOperations = al.CigarData.size();
+
+ // create our packed cigar string
+ string packedCigar;
+ CreatePackedCigar(al.CigarData, packedCigar);
+ const unsigned int packedCigarLen = packedCigar.size();
+
+ // encode the query
+ string encodedQuery;
+ EncodeQuerySequence(al.QueryBases, encodedQuery);
+ const unsigned int encodedQueryLen = encodedQuery.size();
+
+ // create our read group tag
+ // TODO: add read group tag support when it shows up in the BamAlignment struct
+ //string readGroupTag;
+ //const unsigned int readGroupTagLen = 3 + mReadGroupLUT[al.ReadGroupCode].NameLen + 1;
+ //readGroupTag.resize(readGroupTagLen);
+ //char* pReadGroupTag = (char*)readGroupTag.data();
+ //sprintf(pReadGroupTag, "RGZ%s", mReadGroupLUT[al.ReadGroupCode].Name);
+
+ // assign the BAM core data
+ unsigned int buffer[8];
+ buffer[0] = al.RefID;
+ buffer[1] = al.Position;
+ buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | nameLen;
+ buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;
+ buffer[4] = queryLen;
+ buffer[5] = al.MateRefID;
+ buffer[6] = al.MatePosition;
+ buffer[7] = al.InsertSize;
+
+ // write the block size
+ const unsigned int dataBlockSize = nameLen + packedCigarLen + encodedQueryLen + queryLen /* + readGroupTagLen*/;
+ const unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;
+
+ BgzfWrite((char*)&blockSize, SIZEOF_INT);
+
+ // write the BAM core
+ BgzfWrite((char*)&buffer, BAM_CORE_SIZE);
+
+ // write the query name
+ BgzfWrite(al.Name.c_str(), nameLen);
+
+ // write the packed cigar
+ BgzfWrite(packedCigar.data(), packedCigarLen);
+
+ // write the encoded query sequence
+ BgzfWrite(encodedQuery.data(), encodedQueryLen);
+
+ // write the base qualities
+ string baseQualities = al.Qualities;
+ char* pBaseQualities = (char*)al.Qualities.data();
+ for(unsigned int i = 0; i < queryLen; i++) pBaseQualities[i] -= 33;
+ BgzfWrite(pBaseQualities, queryLen);
+
+ // write the read group tag
+ // TODO: add read group tag support when it shows up in the BamAlignment struct
+ //BgzfWrite(readGroupTag.c_str(), readGroupTagLen);
+}
--- /dev/null
+// ***************************************************************************
+// BamWriter (c) 2009 Michael Strömberg
+// Marth Lab, Deptartment of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for producing BAM files
+// ***************************************************************************
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <zlib.h>
+#include "BamAlignment.h"
+
+using namespace std;
+
+// our zlib constants
+#define GZIP_ID1 31
+#define GZIP_ID2 139
+#define CM_DEFLATE 8
+#define FLG_FEXTRA 4
+#define OS_UNKNOWN 255
+#define BGZF_XLEN 6
+#define BGZF_ID1 66
+#define BGZF_ID2 67
+#define BGZF_LEN 2
+#define GZIP_WINDOW_BITS -15
+#define Z_DEFAULT_MEM_LEVEL 8
+
+// our BZGF constants
+#define BLOCK_HEADER_LENGTH 18
+#define BLOCK_FOOTER_LENGTH 8
+#define MAX_BLOCK_SIZE 65536
+#define DEFAULT_BLOCK_SIZE 65536
+
+// our BAM constants
+#define BAM_CORE_SIZE 32
+#define BAM_CMATCH 0
+#define BAM_CINS 1
+#define BAM_CDEL 2
+#define BAM_CREF_SKIP 3
+#define BAM_CSOFT_CLIP 4
+#define BAM_CHARD_CLIP 5
+#define BAM_CPAD 6
+#define BAM_CIGAR_SHIFT 4
+
+#define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1)
+
+// our variable sizes
+#define SIZEOF_INT 4
+
+// define our BZGF structure
+struct BgzfData {
+ unsigned int UncompressedBlockSize;
+ unsigned int CompressedBlockSize;
+ unsigned int BlockLength;
+ unsigned int BlockOffset;
+ uint64_t BlockAddress;
+ bool IsOpen;
+ FILE* Stream;
+ char* UncompressedBlock;
+ char* CompressedBlock;
+
+ // constructor
+ BgzfData(void)
+ : UncompressedBlockSize(DEFAULT_BLOCK_SIZE)
+ , CompressedBlockSize(MAX_BLOCK_SIZE)
+ , BlockLength(0)
+ , BlockOffset(0)
+ , BlockAddress(0)
+ , IsOpen(false)
+ , Stream(NULL)
+ , UncompressedBlock(NULL)
+ , CompressedBlock(NULL)
+ {
+ try {
+ CompressedBlock = new char[CompressedBlockSize];
+ UncompressedBlock = new char[UncompressedBlockSize];
+ } catch(bad_alloc&) {
+ printf("ERROR: Unable to allocate memory for our BGZF object.\n");
+ exit(1);
+ }
+ }
+
+ // destructor
+ ~BgzfData(void) {
+ if(CompressedBlock) delete [] CompressedBlock;
+ if(UncompressedBlock) delete [] UncompressedBlock;
+ }
+};
+
+class BamWriter {
+public:
+ // constructor
+ BamWriter(void);
+ // destructor
+ ~BamWriter(void);
+ // closes the alignment archive
+ void Close(void);
+ // opens the alignment archive
+ void Open(const string& filename, const string& samHeader, const RefVector& referenceSequences);
+ // saves the alignment to the alignment archive
+ void SaveAlignment(const BamAlignment& al);
+private:
+ // closes the BAM file
+ void BgzfClose(void);
+ // compresses the current block
+ int BgzfDeflateBlock(void);
+ // flushes the data in the BGZF block
+ void BgzfFlushBlock(void);
+ // opens the BAM file for writing
+ void BgzfOpen(const string& filename);
+ // packs an unsigned integer into the specified buffer
+ static inline void BgzfPackUnsignedInt(char* buffer, unsigned int value);
+ // packs an unsigned short into the specified buffer
+ static inline void BgzfPackUnsignedShort(char* buffer, unsigned short value);
+ // writes the supplied data into the BGZF buffer
+ unsigned int BgzfWrite(const char* data, const unsigned int dataLen);
+ // calculates the minimum bin that contains a region [begin, end)
+ static inline unsigned int CalculateMinimumBin(unsigned int begin, unsigned int end);
+ // creates a packed cigar string from the supplied alignment
+ static void CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar);
+ // encodes the supplied query sequence into 4-bit notation
+ static void EncodeQuerySequence(const string& query, string& encodedQuery);
+ // our BGZF output object
+ BgzfData mBGZF;
+};
+
+// packs an unsigned integer into the specified buffer
+inline void BamWriter::BgzfPackUnsignedInt(char* buffer, unsigned int value) {
+ buffer[0] = (char)value;
+ buffer[1] = (char)(value >> 8);
+ buffer[2] = (char)(value >> 16);
+ buffer[3] = (char)(value >> 24);
+}
+
+// packs an unsigned short into the specified buffer
+inline void BamWriter::BgzfPackUnsignedShort(char* buffer, unsigned short value) {
+ buffer[0] = (char)value;
+ buffer[1] = (char)(value >> 8);
+}
--- /dev/null
+CC= gcc
+CXX= g++
+CFLAGS= -Wall -pg -O3 -m64
+CXXFLAGS= $(CFLAGS)
+DFLAGS= -D_IOLIB=2 #-D_NDEBUG
+OBJS= BamReader.o bgzf.o
+PROG= BamReaderTest
+INCLUDES=
+ARFLAGS= -crs
+LIBS= -lz
+SUBDIRS= .
+
+.SUFFIXES:.c .cpp .o
+
+.c.o:
+ $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@
+
+.cpp.o:
+ $(CXX) -c $(CXXFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@
+
+all: $(PROG) BamConversion
+
+lib:libbambc.a
+
+libbambc.a:$(OBJS)
+ $(AR) $(ARFLAGS) $@ $(OBJS)
+
+BamReaderTest:lib BamReaderMain.o
+ $(CXX) $(CXXFLAGS) -o $@ BamReaderMain.o $(LIBS) -L. -lbambc
+
+BamConversion: lib BamWriter.o BamConversionMain.o
+ $(CXX) $(CXXFLAGS) -o $@ BamWriter.o BamConversionMain.o $(LIBS) -L. -lbambc
+
+clean:
+ rm -fr gmon.out *.o *.a a.out $(PROG) *~
--- /dev/null
+---------------
+README
+----------------
+
+Copy the 4 header files (*.h) and the library file (libbambc.a) to your directory.
+
+When you compile your program, add this to the g++ command:
+
+ -lz -L. -lbambc
+
+and you should be good to go. (Those are lower case l's, as in 'lion'.)
--- /dev/null
+// STLUtilities.h\r
+\r
+// Derek Barnett\r
+// Marth Lab, Boston College\r
+// Last modified: 19 March 2009\r
+\r
+#ifndef STL_UTILITIES_H\r
+#define STL_UTILITIES_H\r
+\r
+#include <algorithm>\r
+using std::distance;\r
+using std::find;\r
+using std::transform;\r
+\r
+#include <functional>\r
+using std::unary_function;\r
+\r
+#include <iterator>\r
+using std::iterator_traits;\r
+\r
+#include <string>\r
+using std::string;\r
+\r
+#include <utility>\r
+using std::pair;\r
+\r
+#include <vector>\r
+using std::vector;\r
+\r
+// -------------------------------------------------------------------------------- //\r
+// This file contains a sample of STL tricks I've gathered along the way.\r
+//\r
+// Many thanks throughout to 'Effective' series by Scott Meyers.\r
+// Some code is lifted (almost) verbatim from his texts where applicable.\r
+// ------------------------------------------------------------------------------- //\r
+\r
+// --------------------------------------------------------------------------------------------------------------------------------- //\r
+// STL containers will delete the values they hold when the container is deleted or goes out of scope\r
+// *** If the container contains pointers, however, they WILL NOT delete the actual pointed-to objects ***\r
+// This struct allows for easy algorithm processing (instead of pesky loops and iterator-boundary issues) to delete pointed-to objects (for any C++ type) that have been allocated with 'new')\r
+// Usage example: for_each( container.begin(), container.end(), DeleteObject() ) \r
+//\r
+// Unless of course, you like quirky, hard-to-spot memory leaks... then feel free to disregard this little STL lesson =)\r
+// --------------------------------------------------------------------------------------------------------------------------------- //\r
+struct DeleteObject {\r
+ template<typename T>\r
+ void operator() (const T* p) const { \r
+ delete p; \r
+ }\r
+};\r
+\r
+template<typename T>\r
+void ClearPointerVector(vector<T>& v) {\r
+ if ( !v.empty() ) {\r
+ for_each(v.begin(), v.end(), DeleteObject());\r
+ v.clear();\r
+ }\r
+}\r
+\r
+// --------------------------------------------------------------------------------------------------------------------------------- //\r
+// Query a vector (other containers? havent tried) for an element\r
+// Returns the index of that element\r
+// Returns vector::size() if not found\r
+// Works with reverse iterators as well... index is counted backward from last element though\r
+// --------------------------------------------------------------------------------------------------------------------------------- //\r
+template < typename InputIterator, typename EqualityComparable >\r
+typename iterator_traits<InputIterator>::difference_type\r
+Index(const InputIterator& begin, const InputIterator& end, const EqualityComparable& item) {\r
+ return distance(begin, find(begin, end, item));\r
+}\r
+\r
+// ----------------------------------------------------------------------------------------------------------------------//\r
+// This next section is a sort of work-around for the bulky associative maps\r
+// The idea is to use a *SORTED* vector of pair objects as a lookup vector\r
+// LookupVector = vector< pair<Key, Value> > \r
+// ----------------------------------------------------------------------------------------------------------------------//\r
+\r
+// The following template classes allow a templatized comparison function for sorting & searching lookup vector\r
+\r
+// allows sorting by Key ( less<Key> )\r
+template <typename Key, typename Value>\r
+class LookupKeyCompare {\r
+\r
+ typedef pair< Key, Value > LookupData;\r
+ typedef typename LookupData::first_type Key_t;\r
+ \r
+ public:\r
+ bool operator() (const LookupData& lhs, const LookupData& rhs) const { return keyLess(lhs.first, rhs.first); }\r
+ bool operator() (const LookupData& lhs, const Key_t& k) const { return keyLess(lhs.first, k); }\r
+ bool operator() (const Key_t& k, const LookupData& rhs) const { return keyLess(k, rhs.first); }\r
+ private:\r
+ bool keyLess(const Key_t& k1, const Key_t& k2) const { return k1 < k2; }\r
+};\r
+\r
+// allows sorting by Value ( less<Value> )\r
+template <typename Key, typename Value>\r
+class LookupValueCompare {\r
+\r
+ typedef pair< Key, Value > LookupData;\r
+ typedef typename LookupData::second_type Value_t;\r
+ \r
+ public:\r
+ bool operator() (const LookupData& lhs, const LookupData& rhs) const { return valueLess(lhs.second, rhs.second); }\r
+ bool operator() (const LookupData& lhs, const Value_t& k) const { return valueLess(lhs.second, k); }\r
+ bool operator() (const Value_t& k, const LookupData& rhs) const { return valueLess(k, rhs.second); }\r
+ private:\r
+ bool valueLess(const Value_t& k1, const Value_t& k2) const { return k1 < k2; }\r
+};\r
+\r
+// The following template functions/structs allow you to pull all keys or all values from this lookup structure\r
+\r
+// pull Key from a data pair\r
+template<typename Key, typename Value>\r
+struct RipKey:\r
+ public unary_function< pair<Key,Value>, Key> {\r
+ Key operator() (pair<Key,Value> dataPair) { \r
+ return dataPair.first; \r
+ }\r
+};\r
+\r
+// pull Value from a data pair\r
+template<typename Key, typename Value>\r
+struct RipValue:\r
+ public unary_function< pair<Key,Value>, Value> {\r
+ Value operator() (pair<Key,Value> dataPair) { \r
+ return dataPair.second; \r
+ }\r
+};\r
+\r
+// pull all Keys from lookup, store in dest (overwrite contents of dest)\r
+template <typename Key, typename Value>\r
+void RipKeys( vector< pair<Key,Value> >& lookup, vector<Key>& dest) {\r
+ dest.clear();\r
+ dest.reserve(lookup.size());\r
+ transform(lookup.begin(), lookup.end(), back_inserter(dest), RipKey<Key,Value>());\r
+}\r
+\r
+// pull all Values from lookup, store in dest (overwrite contents of dest)\r
+template <typename Key, typename Value>\r
+void RipValues( vector< pair<Key,Value> >& lookup, vector<Value>& dest) {\r
+ dest.clear();\r
+ dest.reserve(lookup.size());\r
+ transform(lookup.begin(), lookup.end(), back_inserter(dest), RipValue<Key,Value>());\r
+}\r
+\r
+#endif /* STL_UTILITIES_H */\r
--- /dev/null
+/*
+ * The Broad Institute
+ * SOFTWARE COPYRIGHT NOTICE AGREEMENT
+ * This software and its documentation are copyright 2008 by the
+ * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
+ *
+ * This software is supplied without any warranty or guaranteed support whatsoever.
+ * Neither the Broad Institute nor MIT can be responsible for its use, misuse,
+ * or functionality.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include "bgzf.h"
+
+extern off_t ftello(FILE *stream);
+extern int fseeko(FILE *stream, off_t offset, int whence);
+
+typedef int8_t byte;
+
+static const int DEFAULT_BLOCK_SIZE = 64 * 1024;
+static const int MAX_BLOCK_SIZE = 64 * 1024;
+
+static const int BLOCK_HEADER_LENGTH = 18;
+static const int BLOCK_FOOTER_LENGTH = 8;
+
+static const int GZIP_ID1 = 31;
+static const int GZIP_ID2 = 139;
+static const int CM_DEFLATE = 8;
+static const int FLG_FEXTRA = 4;
+static const int OS_UNKNOWN = 255;
+static const int BGZF_ID1 = 66; // 'B'
+static const int BGZF_ID2 = 67; // 'C'
+static const int BGZF_LEN = 2;
+static const int BGZF_XLEN = 6; // BGZF_LEN+4
+
+static const int GZIP_WINDOW_BITS = -15; // no zlib header
+static const int Z_DEFAULT_MEM_LEVEL = 8;
+
+
+inline
+void
+packInt16(uint8_t* buffer, uint16_t value)
+{
+ buffer[0] = value;
+ buffer[1] = value >> 8;
+}
+
+inline
+int
+unpackInt16(const uint8_t* buffer)
+{
+ return (buffer[0] | (buffer[1] << 8));
+}
+
+inline
+void
+packInt32(uint8_t* buffer, uint32_t value)
+{
+ buffer[0] = value;
+ buffer[1] = value >> 8;
+ buffer[2] = value >> 16;
+ buffer[3] = value >> 24;
+}
+
+inline
+int
+min(int x, int y)
+{
+ return (x < y) ? x : y;
+}
+
+static
+void
+report_error(BGZF* fp, const char* message) {
+ fp->error = message;
+}
+
+static
+BGZF*
+open_read(int fd)
+{
+ FILE* file = fdopen(fd, "r");
+ BGZF* fp = malloc(sizeof(BGZF));
+ fp->file_descriptor = fd;
+ fp->open_mode = 'r';
+ fp->owned_file = 0;
+ fp->file = file;
+ fp->uncompressed_block_size = MAX_BLOCK_SIZE;
+ fp->uncompressed_block = malloc(MAX_BLOCK_SIZE);
+ fp->compressed_block_size = MAX_BLOCK_SIZE;
+ fp->compressed_block = malloc(MAX_BLOCK_SIZE);
+ fp->block_address = 0;
+ fp->block_offset = 0;
+ fp->block_length = 0;
+ fp->error = NULL;
+ return fp;
+}
+
+static
+BGZF*
+open_write(int fd)
+{
+ FILE* file = fdopen(fd, "w");
+ BGZF* fp = malloc(sizeof(BGZF));
+ fp->file_descriptor = fd;
+ fp->open_mode = 'w';
+ fp->owned_file = 0;
+ fp->file = file;
+ fp->uncompressed_block_size = DEFAULT_BLOCK_SIZE;
+ fp->uncompressed_block = NULL;
+ fp->compressed_block_size = MAX_BLOCK_SIZE;
+ fp->compressed_block = malloc(MAX_BLOCK_SIZE);
+ fp->block_address = 0;
+ fp->block_offset = 0;
+ fp->block_length = 0;
+ fp->error = NULL;
+ return fp;
+}
+
+BGZF*
+bgzf_open(const char* __restrict path, const char* __restrict mode)
+{
+ BGZF* fp = NULL;
+ if (strcasecmp(mode, "r") == 0) {
+ int oflag = O_RDONLY;
+ int fd = open(path, oflag);
+ fp = open_read(fd);
+ } else if (strcasecmp(mode, "w") == 0) {
+ int oflag = O_WRONLY | O_CREAT | O_TRUNC;
+ int fd = open(path, oflag, 0644);
+ fp = open_write(fd);
+ }
+ if (fp != NULL) {
+ fp->owned_file = 1;
+ }
+ return fp;
+}
+
+BGZF*
+bgzf_fdopen(int fd, const char * __restrict mode)
+{
+ if (strcasecmp(mode, "r") == 0) {
+ return open_read(fd);
+ } else if (strcasecmp(mode, "w") == 0) {
+ return open_write(fd);
+ } else {
+ return NULL;
+ }
+}
+
+static
+int
+deflate_block(BGZF* fp, int block_length)
+{
+ // Deflate the block in fp->uncompressed_block into fp->compressed_block.
+ // Also adds an extra field that stores the compressed block length.
+
+ byte* buffer = fp->compressed_block;
+ int buffer_size = fp->compressed_block_size;
+
+ // Init gzip header
+ buffer[0] = GZIP_ID1;
+ buffer[1] = GZIP_ID2;
+ buffer[2] = CM_DEFLATE;
+ buffer[3] = FLG_FEXTRA;
+ buffer[4] = 0; // mtime
+ buffer[5] = 0;
+ buffer[6] = 0;
+ buffer[7] = 0;
+ buffer[8] = 0;
+ buffer[9] = OS_UNKNOWN;
+ buffer[10] = BGZF_XLEN;
+ buffer[11] = 0;
+ buffer[12] = BGZF_ID1;
+ buffer[13] = BGZF_ID2;
+ buffer[14] = BGZF_LEN;
+ buffer[15] = 0;
+ buffer[16] = 0; // placeholder for block length
+ buffer[17] = 0;
+
+ // loop to retry for blocks that do not compress enough
+ int input_length = block_length;
+ int compressed_length = 0;
+ while (1) {
+
+ z_stream zs;
+ zs.zalloc = NULL;
+ zs.zfree = NULL;
+ zs.next_in = fp->uncompressed_block;
+ zs.avail_in = input_length;
+ zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH];
+ zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
+
+ int status = deflateInit2(&zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
+ GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
+ if (status != Z_OK) {
+ report_error(fp, "deflate init failed");
+ return -1;
+ }
+ status = deflate(&zs, Z_FINISH);
+ if (status != Z_STREAM_END) {
+ deflateEnd(&zs);
+ if (status == Z_OK) {
+ // Not enough space in buffer.
+ // Can happen in the rare case the input doesn't compress enough.
+ // Reduce the amount of input until it fits.
+ input_length -= 1024;
+ if (input_length <= 0) {
+ // should never happen
+ report_error(fp, "input reduction failed");
+ return -1;
+ }
+ continue;
+ }
+ report_error(fp, "deflate failed");
+ return -1;
+ }
+ status = deflateEnd(&zs);
+ if (status != Z_OK) {
+ report_error(fp, "deflate end failed");
+ return -1;
+ }
+ compressed_length = zs.total_out;
+ compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
+ if (compressed_length > MAX_BLOCK_SIZE) {
+ // should never happen
+ report_error(fp, "deflate overflow");
+ return -1;
+ }
+ break;
+ }
+
+ packInt16((uint8_t*)&buffer[16], compressed_length-1);
+ uint32_t crc = crc32(0L, NULL, 0L);
+ crc = crc32(crc, fp->uncompressed_block, input_length);
+ packInt32((uint8_t*)&buffer[compressed_length-8], crc);
+ packInt32((uint8_t*)&buffer[compressed_length-4], input_length);
+
+ int remaining = block_length - input_length;
+ if (remaining > 0) {
+ if (remaining > input_length) {
+ // should never happen (check so we can use memcpy)
+ report_error(fp, "remainder too large");
+ return -1;
+ }
+ memcpy(fp->uncompressed_block,
+ fp->uncompressed_block + input_length,
+ remaining);
+ }
+ fp->block_offset = remaining;
+ return compressed_length;
+}
+
+static
+int
+inflate_block(BGZF* fp, int block_length)
+{
+ // Inflate the block in fp->compressed_block into fp->uncompressed_block
+
+ z_stream zs;
+ zs.zalloc = NULL;
+ zs.zfree = NULL;
+ zs.next_in = fp->compressed_block + 18;
+ zs.avail_in = block_length - 16;
+ zs.next_out = fp->uncompressed_block;
+ zs.avail_out = fp->uncompressed_block_size;
+
+ int status = inflateInit2(&zs, GZIP_WINDOW_BITS);
+ if (status != Z_OK) {
+ report_error(fp, "inflate init failed");
+ return -1;
+ }
+ status = inflate(&zs, Z_FINISH);
+ if (status != Z_STREAM_END) {
+ inflateEnd(&zs);
+ report_error(fp, "inflate failed");
+ return -1;
+ }
+ status = inflateEnd(&zs);
+ if (status != Z_OK) {
+ report_error(fp, "inflate failed");
+ return -1;
+ }
+ return zs.total_out;
+}
+
+static
+int
+check_header(const byte* header)
+{
+ return (header[0] == GZIP_ID1 &&
+ header[1] == (byte) GZIP_ID2 &&
+ header[2] == Z_DEFLATED &&
+ (header[3] & FLG_FEXTRA) != 0 &&
+ unpackInt16((uint8_t*)&header[10]) == BGZF_XLEN &&
+ header[12] == BGZF_ID1 &&
+ header[13] == BGZF_ID2 &&
+ unpackInt16((uint8_t*)&header[14]) == BGZF_LEN);
+}
+
+static
+int
+read_block(BGZF* fp)
+{
+ byte header[BLOCK_HEADER_LENGTH];
+ int64_t block_address = ftello(fp->file);
+ int count = fread(header, 1, sizeof(header), fp->file);
+ if (count == 0) {
+ fp->block_length = 0;
+ return 0;
+ }
+ if (count != sizeof(header)) {
+ report_error(fp, "read failed");
+ return -1;
+ }
+ if (!check_header(header)) {
+ report_error(fp, "invalid block header");
+ return -1;
+ }
+ int block_length = unpackInt16((uint8_t*)&header[16]) + 1;
+ byte* compressed_block = (byte*) fp->compressed_block;
+ memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
+ int remaining = block_length - BLOCK_HEADER_LENGTH;
+ count = fread(&compressed_block[BLOCK_HEADER_LENGTH], 1, remaining, fp->file);
+ if (count != remaining) {
+ report_error(fp, "read failed");
+ return -1;
+ }
+ count = inflate_block(fp, block_length);
+ if (count < 0) {
+ return -1;
+ }
+ if (fp->block_length != 0) {
+ // Do not reset offset if this read follows a seek.
+ fp->block_offset = 0;
+ }
+ fp->block_address = block_address;
+ fp->block_length = count;
+ return 0;
+}
+
+int
+bgzf_read(BGZF* fp, void* data, int length)
+{
+ if (length <= 0) {
+ return 0;
+ }
+ if (fp->open_mode != 'r') {
+ report_error(fp, "file not open for reading");
+ return -1;
+ }
+
+ int bytes_read = 0;
+ byte* output = data;
+ while (bytes_read < length) {
+ int available = fp->block_length - fp->block_offset;
+ if (available <= 0) {
+ if (read_block(fp) != 0) {
+ return -1;
+ }
+ available = fp->block_length - fp->block_offset;
+ if (available <= 0) {
+ break;
+ }
+ }
+ int copy_length = min(length-bytes_read, available);
+ byte* buffer = fp->uncompressed_block;
+ memcpy(output, buffer + fp->block_offset, copy_length);
+ fp->block_offset += copy_length;
+ output += copy_length;
+ bytes_read += copy_length;
+ }
+ if (fp->block_offset == fp->block_length) {
+ fp->block_address = ftello(fp->file);
+ fp->block_offset = 0;
+ fp->block_length = 0;
+ }
+ return bytes_read;
+}
+
+static
+int
+flush_block(BGZF* fp)
+{
+ while (fp->block_offset > 0) {
+ int block_length = deflate_block(fp, fp->block_offset);
+ if (block_length < 0) {
+ return -1;
+ }
+ int count = fwrite(fp->compressed_block, 1, block_length, fp->file);
+ if (count != block_length) {
+ report_error(fp, "write failed");
+ return -1;
+ }
+ fp->block_address += block_length;
+ }
+ return 0;
+}
+
+int
+bgzf_write(BGZF* fp, const void* data, int length)
+{
+ if (fp->open_mode != 'w') {
+ report_error(fp, "file not open for writing");
+ return -1;
+ }
+
+ if (fp->uncompressed_block == NULL) {
+ fp->uncompressed_block = malloc(fp->uncompressed_block_size);
+ }
+
+ const byte* input = data;
+ int block_length = fp->uncompressed_block_size;
+ int bytes_written = 0;
+ while (bytes_written < length) {
+ int copy_length = min(block_length - fp->block_offset, length - bytes_written);
+ byte* buffer = fp->uncompressed_block;
+ memcpy(buffer + fp->block_offset, input, copy_length);
+ fp->block_offset += copy_length;
+ input += copy_length;
+ bytes_written += copy_length;
+ if (fp->block_offset == block_length) {
+ if (flush_block(fp) != 0) {
+ break;
+ }
+ }
+ }
+ return bytes_written;
+}
+
+int
+bgzf_close(BGZF* fp)
+{
+ if (fp->open_mode == 'w') {
+ if (flush_block(fp) != 0) {
+ return -1;
+ }
+ if (fflush(fp->file) != 0) {
+ report_error(fp, "flush failed");
+ return -1;
+ }
+ }
+ if (fp->owned_file) {
+ if (fclose(fp->file) != 0) {
+ return -1;
+ }
+ }
+ free(fp->uncompressed_block);
+ free(fp->compressed_block);
+ free(fp);
+ return 0;
+}
+
+int64_t
+bgzf_tell(BGZF* fp)
+{
+ return ((fp->block_address << 16) | (fp->block_offset & 0xFFFF));
+}
+
+int64_t
+bgzf_seek(BGZF* fp, int64_t pos, int where)
+{
+ if (fp->open_mode != 'r') {
+ report_error(fp, "file not open for read");
+ return -1;
+ }
+ if (where != SEEK_SET) {
+ report_error(fp, "unimplemented seek option");
+ return -1;
+ }
+ int block_offset = pos & 0xFFFF;
+ int64_t block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL;
+ if (fseeko(fp->file, block_address, SEEK_SET) != 0) {
+ report_error(fp, "seek failed");
+ return -1;
+ }
+ fp->block_length = 0; // indicates current block is not loaded
+ fp->block_address = block_address;
+ fp->block_offset = block_offset;
+ return 0;
+}
+
--- /dev/null
+/*
+ * The Broad Institute
+ * SOFTWARE COPYRIGHT NOTICE AGREEMENT
+ * This software and its documentation are copyright 2008 by the
+ * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
+ *
+ * This software is supplied without any warranty or guaranteed support whatsoever.
+ * Neither the Broad Institute nor MIT can be responsible for its use, misuse,
+ * or functionality.
+ */
+
+#ifndef __BGZF_H
+#define __BGZF_H
+
+#include <stdint.h>
+#include <stdio.h>
+#include "zlib.h"
+#include <stdbool.h>
+//#include "zutil.h"
+
+//typedef int8_t bool;
+
+typedef struct {
+ int file_descriptor;
+ char open_mode; // 'r' or 'w'
+ bool owned_file;
+ FILE* file;
+ int uncompressed_block_size;
+ int compressed_block_size;
+ void* uncompressed_block;
+ void* compressed_block;
+ int64_t block_address;
+ int block_length;
+ int block_offset;
+ const char* error;
+} BGZF;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Open an existing file descriptor for reading or writing.
+ * Mode must be either "r" or "w".
+ * A subsequent bgzf_close will not close the file descriptor.
+ * Returns null on error.
+ */
+BGZF* bgzf_fdopen(int fd, const char* __restrict mode);
+
+/*
+ * Open the specified file for reading or writing.
+ * Mode must be either "r" or "w".
+ * Returns null on error.
+ */
+BGZF* bgzf_open(const char* path, const char* __restrict mode);
+
+/*
+ * Close the BGZ file and free all associated resources.
+ * Does not close the underlying file descriptor if created with bgzf_fdopen.
+ * Returns zero on success, -1 on error.
+ */
+int bgzf_close(BGZF* fp);
+
+/*
+ * Read up to length bytes from the file storing into data.
+ * Returns the number of bytes actually read.
+ * Returns zero on end of file.
+ * Returns -1 on error.
+ */
+int bgzf_read(BGZF* fp, void* data, int length);
+
+/*
+ * Write length bytes from data to the file.
+ * Returns the number of bytes written.
+ * Returns -1 on error.
+ */
+int bgzf_write(BGZF* fp, const void* data, int length);
+
+/*
+ * Return a virtual file pointer to the current location in the file.
+ * No interpetation of the value should be made, other than a subsequent
+ * call to bgzf_seek can be used to position the file at the same point.
+ * Return value is non-negative on success.
+ * Returns -1 on error.
+ */
+int64_t bgzf_tell(BGZF* fp);
+
+/*
+ * Set the file to read from the location specified by pos, which must
+ * be a value previously returned by bgzf_tell for this file (but not
+ * necessarily one returned by this file handle).
+ * The where argument must be SEEK_SET.
+ * Seeking on a file opened for write is not supported.
+ * Returns zero on success, -1 on error.
+ */
+int64_t bgzf_seek(BGZF* fp, int64_t pos, int where);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif