From: mikaels Date: Fri, 10 Apr 2009 15:07:49 +0000 (+0000) Subject: Initial import. X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=3ba4b5eb3f88fd8f24af8bf83ba218b84faa87ba;p=bamtools.git Initial import. git-svn-id: svn+ssh://gene.bc.edu/home/subversion/Derek/BamTools/trunk@2 9efb377e-2e27-44b9-b91a-ec4abb80ed8b --- diff --git a/BamAlignment.h b/BamAlignment.h new file mode 100644 index 0000000..b299ab3 --- /dev/null +++ b/BamAlignment.h @@ -0,0 +1,92 @@ +// BamAlignment.h + +// Derek Barnett +// Marth Lab, Boston College +// Last modified: 20 March 2009 + +#ifndef BAMALIGNMENT_H +#define BAMALIGNMENT_H + +#include + +// C++ includes +#include +using std::string; + +#include +using std::vector; + +struct CigarOp { + uint32_t Length; + char Type; +}; + +struct RefData { + string RefName; + unsigned int RefLength; + bool RefHasAlignments; + + // constructor + RefData(void) + : RefLength(0) + , RefHasAlignments(false) + { } +}; + +typedef vector RefVector; + +struct BamAlignment { + + // queries against alignment flag - see below for further detail + public: + bool IsPaired(void) const { return ( (AlignmentFlag & PAIRED) != 0 ); } + bool IsProperPair(void) const { return ( (AlignmentFlag & PROPER_PAIR) != 0 ); } + bool IsMapped(void) const { return ( (AlignmentFlag & UNMAPPED) == 0 ); } + bool IsMateMapped(void) const { return ( (AlignmentFlag & MATE_UNMAPPED) == 0 ); } + bool IsReverseStrand(void) const { return ( (AlignmentFlag & REVERSE) != 0 ); } + bool IsMateReverseStrand(void) const { return ( (AlignmentFlag & MATE_REVERSE) != 0 ); } + bool IsFirstMate(void) const { return ( (AlignmentFlag & READ_1) != 0 ); } + bool IsSecondMate(void) const { return ( (AlignmentFlag & READ_2) != 0 ); } + bool IsPrimaryAlignment(void) const { return ( (AlignmentFlag & SECONDARY) == 0 ); } + bool IsFailedQC(void) const { return ( (AlignmentFlag & QC_FAILED) != 0 ); } + bool IsDuplicate(void) const { return ( (AlignmentFlag & DUPLICATE) != 0 ); } + + // data members + public: + string Name; // read name + unsigned int Length; // query length + string QueryBases; // original sequence ( produced from machine ) + string AlignedBases; // aligned sequence ( with indels ) + string Qualities; // FASTQ qualities ( still in ASCII characters ) + vector Tags; + unsigned int RefID; // ID for reference sequence + unsigned int Position; // position on reference sequence where alignment starts + unsigned int Bin; // bin in BAM file where this alignment resides + unsigned int MapQuality; // mapping quality + unsigned int AlignmentFlag; // see above for available queries + vector CigarData; // vector of CIGAR operations (length & type) ) + unsigned int MateRefID; // ID for reference sequence that mate was aligned to + unsigned int MatePosition; // position that mate was aligned to + unsigned int InsertSize; // mate pair insert size + + + // alignment flag query constants + private: + enum { PAIRED = 1, // Alignment comes from paired-end data + PROPER_PAIR = 2, // Alignment passed paired-end resolution + UNMAPPED = 4, // Read is unmapped + MATE_UNMAPPED = 8, // Mate is unmapped + REVERSE = 16, // Read is on reverse strand + MATE_REVERSE = 32, // Mate is on reverse strand + READ_1 = 64, // This alignment is mate 1 of pair + READ_2 = 128, // This alignment is mate 2 of pair + SECONDARY = 256, // This alignment is not the primary (best) alignment for read + QC_FAILED = 512, // Read did not pass prior quality control steps + DUPLICATE = 1024 // Read is PCR duplicate + }; +}; + +// commonly used vector in this library +typedef vector< BamAlignment > BamAlignmentVector; + +#endif /* BAMALIGNMENT_H */ diff --git a/BamConversionMain.cpp b/BamConversionMain.cpp new file mode 100644 index 0000000..457e58e --- /dev/null +++ b/BamConversionMain.cpp @@ -0,0 +1,46 @@ +#include +#include "BamReader.h" +#include "BamWriter.h" + +using namespace std; + +int main(int argc, char* argv[]) { + + if(argc != 3) { + cout << "USAGE: " << argv[0] << " " << endl; + exit(1); + } + + // localize our arguments + const char* inputFilename = argv[1]; + const char* outputFilename = argv[2]; + + // open our BAM reader + BamReader reader; + reader.SetFilename(inputFilename); + + if(!reader.Open()) { + cout << "ERROR: Unable to open the BAM file (" << inputFilename << ")." << endl; + exit(1); + } + + // retrieve the SAM header text + string samHeader = reader.GetHeaderText(); + + // retrieve the reference sequence vector + RefVector referenceSequences = reader.GetReferenceData(); + + // open the BAM writer + BamWriter writer; + writer.Open(outputFilename, samHeader, referenceSequences); + + // copy all of the reads from the input file to the output file + BamAlignment al; + while(reader.GetNextAlignment(al)) writer.SaveAlignment(al); + + // close our files + reader.Close(); + writer.Close(); + + return 0; +} \ No newline at end of file diff --git a/BamReader.cpp b/BamReader.cpp new file mode 100644 index 0000000..8fba1a9 --- /dev/null +++ b/BamReader.cpp @@ -0,0 +1,653 @@ +// BamReader.cpp + +// Derek Barnett +// Marth Lab, Boston College +// Last modified: 6 April 2009 + +#include "BamReader.h" +#include +using std::cerr; +using std::endl; + +// static character constants +const char* BamReader::DNA_LOOKUP = "=ACMGRSVTWYHKDBN"; +const char* BamReader::CIGAR_LOOKUP = "MIDNSHP"; + +BamReader::BamReader(const char* filename, const char* indexFilename) + : m_filename( (char*)filename ) + , m_indexFilename( (char*)indexFilename ) + , m_file(0) + , m_index(NULL) + , m_headerText("") + , m_isOpen(false) + , m_isIndexLoaded(false) + , m_isRegionSpecified(false) + , m_isCalculateAlignedBases(true) + , m_currentRefID(0) + , m_currentLeft(0) + , m_alignmentsBeginOffset(0) +{ + Open(); +} + +BamReader::~BamReader(void) { + + // close file + if ( m_isOpen ) { Close(); } +} + +// open BAM file +bool BamReader::Open(void) { + + if (!m_isOpen && m_filename != NULL ) { + + // open file + m_file = bam_open(m_filename, "r"); + + // get header info && index info + if ( (m_file != NULL) && LoadHeader() ) { + + // save file offset where alignments start + m_alignmentsBeginOffset = bam_tell(m_file); + + // set open flag + m_isOpen = true; + } + + // try to open (and load) index data, if index file given + if ( m_indexFilename != NULL ) { + OpenIndex(); + } + } + + return m_isOpen; +} + +bool BamReader::OpenIndex(void) { + + if ( m_indexFilename && !m_isIndexLoaded ) { + m_isIndexLoaded = LoadIndex(); + } + return m_isIndexLoaded; +} + +// close BAM file +bool BamReader::Close(void) { + + if (m_isOpen) { + + // close file + int ret = bam_close(m_file); + + // delete index info + if ( m_index != NULL) { delete m_index; } + + // clear open flag + m_isOpen = false; + + // clear index flag + m_isIndexLoaded = false; + + // clear region flag + m_isRegionSpecified = false; + + // return success/fail of bam_close + return (ret == 0); + } + + return true; +} + +// get BAM filename +const char* BamReader::Filename(void) const { + return (const char*)m_filename; +} + +// set BAM filename +void BamReader::SetFilename(const char* filename) { + m_filename = (char*)filename; +} + +// get BAM Index filename +const char* BamReader::IndexFilename(void) const { + return (const char*)m_indexFilename; +} + +// set BAM Index filename +void BamReader::SetIndexFilename(const char* indexFilename) { + m_indexFilename = (char*)indexFilename; +} + +// return full header text +const string BamReader::GetHeaderText(void) const { + return m_headerText; +} + +// return number of reference sequences in BAM file +const int BamReader::GetReferenceCount(void) const { + return m_references.size(); +} + +// get RefID from reference name +const int BamReader::GetRefID(string refName) const { + + vector refNames; + RefVector::const_iterator refIter = m_references.begin(); + RefVector::const_iterator refEnd = m_references.end(); + for ( ; refIter != refEnd; ++refIter) { + refNames.push_back( (*refIter).RefName ); + } + + // return 'index-of' refName (if not found, returns refNames.size()) + return Index( refNames.begin(), refNames.end(), refName ); +} + +const RefVector BamReader::GetReferenceData(void) const { + return m_references; +} + +bool BamReader::Jump(int refID, unsigned int left) { + + // if index available, and region is valid + if ( m_isIndexLoaded && m_references.at(refID).RefHasAlignments && (left <= m_references.at(refID).RefLength) ) { + m_currentRefID = refID; + m_currentLeft = left; + m_isRegionSpecified = true; + return ( bam_seek(m_file, GetOffset(m_currentRefID, m_currentLeft), SEEK_SET) == 0 ); + } + return false; +} + +bool BamReader::Rewind(void) { + + int refID = 0; + int refCount = m_references.size(); + for ( ; refID < refCount; ++refID ) { + if ( m_references.at(refID).RefHasAlignments ) { break; } + } + + m_currentRefID = refID; + m_currentLeft = 0; + m_isRegionSpecified = false; + + return ( bam_seek(m_file, m_alignmentsBeginOffset, SEEK_SET) == 0 ); +} + +// get next alignment from specified region +bool BamReader::GetNextAlignment(BamAlignment& bAlignment) { + + // try to load 'next' read + if ( LoadNextAlignment(bAlignment) ) { + + // if specified region, check for region overlap + if ( m_isRegionSpecified ) { + + // if overlap, return true + if ( IsOverlap(bAlignment) ) { return true; } + // if not, try the next alignment + else { return GetNextAlignment(bAlignment); } + } + + // not using region, valid read detected, return success + else { return true; } + } + + // no valid alignment to load + return false; +} + +void BamReader::SetCalculateAlignedBases(bool flag) { + m_isCalculateAlignedBases = flag; +} + +int BamReader::BinsFromRegion(int refID, unsigned int left, uint16_t list[MAX_BIN]) { + + // get region boundaries + uint32_t begin = left; + uint32_t end = m_references.at(refID).RefLength - 1; + + // initialize list, bin '0' always a valid bin + int i = 0; + list[i++] = 0; + + // get rest of bins that contain this region + unsigned int k; + for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { list[i++] = k; } + for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { list[i++] = k; } + for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { list[i++] = k; } + for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { list[i++] = k; } + for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { list[i++] = k; } + + // return number of bins stored + return i; +} + +uint32_t BamReader::CalculateAlignmentEnd(const unsigned int& position, const vector& cigarData) { + + // initialize alignment end to starting position + uint32_t alignEnd = position; + + // iterate over cigar operations + vector::const_iterator cigarIter = cigarData.begin(); + vector::const_iterator cigarEnd = cigarData.end(); + for ( ; cigarIter != cigarEnd; ++cigarIter) { + if ( (*cigarIter).Type == 'M' || (*cigarIter).Type == 'D' || (*cigarIter).Type == 'N') { + alignEnd += (*cigarIter).Length; + } + } + return alignEnd; +} + +uint64_t BamReader::GetOffset(int refID, unsigned int left) { + + // make space for bins + uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2); + + // returns number of bins overlapping (left, right) + // stores indices of those bins in 'bins' + int numBins = BinsFromRegion(refID, left, bins); + + // access index data for refID + RefIndex* refIndex = m_index->at(refID); + + // get list of BAM bins for this reference sequence + BinVector* refBins = refIndex->first; + + sort( refBins->begin(), refBins->end(), LookupKeyCompare() ); + + // declare ChunkVector + ChunkVector regionChunks; + + // declaure LinearOffsetVector + LinearOffsetVector* linearOffsets = refIndex->second; + + // some sort of linear offset vs bin index voodoo... not completely sure what's going here + uint64_t minOffset = ((left>>BAM_LIDX_SHIFT) >= linearOffsets->size()) ? 0 : linearOffsets->at(left>>BAM_LIDX_SHIFT); + + BinVector::iterator binBegin = refBins->begin(); + BinVector::iterator binEnd = refBins->end(); + + // iterate over bins overlapping region, count chunks + for (int i = 0; i < numBins; ++i) { + + // look for bin with ID=bin[i] + BinVector::iterator binIter = binBegin; + + for ( ; binIter != binEnd; ++binIter ) { + + // if found, increment n_off by number of chunks for each bin + if ( (*binIter).first == (uint32_t)bins[i] ) { + + // iterate over chunks in that bin + ChunkVector* binChunks = (*binIter).second; + + ChunkVector::iterator chunkIter = binChunks->begin(); + ChunkVector::iterator chunkEnd = binChunks->end(); + for ( ; chunkIter != chunkEnd; ++chunkIter) { + + // if right bound of pair is greater than min_off (linear offset value), store pair + if ( (*chunkIter).second > minOffset) { + regionChunks.push_back( (*chunkIter) ); + } + } + } + } + } + + // clean up temp array + free(bins); + + // there should be at least 1 + assert(regionChunks.size() > 0); + + // sort chunks by start position + sort ( regionChunks.begin(), regionChunks.end(), LookupKeyCompare() ); + + // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing + int numOffsets = regionChunks.size(); + for (int i = 1; i < numOffsets; ++i) { + if ( regionChunks.at(i-1).second >= regionChunks.at(i).first ) { + regionChunks.at(i-1).second = regionChunks.at(i).first; + } + } + + // merge adjacent chunks + int l = 0; + for (int i = 1; i < numOffsets; ++i) { + // if adjacent, expand boundaries of (merged) chunk + if ( (regionChunks.at(l).second>>16) == (regionChunks.at(i).first>>16) ) { + regionChunks.at(l).second = regionChunks.at(i).second; + } + // else, move on to next (merged) chunk index + else { regionChunks.at(++l) = regionChunks.at(i); } + } + + // return beginning file offset of first chunk for region + return regionChunks.at(0).first; +} + +bool BamReader::IsOverlap(BamAlignment& bAlignment) { + + // if on different reference sequence, quit + if ( bAlignment.RefID != (unsigned int)m_currentRefID ) { return false; } + + // read starts after left boundary + if ( bAlignment.Position >= m_currentLeft) { return true; } + + // get alignment end + uint32_t alignEnd = CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData); + + // return whether alignment end overlaps left boundary + return ( alignEnd >= m_currentLeft ); +} + +bool BamReader::LoadHeader(void) { + + // check to see if proper BAM header + char buf[4]; + if (bam_read(m_file, buf, 4) != 4) { return false; } + if (strncmp(buf, "BAM\001", 4)) { + fprintf(stderr, "wrong header type!\n"); + return false; + } + + // get BAM header text length + int32_t headerTextLength; + bam_read(m_file, &headerTextLength, 4); + + // get BAM header text + char* headerText = (char*)calloc(headerTextLength + 1, 1); + bam_read(m_file, headerText, headerTextLength); + m_headerText = (string)((const char*)headerText); + + // clean up calloc-ed temp variable + free(headerText); + + // get number of reference sequences + int32_t numberRefSeqs; + bam_read(m_file, &numberRefSeqs, 4); + if (numberRefSeqs == 0) { return false; } + + m_references.reserve((int)numberRefSeqs); + + // reference variables + int32_t refNameLength; + char* refName; + uint32_t refLength; + + // iterate over all references in header + for (int i = 0; i != numberRefSeqs; ++i) { + + // get length of reference name + bam_read(m_file, &refNameLength, 4); + refName = (char*)calloc(refNameLength, 1); + + // get reference name and reference sequence length + bam_read(m_file, refName, refNameLength); + bam_read(m_file, &refLength, 4); + + // store data for reference + RefData aReference; + aReference.RefName = (string)((const char*)refName); + aReference.RefLength = refLength; + m_references.push_back(aReference); + + // clean up calloc-ed temp variable + free(refName); + } + + return true; +} + +bool BamReader::LoadIndex(void) { + + // check to see if index file exists + FILE* indexFile; + if ( ( indexFile = fopen(m_indexFilename, "r") ) == 0 ) { + fprintf(stderr, "The alignment is not indexed. Please run SAMtools \'index\' command first.\n"); + return false; + } + + // see if index is valid BAM index + char magic[4]; + fread(magic, 1, 4, indexFile); + if (strncmp(magic, "BAI\1", 4)) { + fprintf(stderr, "Problem with index - wrong \'magic\' number.\n"); + fclose(indexFile); + return false; + } + + // get number of reference sequences + uint32_t numRefSeqs; + fread(&numRefSeqs, 4, 1, indexFile); + + // intialize BamIndex data structure + m_index = new BamIndex; + m_index->reserve(numRefSeqs); + + // iterate over reference sequences + for (unsigned int i = 0; i < numRefSeqs; ++i) { + + // get number of bins for this reference sequence + int32_t numBins; + fread(&numBins, 4, 1, indexFile); + + if (numBins > 0) { m_references.at(i).RefHasAlignments = true; } + + // intialize BinVector + BinVector* bins = new BinVector; + bins->reserve(numBins); + + // iterate over bins for that reference sequence + for (int j = 0; j < numBins; ++j) { + + // get binID + uint32_t binID; + fread(&binID, 4, 1, indexFile); + + // get number of regionChunks in this bin + uint32_t numChunks; + fread(&numChunks, 4, 1, indexFile); + + // intialize ChunkVector + ChunkVector* regionChunks = new ChunkVector; + regionChunks->reserve(numChunks); + + // iterate over regionChunks in this bin + for (unsigned int k = 0; k < numChunks; ++k) { + + // get chunk boundaries (left, right) + uint64_t left; + uint64_t right; + fread(&left, 8, 1, indexFile); + fread(&right, 8, 1, indexFile); + + // save ChunkPair + regionChunks->push_back( ChunkPair(left, right) ); + } + + // save binID, chunkVector for this bin + bins->push_back( BamBin(binID, regionChunks) ); + } + + // load linear index for this reference sequence + + // get number of linear offsets + int32_t numLinearOffsets; + fread(&numLinearOffsets, 4, 1, indexFile); + + // intialize LinearOffsetVector + LinearOffsetVector* linearOffsets = new LinearOffsetVector; + linearOffsets->reserve(numLinearOffsets); + + // iterate over linear offsets for this reference sequeence + for (int j = 0; j < numLinearOffsets; ++j) { + // get a linear offset + uint64_t linearOffset; + fread(&linearOffset, 8, 1, indexFile); + // store linear offset + linearOffsets->push_back(linearOffset); + } + + // store index data for that reference sequence + m_index->push_back( new RefIndex(bins, linearOffsets) ); + } + + // close index file (.bai) and return + fclose(indexFile); + return true; +} + +bool BamReader::LoadNextAlignment(BamAlignment& bAlignment) { + + // check valid alignment block header data + int32_t block_len; + int32_t ret; + uint32_t x[8]; + + int32_t bytesRead = 0; + + // read in the 'block length' value, make sure it's not zero + if ( (ret = bam_read(m_file, &block_len, 4)) == 0 ) { return false; } + bytesRead += 4; + + // read in core alignment data, make the right size of data was read + if ( bam_read(m_file, x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; } + bytesRead += BAM_CORE_SIZE; + + // set BamAlignment 'core' data + bAlignment.RefID = x[0]; + bAlignment.Position = x[1]; + bAlignment.Bin = x[2]>>16; + bAlignment.MapQuality = x[2]>>8&0xff; + bAlignment.AlignmentFlag = x[3]>>16; + bAlignment.MateRefID = x[5]; + bAlignment.MatePosition = x[6]; + bAlignment.InsertSize = x[7]; + + // fetch & store often-used lengths for character data parsing + unsigned int queryNameLength = x[2]&0xff; + unsigned int numCigarOperations = x[3]&0xffff; + unsigned int querySequenceLength = x[4]; + + // get length of character data + int dataLength = block_len - BAM_CORE_SIZE; + + // set up destination buffers for character data + uint8_t* allCharData = (uint8_t*)calloc(sizeof(uint8_t), dataLength); + uint32_t* cigarData = (uint32_t*)(allCharData+queryNameLength); + + // get character data - make sure proper data size was read + if (bam_read(m_file, allCharData, dataLength) != dataLength) { return false; } + else { + + bytesRead += dataLength; + + // clear out bases, qualities, aligned bases, and CIGAR + bAlignment.QueryBases.clear(); + bAlignment.Qualities.clear(); + bAlignment.AlignedBases.clear(); + bAlignment.CigarData.clear(); + + // save name + bAlignment.Name = (string)((const char*)(allCharData)); + + // save bases + char singleBase[2]; + uint8_t* s = ( allCharData + (numCigarOperations*4) + queryNameLength); + for (unsigned int i = 0; i < querySequenceLength; ++i) { + // numeric to char conversion + sprintf( singleBase, "%c", DNA_LOOKUP[ ( ( s[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ] ); + // append character data to Bases + bAlignment.QueryBases.append( (const char*)singleBase ); + } + + // save sequence length + bAlignment.Length = bAlignment.QueryBases.length(); + + // save qualities + char singleQuality[4]; + uint8_t* t = ( allCharData + (numCigarOperations*4) + queryNameLength + (querySequenceLength + 1)/2); + for (unsigned int i = 0; i < querySequenceLength; ++i) { + // numeric to char conversion + sprintf( singleQuality, "%c", ( t[i]+33 ) ); + // append character data to Qualities + bAlignment.Qualities.append( (const char*)singleQuality ); + } + + // save CIGAR-related data; + int k = 0; + for (unsigned int i = 0; i < numCigarOperations; ++i) { + + // build CigarOp struct + CigarOp op; + op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT); + op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ]; + + // save CigarOp + bAlignment.CigarData.push_back(op); + + // can skip this step if user wants to ignore + if (m_isCalculateAlignedBases) { + + // build AlignedBases string + switch (op.Type) { + + case ('M') : + case ('I') : bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) ); // for 'M', 'I' - write bases + case ('S') : k += op.Length; // for 'S' - skip over query bases + break; + + case ('D') : + case ('P') : bAlignment.AlignedBases.append( op.Length, '*' ); // for 'D', 'P' - write padding; + break; + + case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in query sequence + k += op.Length; + break; + + case ('H') : break; // for 'H' - do nothing, move to next op + + default : assert(false); // shouldn't get here + break; + } + } + } + } + free(allCharData); + +/* + // (optional) read tag parsing data + string tag; + char data; + int i = 0; + + // still data to read (auxiliary tags) + while ( bytesRead < block_len ) { + + if ( bam_read(m_file, &data, 1) == 1 ) { + + ++bytesRead; + + if (bytesRead == block_len && data != '\0') { + fprintf(stderr, "ERROR: Invalid termination of tag info at end of alignment block.\n"); + exit(1); + } + + tag.append(1, data); + if ( data == '\0' ) { + bAlignment.Tags.push_back(tag); + tag = ""; + i = 0; + } else { + if ( (i == 1) && (i == 2) ) { tag.append(1, ':'); } + ++i; + } + } else { + fprintf(stderr, "ERROR: Could not read tag info.\n"); + exit(1); + } + } +*/ + return true; +} diff --git a/BamReader.h b/BamReader.h new file mode 100644 index 0000000..786f4c2 --- /dev/null +++ b/BamReader.h @@ -0,0 +1,230 @@ +// BamReader.h + +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* + Implementation of BAM-parsing was translated to C++ directly from Heng Li's SAMtools package + (thus the carryover of above MIT license) + Contact: Derek Barnett +*/ + +// Derek Barnett +// Marth Lab, Boston College +// Last modified: 6 April 2009 + +#ifndef BAMREADER_H +#define BAMREADER_H + +// custom includes +#include "BamAlignment.h" +#include "STLUtilities.h" + +// C library includes +#include +#include +#include +#include +#include + +// BGZF library includes/defines +#include "bgzf.h" +typedef BGZF* BamFile; +#define bam_open(f_name, mode) bgzf_open(f_name, mode) +#define bam_close(f_ptr) bgzf_close(f_ptr) +#define bam_read(f_ptr, buf, size) bgzf_read(f_ptr, buf, size) +#define bam_write(f_ptr, buf, size) bgzf_write(f_ptr, buf, size) +#define bam_tell(f_ptr) bgzf_tell(f_ptr) +#define bam_seek(f_ptr, pos, dir) bgzf_seek(f_ptr, pos, dir) + +// size of alignment data block in BAM file (bytes) +#define BAM_CORE_SIZE 32 + +// BAM indexing constants +#define MAX_BIN 37450 // =(8^6-1)/7+1 +#define BAM_MIN_CHUNK_GAP 32768 +#define BAM_LIDX_SHIFT 14 + +// CIGAR-retrieval mask/shift constants +#define BAM_CIGAR_SHIFT 4 +#define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1) + +// CIGAR-operation types +#define BAM_CMATCH 0 +#define BAM_CINS 1 +#define BAM_CDEL 2 +#define BAM_CREF_SKIP 3 +#define BAM_CSOFT_CLIP 4 +#define BAM_CHARD_CLIP 5 +#define BAM_CPAD 6 + +// --------------------------- // +// Bam header info +// --------------------------- // + +// --------------------------- // +// BamIndex-related typedefs +// --------------------------- // + +// offset for linear indexing +typedef vector LinearOffsetVector; + +// chunk boundaries +typedef pair ChunkPair; +// list of chunks in a BAM bin +typedef vector ChunkVector; + +// BAM bins for a reference sequence +// replaces khash - uint32_t is key, ChunkVector is value +typedef pair BamBin; +typedef vector BinVector; + +// each reference sequence has a BinVector and LinearOffsetVector +typedef pair RefIndex; + +// full BamIndex defined as: +typedef vector BamIndex; + +// ---------------------------------------------------------------------------// + +class BamReader { + + public: + // constructors + BamReader(const char* fileName = NULL, const char* indexFilename = NULL); + + public: + // destructor + ~BamReader(void); + + // BAM interface methods + public: + + // ----------------------- // + // File manipulation + // ----------------------- // + + // open BAM file (automatically opens index if provided) + bool Open(void); + + // open BAM index (allows index to be opened separately - i.e. sometime after the BAM file is opened) + bool OpenIndex(void); + + // close BAM file + bool Close(void); + + // get BAM filename + const char* Filename(void) const; + + // set BAM filename + void SetFilename(const char*); + + // get BAM Index filename + const char* IndexFilename(void) const; + + // set BAM Index filename + void SetIndexFilename(const char*); + + // ----------------------- // + // Access BAM header + // ----------------------- // + + // return full header text + const string GetHeaderText(void) const; + + // --------------------------------- // + // Access reference sequence info + // --------------------------------- // + + // return number of reference sequences in BAM file + const int GetReferenceCount(void) const; + + // return vector of RefData entries + const RefVector GetReferenceData(void) const; + + // get refID from reference name + const int GetRefID(string refName) const; + + // ----------------------------------------- // + // File position moving + // ----------------------------------------- // + + // jumps to 'left' position on refID + // actually jumps before position, so reads that overlap 'left' are included as well + // 'left' defaults to reference begin if not specified + bool Jump(int refID, unsigned int left = 0); + + // Jump to beginning of BAM file, clears any region previously set by Jump() + bool Rewind(void); + + // ------------------------------ // + // Access alignments + // ------------------------------ // + + // get next alignment + bool GetNextAlignment(BamAlignment& read); + + // allow user to specifiy whether 'AlignedBases' string is calculated when alignment is loaded + void SetCalculateAlignedBases(bool); + + // internal utility methods + private: + int BinsFromRegion(int, unsigned int, uint16_t[MAX_BIN]); + uint32_t CalculateAlignmentEnd(const unsigned int&, const vector&); + uint64_t GetOffset(int, unsigned int); + bool IsOverlap(BamAlignment&); + bool LoadHeader(void); + bool LoadIndex(void); + bool LoadNextAlignment(BamAlignment&); + + private: + // main BAM reader components + char* m_filename; + char* m_indexFilename; + BamFile m_file; + BamIndex* m_index; + RefVector m_references; + string m_headerText; + + // state flags + bool m_isOpen; // BAM file is open for processing + bool m_isIndexLoaded; // BAM Index data is loaded and available for processing + bool m_isRegionSpecified; // a region has been specified - specifically, a user has called Jump() + bool m_isCalculateAlignedBases; // build 'AlignedBases' string when getting an alignment, otherwise skip (default = true) + + // region values + int m_currentRefID; + unsigned int m_currentLeft; + + // file offset of 1st read in BAM file + int64_t m_alignmentsBeginOffset; + + private: + // BAM character constants + static const char* DNA_LOOKUP; + static const char* CIGAR_LOOKUP; +}; + +#endif /* BAMREADER_H */ diff --git a/BamReaderMain.cpp b/BamReaderMain.cpp new file mode 100644 index 0000000..bdf4e60 --- /dev/null +++ b/BamReaderMain.cpp @@ -0,0 +1,255 @@ +// BamReaderMain.cpp + +// Derek Barnett +// Marth Lab, Boston College +// Last modified: 6 April 2009 + +#include "BamReader.h" + +// C++ includes +#include +using std::cerr; +using std::cout; +using std::endl; + +#include +using std::vector; + +#include +using std::string; + +int main(int argc, char* argv[]) { + + // check command line parameters + if (argc != 3) { + cerr << endl; + cerr << "Usage: BamReaderTest " << endl; + cerr << endl; + exit(1); + } + + // get filenames from command line + const char* bamFilename = argv[1]; + const char* bamIndexFilename = argv[2]; + + string refName; + int refID; + unsigned int leftBound; + unsigned int rightBound; + + int alignmentCount; + + vector refNames; + + BamAlignment bAlignment; + BamAlignmentVector alignments; + + RefVector references; + + // ------------------------------------------------------------------------------------------------------ // + // Declare BamReader & open BAM file - automatically loads up header and index file (.bai) information + // ------------------------------------------------------------------------------------------------------ // + + BamReader bReader(bamFilename, bamIndexFilename); + + cerr << endl; + cerr << "Opening BamReader for BAM file: " << bamFilename << " ..... "; + + if ( bReader.Open() ) { + cerr << "ok" << endl; + } + else { + cerr << "error" << endl; + exit(1); + } + + // ------------------------------------------------------------ // + // Find out how many reference sequences are in BAM file + // ------------------------------------------------------------ // + + references = bReader.GetReferenceData(); + + cerr << endl; + cerr << "Total number of ref seqs: " << references.size() << endl; + + // ---------------------------------------------------------------------------- // + // Get the names/lengths of all the reference sequences that have alignments + // ---------------------------------------------------------------------------- // + + cerr << endl; + cerr << "All ref sequences with alignments:" << endl; + cerr << endl; + + + if ( !references.empty() ) { + + RefVector::iterator refIter = references.begin(); + RefVector::iterator refEnd = references.end(); + + refID = 0; + // iterate over reference names, print to STDERR + for( ; refIter != refEnd; ++refIter) { + + if ( (*refIter).RefHasAlignments ) { + cerr << "ID: " << refID << endl; + cerr << "Name: " << (*refIter).RefName << endl; + cerr << "Length: " << (*refIter).RefLength << endl; + cerr << endl; + } + + ++refID; + } + } + + // --------------------------------------------------------------------------------------------- // + // Get the SAM-style header text, if available (not available in the example file shown here) + // --------------------------------------------------------------------------------------------- // + + cerr << "------------------------------------" << endl; + cerr << "SAM header text: " << endl; + + // get (SAM) header text + string header = bReader.GetHeaderText(); + + cerr << ( (header.empty()) ? "no header data" : header ) << endl; + cerr << "------------------------------------" << endl; + + // --------------------------------------------------------------------------------------------- // + // Here we start accessing alignments + // The first method shows how to iterate over all reads in a BamFile + // This method will work on any BAM file (sorted/non-sorted, with/without an index) + // --------------------------------------------------------------------------------------------- // + + // Call Rewind() to make sure you're at the 1st alignment in the file + // Please note, however, it's not necessary in this case, since file pointer initially set to 1st alignment + // but this is probably a good habit to develop to ensure correctness and make your intent obvious in the code + + cerr << "Getting (up to) first 1000 alignments" << endl; + + // start at 1st alignment + if ( bReader.Rewind() ) { + + alignmentCount = 0; + while ( bReader.GetNextAlignment(bAlignment) && (alignmentCount < 1000) ) { + + // disregard unmapped alignments + if ( bAlignment.IsMapped() ) { + + ++alignmentCount; + + cout << "----------------------------" << endl; + cout << "Alignment " << alignmentCount << endl; + cout << bAlignment.Name << endl; + cout << bAlignment.AlignedBases << endl; + cout << "Aligned to " << references.at(bAlignment.RefID).RefName << ":" << bAlignment.Position << endl; + + cout << "Cigar Data: " << endl; + + vector::const_iterator cigarIter = bAlignment.CigarData.begin(); + vector::const_iterator cigarEnd = bAlignment.CigarData.end(); + for ( ; cigarIter != cigarEnd; ++cigarIter ) { + cout << "Type: " << (*cigarIter).Type << "\tLength: " << (*cigarIter).Length << endl; + } + + cout << "Tags: " << endl; + vector::const_iterator tagIter = bAlignment.Tags.begin(); + vector::const_iterator tagEnd = bAlignment.Tags.end(); + for ( ; tagIter != tagEnd; ++tagIter ) { + cout << (*tagIter) << endl; + } + } + } + + cerr << "Found " << alignmentCount << " alignments." << endl; + } else { cerr << "Could not rewind" << endl; } + + // ---------------------------------------------------------------------------------------------------------- // + // You can iterate over each individual alignment that overlaps a specified region + // Set the refID & left boundary parameters using Jump(refID, left) + // Jump() actually positions the file pointer actually before the left boundary + // This ensures that reads beginning before, but overlapping 'left' are included as well + // Client is responsible for setting/checking right boundary - see below + // ---------------------------------------------------------------------------------------------------------- // +/* + cerr << "Jumping to region" << endl; + + // get refID using a known refName + refName = "seq1"; + refID = bReader.GetRefID(refName); + if (refID == (int)references.size()) { + cerr << "Reference " << refName << " not found." << endl; + exit(1); + } + + // set left boundary + leftBound = 500; + + // set right boundary - either user-specified number to set a discrete region + // OR you can query the BamReader for the end of the reference + rightBound = references.at(refID).RefLength; + + cerr << endl; + cerr << "Iterating over alignments on reference: " << refName << " from " << leftBound << " to ref end (" << rightBound << ")" << endl; + + // set region - specific region on reference + if ( bReader.Jump(refID, leftBound) ) { + + alignmentCount = 0; + while ( bReader.GetNextAlignment(bAlignment) && (bAlignment.Position <= rightBound) ) { + + if ( bAlignment.IsMapped() ) { + + ++alignmentCount; + + if ( (alignmentCount % 100000) == 0) { cerr << "Retrieved " << alignmentCount << " so far..." << endl; } + + cout << "----------------------------" << endl; + cout << "Alignment " << alignmentCount << endl; + cout << bAlignment.Name << endl; + cout << bAlignment.AlignedBases << endl; + cout << "Aligned to " << references.at(bAlignment.RefID).RefName << ":" << bAlignment.Position << endl; + } + } + + cerr << "Found " << alignmentCount << " alignments." << endl; + } else { cerr << "Could not jump to region specified" << endl; } + + // ----------------------------------------------------------------------------------------------------- // + // You can 'rewind' back to beginning of BAM file at any time + // ----------------------------------------------------------------------------------------------------- // + + cerr << endl; + cerr << "Rewinding BAM file... then getting first 1000 alignments" << endl; + + alignmentCount = 0; + if (bReader.Rewind() ) { + while ( bReader.GetNextAlignment(bAlignment) && (alignmentCount < 1000) ) { + + // disregard unmapped alignments + if ( bAlignment.IsMapped() ) { + + ++alignmentCount; + + cout << "----------------------------" << endl; + cout << "Alignment " << alignmentCount << endl; + cout << bAlignment.Name << endl; + cout << bAlignment.AlignedBases << endl; + cout << "Aligned to " << references.at(bAlignment.RefID).RefName << ":" << bAlignment.Position << endl; + } + } + + cerr << "Found " << alignmentCount << " alignments." << endl; + } else { cerr << "Could not rewind" << endl; } +*/ + // ---------------------------------------------------------------------- // + // Close BamReader object (releases internal header/index data) and exit + // ---------------------------------------------------------------------- // + + cerr << endl; + cerr << "Closing BAM file: " << bamFilename << endl; + + bReader.Close(); + + cerr << "Exiting..." << endl << endl; + return 0; +} diff --git a/BamWriter.cpp b/BamWriter.cpp new file mode 100644 index 0000000..24544b3 --- /dev/null +++ b/BamWriter.cpp @@ -0,0 +1,403 @@ +// *************************************************************************** +// BamWriter (c) 2009 Michael Strömberg +// Marth Lab, Deptartment of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Provides the basic functionality for producing BAM files +// *************************************************************************** + +#include "BamWriter.h" + +// constructor +BamWriter::BamWriter(void) +{} + +// destructor +BamWriter::~BamWriter(void) { + if(mBGZF.IsOpen) BgzfClose(); +} + +// closes the BAM file +void BamWriter::BgzfClose(void) { + + mBGZF.IsOpen = false; + + // flush the BGZF block + BgzfFlushBlock(); + + // flush and close + fflush(mBGZF.Stream); + fclose(mBGZF.Stream); +} + +// compresses the current block +int BamWriter::BgzfDeflateBlock(void) { + + // initialize the gzip header + char* buffer = mBGZF.CompressedBlock; + unsigned int bufferSize = mBGZF.CompressedBlockSize; + + memset(buffer, 0, 18); + buffer[0] = GZIP_ID1; + buffer[1] = (char)GZIP_ID2; + buffer[2] = CM_DEFLATE; + buffer[3] = FLG_FEXTRA; + buffer[9] = (char)OS_UNKNOWN; + buffer[10] = BGZF_XLEN; + buffer[12] = BGZF_ID1; + buffer[13] = BGZF_ID2; + buffer[14] = BGZF_LEN; + + // loop to retry for blocks that do not compress enough + int inputLength = mBGZF.BlockOffset; + int compressedLength = 0; + + while(true) { + + z_stream zs; + zs.zalloc = NULL; + zs.zfree = NULL; + zs.next_in = (Bytef*)mBGZF.UncompressedBlock; + zs.avail_in = inputLength; + zs.next_out = (Bytef*)&buffer[BLOCK_HEADER_LENGTH]; + zs.avail_out = bufferSize - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH; + + // initialize the zlib compression algorithm + if(deflateInit2(&zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK) { + printf("ERROR: zlib deflate initialization failed.\n"); + exit(1); + } + + // compress the data + int status = deflate(&zs, Z_FINISH); + if(status != Z_STREAM_END) { + deflateEnd(&zs); + + // reduce the input length and try again + if(status == Z_OK) { + inputLength -= 1024; + if(inputLength < 0) { + printf("ERROR: input reduction failed.\n"); + exit(1); + } + continue; + } + + printf("ERROR: zlib deflate failed.\n"); + exit(1); + } + + // finalize the compression routine + if(deflateEnd(&zs) != Z_OK) { + printf("ERROR: deflate end failed.\n"); + exit(1); + } + + compressedLength = zs.total_out; + compressedLength += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH; + + if(compressedLength > MAX_BLOCK_SIZE) { + printf("ERROR: deflate overflow.\n"); + exit(1); + } + + break; + } + + // store the compressed length + BgzfPackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1)); + + // store the CRC32 checksum + unsigned int crc = crc32(0, NULL, 0); + crc = crc32(crc, (Bytef*)mBGZF.UncompressedBlock, inputLength); + BgzfPackUnsignedInt(&buffer[compressedLength - 8], crc); + BgzfPackUnsignedInt(&buffer[compressedLength - 4], inputLength); + + // ensure that we have less than a block of data left + int remaining = mBGZF.BlockOffset - inputLength; + if(remaining > 0) { + if(remaining > inputLength) { + printf("ERROR: remainder too large.\n"); + exit(1); + } + + memcpy(mBGZF.UncompressedBlock, mBGZF.UncompressedBlock + inputLength, remaining); + } + + mBGZF.BlockOffset = remaining; + return compressedLength; +} + +// flushes the data in the BGZF block +void BamWriter::BgzfFlushBlock(void) { + + // flush all of the remaining blocks + while(mBGZF.BlockOffset > 0) { + + // compress the data block + int blockLength = BgzfDeflateBlock(); + + // flush the data to our output stream + int numBytesWritten = fwrite(mBGZF.CompressedBlock, 1, blockLength, mBGZF.Stream); + + if(numBytesWritten != blockLength) { + printf("ERROR: Expected to write %u bytes during flushing, but wrote %u bytes.\n", blockLength, numBytesWritten); + exit(1); + } + + mBGZF.BlockAddress += blockLength; + } +} + +// opens the BAM file for writing +void BamWriter::BgzfOpen(const string& filename) { + + mBGZF.Stream = fopen(filename.c_str(), "wb"); + + if(!mBGZF.Stream) { + printf("ERROR: Unable to open the BAM file (%s) for writing.\n", filename.c_str()); + exit(1); + } + + mBGZF.IsOpen = true; +} + +// writes the supplied data into the BGZF buffer +unsigned int BamWriter::BgzfWrite(const char* data, const unsigned int dataLen) { + + // initialize + unsigned int numBytesWritten = 0; + const char* input = data; + unsigned int blockLength = mBGZF.UncompressedBlockSize; + + // copy the data to the buffer + while(numBytesWritten < dataLen) { + unsigned int copyLength = min(blockLength - mBGZF.BlockOffset, dataLen - numBytesWritten); + char* buffer = mBGZF.UncompressedBlock; + memcpy(buffer + mBGZF.BlockOffset, input, copyLength); + + mBGZF.BlockOffset += copyLength; + input += copyLength; + numBytesWritten += copyLength; + + if(mBGZF.BlockOffset == blockLength) BgzfFlushBlock(); + } + + return numBytesWritten; +} + +// closes the alignment archive +void BamWriter::Close(void) { + if(mBGZF.IsOpen) BgzfClose(); +} + +// creates a cigar string from the supplied alignment +void BamWriter::CreatePackedCigar(const vector& cigarOperations, string& packedCigar) { + + // initialize + const unsigned int numCigarOperations = cigarOperations.size(); + packedCigar.resize(numCigarOperations * SIZEOF_INT); + + // pack the cigar data into the string + unsigned int* pPackedCigar = (unsigned int*)packedCigar.data(); + + unsigned int cigarOp; + vector::const_iterator coIter; + for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); coIter++) { + + switch(coIter->Type) { + case 'M': + cigarOp = BAM_CMATCH; + break; + case 'I': + cigarOp = BAM_CINS; + break; + case 'D': + cigarOp = BAM_CDEL; + break; + case 'N': + cigarOp = BAM_CREF_SKIP; + break; + case 'S': + cigarOp = BAM_CSOFT_CLIP; + break; + case 'H': + cigarOp = BAM_CHARD_CLIP; + break; + case 'P': + cigarOp = BAM_CPAD; + break; + default: + printf("ERROR: Unknown cigar operation found: %c\n", coIter->Type); + exit(1); + } + + *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp; + pPackedCigar++; + } +} + +// encodes the supplied query sequence into 4-bit notation +void BamWriter::EncodeQuerySequence(const string& query, string& encodedQuery) { + + // prepare the encoded query string + const unsigned int queryLen = query.size(); + const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5); + encodedQuery.resize(encodedQueryLen); + char* pEncodedQuery = (char*)encodedQuery.data(); + const char* pQuery = (const char*)query.data(); + + unsigned char nucleotideCode; + bool useHighWord = true; + + while(*pQuery) { + + switch(*pQuery) { + case '=': + nucleotideCode = 0; + break; + case 'A': + nucleotideCode = 1; + break; + case 'C': + nucleotideCode = 2; + break; + case 'G': + nucleotideCode = 4; + break; + case 'T': + nucleotideCode = 8; + break; + case 'N': + nucleotideCode = 15; + break; + default: + printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery); + exit(1); + } + + // pack the nucleotide code + if(useHighWord) { + *pEncodedQuery = nucleotideCode << 4; + useHighWord = false; + } else { + *pEncodedQuery |= nucleotideCode; + pEncodedQuery++; + useHighWord = true; + } + + // increment the query position + pQuery++; + } +} + +// opens the alignment archive +void BamWriter::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences) { + + // open the BGZF file for writing + BgzfOpen(filename); + + // ================ + // write the header + // ================ + + // write the BAM signature + const unsigned char SIGNATURE_LENGTH = 4; + const char* BAM_SIGNATURE = "BAM\1"; + BgzfWrite(BAM_SIGNATURE, SIGNATURE_LENGTH); + + // write the SAM header text length + const unsigned int samHeaderLen = samHeader.size(); + BgzfWrite((char*)&samHeaderLen, SIZEOF_INT); + + // write the SAM header text + if(samHeaderLen > 0) BgzfWrite(samHeader.data(), samHeaderLen); + + // write the number of reference sequences + const unsigned int numReferenceSequences = referenceSequences.size(); + BgzfWrite((char*)&numReferenceSequences, SIZEOF_INT); + + // ============================= + // write the sequence dictionary + // ============================= + + RefVector::const_iterator rsIter; + for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) { + + // write the reference sequence name length + const unsigned int referenceSequenceNameLen = rsIter->RefName.size() + 1; + BgzfWrite((char*)&referenceSequenceNameLen, SIZEOF_INT); + + // write the reference sequence name + BgzfWrite(rsIter->RefName.c_str(), referenceSequenceNameLen); + + // write the reference sequence length + BgzfWrite((char*)&rsIter->RefLength, SIZEOF_INT); + } +} + +// saves the alignment to the alignment archive +void BamWriter::SaveAlignment(const BamAlignment& al) { + + // initialize + const unsigned int nameLen = al.Name.size() + 1; + const unsigned int queryLen = al.QueryBases.size(); + const unsigned int numCigarOperations = al.CigarData.size(); + + // create our packed cigar string + string packedCigar; + CreatePackedCigar(al.CigarData, packedCigar); + const unsigned int packedCigarLen = packedCigar.size(); + + // encode the query + string encodedQuery; + EncodeQuerySequence(al.QueryBases, encodedQuery); + const unsigned int encodedQueryLen = encodedQuery.size(); + + // create our read group tag + // TODO: add read group tag support when it shows up in the BamAlignment struct + //string readGroupTag; + //const unsigned int readGroupTagLen = 3 + mReadGroupLUT[al.ReadGroupCode].NameLen + 1; + //readGroupTag.resize(readGroupTagLen); + //char* pReadGroupTag = (char*)readGroupTag.data(); + //sprintf(pReadGroupTag, "RGZ%s", mReadGroupLUT[al.ReadGroupCode].Name); + + // assign the BAM core data + unsigned int buffer[8]; + buffer[0] = al.RefID; + buffer[1] = al.Position; + buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | nameLen; + buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations; + buffer[4] = queryLen; + buffer[5] = al.MateRefID; + buffer[6] = al.MatePosition; + buffer[7] = al.InsertSize; + + // write the block size + const unsigned int dataBlockSize = nameLen + packedCigarLen + encodedQueryLen + queryLen /* + readGroupTagLen*/; + const unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize; + + BgzfWrite((char*)&blockSize, SIZEOF_INT); + + // write the BAM core + BgzfWrite((char*)&buffer, BAM_CORE_SIZE); + + // write the query name + BgzfWrite(al.Name.c_str(), nameLen); + + // write the packed cigar + BgzfWrite(packedCigar.data(), packedCigarLen); + + // write the encoded query sequence + BgzfWrite(encodedQuery.data(), encodedQueryLen); + + // write the base qualities + string baseQualities = al.Qualities; + char* pBaseQualities = (char*)al.Qualities.data(); + for(unsigned int i = 0; i < queryLen; i++) pBaseQualities[i] -= 33; + BgzfWrite(pBaseQualities, queryLen); + + // write the read group tag + // TODO: add read group tag support when it shows up in the BamAlignment struct + //BgzfWrite(readGroupTag.c_str(), readGroupTagLen); +} diff --git a/BamWriter.h b/BamWriter.h new file mode 100644 index 0000000..422d1f9 --- /dev/null +++ b/BamWriter.h @@ -0,0 +1,145 @@ +// *************************************************************************** +// BamWriter (c) 2009 Michael Strömberg +// Marth Lab, Deptartment of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Provides the basic functionality for producing BAM files +// *************************************************************************** + +#pragma once + +#include +#include +#include +#include +#include +#include +#include "BamAlignment.h" + +using namespace std; + +// our zlib constants +#define GZIP_ID1 31 +#define GZIP_ID2 139 +#define CM_DEFLATE 8 +#define FLG_FEXTRA 4 +#define OS_UNKNOWN 255 +#define BGZF_XLEN 6 +#define BGZF_ID1 66 +#define BGZF_ID2 67 +#define BGZF_LEN 2 +#define GZIP_WINDOW_BITS -15 +#define Z_DEFAULT_MEM_LEVEL 8 + +// our BZGF constants +#define BLOCK_HEADER_LENGTH 18 +#define BLOCK_FOOTER_LENGTH 8 +#define MAX_BLOCK_SIZE 65536 +#define DEFAULT_BLOCK_SIZE 65536 + +// our BAM constants +#define BAM_CORE_SIZE 32 +#define BAM_CMATCH 0 +#define BAM_CINS 1 +#define BAM_CDEL 2 +#define BAM_CREF_SKIP 3 +#define BAM_CSOFT_CLIP 4 +#define BAM_CHARD_CLIP 5 +#define BAM_CPAD 6 +#define BAM_CIGAR_SHIFT 4 + +#define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1) + +// our variable sizes +#define SIZEOF_INT 4 + +// define our BZGF structure +struct BgzfData { + unsigned int UncompressedBlockSize; + unsigned int CompressedBlockSize; + unsigned int BlockLength; + unsigned int BlockOffset; + uint64_t BlockAddress; + bool IsOpen; + FILE* Stream; + char* UncompressedBlock; + char* CompressedBlock; + + // constructor + BgzfData(void) + : UncompressedBlockSize(DEFAULT_BLOCK_SIZE) + , CompressedBlockSize(MAX_BLOCK_SIZE) + , BlockLength(0) + , BlockOffset(0) + , BlockAddress(0) + , IsOpen(false) + , Stream(NULL) + , UncompressedBlock(NULL) + , CompressedBlock(NULL) + { + try { + CompressedBlock = new char[CompressedBlockSize]; + UncompressedBlock = new char[UncompressedBlockSize]; + } catch(bad_alloc&) { + printf("ERROR: Unable to allocate memory for our BGZF object.\n"); + exit(1); + } + } + + // destructor + ~BgzfData(void) { + if(CompressedBlock) delete [] CompressedBlock; + if(UncompressedBlock) delete [] UncompressedBlock; + } +}; + +class BamWriter { +public: + // constructor + BamWriter(void); + // destructor + ~BamWriter(void); + // closes the alignment archive + void Close(void); + // opens the alignment archive + void Open(const string& filename, const string& samHeader, const RefVector& referenceSequences); + // saves the alignment to the alignment archive + void SaveAlignment(const BamAlignment& al); +private: + // closes the BAM file + void BgzfClose(void); + // compresses the current block + int BgzfDeflateBlock(void); + // flushes the data in the BGZF block + void BgzfFlushBlock(void); + // opens the BAM file for writing + void BgzfOpen(const string& filename); + // packs an unsigned integer into the specified buffer + static inline void BgzfPackUnsignedInt(char* buffer, unsigned int value); + // packs an unsigned short into the specified buffer + static inline void BgzfPackUnsignedShort(char* buffer, unsigned short value); + // writes the supplied data into the BGZF buffer + unsigned int BgzfWrite(const char* data, const unsigned int dataLen); + // calculates the minimum bin that contains a region [begin, end) + static inline unsigned int CalculateMinimumBin(unsigned int begin, unsigned int end); + // creates a packed cigar string from the supplied alignment + static void CreatePackedCigar(const vector& cigarOperations, string& packedCigar); + // encodes the supplied query sequence into 4-bit notation + static void EncodeQuerySequence(const string& query, string& encodedQuery); + // our BGZF output object + BgzfData mBGZF; +}; + +// packs an unsigned integer into the specified buffer +inline void BamWriter::BgzfPackUnsignedInt(char* buffer, unsigned int value) { + buffer[0] = (char)value; + buffer[1] = (char)(value >> 8); + buffer[2] = (char)(value >> 16); + buffer[3] = (char)(value >> 24); +} + +// packs an unsigned short into the specified buffer +inline void BamWriter::BgzfPackUnsignedShort(char* buffer, unsigned short value) { + buffer[0] = (char)value; + buffer[1] = (char)(value >> 8); +} diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..41ba820 --- /dev/null +++ b/Makefile @@ -0,0 +1,35 @@ +CC= gcc +CXX= g++ +CFLAGS= -Wall -pg -O3 -m64 +CXXFLAGS= $(CFLAGS) +DFLAGS= -D_IOLIB=2 #-D_NDEBUG +OBJS= BamReader.o bgzf.o +PROG= BamReaderTest +INCLUDES= +ARFLAGS= -crs +LIBS= -lz +SUBDIRS= . + +.SUFFIXES:.c .cpp .o + +.c.o: + $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ + +.cpp.o: + $(CXX) -c $(CXXFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ + +all: $(PROG) BamConversion + +lib:libbambc.a + +libbambc.a:$(OBJS) + $(AR) $(ARFLAGS) $@ $(OBJS) + +BamReaderTest:lib BamReaderMain.o + $(CXX) $(CXXFLAGS) -o $@ BamReaderMain.o $(LIBS) -L. -lbambc + +BamConversion: lib BamWriter.o BamConversionMain.o + $(CXX) $(CXXFLAGS) -o $@ BamWriter.o BamConversionMain.o $(LIBS) -L. -lbambc + +clean: + rm -fr gmon.out *.o *.a a.out $(PROG) *~ diff --git a/README b/README new file mode 100644 index 0000000..639bfd1 --- /dev/null +++ b/README @@ -0,0 +1,11 @@ +--------------- +README +---------------- + +Copy the 4 header files (*.h) and the library file (libbambc.a) to your directory. + +When you compile your program, add this to the g++ command: + + -lz -L. -lbambc + +and you should be good to go. (Those are lower case l's, as in 'lion'.) diff --git a/STLUtilities.h b/STLUtilities.h new file mode 100644 index 0000000..7a9fbc5 --- /dev/null +++ b/STLUtilities.h @@ -0,0 +1,146 @@ +// STLUtilities.h + +// Derek Barnett +// Marth Lab, Boston College +// Last modified: 19 March 2009 + +#ifndef STL_UTILITIES_H +#define STL_UTILITIES_H + +#include +using std::distance; +using std::find; +using std::transform; + +#include +using std::unary_function; + +#include +using std::iterator_traits; + +#include +using std::string; + +#include +using std::pair; + +#include +using std::vector; + +// -------------------------------------------------------------------------------- // +// This file contains a sample of STL tricks I've gathered along the way. +// +// Many thanks throughout to 'Effective' series by Scott Meyers. +// Some code is lifted (almost) verbatim from his texts where applicable. +// ------------------------------------------------------------------------------- // + +// --------------------------------------------------------------------------------------------------------------------------------- // +// STL containers will delete the values they hold when the container is deleted or goes out of scope +// *** If the container contains pointers, however, they WILL NOT delete the actual pointed-to objects *** +// This struct allows for easy algorithm processing (instead of pesky loops and iterator-boundary issues) to delete pointed-to objects (for any C++ type) that have been allocated with 'new') +// Usage example: for_each( container.begin(), container.end(), DeleteObject() ) +// +// Unless of course, you like quirky, hard-to-spot memory leaks... then feel free to disregard this little STL lesson =) +// --------------------------------------------------------------------------------------------------------------------------------- // +struct DeleteObject { + template + void operator() (const T* p) const { + delete p; + } +}; + +template +void ClearPointerVector(vector& v) { + if ( !v.empty() ) { + for_each(v.begin(), v.end(), DeleteObject()); + v.clear(); + } +} + +// --------------------------------------------------------------------------------------------------------------------------------- // +// Query a vector (other containers? havent tried) for an element +// Returns the index of that element +// Returns vector::size() if not found +// Works with reverse iterators as well... index is counted backward from last element though +// --------------------------------------------------------------------------------------------------------------------------------- // +template < typename InputIterator, typename EqualityComparable > +typename iterator_traits::difference_type +Index(const InputIterator& begin, const InputIterator& end, const EqualityComparable& item) { + return distance(begin, find(begin, end, item)); +} + +// ----------------------------------------------------------------------------------------------------------------------// +// This next section is a sort of work-around for the bulky associative maps +// The idea is to use a *SORTED* vector of pair objects as a lookup vector +// LookupVector = vector< pair > +// ----------------------------------------------------------------------------------------------------------------------// + +// The following template classes allow a templatized comparison function for sorting & searching lookup vector + +// allows sorting by Key ( less ) +template +class LookupKeyCompare { + + typedef pair< Key, Value > LookupData; + typedef typename LookupData::first_type Key_t; + + public: + bool operator() (const LookupData& lhs, const LookupData& rhs) const { return keyLess(lhs.first, rhs.first); } + bool operator() (const LookupData& lhs, const Key_t& k) const { return keyLess(lhs.first, k); } + bool operator() (const Key_t& k, const LookupData& rhs) const { return keyLess(k, rhs.first); } + private: + bool keyLess(const Key_t& k1, const Key_t& k2) const { return k1 < k2; } +}; + +// allows sorting by Value ( less ) +template +class LookupValueCompare { + + typedef pair< Key, Value > LookupData; + typedef typename LookupData::second_type Value_t; + + public: + bool operator() (const LookupData& lhs, const LookupData& rhs) const { return valueLess(lhs.second, rhs.second); } + bool operator() (const LookupData& lhs, const Value_t& k) const { return valueLess(lhs.second, k); } + bool operator() (const Value_t& k, const LookupData& rhs) const { return valueLess(k, rhs.second); } + private: + bool valueLess(const Value_t& k1, const Value_t& k2) const { return k1 < k2; } +}; + +// The following template functions/structs allow you to pull all keys or all values from this lookup structure + +// pull Key from a data pair +template +struct RipKey: + public unary_function< pair, Key> { + Key operator() (pair dataPair) { + return dataPair.first; + } +}; + +// pull Value from a data pair +template +struct RipValue: + public unary_function< pair, Value> { + Value operator() (pair dataPair) { + return dataPair.second; + } +}; + +// pull all Keys from lookup, store in dest (overwrite contents of dest) +template +void RipKeys( vector< pair >& lookup, vector& dest) { + dest.clear(); + dest.reserve(lookup.size()); + transform(lookup.begin(), lookup.end(), back_inserter(dest), RipKey()); +} + +// pull all Values from lookup, store in dest (overwrite contents of dest) +template +void RipValues( vector< pair >& lookup, vector& dest) { + dest.clear(); + dest.reserve(lookup.size()); + transform(lookup.begin(), lookup.end(), back_inserter(dest), RipValue()); +} + +#endif /* STL_UTILITIES_H */ diff --git a/bgzf.c b/bgzf.c new file mode 100644 index 0000000..4314c70 --- /dev/null +++ b/bgzf.c @@ -0,0 +1,488 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2008 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. + * Neither the Broad Institute nor MIT can be responsible for its use, misuse, + * or functionality. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "bgzf.h" + +extern off_t ftello(FILE *stream); +extern int fseeko(FILE *stream, off_t offset, int whence); + +typedef int8_t byte; + +static const int DEFAULT_BLOCK_SIZE = 64 * 1024; +static const int MAX_BLOCK_SIZE = 64 * 1024; + +static const int BLOCK_HEADER_LENGTH = 18; +static const int BLOCK_FOOTER_LENGTH = 8; + +static const int GZIP_ID1 = 31; +static const int GZIP_ID2 = 139; +static const int CM_DEFLATE = 8; +static const int FLG_FEXTRA = 4; +static const int OS_UNKNOWN = 255; +static const int BGZF_ID1 = 66; // 'B' +static const int BGZF_ID2 = 67; // 'C' +static const int BGZF_LEN = 2; +static const int BGZF_XLEN = 6; // BGZF_LEN+4 + +static const int GZIP_WINDOW_BITS = -15; // no zlib header +static const int Z_DEFAULT_MEM_LEVEL = 8; + + +inline +void +packInt16(uint8_t* buffer, uint16_t value) +{ + buffer[0] = value; + buffer[1] = value >> 8; +} + +inline +int +unpackInt16(const uint8_t* buffer) +{ + return (buffer[0] | (buffer[1] << 8)); +} + +inline +void +packInt32(uint8_t* buffer, uint32_t value) +{ + buffer[0] = value; + buffer[1] = value >> 8; + buffer[2] = value >> 16; + buffer[3] = value >> 24; +} + +inline +int +min(int x, int y) +{ + return (x < y) ? x : y; +} + +static +void +report_error(BGZF* fp, const char* message) { + fp->error = message; +} + +static +BGZF* +open_read(int fd) +{ + FILE* file = fdopen(fd, "r"); + BGZF* fp = malloc(sizeof(BGZF)); + fp->file_descriptor = fd; + fp->open_mode = 'r'; + fp->owned_file = 0; + fp->file = file; + fp->uncompressed_block_size = MAX_BLOCK_SIZE; + fp->uncompressed_block = malloc(MAX_BLOCK_SIZE); + fp->compressed_block_size = MAX_BLOCK_SIZE; + fp->compressed_block = malloc(MAX_BLOCK_SIZE); + fp->block_address = 0; + fp->block_offset = 0; + fp->block_length = 0; + fp->error = NULL; + return fp; +} + +static +BGZF* +open_write(int fd) +{ + FILE* file = fdopen(fd, "w"); + BGZF* fp = malloc(sizeof(BGZF)); + fp->file_descriptor = fd; + fp->open_mode = 'w'; + fp->owned_file = 0; + fp->file = file; + fp->uncompressed_block_size = DEFAULT_BLOCK_SIZE; + fp->uncompressed_block = NULL; + fp->compressed_block_size = MAX_BLOCK_SIZE; + fp->compressed_block = malloc(MAX_BLOCK_SIZE); + fp->block_address = 0; + fp->block_offset = 0; + fp->block_length = 0; + fp->error = NULL; + return fp; +} + +BGZF* +bgzf_open(const char* __restrict path, const char* __restrict mode) +{ + BGZF* fp = NULL; + if (strcasecmp(mode, "r") == 0) { + int oflag = O_RDONLY; + int fd = open(path, oflag); + fp = open_read(fd); + } else if (strcasecmp(mode, "w") == 0) { + int oflag = O_WRONLY | O_CREAT | O_TRUNC; + int fd = open(path, oflag, 0644); + fp = open_write(fd); + } + if (fp != NULL) { + fp->owned_file = 1; + } + return fp; +} + +BGZF* +bgzf_fdopen(int fd, const char * __restrict mode) +{ + if (strcasecmp(mode, "r") == 0) { + return open_read(fd); + } else if (strcasecmp(mode, "w") == 0) { + return open_write(fd); + } else { + return NULL; + } +} + +static +int +deflate_block(BGZF* fp, int block_length) +{ + // Deflate the block in fp->uncompressed_block into fp->compressed_block. + // Also adds an extra field that stores the compressed block length. + + byte* buffer = fp->compressed_block; + int buffer_size = fp->compressed_block_size; + + // Init gzip header + buffer[0] = GZIP_ID1; + buffer[1] = GZIP_ID2; + buffer[2] = CM_DEFLATE; + buffer[3] = FLG_FEXTRA; + buffer[4] = 0; // mtime + buffer[5] = 0; + buffer[6] = 0; + buffer[7] = 0; + buffer[8] = 0; + buffer[9] = OS_UNKNOWN; + buffer[10] = BGZF_XLEN; + buffer[11] = 0; + buffer[12] = BGZF_ID1; + buffer[13] = BGZF_ID2; + buffer[14] = BGZF_LEN; + buffer[15] = 0; + buffer[16] = 0; // placeholder for block length + buffer[17] = 0; + + // loop to retry for blocks that do not compress enough + int input_length = block_length; + int compressed_length = 0; + while (1) { + + z_stream zs; + zs.zalloc = NULL; + zs.zfree = NULL; + zs.next_in = fp->uncompressed_block; + zs.avail_in = input_length; + zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH]; + zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH; + + int status = deflateInit2(&zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED, + GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY); + if (status != Z_OK) { + report_error(fp, "deflate init failed"); + return -1; + } + status = deflate(&zs, Z_FINISH); + if (status != Z_STREAM_END) { + deflateEnd(&zs); + if (status == Z_OK) { + // Not enough space in buffer. + // Can happen in the rare case the input doesn't compress enough. + // Reduce the amount of input until it fits. + input_length -= 1024; + if (input_length <= 0) { + // should never happen + report_error(fp, "input reduction failed"); + return -1; + } + continue; + } + report_error(fp, "deflate failed"); + return -1; + } + status = deflateEnd(&zs); + if (status != Z_OK) { + report_error(fp, "deflate end failed"); + return -1; + } + compressed_length = zs.total_out; + compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH; + if (compressed_length > MAX_BLOCK_SIZE) { + // should never happen + report_error(fp, "deflate overflow"); + return -1; + } + break; + } + + packInt16((uint8_t*)&buffer[16], compressed_length-1); + uint32_t crc = crc32(0L, NULL, 0L); + crc = crc32(crc, fp->uncompressed_block, input_length); + packInt32((uint8_t*)&buffer[compressed_length-8], crc); + packInt32((uint8_t*)&buffer[compressed_length-4], input_length); + + int remaining = block_length - input_length; + if (remaining > 0) { + if (remaining > input_length) { + // should never happen (check so we can use memcpy) + report_error(fp, "remainder too large"); + return -1; + } + memcpy(fp->uncompressed_block, + fp->uncompressed_block + input_length, + remaining); + } + fp->block_offset = remaining; + return compressed_length; +} + +static +int +inflate_block(BGZF* fp, int block_length) +{ + // Inflate the block in fp->compressed_block into fp->uncompressed_block + + z_stream zs; + zs.zalloc = NULL; + zs.zfree = NULL; + zs.next_in = fp->compressed_block + 18; + zs.avail_in = block_length - 16; + zs.next_out = fp->uncompressed_block; + zs.avail_out = fp->uncompressed_block_size; + + int status = inflateInit2(&zs, GZIP_WINDOW_BITS); + if (status != Z_OK) { + report_error(fp, "inflate init failed"); + return -1; + } + status = inflate(&zs, Z_FINISH); + if (status != Z_STREAM_END) { + inflateEnd(&zs); + report_error(fp, "inflate failed"); + return -1; + } + status = inflateEnd(&zs); + if (status != Z_OK) { + report_error(fp, "inflate failed"); + return -1; + } + return zs.total_out; +} + +static +int +check_header(const byte* header) +{ + return (header[0] == GZIP_ID1 && + header[1] == (byte) GZIP_ID2 && + header[2] == Z_DEFLATED && + (header[3] & FLG_FEXTRA) != 0 && + unpackInt16((uint8_t*)&header[10]) == BGZF_XLEN && + header[12] == BGZF_ID1 && + header[13] == BGZF_ID2 && + unpackInt16((uint8_t*)&header[14]) == BGZF_LEN); +} + +static +int +read_block(BGZF* fp) +{ + byte header[BLOCK_HEADER_LENGTH]; + int64_t block_address = ftello(fp->file); + int count = fread(header, 1, sizeof(header), fp->file); + if (count == 0) { + fp->block_length = 0; + return 0; + } + if (count != sizeof(header)) { + report_error(fp, "read failed"); + return -1; + } + if (!check_header(header)) { + report_error(fp, "invalid block header"); + return -1; + } + int block_length = unpackInt16((uint8_t*)&header[16]) + 1; + byte* compressed_block = (byte*) fp->compressed_block; + memcpy(compressed_block, header, BLOCK_HEADER_LENGTH); + int remaining = block_length - BLOCK_HEADER_LENGTH; + count = fread(&compressed_block[BLOCK_HEADER_LENGTH], 1, remaining, fp->file); + if (count != remaining) { + report_error(fp, "read failed"); + return -1; + } + count = inflate_block(fp, block_length); + if (count < 0) { + return -1; + } + if (fp->block_length != 0) { + // Do not reset offset if this read follows a seek. + fp->block_offset = 0; + } + fp->block_address = block_address; + fp->block_length = count; + return 0; +} + +int +bgzf_read(BGZF* fp, void* data, int length) +{ + if (length <= 0) { + return 0; + } + if (fp->open_mode != 'r') { + report_error(fp, "file not open for reading"); + return -1; + } + + int bytes_read = 0; + byte* output = data; + while (bytes_read < length) { + int available = fp->block_length - fp->block_offset; + if (available <= 0) { + if (read_block(fp) != 0) { + return -1; + } + available = fp->block_length - fp->block_offset; + if (available <= 0) { + break; + } + } + int copy_length = min(length-bytes_read, available); + byte* buffer = fp->uncompressed_block; + memcpy(output, buffer + fp->block_offset, copy_length); + fp->block_offset += copy_length; + output += copy_length; + bytes_read += copy_length; + } + if (fp->block_offset == fp->block_length) { + fp->block_address = ftello(fp->file); + fp->block_offset = 0; + fp->block_length = 0; + } + return bytes_read; +} + +static +int +flush_block(BGZF* fp) +{ + while (fp->block_offset > 0) { + int block_length = deflate_block(fp, fp->block_offset); + if (block_length < 0) { + return -1; + } + int count = fwrite(fp->compressed_block, 1, block_length, fp->file); + if (count != block_length) { + report_error(fp, "write failed"); + return -1; + } + fp->block_address += block_length; + } + return 0; +} + +int +bgzf_write(BGZF* fp, const void* data, int length) +{ + if (fp->open_mode != 'w') { + report_error(fp, "file not open for writing"); + return -1; + } + + if (fp->uncompressed_block == NULL) { + fp->uncompressed_block = malloc(fp->uncompressed_block_size); + } + + const byte* input = data; + int block_length = fp->uncompressed_block_size; + int bytes_written = 0; + while (bytes_written < length) { + int copy_length = min(block_length - fp->block_offset, length - bytes_written); + byte* buffer = fp->uncompressed_block; + memcpy(buffer + fp->block_offset, input, copy_length); + fp->block_offset += copy_length; + input += copy_length; + bytes_written += copy_length; + if (fp->block_offset == block_length) { + if (flush_block(fp) != 0) { + break; + } + } + } + return bytes_written; +} + +int +bgzf_close(BGZF* fp) +{ + if (fp->open_mode == 'w') { + if (flush_block(fp) != 0) { + return -1; + } + if (fflush(fp->file) != 0) { + report_error(fp, "flush failed"); + return -1; + } + } + if (fp->owned_file) { + if (fclose(fp->file) != 0) { + return -1; + } + } + free(fp->uncompressed_block); + free(fp->compressed_block); + free(fp); + return 0; +} + +int64_t +bgzf_tell(BGZF* fp) +{ + return ((fp->block_address << 16) | (fp->block_offset & 0xFFFF)); +} + +int64_t +bgzf_seek(BGZF* fp, int64_t pos, int where) +{ + if (fp->open_mode != 'r') { + report_error(fp, "file not open for read"); + return -1; + } + if (where != SEEK_SET) { + report_error(fp, "unimplemented seek option"); + return -1; + } + int block_offset = pos & 0xFFFF; + int64_t block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL; + if (fseeko(fp->file, block_address, SEEK_SET) != 0) { + report_error(fp, "seek failed"); + return -1; + } + fp->block_length = 0; // indicates current block is not loaded + fp->block_address = block_address; + fp->block_offset = block_offset; + return 0; +} + diff --git a/bgzf.h b/bgzf.h new file mode 100644 index 0000000..7e76bcb --- /dev/null +++ b/bgzf.h @@ -0,0 +1,102 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2008 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. + * Neither the Broad Institute nor MIT can be responsible for its use, misuse, + * or functionality. + */ + +#ifndef __BGZF_H +#define __BGZF_H + +#include +#include +#include "zlib.h" +#include +//#include "zutil.h" + +//typedef int8_t bool; + +typedef struct { + int file_descriptor; + char open_mode; // 'r' or 'w' + bool owned_file; + FILE* file; + int uncompressed_block_size; + int compressed_block_size; + void* uncompressed_block; + void* compressed_block; + int64_t block_address; + int block_length; + int block_offset; + const char* error; +} BGZF; + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Open an existing file descriptor for reading or writing. + * Mode must be either "r" or "w". + * A subsequent bgzf_close will not close the file descriptor. + * Returns null on error. + */ +BGZF* bgzf_fdopen(int fd, const char* __restrict mode); + +/* + * Open the specified file for reading or writing. + * Mode must be either "r" or "w". + * Returns null on error. + */ +BGZF* bgzf_open(const char* path, const char* __restrict mode); + +/* + * Close the BGZ file and free all associated resources. + * Does not close the underlying file descriptor if created with bgzf_fdopen. + * Returns zero on success, -1 on error. + */ +int bgzf_close(BGZF* fp); + +/* + * Read up to length bytes from the file storing into data. + * Returns the number of bytes actually read. + * Returns zero on end of file. + * Returns -1 on error. + */ +int bgzf_read(BGZF* fp, void* data, int length); + +/* + * Write length bytes from data to the file. + * Returns the number of bytes written. + * Returns -1 on error. + */ +int bgzf_write(BGZF* fp, const void* data, int length); + +/* + * Return a virtual file pointer to the current location in the file. + * No interpetation of the value should be made, other than a subsequent + * call to bgzf_seek can be used to position the file at the same point. + * Return value is non-negative on success. + * Returns -1 on error. + */ +int64_t bgzf_tell(BGZF* fp); + +/* + * Set the file to read from the location specified by pos, which must + * be a value previously returned by bgzf_tell for this file (but not + * necessarily one returned by this file handle). + * The where argument must be SEEK_SET. + * Seeking on a file opened for write is not supported. + * Returns zero on success, -1 on error. + */ +int64_t bgzf_seek(BGZF* fp, int64_t pos, int where); + +#ifdef __cplusplus +} +#endif + +#endif