From 3ba4b5eb3f88fd8f24af8bf83ba218b84faa87ba Mon Sep 17 00:00:00 2001 From: mikaels Date: Fri, 10 Apr 2009 15:07:49 +0000 Subject: [PATCH] Initial import. git-svn-id: svn+ssh://gene.bc.edu/home/subversion/Derek/BamTools/trunk@2 9efb377e-2e27-44b9-b91a-ec4abb80ed8b --- BamAlignment.h | 92 ++++++ BamConversionMain.cpp | 46 +++ BamReader.cpp | 653 ++++++++++++++++++++++++++++++++++++++++++ BamReader.h | 230 +++++++++++++++ BamReaderMain.cpp | 255 +++++++++++++++++ BamWriter.cpp | 403 ++++++++++++++++++++++++++ BamWriter.h | 145 ++++++++++ Makefile | 35 +++ README | 11 + STLUtilities.h | 146 ++++++++++ bgzf.c | 488 +++++++++++++++++++++++++++++++ bgzf.h | 102 +++++++ 12 files changed, 2606 insertions(+) create mode 100644 BamAlignment.h create mode 100644 BamConversionMain.cpp create mode 100644 BamReader.cpp create mode 100644 BamReader.h create mode 100644 BamReaderMain.cpp create mode 100644 BamWriter.cpp create mode 100644 BamWriter.h create mode 100644 Makefile create mode 100644 README create mode 100644 STLUtilities.h create mode 100644 bgzf.c create mode 100644 bgzf.h diff --git a/BamAlignment.h b/BamAlignment.h new file mode 100644 index 0000000..b299ab3 --- /dev/null +++ b/BamAlignment.h @@ -0,0 +1,92 @@ +// BamAlignment.h + +// Derek Barnett +// Marth Lab, Boston College +// Last modified: 20 March 2009 + +#ifndef BAMALIGNMENT_H +#define BAMALIGNMENT_H + +#include + +// C++ includes +#include +using std::string; + +#include +using std::vector; + +struct CigarOp { + uint32_t Length; + char Type; +}; + +struct RefData { + string RefName; + unsigned int RefLength; + bool RefHasAlignments; + + // constructor + RefData(void) + : RefLength(0) + , RefHasAlignments(false) + { } +}; + +typedef vector RefVector; + +struct BamAlignment { + + // queries against alignment flag - see below for further detail + public: + bool IsPaired(void) const { return ( (AlignmentFlag & PAIRED) != 0 ); } + bool IsProperPair(void) const { return ( (AlignmentFlag & PROPER_PAIR) != 0 ); } + bool IsMapped(void) const { return ( (AlignmentFlag & UNMAPPED) == 0 ); } + bool IsMateMapped(void) const { return ( (AlignmentFlag & MATE_UNMAPPED) == 0 ); } + bool IsReverseStrand(void) const { return ( (AlignmentFlag & REVERSE) != 0 ); } + bool IsMateReverseStrand(void) const { return ( (AlignmentFlag & MATE_REVERSE) != 0 ); } + bool IsFirstMate(void) const { return ( (AlignmentFlag & READ_1) != 0 ); } + bool IsSecondMate(void) const { return ( (AlignmentFlag & READ_2) != 0 ); } + bool IsPrimaryAlignment(void) const { return ( (AlignmentFlag & SECONDARY) == 0 ); } + bool IsFailedQC(void) const { return ( (AlignmentFlag & QC_FAILED) != 0 ); } + bool IsDuplicate(void) const { return ( (AlignmentFlag & DUPLICATE) != 0 ); } + + // data members + public: + string Name; // read name + unsigned int Length; // query length + string QueryBases; // original sequence ( produced from machine ) + string AlignedBases; // aligned sequence ( with indels ) + string Qualities; // FASTQ qualities ( still in ASCII characters ) + vector Tags; + unsigned int RefID; // ID for reference sequence + unsigned int Position; // position on reference sequence where alignment starts + unsigned int Bin; // bin in BAM file where this alignment resides + unsigned int MapQuality; // mapping quality + unsigned int AlignmentFlag; // see above for available queries + vector CigarData; // vector of CIGAR operations (length & type) ) + unsigned int MateRefID; // ID for reference sequence that mate was aligned to + unsigned int MatePosition; // position that mate was aligned to + unsigned int InsertSize; // mate pair insert size + + + // alignment flag query constants + private: + enum { PAIRED = 1, // Alignment comes from paired-end data + PROPER_PAIR = 2, // Alignment passed paired-end resolution + UNMAPPED = 4, // Read is unmapped + MATE_UNMAPPED = 8, // Mate is unmapped + REVERSE = 16, // Read is on reverse strand + MATE_REVERSE = 32, // Mate is on reverse strand + READ_1 = 64, // This alignment is mate 1 of pair + READ_2 = 128, // This alignment is mate 2 of pair + SECONDARY = 256, // This alignment is not the primary (best) alignment for read + QC_FAILED = 512, // Read did not pass prior quality control steps + DUPLICATE = 1024 // Read is PCR duplicate + }; +}; + +// commonly used vector in this library +typedef vector< BamAlignment > BamAlignmentVector; + +#endif /* BAMALIGNMENT_H */ diff --git a/BamConversionMain.cpp b/BamConversionMain.cpp new file mode 100644 index 0000000..457e58e --- /dev/null +++ b/BamConversionMain.cpp @@ -0,0 +1,46 @@ +#include +#include "BamReader.h" +#include "BamWriter.h" + +using namespace std; + +int main(int argc, char* argv[]) { + + if(argc != 3) { + cout << "USAGE: " << argv[0] << " " << endl; + exit(1); + } + + // localize our arguments + const char* inputFilename = argv[1]; + const char* outputFilename = argv[2]; + + // open our BAM reader + BamReader reader; + reader.SetFilename(inputFilename); + + if(!reader.Open()) { + cout << "ERROR: Unable to open the BAM file (" << inputFilename << ")." << endl; + exit(1); + } + + // retrieve the SAM header text + string samHeader = reader.GetHeaderText(); + + // retrieve the reference sequence vector + RefVector referenceSequences = reader.GetReferenceData(); + + // open the BAM writer + BamWriter writer; + writer.Open(outputFilename, samHeader, referenceSequences); + + // copy all of the reads from the input file to the output file + BamAlignment al; + while(reader.GetNextAlignment(al)) writer.SaveAlignment(al); + + // close our files + reader.Close(); + writer.Close(); + + return 0; +} \ No newline at end of file diff --git a/BamReader.cpp b/BamReader.cpp new file mode 100644 index 0000000..8fba1a9 --- /dev/null +++ b/BamReader.cpp @@ -0,0 +1,653 @@ +// BamReader.cpp + +// Derek Barnett +// Marth Lab, Boston College +// Last modified: 6 April 2009 + +#include "BamReader.h" +#include +using std::cerr; +using std::endl; + +// static character constants +const char* BamReader::DNA_LOOKUP = "=ACMGRSVTWYHKDBN"; +const char* BamReader::CIGAR_LOOKUP = "MIDNSHP"; + +BamReader::BamReader(const char* filename, const char* indexFilename) + : m_filename( (char*)filename ) + , m_indexFilename( (char*)indexFilename ) + , m_file(0) + , m_index(NULL) + , m_headerText("") + , m_isOpen(false) + , m_isIndexLoaded(false) + , m_isRegionSpecified(false) + , m_isCalculateAlignedBases(true) + , m_currentRefID(0) + , m_currentLeft(0) + , m_alignmentsBeginOffset(0) +{ + Open(); +} + +BamReader::~BamReader(void) { + + // close file + if ( m_isOpen ) { Close(); } +} + +// open BAM file +bool BamReader::Open(void) { + + if (!m_isOpen && m_filename != NULL ) { + + // open file + m_file = bam_open(m_filename, "r"); + + // get header info && index info + if ( (m_file != NULL) && LoadHeader() ) { + + // save file offset where alignments start + m_alignmentsBeginOffset = bam_tell(m_file); + + // set open flag + m_isOpen = true; + } + + // try to open (and load) index data, if index file given + if ( m_indexFilename != NULL ) { + OpenIndex(); + } + } + + return m_isOpen; +} + +bool BamReader::OpenIndex(void) { + + if ( m_indexFilename && !m_isIndexLoaded ) { + m_isIndexLoaded = LoadIndex(); + } + return m_isIndexLoaded; +} + +// close BAM file +bool BamReader::Close(void) { + + if (m_isOpen) { + + // close file + int ret = bam_close(m_file); + + // delete index info + if ( m_index != NULL) { delete m_index; } + + // clear open flag + m_isOpen = false; + + // clear index flag + m_isIndexLoaded = false; + + // clear region flag + m_isRegionSpecified = false; + + // return success/fail of bam_close + return (ret == 0); + } + + return true; +} + +// get BAM filename +const char* BamReader::Filename(void) const { + return (const char*)m_filename; +} + +// set BAM filename +void BamReader::SetFilename(const char* filename) { + m_filename = (char*)filename; +} + +// get BAM Index filename +const char* BamReader::IndexFilename(void) const { + return (const char*)m_indexFilename; +} + +// set BAM Index filename +void BamReader::SetIndexFilename(const char* indexFilename) { + m_indexFilename = (char*)indexFilename; +} + +// return full header text +const string BamReader::GetHeaderText(void) const { + return m_headerText; +} + +// return number of reference sequences in BAM file +const int BamReader::GetReferenceCount(void) const { + return m_references.size(); +} + +// get RefID from reference name +const int BamReader::GetRefID(string refName) const { + + vector refNames; + RefVector::const_iterator refIter = m_references.begin(); + RefVector::const_iterator refEnd = m_references.end(); + for ( ; refIter != refEnd; ++refIter) { + refNames.push_back( (*refIter).RefName ); + } + + // return 'index-of' refName (if not found, returns refNames.size()) + return Index( refNames.begin(), refNames.end(), refName ); +} + +const RefVector BamReader::GetReferenceData(void) const { + return m_references; +} + +bool BamReader::Jump(int refID, unsigned int left) { + + // if index available, and region is valid + if ( m_isIndexLoaded && m_references.at(refID).RefHasAlignments && (left <= m_references.at(refID).RefLength) ) { + m_currentRefID = refID; + m_currentLeft = left; + m_isRegionSpecified = true; + return ( bam_seek(m_file, GetOffset(m_currentRefID, m_currentLeft), SEEK_SET) == 0 ); + } + return false; +} + +bool BamReader::Rewind(void) { + + int refID = 0; + int refCount = m_references.size(); + for ( ; refID < refCount; ++refID ) { + if ( m_references.at(refID).RefHasAlignments ) { break; } + } + + m_currentRefID = refID; + m_currentLeft = 0; + m_isRegionSpecified = false; + + return ( bam_seek(m_file, m_alignmentsBeginOffset, SEEK_SET) == 0 ); +} + +// get next alignment from specified region +bool BamReader::GetNextAlignment(BamAlignment& bAlignment) { + + // try to load 'next' read + if ( LoadNextAlignment(bAlignment) ) { + + // if specified region, check for region overlap + if ( m_isRegionSpecified ) { + + // if overlap, return true + if ( IsOverlap(bAlignment) ) { return true; } + // if not, try the next alignment + else { return GetNextAlignment(bAlignment); } + } + + // not using region, valid read detected, return success + else { return true; } + } + + // no valid alignment to load + return false; +} + +void BamReader::SetCalculateAlignedBases(bool flag) { + m_isCalculateAlignedBases = flag; +} + +int BamReader::BinsFromRegion(int refID, unsigned int left, uint16_t list[MAX_BIN]) { + + // get region boundaries + uint32_t begin = left; + uint32_t end = m_references.at(refID).RefLength - 1; + + // initialize list, bin '0' always a valid bin + int i = 0; + list[i++] = 0; + + // get rest of bins that contain this region + unsigned int k; + for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { list[i++] = k; } + for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { list[i++] = k; } + for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { list[i++] = k; } + for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { list[i++] = k; } + for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { list[i++] = k; } + + // return number of bins stored + return i; +} + +uint32_t BamReader::CalculateAlignmentEnd(const unsigned int& position, const vector& cigarData) { + + // initialize alignment end to starting position + uint32_t alignEnd = position; + + // iterate over cigar operations + vector::const_iterator cigarIter = cigarData.begin(); + vector::const_iterator cigarEnd = cigarData.end(); + for ( ; cigarIter != cigarEnd; ++cigarIter) { + if ( (*cigarIter).Type == 'M' || (*cigarIter).Type == 'D' || (*cigarIter).Type == 'N') { + alignEnd += (*cigarIter).Length; + } + } + return alignEnd; +} + +uint64_t BamReader::GetOffset(int refID, unsigned int left) { + + // make space for bins + uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2); + + // returns number of bins overlapping (left, right) + // stores indices of those bins in 'bins' + int numBins = BinsFromRegion(refID, left, bins); + + // access index data for refID + RefIndex* refIndex = m_index->at(refID); + + // get list of BAM bins for this reference sequence + BinVector* refBins = refIndex->first; + + sort( refBins->begin(), refBins->end(), LookupKeyCompare() ); + + // declare ChunkVector + ChunkVector regionChunks; + + // declaure LinearOffsetVector + LinearOffsetVector* linearOffsets = refIndex->second; + + // some sort of linear offset vs bin index voodoo... not completely sure what's going here + uint64_t minOffset = ((left>>BAM_LIDX_SHIFT) >= linearOffsets->size()) ? 0 : linearOffsets->at(left>>BAM_LIDX_SHIFT); + + BinVector::iterator binBegin = refBins->begin(); + BinVector::iterator binEnd = refBins->end(); + + // iterate over bins overlapping region, count chunks + for (int i = 0; i < numBins; ++i) { + + // look for bin with ID=bin[i] + BinVector::iterator binIter = binBegin; + + for ( ; binIter != binEnd; ++binIter ) { + + // if found, increment n_off by number of chunks for each bin + if ( (*binIter).first == (uint32_t)bins[i] ) { + + // iterate over chunks in that bin + ChunkVector* binChunks = (*binIter).second; + + ChunkVector::iterator chunkIter = binChunks->begin(); + ChunkVector::iterator chunkEnd = binChunks->end(); + for ( ; chunkIter != chunkEnd; ++chunkIter) { + + // if right bound of pair is greater than min_off (linear offset value), store pair + if ( (*chunkIter).second > minOffset) { + regionChunks.push_back( (*chunkIter) ); + } + } + } + } + } + + // clean up temp array + free(bins); + + // there should be at least 1 + assert(regionChunks.size() > 0); + + // sort chunks by start position + sort ( regionChunks.begin(), regionChunks.end(), LookupKeyCompare() ); + + // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing + int numOffsets = regionChunks.size(); + for (int i = 1; i < numOffsets; ++i) { + if ( regionChunks.at(i-1).second >= regionChunks.at(i).first ) { + regionChunks.at(i-1).second = regionChunks.at(i).first; + } + } + + // merge adjacent chunks + int l = 0; + for (int i = 1; i < numOffsets; ++i) { + // if adjacent, expand boundaries of (merged) chunk + if ( (regionChunks.at(l).second>>16) == (regionChunks.at(i).first>>16) ) { + regionChunks.at(l).second = regionChunks.at(i).second; + } + // else, move on to next (merged) chunk index + else { regionChunks.at(++l) = regionChunks.at(i); } + } + + // return beginning file offset of first chunk for region + return regionChunks.at(0).first; +} + +bool BamReader::IsOverlap(BamAlignment& bAlignment) { + + // if on different reference sequence, quit + if ( bAlignment.RefID != (unsigned int)m_currentRefID ) { return false; } + + // read starts after left boundary + if ( bAlignment.Position >= m_currentLeft) { return true; } + + // get alignment end + uint32_t alignEnd = CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData); + + // return whether alignment end overlaps left boundary + return ( alignEnd >= m_currentLeft ); +} + +bool BamReader::LoadHeader(void) { + + // check to see if proper BAM header + char buf[4]; + if (bam_read(m_file, buf, 4) != 4) { return false; } + if (strncmp(buf, "BAM\001", 4)) { + fprintf(stderr, "wrong header type!\n"); + return false; + } + + // get BAM header text length + int32_t headerTextLength; + bam_read(m_file, &headerTextLength, 4); + + // get BAM header text + char* headerText = (char*)calloc(headerTextLength + 1, 1); + bam_read(m_file, headerText, headerTextLength); + m_headerText = (string)((const char*)headerText); + + // clean up calloc-ed temp variable + free(headerText); + + // get number of reference sequences + int32_t numberRefSeqs; + bam_read(m_file, &numberRefSeqs, 4); + if (numberRefSeqs == 0) { return false; } + + m_references.reserve((int)numberRefSeqs); + + // reference variables + int32_t refNameLength; + char* refName; + uint32_t refLength; + + // iterate over all references in header + for (int i = 0; i != numberRefSeqs; ++i) { + + // get length of reference name + bam_read(m_file, &refNameLength, 4); + refName = (char*)calloc(refNameLength, 1); + + // get reference name and reference sequence length + bam_read(m_file, refName, refNameLength); + bam_read(m_file, &refLength, 4); + + // store data for reference + RefData aReference; + aReference.RefName = (string)((const char*)refName); + aReference.RefLength = refLength; + m_references.push_back(aReference); + + // clean up calloc-ed temp variable + free(refName); + } + + return true; +} + +bool BamReader::LoadIndex(void) { + + // check to see if index file exists + FILE* indexFile; + if ( ( indexFile = fopen(m_indexFilename, "r") ) == 0 ) { + fprintf(stderr, "The alignment is not indexed. Please run SAMtools \'index\' command first.\n"); + return false; + } + + // see if index is valid BAM index + char magic[4]; + fread(magic, 1, 4, indexFile); + if (strncmp(magic, "BAI\1", 4)) { + fprintf(stderr, "Problem with index - wrong \'magic\' number.\n"); + fclose(indexFile); + return false; + } + + // get number of reference sequences + uint32_t numRefSeqs; + fread(&numRefSeqs, 4, 1, indexFile); + + // intialize BamIndex data structure + m_index = new BamIndex; + m_index->reserve(numRefSeqs); + + // iterate over reference sequences + for (unsigned int i = 0; i < numRefSeqs; ++i) { + + // get number of bins for this reference sequence + int32_t numBins; + fread(&numBins, 4, 1, indexFile); + + if (numBins > 0) { m_references.at(i).RefHasAlignments = true; } + + // intialize BinVector + BinVector* bins = new BinVector; + bins->reserve(numBins); + + // iterate over bins for that reference sequence + for (int j = 0; j < numBins; ++j) { + + // get binID + uint32_t binID; + fread(&binID, 4, 1, indexFile); + + // get number of regionChunks in this bin + uint32_t numChunks; + fread(&numChunks, 4, 1, indexFile); + + // intialize ChunkVector + ChunkVector* regionChunks = new ChunkVector; + regionChunks->reserve(numChunks); + + // iterate over regionChunks in this bin + for (unsigned int k = 0; k < numChunks; ++k) { + + // get chunk boundaries (left, right) + uint64_t left; + uint64_t right; + fread(&left, 8, 1, indexFile); + fread(&right, 8, 1, indexFile); + + // save ChunkPair + regionChunks->push_back( ChunkPair(left, right) ); + } + + // save binID, chunkVector for this bin + bins->push_back( BamBin(binID, regionChunks) ); + } + + // load linear index for this reference sequence + + // get number of linear offsets + int32_t numLinearOffsets; + fread(&numLinearOffsets, 4, 1, indexFile); + + // intialize LinearOffsetVector + LinearOffsetVector* linearOffsets = new LinearOffsetVector; + linearOffsets->reserve(numLinearOffsets); + + // iterate over linear offsets for this reference sequeence + for (int j = 0; j < numLinearOffsets; ++j) { + // get a linear offset + uint64_t linearOffset; + fread(&linearOffset, 8, 1, indexFile); + // store linear offset + linearOffsets->push_back(linearOffset); + } + + // store index data for that reference sequence + m_index->push_back( new RefIndex(bins, linearOffsets) ); + } + + // close index file (.bai) and return + fclose(indexFile); + return true; +} + +bool BamReader::LoadNextAlignment(BamAlignment& bAlignment) { + + // check valid alignment block header data + int32_t block_len; + int32_t ret; + uint32_t x[8]; + + int32_t bytesRead = 0; + + // read in the 'block length' value, make sure it's not zero + if ( (ret = bam_read(m_file, &block_len, 4)) == 0 ) { return false; } + bytesRead += 4; + + // read in core alignment data, make the right size of data was read + if ( bam_read(m_file, x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; } + bytesRead += BAM_CORE_SIZE; + + // set BamAlignment 'core' data + bAlignment.RefID = x[0]; + bAlignment.Position = x[1]; + bAlignment.Bin = x[2]>>16; + bAlignment.MapQuality = x[2]>>8&0xff; + bAlignment.AlignmentFlag = x[3]>>16; + bAlignment.MateRefID = x[5]; + bAlignment.MatePosition = x[6]; + bAlignment.InsertSize = x[7]; + + // fetch & store often-used lengths for character data parsing + unsigned int queryNameLength = x[2]&0xff; + unsigned int numCigarOperations = x[3]&0xffff; + unsigned int querySequenceLength = x[4]; + + // get length of character data + int dataLength = block_len - BAM_CORE_SIZE; + + // set up destination buffers for character data + uint8_t* allCharData = (uint8_t*)calloc(sizeof(uint8_t), dataLength); + uint32_t* cigarData = (uint32_t*)(allCharData+queryNameLength); + + // get character data - make sure proper data size was read + if (bam_read(m_file, allCharData, dataLength) != dataLength) { return false; } + else { + + bytesRead += dataLength; + + // clear out bases, qualities, aligned bases, and CIGAR + bAlignment.QueryBases.clear(); + bAlignment.Qualities.clear(); + bAlignment.AlignedBases.clear(); + bAlignment.CigarData.clear(); + + // save name + bAlignment.Name = (string)((const char*)(allCharData)); + + // save bases + char singleBase[2]; + uint8_t* s = ( allCharData + (numCigarOperations*4) + queryNameLength); + for (unsigned int i = 0; i < querySequenceLength; ++i) { + // numeric to char conversion + sprintf( singleBase, "%c", DNA_LOOKUP[ ( ( s[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ] ); + // append character data to Bases + bAlignment.QueryBases.append( (const char*)singleBase ); + } + + // save sequence length + bAlignment.Length = bAlignment.QueryBases.length(); + + // save qualities + char singleQuality[4]; + uint8_t* t = ( allCharData + (numCigarOperations*4) + queryNameLength + (querySequenceLength + 1)/2); + for (unsigned int i = 0; i < querySequenceLength; ++i) { + // numeric to char conversion + sprintf( singleQuality, "%c", ( t[i]+33 ) ); + // append character data to Qualities + bAlignment.Qualities.append( (const char*)singleQuality ); + } + + // save CIGAR-related data; + int k = 0; + for (unsigned int i = 0; i < numCigarOperations; ++i) { + + // build CigarOp struct + CigarOp op; + op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT); + op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ]; + + // save CigarOp + bAlignment.CigarData.push_back(op); + + // can skip this step if user wants to ignore + if (m_isCalculateAlignedBases) { + + // build AlignedBases string + switch (op.Type) { + + case ('M') : + case ('I') : bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) ); // for 'M', 'I' - write bases + case ('S') : k += op.Length; // for 'S' - skip over query bases + break; + + case ('D') : + case ('P') : bAlignment.AlignedBases.append( op.Length, '*' ); // for 'D', 'P' - write padding; + break; + + case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in query sequence + k += op.Length; + break; + + case ('H') : break; // for 'H' - do nothing, move to next op + + default : assert(false); // shouldn't get here + break; + } + } + } + } + free(allCharData); + +/* + // (optional) read tag parsing data + string tag; + char data; + int i = 0; + + // still data to read (auxiliary tags) + while ( bytesRead < block_len ) { + + if ( bam_read(m_file, &data, 1) == 1 ) { + + ++bytesRead; + + if (bytesRead == block_len && data != '\0') { + fprintf(stderr, "ERROR: Invalid termination of tag info at end of alignment block.\n"); + exit(1); + } + + tag.append(1, data); + if ( data == '\0' ) { + bAlignment.Tags.push_back(tag); + tag = ""; + i = 0; + } else { + if ( (i == 1) && (i == 2) ) { tag.append(1, ':'); } + ++i; + } + } else { + fprintf(stderr, "ERROR: Could not read tag info.\n"); + exit(1); + } + } +*/ + return true; +} diff --git a/BamReader.h b/BamReader.h new file mode 100644 index 0000000..786f4c2 --- /dev/null +++ b/BamReader.h @@ -0,0 +1,230 @@ +// BamReader.h + +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* + Implementation of BAM-parsing was translated to C++ directly from Heng Li's SAMtools package + (thus the carryover of above MIT license) + Contact: Derek Barnett +*/ + +// Derek Barnett +// Marth Lab, Boston College +// Last modified: 6 April 2009 + +#ifndef BAMREADER_H +#define BAMREADER_H + +// custom includes +#include "BamAlignment.h" +#include "STLUtilities.h" + +// C library includes +#include +#include +#include +#include +#include + +// BGZF library includes/defines +#include "bgzf.h" +typedef BGZF* BamFile; +#define bam_open(f_name, mode) bgzf_open(f_name, mode) +#define bam_close(f_ptr) bgzf_close(f_ptr) +#define bam_read(f_ptr, buf, size) bgzf_read(f_ptr, buf, size) +#define bam_write(f_ptr, buf, size) bgzf_write(f_ptr, buf, size) +#define bam_tell(f_ptr) bgzf_tell(f_ptr) +#define bam_seek(f_ptr, pos, dir) bgzf_seek(f_ptr, pos, dir) + +// size of alignment data block in BAM file (bytes) +#define BAM_CORE_SIZE 32 + +// BAM indexing constants +#define MAX_BIN 37450 // =(8^6-1)/7+1 +#define BAM_MIN_CHUNK_GAP 32768 +#define BAM_LIDX_SHIFT 14 + +// CIGAR-retrieval mask/shift constants +#define BAM_CIGAR_SHIFT 4 +#define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1) + +// CIGAR-operation types +#define BAM_CMATCH 0 +#define BAM_CINS 1 +#define BAM_CDEL 2 +#define BAM_CREF_SKIP 3 +#define BAM_CSOFT_CLIP 4 +#define BAM_CHARD_CLIP 5 +#define BAM_CPAD 6 + +// --------------------------- // +// Bam header info +// --------------------------- // + +// --------------------------- // +// BamIndex-related typedefs +// --------------------------- // + +// offset for linear indexing +typedef vector LinearOffsetVector; + +// chunk boundaries +typedef pair ChunkPair; +// list of chunks in a BAM bin +typedef vector ChunkVector; + +// BAM bins for a reference sequence +// replaces khash - uint32_t is key, ChunkVector is value +typedef pair BamBin; +typedef vector BinVector; + +// each reference sequence has a BinVector and LinearOffsetVector +typedef pair RefIndex; + +// full BamIndex defined as: +typedef vector BamIndex; + +// ---------------------------------------------------------------------------// + +class BamReader { + + public: + // constructors + BamReader(const char* fileName = NULL, const char* indexFilename = NULL); + + public: + // destructor + ~BamReader(void); + + // BAM interface methods + public: + + // ----------------------- // + // File manipulation + // ----------------------- // + + // open BAM file (automatically opens index if provided) + bool Open(void); + + // open BAM index (allows index to be opened separately - i.e. sometime after the BAM file is opened) + bool OpenIndex(void); + + // close BAM file + bool Close(void); + + // get BAM filename + const char* Filename(void) const; + + // set BAM filename + void SetFilename(const char*); + + // get BAM Index filename + const char* IndexFilename(void) const; + + // set BAM Index filename + void SetIndexFilename(const char*); + + // ----------------------- // + // Access BAM header + // ----------------------- // + + // return full header text + const string GetHeaderText(void) const; + + // --------------------------------- // + // Access reference sequence info + // --------------------------------- // + + // return number of reference sequences in BAM file + const int GetReferenceCount(void) const; + + // return vector of RefData entries + const RefVector GetReferenceData(void) const; + + // get refID from reference name + const int GetRefID(string refName) const; + + // ----------------------------------------- // + // File position moving + // ----------------------------------------- // + + // jumps to 'left' position on refID + // actually jumps before position, so reads that overlap 'left' are included as well + // 'left' defaults to reference begin if not specified + bool Jump(int refID, unsigned int left = 0); + + // Jump to beginning of BAM file, clears any region previously set by Jump() + bool Rewind(void); + + // ------------------------------ // + // Access alignments + // ------------------------------ // + + // get next alignment + bool GetNextAlignment(BamAlignment& read); + + // allow user to specifiy whether 'AlignedBases' string is calculated when alignment is loaded + void SetCalculateAlignedBases(bool); + + // internal utility methods + private: + int BinsFromRegion(int, unsigned int, uint16_t[MAX_BIN]); + uint32_t CalculateAlignmentEnd(const unsigned int&, const vector&); + uint64_t GetOffset(int, unsigned int); + bool IsOverlap(BamAlignment&); + bool LoadHeader(void); + bool LoadIndex(void); + bool LoadNextAlignment(BamAlignment&); + + private: + // main BAM reader components + char* m_filename; + char* m_indexFilename; + BamFile m_file; + BamIndex* m_index; + RefVector m_references; + string m_headerText; + + // state flags + bool m_isOpen; // BAM file is open for processing + bool m_isIndexLoaded; // BAM Index data is loaded and available for processing + bool m_isRegionSpecified; // a region has been specified - specifically, a user has called Jump() + bool m_isCalculateAlignedBases; // build 'AlignedBases' string when getting an alignment, otherwise skip (default = true) + + // region values + int m_currentRefID; + unsigned int m_currentLeft; + + // file offset of 1st read in BAM file + int64_t m_alignmentsBeginOffset; + + private: + // BAM character constants + static const char* DNA_LOOKUP; + static const char* CIGAR_LOOKUP; +}; + +#endif /* BAMREADER_H */ diff --git a/BamReaderMain.cpp b/BamReaderMain.cpp new file mode 100644 index 0000000..bdf4e60 --- /dev/null +++ b/BamReaderMain.cpp @@ -0,0 +1,255 @@ +// BamReaderMain.cpp + +// Derek Barnett +// Marth Lab, Boston College +// Last modified: 6 April 2009 + +#include "BamReader.h" + +// C++ includes +#include +using std::cerr; +using std::cout; +using std::endl; + +#include +using std::vector; + +#include +using std::string; + +int main(int argc, char* argv[]) { + + // check command line parameters + if (argc != 3) { + cerr << endl; + cerr << "Usage: BamReaderTest " << endl; + cerr << endl; + exit(1); + } + + // get filenames from command line + const char* bamFilename = argv[1]; + const char* bamIndexFilename = argv[2]; + + string refName; + int refID; + unsigned int leftBound; + unsigned int rightBound; + + int alignmentCount; + + vector refNames; + + BamAlignment bAlignment; + BamAlignmentVector alignments; + + RefVector references; + + // ------------------------------------------------------------------------------------------------------ // + // Declare BamReader & open BAM file - automatically loads up header and index file (.bai) information + // ------------------------------------------------------------------------------------------------------ // + + BamReader bReader(bamFilename, bamIndexFilename); + + cerr << endl; + cerr << "Opening BamReader for BAM file: " << bamFilename << " ..... "; + + if ( bReader.Open() ) { + cerr << "ok" << endl; + } + else { + cerr << "error" << endl; + exit(1); + } + + // ------------------------------------------------------------ // + // Find out how many reference sequences are in BAM file + // ------------------------------------------------------------ // + + references = bReader.GetReferenceData(); + + cerr << endl; + cerr << "Total number of ref seqs: " << references.size() << endl; + + // ---------------------------------------------------------------------------- // + // Get the names/lengths of all the reference sequences that have alignments + // ---------------------------------------------------------------------------- // + + cerr << endl; + cerr << "All ref sequences with alignments:" << endl; + cerr << endl; + + + if ( !references.empty() ) { + + RefVector::iterator refIter = references.begin(); + RefVector::iterator refEnd = references.end(); + + refID = 0; + // iterate over reference names, print to STDERR + for( ; refIter != refEnd; ++refIter) { + + if ( (*refIter).RefHasAlignments ) { + cerr << "ID: " << refID << endl; + cerr << "Name: " << (*refIter).RefName << endl; + cerr << "Length: " << (*refIter).RefLength << endl; + cerr << endl; + } + + ++refID; + } + } + + // --------------------------------------------------------------------------------------------- // + // Get the SAM-style header text, if available (not available in the example file shown here) + // --------------------------------------------------------------------------------------------- // + + cerr << "------------------------------------" << endl; + cerr << "SAM header text: " << endl; + + // get (SAM) header text + string header = bReader.GetHeaderText(); + + cerr << ( (header.empty()) ? "no header data" : header ) << endl; + cerr << "------------------------------------" << endl; + + // --------------------------------------------------------------------------------------------- // + // Here we start accessing alignments + // The first method shows how to iterate over all reads in a BamFile + // This method will work on any BAM file (sorted/non-sorted, with/without an index) + // --------------------------------------------------------------------------------------------- // + + // Call Rewind() to make sure you're at the 1st alignment in the file + // Please note, however, it's not necessary in this case, since file pointer initially set to 1st alignment + // but this is probably a good habit to develop to ensure correctness and make your intent obvious in the code + + cerr << "Getting (up to) first 1000 alignments" << endl; + + // start at 1st alignment + if ( bReader.Rewind() ) { + + alignmentCount = 0; + while ( bReader.GetNextAlignment(bAlignment) && (alignmentCount < 1000) ) { + + // disregard unmapped alignments + if ( bAlignment.IsMapped() ) { + + ++alignmentCount; + + cout << "----------------------------" << endl; + cout << "Alignment " << alignmentCount << endl; + cout << bAlignment.Name << endl; + cout << bAlignment.AlignedBases << endl; + cout << "Aligned to " << references.at(bAlignment.RefID).RefName << ":" << bAlignment.Position << endl; + + cout << "Cigar Data: " << endl; + + vector::const_iterator cigarIter = bAlignment.CigarData.begin(); + vector::const_iterator cigarEnd = bAlignment.CigarData.end(); + for ( ; cigarIter != cigarEnd; ++cigarIter ) { + cout << "Type: " << (*cigarIter).Type << "\tLength: " << (*cigarIter).Length << endl; + } + + cout << "Tags: " << endl; + vector::const_iterator tagIter = bAlignment.Tags.begin(); + vector::const_iterator tagEnd = bAlignment.Tags.end(); + for ( ; tagIter != tagEnd; ++tagIter ) { + cout << (*tagIter) << endl; + } + } + } + + cerr << "Found " << alignmentCount << " alignments." << endl; + } else { cerr << "Could not rewind" << endl; } + + // ---------------------------------------------------------------------------------------------------------- // + // You can iterate over each individual alignment that overlaps a specified region + // Set the refID & left boundary parameters using Jump(refID, left) + // Jump() actually positions the file pointer actually before the left boundary + // This ensures that reads beginning before, but overlapping 'left' are included as well + // Client is responsible for setting/checking right boundary - see below + // ---------------------------------------------------------------------------------------------------------- // +/* + cerr << "Jumping to region" << endl; + + // get refID using a known refName + refName = "seq1"; + refID = bReader.GetRefID(refName); + if (refID == (int)references.size()) { + cerr << "Reference " << refName << " not found." << endl; + exit(1); + } + + // set left boundary + leftBound = 500; + + // set right boundary - either user-specified number to set a discrete region + // OR you can query the BamReader for the end of the reference + rightBound = references.at(refID).RefLength; + + cerr << endl; + cerr << "Iterating over alignments on reference: " << refName << " from " << leftBound << " to ref end (" << rightBound << ")" << endl; + + // set region - specific region on reference + if ( bReader.Jump(refID, leftBound) ) { + + alignmentCount = 0; + while ( bReader.GetNextAlignment(bAlignment) && (bAlignment.Position <= rightBound) ) { + + if ( bAlignment.IsMapped() ) { + + ++alignmentCount; + + if ( (alignmentCount % 100000) == 0) { cerr << "Retrieved " << alignmentCount << " so far..." << endl; } + + cout << "----------------------------" << endl; + cout << "Alignment " << alignmentCount << endl; + cout << bAlignment.Name << endl; + cout << bAlignment.AlignedBases << endl; + cout << "Aligned to " << references.at(bAlignment.RefID).RefName << ":" << bAlignment.Position << endl; + } + } + + cerr << "Found " << alignmentCount << " alignments." << endl; + } else { cerr << "Could not jump to region specified" << endl; } + + // ----------------------------------------------------------------------------------------------------- // + // You can 'rewind' back to beginning of BAM file at any time + // ----------------------------------------------------------------------------------------------------- // + + cerr << endl; + cerr << "Rewinding BAM file... then getting first 1000 alignments" << endl; + + alignmentCount = 0; + if (bReader.Rewind() ) { + while ( bReader.GetNextAlignment(bAlignment) && (alignmentCount < 1000) ) { + + // disregard unmapped alignments + if ( bAlignment.IsMapped() ) { + + ++alignmentCount; + + cout << "----------------------------" << endl; + cout << "Alignment " << alignmentCount << endl; + cout << bAlignment.Name << endl; + cout << bAlignment.AlignedBases << endl; + cout << "Aligned to " << references.at(bAlignment.RefID).RefName << ":" << bAlignment.Position << endl; + } + } + + cerr << "Found " << alignmentCount << " alignments." << endl; + } else { cerr << "Could not rewind" << endl; } +*/ + // ---------------------------------------------------------------------- // + // Close BamReader object (releases internal header/index data) and exit + // ---------------------------------------------------------------------- // + + cerr << endl; + cerr << "Closing BAM file: " << bamFilename << endl; + + bReader.Close(); + + cerr << "Exiting..." << endl << endl; + return 0; +} diff --git a/BamWriter.cpp b/BamWriter.cpp new file mode 100644 index 0000000..24544b3 --- /dev/null +++ b/BamWriter.cpp @@ -0,0 +1,403 @@ +// *************************************************************************** +// BamWriter (c) 2009 Michael Strömberg +// Marth Lab, Deptartment of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Provides the basic functionality for producing BAM files +// *************************************************************************** + +#include "BamWriter.h" + +// constructor +BamWriter::BamWriter(void) +{} + +// destructor +BamWriter::~BamWriter(void) { + if(mBGZF.IsOpen) BgzfClose(); +} + +// closes the BAM file +void BamWriter::BgzfClose(void) { + + mBGZF.IsOpen = false; + + // flush the BGZF block + BgzfFlushBlock(); + + // flush and close + fflush(mBGZF.Stream); + fclose(mBGZF.Stream); +} + +// compresses the current block +int BamWriter::BgzfDeflateBlock(void) { + + // initialize the gzip header + char* buffer = mBGZF.CompressedBlock; + unsigned int bufferSize = mBGZF.CompressedBlockSize; + + memset(buffer, 0, 18); + buffer[0] = GZIP_ID1; + buffer[1] = (char)GZIP_ID2; + buffer[2] = CM_DEFLATE; + buffer[3] = FLG_FEXTRA; + buffer[9] = (char)OS_UNKNOWN; + buffer[10] = BGZF_XLEN; + buffer[12] = BGZF_ID1; + buffer[13] = BGZF_ID2; + buffer[14] = BGZF_LEN; + + // loop to retry for blocks that do not compress enough + int inputLength = mBGZF.BlockOffset; + int compressedLength = 0; + + while(true) { + + z_stream zs; + zs.zalloc = NULL; + zs.zfree = NULL; + zs.next_in = (Bytef*)mBGZF.UncompressedBlock; + zs.avail_in = inputLength; + zs.next_out = (Bytef*)&buffer[BLOCK_HEADER_LENGTH]; + zs.avail_out = bufferSize - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH; + + // initialize the zlib compression algorithm + if(deflateInit2(&zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK) { + printf("ERROR: zlib deflate initialization failed.\n"); + exit(1); + } + + // compress the data + int status = deflate(&zs, Z_FINISH); + if(status != Z_STREAM_END) { + deflateEnd(&zs); + + // reduce the input length and try again + if(status == Z_OK) { + inputLength -= 1024; + if(inputLength < 0) { + printf("ERROR: input reduction failed.\n"); + exit(1); + } + continue; + } + + printf("ERROR: zlib deflate failed.\n"); + exit(1); + } + + // finalize the compression routine + if(deflateEnd(&zs) != Z_OK) { + printf("ERROR: deflate end failed.\n"); + exit(1); + } + + compressedLength = zs.total_out; + compressedLength += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH; + + if(compressedLength > MAX_BLOCK_SIZE) { + printf("ERROR: deflate overflow.\n"); + exit(1); + } + + break; + } + + // store the compressed length + BgzfPackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1)); + + // store the CRC32 checksum + unsigned int crc = crc32(0, NULL, 0); + crc = crc32(crc, (Bytef*)mBGZF.UncompressedBlock, inputLength); + BgzfPackUnsignedInt(&buffer[compressedLength - 8], crc); + BgzfPackUnsignedInt(&buffer[compressedLength - 4], inputLength); + + // ensure that we have less than a block of data left + int remaining = mBGZF.BlockOffset - inputLength; + if(remaining > 0) { + if(remaining > inputLength) { + printf("ERROR: remainder too large.\n"); + exit(1); + } + + memcpy(mBGZF.UncompressedBlock, mBGZF.UncompressedBlock + inputLength, remaining); + } + + mBGZF.BlockOffset = remaining; + return compressedLength; +} + +// flushes the data in the BGZF block +void BamWriter::BgzfFlushBlock(void) { + + // flush all of the remaining blocks + while(mBGZF.BlockOffset > 0) { + + // compress the data block + int blockLength = BgzfDeflateBlock(); + + // flush the data to our output stream + int numBytesWritten = fwrite(mBGZF.CompressedBlock, 1, blockLength, mBGZF.Stream); + + if(numBytesWritten != blockLength) { + printf("ERROR: Expected to write %u bytes during flushing, but wrote %u bytes.\n", blockLength, numBytesWritten); + exit(1); + } + + mBGZF.BlockAddress += blockLength; + } +} + +// opens the BAM file for writing +void BamWriter::BgzfOpen(const string& filename) { + + mBGZF.Stream = fopen(filename.c_str(), "wb"); + + if(!mBGZF.Stream) { + printf("ERROR: Unable to open the BAM file (%s) for writing.\n", filename.c_str()); + exit(1); + } + + mBGZF.IsOpen = true; +} + +// writes the supplied data into the BGZF buffer +unsigned int BamWriter::BgzfWrite(const char* data, const unsigned int dataLen) { + + // initialize + unsigned int numBytesWritten = 0; + const char* input = data; + unsigned int blockLength = mBGZF.UncompressedBlockSize; + + // copy the data to the buffer + while(numBytesWritten < dataLen) { + unsigned int copyLength = min(blockLength - mBGZF.BlockOffset, dataLen - numBytesWritten); + char* buffer = mBGZF.UncompressedBlock; + memcpy(buffer + mBGZF.BlockOffset, input, copyLength); + + mBGZF.BlockOffset += copyLength; + input += copyLength; + numBytesWritten += copyLength; + + if(mBGZF.BlockOffset == blockLength) BgzfFlushBlock(); + } + + return numBytesWritten; +} + +// closes the alignment archive +void BamWriter::Close(void) { + if(mBGZF.IsOpen) BgzfClose(); +} + +// creates a cigar string from the supplied alignment +void BamWriter::CreatePackedCigar(const vector& cigarOperations, string& packedCigar) { + + // initialize + const unsigned int numCigarOperations = cigarOperations.size(); + packedCigar.resize(numCigarOperations * SIZEOF_INT); + + // pack the cigar data into the string + unsigned int* pPackedCigar = (unsigned int*)packedCigar.data(); + + unsigned int cigarOp; + vector::const_iterator coIter; + for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); coIter++) { + + switch(coIter->Type) { + case 'M': + cigarOp = BAM_CMATCH; + break; + case 'I': + cigarOp = BAM_CINS; + break; + case 'D': + cigarOp = BAM_CDEL; + break; + case 'N': + cigarOp = BAM_CREF_SKIP; + break; + case 'S': + cigarOp = BAM_CSOFT_CLIP; + break; + case 'H': + cigarOp = BAM_CHARD_CLIP; + break; + case 'P': + cigarOp = BAM_CPAD; + break; + default: + printf("ERROR: Unknown cigar operation found: %c\n", coIter->Type); + exit(1); + } + + *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp; + pPackedCigar++; + } +} + +// encodes the supplied query sequence into 4-bit notation +void BamWriter::EncodeQuerySequence(const string& query, string& encodedQuery) { + + // prepare the encoded query string + const unsigned int queryLen = query.size(); + const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5); + encodedQuery.resize(encodedQueryLen); + char* pEncodedQuery = (char*)encodedQuery.data(); + const char* pQuery = (const char*)query.data(); + + unsigned char nucleotideCode; + bool useHighWord = true; + + while(*pQuery) { + + switch(*pQuery) { + case '=': + nucleotideCode = 0; + break; + case 'A': + nucleotideCode = 1; + break; + case 'C': + nucleotideCode = 2; + break; + case 'G': + nucleotideCode = 4; + break; + case 'T': + nucleotideCode = 8; + break; + case 'N': + nucleotideCode = 15; + break; + default: + printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery); + exit(1); + } + + // pack the nucleotide code + if(useHighWord) { + *pEncodedQuery = nucleotideCode << 4; + useHighWord = false; + } else { + *pEncodedQuery |= nucleotideCode; + pEncodedQuery++; + useHighWord = true; + } + + // increment the query position + pQuery++; + } +} + +// opens the alignment archive +void BamWriter::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences) { + + // open the BGZF file for writing + BgzfOpen(filename); + + // ================ + // write the header + // ================ + + // write the BAM signature + const unsigned char SIGNATURE_LENGTH = 4; + const char* BAM_SIGNATURE = "BAM\1"; + BgzfWrite(BAM_SIGNATURE, SIGNATURE_LENGTH); + + // write the SAM header text length + const unsigned int samHeaderLen = samHeader.size(); + BgzfWrite((char*)&samHeaderLen, SIZEOF_INT); + + // write the SAM header text + if(samHeaderLen > 0) BgzfWrite(samHeader.data(), samHeaderLen); + + // write the number of reference sequences + const unsigned int numReferenceSequences = referenceSequences.size(); + BgzfWrite((char*)&numReferenceSequences, SIZEOF_INT); + + // ============================= + // write the sequence dictionary + // ============================= + + RefVector::const_iterator rsIter; + for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) { + + // write the reference sequence name length + const unsigned int referenceSequenceNameLen = rsIter->RefName.size() + 1; + BgzfWrite((char*)&referenceSequenceNameLen, SIZEOF_INT); + + // write the reference sequence name + BgzfWrite(rsIter->RefName.c_str(), referenceSequenceNameLen); + + // write the reference sequence length + BgzfWrite((char*)&rsIter->RefLength, SIZEOF_INT); + } +} + +// saves the alignment to the alignment archive +void BamWriter::SaveAlignment(const BamAlignment& al) { + + // initialize + const unsigned int nameLen = al.Name.size() + 1; + const unsigned int queryLen = al.QueryBases.size(); + const unsigned int numCigarOperations = al.CigarData.size(); + + // create our packed cigar string + string packedCigar; + CreatePackedCigar(al.CigarData, packedCigar); + const unsigned int packedCigarLen = packedCigar.size(); + + // encode the query + string encodedQuery; + EncodeQuerySequence(al.QueryBases, encodedQuery); + const unsigned int encodedQueryLen = encodedQuery.size(); + + // create our read group tag + // TODO: add read group tag support when it shows up in the BamAlignment struct + //string readGroupTag; + //const unsigned int readGroupTagLen = 3 + mReadGroupLUT[al.ReadGroupCode].NameLen + 1; + //readGroupTag.resize(readGroupTagLen); + //char* pReadGroupTag = (char*)readGroupTag.data(); + //sprintf(pReadGroupTag, "RGZ%s", mReadGroupLUT[al.ReadGroupCode].Name); + + // assign the BAM core data + unsigned int buffer[8]; + buffer[0] = al.RefID; + buffer[1] = al.Position; + buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | nameLen; + buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations; + buffer[4] = queryLen; + buffer[5] = al.MateRefID; + buffer[6] = al.MatePosition; + buffer[7] = al.InsertSize; + + // write the block size + const unsigned int dataBlockSize = nameLen + packedCigarLen + encodedQueryLen + queryLen /* + readGroupTagLen*/; + const unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize; + + BgzfWrite((char*)&blockSize, SIZEOF_INT); + + // write the BAM core + BgzfWrite((char*)&buffer, BAM_CORE_SIZE); + + // write the query name + BgzfWrite(al.Name.c_str(), nameLen); + + // write the packed cigar + BgzfWrite(packedCigar.data(), packedCigarLen); + + // write the encoded query sequence + BgzfWrite(encodedQuery.data(), encodedQueryLen); + + // write the base qualities + string baseQualities = al.Qualities; + char* pBaseQualities = (char*)al.Qualities.data(); + for(unsigned int i = 0; i < queryLen; i++) pBaseQualities[i] -= 33; + BgzfWrite(pBaseQualities, queryLen); + + // write the read group tag + // TODO: add read group tag support when it shows up in the BamAlignment struct + //BgzfWrite(readGroupTag.c_str(), readGroupTagLen); +} diff --git a/BamWriter.h b/BamWriter.h new file mode 100644 index 0000000..422d1f9 --- /dev/null +++ b/BamWriter.h @@ -0,0 +1,145 @@ +// *************************************************************************** +// BamWriter (c) 2009 Michael Strömberg +// Marth Lab, Deptartment of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Provides the basic functionality for producing BAM files +// *************************************************************************** + +#pragma once + +#include +#include +#include +#include +#include +#include +#include "BamAlignment.h" + +using namespace std; + +// our zlib constants +#define GZIP_ID1 31 +#define GZIP_ID2 139 +#define CM_DEFLATE 8 +#define FLG_FEXTRA 4 +#define OS_UNKNOWN 255 +#define BGZF_XLEN 6 +#define BGZF_ID1 66 +#define BGZF_ID2 67 +#define BGZF_LEN 2 +#define GZIP_WINDOW_BITS -15 +#define Z_DEFAULT_MEM_LEVEL 8 + +// our BZGF constants +#define BLOCK_HEADER_LENGTH 18 +#define BLOCK_FOOTER_LENGTH 8 +#define MAX_BLOCK_SIZE 65536 +#define DEFAULT_BLOCK_SIZE 65536 + +// our BAM constants +#define BAM_CORE_SIZE 32 +#define BAM_CMATCH 0 +#define BAM_CINS 1 +#define BAM_CDEL 2 +#define BAM_CREF_SKIP 3 +#define BAM_CSOFT_CLIP 4 +#define BAM_CHARD_CLIP 5 +#define BAM_CPAD 6 +#define BAM_CIGAR_SHIFT 4 + +#define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1) + +// our variable sizes +#define SIZEOF_INT 4 + +// define our BZGF structure +struct BgzfData { + unsigned int UncompressedBlockSize; + unsigned int CompressedBlockSize; + unsigned int BlockLength; + unsigned int BlockOffset; + uint64_t BlockAddress; + bool IsOpen; + FILE* Stream; + char* UncompressedBlock; + char* CompressedBlock; + + // constructor + BgzfData(void) + : UncompressedBlockSize(DEFAULT_BLOCK_SIZE) + , CompressedBlockSize(MAX_BLOCK_SIZE) + , BlockLength(0) + , BlockOffset(0) + , BlockAddress(0) + , IsOpen(false) + , Stream(NULL) + , UncompressedBlock(NULL) + , CompressedBlock(NULL) + { + try { + CompressedBlock = new char[CompressedBlockSize]; + UncompressedBlock = new char[UncompressedBlockSize]; + } catch(bad_alloc&) { + printf("ERROR: Unable to allocate memory for our BGZF object.\n"); + exit(1); + } + } + + // destructor + ~BgzfData(void) { + if(CompressedBlock) delete [] CompressedBlock; + if(UncompressedBlock) delete [] UncompressedBlock; + } +}; + +class BamWriter { +public: + // constructor + BamWriter(void); + // destructor + ~BamWriter(void); + // closes the alignment archive + void Close(void); + // opens the alignment archive + void Open(const string& filename, const string& samHeader, const RefVector& referenceSequences); + // saves the alignment to the alignment archive + void SaveAlignment(const BamAlignment& al); +private: + // closes the BAM file + void BgzfClose(void); + // compresses the current block + int BgzfDeflateBlock(void); + // flushes the data in the BGZF block + void BgzfFlushBlock(void); + // opens the BAM file for writing + void BgzfOpen(const string& filename); + // packs an unsigned integer into the specified buffer + static inline void BgzfPackUnsignedInt(char* buffer, unsigned int value); + // packs an unsigned short into the specified buffer + static inline void BgzfPackUnsignedShort(char* buffer, unsigned short value); + // writes the supplied data into the BGZF buffer + unsigned int BgzfWrite(const char* data, const unsigned int dataLen); + // calculates the minimum bin that contains a region [begin, end) + static inline unsigned int CalculateMinimumBin(unsigned int begin, unsigned int end); + // creates a packed cigar string from the supplied alignment + static void CreatePackedCigar(const vector& cigarOperations, string& packedCigar); + // encodes the supplied query sequence into 4-bit notation + static void EncodeQuerySequence(const string& query, string& encodedQuery); + // our BGZF output object + BgzfData mBGZF; +}; + +// packs an unsigned integer into the specified buffer +inline void BamWriter::BgzfPackUnsignedInt(char* buffer, unsigned int value) { + buffer[0] = (char)value; + buffer[1] = (char)(value >> 8); + buffer[2] = (char)(value >> 16); + buffer[3] = (char)(value >> 24); +} + +// packs an unsigned short into the specified buffer +inline void BamWriter::BgzfPackUnsignedShort(char* buffer, unsigned short value) { + buffer[0] = (char)value; + buffer[1] = (char)(value >> 8); +} diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..41ba820 --- /dev/null +++ b/Makefile @@ -0,0 +1,35 @@ +CC= gcc +CXX= g++ +CFLAGS= -Wall -pg -O3 -m64 +CXXFLAGS= $(CFLAGS) +DFLAGS= -D_IOLIB=2 #-D_NDEBUG +OBJS= BamReader.o bgzf.o +PROG= BamReaderTest +INCLUDES= +ARFLAGS= -crs +LIBS= -lz +SUBDIRS= . + +.SUFFIXES:.c .cpp .o + +.c.o: + $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ + +.cpp.o: + $(CXX) -c $(CXXFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ + +all: $(PROG) BamConversion + +lib:libbambc.a + +libbambc.a:$(OBJS) + $(AR) $(ARFLAGS) $@ $(OBJS) + +BamReaderTest:lib BamReaderMain.o + $(CXX) $(CXXFLAGS) -o $@ BamReaderMain.o $(LIBS) -L. -lbambc + +BamConversion: lib BamWriter.o BamConversionMain.o + $(CXX) $(CXXFLAGS) -o $@ BamWriter.o BamConversionMain.o $(LIBS) -L. -lbambc + +clean: + rm -fr gmon.out *.o *.a a.out $(PROG) *~ diff --git a/README b/README new file mode 100644 index 0000000..639bfd1 --- /dev/null +++ b/README @@ -0,0 +1,11 @@ +--------------- +README +---------------- + +Copy the 4 header files (*.h) and the library file (libbambc.a) to your directory. + +When you compile your program, add this to the g++ command: + + -lz -L. -lbambc + +and you should be good to go. (Those are lower case l's, as in 'lion'.) diff --git a/STLUtilities.h b/STLUtilities.h new file mode 100644 index 0000000..7a9fbc5 --- /dev/null +++ b/STLUtilities.h @@ -0,0 +1,146 @@ +// STLUtilities.h + +// Derek Barnett +// Marth Lab, Boston College +// Last modified: 19 March 2009 + +#ifndef STL_UTILITIES_H +#define STL_UTILITIES_H + +#include +using std::distance; +using std::find; +using std::transform; + +#include +using std::unary_function; + +#include +using std::iterator_traits; + +#include +using std::string; + +#include +using std::pair; + +#include +using std::vector; + +// -------------------------------------------------------------------------------- // +// This file contains a sample of STL tricks I've gathered along the way. +// +// Many thanks throughout to 'Effective' series by Scott Meyers. +// Some code is lifted (almost) verbatim from his texts where applicable. +// ------------------------------------------------------------------------------- // + +// --------------------------------------------------------------------------------------------------------------------------------- // +// STL containers will delete the values they hold when the container is deleted or goes out of scope +// *** If the container contains pointers, however, they WILL NOT delete the actual pointed-to objects *** +// This struct allows for easy algorithm processing (instead of pesky loops and iterator-boundary issues) to delete pointed-to objects (for any C++ type) that have been allocated with 'new') +// Usage example: for_each( container.begin(), container.end(), DeleteObject() ) +// +// Unless of course, you like quirky, hard-to-spot memory leaks... then feel free to disregard this little STL lesson =) +// --------------------------------------------------------------------------------------------------------------------------------- // +struct DeleteObject { + template + void operator() (const T* p) const { + delete p; + } +}; + +template +void ClearPointerVector(vector& v) { + if ( !v.empty() ) { + for_each(v.begin(), v.end(), DeleteObject()); + v.clear(); + } +} + +// --------------------------------------------------------------------------------------------------------------------------------- // +// Query a vector (other containers? havent tried) for an element +// Returns the index of that element +// Returns vector::size() if not found +// Works with reverse iterators as well... index is counted backward from last element though +// --------------------------------------------------------------------------------------------------------------------------------- // +template < typename InputIterator, typename EqualityComparable > +typename iterator_traits::difference_type +Index(const InputIterator& begin, const InputIterator& end, const EqualityComparable& item) { + return distance(begin, find(begin, end, item)); +} + +// ----------------------------------------------------------------------------------------------------------------------// +// This next section is a sort of work-around for the bulky associative maps +// The idea is to use a *SORTED* vector of pair objects as a lookup vector +// LookupVector = vector< pair > +// ----------------------------------------------------------------------------------------------------------------------// + +// The following template classes allow a templatized comparison function for sorting & searching lookup vector + +// allows sorting by Key ( less ) +template +class LookupKeyCompare { + + typedef pair< Key, Value > LookupData; + typedef typename LookupData::first_type Key_t; + + public: + bool operator() (const LookupData& lhs, const LookupData& rhs) const { return keyLess(lhs.first, rhs.first); } + bool operator() (const LookupData& lhs, const Key_t& k) const { return keyLess(lhs.first, k); } + bool operator() (const Key_t& k, const LookupData& rhs) const { return keyLess(k, rhs.first); } + private: + bool keyLess(const Key_t& k1, const Key_t& k2) const { return k1 < k2; } +}; + +// allows sorting by Value ( less ) +template +class LookupValueCompare { + + typedef pair< Key, Value > LookupData; + typedef typename LookupData::second_type Value_t; + + public: + bool operator() (const LookupData& lhs, const LookupData& rhs) const { return valueLess(lhs.second, rhs.second); } + bool operator() (const LookupData& lhs, const Value_t& k) const { return valueLess(lhs.second, k); } + bool operator() (const Value_t& k, const LookupData& rhs) const { return valueLess(k, rhs.second); } + private: + bool valueLess(const Value_t& k1, const Value_t& k2) const { return k1 < k2; } +}; + +// The following template functions/structs allow you to pull all keys or all values from this lookup structure + +// pull Key from a data pair +template +struct RipKey: + public unary_function< pair, Key> { + Key operator() (pair dataPair) { + return dataPair.first; + } +}; + +// pull Value from a data pair +template +struct RipValue: + public unary_function< pair, Value> { + Value operator() (pair dataPair) { + return dataPair.second; + } +}; + +// pull all Keys from lookup, store in dest (overwrite contents of dest) +template +void RipKeys( vector< pair >& lookup, vector& dest) { + dest.clear(); + dest.reserve(lookup.size()); + transform(lookup.begin(), lookup.end(), back_inserter(dest), RipKey()); +} + +// pull all Values from lookup, store in dest (overwrite contents of dest) +template +void RipValues( vector< pair >& lookup, vector& dest) { + dest.clear(); + dest.reserve(lookup.size()); + transform(lookup.begin(), lookup.end(), back_inserter(dest), RipValue()); +} + +#endif /* STL_UTILITIES_H */ diff --git a/bgzf.c b/bgzf.c new file mode 100644 index 0000000..4314c70 --- /dev/null +++ b/bgzf.c @@ -0,0 +1,488 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2008 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. + * Neither the Broad Institute nor MIT can be responsible for its use, misuse, + * or functionality. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "bgzf.h" + +extern off_t ftello(FILE *stream); +extern int fseeko(FILE *stream, off_t offset, int whence); + +typedef int8_t byte; + +static const int DEFAULT_BLOCK_SIZE = 64 * 1024; +static const int MAX_BLOCK_SIZE = 64 * 1024; + +static const int BLOCK_HEADER_LENGTH = 18; +static const int BLOCK_FOOTER_LENGTH = 8; + +static const int GZIP_ID1 = 31; +static const int GZIP_ID2 = 139; +static const int CM_DEFLATE = 8; +static const int FLG_FEXTRA = 4; +static const int OS_UNKNOWN = 255; +static const int BGZF_ID1 = 66; // 'B' +static const int BGZF_ID2 = 67; // 'C' +static const int BGZF_LEN = 2; +static const int BGZF_XLEN = 6; // BGZF_LEN+4 + +static const int GZIP_WINDOW_BITS = -15; // no zlib header +static const int Z_DEFAULT_MEM_LEVEL = 8; + + +inline +void +packInt16(uint8_t* buffer, uint16_t value) +{ + buffer[0] = value; + buffer[1] = value >> 8; +} + +inline +int +unpackInt16(const uint8_t* buffer) +{ + return (buffer[0] | (buffer[1] << 8)); +} + +inline +void +packInt32(uint8_t* buffer, uint32_t value) +{ + buffer[0] = value; + buffer[1] = value >> 8; + buffer[2] = value >> 16; + buffer[3] = value >> 24; +} + +inline +int +min(int x, int y) +{ + return (x < y) ? x : y; +} + +static +void +report_error(BGZF* fp, const char* message) { + fp->error = message; +} + +static +BGZF* +open_read(int fd) +{ + FILE* file = fdopen(fd, "r"); + BGZF* fp = malloc(sizeof(BGZF)); + fp->file_descriptor = fd; + fp->open_mode = 'r'; + fp->owned_file = 0; + fp->file = file; + fp->uncompressed_block_size = MAX_BLOCK_SIZE; + fp->uncompressed_block = malloc(MAX_BLOCK_SIZE); + fp->compressed_block_size = MAX_BLOCK_SIZE; + fp->compressed_block = malloc(MAX_BLOCK_SIZE); + fp->block_address = 0; + fp->block_offset = 0; + fp->block_length = 0; + fp->error = NULL; + return fp; +} + +static +BGZF* +open_write(int fd) +{ + FILE* file = fdopen(fd, "w"); + BGZF* fp = malloc(sizeof(BGZF)); + fp->file_descriptor = fd; + fp->open_mode = 'w'; + fp->owned_file = 0; + fp->file = file; + fp->uncompressed_block_size = DEFAULT_BLOCK_SIZE; + fp->uncompressed_block = NULL; + fp->compressed_block_size = MAX_BLOCK_SIZE; + fp->compressed_block = malloc(MAX_BLOCK_SIZE); + fp->block_address = 0; + fp->block_offset = 0; + fp->block_length = 0; + fp->error = NULL; + return fp; +} + +BGZF* +bgzf_open(const char* __restrict path, const char* __restrict mode) +{ + BGZF* fp = NULL; + if (strcasecmp(mode, "r") == 0) { + int oflag = O_RDONLY; + int fd = open(path, oflag); + fp = open_read(fd); + } else if (strcasecmp(mode, "w") == 0) { + int oflag = O_WRONLY | O_CREAT | O_TRUNC; + int fd = open(path, oflag, 0644); + fp = open_write(fd); + } + if (fp != NULL) { + fp->owned_file = 1; + } + return fp; +} + +BGZF* +bgzf_fdopen(int fd, const char * __restrict mode) +{ + if (strcasecmp(mode, "r") == 0) { + return open_read(fd); + } else if (strcasecmp(mode, "w") == 0) { + return open_write(fd); + } else { + return NULL; + } +} + +static +int +deflate_block(BGZF* fp, int block_length) +{ + // Deflate the block in fp->uncompressed_block into fp->compressed_block. + // Also adds an extra field that stores the compressed block length. + + byte* buffer = fp->compressed_block; + int buffer_size = fp->compressed_block_size; + + // Init gzip header + buffer[0] = GZIP_ID1; + buffer[1] = GZIP_ID2; + buffer[2] = CM_DEFLATE; + buffer[3] = FLG_FEXTRA; + buffer[4] = 0; // mtime + buffer[5] = 0; + buffer[6] = 0; + buffer[7] = 0; + buffer[8] = 0; + buffer[9] = OS_UNKNOWN; + buffer[10] = BGZF_XLEN; + buffer[11] = 0; + buffer[12] = BGZF_ID1; + buffer[13] = BGZF_ID2; + buffer[14] = BGZF_LEN; + buffer[15] = 0; + buffer[16] = 0; // placeholder for block length + buffer[17] = 0; + + // loop to retry for blocks that do not compress enough + int input_length = block_length; + int compressed_length = 0; + while (1) { + + z_stream zs; + zs.zalloc = NULL; + zs.zfree = NULL; + zs.next_in = fp->uncompressed_block; + zs.avail_in = input_length; + zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH]; + zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH; + + int status = deflateInit2(&zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED, + GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY); + if (status != Z_OK) { + report_error(fp, "deflate init failed"); + return -1; + } + status = deflate(&zs, Z_FINISH); + if (status != Z_STREAM_END) { + deflateEnd(&zs); + if (status == Z_OK) { + // Not enough space in buffer. + // Can happen in the rare case the input doesn't compress enough. + // Reduce the amount of input until it fits. + input_length -= 1024; + if (input_length <= 0) { + // should never happen + report_error(fp, "input reduction failed"); + return -1; + } + continue; + } + report_error(fp, "deflate failed"); + return -1; + } + status = deflateEnd(&zs); + if (status != Z_OK) { + report_error(fp, "deflate end failed"); + return -1; + } + compressed_length = zs.total_out; + compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH; + if (compressed_length > MAX_BLOCK_SIZE) { + // should never happen + report_error(fp, "deflate overflow"); + return -1; + } + break; + } + + packInt16((uint8_t*)&buffer[16], compressed_length-1); + uint32_t crc = crc32(0L, NULL, 0L); + crc = crc32(crc, fp->uncompressed_block, input_length); + packInt32((uint8_t*)&buffer[compressed_length-8], crc); + packInt32((uint8_t*)&buffer[compressed_length-4], input_length); + + int remaining = block_length - input_length; + if (remaining > 0) { + if (remaining > input_length) { + // should never happen (check so we can use memcpy) + report_error(fp, "remainder too large"); + return -1; + } + memcpy(fp->uncompressed_block, + fp->uncompressed_block + input_length, + remaining); + } + fp->block_offset = remaining; + return compressed_length; +} + +static +int +inflate_block(BGZF* fp, int block_length) +{ + // Inflate the block in fp->compressed_block into fp->uncompressed_block + + z_stream zs; + zs.zalloc = NULL; + zs.zfree = NULL; + zs.next_in = fp->compressed_block + 18; + zs.avail_in = block_length - 16; + zs.next_out = fp->uncompressed_block; + zs.avail_out = fp->uncompressed_block_size; + + int status = inflateInit2(&zs, GZIP_WINDOW_BITS); + if (status != Z_OK) { + report_error(fp, "inflate init failed"); + return -1; + } + status = inflate(&zs, Z_FINISH); + if (status != Z_STREAM_END) { + inflateEnd(&zs); + report_error(fp, "inflate failed"); + return -1; + } + status = inflateEnd(&zs); + if (status != Z_OK) { + report_error(fp, "inflate failed"); + return -1; + } + return zs.total_out; +} + +static +int +check_header(const byte* header) +{ + return (header[0] == GZIP_ID1 && + header[1] == (byte) GZIP_ID2 && + header[2] == Z_DEFLATED && + (header[3] & FLG_FEXTRA) != 0 && + unpackInt16((uint8_t*)&header[10]) == BGZF_XLEN && + header[12] == BGZF_ID1 && + header[13] == BGZF_ID2 && + unpackInt16((uint8_t*)&header[14]) == BGZF_LEN); +} + +static +int +read_block(BGZF* fp) +{ + byte header[BLOCK_HEADER_LENGTH]; + int64_t block_address = ftello(fp->file); + int count = fread(header, 1, sizeof(header), fp->file); + if (count == 0) { + fp->block_length = 0; + return 0; + } + if (count != sizeof(header)) { + report_error(fp, "read failed"); + return -1; + } + if (!check_header(header)) { + report_error(fp, "invalid block header"); + return -1; + } + int block_length = unpackInt16((uint8_t*)&header[16]) + 1; + byte* compressed_block = (byte*) fp->compressed_block; + memcpy(compressed_block, header, BLOCK_HEADER_LENGTH); + int remaining = block_length - BLOCK_HEADER_LENGTH; + count = fread(&compressed_block[BLOCK_HEADER_LENGTH], 1, remaining, fp->file); + if (count != remaining) { + report_error(fp, "read failed"); + return -1; + } + count = inflate_block(fp, block_length); + if (count < 0) { + return -1; + } + if (fp->block_length != 0) { + // Do not reset offset if this read follows a seek. + fp->block_offset = 0; + } + fp->block_address = block_address; + fp->block_length = count; + return 0; +} + +int +bgzf_read(BGZF* fp, void* data, int length) +{ + if (length <= 0) { + return 0; + } + if (fp->open_mode != 'r') { + report_error(fp, "file not open for reading"); + return -1; + } + + int bytes_read = 0; + byte* output = data; + while (bytes_read < length) { + int available = fp->block_length - fp->block_offset; + if (available <= 0) { + if (read_block(fp) != 0) { + return -1; + } + available = fp->block_length - fp->block_offset; + if (available <= 0) { + break; + } + } + int copy_length = min(length-bytes_read, available); + byte* buffer = fp->uncompressed_block; + memcpy(output, buffer + fp->block_offset, copy_length); + fp->block_offset += copy_length; + output += copy_length; + bytes_read += copy_length; + } + if (fp->block_offset == fp->block_length) { + fp->block_address = ftello(fp->file); + fp->block_offset = 0; + fp->block_length = 0; + } + return bytes_read; +} + +static +int +flush_block(BGZF* fp) +{ + while (fp->block_offset > 0) { + int block_length = deflate_block(fp, fp->block_offset); + if (block_length < 0) { + return -1; + } + int count = fwrite(fp->compressed_block, 1, block_length, fp->file); + if (count != block_length) { + report_error(fp, "write failed"); + return -1; + } + fp->block_address += block_length; + } + return 0; +} + +int +bgzf_write(BGZF* fp, const void* data, int length) +{ + if (fp->open_mode != 'w') { + report_error(fp, "file not open for writing"); + return -1; + } + + if (fp->uncompressed_block == NULL) { + fp->uncompressed_block = malloc(fp->uncompressed_block_size); + } + + const byte* input = data; + int block_length = fp->uncompressed_block_size; + int bytes_written = 0; + while (bytes_written < length) { + int copy_length = min(block_length - fp->block_offset, length - bytes_written); + byte* buffer = fp->uncompressed_block; + memcpy(buffer + fp->block_offset, input, copy_length); + fp->block_offset += copy_length; + input += copy_length; + bytes_written += copy_length; + if (fp->block_offset == block_length) { + if (flush_block(fp) != 0) { + break; + } + } + } + return bytes_written; +} + +int +bgzf_close(BGZF* fp) +{ + if (fp->open_mode == 'w') { + if (flush_block(fp) != 0) { + return -1; + } + if (fflush(fp->file) != 0) { + report_error(fp, "flush failed"); + return -1; + } + } + if (fp->owned_file) { + if (fclose(fp->file) != 0) { + return -1; + } + } + free(fp->uncompressed_block); + free(fp->compressed_block); + free(fp); + return 0; +} + +int64_t +bgzf_tell(BGZF* fp) +{ + return ((fp->block_address << 16) | (fp->block_offset & 0xFFFF)); +} + +int64_t +bgzf_seek(BGZF* fp, int64_t pos, int where) +{ + if (fp->open_mode != 'r') { + report_error(fp, "file not open for read"); + return -1; + } + if (where != SEEK_SET) { + report_error(fp, "unimplemented seek option"); + return -1; + } + int block_offset = pos & 0xFFFF; + int64_t block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL; + if (fseeko(fp->file, block_address, SEEK_SET) != 0) { + report_error(fp, "seek failed"); + return -1; + } + fp->block_length = 0; // indicates current block is not loaded + fp->block_address = block_address; + fp->block_offset = block_offset; + return 0; +} + diff --git a/bgzf.h b/bgzf.h new file mode 100644 index 0000000..7e76bcb --- /dev/null +++ b/bgzf.h @@ -0,0 +1,102 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2008 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. + * Neither the Broad Institute nor MIT can be responsible for its use, misuse, + * or functionality. + */ + +#ifndef __BGZF_H +#define __BGZF_H + +#include +#include +#include "zlib.h" +#include +//#include "zutil.h" + +//typedef int8_t bool; + +typedef struct { + int file_descriptor; + char open_mode; // 'r' or 'w' + bool owned_file; + FILE* file; + int uncompressed_block_size; + int compressed_block_size; + void* uncompressed_block; + void* compressed_block; + int64_t block_address; + int block_length; + int block_offset; + const char* error; +} BGZF; + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Open an existing file descriptor for reading or writing. + * Mode must be either "r" or "w". + * A subsequent bgzf_close will not close the file descriptor. + * Returns null on error. + */ +BGZF* bgzf_fdopen(int fd, const char* __restrict mode); + +/* + * Open the specified file for reading or writing. + * Mode must be either "r" or "w". + * Returns null on error. + */ +BGZF* bgzf_open(const char* path, const char* __restrict mode); + +/* + * Close the BGZ file and free all associated resources. + * Does not close the underlying file descriptor if created with bgzf_fdopen. + * Returns zero on success, -1 on error. + */ +int bgzf_close(BGZF* fp); + +/* + * Read up to length bytes from the file storing into data. + * Returns the number of bytes actually read. + * Returns zero on end of file. + * Returns -1 on error. + */ +int bgzf_read(BGZF* fp, void* data, int length); + +/* + * Write length bytes from data to the file. + * Returns the number of bytes written. + * Returns -1 on error. + */ +int bgzf_write(BGZF* fp, const void* data, int length); + +/* + * Return a virtual file pointer to the current location in the file. + * No interpetation of the value should be made, other than a subsequent + * call to bgzf_seek can be used to position the file at the same point. + * Return value is non-negative on success. + * Returns -1 on error. + */ +int64_t bgzf_tell(BGZF* fp); + +/* + * Set the file to read from the location specified by pos, which must + * be a value previously returned by bgzf_tell for this file (but not + * necessarily one returned by this file handle). + * The where argument must be SEEK_SET. + * Seeking on a file opened for write is not supported. + * Returns zero on success, -1 on error. + */ +int64_t bgzf_seek(BGZF* fp, int64_t pos, int where); + +#ifdef __cplusplus +} +#endif + +#endif -- 2.39.5