From ccc66728ab56d429c3b3e72c261a3cefc143d197 Mon Sep 17 00:00:00 2001 From: mikaels Date: Sat, 11 Apr 2009 03:32:53 +0000 Subject: [PATCH] Added tag data support. git-svn-id: svn+ssh://gene.bc.edu/home/subversion/Derek/BamTools/trunk@11 9efb377e-2e27-44b9-b91a-ec4abb80ed8b --- BamReader.cpp | 1315 +++++++++++++++++++++++++------------------------ BamWriter.cpp | 14 +- 2 files changed, 666 insertions(+), 663 deletions(-) diff --git a/BamReader.cpp b/BamReader.cpp index 8fba1a9..4b50d7e 100644 --- a/BamReader.cpp +++ b/BamReader.cpp @@ -1,653 +1,662 @@ -// BamReader.cpp - -// Derek Barnett -// Marth Lab, Boston College -// Last modified: 6 April 2009 - -#include "BamReader.h" -#include -using std::cerr; -using std::endl; - -// static character constants -const char* BamReader::DNA_LOOKUP = "=ACMGRSVTWYHKDBN"; -const char* BamReader::CIGAR_LOOKUP = "MIDNSHP"; - -BamReader::BamReader(const char* filename, const char* indexFilename) - : m_filename( (char*)filename ) - , m_indexFilename( (char*)indexFilename ) - , m_file(0) - , m_index(NULL) - , m_headerText("") - , m_isOpen(false) - , m_isIndexLoaded(false) - , m_isRegionSpecified(false) - , m_isCalculateAlignedBases(true) - , m_currentRefID(0) - , m_currentLeft(0) - , m_alignmentsBeginOffset(0) -{ - Open(); -} - -BamReader::~BamReader(void) { - - // close file - if ( m_isOpen ) { Close(); } -} - -// open BAM file -bool BamReader::Open(void) { - - if (!m_isOpen && m_filename != NULL ) { - - // open file - m_file = bam_open(m_filename, "r"); - - // get header info && index info - if ( (m_file != NULL) && LoadHeader() ) { - - // save file offset where alignments start - m_alignmentsBeginOffset = bam_tell(m_file); - - // set open flag - m_isOpen = true; - } - - // try to open (and load) index data, if index file given - if ( m_indexFilename != NULL ) { - OpenIndex(); - } - } - - return m_isOpen; -} - -bool BamReader::OpenIndex(void) { - - if ( m_indexFilename && !m_isIndexLoaded ) { - m_isIndexLoaded = LoadIndex(); - } - return m_isIndexLoaded; -} - -// close BAM file -bool BamReader::Close(void) { - - if (m_isOpen) { - - // close file - int ret = bam_close(m_file); - - // delete index info - if ( m_index != NULL) { delete m_index; } - - // clear open flag - m_isOpen = false; - - // clear index flag - m_isIndexLoaded = false; - - // clear region flag - m_isRegionSpecified = false; - - // return success/fail of bam_close - return (ret == 0); - } - - return true; -} - -// get BAM filename -const char* BamReader::Filename(void) const { - return (const char*)m_filename; -} - -// set BAM filename -void BamReader::SetFilename(const char* filename) { - m_filename = (char*)filename; -} - -// get BAM Index filename -const char* BamReader::IndexFilename(void) const { - return (const char*)m_indexFilename; -} - -// set BAM Index filename -void BamReader::SetIndexFilename(const char* indexFilename) { - m_indexFilename = (char*)indexFilename; -} - -// return full header text -const string BamReader::GetHeaderText(void) const { - return m_headerText; -} - -// return number of reference sequences in BAM file -const int BamReader::GetReferenceCount(void) const { - return m_references.size(); -} - -// get RefID from reference name -const int BamReader::GetRefID(string refName) const { - - vector refNames; - RefVector::const_iterator refIter = m_references.begin(); - RefVector::const_iterator refEnd = m_references.end(); - for ( ; refIter != refEnd; ++refIter) { - refNames.push_back( (*refIter).RefName ); - } - - // return 'index-of' refName (if not found, returns refNames.size()) - return Index( refNames.begin(), refNames.end(), refName ); -} - -const RefVector BamReader::GetReferenceData(void) const { - return m_references; -} - -bool BamReader::Jump(int refID, unsigned int left) { - - // if index available, and region is valid - if ( m_isIndexLoaded && m_references.at(refID).RefHasAlignments && (left <= m_references.at(refID).RefLength) ) { - m_currentRefID = refID; - m_currentLeft = left; - m_isRegionSpecified = true; - return ( bam_seek(m_file, GetOffset(m_currentRefID, m_currentLeft), SEEK_SET) == 0 ); - } - return false; -} - -bool BamReader::Rewind(void) { - - int refID = 0; - int refCount = m_references.size(); - for ( ; refID < refCount; ++refID ) { - if ( m_references.at(refID).RefHasAlignments ) { break; } - } - - m_currentRefID = refID; - m_currentLeft = 0; - m_isRegionSpecified = false; - - return ( bam_seek(m_file, m_alignmentsBeginOffset, SEEK_SET) == 0 ); -} - -// get next alignment from specified region -bool BamReader::GetNextAlignment(BamAlignment& bAlignment) { - - // try to load 'next' read - if ( LoadNextAlignment(bAlignment) ) { - - // if specified region, check for region overlap - if ( m_isRegionSpecified ) { - - // if overlap, return true - if ( IsOverlap(bAlignment) ) { return true; } - // if not, try the next alignment - else { return GetNextAlignment(bAlignment); } - } - - // not using region, valid read detected, return success - else { return true; } - } - - // no valid alignment to load - return false; -} - -void BamReader::SetCalculateAlignedBases(bool flag) { - m_isCalculateAlignedBases = flag; -} - -int BamReader::BinsFromRegion(int refID, unsigned int left, uint16_t list[MAX_BIN]) { - - // get region boundaries - uint32_t begin = left; - uint32_t end = m_references.at(refID).RefLength - 1; - - // initialize list, bin '0' always a valid bin - int i = 0; - list[i++] = 0; - - // get rest of bins that contain this region - unsigned int k; - for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { list[i++] = k; } - for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { list[i++] = k; } - for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { list[i++] = k; } - for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { list[i++] = k; } - for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { list[i++] = k; } - - // return number of bins stored - return i; -} - -uint32_t BamReader::CalculateAlignmentEnd(const unsigned int& position, const vector& cigarData) { - - // initialize alignment end to starting position - uint32_t alignEnd = position; - - // iterate over cigar operations - vector::const_iterator cigarIter = cigarData.begin(); - vector::const_iterator cigarEnd = cigarData.end(); - for ( ; cigarIter != cigarEnd; ++cigarIter) { - if ( (*cigarIter).Type == 'M' || (*cigarIter).Type == 'D' || (*cigarIter).Type == 'N') { - alignEnd += (*cigarIter).Length; - } - } - return alignEnd; -} - -uint64_t BamReader::GetOffset(int refID, unsigned int left) { - - // make space for bins - uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2); - - // returns number of bins overlapping (left, right) - // stores indices of those bins in 'bins' - int numBins = BinsFromRegion(refID, left, bins); - - // access index data for refID - RefIndex* refIndex = m_index->at(refID); - - // get list of BAM bins for this reference sequence - BinVector* refBins = refIndex->first; - - sort( refBins->begin(), refBins->end(), LookupKeyCompare() ); - - // declare ChunkVector - ChunkVector regionChunks; - - // declaure LinearOffsetVector - LinearOffsetVector* linearOffsets = refIndex->second; - - // some sort of linear offset vs bin index voodoo... not completely sure what's going here - uint64_t minOffset = ((left>>BAM_LIDX_SHIFT) >= linearOffsets->size()) ? 0 : linearOffsets->at(left>>BAM_LIDX_SHIFT); - - BinVector::iterator binBegin = refBins->begin(); - BinVector::iterator binEnd = refBins->end(); - - // iterate over bins overlapping region, count chunks - for (int i = 0; i < numBins; ++i) { - - // look for bin with ID=bin[i] - BinVector::iterator binIter = binBegin; - - for ( ; binIter != binEnd; ++binIter ) { - - // if found, increment n_off by number of chunks for each bin - if ( (*binIter).first == (uint32_t)bins[i] ) { - - // iterate over chunks in that bin - ChunkVector* binChunks = (*binIter).second; - - ChunkVector::iterator chunkIter = binChunks->begin(); - ChunkVector::iterator chunkEnd = binChunks->end(); - for ( ; chunkIter != chunkEnd; ++chunkIter) { - - // if right bound of pair is greater than min_off (linear offset value), store pair - if ( (*chunkIter).second > minOffset) { - regionChunks.push_back( (*chunkIter) ); - } - } - } - } - } - - // clean up temp array - free(bins); - - // there should be at least 1 - assert(regionChunks.size() > 0); - - // sort chunks by start position - sort ( regionChunks.begin(), regionChunks.end(), LookupKeyCompare() ); - - // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing - int numOffsets = regionChunks.size(); - for (int i = 1; i < numOffsets; ++i) { - if ( regionChunks.at(i-1).second >= regionChunks.at(i).first ) { - regionChunks.at(i-1).second = regionChunks.at(i).first; - } - } - - // merge adjacent chunks - int l = 0; - for (int i = 1; i < numOffsets; ++i) { - // if adjacent, expand boundaries of (merged) chunk - if ( (regionChunks.at(l).second>>16) == (regionChunks.at(i).first>>16) ) { - regionChunks.at(l).second = regionChunks.at(i).second; - } - // else, move on to next (merged) chunk index - else { regionChunks.at(++l) = regionChunks.at(i); } - } - - // return beginning file offset of first chunk for region - return regionChunks.at(0).first; -} - -bool BamReader::IsOverlap(BamAlignment& bAlignment) { - - // if on different reference sequence, quit - if ( bAlignment.RefID != (unsigned int)m_currentRefID ) { return false; } - - // read starts after left boundary - if ( bAlignment.Position >= m_currentLeft) { return true; } - - // get alignment end - uint32_t alignEnd = CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData); - - // return whether alignment end overlaps left boundary - return ( alignEnd >= m_currentLeft ); -} - -bool BamReader::LoadHeader(void) { - - // check to see if proper BAM header - char buf[4]; - if (bam_read(m_file, buf, 4) != 4) { return false; } - if (strncmp(buf, "BAM\001", 4)) { - fprintf(stderr, "wrong header type!\n"); - return false; - } - - // get BAM header text length - int32_t headerTextLength; - bam_read(m_file, &headerTextLength, 4); - - // get BAM header text - char* headerText = (char*)calloc(headerTextLength + 1, 1); - bam_read(m_file, headerText, headerTextLength); - m_headerText = (string)((const char*)headerText); - - // clean up calloc-ed temp variable - free(headerText); - - // get number of reference sequences - int32_t numberRefSeqs; - bam_read(m_file, &numberRefSeqs, 4); - if (numberRefSeqs == 0) { return false; } - - m_references.reserve((int)numberRefSeqs); - - // reference variables - int32_t refNameLength; - char* refName; - uint32_t refLength; - - // iterate over all references in header - for (int i = 0; i != numberRefSeqs; ++i) { - - // get length of reference name - bam_read(m_file, &refNameLength, 4); - refName = (char*)calloc(refNameLength, 1); - - // get reference name and reference sequence length - bam_read(m_file, refName, refNameLength); - bam_read(m_file, &refLength, 4); - - // store data for reference - RefData aReference; - aReference.RefName = (string)((const char*)refName); - aReference.RefLength = refLength; - m_references.push_back(aReference); - - // clean up calloc-ed temp variable - free(refName); - } - - return true; -} - -bool BamReader::LoadIndex(void) { - - // check to see if index file exists - FILE* indexFile; - if ( ( indexFile = fopen(m_indexFilename, "r") ) == 0 ) { - fprintf(stderr, "The alignment is not indexed. Please run SAMtools \'index\' command first.\n"); - return false; - } - - // see if index is valid BAM index - char magic[4]; - fread(magic, 1, 4, indexFile); - if (strncmp(magic, "BAI\1", 4)) { - fprintf(stderr, "Problem with index - wrong \'magic\' number.\n"); - fclose(indexFile); - return false; - } - - // get number of reference sequences - uint32_t numRefSeqs; - fread(&numRefSeqs, 4, 1, indexFile); - - // intialize BamIndex data structure - m_index = new BamIndex; - m_index->reserve(numRefSeqs); - - // iterate over reference sequences - for (unsigned int i = 0; i < numRefSeqs; ++i) { - - // get number of bins for this reference sequence - int32_t numBins; - fread(&numBins, 4, 1, indexFile); - - if (numBins > 0) { m_references.at(i).RefHasAlignments = true; } - - // intialize BinVector - BinVector* bins = new BinVector; - bins->reserve(numBins); - - // iterate over bins for that reference sequence - for (int j = 0; j < numBins; ++j) { - - // get binID - uint32_t binID; - fread(&binID, 4, 1, indexFile); - - // get number of regionChunks in this bin - uint32_t numChunks; - fread(&numChunks, 4, 1, indexFile); - - // intialize ChunkVector - ChunkVector* regionChunks = new ChunkVector; - regionChunks->reserve(numChunks); - - // iterate over regionChunks in this bin - for (unsigned int k = 0; k < numChunks; ++k) { - - // get chunk boundaries (left, right) - uint64_t left; - uint64_t right; - fread(&left, 8, 1, indexFile); - fread(&right, 8, 1, indexFile); - - // save ChunkPair - regionChunks->push_back( ChunkPair(left, right) ); - } - - // save binID, chunkVector for this bin - bins->push_back( BamBin(binID, regionChunks) ); - } - - // load linear index for this reference sequence - - // get number of linear offsets - int32_t numLinearOffsets; - fread(&numLinearOffsets, 4, 1, indexFile); - - // intialize LinearOffsetVector - LinearOffsetVector* linearOffsets = new LinearOffsetVector; - linearOffsets->reserve(numLinearOffsets); - - // iterate over linear offsets for this reference sequeence - for (int j = 0; j < numLinearOffsets; ++j) { - // get a linear offset - uint64_t linearOffset; - fread(&linearOffset, 8, 1, indexFile); - // store linear offset - linearOffsets->push_back(linearOffset); - } - - // store index data for that reference sequence - m_index->push_back( new RefIndex(bins, linearOffsets) ); - } - - // close index file (.bai) and return - fclose(indexFile); - return true; -} - -bool BamReader::LoadNextAlignment(BamAlignment& bAlignment) { - - // check valid alignment block header data - int32_t block_len; - int32_t ret; - uint32_t x[8]; - - int32_t bytesRead = 0; - - // read in the 'block length' value, make sure it's not zero - if ( (ret = bam_read(m_file, &block_len, 4)) == 0 ) { return false; } - bytesRead += 4; - - // read in core alignment data, make the right size of data was read - if ( bam_read(m_file, x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; } - bytesRead += BAM_CORE_SIZE; - - // set BamAlignment 'core' data - bAlignment.RefID = x[0]; - bAlignment.Position = x[1]; - bAlignment.Bin = x[2]>>16; - bAlignment.MapQuality = x[2]>>8&0xff; - bAlignment.AlignmentFlag = x[3]>>16; - bAlignment.MateRefID = x[5]; - bAlignment.MatePosition = x[6]; - bAlignment.InsertSize = x[7]; - - // fetch & store often-used lengths for character data parsing - unsigned int queryNameLength = x[2]&0xff; - unsigned int numCigarOperations = x[3]&0xffff; - unsigned int querySequenceLength = x[4]; - - // get length of character data - int dataLength = block_len - BAM_CORE_SIZE; - - // set up destination buffers for character data - uint8_t* allCharData = (uint8_t*)calloc(sizeof(uint8_t), dataLength); - uint32_t* cigarData = (uint32_t*)(allCharData+queryNameLength); - - // get character data - make sure proper data size was read - if (bam_read(m_file, allCharData, dataLength) != dataLength) { return false; } - else { - - bytesRead += dataLength; - - // clear out bases, qualities, aligned bases, and CIGAR - bAlignment.QueryBases.clear(); - bAlignment.Qualities.clear(); - bAlignment.AlignedBases.clear(); - bAlignment.CigarData.clear(); - - // save name - bAlignment.Name = (string)((const char*)(allCharData)); - - // save bases - char singleBase[2]; - uint8_t* s = ( allCharData + (numCigarOperations*4) + queryNameLength); - for (unsigned int i = 0; i < querySequenceLength; ++i) { - // numeric to char conversion - sprintf( singleBase, "%c", DNA_LOOKUP[ ( ( s[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ] ); - // append character data to Bases - bAlignment.QueryBases.append( (const char*)singleBase ); - } - - // save sequence length - bAlignment.Length = bAlignment.QueryBases.length(); - - // save qualities - char singleQuality[4]; - uint8_t* t = ( allCharData + (numCigarOperations*4) + queryNameLength + (querySequenceLength + 1)/2); - for (unsigned int i = 0; i < querySequenceLength; ++i) { - // numeric to char conversion - sprintf( singleQuality, "%c", ( t[i]+33 ) ); - // append character data to Qualities - bAlignment.Qualities.append( (const char*)singleQuality ); - } - - // save CIGAR-related data; - int k = 0; - for (unsigned int i = 0; i < numCigarOperations; ++i) { - - // build CigarOp struct - CigarOp op; - op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT); - op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ]; - - // save CigarOp - bAlignment.CigarData.push_back(op); - - // can skip this step if user wants to ignore - if (m_isCalculateAlignedBases) { - - // build AlignedBases string - switch (op.Type) { - - case ('M') : - case ('I') : bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) ); // for 'M', 'I' - write bases - case ('S') : k += op.Length; // for 'S' - skip over query bases - break; - - case ('D') : - case ('P') : bAlignment.AlignedBases.append( op.Length, '*' ); // for 'D', 'P' - write padding; - break; - - case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in query sequence - k += op.Length; - break; - - case ('H') : break; // for 'H' - do nothing, move to next op - - default : assert(false); // shouldn't get here - break; - } - } - } - } - free(allCharData); - -/* - // (optional) read tag parsing data - string tag; - char data; - int i = 0; - - // still data to read (auxiliary tags) - while ( bytesRead < block_len ) { - - if ( bam_read(m_file, &data, 1) == 1 ) { - - ++bytesRead; - - if (bytesRead == block_len && data != '\0') { - fprintf(stderr, "ERROR: Invalid termination of tag info at end of alignment block.\n"); - exit(1); - } - - tag.append(1, data); - if ( data == '\0' ) { - bAlignment.Tags.push_back(tag); - tag = ""; - i = 0; - } else { - if ( (i == 1) && (i == 2) ) { tag.append(1, ':'); } - ++i; - } - } else { - fprintf(stderr, "ERROR: Could not read tag info.\n"); - exit(1); - } - } -*/ - return true; -} +// BamReader.cpp + +// Derek Barnett +// Marth Lab, Boston College +// Last modified: 6 April 2009 + +#include "BamReader.h" +#include +using std::cerr; +using std::endl; + +// static character constants +const char* BamReader::DNA_LOOKUP = "=ACMGRSVTWYHKDBN"; +const char* BamReader::CIGAR_LOOKUP = "MIDNSHP"; + +BamReader::BamReader(const char* filename, const char* indexFilename) + : m_filename( (char*)filename ) + , m_indexFilename( (char*)indexFilename ) + , m_file(0) + , m_index(NULL) + , m_headerText("") + , m_isOpen(false) + , m_isIndexLoaded(false) + , m_isRegionSpecified(false) + , m_isCalculateAlignedBases(true) + , m_currentRefID(0) + , m_currentLeft(0) + , m_alignmentsBeginOffset(0) +{ + Open(); +} + +BamReader::~BamReader(void) { + + // close file + if ( m_isOpen ) { Close(); } +} + +// open BAM file +bool BamReader::Open(void) { + + if (!m_isOpen && m_filename != NULL ) { + + // open file + m_file = bam_open(m_filename, "r"); + + // get header info && index info + if ( (m_file != NULL) && LoadHeader() ) { + + // save file offset where alignments start + m_alignmentsBeginOffset = bam_tell(m_file); + + // set open flag + m_isOpen = true; + } + + // try to open (and load) index data, if index file given + if ( m_indexFilename != NULL ) { + OpenIndex(); + } + } + + return m_isOpen; +} + +bool BamReader::OpenIndex(void) { + + if ( m_indexFilename && !m_isIndexLoaded ) { + m_isIndexLoaded = LoadIndex(); + } + return m_isIndexLoaded; +} + +// close BAM file +bool BamReader::Close(void) { + + if (m_isOpen) { + + // close file + int ret = bam_close(m_file); + + // delete index info + if ( m_index != NULL) { delete m_index; } + + // clear open flag + m_isOpen = false; + + // clear index flag + m_isIndexLoaded = false; + + // clear region flag + m_isRegionSpecified = false; + + // return success/fail of bam_close + return (ret == 0); + } + + return true; +} + +// get BAM filename +const char* BamReader::Filename(void) const { + return (const char*)m_filename; +} + +// set BAM filename +void BamReader::SetFilename(const char* filename) { + m_filename = (char*)filename; +} + +// get BAM Index filename +const char* BamReader::IndexFilename(void) const { + return (const char*)m_indexFilename; +} + +// set BAM Index filename +void BamReader::SetIndexFilename(const char* indexFilename) { + m_indexFilename = (char*)indexFilename; +} + +// return full header text +const string BamReader::GetHeaderText(void) const { + return m_headerText; +} + +// return number of reference sequences in BAM file +const int BamReader::GetReferenceCount(void) const { + return m_references.size(); +} + +// get RefID from reference name +const int BamReader::GetRefID(string refName) const { + + vector refNames; + RefVector::const_iterator refIter = m_references.begin(); + RefVector::const_iterator refEnd = m_references.end(); + for ( ; refIter != refEnd; ++refIter) { + refNames.push_back( (*refIter).RefName ); + } + + // return 'index-of' refName (if not found, returns refNames.size()) + return Index( refNames.begin(), refNames.end(), refName ); +} + +const RefVector BamReader::GetReferenceData(void) const { + return m_references; +} + +bool BamReader::Jump(int refID, unsigned int left) { + + // if index available, and region is valid + if ( m_isIndexLoaded && m_references.at(refID).RefHasAlignments && (left <= m_references.at(refID).RefLength) ) { + m_currentRefID = refID; + m_currentLeft = left; + m_isRegionSpecified = true; + return ( bam_seek(m_file, GetOffset(m_currentRefID, m_currentLeft), SEEK_SET) == 0 ); + } + return false; +} + +bool BamReader::Rewind(void) { + + int refID = 0; + int refCount = m_references.size(); + for ( ; refID < refCount; ++refID ) { + if ( m_references.at(refID).RefHasAlignments ) { break; } + } + + m_currentRefID = refID; + m_currentLeft = 0; + m_isRegionSpecified = false; + + return ( bam_seek(m_file, m_alignmentsBeginOffset, SEEK_SET) == 0 ); +} + +// get next alignment from specified region +bool BamReader::GetNextAlignment(BamAlignment& bAlignment) { + + // try to load 'next' read + if ( LoadNextAlignment(bAlignment) ) { + + // if specified region, check for region overlap + if ( m_isRegionSpecified ) { + + // if overlap, return true + if ( IsOverlap(bAlignment) ) { return true; } + // if not, try the next alignment + else { return GetNextAlignment(bAlignment); } + } + + // not using region, valid read detected, return success + else { return true; } + } + + // no valid alignment to load + return false; +} + +void BamReader::SetCalculateAlignedBases(bool flag) { + m_isCalculateAlignedBases = flag; +} + +int BamReader::BinsFromRegion(int refID, unsigned int left, uint16_t list[MAX_BIN]) { + + // get region boundaries + uint32_t begin = left; + uint32_t end = m_references.at(refID).RefLength - 1; + + // initialize list, bin '0' always a valid bin + int i = 0; + list[i++] = 0; + + // get rest of bins that contain this region + unsigned int k; + for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { list[i++] = k; } + for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { list[i++] = k; } + for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { list[i++] = k; } + for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { list[i++] = k; } + for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { list[i++] = k; } + + // return number of bins stored + return i; +} + +uint32_t BamReader::CalculateAlignmentEnd(const unsigned int& position, const vector& cigarData) { + + // initialize alignment end to starting position + uint32_t alignEnd = position; + + // iterate over cigar operations + vector::const_iterator cigarIter = cigarData.begin(); + vector::const_iterator cigarEnd = cigarData.end(); + for ( ; cigarIter != cigarEnd; ++cigarIter) { + if ( (*cigarIter).Type == 'M' || (*cigarIter).Type == 'D' || (*cigarIter).Type == 'N') { + alignEnd += (*cigarIter).Length; + } + } + return alignEnd; +} + +uint64_t BamReader::GetOffset(int refID, unsigned int left) { + + // make space for bins + uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2); + + // returns number of bins overlapping (left, right) + // stores indices of those bins in 'bins' + int numBins = BinsFromRegion(refID, left, bins); + + // access index data for refID + RefIndex* refIndex = m_index->at(refID); + + // get list of BAM bins for this reference sequence + BinVector* refBins = refIndex->first; + + sort( refBins->begin(), refBins->end(), LookupKeyCompare() ); + + // declare ChunkVector + ChunkVector regionChunks; + + // declaure LinearOffsetVector + LinearOffsetVector* linearOffsets = refIndex->second; + + // some sort of linear offset vs bin index voodoo... not completely sure what's going here + uint64_t minOffset = ((left>>BAM_LIDX_SHIFT) >= linearOffsets->size()) ? 0 : linearOffsets->at(left>>BAM_LIDX_SHIFT); + + BinVector::iterator binBegin = refBins->begin(); + BinVector::iterator binEnd = refBins->end(); + + // iterate over bins overlapping region, count chunks + for (int i = 0; i < numBins; ++i) { + + // look for bin with ID=bin[i] + BinVector::iterator binIter = binBegin; + + for ( ; binIter != binEnd; ++binIter ) { + + // if found, increment n_off by number of chunks for each bin + if ( (*binIter).first == (uint32_t)bins[i] ) { + + // iterate over chunks in that bin + ChunkVector* binChunks = (*binIter).second; + + ChunkVector::iterator chunkIter = binChunks->begin(); + ChunkVector::iterator chunkEnd = binChunks->end(); + for ( ; chunkIter != chunkEnd; ++chunkIter) { + + // if right bound of pair is greater than min_off (linear offset value), store pair + if ( (*chunkIter).second > minOffset) { + regionChunks.push_back( (*chunkIter) ); + } + } + } + } + } + + // clean up temp array + free(bins); + + // there should be at least 1 + assert(regionChunks.size() > 0); + + // sort chunks by start position + sort ( regionChunks.begin(), regionChunks.end(), LookupKeyCompare() ); + + // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing + int numOffsets = regionChunks.size(); + for (int i = 1; i < numOffsets; ++i) { + if ( regionChunks.at(i-1).second >= regionChunks.at(i).first ) { + regionChunks.at(i-1).second = regionChunks.at(i).first; + } + } + + // merge adjacent chunks + int l = 0; + for (int i = 1; i < numOffsets; ++i) { + // if adjacent, expand boundaries of (merged) chunk + if ( (regionChunks.at(l).second>>16) == (regionChunks.at(i).first>>16) ) { + regionChunks.at(l).second = regionChunks.at(i).second; + } + // else, move on to next (merged) chunk index + else { regionChunks.at(++l) = regionChunks.at(i); } + } + + // return beginning file offset of first chunk for region + return regionChunks.at(0).first; +} + +bool BamReader::IsOverlap(BamAlignment& bAlignment) { + + // if on different reference sequence, quit + if ( bAlignment.RefID != (unsigned int)m_currentRefID ) { return false; } + + // read starts after left boundary + if ( bAlignment.Position >= m_currentLeft) { return true; } + + // get alignment end + uint32_t alignEnd = CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData); + + // return whether alignment end overlaps left boundary + return ( alignEnd >= m_currentLeft ); +} + +bool BamReader::LoadHeader(void) { + + // check to see if proper BAM header + char buf[4]; + if (bam_read(m_file, buf, 4) != 4) { return false; } + if (strncmp(buf, "BAM\001", 4)) { + fprintf(stderr, "wrong header type!\n"); + return false; + } + + // get BAM header text length + int32_t headerTextLength; + bam_read(m_file, &headerTextLength, 4); + + // get BAM header text + char* headerText = (char*)calloc(headerTextLength + 1, 1); + bam_read(m_file, headerText, headerTextLength); + m_headerText = (string)((const char*)headerText); + + // clean up calloc-ed temp variable + free(headerText); + + // get number of reference sequences + int32_t numberRefSeqs; + bam_read(m_file, &numberRefSeqs, 4); + if (numberRefSeqs == 0) { return false; } + + m_references.reserve((int)numberRefSeqs); + + // reference variables + int32_t refNameLength; + char* refName; + uint32_t refLength; + + // iterate over all references in header + for (int i = 0; i != numberRefSeqs; ++i) { + + // get length of reference name + bam_read(m_file, &refNameLength, 4); + refName = (char*)calloc(refNameLength, 1); + + // get reference name and reference sequence length + bam_read(m_file, refName, refNameLength); + bam_read(m_file, &refLength, 4); + + // store data for reference + RefData aReference; + aReference.RefName = (string)((const char*)refName); + aReference.RefLength = refLength; + m_references.push_back(aReference); + + // clean up calloc-ed temp variable + free(refName); + } + + return true; +} + +bool BamReader::LoadIndex(void) { + + // check to see if index file exists + FILE* indexFile; + if ( ( indexFile = fopen(m_indexFilename, "r") ) == 0 ) { + fprintf(stderr, "The alignment is not indexed. Please run SAMtools \'index\' command first.\n"); + return false; + } + + // see if index is valid BAM index + char magic[4]; + fread(magic, 1, 4, indexFile); + if (strncmp(magic, "BAI\1", 4)) { + fprintf(stderr, "Problem with index - wrong \'magic\' number.\n"); + fclose(indexFile); + return false; + } + + // get number of reference sequences + uint32_t numRefSeqs; + fread(&numRefSeqs, 4, 1, indexFile); + + // intialize BamIndex data structure + m_index = new BamIndex; + m_index->reserve(numRefSeqs); + + // iterate over reference sequences + for (unsigned int i = 0; i < numRefSeqs; ++i) { + + // get number of bins for this reference sequence + int32_t numBins; + fread(&numBins, 4, 1, indexFile); + + if (numBins > 0) { m_references.at(i).RefHasAlignments = true; } + + // intialize BinVector + BinVector* bins = new BinVector; + bins->reserve(numBins); + + // iterate over bins for that reference sequence + for (int j = 0; j < numBins; ++j) { + + // get binID + uint32_t binID; + fread(&binID, 4, 1, indexFile); + + // get number of regionChunks in this bin + uint32_t numChunks; + fread(&numChunks, 4, 1, indexFile); + + // intialize ChunkVector + ChunkVector* regionChunks = new ChunkVector; + regionChunks->reserve(numChunks); + + // iterate over regionChunks in this bin + for (unsigned int k = 0; k < numChunks; ++k) { + + // get chunk boundaries (left, right) + uint64_t left; + uint64_t right; + fread(&left, 8, 1, indexFile); + fread(&right, 8, 1, indexFile); + + // save ChunkPair + regionChunks->push_back( ChunkPair(left, right) ); + } + + // save binID, chunkVector for this bin + bins->push_back( BamBin(binID, regionChunks) ); + } + + // load linear index for this reference sequence + + // get number of linear offsets + int32_t numLinearOffsets; + fread(&numLinearOffsets, 4, 1, indexFile); + + // intialize LinearOffsetVector + LinearOffsetVector* linearOffsets = new LinearOffsetVector; + linearOffsets->reserve(numLinearOffsets); + + // iterate over linear offsets for this reference sequeence + for (int j = 0; j < numLinearOffsets; ++j) { + // get a linear offset + uint64_t linearOffset; + fread(&linearOffset, 8, 1, indexFile); + // store linear offset + linearOffsets->push_back(linearOffset); + } + + // store index data for that reference sequence + m_index->push_back( new RefIndex(bins, linearOffsets) ); + } + + // close index file (.bai) and return + fclose(indexFile); + return true; +} + +bool BamReader::LoadNextAlignment(BamAlignment& bAlignment) { + + // check valid alignment block header data + int32_t block_len; + int32_t ret; + uint32_t x[8]; + + int32_t bytesRead = 0; + + // read in the 'block length' value, make sure it's not zero + if ( (ret = bam_read(m_file, &block_len, 4)) == 0 ) { return false; } + bytesRead += 4; + + // read in core alignment data, make the right size of data was read + if ( bam_read(m_file, x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; } + bytesRead += BAM_CORE_SIZE; + + // set BamAlignment 'core' data + bAlignment.RefID = x[0]; + bAlignment.Position = x[1]; + bAlignment.Bin = x[2]>>16; + bAlignment.MapQuality = x[2]>>8&0xff; + bAlignment.AlignmentFlag = x[3]>>16; + bAlignment.MateRefID = x[5]; + bAlignment.MatePosition = x[6]; + bAlignment.InsertSize = x[7]; + + // fetch & store often-used lengths for character data parsing + unsigned int queryNameLength = x[2]&0xff; + unsigned int numCigarOperations = x[3]&0xffff; + unsigned int querySequenceLength = x[4]; + + // get length of character data + int dataLength = block_len - BAM_CORE_SIZE; + + // set up destination buffers for character data + uint8_t* allCharData = (uint8_t*)calloc(sizeof(uint8_t), dataLength); + uint32_t* cigarData = (uint32_t*)(allCharData+queryNameLength); + + const unsigned int tagDataOffset = (numCigarOperations * 4) + queryNameLength + (querySequenceLength + 1) / 2 + querySequenceLength; + const unsigned int tagDataLen = dataLength - tagDataOffset; + char* tagData = ((char*)allCharData) + tagDataOffset; + + // get character data - make sure proper data size was read + if (bam_read(m_file, allCharData, dataLength) != dataLength) { return false; } + else { + + bytesRead += dataLength; + + // clear out bases, qualities, aligned bases, CIGAR, and tag data + bAlignment.QueryBases.clear(); + bAlignment.Qualities.clear(); + bAlignment.AlignedBases.clear(); + bAlignment.CigarData.clear(); + bAlignment.TagData.clear(); + + // save name + bAlignment.Name = (string)((const char*)(allCharData)); + + // save bases + char singleBase[2]; + uint8_t* s = ( allCharData + (numCigarOperations*4) + queryNameLength); + for (unsigned int i = 0; i < querySequenceLength; ++i) { + // numeric to char conversion + sprintf( singleBase, "%c", DNA_LOOKUP[ ( ( s[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ] ); + // append character data to Bases + bAlignment.QueryBases.append( (const char*)singleBase ); + } + + // save sequence length + bAlignment.Length = bAlignment.QueryBases.length(); + + // save qualities + char singleQuality[4]; + uint8_t* t = ( allCharData + (numCigarOperations*4) + queryNameLength + (querySequenceLength + 1)/2); + for (unsigned int i = 0; i < querySequenceLength; ++i) { + // numeric to char conversion + sprintf( singleQuality, "%c", ( t[i]+33 ) ); + // append character data to Qualities + bAlignment.Qualities.append( (const char*)singleQuality ); + } + + // save CIGAR-related data; + int k = 0; + for (unsigned int i = 0; i < numCigarOperations; ++i) { + + // build CigarOp struct + CigarOp op; + op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT); + op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ]; + + // save CigarOp + bAlignment.CigarData.push_back(op); + + // can skip this step if user wants to ignore + if (m_isCalculateAlignedBases) { + + // build AlignedBases string + switch (op.Type) { + + case ('M') : + case ('I') : bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) ); // for 'M', 'I' - write bases + case ('S') : k += op.Length; // for 'S' - skip over query bases + break; + + case ('D') : + case ('P') : bAlignment.AlignedBases.append( op.Length, '*' ); // for 'D', 'P' - write padding; + break; + + case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in query sequence + k += op.Length; + break; + + case ('H') : break; // for 'H' - do nothing, move to next op + + default : assert(false); // shouldn't get here + break; + } + } + } + + // read in the tag data + bAlignment.TagData.resize(tagDataLen); + memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLen); + } + free(allCharData); + +/* + // (optional) read tag parsing data + string tag; + char data; + int i = 0; + + // still data to read (auxiliary tags) + while ( bytesRead < block_len ) { + + if ( bam_read(m_file, &data, 1) == 1 ) { + + ++bytesRead; + + if (bytesRead == block_len && data != '\0') { + fprintf(stderr, "ERROR: Invalid termination of tag info at end of alignment block.\n"); + exit(1); + } + + tag.append(1, data); + if ( data == '\0' ) { + bAlignment.Tags.push_back(tag); + tag = ""; + i = 0; + } else { + if ( (i == 1) && (i == 2) ) { tag.append(1, ':'); } + ++i; + } + } else { + fprintf(stderr, "ERROR: Could not read tag info.\n"); + exit(1); + } + } +*/ + return true; +} diff --git a/BamWriter.cpp b/BamWriter.cpp index f0c67e2..4f6a37f 100644 --- a/BamWriter.cpp +++ b/BamWriter.cpp @@ -357,13 +357,8 @@ void BamWriter::SaveAlignment(const BamAlignment& al) { EncodeQuerySequence(al.QueryBases, encodedQuery); const unsigned int encodedQueryLen = encodedQuery.size(); - // create our read group tag - // TODO: add read group tag support when it shows up in the BamAlignment struct - //string readGroupTag; - //const unsigned int readGroupTagLen = 3 + mReadGroupLUT[al.ReadGroupCode].NameLen + 1; - //readGroupTag.resize(readGroupTagLen); - //char* pReadGroupTag = (char*)readGroupTag.data(); - //sprintf(pReadGroupTag, "RGZ%s", mReadGroupLUT[al.ReadGroupCode].Name); + // store the tag data length + const unsigned int tagDataLength = al.TagData.size(); // assign the BAM core data unsigned int buffer[8]; @@ -377,7 +372,7 @@ void BamWriter::SaveAlignment(const BamAlignment& al) { buffer[7] = al.InsertSize; // write the block size - const unsigned int dataBlockSize = nameLen + packedCigarLen + encodedQueryLen + queryLen /* + readGroupTagLen*/; + const unsigned int dataBlockSize = nameLen + packedCigarLen + encodedQueryLen + queryLen + tagDataLength; const unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize; BgzfWrite((char*)&blockSize, SIZEOF_INT); @@ -401,6 +396,5 @@ void BamWriter::SaveAlignment(const BamAlignment& al) { BgzfWrite(pBaseQualities, queryLen); // write the read group tag - // TODO: add read group tag support when it shows up in the BamAlignment struct - //BgzfWrite(readGroupTag.c_str(), readGroupTagLen); + BgzfWrite(al.TagData.data(), tagDataLength); } -- 2.39.5