1 // ***************************************************************************
2 // BamReader_p.cpp (c) 2009 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 <<<<<<< HEAD:src/api/internal/BamReader_p.cpp
6 // Last modified: 14 November 2011 (DB)
8 // Last modified: 25 October 2011 (DB)
9 >>>>>>> remoteio:src/api/internal/bam/BamReader_p.cpp
10 // ---------------------------------------------------------------------------
11 // Provides the basic functionality for reading BAM files
12 // ***************************************************************************
14 #include "api/BamConstants.h"
15 #include "api/BamReader.h"
16 #include "api/IBamIODevice.h"
17 #include "api/internal/bam/BamHeader_p.h"
18 #include "api/internal/bam/BamRandomAccessController_p.h"
19 #include "api/internal/bam/BamReader_p.h"
20 #include "api/internal/index/BamStandardIndex_p.h"
21 #include "api/internal/index/BamToolsIndex_p.h"
22 #include "api/internal/io/BamDeviceFactory_p.h"
23 #include "api/internal/utils/BamException_p.h"
24 using namespace BamTools;
25 using namespace BamTools::Internal;
35 BamReaderPrivate::BamReaderPrivate(BamReader* parent)
36 : m_alignmentsBeginOffset(0)
39 m_isBigEndian = BamTools::SystemIsBigEndian();
43 BamReaderPrivate::~BamReaderPrivate(void) {
47 // closes the BAM file
48 bool BamReaderPrivate::Close(void) {
57 // close random access controller
58 m_randomAccessController.Close();
60 // if stream is open, attempt close
64 } catch ( BamException& e ) {
65 const string streamError = e.what();
66 const string message = string("encountered error closing BAM file: \n\t") + streamError;
67 SetErrorString("BamReader::Close", message);
76 // creates an index file of requested type on current BAM file
77 bool BamReaderPrivate::CreateIndex(const BamIndex::IndexType& type) {
79 // skip if BAM file not open
81 SetErrorString("BamReader::CreateIndex", "cannot create index on unopened BAM file");
85 // attempt to create index
86 if ( m_randomAccessController.CreateIndex(this, type) )
89 const string bracError = m_randomAccessController.GetErrorString();
90 const string message = string("could not create index: \n\t") + bracError;
91 SetErrorString("BamReader::CreateIndex", message);
96 // return path & filename of current BAM file
97 const string BamReaderPrivate::Filename(void) const {
101 string BamReaderPrivate::GetErrorString(void) const {
102 return m_errorString;
105 // return header data as std::string
106 string BamReaderPrivate::GetHeaderText(void) const {
107 return m_header.ToString();
110 // return header data as SamHeader object
111 SamHeader BamReaderPrivate::GetSamHeader(void) const {
112 return m_header.ToSamHeader();
115 // get next alignment (with character data fully parsed)
116 bool BamReaderPrivate::GetNextAlignment(BamAlignment& alignment) {
118 // if valid alignment found
119 if ( GetNextAlignmentCore(alignment) ) {
121 // store alignment's "source" filename
122 alignment.Filename = m_filename;
124 // return success/failure of parsing char data
125 if ( alignment.BuildCharData() )
128 const string alError = alignment.GetErrorString();
129 const string message = string("could not populate alignment data: \n\t") + alError;
130 SetErrorString("BamReader::GetNextAlignment", message);
135 // no valid alignment found
139 // retrieves next available alignment core data (returns success/fail)
140 // ** DOES NOT populate any character data fields (read name, bases, qualities, tag data, filename)
141 // these can be accessed, if necessary, from the supportData
142 // useful for operations requiring ONLY positional or other alignment-related information
143 bool BamReaderPrivate::GetNextAlignmentCore(BamAlignment& alignment) {
145 // skip if stream not opened
146 if ( !m_stream.IsOpen() )
151 // skip if region is set but has no alignments
152 if ( m_randomAccessController.HasRegion() &&
153 !m_randomAccessController.RegionHasAlignments() )
158 // if can't read next alignment
159 if ( !LoadNextAlignment(alignment) )
162 // check alignment's region-overlap state
163 BamRandomAccessController::RegionState state = m_randomAccessController.AlignmentState(alignment);
165 // if alignment starts after region, no need to keep reading
166 if ( state == BamRandomAccessController::AfterRegion )
169 // read until overlap is found
170 while ( state != BamRandomAccessController::OverlapsRegion ) {
172 // if can't read next alignment
173 if ( !LoadNextAlignment(alignment) )
176 // check alignment's region-overlap state
177 state = m_randomAccessController.AlignmentState(alignment);
179 // if alignment starts after region, no need to keep reading
180 if ( state == BamRandomAccessController::AfterRegion )
184 // if we get here, we found the next 'valid' alignment
185 // (e.g. overlaps current region if one was set, simply the next alignment if not)
186 alignment.SupportData.HasCoreOnly = true;
189 } catch ( BamException& e ) {
190 const string streamError = e.what();
191 const string message = string("encountered error reading BAM alignment: \n\t") + streamError;
192 SetErrorString("BamReader::GetNextAlignmentCore", message);
197 int BamReaderPrivate::GetReferenceCount(void) const {
198 return m_references.size();
201 const RefVector& BamReaderPrivate::GetReferenceData(void) const {
205 // returns RefID for given RefName (returns References.size() if not found)
206 int BamReaderPrivate::GetReferenceID(const string& refName) const {
208 // retrieve names from reference data
209 vector<string> refNames;
210 RefVector::const_iterator refIter = m_references.begin();
211 RefVector::const_iterator refEnd = m_references.end();
212 for ( ; refIter != refEnd; ++refIter)
213 refNames.push_back( (*refIter).RefName );
215 // return 'index-of' refName (or -1 if not found)
216 int index = distance(refNames.begin(), find(refNames.begin(), refNames.end(), refName));
217 if ( index == (int)m_references.size() ) return -1;
221 bool BamReaderPrivate::HasIndex(void) const {
222 return m_randomAccessController.HasIndex();
225 bool BamReaderPrivate::IsOpen(void) const {
226 return m_stream.IsOpen();
229 // load BAM header data
230 void BamReaderPrivate::LoadHeaderData(void) {
231 m_header.Load(&m_stream);
234 // populates BamAlignment with alignment data under file pointer, returns success/fail
235 bool BamReaderPrivate::LoadNextAlignment(BamAlignment& alignment) {
237 // read in the 'block length' value, make sure it's not zero
238 char buffer[sizeof(uint32_t)];
239 m_stream.Read(buffer, sizeof(uint32_t));
240 alignment.SupportData.BlockLength = BamTools::UnpackUnsignedInt(buffer);
241 if ( m_isBigEndian ) BamTools::SwapEndian_32(alignment.SupportData.BlockLength);
242 if ( alignment.SupportData.BlockLength == 0 )
245 // read in core alignment data, make sure the right size of data was read
246 char x[Constants::BAM_CORE_SIZE];
247 if ( m_stream.Read(x, Constants::BAM_CORE_SIZE) != Constants::BAM_CORE_SIZE )
250 // swap core endian-ness if necessary
251 if ( m_isBigEndian ) {
252 for ( unsigned int i = 0; i < Constants::BAM_CORE_SIZE; i+=sizeof(uint32_t) )
253 BamTools::SwapEndian_32p(&x[i]);
256 // set BamAlignment 'core' and 'support' data
257 alignment.RefID = BamTools::UnpackSignedInt(&x[0]);
258 alignment.Position = BamTools::UnpackSignedInt(&x[4]);
260 unsigned int tempValue = BamTools::UnpackUnsignedInt(&x[8]);
261 alignment.Bin = tempValue >> 16;
262 alignment.MapQuality = tempValue >> 8 & 0xff;
263 alignment.SupportData.QueryNameLength = tempValue & 0xff;
265 tempValue = BamTools::UnpackUnsignedInt(&x[12]);
266 alignment.AlignmentFlag = tempValue >> 16;
267 alignment.SupportData.NumCigarOperations = tempValue & 0xffff;
269 alignment.SupportData.QuerySequenceLength = BamTools::UnpackUnsignedInt(&x[16]);
270 alignment.MateRefID = BamTools::UnpackSignedInt(&x[20]);
271 alignment.MatePosition = BamTools::UnpackSignedInt(&x[24]);
272 alignment.InsertSize = BamTools::UnpackSignedInt(&x[28]);
274 // set BamAlignment length
275 alignment.Length = alignment.SupportData.QuerySequenceLength;
277 // read in character data - make sure proper data size was read
278 bool readCharDataOK = false;
279 const unsigned int dataLength = alignment.SupportData.BlockLength - Constants::BAM_CORE_SIZE;
280 RaiiBuffer allCharData(dataLength);
282 if ( m_stream.Read(allCharData.Buffer, dataLength) == dataLength ) {
284 // store 'allCharData' in supportData structure
285 alignment.SupportData.AllCharData.assign((const char*)allCharData.Buffer, dataLength);
288 readCharDataOK = true;
291 // need to calculate this here so that BamAlignment::GetEndPosition() performs correctly,
292 // even when GetNextAlignmentCore() is called
293 const unsigned int cigarDataOffset = alignment.SupportData.QueryNameLength;
294 uint32_t* cigarData = (uint32_t*)(allCharData.Buffer + cigarDataOffset);
296 alignment.CigarData.clear();
297 alignment.CigarData.reserve(alignment.SupportData.NumCigarOperations);
298 for ( unsigned int i = 0; i < alignment.SupportData.NumCigarOperations; ++i ) {
300 // swap endian-ness if necessary
301 if ( m_isBigEndian ) BamTools::SwapEndian_32(cigarData[i]);
303 // build CigarOp structure
304 op.Length = (cigarData[i] >> Constants::BAM_CIGAR_SHIFT);
305 op.Type = Constants::BAM_CIGAR_LOOKUP[ (cigarData[i] & Constants::BAM_CIGAR_MASK) ];
308 alignment.CigarData.push_back(op);
312 // return success/failure
313 return readCharDataOK;
316 // loads reference data from BAM file
317 bool BamReaderPrivate::LoadReferenceData(void) {
319 // get number of reference sequences
320 char buffer[sizeof(uint32_t)];
321 m_stream.Read(buffer, sizeof(uint32_t));
322 uint32_t numberRefSeqs = BamTools::UnpackUnsignedInt(buffer);
323 if ( m_isBigEndian ) BamTools::SwapEndian_32(numberRefSeqs);
324 m_references.reserve((int)numberRefSeqs);
326 // iterate over all references in header
327 for ( unsigned int i = 0; i != numberRefSeqs; ++i ) {
329 // get length of reference name
330 m_stream.Read(buffer, sizeof(uint32_t));
331 uint32_t refNameLength = BamTools::UnpackUnsignedInt(buffer);
332 if ( m_isBigEndian ) BamTools::SwapEndian_32(refNameLength);
333 RaiiBuffer refName(refNameLength);
335 // get reference name and reference sequence length
336 m_stream.Read(refName.Buffer, refNameLength);
337 m_stream.Read(buffer, sizeof(int32_t));
338 int32_t refLength = BamTools::UnpackSignedInt(buffer);
339 if ( m_isBigEndian ) BamTools::SwapEndian_32(refLength);
341 // store data for reference
343 aReference.RefName = (string)((const char*)refName.Buffer);
344 aReference.RefLength = refLength;
345 m_references.push_back(aReference);
352 bool BamReaderPrivate::LocateIndex(const BamIndex::IndexType& preferredType) {
354 if ( m_randomAccessController.LocateIndex(this, preferredType) )
357 const string bracError = m_randomAccessController.GetErrorString();
358 const string message = string("could not locate index: \n\t") + bracError;
359 SetErrorString("BamReader::LocateIndex", message);
364 // opens BAM file (and index)
365 bool BamReaderPrivate::Open(const string& filename) {
369 // make sure we're starting with fresh state
373 m_stream.Open(filename, IBamIODevice::ReadOnly);
379 // store filename & offset of first alignment
380 m_filename = filename;
381 m_alignmentsBeginOffset = m_stream.Tell();
386 } catch ( BamException& e ) {
387 const string error = e.what();
388 const string message = string("could not open file: ") + filename +
390 SetErrorString("BamReader::Open", message);
395 bool BamReaderPrivate::OpenIndex(const std::string& indexFilename) {
397 if ( m_randomAccessController.OpenIndex(indexFilename, this) )
400 const string bracError = m_randomAccessController.GetErrorString();
401 const string message = string("could not open index: \n\t") + bracError;
402 SetErrorString("BamReader::OpenIndex", message);
407 // returns BAM file pointer to beginning of alignment data
408 bool BamReaderPrivate::Rewind(void) {
411 m_randomAccessController.ClearRegion();
413 // return status of seeking back to first alignment
414 if ( Seek(m_alignmentsBeginOffset) )
417 const string currentError = m_errorString;
418 const string message = string("could not rewind: \n\t") + currentError;
419 SetErrorString("BamReader::Rewind", message);
424 bool BamReaderPrivate::Seek(const int64_t& position) {
426 // skip if BAM file not open
428 SetErrorString("BamReader::Seek", "cannot seek on unopened BAM file");
433 m_stream.Seek(position);
436 catch ( BamException& e ) {
437 const string streamError = e.what();
438 const string message = string("could not seek in BAM file: \n\t") + streamError;
439 SetErrorString("BamReader::Seek", message);
444 void BamReaderPrivate::SetErrorString(const string& where, const string& what) {
445 static const string SEPARATOR = ": ";
446 m_errorString = where + SEPARATOR + what;
449 void BamReaderPrivate::SetIndex(BamIndex* index) {
450 m_randomAccessController.SetIndex(index);
453 // sets current region & attempts to jump to it
454 // returns success/failure
455 bool BamReaderPrivate::SetRegion(const BamRegion& region) {
457 if ( m_randomAccessController.SetRegion(region, m_references.size()) )
460 const string bracError = m_randomAccessController.GetErrorString();
461 const string message = string("could not set region: \n\t") + bracError;
462 SetErrorString("BamReader::SetRegion", message);
467 int64_t BamReaderPrivate::Tell(void) const {
468 return m_stream.Tell();