1 // ***************************************************************************
2 // BamReader_p.cpp (c) 2009 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 7 October 2011 (DB)
6 // ---------------------------------------------------------------------------
7 // Provides the basic functionality for reading BAM files
8 // ***************************************************************************
10 #include <api/BamConstants.h>
11 #include <api/BamReader.h>
12 #include <api/IBamIODevice.h>
13 #include <api/internal/BamDeviceFactory_p.h>
14 #include <api/internal/BamException_p.h>
15 #include <api/internal/BamHeader_p.h>
16 #include <api/internal/BamRandomAccessController_p.h>
17 #include <api/internal/BamReader_p.h>
18 #include <api/internal/BamStandardIndex_p.h>
19 #include <api/internal/BamToolsIndex_p.h>
20 #include <api/internal/BgzfStream_p.h>
21 using namespace BamTools;
22 using namespace BamTools::Internal;
32 BamReaderPrivate::BamReaderPrivate(BamReader* parent)
33 : m_alignmentsBeginOffset(0)
36 m_isBigEndian = BamTools::SystemIsBigEndian();
40 BamReaderPrivate::~BamReaderPrivate(void) {
44 // closes the BAM file
45 bool BamReaderPrivate::Close(void) {
54 // close random access controller
55 m_randomAccessController.Close();
57 // if stream is open, attempt close
61 } catch ( BamException& e ) {
62 const string streamError = e.what();
63 const string message = string("encountered error closing BAM file: \n\t") + streamError;
64 SetErrorString("BamReader::Close", message);
73 // creates an index file of requested type on current BAM file
74 bool BamReaderPrivate::CreateIndex(const BamIndex::IndexType& type) {
76 // skip if BAM file not open
78 SetErrorString("BamReader::CreateIndex", "cannot create index on unopened BAM file");
82 // attempt to create index
83 if ( m_randomAccessController.CreateIndex(this, type) )
86 const string bracError = m_randomAccessController.GetErrorString();
87 const string message = string("could not create index: \n\t") + bracError;
88 SetErrorString("BamReader::CreateIndex", message);
93 // return path & filename of current BAM file
94 const string BamReaderPrivate::Filename(void) const {
98 string BamReaderPrivate::GetErrorString(void) const {
102 // return header data as std::string
103 string BamReaderPrivate::GetHeaderText(void) const {
104 return m_header.ToString();
107 // return header data as SamHeader object
108 SamHeader BamReaderPrivate::GetSamHeader(void) const {
109 return m_header.ToSamHeader();
112 // get next alignment (with character data fully parsed)
113 bool BamReaderPrivate::GetNextAlignment(BamAlignment& alignment) {
115 // if valid alignment found
116 if ( GetNextAlignmentCore(alignment) ) {
118 // store alignment's "source" filename
119 alignment.Filename = m_filename;
121 // return success/failure of parsing char data
122 if ( alignment.BuildCharData() )
125 const string alError = alignment.GetErrorString();
126 const string message = string("could not populate alignment data: \n\t") + alError;
127 SetErrorString("BamReader::GetNextAlignment", message);
132 // no valid alignment found
136 // retrieves next available alignment core data (returns success/fail)
137 // ** DOES NOT populate any character data fields (read name, bases, qualities, tag data, filename)
138 // these can be accessed, if necessary, from the supportData
139 // useful for operations requiring ONLY positional or other alignment-related information
140 bool BamReaderPrivate::GetNextAlignmentCore(BamAlignment& alignment) {
142 // skip if stream not opened
143 if ( !m_stream.IsOpen() )
148 // skip if region is set but has no alignments
149 if ( m_randomAccessController.HasRegion() &&
150 !m_randomAccessController.RegionHasAlignments() )
155 // if can't read next alignment
156 if ( !LoadNextAlignment(alignment) )
159 // check alignment's region-overlap state
160 BamRandomAccessController::RegionState state = m_randomAccessController.AlignmentState(alignment);
162 // if alignment starts after region, no need to keep reading
163 if ( state == BamRandomAccessController::AfterRegion )
166 // read until overlap is found
167 while ( state != BamRandomAccessController::OverlapsRegion ) {
169 // if can't read next alignment
170 if ( !LoadNextAlignment(alignment) )
173 // check alignment's region-overlap state
174 state = m_randomAccessController.AlignmentState(alignment);
176 // if alignment starts after region, no need to keep reading
177 if ( state == BamRandomAccessController::AfterRegion )
181 // if we get here, we found the next 'valid' alignment
182 // (e.g. overlaps current region if one was set, simply the next alignment if not)
183 alignment.SupportData.HasCoreOnly = true;
186 } catch ( BamException& e ) {
187 const string streamError = e.what();
188 const string message = string("encountered error reading BAM alignment: \n\t") + streamError;
189 SetErrorString("BamReader::GetNextAlignmentCore", message);
194 int BamReaderPrivate::GetReferenceCount(void) const {
195 return m_references.size();
198 const RefVector& BamReaderPrivate::GetReferenceData(void) const {
202 // returns RefID for given RefName (returns References.size() if not found)
203 int BamReaderPrivate::GetReferenceID(const string& refName) const {
205 // retrieve names from reference data
206 vector<string> refNames;
207 RefVector::const_iterator refIter = m_references.begin();
208 RefVector::const_iterator refEnd = m_references.end();
209 for ( ; refIter != refEnd; ++refIter)
210 refNames.push_back( (*refIter).RefName );
212 // return 'index-of' refName (or -1 if not found)
213 int index = distance(refNames.begin(), find(refNames.begin(), refNames.end(), refName));
214 if ( index == (int)m_references.size() ) return -1;
218 bool BamReaderPrivate::HasIndex(void) const {
219 return m_randomAccessController.HasIndex();
222 bool BamReaderPrivate::IsOpen(void) const {
223 return m_stream.IsOpen();
226 // load BAM header data
227 void BamReaderPrivate::LoadHeaderData(void) {
228 m_header.Load(&m_stream);
231 // populates BamAlignment with alignment data under file pointer, returns success/fail
232 bool BamReaderPrivate::LoadNextAlignment(BamAlignment& alignment) {
234 // read in the 'block length' value, make sure it's not zero
235 char buffer[sizeof(uint32_t)];
236 m_stream.Read(buffer, sizeof(uint32_t));
237 alignment.SupportData.BlockLength = BamTools::UnpackUnsignedInt(buffer);
238 if ( m_isBigEndian ) BamTools::SwapEndian_32(alignment.SupportData.BlockLength);
239 if ( alignment.SupportData.BlockLength == 0 )
242 // read in core alignment data, make sure the right size of data was read
243 char x[Constants::BAM_CORE_SIZE];
244 if ( m_stream.Read(x, Constants::BAM_CORE_SIZE) != Constants::BAM_CORE_SIZE )
247 // swap core endian-ness if necessary
248 if ( m_isBigEndian ) {
249 for ( unsigned int i = 0; i < Constants::BAM_CORE_SIZE; i+=sizeof(uint32_t) )
250 BamTools::SwapEndian_32p(&x[i]);
253 // set BamAlignment 'core' and 'support' data
254 alignment.RefID = BamTools::UnpackSignedInt(&x[0]);
255 alignment.Position = BamTools::UnpackSignedInt(&x[4]);
257 unsigned int tempValue = BamTools::UnpackUnsignedInt(&x[8]);
258 alignment.Bin = tempValue >> 16;
259 alignment.MapQuality = tempValue >> 8 & 0xff;
260 alignment.SupportData.QueryNameLength = tempValue & 0xff;
262 tempValue = BamTools::UnpackUnsignedInt(&x[12]);
263 alignment.AlignmentFlag = tempValue >> 16;
264 alignment.SupportData.NumCigarOperations = tempValue & 0xffff;
266 alignment.SupportData.QuerySequenceLength = BamTools::UnpackUnsignedInt(&x[16]);
267 alignment.MateRefID = BamTools::UnpackSignedInt(&x[20]);
268 alignment.MatePosition = BamTools::UnpackSignedInt(&x[24]);
269 alignment.InsertSize = BamTools::UnpackSignedInt(&x[28]);
271 // set BamAlignment length
272 alignment.Length = alignment.SupportData.QuerySequenceLength;
274 // read in character data - make sure proper data size was read
275 bool readCharDataOK = false;
276 const unsigned int dataLength = alignment.SupportData.BlockLength - Constants::BAM_CORE_SIZE;
277 RaiiBuffer allCharData(dataLength);
279 if ( m_stream.Read(allCharData.Buffer, dataLength) == dataLength ) {
281 // store 'allCharData' in supportData structure
282 alignment.SupportData.AllCharData.assign((const char*)allCharData.Buffer, dataLength);
285 readCharDataOK = true;
288 // need to calculate this here so that BamAlignment::GetEndPosition() performs correctly,
289 // even when GetNextAlignmentCore() is called
290 const unsigned int cigarDataOffset = alignment.SupportData.QueryNameLength;
291 uint32_t* cigarData = (uint32_t*)(allCharData.Buffer + cigarDataOffset);
293 alignment.CigarData.clear();
294 alignment.CigarData.reserve(alignment.SupportData.NumCigarOperations);
295 for ( unsigned int i = 0; i < alignment.SupportData.NumCigarOperations; ++i ) {
297 // swap endian-ness if necessary
298 if ( m_isBigEndian ) BamTools::SwapEndian_32(cigarData[i]);
300 // build CigarOp structure
301 op.Length = (cigarData[i] >> Constants::BAM_CIGAR_SHIFT);
302 op.Type = Constants::BAM_CIGAR_LOOKUP[ (cigarData[i] & Constants::BAM_CIGAR_MASK) ];
305 alignment.CigarData.push_back(op);
309 // return success/failure
310 return readCharDataOK;
313 // loads reference data from BAM file
314 bool BamReaderPrivate::LoadReferenceData(void) {
316 // get number of reference sequences
317 char buffer[sizeof(uint32_t)];
318 m_stream.Read(buffer, sizeof(uint32_t));
319 uint32_t numberRefSeqs = BamTools::UnpackUnsignedInt(buffer);
320 if ( m_isBigEndian ) BamTools::SwapEndian_32(numberRefSeqs);
321 m_references.reserve((int)numberRefSeqs);
323 // iterate over all references in header
324 for ( unsigned int i = 0; i != numberRefSeqs; ++i ) {
326 // get length of reference name
327 m_stream.Read(buffer, sizeof(uint32_t));
328 uint32_t refNameLength = BamTools::UnpackUnsignedInt(buffer);
329 if ( m_isBigEndian ) BamTools::SwapEndian_32(refNameLength);
330 RaiiBuffer refName(refNameLength);
332 // get reference name and reference sequence length
333 m_stream.Read(refName.Buffer, refNameLength);
334 m_stream.Read(buffer, sizeof(int32_t));
335 int32_t refLength = BamTools::UnpackSignedInt(buffer);
336 if ( m_isBigEndian ) BamTools::SwapEndian_32(refLength);
338 // store data for reference
340 aReference.RefName = (string)((const char*)refName.Buffer);
341 aReference.RefLength = refLength;
342 m_references.push_back(aReference);
349 bool BamReaderPrivate::LocateIndex(const BamIndex::IndexType& preferredType) {
351 if ( m_randomAccessController.LocateIndex(this, preferredType) )
354 const string bracError = m_randomAccessController.GetErrorString();
355 const string message = string("could not locate index: \n\t") + bracError;
356 SetErrorString("BamReader::LocateIndex", message);
361 // opens BAM file (and index)
362 bool BamReaderPrivate::Open(const string& filename) {
366 // make sure we're starting with fresh state
370 m_stream.Open(filename, IBamIODevice::ReadOnly);
377 // store filename & offset of first alignment
378 m_filename = filename;
379 m_alignmentsBeginOffset = m_stream.Tell();
384 } catch ( BamException& e ) {
385 const string error = e.what();
386 const string message = string("could not open file: ") + filename +
388 SetErrorString("BamReader::Open", message);
393 bool BamReaderPrivate::OpenIndex(const std::string& indexFilename) {
395 if ( m_randomAccessController.OpenIndex(indexFilename, this) )
398 const string bracError = m_randomAccessController.GetErrorString();
399 const string message = string("could not open index: \n\t") + bracError;
400 SetErrorString("BamReader::OpenIndex", message);
405 // returns BAM file pointer to beginning of alignment data
406 bool BamReaderPrivate::Rewind(void) {
409 m_randomAccessController.ClearRegion();
411 // return status of seeking back to first alignment
412 if ( Seek(m_alignmentsBeginOffset) )
415 const string currentError = m_errorString;
416 const string message = string("could not rewind: \n\t") + currentError;
417 SetErrorString("BamReader::Rewind", message);
422 bool BamReaderPrivate::Seek(const int64_t& position) {
424 // skip if BAM file not open
426 SetErrorString("BamReader::Seek", "cannot seek on unopened BAM file");
431 m_stream.Seek(position);
434 catch ( BamException& e ) {
435 const string streamError = e.what();
436 const string message = string("could not seek in BAM file: \n\t") + streamError;
437 SetErrorString("BamReader::Seek", message);
442 void BamReaderPrivate::SetErrorString(const string& where, const string& what) {
443 static const string SEPARATOR = ": ";
444 m_errorString = where + SEPARATOR + what;
447 void BamReaderPrivate::SetIndex(BamIndex* index) {
448 m_randomAccessController.SetIndex(index);
451 // change the index caching behavior
452 void BamReaderPrivate::SetIndexCacheMode(const BamIndex::IndexCacheMode& mode) {
453 m_randomAccessController.SetIndexCacheMode(mode);
456 // sets current region & attempts to jump to it
457 // returns success/failure
458 bool BamReaderPrivate::SetRegion(const BamRegion& region) {
460 if ( m_randomAccessController.SetRegion(region, m_references.size()) )
463 const string bracError = m_randomAccessController.GetErrorString();
464 const string message = string("could not set region: \n\t") + bracError;
465 SetErrorString("BamReader::SetRegion", message);
470 int64_t BamReaderPrivate::Tell(void) const {
471 return m_stream.Tell();