1 // ***************************************************************************
2 // BamReader_p.cpp (c) 2009 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 10 May 2011 (DB)
7 // ---------------------------------------------------------------------------
8 // Provides the basic functionality for reading BAM files
9 // ***************************************************************************
11 #include <api/BamConstants.h>
12 #include <api/BamReader.h>
13 #include <api/IBamIODevice.h>
14 #include <api/internal/BamDeviceFactory_p.h>
15 #include <api/internal/BamHeader_p.h>
16 #include <api/internal/BamRandomAccessController_p.h>
17 #include <api/internal/BamReader_p.h>
18 #include <api/internal/BamStandardIndex_p.h>
19 #include <api/internal/BamToolsIndex_p.h>
20 #include <api/internal/BgzfStream_p.h>
21 using namespace BamTools;
22 using namespace BamTools::Internal;
31 BamReaderPrivate::BamReaderPrivate(BamReader* parent)
32 : m_alignmentsBeginOffset(0)
35 m_isBigEndian = BamTools::SystemIsBigEndian();
39 BamReaderPrivate::~BamReaderPrivate(void) {
43 // closes the BAM file
44 void BamReaderPrivate::Close(void) {
46 // clear header & reference data
51 m_randomAccessController.Close();
58 // creates an index file of requested type on current BAM file
59 bool BamReaderPrivate::CreateIndex(const BamIndex::IndexType& type) {
60 if ( !IsOpen() ) return false;
61 return m_randomAccessController.CreateIndex(this, type);
64 // return path & filename of current BAM file
65 const string BamReaderPrivate::Filename(void) const {
69 // return header data as std::string
70 string BamReaderPrivate::GetHeaderText(void) const {
71 return m_header.ToString();
74 // return header data as SamHeader object
75 SamHeader BamReaderPrivate::GetSamHeader(void) const {
76 return m_header.ToSamHeader();
79 // get next alignment (with character data fully parsed)
80 bool BamReaderPrivate::GetNextAlignment(BamAlignment& alignment) {
82 // if valid alignment found
83 if ( GetNextAlignmentCore(alignment) ) {
85 // store alignment's "source" filename
86 alignment.Filename = m_filename;
88 // return success/failure of parsing char data
89 return alignment.BuildCharData();
92 // no valid alignment found
96 // retrieves next available alignment core data (returns success/fail)
97 // ** DOES NOT populate any character data fields (read name, bases, qualities, tag data, filename)
98 // these can be accessed, if necessary, from the supportData
99 // useful for operations requiring ONLY positional or other alignment-related information
100 bool BamReaderPrivate::GetNextAlignmentCore(BamAlignment& alignment) {
102 if ( !m_stream.IsOpen() )
105 // skip if region is set but has no alignments
106 if ( m_randomAccessController.HasRegion() &&
107 !m_randomAccessController.RegionHasAlignments() )
112 // if can't read next alignment
113 if ( !LoadNextAlignment(alignment) )
116 // check alignment's region-overlap state
117 BamRandomAccessController::RegionState state = m_randomAccessController.AlignmentState(alignment);
119 // if alignment starts after region, no need to keep reading
120 if ( state == BamRandomAccessController::AfterRegion )
123 // read until overlap is found
124 while ( state != BamRandomAccessController::OverlapsRegion ) {
126 // if can't read next alignment
127 if ( !LoadNextAlignment(alignment) )
130 // check alignment's region-overlap state
131 state = m_randomAccessController.AlignmentState(alignment);
133 // if alignment starts after region, no need to keep reading
134 if ( state == BamRandomAccessController::AfterRegion )
138 // if we get here, we found the next 'valid' alignment
139 // (e.g. overlaps current region if one was set, simply the next alignment if not)
140 alignment.SupportData.HasCoreOnly = true;
144 int BamReaderPrivate::GetReferenceCount(void) const {
145 return m_references.size();
148 const RefVector& BamReaderPrivate::GetReferenceData(void) const {
152 // returns RefID for given RefName (returns References.size() if not found)
153 int BamReaderPrivate::GetReferenceID(const string& refName) const {
155 // retrieve names from reference data
156 vector<string> refNames;
157 RefVector::const_iterator refIter = m_references.begin();
158 RefVector::const_iterator refEnd = m_references.end();
159 for ( ; refIter != refEnd; ++refIter)
160 refNames.push_back( (*refIter).RefName );
162 // return 'index-of' refName (or -1 if not found)
163 int index = distance(refNames.begin(), find(refNames.begin(), refNames.end(), refName));
164 if ( index == (int)m_references.size() ) return -1;
168 bool BamReaderPrivate::HasIndex(void) const {
169 return m_randomAccessController.HasIndex();
172 bool BamReaderPrivate::IsOpen(void) const {
173 return m_stream.IsOpen();
176 // load BAM header data
177 bool BamReaderPrivate::LoadHeaderData(void) {
178 return m_header.Load(&m_stream);
181 // populates BamAlignment with alignment data under file pointer, returns success/fail
182 bool BamReaderPrivate::LoadNextAlignment(BamAlignment& alignment) {
184 // read in the 'block length' value, make sure it's not zero
185 char buffer[sizeof(uint32_t)];
186 m_stream.Read(buffer, sizeof(uint32_t));
187 alignment.SupportData.BlockLength = BamTools::UnpackUnsignedInt(buffer);
188 if ( m_isBigEndian ) BamTools::SwapEndian_32(alignment.SupportData.BlockLength);
189 if ( alignment.SupportData.BlockLength == 0 ) return false;
191 // read in core alignment data, make sure the right size of data was read
192 char x[Constants::BAM_CORE_SIZE];
193 if ( m_stream.Read(x, Constants::BAM_CORE_SIZE) != Constants::BAM_CORE_SIZE )
196 // swap core endian-ness if necessary
197 if ( m_isBigEndian ) {
198 for ( unsigned int i = 0; i < Constants::BAM_CORE_SIZE; i+=sizeof(uint32_t) )
199 BamTools::SwapEndian_32p(&x[i]);
202 // set BamAlignment 'core' and 'support' data
203 alignment.RefID = BamTools::UnpackSignedInt(&x[0]);
204 alignment.Position = BamTools::UnpackSignedInt(&x[4]);
206 unsigned int tempValue = BamTools::UnpackUnsignedInt(&x[8]);
207 alignment.Bin = tempValue >> 16;
208 alignment.MapQuality = tempValue >> 8 & 0xff;
209 alignment.SupportData.QueryNameLength = tempValue & 0xff;
211 tempValue = BamTools::UnpackUnsignedInt(&x[12]);
212 alignment.AlignmentFlag = tempValue >> 16;
213 alignment.SupportData.NumCigarOperations = tempValue & 0xffff;
215 alignment.SupportData.QuerySequenceLength = BamTools::UnpackUnsignedInt(&x[16]);
216 alignment.MateRefID = BamTools::UnpackSignedInt(&x[20]);
217 alignment.MatePosition = BamTools::UnpackSignedInt(&x[24]);
218 alignment.InsertSize = BamTools::UnpackSignedInt(&x[28]);
220 // set BamAlignment length
221 alignment.Length = alignment.SupportData.QuerySequenceLength;
223 // read in character data - make sure proper data size was read
224 bool readCharDataOK = false;
225 const unsigned int dataLength = alignment.SupportData.BlockLength - Constants::BAM_CORE_SIZE;
226 char* allCharData = (char*)calloc(sizeof(char), dataLength);
228 if ( m_stream.Read(allCharData, dataLength) == dataLength ) {
230 // store 'allCharData' in supportData structure
231 alignment.SupportData.AllCharData.assign((const char*)allCharData, dataLength);
234 readCharDataOK = true;
237 // need to calculate this here so that BamAlignment::GetEndPosition() performs correctly,
238 // even when GetNextAlignmentCore() is called
239 const unsigned int cigarDataOffset = alignment.SupportData.QueryNameLength;
240 uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset);
242 alignment.CigarData.clear();
243 alignment.CigarData.reserve(alignment.SupportData.NumCigarOperations);
244 for ( unsigned int i = 0; i < alignment.SupportData.NumCigarOperations; ++i ) {
246 // swap endian-ness if necessary
247 if ( m_isBigEndian ) BamTools::SwapEndian_32(cigarData[i]);
249 // build CigarOp structure
250 op.Length = (cigarData[i] >> Constants::BAM_CIGAR_SHIFT);
251 op.Type = Constants::BAM_CIGAR_LOOKUP[ (cigarData[i] & Constants::BAM_CIGAR_MASK) ];
254 alignment.CigarData.push_back(op);
258 // clean up & return parsing success/failure
260 return readCharDataOK;
263 // loads reference data from BAM file
264 bool BamReaderPrivate::LoadReferenceData(void) {
266 // get number of reference sequences
267 char buffer[sizeof(uint32_t)];
268 m_stream.Read(buffer, sizeof(uint32_t));
269 uint32_t numberRefSeqs = BamTools::UnpackUnsignedInt(buffer);
270 if ( m_isBigEndian ) BamTools::SwapEndian_32(numberRefSeqs);
271 m_references.reserve((int)numberRefSeqs);
273 // iterate over all references in header
274 for ( unsigned int i = 0; i != numberRefSeqs; ++i ) {
276 // get length of reference name
277 m_stream.Read(buffer, sizeof(uint32_t));
278 uint32_t refNameLength = BamTools::UnpackUnsignedInt(buffer);
279 if ( m_isBigEndian ) BamTools::SwapEndian_32(refNameLength);
280 char* refName = (char*)calloc(refNameLength, 1);
282 // get reference name and reference sequence length
283 m_stream.Read(refName, refNameLength);
284 m_stream.Read(buffer, sizeof(int32_t));
285 int32_t refLength = BamTools::UnpackSignedInt(buffer);
286 if ( m_isBigEndian ) BamTools::SwapEndian_32(refLength);
288 // store data for reference
290 aReference.RefName = (string)((const char*)refName);
291 aReference.RefLength = refLength;
292 m_references.push_back(aReference);
294 // clean up calloc-ed temp variable
302 bool BamReaderPrivate::LocateIndex(const BamIndex::IndexType& preferredType) {
303 return m_randomAccessController.LocateIndex(this, preferredType);
306 // opens BAM file (and index)
307 bool BamReaderPrivate::Open(const string& filename) {
309 // make sure we're starting with a fresh slate
312 // attempt to open BgzfStream for reading
313 if ( !m_stream.Open(filename, IBamIODevice::ReadOnly) ) {
314 cerr << "BamReader ERROR: Could not open BGZF stream for " << filename << endl;
318 // attempt to load header data
319 if ( !LoadHeaderData() ) {
320 cerr << "BamReader ERROR: Could not load header data for " << filename << endl;
325 // attempt to load reference data
326 if ( !LoadReferenceData() ) {
327 cerr << "BamReader ERROR: Could not load reference data for " << filename << endl;
332 // if all OK, store filename & offset of first alignment
333 m_filename = filename;
334 m_alignmentsBeginOffset = m_stream.Tell();
340 bool BamReaderPrivate::OpenIndex(const std::string& indexFilename) {
341 return m_randomAccessController.OpenIndex(indexFilename, this);
344 // returns BAM file pointer to beginning of alignment data
345 bool BamReaderPrivate::Rewind(void) {
347 if ( !m_stream.IsOpen() )
350 // attempt rewind to first alignment
351 if ( !m_stream.Seek(m_alignmentsBeginOffset) )
354 // verify that we can read first alignment
356 if ( !LoadNextAlignment(al) )
360 m_randomAccessController.ClearRegion();
362 // rewind back to beginning of first alignment
363 // return success/fail of seek
364 return m_stream.Seek(m_alignmentsBeginOffset);
367 bool BamReaderPrivate::Seek(const int64_t& position) {
368 return m_stream.Seek(position);
371 void BamReaderPrivate::SetIndex(BamIndex* index) {
372 m_randomAccessController.SetIndex(index);
375 // change the index caching behavior
376 void BamReaderPrivate::SetIndexCacheMode(const BamIndex::IndexCacheMode& mode) {
377 m_randomAccessController.SetIndexCacheMode(mode);
380 // sets current region & attempts to jump to it
381 // returns success/failure
382 bool BamReaderPrivate::SetRegion(const BamRegion& region) {
383 return m_randomAccessController.SetRegion(this, region, m_references.size());
386 int64_t BamReaderPrivate::Tell(void) const {
387 return m_stream.Tell();