1 // ***************************************************************************
2 // BamReader_p.cpp (c) 2009 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 10 May 2011 (DB)
6 // ---------------------------------------------------------------------------
7 // Provides the basic functionality for reading BAM files
8 // ***************************************************************************
10 #include <api/BamConstants.h>
11 #include <api/BamReader.h>
12 #include <api/IBamIODevice.h>
13 #include <api/internal/BamDeviceFactory_p.h>
14 #include <api/internal/BamHeader_p.h>
15 #include <api/internal/BamRandomAccessController_p.h>
16 #include <api/internal/BamReader_p.h>
17 #include <api/internal/BamStandardIndex_p.h>
18 #include <api/internal/BamToolsIndex_p.h>
19 #include <api/internal/BgzfStream_p.h>
20 using namespace BamTools;
21 using namespace BamTools::Internal;
30 BamReaderPrivate::BamReaderPrivate(BamReader* parent)
31 : m_alignmentsBeginOffset(0)
34 m_isBigEndian = BamTools::SystemIsBigEndian();
38 BamReaderPrivate::~BamReaderPrivate(void) {
42 // closes the BAM file
43 void BamReaderPrivate::Close(void) {
45 // clear header & reference data
50 m_randomAccessController.Close();
57 // creates an index file of requested type on current BAM file
58 bool BamReaderPrivate::CreateIndex(const BamIndex::IndexType& type) {
59 if ( !IsOpen() ) return false;
60 return m_randomAccessController.CreateIndex(this, type);
63 // return path & filename of current BAM file
64 const string BamReaderPrivate::Filename(void) const {
68 // return header data as std::string
69 string BamReaderPrivate::GetHeaderText(void) const {
70 return m_header.ToString();
73 // return header data as SamHeader object
74 SamHeader BamReaderPrivate::GetSamHeader(void) const {
75 return m_header.ToSamHeader();
78 // get next alignment (with character data fully parsed)
79 bool BamReaderPrivate::GetNextAlignment(BamAlignment& alignment) {
81 // if valid alignment found
82 if ( GetNextAlignmentCore(alignment) ) {
84 // store alignment's "source" filename
85 alignment.Filename = m_filename;
87 // return success/failure of parsing char data
88 return alignment.BuildCharData();
91 // no valid alignment found
95 // retrieves next available alignment core data (returns success/fail)
96 // ** DOES NOT populate any character data fields (read name, bases, qualities, tag data, filename)
97 // these can be accessed, if necessary, from the supportData
98 // useful for operations requiring ONLY positional or other alignment-related information
99 bool BamReaderPrivate::GetNextAlignmentCore(BamAlignment& alignment) {
101 if ( !m_stream.IsOpen() )
104 // skip if region is set but has no alignments
105 if ( m_randomAccessController.HasRegion() &&
106 !m_randomAccessController.RegionHasAlignments() )
111 // if can't read next alignment
112 if ( !LoadNextAlignment(alignment) )
115 // check alignment's region-overlap state
116 BamRandomAccessController::RegionState state = m_randomAccessController.AlignmentState(alignment);
118 // if alignment starts after region, no need to keep reading
119 if ( state == BamRandomAccessController::AfterRegion )
122 // read until overlap is found
123 while ( state != BamRandomAccessController::OverlapsRegion ) {
125 // if can't read next alignment
126 if ( !LoadNextAlignment(alignment) )
129 // check alignment's region-overlap state
130 state = m_randomAccessController.AlignmentState(alignment);
132 // if alignment starts after region, no need to keep reading
133 if ( state == BamRandomAccessController::AfterRegion )
137 // if we get here, we found the next 'valid' alignment
138 // (e.g. overlaps current region if one was set, simply the next alignment if not)
139 alignment.SupportData.HasCoreOnly = true;
143 int BamReaderPrivate::GetReferenceCount(void) const {
144 return m_references.size();
147 const RefVector& BamReaderPrivate::GetReferenceData(void) const {
151 // returns RefID for given RefName (returns References.size() if not found)
152 int BamReaderPrivate::GetReferenceID(const string& refName) const {
154 // retrieve names from reference data
155 vector<string> refNames;
156 RefVector::const_iterator refIter = m_references.begin();
157 RefVector::const_iterator refEnd = m_references.end();
158 for ( ; refIter != refEnd; ++refIter)
159 refNames.push_back( (*refIter).RefName );
161 // return 'index-of' refName (or -1 if not found)
162 int index = distance(refNames.begin(), find(refNames.begin(), refNames.end(), refName));
163 if ( index == (int)m_references.size() ) return -1;
167 bool BamReaderPrivate::HasIndex(void) const {
168 return m_randomAccessController.HasIndex();
171 bool BamReaderPrivate::IsOpen(void) const {
172 return m_stream.IsOpen();
175 // load BAM header data
176 bool BamReaderPrivate::LoadHeaderData(void) {
177 return m_header.Load(&m_stream);
180 // populates BamAlignment with alignment data under file pointer, returns success/fail
181 bool BamReaderPrivate::LoadNextAlignment(BamAlignment& alignment) {
183 // read in the 'block length' value, make sure it's not zero
184 char buffer[sizeof(uint32_t)];
185 m_stream.Read(buffer, sizeof(uint32_t));
186 alignment.SupportData.BlockLength = BamTools::UnpackUnsignedInt(buffer);
187 if ( m_isBigEndian ) BamTools::SwapEndian_32(alignment.SupportData.BlockLength);
188 if ( alignment.SupportData.BlockLength == 0 ) return false;
190 // read in core alignment data, make sure the right size of data was read
191 char x[Constants::BAM_CORE_SIZE];
192 if ( m_stream.Read(x, Constants::BAM_CORE_SIZE) != Constants::BAM_CORE_SIZE )
195 // swap core endian-ness if necessary
196 if ( m_isBigEndian ) {
197 for ( unsigned int i = 0; i < Constants::BAM_CORE_SIZE; i+=sizeof(uint32_t) )
198 BamTools::SwapEndian_32p(&x[i]);
201 // set BamAlignment 'core' and 'support' data
202 alignment.RefID = BamTools::UnpackSignedInt(&x[0]);
203 alignment.Position = BamTools::UnpackSignedInt(&x[4]);
205 unsigned int tempValue = BamTools::UnpackUnsignedInt(&x[8]);
206 alignment.Bin = tempValue >> 16;
207 alignment.MapQuality = tempValue >> 8 & 0xff;
208 alignment.SupportData.QueryNameLength = tempValue & 0xff;
210 tempValue = BamTools::UnpackUnsignedInt(&x[12]);
211 alignment.AlignmentFlag = tempValue >> 16;
212 alignment.SupportData.NumCigarOperations = tempValue & 0xffff;
214 alignment.SupportData.QuerySequenceLength = BamTools::UnpackUnsignedInt(&x[16]);
215 alignment.MateRefID = BamTools::UnpackSignedInt(&x[20]);
216 alignment.MatePosition = BamTools::UnpackSignedInt(&x[24]);
217 alignment.InsertSize = BamTools::UnpackSignedInt(&x[28]);
219 // set BamAlignment length
220 alignment.Length = alignment.SupportData.QuerySequenceLength;
222 // read in character data - make sure proper data size was read
223 bool readCharDataOK = false;
224 const unsigned int dataLength = alignment.SupportData.BlockLength - Constants::BAM_CORE_SIZE;
225 char* allCharData = (char*)calloc(sizeof(char), dataLength);
227 if ( m_stream.Read(allCharData, dataLength) == dataLength ) {
229 // store 'allCharData' in supportData structure
230 alignment.SupportData.AllCharData.assign((const char*)allCharData, dataLength);
233 readCharDataOK = true;
236 // need to calculate this here so that BamAlignment::GetEndPosition() performs correctly,
237 // even when GetNextAlignmentCore() is called
238 const unsigned int cigarDataOffset = alignment.SupportData.QueryNameLength;
239 uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset);
241 alignment.CigarData.clear();
242 alignment.CigarData.reserve(alignment.SupportData.NumCigarOperations);
243 for ( unsigned int i = 0; i < alignment.SupportData.NumCigarOperations; ++i ) {
245 // swap endian-ness if necessary
246 if ( m_isBigEndian ) BamTools::SwapEndian_32(cigarData[i]);
248 // build CigarOp structure
249 op.Length = (cigarData[i] >> Constants::BAM_CIGAR_SHIFT);
250 op.Type = Constants::BAM_CIGAR_LOOKUP[ (cigarData[i] & Constants::BAM_CIGAR_MASK) ];
253 alignment.CigarData.push_back(op);
257 // clean up & return parsing success/failure
259 return readCharDataOK;
262 // loads reference data from BAM file
263 bool BamReaderPrivate::LoadReferenceData(void) {
265 // get number of reference sequences
266 char buffer[sizeof(uint32_t)];
267 m_stream.Read(buffer, sizeof(uint32_t));
268 uint32_t numberRefSeqs = BamTools::UnpackUnsignedInt(buffer);
269 if ( m_isBigEndian ) BamTools::SwapEndian_32(numberRefSeqs);
270 m_references.reserve((int)numberRefSeqs);
272 // iterate over all references in header
273 for ( unsigned int i = 0; i != numberRefSeqs; ++i ) {
275 // get length of reference name
276 m_stream.Read(buffer, sizeof(uint32_t));
277 uint32_t refNameLength = BamTools::UnpackUnsignedInt(buffer);
278 if ( m_isBigEndian ) BamTools::SwapEndian_32(refNameLength);
279 char* refName = (char*)calloc(refNameLength, 1);
281 // get reference name and reference sequence length
282 m_stream.Read(refName, refNameLength);
283 m_stream.Read(buffer, sizeof(int32_t));
284 int32_t refLength = BamTools::UnpackSignedInt(buffer);
285 if ( m_isBigEndian ) BamTools::SwapEndian_32(refLength);
287 // store data for reference
289 aReference.RefName = (string)((const char*)refName);
290 aReference.RefLength = refLength;
291 m_references.push_back(aReference);
293 // clean up calloc-ed temp variable
301 bool BamReaderPrivate::LocateIndex(const BamIndex::IndexType& preferredType) {
302 return m_randomAccessController.LocateIndex(this, preferredType);
305 // opens BAM file (and index)
306 bool BamReaderPrivate::Open(const string& filename) {
308 // make sure we're starting with a fresh slate
311 // attempt to open BgzfStream for reading
312 if ( !m_stream.Open(filename, IBamIODevice::ReadOnly) ) {
313 cerr << "BamReader ERROR: Could not open BGZF stream for " << filename << endl;
317 // attempt to load header data
318 if ( !LoadHeaderData() ) {
319 cerr << "BamReader ERROR: Could not load header data for " << filename << endl;
324 // attempt to load reference data
325 if ( !LoadReferenceData() ) {
326 cerr << "BamReader ERROR: Could not load reference data for " << filename << endl;
331 // if all OK, store filename & offset of first alignment
332 m_filename = filename;
333 m_alignmentsBeginOffset = m_stream.Tell();
339 bool BamReaderPrivate::OpenIndex(const std::string& indexFilename) {
340 return m_randomAccessController.OpenIndex(indexFilename, this);
343 // returns BAM file pointer to beginning of alignment data
344 bool BamReaderPrivate::Rewind(void) {
346 if ( !m_stream.IsOpen() ) {
347 cerr << "BRP::Rewind() - stream not open" << endl;
351 // attempt rewind to first alignment
352 if ( !m_stream.Seek(m_alignmentsBeginOffset) ) {
353 cerr << "BRP::Rewind() - could not seek to ABO (1st time)" << endl;
357 // verify that we can read first alignment
359 if ( !LoadNextAlignment(al) ) {
360 cerr << "BRP::Rewind() - could not read first alignment" << endl;
365 m_randomAccessController.ClearRegion();
367 // rewind back to beginning of first alignment
368 // return success/fail of seek
369 return m_stream.Seek(m_alignmentsBeginOffset);
372 bool BamReaderPrivate::Seek(const int64_t& position) {
373 return m_stream.Seek(position);
376 void BamReaderPrivate::SetIndex(BamIndex* index) {
377 m_randomAccessController.SetIndex(index);
380 // change the index caching behavior
381 void BamReaderPrivate::SetIndexCacheMode(const BamIndex::IndexCacheMode& mode) {
382 m_randomAccessController.SetIndexCacheMode(mode);
385 // sets current region & attempts to jump to it
386 // returns success/failure
387 bool BamReaderPrivate::SetRegion(const BamRegion& region) {
388 return m_randomAccessController.SetRegion(this, region, m_references.size());
391 int64_t BamReaderPrivate::Tell(void) const {
392 return m_stream.Tell();