1 // ***************************************************************************
2 // BamMultiReader.cpp (c) 2010 Erik Garrison, Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 14 January 2013 (DB)
6 // ---------------------------------------------------------------------------
7 // Convenience class for reading multiple BAM files.
9 // This functionality allows applications to work on very large sets of files
10 // without requiring intermediate merge, sort, and index steps for each file
11 // subset. It also improves the performance of our merge system as it
12 // precludes the need to sort merged files.
13 // ***************************************************************************
15 #include "api/BamMultiReader.h"
16 #include "api/internal/bam/BamMultiReader_p.h"
17 using namespace BamTools;
23 /*! \class BamTools::BamMultiReader
24 \brief Convenience class for reading multiple BAM files.
27 /*! \enum BamMultiReader::MergeOrder
28 \brief Strategy used to decide which BAM file supplies the next alignment when merging.
30 /*! \var BamMultiReader::MergeOrder BamMultiReader::MergeByCoordinate
31 \brief Merge alignments from the open files in coordinate-sorted order.
33 /*! \var BamMultiReader::MergeOrder BamMultiReader::MergeByName
39 /*! \fn BamMultiReader::BamMultiReader(void)
// Constructs the multi-reader by pointing `d` at a freshly allocated
// Internal::BamMultiReaderPrivate, the object the other methods in this file forward to.
// NOTE(review): the constructor body is not visible in this listing (embedded numbering skips 44-45).
42 BamMultiReader::BamMultiReader(void)
43 : d(new Internal::BamMultiReaderPrivate)
46 /*! \fn BamMultiReader::~BamMultiReader(void)
// Destructor.
// NOTE(review): the body is not visible in this listing (embedded numbering skips 50-53);
// presumably it releases the object allocated for `d` in the constructor - confirm.
49 BamMultiReader::~BamMultiReader(void) {
54 /*! \fn bool BamMultiReader::Close(void)
55 \brief Closes all open BAM files.
57 Also clears out all header and reference data.
59 \sa CloseFile(), IsOpen(), Open(), BamReader::Close()
// Takes no arguments and is declared to return bool.
// NOTE(review): the body is not visible in this listing (embedded numbering skips 62-64);
// presumably it forwards to d->Close() like the sibling wrappers - confirm.
61 bool BamMultiReader::Close(void) {
65 /*! \fn bool BamMultiReader::CloseFile(const std::string& filename)
66 \brief Closes requested BAM file.
68 Leaves any other file(s) open, along with header and reference data.
70 \param[in] filename name of specific BAM file to close
72 \sa Close(), IsOpen(), Open(), BamReader::Close()
// Thin wrapper: hands `filename` to the private implementation object `d`
// and returns the bool that d->CloseFile() reports.
74 bool BamMultiReader::CloseFile(const std::string& filename) {
75 return d->CloseFile(filename);
78 /*! \fn bool BamMultiReader::CreateIndexes(const BamIndex::IndexType& type)
79 \brief Creates index files for the current BAM files.
81 \param[in] type file format to create, see BamIndex::IndexType for available formats
82 \return \c true if index files created OK
83 \sa LocateIndexes(), OpenIndexes(), BamReader::CreateIndex()
// Thin wrapper: passes the requested index type to d->CreateIndexes()
// and returns its result unchanged.
85 bool BamMultiReader::CreateIndexes(const BamIndex::IndexType& type) {
86 return d->CreateIndexes(type);
89 /*! \fn const std::vector<std::string> BamMultiReader::Filenames(void) const
90 \brief Returns list of filenames for all open BAM files.
92 Retrieved filenames will contain whatever was passed via Open().
93 If you need full directory paths here, be sure to include them
94 when you open the BAM files.
96 \returns names of open BAM files. If no files are open, returns an empty vector.
97 \sa IsOpen(), BamReader::GetFilename()
// Thin wrapper around d->Filenames(). The list is handed back by value
// (return type is a const std::vector<std::string>, not a reference).
99 const std::vector<std::string> BamMultiReader::Filenames(void) const {
100 return d->Filenames();
103 /*! \fn std::string BamMultiReader::GetErrorString(void) const
104 \brief Returns a human-readable description of the last error that occurred
106 This method allows elimination of STDERR pollution. Developers of client code
107 may choose how the messages are displayed to the user, if at all.
109 \return error description
// Thin wrapper: returns whatever string d->GetErrorString() produces, by value.
111 std::string BamMultiReader::GetErrorString(void) const {
112 return d->GetErrorString();
115 /*! \fn SamHeader BamMultiReader::GetHeader(void) const
116 \brief Returns unified SAM-format header for all files
118 \note Modifying the retrieved text does NOT affect the current
119 BAM files. These files have been opened in a read-only mode. However,
120 your modified header text can be used in conjunction with BamWriter
121 to generate a new BAM file with the appropriate header information.
123 \returns header data wrapped in SamHeader object
124 \sa GetHeaderText(), BamReader::GetHeader()
// Thin wrapper: returns the SamHeader obtained from d->GetHeader(), by value.
126 SamHeader BamMultiReader::GetHeader(void) const {
127 return d->GetHeader();
130 /*! \fn std::string BamMultiReader::GetHeaderText(void) const
131 \brief Returns unified SAM-format header text for all files
133 \note Modifying the retrieved text does NOT affect the current
134 BAM files. These files have been opened in a read-only mode. However,
135 your modified header text can be used in conjunction with BamWriter
136 to generate a new BAM file with the appropriate header information.
138 \returns SAM-formatted header text
139 \sa GetHeader(), BamReader::GetHeaderText()
// Thin wrapper: returns the header text obtained from d->GetHeaderText(), by value.
141 std::string BamMultiReader::GetHeaderText(void) const {
142 return d->GetHeaderText();
145 /*! \fn BamMultiReader::MergeOrder BamMultiReader::GetMergeOrder(void) const
146 \brief Returns current merge order strategy.
148 \returns current merge order enum value
149 \sa BamMultiReader::MergeOrder, SetExplicitMergeOrder()
// Thin wrapper: reports the MergeOrder value returned by d->GetMergeOrder().
151 BamMultiReader::MergeOrder BamMultiReader::GetMergeOrder(void) const {
152 return d->GetMergeOrder();
155 /*! \fn bool BamMultiReader::GetNextAlignment(BamAlignment& alignment)
156 \brief Retrieves next available alignment.
158 Equivalent to BamReader::GetNextAlignment() with respect to what is a valid
159 overlapping alignment and what data gets populated.
161 This method takes care of determining which alignment actually is 'next'
162 across multiple files, depending on their sort order.
164 \param[out] alignment destination for alignment record data
165 \returns \c true if a valid alignment was found
166 \sa GetNextAlignmentCore(), SetExplicitMergeOrder(), SetRegion(), BamReader::GetNextAlignment()
// Thin wrapper: passes the caller's BamAlignment (non-const reference) straight to
// d->GetNextAlignment() and returns its bool result.
// Note: the parameter is named `nextAlignment` here, while the Doxygen block above
// documents it as `alignment`.
168 bool BamMultiReader::GetNextAlignment(BamAlignment& nextAlignment) {
169 return d->GetNextAlignment(nextAlignment);
172 /*! \fn bool BamMultiReader::GetNextAlignmentCore(BamAlignment& alignment)
173 \brief Retrieves next available alignment.
175 Equivalent to BamReader::GetNextAlignmentCore() with respect to what is a valid
176 overlapping alignment and what data gets populated.
178 This method takes care of determining which alignment actually is 'next'
179 across multiple files, depending on their sort order.
181 \param[out] alignment destination for alignment record data
182 \returns \c true if a valid alignment was found
183 \sa GetNextAlignment(), SetExplicitMergeOrder(), SetRegion(), BamReader::GetNextAlignmentCore()
// Thin wrapper: passes the caller's BamAlignment (non-const reference) straight to
// d->GetNextAlignmentCore() and returns its bool result.
// Note: the parameter is named `nextAlignment` here, while the Doxygen block above
// documents it as `alignment`.
185 bool BamMultiReader::GetNextAlignmentCore(BamAlignment& nextAlignment) {
186 return d->GetNextAlignmentCore(nextAlignment);
189 /*! \fn int BamMultiReader::GetReferenceCount(void) const
190 \brief Returns number of reference sequences.
191 \sa BamReader::GetReferenceCount()
// Thin wrapper: returns the int reported by d->GetReferenceCount().
193 int BamMultiReader::GetReferenceCount(void) const {
194 return d->GetReferenceCount();
197 /*! \fn const RefVector BamMultiReader::GetReferenceData(void) const
198 \brief Returns all reference sequence entries.
199 \sa RefData, BamReader::GetReferenceData()
// Thin wrapper around d->GetReferenceData(). The RefVector is handed back by value
// (the return type is not a reference).
201 const BamTools::RefVector BamMultiReader::GetReferenceData(void) const {
202 return d->GetReferenceData();
205 /*! \fn int BamMultiReader::GetReferenceID(const std::string& refName) const
206 \brief Returns the ID of the reference with this name.
208 If \a refName is not found, returns -1.
210 \param[in] refName name of reference to look up
211 \sa BamReader::GetReferenceID()
// Thin wrapper: looks up `refName` via d->GetReferenceID() and returns that int unchanged.
213 int BamMultiReader::GetReferenceID(const std::string& refName) const {
214 return d->GetReferenceID(refName);
217 /*! \fn bool BamMultiReader::HasIndexes(void) const
218 \brief Returns \c true if all BAM files have index data available.
219 \sa BamReader::HasIndex()
// Thin wrapper: returns the bool reported by d->HasIndexes().
221 bool BamMultiReader::HasIndexes(void) const {
222 return d->HasIndexes();
225 /*! \fn bool BamMultiReader::HasOpenReaders(void) const
226 \brief Returns \c true if there are any open BAM files.
// Thin wrapper: returns the bool reported by d->HasOpenReaders().
228 bool BamMultiReader::HasOpenReaders(void) const {
229 return d->HasOpenReaders();
232 /*! \fn bool BamMultiReader::Jump(int refID, int position)
233 \brief Performs a random-access jump within current BAM files.
235 This is a convenience method, equivalent to calling SetRegion()
236 with only a left boundary specified.
238 \param[in] refID ID of reference to jump to
239 \param[in] position (0-based) left boundary
241 \returns \c true if jump was successful
242 \sa HasIndexes(), BamReader::Jump()
// Thin wrapper: passes `refID` and `position` through to d->Jump() unchanged
// and returns its bool result.
245 bool BamMultiReader::Jump(int refID, int position) {
246 return d->Jump(refID, position);
249 /*! \fn bool BamMultiReader::LocateIndexes(const BamIndex::IndexType& preferredType)
250 \brief Looks for index files that match current BAM files.
252 Use this function when you need index files, and perhaps have a
253 preferred index format, but do not depend heavily on which indexes
254 actually get loaded at runtime.
256 For each BAM file, this function will defer to your \a preferredType
257 whenever possible. However, if an index file of \a preferredType can
258 not be found, then it will look for any other index file that matches
261 An example case would look like this:
263 BamMultiReader reader;
267 // ensure that all files have an index
268 if ( !reader.LocateIndexes() ) // opens any existing index files that match our BAM files
269 reader.CreateIndexes(); // creates index files for any BAM files that still lack one
271 // do interesting stuff using random-access...
275 If you want precise control over which index files are loaded, use OpenIndexes()
276 with the desired index filenames. If that function returns false, you can use
277 CreateIndexes() to then build index files of the exact requested format.
279 \param[in] preferredType desired index file format, see BamIndex::IndexType for available formats
280 \returns \c true if index files could be found for \b ALL open BAM files
281 \sa BamReader::LocateIndex()
// Thin wrapper: forwards `preferredType` to d->LocateIndexes() and returns its bool result.
283 bool BamMultiReader::LocateIndexes(const BamIndex::IndexType& preferredType) {
284 return d->LocateIndexes(preferredType);
287 /*! \fn bool BamMultiReader::Open(const std::vector<std::string>& filenames)
288 \brief Opens BAM files.
290 \note Opening BAM files will invalidate any current region set on the multireader.
291 All file pointers will be returned to the beginning of the alignment data. Follow
292 this with Jump() or SetRegion() to establish a region of interest.
294 \param[in] filenames list of BAM filenames to open
295 \returns \c true if BAM files were opened successfully
296 \sa Close(), HasOpenReaders(), OpenFile(), OpenIndexes(), BamReader::Open()
// Thin wrapper: forwards the whole filename list to d->Open() and returns its bool result.
298 bool BamMultiReader::Open(const std::vector<std::string>& filenames) {
299 return d->Open(filenames);
302 /*! \fn bool BamMultiReader::OpenFile(const std::string& filename)
303 \brief Opens a single BAM file.
305 Adds another BAM file to multireader "on-the-fly".
307 \note Opening a BAM file will invalidate any current region set on the multireader.
308 All file pointers will be returned to the beginning of the alignment data. Follow
309 this with Jump() or SetRegion() to establish a region of interest.
311 \param[in] filename BAM filename to open
312 \returns \c true if BAM file was opened successfully
313 \sa Close(), HasOpenReaders(), Open(), OpenIndexes(), BamReader::Open()
// Thin wrapper: forwards the single `filename` to d->OpenFile() and returns its bool result.
315 bool BamMultiReader::OpenFile(const std::string& filename) {
316 return d->OpenFile(filename);
319 /*! \fn bool BamMultiReader::OpenIndexes(const std::vector<std::string>& indexFilenames)
320 \brief Opens index files for current BAM files.
322 \note Currently assumes that index filenames match the order (and number) of
323 BAM files passed to Open().
325 \param[in] indexFilenames list of BAM index file names
326 \returns \c true if BAM index file was opened & data loaded successfully
327 \sa LocateIndexes(), Open(), SetIndex(), BamReader::OpenIndex()
// Thin wrapper: forwards the index filename list to d->OpenIndexes() and returns its bool result.
329 bool BamMultiReader::OpenIndexes(const std::vector<std::string>& indexFilenames) {
330 return d->OpenIndexes(indexFilenames);
333 /*! \fn bool BamMultiReader::Rewind(void)
334 \brief Returns the internal file pointers to the beginning of alignment records.
336 Useful for performing multiple sequential passes through BAM files.
337 Calling this function clears any prior region that may have been set.
339 \returns \c true if rewind operation was successful
340 \sa Jump(), SetRegion(), BamReader::Rewind()
// Takes no arguments and is declared to return bool.
// NOTE(review): the body is not visible in this listing (embedded numbering skips 343-345);
// presumably it forwards to d->Rewind() like the sibling wrappers - confirm.
342 bool BamMultiReader::Rewind(void) {
346 /*! \fn void BamMultiReader::SetExplicitMergeOrder(BamMultiReader::MergeOrder order)
347 \brief Sets an explicit merge order, regardless of the BAM files' SO header tag.
349 The default behavior of the BamMultiReader is to check the SO tag in the BAM files'
350 SAM header text to determine the merge strategy. The merge strategy is used to
351 determine from which BAM file the next alignment should come when either
352 GetNextAlignment() or GetNextAlignmentCore() are called. If files share a
353 'coordinate' or 'queryname' value for this tag, then the merge strategy is
354 selected accordingly. If any of them do not match, or if any file is marked as
355 'unsorted', then the merge strategy is simply a round-robin.
357 This method allows client code to explicitly override the lookup behavior. This
358 method can be useful when you know, for example, that your BAM files are sorted
359 by coordinate but upstream processes did not set the header tag properly.
361 \note This method should \b not be called while reading alignments via
362 GetNextAlignment() or GetNextAlignmentCore(). For proper results, you should
363 call this method before (or immediately after) opening files, rewinding,
364 jumping, etc. but \b not once alignment fetching has started. There is
365 nothing in the API to prevent you from doing so, but the results may be
368 \sa BamMultiReader::MergeOrder, GetMergeOrder(), GetNextAlignment(), GetNextAlignmentCore()
// Thin wrapper: hands the requested `order` to d->SetExplicitMergeOrder().
// Nothing is returned to the caller (void).
370 void BamMultiReader::SetExplicitMergeOrder(BamMultiReader::MergeOrder order) {
371 d->SetExplicitMergeOrder(order);
374 /*! \fn bool BamMultiReader::SetRegion(const BamRegion& region)
375 \brief Sets a target region of interest
377 Equivalent to calling BamReader::SetRegion() on all open BAM files.
379 \warning BamRegion now represents a zero-based, HALF-OPEN interval.
380 In previous versions of BamTools (0.x & 1.x) all intervals were treated
381 as zero-based, CLOSED.
383 \param[in] region desired region-of-interest to activate
384 \returns \c true if ALL readers set the region successfully
385 \sa HasIndexes(), Jump(), BamReader::SetRegion()
// Thin wrapper: forwards the caller's BamRegion to d->SetRegion() and returns its bool result.
387 bool BamMultiReader::SetRegion(const BamRegion& region) {
388 return d->SetRegion(region);
391 /*! \fn bool BamMultiReader::SetRegion(const int& leftRefID,
392 const int& leftPosition,
393 const int& rightRefID,
394 const int& rightPosition)
395 \brief Sets a target region of interest
397 This is an overloaded function. Equivalent to calling BamReader::SetRegion() on all open BAM files.
399 \warning This function now expects a zero-based, HALF-OPEN interval.
400 In previous versions of BamTools (0.x & 1.x) all intervals were treated
401 as zero-based, CLOSED.
403 \param[in] leftRefID reference ID of region's left boundary
404 \param[in] leftPosition position of region's left boundary
405 \param[in] rightRefID reference ID of region's right boundary
406 \param[in] rightPosition position of region's right boundary
408 \returns \c true if ALL readers set the region successfully
409 \sa HasIndexes(), Jump(), BamReader::SetRegion()
// Convenience overload: packs the four boundary values into a temporary BamRegion
// (argument order: leftRefID, leftPosition, rightRefID, rightPosition) and forwards it
// to d->SetRegion(), returning that call's bool result.
// NOTE(review): the opening brace line is not visible in this listing (embedded numbering skips 415).
411 bool BamMultiReader::SetRegion(const int& leftRefID,
412 const int& leftPosition,
413 const int& rightRefID,
414 const int& rightPosition)
416 return d->SetRegion( BamRegion(leftRefID, leftPosition, rightRefID, rightPosition) );