1 // ***************************************************************************
2 // BamMultiReader.cpp (c) 2010 Erik Garrison, Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 14 January 2013 (DB)
6 // ---------------------------------------------------------------------------
7 // Convenience class for reading multiple BAM files.
9 // This functionality allows applications to work on very large sets of files
10 // without requiring intermediate merge, sort, and index steps for each file
11 // subset. It also improves the performance of our merge system as it
12 // precludes the need to sort merged files.
13 // ***************************************************************************
15 #include "api/BamMultiReader.h"
16 #include "api/internal/bam/BamMultiReader_p.h"
17 using namespace BamTools;
23 /*! \class BamTools::BamMultiReader
24 \brief Convenience class for reading multiple BAM files.
26 /*! \enum BamMultiReader::MergeOrder
27 \brief Used to describe the merge strategy of the BamMultiReader.
29 The merge strategy determines which alignment is 'next' from across
32 /*! \var BamMultiReader::MergeOrder BamMultiReader::RoundRobinMerge
33 \brief Merge strategy when BAM files are unsorted, or their sorted status is either unknown or ignored
35 /*! \var BamMultiReader::MergeOrder BamMultiReader::MergeByCoordinate
36 \brief Merge strategy when BAM files are sorted by position ('coordinate')
38 /*! \var BamMultiReader::MergeOrder BamMultiReader::MergeByName
39 \brief Merge strategy when BAM files are sorted by read name ('queryname')
42 /*! \fn BamMultiReader::BamMultiReader(void)
45 BamMultiReader::BamMultiReader(void)
    // Allocate the private implementation object (Internal::BamMultiReaderPrivate) and keep it in d.
46 : d(new Internal::BamMultiReaderPrivate)
49 /*! \fn BamMultiReader::~BamMultiReader(void)
52 BamMultiReader::~BamMultiReader(void) {
57 /*! \fn bool BamMultiReader::Close(void)
58 \brief Closes all open BAM files.
60 Also clears out all header and reference data.
62 \sa CloseFile(), HasOpenReaders(), Open(), BamReader::Close()
64 bool BamMultiReader::Close(void) {
68 /*! \fn bool BamMultiReader::CloseFile(const std::string& filename)
69 \brief Closes requested BAM file.
71 Leaves any other file(s) open, along with header and reference data.
73 \param[in] filename name of specific BAM file to close
75 \sa Close(), HasOpenReaders(), Open(), BamReader::Close()
77 bool BamMultiReader::CloseFile(const std::string& filename) {
    // Delegate to the private implementation (d) and hand its status back to the caller.
78 return d->CloseFile(filename);
81 /*! \fn bool BamMultiReader::CreateIndexes(const BamIndex::IndexType& type)
82 \brief Creates index files for the current BAM files.
84 \param[in] type file format to create, see BamIndex::IndexType for available formats
85 \return \c true if index files created OK
86 \sa LocateIndexes(), OpenIndexes(), BamReader::CreateIndex()
88 bool BamMultiReader::CreateIndexes(const BamIndex::IndexType& type) {
    // Pass the requested index type through to the private implementation and return its result.
89 return d->CreateIndexes(type);
92 /*! \fn const std::vector<std::string> BamMultiReader::Filenames(void) const
93 \brief Returns list of filenames for all open BAM files.
95 Retrieved filenames will contain whatever was passed via Open().
96 If you need full directory paths here, be sure to include them
97 when you open the BAM files.
99 \returns names of open BAM files. If no files are open, returns an empty vector.
100 \sa HasOpenReaders(), BamReader::GetFilename()
102 const std::vector<std::string> BamMultiReader::Filenames(void) const {
    // Forward to the private implementation; the list is returned by value.
103 return d->Filenames();
106 /*! \fn std::string BamMultiReader::GetErrorString(void) const
107 \brief Returns a human-readable description of the last error that occurred
109 This method allows elimination of STDERR pollution. Developers of client code
110 may choose how the messages are displayed to the user, if at all.
112 \return error description
114 std::string BamMultiReader::GetErrorString(void) const {
    // The error text comes from the private implementation; this wrapper returns it unchanged.
115 return d->GetErrorString();
118 /*! \fn SamHeader BamMultiReader::GetHeader(void) const
119 \brief Returns unified SAM-format header for all files
121 \note Modifying the retrieved text does NOT affect the current
122 BAM files. These files have been opened in a read-only mode. However,
123 your modified header text can be used in conjunction with BamWriter
124 to generate a new BAM file with the appropriate header information.
126 \returns header data wrapped in SamHeader object
127 \sa GetHeaderText(), BamReader::GetHeader()
129 SamHeader BamMultiReader::GetHeader(void) const {
    // Forward to the private implementation; the SamHeader is returned by value.
130 return d->GetHeader();
133 /*! \fn std::string BamMultiReader::GetHeaderText(void) const
134 \brief Returns unified SAM-format header text for all files
136 \note Modifying the retrieved text does NOT affect the current
137 BAM files. These files have been opened in a read-only mode. However,
138 your modified header text can be used in conjunction with BamWriter
139 to generate a new BAM file with the appropriate header information.
141 \returns SAM-formatted header text
142 \sa GetHeader(), BamReader::GetHeaderText()
144 std::string BamMultiReader::GetHeaderText(void) const {
    // Forward to the private implementation; the header text is returned by value.
145 return d->GetHeaderText();
148 /*! \fn BamMultiReader::MergeOrder BamMultiReader::GetMergeOrder(void) const
149 \brief Returns current merge order strategy.
151 \returns current merge order enum value
152 \sa BamMultiReader::MergeOrder, SetExplicitMergeOrder()
154 BamMultiReader::MergeOrder BamMultiReader::GetMergeOrder(void) const {
    // Return whatever merge order the private implementation reports.
155 return d->GetMergeOrder();
158 /*! \fn bool BamMultiReader::GetNextAlignment(BamAlignment& alignment)
159 \brief Retrieves next available alignment.
161 Equivalent to BamReader::GetNextAlignment() with respect to what is a valid
162 overlapping alignment and what data gets populated.
164 This method takes care of determining which alignment actually is 'next'
165 across multiple files, depending on their sort order.
167 \param[out] alignment destination for alignment record data
168 \returns \c true if a valid alignment was found
169 \sa GetNextAlignmentCore(), SetExplicitMergeOrder(), SetRegion(), BamReader::GetNextAlignment()
171 bool BamMultiReader::GetNextAlignment(BamAlignment& nextAlignment) {
    // Pass the caller's alignment object through to the private implementation and return its result.
172 return d->GetNextAlignment(nextAlignment);
175 /*! \fn bool BamMultiReader::GetNextAlignmentCore(BamAlignment& alignment)
176 \brief Retrieves next available alignment.
178 Equivalent to BamReader::GetNextAlignmentCore() with respect to what is a valid
179 overlapping alignment and what data gets populated.
181 This method takes care of determining which alignment actually is 'next'
182 across multiple files, depending on their sort order.
184 \param[out] alignment destination for alignment record data
185 \returns \c true if a valid alignment was found
186 \sa GetNextAlignment(), SetExplicitMergeOrder(), SetRegion(), BamReader::GetNextAlignmentCore()
188 bool BamMultiReader::GetNextAlignmentCore(BamAlignment& nextAlignment) {
    // Same forwarding pattern as GetNextAlignment(), but calls the private implementation's "Core" variant.
189 return d->GetNextAlignmentCore(nextAlignment);
192 /*! \fn int BamMultiReader::GetReferenceCount(void) const
193 \brief Returns number of reference sequences.
194 \sa BamReader::GetReferenceCount()
196 int BamMultiReader::GetReferenceCount(void) const {
    // Forward to the private implementation and return its count.
197 return d->GetReferenceCount();
200 /*! \fn const RefVector BamMultiReader::GetReferenceData(void) const
201 \brief Returns all reference sequence entries.
202 \sa RefData, BamReader::GetReferenceData()
204 const BamTools::RefVector BamMultiReader::GetReferenceData(void) const {
    // Forward to the private implementation. Note: this signature returns the RefVector by value, not by reference.
205 return d->GetReferenceData();
208 /*! \fn int BamMultiReader::GetReferenceID(const std::string& refName) const
209 \brief Returns the ID of the reference with this name.
211 If \a refName is not found, returns -1.
213 \param[in] refName name of reference to look up
214 \sa BamReader::GetReferenceID()
216 int BamMultiReader::GetReferenceID(const std::string& refName) const {
    // The name lookup is done by the private implementation; its result is returned unchanged.
217 return d->GetReferenceID(refName);
220 /*! \fn bool BamMultiReader::HasIndexes(void) const
221 \brief Returns \c true if all BAM files have index data available.
222 \sa BamReader::HasIndex()
224 bool BamMultiReader::HasIndexes(void) const {
    // Forward the query to the private implementation.
225 return d->HasIndexes();
228 /*! \fn bool BamMultiReader::HasOpenReaders(void) const
229 \brief Returns \c true if there are any open BAM files.
231 bool BamMultiReader::HasOpenReaders(void) const {
    // Forward the query to the private implementation.
232 return d->HasOpenReaders();
235 /*! \fn bool BamMultiReader::Jump(int refID, int position)
236 \brief Performs a random-access jump within current BAM files.
238 This is a convenience method, equivalent to calling SetRegion()
239 with only a left boundary specified.
241 \param[in] refID ID of reference to jump to
242 \param[in] position (0-based) left boundary
244 \returns \c true if jump was successful
245 \sa HasIndexes(), BamReader::Jump()
248 bool BamMultiReader::Jump(int refID, int position) {
    // Pass both arguments through unchanged to the private implementation.
249 return d->Jump(refID, position);
252 /*! \fn bool BamMultiReader::LocateIndexes(const BamIndex::IndexType& preferredType)
253 \brief Looks for index files that match current BAM files.
255 Use this function when you need index files, and perhaps have a
256 preferred index format, but do not depend heavily on which indexes
257 actually get loaded at runtime.
259 For each BAM file, this function will defer to your \a preferredType
260 whenever possible. However, if an index file of \a preferredType can
261 not be found, then it will look for any other index file that matches
264 An example case would look like this:
266 BamMultiReader reader;
270 // ensure that all files have an index
271 if ( !reader.LocateIndexes() ) // opens any existing index files that match our BAM files
272 reader.CreateIndexes(); // creates index files for any BAM files that still lack one
274 // do interesting stuff using random-access...
278 If you want precise control over which index files are loaded, use OpenIndexes()
279 with the desired index filenames. If that function returns false, you can use
280 CreateIndexes() to then build index files of the exact requested format.
282 \param[in] preferredType desired index file format, see BamIndex::IndexType for available formats
283 \returns \c true if index files could be found for \b ALL open BAM files
284 \sa BamReader::LocateIndex()
286 bool BamMultiReader::LocateIndexes(const BamIndex::IndexType& preferredType) {
    // Hand the preferred index type to the private implementation and return its result.
287 return d->LocateIndexes(preferredType);
290 /*! \fn bool BamMultiReader::Open(const std::vector<std::string>& filenames)
291 \brief Opens BAM files.
293 \note Opening BAM files will invalidate any current region set on the multireader.
294 All file pointers will be returned to the beginning of the alignment data. Follow
295 this with Jump() or SetRegion() to establish a region of interest.
297 \param[in] filenames list of BAM filenames to open
298 \returns \c true if BAM files were opened successfully
299 \sa Close(), HasOpenReaders(), OpenFile(), OpenIndexes(), BamReader::Open()
301 bool BamMultiReader::Open(const std::vector<std::string>& filenames) {
    // Pass the whole filename list to the private implementation in a single call.
302 return d->Open(filenames);
305 /*! \fn bool BamMultiReader::OpenFile(const std::string& filename)
306 \brief Opens a single BAM file.
308 Adds another BAM file to multireader "on-the-fly".
310 \note Opening a BAM file will invalidate any current region set on the multireader.
311 All file pointers will be returned to the beginning of the alignment data. Follow
312 this with Jump() or SetRegion() to establish a region of interest.
314 \param[in] filename BAM filename to open
315 \returns \c true if BAM file was opened successfully
316 \sa Close(), HasOpenReaders(), Open(), OpenIndexes(), BamReader::Open()
318 bool BamMultiReader::OpenFile(const std::string& filename) {
    // Single-file counterpart of Open(); forwards to the private implementation's OpenFile().
319 return d->OpenFile(filename);
322 /*! \fn bool BamMultiReader::OpenIndexes(const std::vector<std::string>& indexFilenames)
323 \brief Opens index files for current BAM files.
325 \note Currently assumes that index filenames match the order (and number) of
326 BAM files passed to Open().
328 \param[in] indexFilenames list of BAM index file names
329 \returns \c true if BAM index file was opened & data loaded successfully
330 \sa LocateIndexes(), Open(), SetIndex(), BamReader::OpenIndex()
332 bool BamMultiReader::OpenIndexes(const std::vector<std::string>& indexFilenames) {
    // Pass the index filename list to the private implementation and return its result.
333 return d->OpenIndexes(indexFilenames);
336 /*! \fn bool BamMultiReader::Rewind(void)
337 \brief Returns the internal file pointers to the beginning of alignment records.
339 Useful for performing multiple sequential passes through BAM files.
340 Calling this function clears any prior region that may have been set.
342 \returns \c true if rewind operation was successful
343 \sa Jump(), SetRegion(), BamReader::Rewind()
345 bool BamMultiReader::Rewind(void) {
349 /*! \fn bool BamMultiReader::SetExplicitMergeOrder(BamMultiReader::MergeOrder order)
350 \brief Sets an explicit merge order, regardless of the BAM files' SO header tag.
352 The default behavior of the BamMultiReader is to check the SO tag in the BAM files'
353 SAM header text to determine the merge strategy. The merge strategy is used to
354 determine from which BAM file the next alignment should come when either
355 GetNextAlignment() or GetNextAlignmentCore() is called. If files share a
356 'coordinate' or 'queryname' value for this tag, then the merge strategy is
357 selected accordingly. If any of them do not match, or if any file is marked as
358 'unsorted', then the merge strategy is simply a round-robin.
360 This method allows client code to explicitly override the lookup behavior. This
361 method can be useful when you know, for example, that your BAM files are sorted
362 by coordinate but upstream processes did not set the header tag properly.
364 \note This method should \b not be called while reading alignments via
365 GetNextAlignment() or GetNextAlignmentCore(). For proper results, you should
366 call this method before (or immediately after) opening files, rewinding,
367 jumping, etc. but \b not once alignment fetching has started. There is
368 nothing in the API to prevent you from doing so, but the results may be
371 \returns \c true if merge order could be successfully applied
372 \sa BamMultiReader::MergeOrder, GetMergeOrder(), GetNextAlignment(), GetNextAlignmentCore()
374 bool BamMultiReader::SetExplicitMergeOrder(BamMultiReader::MergeOrder order) {
    // Hand the requested order to the private implementation and return its result.
375 return d->SetExplicitMergeOrder(order);
378 /*! \fn bool BamMultiReader::SetRegion(const BamRegion& region)
379 \brief Sets a target region of interest
381 Equivalent to calling BamReader::SetRegion() on all open BAM files.
383 \warning BamRegion now represents a zero-based, HALF-OPEN interval.
384 In previous versions of BamTools (0.x & 1.x) all intervals were treated
385 as zero-based, CLOSED.
387 \param[in] region desired region-of-interest to activate
388 \returns \c true if ALL readers set the region successfully
389 \sa HasIndexes(), Jump(), BamReader::SetRegion()
391 bool BamMultiReader::SetRegion(const BamRegion& region) {
    // Hand the region to the private implementation and return its result.
392 return d->SetRegion(region);
395 /*! \fn bool BamMultiReader::SetRegion(const int& leftRefID,
396 const int& leftPosition,
397 const int& rightRefID,
398 const int& rightPosition)
399 \brief Sets a target region of interest
401 This is an overloaded function. Equivalent to calling BamReader::SetRegion() on all open BAM files.
403 \warning This function now expects a zero-based, HALF-OPEN interval.
404 In previous versions of BamTools (0.x & 1.x) all intervals were treated
405 as zero-based, CLOSED.
407 \param[in] leftRefID reference ID of region's left boundary
408 \param[in] leftPosition position of region's left boundary
409 \param[in] rightRefID reference ID of region's right boundary
410 \param[in] rightPosition position of region's right boundary
412 \returns \c true if ALL readers set the region successfully
413 \sa HasIndexes(), Jump(), BamReader::SetRegion()
415 bool BamMultiReader::SetRegion(const int& leftRefID,
416 const int& leftPosition,
417 const int& rightRefID,
418 const int& rightPosition)
    // Wrap the four boundary values in a BamRegion and pass it to the private implementation's SetRegion().
420 return d->SetRegion( BamRegion(leftRefID, leftPosition, rightRefID, rightPosition) );