From 2126ee0d204be8293df9492b48bce076a41a2a25 Mon Sep 17 00:00:00 2001 From: Don Armstrong Date: Wed, 2 Apr 2014 20:16:08 -0700 Subject: [PATCH] merge master 2.3.0 --- CMakeLists.txt | 2 +- docs/Doxyfile | 2 +- src/api/BamAlignment.cpp | 107 +++++- src/api/BamAlignment.h | 12 +- src/api/BamMultiReader.cpp | 60 +++- src/api/BamMultiReader.h | 14 +- src/api/BamReader.cpp | 27 +- src/api/BamReader.h | 6 +- src/api/CMakeLists.txt | 14 +- src/api/SamHeader.cpp | 1 + src/api/internal/bam/BamHeader_p.cpp | 7 +- src/api/internal/bam/BamHeader_p.h | 4 +- src/api/internal/bam/BamMultiReader_p.cpp | 107 +++++- src/api/internal/bam/BamMultiReader_p.h | 10 +- src/api/internal/bam/BamReader_p.cpp | 6 +- src/api/internal/bam/BamReader_p.h | 3 +- src/api/internal/bam/BamWriter_p.cpp | 4 +- src/api/internal/index/CMakeLists.txt | 4 +- src/api/internal/io/BamHttp_p.cpp | 394 ++++++++++++++-------- src/api/internal/io/BamHttp_p.h | 8 +- src/api/internal/io/CMakeLists.txt | 22 +- src/api/internal/io/HttpHeader_p.h | 2 +- src/api/internal/io/RollingBuffer_p.cpp | 6 +- src/api/internal/io/TcpSocket_p.cpp | 6 +- src/api/internal/io/TcpSocket_p.h | 6 +- src/api/internal/sam/CMakeLists.txt | 4 +- src/api/internal/utils/CMakeLists.txt | 4 +- src/third_party/jsoncpp/CMakeLists.txt | 4 +- src/toolkit/CMakeLists.txt | 2 +- src/toolkit/bamtools_convert.cpp | 27 +- src/toolkit/bamtools_count.cpp | 32 +- src/toolkit/bamtools_coverage.cpp | 3 +- src/toolkit/bamtools_filter.cpp | 91 +++-- src/toolkit/bamtools_header.cpp | 31 +- src/toolkit/bamtools_merge.cpp | 38 ++- src/toolkit/bamtools_random.cpp | 46 ++- src/toolkit/bamtools_resolve.cpp | 20 +- src/toolkit/bamtools_split.cpp | 23 +- src/toolkit/bamtools_stats.cpp | 31 +- src/utils/CMakeLists.txt | 4 +- src/utils/bamtools_filter_engine.h | 5 +- 41 files changed, 915 insertions(+), 284 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 944e01d..9b97fa0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,7 +31,7 @@ 
ensure_out_of_source_build( " # set BamTools version information set( BamTools_VERSION_MAJOR 2 ) -set( BamTools_VERSION_MINOR 2 ) +set( BamTools_VERSION_MINOR 3 ) set( BamTools_VERSION_BUILD 0 ) # set our library and executable destination dirs diff --git a/docs/Doxyfile b/docs/Doxyfile index caf5f2c..410ea27 100644 --- a/docs/Doxyfile +++ b/docs/Doxyfile @@ -31,7 +31,7 @@ PROJECT_NAME = BamTools # This could be handy for archiving the generated documentation or # if some version control system is used. -PROJECT_NUMBER = 2.1.1 +PROJECT_NUMBER = 2.3.0 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) # base path where the generated documentation will be put. diff --git a/src/api/BamAlignment.cpp b/src/api/BamAlignment.cpp index 251c5e0..620ba2e 100644 --- a/src/api/BamAlignment.cpp +++ b/src/api/BamAlignment.cpp @@ -2,7 +2,7 @@ // BamAlignment.cpp (c) 2009 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 4 April 2012 (DB) +// Last modified: 4 December 2012 (DB) // --------------------------------------------------------------------------- // Provides the BamAlignment data structure // *************************************************************************** @@ -394,6 +394,72 @@ bool BamAlignment::FindTag(const std::string& tag, return false; } +/*! \fn bool BamAlignment::GetArrayTagType(const std::string& tag, char& type) const + \brief Retrieves the BAM tag type-code for the array elements associated with requested tag name. + + \param[in] tag 2-character tag name + \param[out] type retrieved (1-character) type-code + + \return \c true if found. False if not found, or if tag is not an array type. + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. 
+*/ +bool BamAlignment::GetArrayTagType(const std::string& tag, char& type) const { + + // skip if alignment is core-only + if ( SupportData.HasCoreOnly ) { + // TODO: set error string? + return false; + } + + // skip if no tags present + if ( TagData.empty() ) { + // TODO: set error string? + return false; + } + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if tag not found, return failure + if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) ){ + // TODO: set error string? + return false; + } + + // check that tag type code is array + type = *(pTagData - 1); + if ( type != Constants::BAM_TAG_TYPE_ARRAY ) { + // TODO: set error string + return false; + } + + // fetch element type + const char elementType = *pTagData; + switch ( elementType ) { + + // allowable types + case (Constants::BAM_TAG_TYPE_INT8) : + case (Constants::BAM_TAG_TYPE_UINT8) : + case (Constants::BAM_TAG_TYPE_INT16) : + case (Constants::BAM_TAG_TYPE_UINT16) : + case (Constants::BAM_TAG_TYPE_INT32) : + case (Constants::BAM_TAG_TYPE_UINT32) : + case (Constants::BAM_TAG_TYPE_FLOAT) : + type = elementType; + break; + + default: + //TODO: set error string + return false; + } + + // if we get here, return success + return true; +} + + /*! \fn int BamAlignment::GetEndPosition(bool usePadded = false, bool closedInterval = false) const \brief Calculates alignment end position, based on its starting position and CIGAR data. @@ -551,6 +617,45 @@ bool BamAlignment::GetSoftClips(vector& clipSizes, return softClipFound; } +/*! \fn std::vector BamAlignment::GetTagNames(void) const + \brief Retrieves the BAM tag names. + + When paired with GetTagType() and GetTag(), this method allows you + to iterate over an alignment's tag data without knowing the names (or types) + beforehand. 
+ + \return \c vector containing all tag names found (empty if none available) + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. +*/ +std::vector BamAlignment::GetTagNames(void) const { + + std::vector result; + if ( SupportData.HasCoreOnly || TagData.empty() ) + return result; + + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + while ( numBytesParsed < tagDataLength ) { + + // get current tag name & type + const char* pTagName = pTagData; + const char* pTagType = pTagData + 2; + pTagData += 3; + numBytesParsed +=3; + + // store tag name + result.push_back( std::string(pTagName, 2) ); + + // find the next tag + if ( *pTagType == '\0' ) break; + if ( !SkipToNextTag(*pTagType, pTagData, numBytesParsed) ) break; + if ( *pTagData == '\0' ) break; + } + + return result; +} + /*! \fn bool BamAlignment::GetTagType(const std::string& tag, char& type) const \brief Retrieves the BAM tag type-code associated with requested tag name. 
diff --git a/src/api/BamAlignment.h b/src/api/BamAlignment.h index a2349ea..0f4fe73 100644 --- a/src/api/BamAlignment.h +++ b/src/api/BamAlignment.h @@ -2,7 +2,7 @@ // BamAlignment.h (c) 2009 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 16 October 2011 (DB) +// Last modified: 25 July 2013 (DB) // --------------------------------------------------------------------------- // Provides the BamAlignment data structure // *************************************************************************** @@ -80,9 +80,15 @@ struct API_EXPORT BamAlignment { template bool GetTag(const std::string& tag, T& destination) const; template bool GetTag(const std::string& tag, std::vector& destination) const; + // retrieves all current tag names + std::vector GetTagNames(void) const; + // retrieves the SAM/BAM type-code for requested tag name bool GetTagType(const std::string& tag, char& type) const; + // retrieves the SAM/BAM type-code for the data elements in an array tag + bool GetArrayTagType(const std::string& tag, char& type) const; + // returns true if alignment has a record for this tag name bool HasTag(const std::string& tag) const; @@ -110,8 +116,8 @@ struct API_EXPORT BamAlignment { public: std::string Name; // read name int32_t Length; // length of query sequence - std::string QueryBases; // 'original' sequence (as reported from sequencing machine) - std::string AlignedBases; // 'aligned' sequence (includes any indels, padding, clipping) + std::string QueryBases; // 'original' sequence (contained in BAM file) + std::string AlignedBases; // 'aligned' sequence (QueryBases plus deletion, padding, clipping chars) std::string Qualities; // FASTQ qualities (ASCII characters, not numeric values) std::string TagData; // tag data (use provided methods to query/modify) int32_t RefID; // ID number for reference sequence diff --git a/src/api/BamMultiReader.cpp 
b/src/api/BamMultiReader.cpp index f61aa26..5c2a065 100644 --- a/src/api/BamMultiReader.cpp +++ b/src/api/BamMultiReader.cpp @@ -2,7 +2,7 @@ // BamMultiReader.cpp (c) 2010 Erik Garrison, Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 25 October 2011 (DB) +// Last modified: 14 January 2013 (DB) // --------------------------------------------------------------------------- // Convenience class for reading multiple BAM files. // @@ -23,6 +23,21 @@ using namespace std; /*! \class BamTools::BamMultiReader \brief Convenience class for reading multiple BAM files. */ +/*! \enum BamMultiReader::MergeOrder + \brief Used to describe the merge strategy of the BamMultiReader. + + The merge strategy determines which alignment is 'next' from across + all opened BAM files. +*/ +/*! \var BamMultiReader::MergeOrder BamMultiReader::RoundRobinMerge + \brief Merge strategy when BAM files are unsorted, or their sorted status is either unknown or ignored +*/ +/*! \var BamMultiReader::MergeOrder BamMultiReader::MergeByCoordinate + \brief Merge strategy when BAM files are sorted by position ('coordinate') +*/ +/*! \var BamMultiReader::MergeOrder BamMultiReader::MergeByName + \brief Merge strategy when BAM files are sorted by read name ('queryname') +*/ /*! \fn BamMultiReader::BamMultiReader(void) \brief constructor @@ -130,6 +145,16 @@ std::string BamMultiReader::GetHeaderText(void) const { return d->GetHeaderText(); } +/*! \fn BamMultiReader::MergeOrder BamMultiReader::GetMergeOrder(void) const + \brief Returns current merge order strategy. + + \returns current merge order enum value + \sa BamMultiReader::MergeOrder, SetExplicitMergeOrder() +*/ +BamMultiReader::MergeOrder BamMultiReader::GetMergeOrder(void) const { + return d->GetMergeOrder(); +} + /*! \fn bool BamMultiReader::GetNextAlignment(BamAlignment& alignment) \brief Retrieves next available alignment. 
@@ -141,7 +166,7 @@ std::string BamMultiReader::GetHeaderText(void) const { \param[out] alignment destination for alignment record data \returns \c true if a valid alignment was found - \sa GetNextAlignmentCore(), SetRegion(), BamReader::GetNextAlignment() + \sa GetNextAlignmentCore(), SetExplicitMergeOrder(), SetRegion(), BamReader::GetNextAlignment() */ bool BamMultiReader::GetNextAlignment(BamAlignment& nextAlignment) { return d->GetNextAlignment(nextAlignment); @@ -158,7 +183,7 @@ bool BamMultiReader::GetNextAlignment(BamAlignment& nextAlignment) { \param[out] alignment destination for alignment record data \returns \c true if a valid alignment was found - \sa GetNextAlignment(), SetRegion(), BamReader::GetNextAlignmentCore() + \sa GetNextAlignment(), SetExplicitMergeOrder(), SetRegion(), BamReader::GetNextAlignmentCore() */ bool BamMultiReader::GetNextAlignmentCore(BamAlignment& nextAlignment) { return d->GetNextAlignmentCore(nextAlignment); @@ -321,6 +346,35 @@ bool BamMultiReader::Rewind(void) { return d->Rewind(); } +/*! \fn bool BamMultiReader::SetExplicitMergeOrder(BamMultiReader::MergeOrder order) + \brief Sets an explicit merge order, regardless of the BAM files' SO header tag. + + The default behavior of the BamMultiReader is to check the SO tag in the BAM files' + SAM header text to determine the merge strategy. The merge strategy is used to + determine from which BAM file the next alignment should come when either + GetNextAlignment() or GetNextAlignmentCore() are called. If files share a + 'coordinate' or 'queryname' value for this tag, then the merge strategy is + selected accordingly. If any of them do not match, or if any file is marked as + 'unsorted', then the merge strategy is simply a round-robin. + + This method allows client code to explicitly override the lookup behavior. This + method can be useful when you know, for example, that your BAM files are sorted + by coordinate but upstream processes did not set the header tag properly. 
+ + \note This method should \bold not be called while reading alignments via + GetNextAlignment() or GetNextAlignmentCore(). For proper results, you should + call this method before (or immediately after) opening files, rewinding, + jumping, etc. but \bold not once alignment fetching has started. There is + nothing in the API to prevent you from doing so, but the results may be + unexpected. + + \returns \c true if merge order could be successfully applied + \sa BamMultiReader::MergeOrder, GetMergeOrder(), GetNextAlignment(), GetNextAlignmentCore() +*/ +bool BamMultiReader::SetExplicitMergeOrder(BamMultiReader::MergeOrder order) { + return d->SetExplicitMergeOrder(order); +} + /*! \fn bool BamMultiReader::SetRegion(const BamRegion& region) \brief Sets a target region of interest diff --git a/src/api/BamMultiReader.h b/src/api/BamMultiReader.h index e5fc9c9..4f8c133 100644 --- a/src/api/BamMultiReader.h +++ b/src/api/BamMultiReader.h @@ -2,7 +2,7 @@ // BamMultiReader.h (c) 2010 Erik Garrison, Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 25 October 2011 (DB) +// Last modified: 14 January 2013 (DB) // --------------------------------------------------------------------------- // Convenience class for reading multiple BAM files. 
// *************************************************************************** @@ -25,6 +25,14 @@ namespace Internal { class API_EXPORT BamMultiReader { + // enums + public: + // possible merge order strategies + enum MergeOrder { RoundRobinMerge = 0 + , MergeByCoordinate + , MergeByName + }; + // constructor / destructor public: BamMultiReader(void); @@ -43,6 +51,8 @@ class API_EXPORT BamMultiReader { bool CloseFile(const std::string& filename); // returns list of filenames for all open BAM files const std::vector Filenames(void) const; + // returns current merge order strategy + BamMultiReader::MergeOrder GetMergeOrder(void) const; // returns true if multireader has any open BAM files bool HasOpenReaders(void) const; // performs random-access jump within current BAM files @@ -53,6 +63,8 @@ class API_EXPORT BamMultiReader { bool OpenFile(const std::string& filename); // returns file pointers to beginning of alignments bool Rewind(void); + // sets an explicit merge order, regardless of the BAM files' SO header tag + bool SetExplicitMergeOrder(BamMultiReader::MergeOrder order); // sets the target region of interest bool SetRegion(const BamRegion& region); // sets the target region of interest diff --git a/src/api/BamReader.cpp b/src/api/BamReader.cpp index ae2adec..c4f0432 100644 --- a/src/api/BamReader.cpp +++ b/src/api/BamReader.cpp @@ -2,7 +2,7 @@ // BamReader.cpp (c) 2009 Derek Barnett, Michael Strömberg // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 25 October 2011 (DB) +// Last modified: 29 July 2013 (DB) // --------------------------------------------------------------------------- // Provides read access to BAM files. // *************************************************************************** @@ -61,6 +61,22 @@ bool BamReader::CreateIndex(const BamIndex::IndexType& type) { return d->CreateIndex(type); } +/*! 
\fn const SamHeader& BamReader::GetConstSamHeader(void) const + \brief Returns const reference to SAM header data. + + Allows for read-only queries of SAM header data. + + If you do not need to modify the SAM header, use this method to avoid the + potentially expensive copy used by GetHeader(). + + \note + \returns const reference to header data object + \sa GetHeader(), GetHeaderText() +*/ +const SamHeader& BamReader::GetConstSamHeader(void) const { + return d->GetConstSamHeader(); +} + /*! \fn std::string BamReader::GetErrorString(void) const \brief Returns a human-readable description of the last error that occurred @@ -90,7 +106,8 @@ const std::string BamReader::GetFilename(void) const { /*! \fn SamHeader BamReader::GetHeader(void) const \brief Returns SAM header data. - Header data is wrapped in a SamHeader object that can be conveniently queried & modified. + Header data is wrapped in a SamHeader object that can be conveniently queried and/or modified. + If you only need read access, consider using GetConstSamHeader() instead. \note Modifying the retrieved SamHeader object does NOT affect the current BAM file. This file has been opened in a read-only mode. @@ -98,7 +115,7 @@ const std::string BamReader::GetFilename(void) const { BamWriter to generate a new BAM file with the appropriate header information. \returns header data object - \sa GetHeaderText() + \sa GetConstSamHeader(), GetHeaderText() */ SamHeader BamReader::GetHeader(void) const { return d->GetSamHeader(); @@ -154,8 +171,8 @@ bool BamReader::GetNextAlignment(BamAlignment& alignment) { However, this method does NOT populate the alignment's string data fields (read name, bases, qualities, tags, filename). This provides a boost in speed - when these fields are not required for every alignment. These fields can be - populated 'lazily' (as needed) by calling BamAlignment::BuildCharData() later. + when these fields are not required for every alignment. 
These fields, excluding filename, + can be populated 'lazily' (as needed) by calling BamAlignment::BuildCharData() later. \param[out] alignment destination for alignment record data \returns \c true if a valid alignment was found diff --git a/src/api/BamReader.h b/src/api/BamReader.h index fb9064d..15b4135 100644 --- a/src/api/BamReader.h +++ b/src/api/BamReader.h @@ -2,7 +2,7 @@ // BamReader.h (c) 2009 Derek Barnett, Michael Strömberg // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 10 October 2011 (DB) +// Last modified: 18 November 2012 (DB) // --------------------------------------------------------------------------- // Provides read access to BAM files. // *************************************************************************** @@ -69,7 +69,9 @@ class API_EXPORT BamReader { // access header data // ---------------------- - // returns SAM header data + // returns a read-only reference to SAM header data + const SamHeader& GetConstSamHeader(void) const; + // returns an editable copy of SAM header data SamHeader GetHeader(void) const; // returns SAM header data, as SAM-formatted text std::string GetHeaderText(void) const; diff --git a/src/api/CMakeLists.txt b/src/api/CMakeLists.txt index 5b66ec0..7e3d3ca 100644 --- a/src/api/CMakeLists.txt +++ b/src/api/CMakeLists.txt @@ -13,7 +13,7 @@ add_definitions( -DBAMTOOLS_API_LIBRARY ) # (for proper exporting of library sym add_definitions( -fPIC ) # (attempt to force PIC compiling on CentOS, not being set on shared libs by CMake) # fetch all internal source files -add_subdirectory ( internal ) +add_subdirectory( internal ) # make list of all API source files set( BamToolsAPISources @@ -34,7 +34,7 @@ set( BamToolsAPISources # create main BamTools API shared library add_library( BamTools SHARED ${BamToolsAPISources} ) set_target_properties( BamTools PROPERTIES - SOVERSION "2.2.0" + SOVERSION "2.3.0" OUTPUT_NAME 
"bamtools" ) # create main BamTools API static library @@ -44,17 +44,17 @@ set_target_properties( BamTools-static PROPERTIES PREFIX "lib" ) # link libraries automatically with zlib (and Winsock2, if applicable) -if( _WIN32 ) +if( WIN32 ) set( APILibs z ws2_32 ) -else( _WIN32 ) +else() set( APILibs z ) -endif( _WIN32 ) +endif() -target_link_libraries( BamTools ${APILibs} ) +target_link_libraries( BamTools ${APILibs} ) target_link_libraries( BamTools-static ${APILibs} ) # set library install destinations -install( TARGETS BamTools LIBRARY DESTINATION "lib/bamtools" RUNTIME DESTINATION "bin") +install( TARGETS BamTools LIBRARY DESTINATION "lib/bamtools" RUNTIME DESTINATION "bin") install( TARGETS BamTools-static ARCHIVE DESTINATION "lib/bamtools") # export API headers diff --git a/src/api/SamHeader.cpp b/src/api/SamHeader.cpp index 967957a..9221944 100644 --- a/src/api/SamHeader.cpp +++ b/src/api/SamHeader.cpp @@ -73,6 +73,7 @@ SamHeader::SamHeader(const SamHeader& other) , ReadGroups(other.ReadGroups) , Programs(other.Programs) , Comments(other.Comments) + , m_errorString(other.GetErrorString()) { } /*! \fn SamHeader::~SamHeader(void) diff --git a/src/api/internal/bam/BamHeader_p.cpp b/src/api/internal/bam/BamHeader_p.cpp index 02c0a25..aa3cdf7 100644 --- a/src/api/internal/bam/BamHeader_p.cpp +++ b/src/api/internal/bam/BamHeader_p.cpp @@ -2,7 +2,7 @@ // BamHeader_p.cpp (c) 2010 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 25 October 2011 (DB) +// Last modified: 18 November 2012 (DB) // --------------------------------------------------------------------------- // Provides the basic functionality for handling BAM headers. 
// *************************************************************************** @@ -109,6 +109,11 @@ void BamHeader::ReadHeaderText(BgzfStream* stream, const uint32_t& length) { free(headerText); } +// returns const-reference to SamHeader data object +const SamHeader& BamHeader::ToConstSamHeader(void) const { + return m_header; +} + // returns *copy* of SamHeader data object SamHeader BamHeader::ToSamHeader(void) const { return m_header; diff --git a/src/api/internal/bam/BamHeader_p.h b/src/api/internal/bam/BamHeader_p.h index 499ad96..22851d8 100644 --- a/src/api/internal/bam/BamHeader_p.h +++ b/src/api/internal/bam/BamHeader_p.h @@ -2,7 +2,7 @@ // BamHeader_p.h (c) 2010 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 10 October 2011 (DB) +// Last modified: 18 November 2012 (DB) // --------------------------------------------------------------------------- // Provides the basic functionality for handling BAM headers. 
// *************************************************************************** @@ -44,6 +44,8 @@ class BamHeader { // load BAM header ('magic number' and SAM header text) from BGZF stream // returns true if all OK void Load(BgzfStream* stream); + // returns (read-only) reference to SamHeader data object + const SamHeader& ToConstSamHeader(void) const; // returns (editable) copy of SamHeader data object SamHeader ToSamHeader(void) const; // returns SAM-formatted string of header data diff --git a/src/api/internal/bam/BamMultiReader_p.cpp b/src/api/internal/bam/BamMultiReader_p.cpp index d3f2b15..310d837 100644 --- a/src/api/internal/bam/BamMultiReader_p.cpp +++ b/src/api/internal/bam/BamMultiReader_p.cpp @@ -2,7 +2,7 @@ // BamMultiReader_p.cpp (c) 2010 Derek Barnett, Erik Garrison // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 25 October 2011 (DB) +// Last modified: 24 July 2013 (DB) // --------------------------------------------------------------------------- // Functionality for simultaneously reading multiple BAM files // ************************************************************************* @@ -25,6 +25,8 @@ using namespace std; // ctor BamMultiReaderPrivate::BamMultiReaderPrivate(void) : m_alignmentCache(0) + , m_hasUserMergeOrder(false) + , m_mergeOrder(BamMultiReader::RoundRobinMerge) { } // dtor @@ -115,11 +117,19 @@ bool BamMultiReaderPrivate::CloseFiles(const vector& filenames) { } } - // make sure alignment cache is cleaned up if all readers closed - if ( m_readers.empty() && m_alignmentCache ) { - m_alignmentCache->Clear(); - delete m_alignmentCache; - m_alignmentCache = 0; + // make sure we clean up properly if all readers were closed + if ( m_readers.empty() ) { + + // clean up merger + if ( m_alignmentCache ) { + m_alignmentCache->Clear(); + delete m_alignmentCache; + m_alignmentCache = 0; + } + + // reset merge flags + m_hasUserMergeOrder = 
false; + m_mergeOrder = BamMultiReader::RoundRobinMerge; } // return whether all readers closed OK @@ -161,21 +171,46 @@ bool BamMultiReaderPrivate::CreateIndexes(const BamIndex::IndexType& type) { return true; } -IMultiMerger* BamMultiReaderPrivate::CreateAlignmentCache(void) const { +IMultiMerger* BamMultiReaderPrivate::CreateAlignmentCache(void) { + + // if no merge order set explicitly, use SAM header to lookup proper order + if ( !m_hasUserMergeOrder ) { + + // fetch SamHeader from BAM files + SamHeader header = GetHeader(); + + // if BAM files are sorted by position + if ( header.SortOrder == Constants::SAM_HD_SORTORDER_COORDINATE ) + m_mergeOrder = BamMultiReader::MergeByCoordinate; - // fetch SamHeader - SamHeader header = GetHeader(); + // if BAM files are sorted by read name + else if ( header.SortOrder == Constants::SAM_HD_SORTORDER_QUERYNAME ) + m_mergeOrder = BamMultiReader::MergeByName; - // if BAM files are sorted by position - if ( header.SortOrder == Constants::SAM_HD_SORTORDER_COORDINATE ) - return new MultiMerger(); + // otherwise, sorting is either "unknown" or marked as "unsorted" + else + m_mergeOrder = BamMultiReader::RoundRobinMerge; + } + + // use current merge order to create proper 'multi-merger' + switch ( m_mergeOrder ) { + + // merge BAM files by position + case BamMultiReader::MergeByCoordinate : + return new MultiMerger(); + + // merge BAM files by read name + case BamMultiReader::MergeByName : + return new MultiMerger(); - // if BAM files are sorted by read name - if ( header.SortOrder == Constants::SAM_HD_SORTORDER_QUERYNAME ) - return new MultiMerger(); + // sorting is "unknown", "unsorted" or "ignored"... 
so use unsorted merger + case BamMultiReader::RoundRobinMerge : + return new MultiMerger(); - // otherwise "unknown" or "unsorted", use unsorted merger and just read in - return new MultiMerger(); + // unknown merge order, can't create merger + default: + return 0; + } } const vector BamMultiReaderPrivate::Filenames(void) const { @@ -248,6 +283,10 @@ string BamMultiReaderPrivate::GetHeaderText(void) const { return mergedHeader.ToString(); } +BamMultiReader::MergeOrder BamMultiReaderPrivate::GetMergeOrder(void) const { + return m_mergeOrder; +} + // get next alignment among all files bool BamMultiReaderPrivate::GetNextAlignment(BamAlignment& al) { return PopNextCachedAlignment(al, true); @@ -622,6 +661,40 @@ void BamMultiReaderPrivate::SaveNextAlignment(BamReader* reader, BamAlignment* a m_alignmentCache->Add( MergeItem(reader, alignment) ); } +bool BamMultiReaderPrivate::SetExplicitMergeOrder(BamMultiReader::MergeOrder order) { + + // set new merge flags + m_hasUserMergeOrder = true; + m_mergeOrder = order; + + // remove any existing merger (storing any existing data sitting in the cache) + vector currentCacheData; + if ( m_alignmentCache ) { + while ( !m_alignmentCache->IsEmpty() ) + currentCacheData.push_back( m_alignmentCache->TakeFirst() ); + delete m_alignmentCache; + m_alignmentCache = 0; + } + + // create new cache using the new merge flags + m_alignmentCache = CreateAlignmentCache(); + if ( m_alignmentCache == 0 ) { + SetErrorString("BamMultiReader::SetExplicitMergeOrder", "requested order is unrecognized"); + return false; + } + + // push current data onto new cache + vector::const_iterator readerIter = currentCacheData.begin(); + vector::const_iterator readerEnd = currentCacheData.end(); + for ( ; readerIter != readerEnd; ++readerIter ) { + const MergeItem& item = (*readerIter); + m_alignmentCache->Add(item); + } + + // return success + return true; +} + void BamMultiReaderPrivate::SetErrorString(const string& where, const string& what) const { static 
const string SEPARATOR = ": "; m_errorString = where + SEPARATOR + what; diff --git a/src/api/internal/bam/BamMultiReader_p.h b/src/api/internal/bam/BamMultiReader_p.h index 9d7c39a..3a7a0b2 100644 --- a/src/api/internal/bam/BamMultiReader_p.h +++ b/src/api/internal/bam/BamMultiReader_p.h @@ -2,7 +2,7 @@ // BamMultiReader_p.h (c) 2010 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 25 October 2011 (DB) +// Last modified: 14 January 2013 (DB) // --------------------------------------------------------------------------- // Functionality for simultaneously reading multiple BAM files // ************************************************************************* @@ -54,9 +54,11 @@ class BamMultiReaderPrivate { bool SetRegion(const BamRegion& region); // access alignment data + BamMultiReader::MergeOrder GetMergeOrder(void) const; bool GetNextAlignment(BamAlignment& al); bool GetNextAlignmentCore(BamAlignment& al); bool HasOpenReaders(void); + bool SetExplicitMergeOrder(BamMultiReader::MergeOrder order); // access auxiliary data SamHeader GetHeader(void) const; @@ -78,7 +80,7 @@ class BamMultiReaderPrivate { public: bool CloseFiles(const std::vector& filenames); - IMultiMerger* CreateAlignmentCache(void) const; + IMultiMerger* CreateAlignmentCache(void); bool PopNextCachedAlignment(BamAlignment& al, const bool needCharData); bool RewindReaders(void); void SaveNextAlignment(BamReader* reader, BamAlignment* alignment); @@ -90,6 +92,10 @@ class BamMultiReaderPrivate { public: std::vector m_readers; IMultiMerger* m_alignmentCache; + + bool m_hasUserMergeOrder; + BamMultiReader::MergeOrder m_mergeOrder; + mutable std::string m_errorString; }; diff --git a/src/api/internal/bam/BamReader_p.cpp b/src/api/internal/bam/BamReader_p.cpp index 24e54fd..737d598 100644 --- a/src/api/internal/bam/BamReader_p.cpp +++ b/src/api/internal/bam/BamReader_p.cpp @@ -2,7 +2,7 @@ // 
BamReader_p.cpp (c) 2009 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 28 November 2011 (DB) +// Last modified: 18 November 2012 (DB) // --------------------------------------------------------------------------- // Provides the basic functionality for reading BAM files // *************************************************************************** @@ -94,6 +94,10 @@ const string BamReaderPrivate::Filename(void) const { return m_filename; } +const SamHeader& BamReaderPrivate::GetConstSamHeader(void) const { + return m_header.ToConstSamHeader(); +} + string BamReaderPrivate::GetErrorString(void) const { return m_errorString; } diff --git a/src/api/internal/bam/BamReader_p.h b/src/api/internal/bam/BamReader_p.h index e8db646..a49ad2a 100644 --- a/src/api/internal/bam/BamReader_p.h +++ b/src/api/internal/bam/BamReader_p.h @@ -2,7 +2,7 @@ // BamReader_p.h (c) 2010 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 25 October 2011 (DB) +// Last modified: 18 November 2012 (DB) // --------------------------------------------------------------------------- // Provides the basic functionality for reading BAM files // *************************************************************************** @@ -56,6 +56,7 @@ class BamReaderPrivate { // access auxiliary data std::string GetHeaderText(void) const; + const SamHeader& GetConstSamHeader(void) const; SamHeader GetSamHeader(void) const; int GetReferenceCount(void) const; const RefVector& GetReferenceData(void) const; diff --git a/src/api/internal/bam/BamWriter_p.cpp b/src/api/internal/bam/BamWriter_p.cpp index 8877800..637bb7a 100644 --- a/src/api/internal/bam/BamWriter_p.cpp +++ b/src/api/internal/bam/BamWriter_p.cpp @@ -2,7 +2,7 @@ // BamWriter_p.cpp (c) 2010 Derek Barnett // Marth Lab, Department of 
Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 4 April 2012 (DB) +// Last modified: 18 November 2012 (DB) // --------------------------------------------------------------------------- // Provides the basic functionality for producing BAM files // *************************************************************************** @@ -284,7 +284,7 @@ void BamWriterPrivate::WriteAlignment(const BamAlignment& al) { // write the base qualities char* pBaseQualities = new char[queryLength](); - if ( al.Qualities.empty() || al.Qualities == "*" ) + if ( al.Qualities.empty() || ( al.Qualities.size() == 1 && al.Qualities[0] == '*' ) || al.Qualities[0] == (char)0xFF ) memset(pBaseQualities, 0xFF, queryLength); // if missing or '*', fill with invalid qual else { for ( size_t i = 0; i < queryLength; ++i ) diff --git a/src/api/internal/index/CMakeLists.txt b/src/api/internal/index/CMakeLists.txt index 1c78cb9..d6a7df6 100644 --- a/src/api/internal/index/CMakeLists.txt +++ b/src/api/internal/index/CMakeLists.txt @@ -5,9 +5,9 @@ # src/api/internal/index # ========================== -set ( InternalIndexDir "${InternalDir}/index" ) +set( InternalIndexDir "${InternalDir}/index" ) -set ( InternalIndexSources +set( InternalIndexSources ${InternalIndexDir}/BamIndexFactory_p.cpp ${InternalIndexDir}/BamStandardIndex_p.cpp ${InternalIndexDir}/BamToolsIndex_p.cpp diff --git a/src/api/internal/io/BamHttp_p.cpp b/src/api/internal/io/BamHttp_p.cpp index 377be82..b089172 100644 --- a/src/api/internal/io/BamHttp_p.cpp +++ b/src/api/internal/io/BamHttp_p.cpp @@ -2,7 +2,7 @@ // BamHttp_p.cpp (c) 2011 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 8 December 2011 (DB) +// Last modified: 24 July 2013 (DB) // --------------------------------------------------------------------------- // Provides reading/writing of BAM 
files on HTTP server // *************************************************************************** @@ -16,6 +16,7 @@ using namespace BamTools::Internal; #include #include +#include #include #include using namespace std; @@ -34,9 +35,11 @@ static const size_t HTTP_PREFIX_LENGTH = 7; static const string DOUBLE_NEWLINE = "\n\n"; static const string GET_METHOD = "GET"; +static const string HEAD_METHOD = "HEAD"; static const string HOST_HEADER = "Host"; static const string RANGE_HEADER = "Range"; static const string BYTES_PREFIX = "bytes="; +static const string CONTENT_LENGTH_HEADER = "Content-Length"; static const char HOST_SEPARATOR = '/'; static const char PROXY_SEPARATOR = ':'; @@ -75,7 +78,8 @@ BamHttp::BamHttp(const string& url) , m_response(0) , m_isUrlParsed(false) , m_filePosition(-1) - , m_endRangeFilePosition(-1) + , m_fileEndPosition(-1) + , m_rangeEndPosition(-1) { ParseUrl(url); } @@ -88,25 +92,24 @@ BamHttp::~BamHttp(void) { delete m_socket; } -void BamHttp::Close(void) { - - // disconnect socket - m_socket->DisconnectFromHost(); - - // clean up request & response - if ( m_request ) { - delete m_request; - m_request = 0; - } +void BamHttp::ClearResponse(void) { if ( m_response ) { delete m_response; m_response = 0; } +} - // reset state - necessary?? +void BamHttp::Close(void) { + + // disconnect socket & clear related resources + DisconnectSocket(); + + // reset state m_isUrlParsed = false; - m_filePosition = -1; - m_endRangeFilePosition = -1; + m_filePosition = -1; + m_fileEndPosition = -1; + m_rangeEndPosition = -1; + m_mode = IBamIODevice::NotOpen; } bool BamHttp::ConnectSocket(void) { @@ -115,23 +118,7 @@ bool BamHttp::ConnectSocket(void) { // any state checks, etc? 
if ( !m_socket->ConnectToHost(m_hostname, m_port, m_mode) ) { - // TODO: set error string - return false; - } - - // attempt initial request - m_filePosition = 0; - m_endRangeFilePosition = -1; - if ( !SendRequest() ) { - // TODO: set error string - Close(); - return false; - } - - // wait for response from server - if ( !ReceiveResponse() ) { - // TODO: set error string - Close(); + SetErrorString("BamHttp::ConnectSocket", m_socket->GetErrorString()); return false; } @@ -139,10 +126,21 @@ bool BamHttp::ConnectSocket(void) { return true; } +void BamHttp::DisconnectSocket(void) { + + // disconnect socket & clean up + m_socket->DisconnectFromHost(); + ClearResponse(); + if ( m_request ) { + delete m_request; + m_request = 0; + } +} + bool BamHttp::EnsureSocketConnection(void) { if ( m_socket->IsConnected() ) return true; - else return ConnectSocket(); + return ConnectSocket(); } bool BamHttp::IsOpen(void) const { @@ -168,6 +166,20 @@ bool BamHttp::Open(const IBamIODevice::OpenMode mode) { return false; } + // initialize our file positions + m_filePosition = 0; + m_fileEndPosition = 0; + m_rangeEndPosition = 0; + + // attempt to send initial request (just 'HEAD' to check connection) + if ( !SendHeadRequest() ) { + SetErrorString("BamHttp::Open", m_socket->GetErrorString()); + return false; + } + + // clear response from HEAD request, not needed + ClearResponse(); + // return success return true; } @@ -216,62 +228,90 @@ int64_t BamHttp::Read(char* data, const unsigned int numBytes) { if ( !IsOpen() ) return -1; - // read until hit desired @numBytes - int64_t bytesReadSoFar = 0; - while ( bytesReadSoFar < numBytes ) { - - // calculate number of bytes we're going to try to read this iteration - const size_t remainingBytes = ( numBytes - bytesReadSoFar ); + int64_t numBytesReadSoFar = 0; + while ( numBytesReadSoFar < numBytes ) { - // if socket has access to entire file contents - // i.e. 
we received response with full data (status code == 200) - if ( m_endRangeFilePosition < 0 ) { + const size_t remaining = static_cast( numBytes - numBytesReadSoFar ); - // try to read 'remainingBytes' from socket - const int64_t socketBytesRead = ReadFromSocket(data+bytesReadSoFar, remainingBytes); - if ( socketBytesRead < 0 ) // error + // if we're not holding a valid GET reponse, get one + if ( m_response == 0 ) { + if ( !SendGetRequest(remaining) ) return -1; - else if ( socketBytesRead == 0 ) // EOF - return bytesReadSoFar; - bytesReadSoFar += socketBytesRead; - m_filePosition += socketBytesRead; } + BT_ASSERT_X(m_response, "null HTTP response"); - // socket has access to a range of data (might already be in buffer) - // i.e. we received response with partial data (status code == 206) - else { + // check response status code + const int statusCode = m_response->GetStatusCode(); + + // if we receieved full file contents in response + if ( statusCode == 200 ) { + + // try to read 'remaining' bytes from socket + const int64_t socketBytesRead = ReadFromSocket(data+numBytesReadSoFar, remaining); + + // if error + if ( socketBytesRead < 0 ) { + SetErrorString("BamHttp::Read", m_socket->GetErrorString()); + return -1; + } + + // EOF + else if ( socketBytesRead == 0 ) + return numBytesReadSoFar; - // there is data left from last request - if ( m_endRangeFilePosition > m_filePosition ) { + // update counters + numBytesReadSoFar += socketBytesRead; + m_filePosition += socketBytesRead; - // try to read either the total 'remainingBytes' or - // whatever we have remaining from last request range - const size_t rangeRemainingBytes = m_endRangeFilePosition - m_filePosition; - const size_t bytesToRead = std::min(remainingBytes, rangeRemainingBytes); - const int64_t socketBytesRead = ReadFromSocket(data+bytesReadSoFar, bytesToRead); - if ( socketBytesRead < 0 ) // error + } + + // else if we received a range of bytes in response + else if ( statusCode == 206 ) { + + // if we've 
exhausted the last request + if ( m_filePosition == m_rangeEndPosition ) { + if ( !SendGetRequest(remaining) ) return -1; - else if ( socketBytesRead == 0 ) // EOF - return bytesReadSoFar; - bytesReadSoFar += socketBytesRead; - m_filePosition += socketBytesRead; } - // otherwise, this is a 1st-time read or - // we already read everything from the last GET request else { - // request for next range - if ( !SendRequest(remainingBytes) || !ReceiveResponse() ) { - Close(); + // try to read 'remaining' bytes from socket + const int64_t socketBytesRead = ReadFromSocket(data+numBytesReadSoFar, remaining); + + // if error + if ( socketBytesRead < 0 ) { + SetErrorString("BamHttp::Read", m_socket->GetErrorString()); return -1; } + + // maybe EOF + else if ( socketBytesRead == 0 ) { + + // if we know we're not at end position, fire off a new request + if ( m_fileEndPosition > 0 && m_filePosition < m_fileEndPosition ) { + if ( !SendGetRequest() ) + return -1; + } else + return numBytesReadSoFar; + } + + // update counters + numBytesReadSoFar += socketBytesRead; + m_filePosition += socketBytesRead; } } + + + // else some other HTTP status + else { + SetErrorString("BamHttp::Read", "unsupported status code in response"); + return -1; + } } - // return actual number bytes successfully read - return bytesReadSoFar; + // return actual number of bytes read + return numBytesReadSoFar; } int64_t BamHttp::ReadFromSocket(char* data, const unsigned int maxNumBytes) { @@ -280,17 +320,14 @@ int64_t BamHttp::ReadFromSocket(char* data, const unsigned int maxNumBytes) { bool BamHttp::ReceiveResponse(void) { - // clear any prior response - if ( m_response ) - delete m_response; - - // make sure we're connected - if ( !EnsureSocketConnection() ) - return false; - // fetch header, up until double new line string responseHeader; do { + + // make sure we can read a line + if ( !m_socket->WaitForReadLine() ) + return false; + // read line & append to full header const string headerLine = 
m_socket->ReadLine(); responseHeader += headerLine; @@ -299,7 +336,7 @@ bool BamHttp::ReceiveResponse(void) { // sanity check if ( responseHeader.empty() ) { - // TODO: set error string + SetErrorString("BamHttp::ReceiveResponse", "empty HTTP response"); Close(); return false; } @@ -307,93 +344,184 @@ bool BamHttp::ReceiveResponse(void) { // create response from header text m_response = new HttpResponseHeader(responseHeader); if ( !m_response->IsValid() ) { - // TODO: set error string + SetErrorString("BamHttp::ReceiveResponse", "could not parse HTTP response"); Close(); return false; } - // if we got range response as requested - if ( m_response->GetStatusCode() == 206 ) - return true; - - // if we got the full file contents instead of range - else if ( m_response->GetStatusCode() == 200 ) { + // if we get here, success + return true; +} - // skip up to current file position - RaiiBuffer tmp(0x8000); - int64_t numBytesRead = 0; - while ( numBytesRead < m_filePosition ) { +bool BamHttp::Seek(const int64_t& position, const int origin) { - const int64_t remaining = m_filePosition - numBytesRead; - const size_t bytesToRead = static_cast( (remaining > 0x8000) ? 
0x8000 : remaining ); - const int64_t socketBytesRead = ReadFromSocket(tmp.Buffer, bytesToRead); - if ( socketBytesRead < 0 ) { // error - Close(); - return false; - } - else if ( socketBytesRead == 0 ) // EOF - break; + // if HTTP device not in a valid state + if ( !IsOpen() ) { + SetErrorString("BamHttp::Seek", "cannot seek on unopen connection"); + return false; + } - numBytesRead += socketBytesRead; - } + // reset the connection + DisconnectSocket(); + if ( !ConnectSocket() ) { + SetErrorString("BamHttp::Seek", m_socket->GetErrorString()); + return false; + } - // return success - return ( numBytesRead == m_filePosition); + // udpate file position + switch ( origin ) { + case SEEK_CUR : m_filePosition += position; break; + case SEEK_SET : m_filePosition = position; break; + default : + SetErrorString("BamHttp::Seek", "unsupported seek origin"); + return false; } - // on any other reponse status - // TODO: set error string - Close(); - return false; + // return success + return true; } -bool BamHttp::Seek(const int64_t& position, const int origin) { +bool BamHttp::SendGetRequest(const size_t numBytes) { - // if HTTP device not in a valid state - if ( !IsOpen() ) { - // TODO: set error string + // clear previous data + ClearResponse(); + if ( m_request ) + delete m_request; + m_socket->ClearBuffer(); + + // make sure we're connected + if ( !EnsureSocketConnection() ) + return false; + + // create range string + const int64_t endPosition = m_filePosition + std::max(static_cast(0x10000), numBytes); + stringstream range(""); + range << BYTES_PREFIX << m_filePosition << '-' << endPosition; + + // create request + m_request = new HttpRequestHeader(GET_METHOD, m_filename); + m_request->SetField(HOST_HEADER, m_hostname); + m_request->SetField(RANGE_HEADER, range.str()); + + // send request + const string requestHeader = m_request->ToString(); + const size_t headerSize = requestHeader.size(); + if ( WriteToSocket(requestHeader.c_str(), headerSize) != headerSize ) { + 
SetErrorString("BamHttp::SendHeadRequest", m_socket->GetErrorString()); return false; } - // discard socket's buffer contents, update positions, & return success + // ensure clean buffer m_socket->ClearBuffer(); - if ( origin == SEEK_CUR ) - m_filePosition += position; - else if ( origin == SEEK_SET ) - m_filePosition = position; - else { - // TODO: set error string + // wait for response + if ( !ReceiveResponse() ) { + SetErrorString("BamHttp::SendGetRequest", m_socket->GetErrorString()); + Close(); return false; } - m_endRangeFilePosition = m_filePosition; - return true; + BT_ASSERT_X(m_response, "BamHttp::SendGetRequest : null HttpResponse"); + BT_ASSERT_X(m_response->IsValid(), "BamHttp::SendGetRequest : invalid HttpResponse"); + + // check response status code + const int statusCode = m_response->GetStatusCode(); + switch ( statusCode ) { + + // ranged response, as requested + case 206 : + // get content length if available + if ( m_response->ContainsKey(CONTENT_LENGTH_HEADER) ) { + const string contentLengthString = m_response->GetValue(CONTENT_LENGTH_HEADER); + m_rangeEndPosition = m_filePosition + atoi( contentLengthString.c_str() ); + } + return true; + + // full contents, not range + case 200 : + { + // skip up to current file position + RaiiBuffer tmp(0x8000); + int64_t numBytesRead = 0; + while ( numBytesRead < m_filePosition ) { + + // read data from response + const int64_t remaining = m_filePosition - numBytesRead; + const size_t bytesToRead = static_cast( (remaining > 0x8000) ? 
0x8000 : remaining ); + const int64_t socketBytesRead = ReadFromSocket(tmp.Buffer, bytesToRead); + + // if error + if ( socketBytesRead < 0 ) { + SetErrorString("BamHttp::SendGetRequest", m_socket->GetErrorString()); + Close(); + return false; + } + + // else if EOF + else if ( socketBytesRead == 0 && m_socket->BufferBytesAvailable() == 0 ) + break; + + // update byte counter + numBytesRead += socketBytesRead; + } + + // return success + return ( numBytesRead == m_filePosition); + } + + // any other status codes + default: + break; + } + + // fail on unexpected status code + SetErrorString("BamHttp::SendGetRequest", "unsupported status code in response"); + Close(); + return false; } -bool BamHttp::SendRequest(const size_t numBytes) { +bool BamHttp::SendHeadRequest(void) { - // remove any currently active request + // ensure clean slate + ClearResponse(); if ( m_request ) delete m_request; - - // create range string - m_endRangeFilePosition = m_filePosition + numBytes; - stringstream range(""); - range << BYTES_PREFIX << m_filePosition << '-' << m_endRangeFilePosition; + m_socket->ClearBuffer(); // make sure we're connected if ( !EnsureSocketConnection() ) return false; // create request - m_request = new HttpRequestHeader(GET_METHOD, m_filename); - m_request->SetField(HOST_HEADER, m_hostname); - m_request->SetField(RANGE_HEADER, range.str()); + m_request = new HttpRequestHeader(HEAD_METHOD, m_filename); + m_request->SetField(HOST_HEADER, m_hostname); - // write request to socket + // send request const string requestHeader = m_request->ToString(); const size_t headerSize = requestHeader.size(); - return ( WriteToSocket(requestHeader.c_str(), headerSize) == headerSize ); + if ( WriteToSocket(requestHeader.c_str(), headerSize) != headerSize ) { + SetErrorString("BamHttp::SendHeadRequest", m_socket->GetErrorString()); + return false; + } + + m_socket->ClearBuffer(); + + // wait for response from server + if ( !ReceiveResponse() ) { + 
SetErrorString("BamHttp::SendHeadRequest", m_socket->GetErrorString()); + Close(); + return false; + } + BT_ASSERT_X(m_response, "BamHttp::SendHeadRequest : null HttpResponse"); + BT_ASSERT_X(m_response->IsValid(), "BamHttp::SendHeadRequest : invalid HttpResponse"); + + // get content length if available + if ( m_response->ContainsKey(CONTENT_LENGTH_HEADER) ) { + const string contentLengthString = m_response->GetValue(CONTENT_LENGTH_HEADER); + m_fileEndPosition = atoi( contentLengthString.c_str() ) - 1; + } + + // return whether we found any errors + return m_socket->GetError() == TcpSocket::NoError; } int64_t BamHttp::Tell(void) const { diff --git a/src/api/internal/io/BamHttp_p.h b/src/api/internal/io/BamHttp_p.h index 371ccce..cbbc95c 100644 --- a/src/api/internal/io/BamHttp_p.h +++ b/src/api/internal/io/BamHttp_p.h @@ -50,12 +50,15 @@ class BamHttp : public IBamIODevice { // internal methods private: + void ClearResponse(void); bool ConnectSocket(void); + void DisconnectSocket(void); bool EnsureSocketConnection(void); void ParseUrl(const std::string& url); int64_t ReadFromSocket(char* data, const unsigned int numBytes); bool ReceiveResponse(void); - bool SendRequest(const size_t numBytes = 0); + bool SendGetRequest(const size_t numBytes = 0x10000); + bool SendHeadRequest(void); int64_t WriteToSocket(const char* data, const unsigned int numBytes); // data members @@ -78,7 +81,8 @@ class BamHttp : public IBamIODevice { // file position int64_t m_filePosition; - int64_t m_endRangeFilePosition; + int64_t m_fileEndPosition; + int64_t m_rangeEndPosition; }; } // namespace Internal diff --git a/src/api/internal/io/CMakeLists.txt b/src/api/internal/io/CMakeLists.txt index d9da416..28153d5 100644 --- a/src/api/internal/io/CMakeLists.txt +++ b/src/api/internal/io/CMakeLists.txt @@ -5,12 +5,12 @@ # src/api/internal/io # ========================== -set ( InternalIODir "${InternalDir}/io" ) +set( InternalIODir "${InternalDir}/io" ) #-------------------------- # 
platform-independent IO #-------------------------- -set ( CommonIOSources +set( CommonIOSources ${InternalIODir}/BamDeviceFactory_p.cpp ${InternalIODir}/BamFile_p.cpp ${InternalIODir}/BamFtp_p.cpp @@ -30,21 +30,17 @@ set ( CommonIOSources #------------------------ # platform-dependent IO #------------------------ -if ( _WIN32 ) - set ( PlatformIOSources - ${InternalIODir}/TcpSocketEngine_win_p.cpp - ) -else ( _WIN32 ) - set ( PlatformIOSources - ${InternalIODir}/TcpSocketEngine_unix_p.cpp - ) -endif ( _WIN32 ) +if( WIN32 ) + set( PlatformIOSources ${InternalIODir}/TcpSocketEngine_win_p.cpp ) +else() + set( PlatformIOSources ${InternalIODir}/TcpSocketEngine_unix_p.cpp ) +endif() #--------------------------- # make build-specific list #--------------------------- -set ( InternalIOSources - ${CommonIOSources} +set( InternalIOSources + ${CommonIOSources} ${PlatformIOSources} PARENT_SCOPE # <-- leave this last diff --git a/src/api/internal/io/HttpHeader_p.h b/src/api/internal/io/HttpHeader_p.h index 7a50ff9..6b838ff 100644 --- a/src/api/internal/io/HttpHeader_p.h +++ b/src/api/internal/io/HttpHeader_p.h @@ -75,7 +75,7 @@ class HttpRequestHeader : public HttpHeader { // ctor & dtor public: - HttpRequestHeader(const std::string& method, // "GET", "PUT", etc + HttpRequestHeader(const std::string& method, // "GET", "HEAD", ... 
const std::string& resource, // filename int majorVersion = 1, // version info int minorVersion = 1); diff --git a/src/api/internal/io/RollingBuffer_p.cpp b/src/api/internal/io/RollingBuffer_p.cpp index 10e7627..c712b57 100644 --- a/src/api/internal/io/RollingBuffer_p.cpp +++ b/src/api/internal/io/RollingBuffer_p.cpp @@ -237,7 +237,7 @@ size_t RollingBuffer::ReadLine(char* dest, size_t max) { bytesReadSoFar += bytesToRead; Free(bytesToRead); - if ( !((bytesReadSoFar < index+1)&&(bytesReadSoFar < max-1)) ) + if ( !((bytesReadSoFar < index+1) && (bytesReadSoFar < max-1)) ) finished = true; } @@ -274,7 +274,7 @@ char* RollingBuffer::Reserve(size_t n) { if ( (m_tail + n) <= m_data.at(m_tailBufferIndex).Size() ) { // fetch write pointer at current 'tail', increment tail by @n & return - char* ptr = m_data[m_tailBufferIndex].Data() + m_tail; + char* ptr = m_data[m_tailBufferIndex].Data(); //+ m_tail; m_tail += n; return ptr; } @@ -286,7 +286,7 @@ char* RollingBuffer::Reserve(size_t n) { m_data[m_tailBufferIndex].Resize(m_tail + n); // fetch write pointer at current 'tail', increment tail by @n & return - char* ptr = m_data[m_tailBufferIndex].Data() + m_tail; + char* ptr = m_data[m_tailBufferIndex].Data(); //+ m_tail; m_tail += n; return ptr; } diff --git a/src/api/internal/io/TcpSocket_p.cpp b/src/api/internal/io/TcpSocket_p.cpp index 1a5bd86..d390932 100644 --- a/src/api/internal/io/TcpSocket_p.cpp +++ b/src/api/internal/io/TcpSocket_p.cpp @@ -27,7 +27,7 @@ namespace BamTools { namespace Internal { // constants -static const size_t DEFAULT_BUFFER_SIZE = 0x4000; +static const size_t DEFAULT_BUFFER_SIZE = 0x10000; } // namespace Internal } // namespace BamTools @@ -43,7 +43,7 @@ TcpSocket::TcpSocket(void) , m_engine(0) , m_cachedSocketDescriptor(-1) , m_readBuffer(DEFAULT_BUFFER_SIZE) - , m_error(TcpSocket::UnknownSocketError) + , m_error(TcpSocket::NoError) , m_state(TcpSocket::UnconnectedState) { } @@ -79,7 +79,7 @@ bool TcpSocket::ConnectImpl(const HostInfo& hostInfo, 
m_hostName = hostInfo.HostName(); m_mode = mode; m_state = TcpSocket::UnconnectedState; - m_error = TcpSocket::UnknownSocketError; + m_error = TcpSocket::NoError; // m_localPort = 0; m_remotePort = 0; // m_localAddress.Clear(); diff --git a/src/api/internal/io/TcpSocket_p.h b/src/api/internal/io/TcpSocket_p.h index a25a11e..2ad2dee 100644 --- a/src/api/internal/io/TcpSocket_p.h +++ b/src/api/internal/io/TcpSocket_p.h @@ -28,13 +28,15 @@ namespace BamTools { namespace Internal { +class BamHttp; class TcpSocketEngine; class TcpSocket { // enums public: - enum SocketError { UnknownSocketError = -1 + enum SocketError { NoError = -2 + , UnknownSocketError = -1 , ConnectionRefusedError = 0 , RemoteHostClosedError , HostNotFoundError @@ -116,6 +118,8 @@ class TcpSocket { TcpSocket::SocketError m_error; TcpSocket::SocketState m_state; std::string m_errorString; + + friend class BamHttp; }; } // namespace Internal diff --git a/src/api/internal/sam/CMakeLists.txt b/src/api/internal/sam/CMakeLists.txt index 4b2bce2..2f303bd 100644 --- a/src/api/internal/sam/CMakeLists.txt +++ b/src/api/internal/sam/CMakeLists.txt @@ -5,9 +5,9 @@ # src/api/internal/sam # ========================== -set ( InternalSamDir "${InternalDir}/sam" ) +set( InternalSamDir "${InternalDir}/sam" ) -set ( InternalSamSources +set( InternalSamSources ${InternalSamDir}/SamFormatParser_p.cpp ${InternalSamDir}/SamFormatPrinter_p.cpp ${InternalSamDir}/SamHeaderValidator_p.cpp diff --git a/src/api/internal/utils/CMakeLists.txt b/src/api/internal/utils/CMakeLists.txt index 38a6957..4b1e2c2 100644 --- a/src/api/internal/utils/CMakeLists.txt +++ b/src/api/internal/utils/CMakeLists.txt @@ -5,9 +5,9 @@ # src/api/internal/utils # ========================== -set ( InternalUtilsDir "${InternalDir}/utils" ) +set( InternalUtilsDir "${InternalDir}/utils" ) -set ( InternalUtilsSources +set( InternalUtilsSources ${InternalUtilsDir}/BamException_p.cpp PARENT_SCOPE # <-- leave this last diff --git 
a/src/third_party/jsoncpp/CMakeLists.txt b/src/third_party/jsoncpp/CMakeLists.txt index 03c091b..8dc64e4 100644 --- a/src/third_party/jsoncpp/CMakeLists.txt +++ b/src/third_party/jsoncpp/CMakeLists.txt @@ -10,7 +10,7 @@ add_definitions( -DBAMTOOLS_JSONCPP_LIBRARY ) # (for proper exporting of library add_definitions( -fPIC ) # (attempt to force PIC compiling on CentOS, not being set on shared libs by CMake) # create jsoncpp library -add_library( jsoncpp SHARED +add_library( jsoncpp STATIC json_reader.cpp json_value.cpp json_writer.cpp @@ -18,6 +18,6 @@ add_library( jsoncpp SHARED # set jsoncpp library properties set_target_properties( jsoncpp PROPERTIES - SOVERSION 1.0.0 OUTPUT_NAME jsoncpp + PREFIX "lib" ) diff --git a/src/toolkit/CMakeLists.txt b/src/toolkit/CMakeLists.txt index 1f198b3..33b0735 100644 --- a/src/toolkit/CMakeLists.txt +++ b/src/toolkit/CMakeLists.txt @@ -31,7 +31,7 @@ add_executable( bamtools_cmd # set BamTools application properties set_target_properties( bamtools_cmd PROPERTIES - VERSION 2.2.0 + VERSION 2.3.0 OUTPUT_NAME "bamtools" ) # make version info available in application diff --git a/src/toolkit/bamtools_convert.cpp b/src/toolkit/bamtools_convert.cpp index 0e1743f..54820e7 100644 --- a/src/toolkit/bamtools_convert.cpp +++ b/src/toolkit/bamtools_convert.cpp @@ -2,7 +2,7 @@ // bamtools_convert.cpp (c) 2010 Derek Barnett, Erik Garrison // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 11 November 2012 +// Last modified: 10 December 2012 // --------------------------------------------------------------------------- // Converts between BAM and a number of other formats // *************************************************************************** @@ -76,6 +76,7 @@ struct ConvertTool::ConvertSettings { // flag bool HasInput; + bool HasInputFilelist; bool HasOutput; bool HasFormat; bool HasRegion; @@ -87,6 +88,7 @@ struct 
ConvertTool::ConvertSettings { // options vector InputFiles; + string InputFilelist; string OutputFilename; string Format; string Region; @@ -97,6 +99,7 @@ struct ConvertTool::ConvertSettings { // constructor ConvertSettings(void) : HasInput(false) + , HasInputFilelist(false) , HasOutput(false) , HasFormat(false) , HasRegion(false) @@ -151,9 +154,23 @@ bool ConvertTool::ConvertToolPrivate::Run(void) { // initialize conversion input/output // set to default input if none provided - if ( !m_settings->HasInput ) + if ( !m_settings->HasInput && !m_settings->HasInputFilelist ) m_settings->InputFiles.push_back(Options::StandardIn()); + // add files in the filelist to the input file list + if ( m_settings->HasInputFilelist ) { + + ifstream filelist(m_settings->InputFilelist.c_str(), ios::in); + if ( !filelist.is_open() ) { + cerr << "bamtools convert ERROR: could not open input BAM file list... Aborting." << endl; + return false; + } + + string line; + while ( getline(filelist, line) ) + m_settings->InputFiles.push_back(line); + } + // open input files BamMultiReader reader; if ( !reader.Open(m_settings->InputFiles) ) { @@ -406,7 +423,7 @@ void ConvertTool::ConvertToolPrivate::PrintJson(const BamAlignment& a) { } // write alignment's source BAM file - m_out << "\"filename\":" << a.Filename << ","; + m_out << "\"filename\":\"" << a.Filename << "\","; // write tag data const char* tagData = a.TagData.c_str(); @@ -703,11 +720,13 @@ ConvertTool::ConvertTool(void) , m_impl(0) { // set program details - Options::SetProgramInfo("bamtools convert", "converts BAM to a number of other formats", "-format [-in -in ...] [-out ] [-region ] [format-specific options]"); + Options::SetProgramInfo("bamtools convert", "converts BAM to a number of other formats", + "-format [-in -in ... 
| -list ] [-out ] [-region ] [format-specific options]"); // set up options OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", m_settings->HasInput, m_settings->InputFiles, IO_Opts, Options::StandardIn()); + Options::AddValueOption("-list", "filename", "the input BAM file list, one line per file", "", m_settings->HasInputFilelist, m_settings->InputFilelist, IO_Opts); Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutput, m_settings->OutputFilename, IO_Opts, Options::StandardOut()); Options::AddValueOption("-format", "FORMAT", "the output file format - see README for recognized formats", "", m_settings->HasFormat, m_settings->Format, IO_Opts); Options::AddValueOption("-region", "REGION", "genomic region. Index file is recommended for better performance, and is used automatically if it exists. See \'bamtools help index\' for more details on creating one", "", m_settings->HasRegion, m_settings->Region, IO_Opts); diff --git a/src/toolkit/bamtools_count.cpp b/src/toolkit/bamtools_count.cpp index 3593f4d..5a7c0a7 100644 --- a/src/toolkit/bamtools_count.cpp +++ b/src/toolkit/bamtools_count.cpp @@ -2,7 +2,7 @@ // bamtools_count.cpp (c) 2010 Derek Barnett, Erik Garrison // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 7 April 2011 +// Last modified: 10 December 2012 // --------------------------------------------------------------------------- // Prints alignment count for BAM file(s) // *************************************************************************** @@ -15,6 +15,7 @@ #include using namespace BamTools; +#include #include #include #include @@ -27,15 +28,18 @@ struct CountTool::CountSettings { // flags bool HasInput; + bool HasInputFilelist; bool HasRegion; // filenames vector InputFiles; + string InputFilelist; string Region; // 
constructor CountSettings(void) : HasInput(false) + , HasInputFilelist(false) , HasRegion(false) { } }; @@ -64,10 +68,24 @@ struct CountTool::CountToolPrivate { bool CountTool::CountToolPrivate::Run(void) { - // if no '-in' args supplied, default to stdin - if ( !m_settings->HasInput ) + // set to default input if none provided + if ( !m_settings->HasInput && !m_settings->HasInputFilelist ) m_settings->InputFiles.push_back(Options::StandardIn()); + // add files in the filelist to the input file list + if ( m_settings->HasInputFilelist ) { + + ifstream filelist(m_settings->InputFilelist.c_str(), ios::in); + if ( !filelist.is_open() ) { + cerr << "bamtools count ERROR: could not open input BAM file list... Aborting." << endl; + return false; + } + + string line; + while ( getline(filelist, line) ) + m_settings->InputFiles.push_back(line); + } + // open reader without index BamMultiReader reader; if ( !reader.Open(m_settings->InputFiles) ) { @@ -150,12 +168,16 @@ CountTool::CountTool(void) , m_impl(0) { // set program details - Options::SetProgramInfo("bamtools count", "prints number of alignments in BAM file(s)", "[-in -in ...] [-region ]"); + Options::SetProgramInfo("bamtools count", "prints number of alignments in BAM file(s)", + "[-in -in ... | -list ] [-region ]"); // set up options OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", m_settings->HasInput, m_settings->InputFiles, IO_Opts, Options::StandardIn()); - Options::AddValueOption("-region", "REGION", "genomic region. Index file is recommended for better performance, and is used automatically if it exists. 
See \'bamtools help index\' for more details on creating one", "", m_settings->HasRegion, m_settings->Region, IO_Opts); + Options::AddValueOption("-list", "filename", "the input BAM file list, one line per file", "", m_settings->HasInputFilelist, m_settings->InputFilelist, IO_Opts); + Options::AddValueOption("-region", "REGION", + "genomic region. Index file is recommended for better performance, and is used automatically if it exists. See \'bamtools help index\' for more details on creating one", + "", m_settings->HasRegion, m_settings->Region, IO_Opts); } CountTool::~CountTool(void) { diff --git a/src/toolkit/bamtools_coverage.cpp b/src/toolkit/bamtools_coverage.cpp index c0ecd8f..6a4493d 100644 --- a/src/toolkit/bamtools_coverage.cpp +++ b/src/toolkit/bamtools_coverage.cpp @@ -2,7 +2,7 @@ // bamtools_coverage.cpp (c) 2010 Derek Barnett, Erik Garrison // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 7 April 2011 +// Last modified: 24 July 2013 // --------------------------------------------------------------------------- // Prints coverage data for a single BAM file // *************************************************************************** @@ -137,6 +137,7 @@ bool CoverageTool::CoverageToolPrivate::Run(void) { BamAlignment al; while ( reader.GetNextAlignment(al) ) pileup.AddAlignment(al); + pileup.Flush(); // clean up reader.Close(); diff --git a/src/toolkit/bamtools_filter.cpp b/src/toolkit/bamtools_filter.cpp index 8af9cb9..2f17242 100644 --- a/src/toolkit/bamtools_filter.cpp +++ b/src/toolkit/bamtools_filter.cpp @@ -2,7 +2,7 @@ // bamtools_filter.cpp (c) 2010 Derek Barnett, Erik Garrison // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 14 October 2011 +// Last modified: 3 May 2013 // --------------------------------------------------------------------------- 
// Filters BAM file(s) according to some user-specified criteria // *************************************************************************** @@ -20,6 +20,7 @@ using namespace BamTools; using namespace Json; #include +#include #include #include #include @@ -47,6 +48,7 @@ const string ISPROPERPAIR_PROPERTY = "isProperPair"; const string ISREVERSESTRAND_PROPERTY = "isReverseStrand"; const string ISSECONDMATE_PROPERTY = "isSecondMate"; const string ISSINGLETON_PROPERTY = "isSingleton"; +const string LENGTH_PROPERTY = "length"; const string MAPQUALITY_PROPERTY = "mapQuality"; const string MATEPOSITION_PROPERTY = "matePosition"; const string MATEREFERENCE_PROPERTY = "mateReference"; @@ -106,6 +108,7 @@ struct BamAlignmentChecker { const bool isSingleton = al.IsPaired() && al.IsMapped() && !al.IsMateMapped(); keepAlignment &= valueFilter.check(isSingleton); } + else if ( propertyName == LENGTH_PROPERTY ) keepAlignment &= valueFilter.check(al.Length); else if ( propertyName == MAPQUALITY_PROPERTY ) keepAlignment &= valueFilter.check(al.MapQuality); else if ( propertyName == MATEPOSITION_PROPERTY ) keepAlignment &= ( al.IsPaired() && al.IsMateMapped() && valueFilter.check(al.MateRefID) ); else if ( propertyName == MATEREFERENCE_PROPERTY ) { @@ -156,6 +159,7 @@ struct BamAlignmentChecker { string tagFilterString = entireTagFilterString.substr(3); // switch on tag type to set tag query value & parse filter token + int8_t asciiFilterValue, asciiQueryValue; int32_t intFilterValue, intQueryValue; uint32_t uintFilterValue, uintQueryValue; float realFilterValue, realQueryValue; @@ -166,6 +170,17 @@ struct BamAlignmentChecker { bool keepAlignment = false; switch (tagType) { + // ASCII tag type + case 'A': + if ( al.GetTag(tagName, asciiQueryValue) ) { + if ( FilterEngine::parseToken(tagFilterString, asciiFilterValue, compareType) ) { + tagFilter.Value = asciiFilterValue; + tagFilter.Type = compareType; + keepAlignment = tagFilter.check(asciiQueryValue); + } + } + break; + // 
signed int tag type case 'c' : case 's' : @@ -204,7 +219,7 @@ struct BamAlignmentChecker { break; // string tag type - case 'A': + case 'Z': case 'H': if ( al.GetTag(tagName, stringQueryValue) ) { @@ -236,14 +251,16 @@ struct FilterTool::FilterSettings { // IO opts // flags - bool HasInputBamFilename; - bool HasOutputBamFilename; + bool HasInput; + bool HasInputFilelist; + bool HasOutput; bool HasRegion; - bool HasScriptFilename; + bool HasScript; bool IsForceCompression; // filenames vector InputFiles; + string InputFilelist; string OutputFilename; string Region; string ScriptFilename; @@ -254,6 +271,7 @@ struct FilterTool::FilterSettings { // flags bool HasAlignmentFlagFilter; bool HasInsertSizeFilter; + bool HasLengthFilter; bool HasMapQualityFilter; bool HasNameFilter; bool HasQueryBasesFilter; @@ -262,8 +280,9 @@ struct FilterTool::FilterSettings { // filters string AlignmentFlagFilter; string InsertSizeFilter; - string NameFilter; + string LengthFilter; string MapQualityFilter; + string NameFilter; string QueryBasesFilter; string TagFilter; // support multiple ? 
@@ -302,14 +321,16 @@ struct FilterTool::FilterSettings { // constructor FilterSettings(void) - : HasInputBamFilename(false) - , HasOutputBamFilename(false) + : HasInput(false) + , HasInputFilelist(false) + , HasOutput(false) , HasRegion(false) - , HasScriptFilename(false) + , HasScript(false) , IsForceCompression(false) , OutputFilename(Options::StandardOut()) , HasAlignmentFlagFilter(false) , HasInsertSizeFilter(false) + , HasLengthFilter(false) , HasMapQualityFilter(false) , HasNameFilter(false) , HasQueryBasesFilter(false) @@ -429,6 +450,7 @@ bool FilterTool::FilterToolPrivate::AddPropertyTokensToFilter(const string& filt // int32_t conversion else if ( propertyName == INSERTSIZE_PROPERTY || + propertyName == LENGTH_PROPERTY || propertyName == MATEPOSITION_PROPERTY || propertyName == POSITION_PROPERTY ) @@ -463,11 +485,11 @@ bool FilterTool::FilterToolPrivate::AddPropertyTokensToFilter(const string& filt m_filterEngine.setProperty(filterName, propertyName, stringValue, type); } - else if ( propertyName == TAG_PROPERTY ) { - // this will be stored directly as the TAG:VALUE token - // (VALUE may contain compare ops, will be parsed out later) - m_filterEngine.setProperty(filterName, propertyName, token, PropertyFilterValue::EXACT); - } + else if ( propertyName == TAG_PROPERTY ) { + // this will be stored directly as the TAG:VALUE token + // (VALUE may contain compare ops, will be parsed out later) + m_filterEngine.setProperty(filterName, propertyName, token, PropertyFilterValue::EXACT); + } // else unknown property else { @@ -500,7 +522,8 @@ const string FilterTool::FilterToolPrivate::GetScriptContents(void) { // peek ahead, make sure there is data available char ch = fgetc(inFile); ungetc(ch, inFile); - if( feof(inFile) ) break; + if( feof(inFile) ) + break; // read next block of data if ( fgets(buffer, 1024, inFile) == 0 ) { @@ -536,6 +559,7 @@ void FilterTool::FilterToolPrivate::InitProperties(void) { m_propertyNames.push_back(ISREVERSESTRAND_PROPERTY); 
m_propertyNames.push_back(ISSECONDMATE_PROPERTY); m_propertyNames.push_back(ISSINGLETON_PROPERTY); + m_propertyNames.push_back(LENGTH_PROPERTY); m_propertyNames.push_back(MAPQUALITY_PROPERTY); m_propertyNames.push_back(MATEPOSITION_PROPERTY); m_propertyNames.push_back(MATEREFERENCE_PROPERTY); @@ -574,6 +598,7 @@ bool FilterTool::FilterToolPrivate::ParseCommandLine(void) { if ( m_settings->HasIsReverseStrandFilter ) propertyTokens.insert( make_pair(ISREVERSESTRAND_PROPERTY, m_settings->IsReverseStrandFilter) ); if ( m_settings->HasIsSecondMateFilter ) propertyTokens.insert( make_pair(ISSECONDMATE_PROPERTY, m_settings->IsSecondMateFilter) ); if ( m_settings->HasIsSingletonFilter ) propertyTokens.insert( make_pair(ISSINGLETON_PROPERTY, m_settings->IsSingletonFilter) ); + if ( m_settings->HasLengthFilter ) propertyTokens.insert( make_pair(LENGTH_PROPERTY, m_settings->LengthFilter) ); if ( m_settings->HasMapQualityFilter ) propertyTokens.insert( make_pair(MAPQUALITY_PROPERTY, m_settings->MapQualityFilter) ); if ( m_settings->HasNameFilter ) propertyTokens.insert( make_pair(NAME_PROPERTY, m_settings->NameFilter) ); if ( m_settings->HasQueryBasesFilter ) propertyTokens.insert( make_pair(QUERYBASES_PROPERTY, m_settings->QueryBasesFilter) ); @@ -682,12 +707,27 @@ bool FilterTool::FilterToolPrivate::ParseScript(void) { bool FilterTool::FilterToolPrivate::Run(void) { // set to default input if none provided - if ( !m_settings->HasInputBamFilename ) + if ( !m_settings->HasInput && !m_settings->HasInputFilelist ) m_settings->InputFiles.push_back(Options::StandardIn()); + // add files in the filelist to the input file list + if ( m_settings->HasInputFilelist ) { + + ifstream filelist(m_settings->InputFilelist.c_str(), ios::in); + if ( !filelist.is_open() ) { + cerr << "bamtools filter ERROR: could not open input BAM file list... Aborting." 
<< endl; + return false; + } + + string line; + while ( getline(filelist, line) ) + m_settings->InputFiles.push_back(line); + } + // initialize defined properties & user-specified filters // quit if failed - if ( !SetupFilters() ) return false; + if ( !SetupFilters() ) + return false; // open reader without index BamMultiReader reader; @@ -786,7 +826,7 @@ bool FilterTool::FilterToolPrivate::SetupFilters(void) { InitProperties(); // parse script for filter rules, if given - if ( m_settings->HasScriptFilename ) + if ( m_settings->HasScript ) return ParseScript(); // otherwise check command line for filters @@ -804,9 +844,10 @@ FilterTool::FilterTool(void) // ---------------------------------- // set program details - const string usage = "[-in -in ...] " + const string usage = "[-in -in ... | -list ] " "[-out | [-forceCompression]] [-region ] " "[ [-script HasInputBamFilename, m_settings->InputFiles, IO_Opts, Options::StandardIn()); - Options::AddValueOption("-out", "BAM filename", outDesc, "", m_settings->HasOutputBamFilename, m_settings->OutputFilename, IO_Opts, Options::StandardOut()); - Options::AddValueOption("-region", "REGION", regionDesc, "", m_settings->HasRegion, m_settings->Region, IO_Opts); - Options::AddValueOption("-script", "filename", scriptDesc, "", m_settings->HasScriptFilename, m_settings->ScriptFilename, IO_Opts); + Options::AddValueOption("-in", "BAM filename", inDesc, "", m_settings->HasInput, m_settings->InputFiles, IO_Opts, Options::StandardIn()); + Options::AddValueOption("-list", "filename", listDesc, "", m_settings->HasInputFilelist, m_settings->InputFilelist, IO_Opts); + Options::AddValueOption("-out", "BAM filename", outDesc, "", m_settings->HasOutput, m_settings->OutputFilename, IO_Opts, Options::StandardOut()); + Options::AddValueOption("-region", "REGION", regionDesc, "", m_settings->HasRegion, m_settings->Region, IO_Opts); + Options::AddValueOption("-script", "filename", scriptDesc, "", m_settings->HasScript, 
m_settings->ScriptFilename, IO_Opts); Options::AddOption("-forceCompression",forceDesc, m_settings->IsForceCompression, IO_Opts); // ---------------------------------- @@ -835,6 +878,7 @@ FilterTool::FilterTool(void) const string flagDesc = "keep reads with this *exact* alignment flag (for more detailed queries, see below)"; const string insertDesc = "keep reads with insert size that matches pattern"; + const string lengthDesc = "keep reads with length that matches pattern"; const string mapQualDesc = "keep reads with map quality that matches pattern"; const string nameDesc = "keep reads with name that matches pattern"; const string queryDesc = "keep reads with motif that matches pattern"; @@ -842,6 +886,7 @@ FilterTool::FilterTool(void) Options::AddValueOption("-alignmentFlag", "int", flagDesc, "", m_settings->HasAlignmentFlagFilter, m_settings->AlignmentFlagFilter, FilterOpts); Options::AddValueOption("-insertSize", "int", insertDesc, "", m_settings->HasInsertSizeFilter, m_settings->InsertSizeFilter, FilterOpts); + Options::AddValueOption("-length", "int", lengthDesc, "", m_settings->HasLengthFilter, m_settings->LengthFilter, FilterOpts); Options::AddValueOption("-mapQuality", "[0-255]", mapQualDesc, "", m_settings->HasMapQualityFilter, m_settings->MapQualityFilter, FilterOpts); Options::AddValueOption("-name", "string", nameDesc, "", m_settings->HasNameFilter, m_settings->NameFilter, FilterOpts); Options::AddValueOption("-queryBases", "string", queryDesc, "", m_settings->HasQueryBasesFilter, m_settings->QueryBasesFilter, FilterOpts); diff --git a/src/toolkit/bamtools_header.cpp b/src/toolkit/bamtools_header.cpp index a08c632..534bb14 100644 --- a/src/toolkit/bamtools_header.cpp +++ b/src/toolkit/bamtools_header.cpp @@ -2,7 +2,7 @@ // bamtools_header.cpp (c) 2010 Derek Barnett, Erik Garrison // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 7 April 2011 +// Last 
modified: 10 December 2012 // --------------------------------------------------------------------------- // Prints the SAM-style header from a single BAM file ( or merged header from // multiple BAM files) to stdout @@ -14,6 +14,7 @@ #include using namespace BamTools; +#include #include #include #include @@ -25,14 +26,17 @@ using namespace std; struct HeaderTool::HeaderSettings { // flags - bool HasInputBamFilename; + bool HasInput; + bool HasInputFilelist; // filenames vector InputFiles; + string InputFilelist; // constructor HeaderSettings(void) - : HasInputBamFilename(false) + : HasInput(false) + , HasInputFilelist(false) { } }; @@ -58,9 +62,23 @@ struct HeaderTool::HeaderToolPrivate { bool HeaderTool::HeaderToolPrivate::Run(void) { // set to default input if none provided - if ( !m_settings->HasInputBamFilename ) + if ( !m_settings->HasInput && !m_settings->HasInputFilelist ) m_settings->InputFiles.push_back(Options::StandardIn()); + // add files in the filelist to the input file list + if ( m_settings->HasInputFilelist ) { + + ifstream filelist(m_settings->InputFilelist.c_str(), ios::in); + if ( !filelist.is_open() ) { + cerr << "bamtools header ERROR: could not open input BAM file list... Aborting." << endl; + return false; + } + + string line; + while ( getline(filelist, line) ) + m_settings->InputFiles.push_back(line); + } + // attemp to open BAM files BamMultiReader reader; if ( !reader.Open(m_settings->InputFiles) ) { @@ -85,11 +103,12 @@ HeaderTool::HeaderTool(void) , m_impl(0) { // set program details - Options::SetProgramInfo("bamtools header", "prints header from BAM file(s)", "[-in -in ...] "); + Options::SetProgramInfo("bamtools header", "prints header from BAM file(s)", "[-in -in ... 
| -list ]"); // set up options OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); - Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", m_settings->HasInputBamFilename, m_settings->InputFiles, IO_Opts, Options::StandardIn()); + Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", m_settings->HasInput, m_settings->InputFiles, IO_Opts, Options::StandardIn()); + Options::AddValueOption("-list", "filename", "the input BAM file list, one line per file", "", m_settings->HasInputFilelist, m_settings->InputFilelist, IO_Opts); } HeaderTool::~HeaderTool(void) { diff --git a/src/toolkit/bamtools_merge.cpp b/src/toolkit/bamtools_merge.cpp index 1e8312f..6a33d12 100644 --- a/src/toolkit/bamtools_merge.cpp +++ b/src/toolkit/bamtools_merge.cpp @@ -2,7 +2,7 @@ // bamtools_merge.cpp (c) 2010 Derek Barnett, Erik Garrison // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 7 April 2011 +// Last modified: 10 December 2012 // --------------------------------------------------------------------------- // Merges multiple BAM files into one // *************************************************************************** @@ -15,6 +15,7 @@ #include using namespace BamTools; +#include #include #include #include @@ -26,13 +27,15 @@ using namespace std; struct MergeTool::MergeSettings { // flags - bool HasInputBamFilename; - bool HasOutputBamFilename; + bool HasInput; + bool HasInputFilelist; + bool HasOutput; bool IsForceCompression; bool HasRegion; // filenames vector InputFiles; + string InputFilelist; // other parameters string OutputFilename; @@ -40,8 +43,9 @@ struct MergeTool::MergeSettings { // constructor MergeSettings(void) - : HasInputBamFilename(false) - , HasOutputBamFilename(false) + : HasInput(false) + , HasInputFilelist(false) + , HasOutput(false) , IsForceCompression(false) , HasRegion(false) , 
OutputFilename(Options::StandardOut()) @@ -73,9 +77,23 @@ struct MergeTool::MergeToolPrivate { bool MergeTool::MergeToolPrivate::Run(void) { // set to default input if none provided - if ( !m_settings->HasInputBamFilename ) + if ( !m_settings->HasInput && !m_settings->HasInputFilelist ) m_settings->InputFiles.push_back(Options::StandardIn()); + // add files in the filelist to the input file list + if ( m_settings->HasInputFilelist ) { + + ifstream filelist(m_settings->InputFilelist.c_str(), ios::in); + if ( !filelist.is_open() ) { + cerr << "bamtools merge ERROR: could not open input BAM file list... Aborting." << endl; + return false; + } + + string line; + while ( getline(filelist, line) ) + m_settings->InputFiles.push_back(line); + } + // opens the BAM files (by default without checking for indexes) BamMultiReader reader; if ( !reader.Open(m_settings->InputFiles) ) { @@ -181,12 +199,14 @@ MergeTool::MergeTool(void) , m_impl(0) { // set program details - Options::SetProgramInfo("bamtools merge", "merges multiple BAM files into one", "[-in -in ...] [-out | [-forceCompression]] [-region ]"); + Options::SetProgramInfo("bamtools merge", "merges multiple BAM files into one", + "[-in -in ... 
| -list ] [-out | [-forceCompression]] [-region ]"); // set up options OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); - Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", m_settings->HasInputBamFilename, m_settings->InputFiles, IO_Opts); - Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutputBamFilename, m_settings->OutputFilename, IO_Opts); + Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", m_settings->HasInput, m_settings->InputFiles, IO_Opts); + Options::AddValueOption("-list", "filename", "the input BAM file list, one line per file", "", m_settings->HasInputFilelist, m_settings->InputFilelist, IO_Opts); + Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutput, m_settings->OutputFilename, IO_Opts); Options::AddOption("-forceCompression", "if results are sent to stdout (like when piping to another tool), default behavior is to leave output uncompressed. Use this flag to override and force compression", m_settings->IsForceCompression, IO_Opts); Options::AddValueOption("-region", "REGION", "genomic region. 
See README for more details", "", m_settings->HasRegion, m_settings->Region, IO_Opts); } diff --git a/src/toolkit/bamtools_random.cpp b/src/toolkit/bamtools_random.cpp index e28ea70..367ac58 100644 --- a/src/toolkit/bamtools_random.cpp +++ b/src/toolkit/bamtools_random.cpp @@ -2,7 +2,7 @@ // bamtools_random.cpp (c) 2010 Derek Barnett, Erik Garrison // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 7 April 2011 (DB) +// Last modified: 24 July 2013 (DB) // --------------------------------------------------------------------------- // Grab a random subset of alignments (testing tool) // *************************************************************************** @@ -17,6 +17,7 @@ using namespace BamTools; #include #include +#include #include #include #include @@ -43,25 +44,32 @@ struct RandomTool::RandomSettings { // flags bool HasAlignmentCount; bool HasInput; + bool HasInputFilelist; bool HasOutput; + bool HasRandomNumberSeed; bool HasRegion; bool IsForceCompression; // parameters unsigned int AlignmentCount; vector InputFiles; + string InputFilelist; string OutputFilename; + unsigned int RandomNumberSeed; string Region; // constructor RandomSettings(void) : HasAlignmentCount(false) , HasInput(false) + , HasInputFilelist(false) , HasOutput(false) + , HasRandomNumberSeed(false) , HasRegion(false) , IsForceCompression(false) , AlignmentCount(RANDOM_MAX_ALIGNMENT_COUNT) , OutputFilename(Options::StandardOut()) + , RandomNumberSeed(0) { } }; @@ -90,9 +98,23 @@ struct RandomTool::RandomToolPrivate { bool RandomTool::RandomToolPrivate::Run(void) { // set to default stdin if no input files provided - if ( !m_settings->HasInput ) + if ( !m_settings->HasInput && !m_settings->HasInputFilelist ) m_settings->InputFiles.push_back(Options::StandardIn()); + // add files in the filelist to the input file list + if ( m_settings->HasInputFilelist ) { + + ifstream 
filelist(m_settings->InputFilelist.c_str(), ios::in); + if ( !filelist.is_open() ) { + cerr << "bamtools random ERROR: could not open input BAM file list... Aborting." << endl; + return false; + } + + string line; + while ( getline(filelist, line) ) + m_settings->InputFiles.push_back(line); + } + // open our reader BamMultiReader reader; if ( !reader.Open(m_settings->InputFiles) ) { @@ -147,7 +169,10 @@ bool RandomTool::RandomToolPrivate::Run(void) { } // seed our random number generator - srand( time(NULL) ); + if ( m_settings->HasRandomNumberSeed ) + srand( m_settings->RandomNumberSeed ); + else + srand( time(NULL) ); // grab random alignments BamAlignment al; @@ -212,17 +237,22 @@ RandomTool::RandomTool(void) , m_impl(0) { // set program details - Options::SetProgramInfo("bamtools random", "grab a random subset of alignments", "[-in -in ...] [-out ] [-forceCompression] [-n] [-region ]"); + Options::SetProgramInfo("bamtools random", "grab a random subset of alignments", + "[-in -in ... 
| -list ] [-out ] [-forceCompression] [-n] [-region ]"); // set up options OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); - Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInput, m_settings->InputFiles, IO_Opts, Options::StandardIn()); - Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutput, m_settings->OutputFilename, IO_Opts, Options::StandardOut()); + Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInput, m_settings->InputFiles, IO_Opts, Options::StandardIn()); + Options::AddValueOption("-list", "filename", "the input BAM file list, one line per file", "", m_settings->HasInputFilelist, m_settings->InputFilelist, IO_Opts); + Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutput, m_settings->OutputFilename, IO_Opts, Options::StandardOut()); + Options::AddValueOption("-region", "REGION", "only pull random alignments from within this genomic region. Index file is recommended for better performance, and is used automatically if it exists. See \'bamtools help index\' for more details on creating one", "", m_settings->HasRegion, m_settings->Region, IO_Opts); Options::AddOption("-forceCompression", "if results are sent to stdout (like when piping to another tool), default behavior is to leave output uncompressed. Use this flag to override and force compression", m_settings->IsForceCompression, IO_Opts); - Options::AddValueOption("-region", "REGION", "only pull random alignments from within this genomic region. Index file is recommended for better performance, and is used automatically if it exists. See \'bamtools help index\' for more details on creating one", "", m_settings->HasRegion, m_settings->Region, IO_Opts); OptionGroup* SettingsOpts = Options::CreateOptionGroup("Settings"); - Options::AddValueOption("-n", "count", "number of alignments to grab. 
Note - no duplicate checking is performed", "", m_settings->HasAlignmentCount, m_settings->AlignmentCount, SettingsOpts, RANDOM_MAX_ALIGNMENT_COUNT); + Options::AddValueOption("-n", "count", "number of alignments to grab. Note - no duplicate checking is performed", "", + m_settings->HasAlignmentCount, m_settings->AlignmentCount, SettingsOpts, RANDOM_MAX_ALIGNMENT_COUNT); + Options::AddValueOption("-seed", "unsigned integer", "random number generator seed (for repeatable results). Current time is used if no seed value is provided.", "", + m_settings->HasRandomNumberSeed, m_settings->RandomNumberSeed, SettingsOpts); } RandomTool::~RandomTool(void) { diff --git a/src/toolkit/bamtools_resolve.cpp b/src/toolkit/bamtools_resolve.cpp index cb42f5b..9e5fb84 100644 --- a/src/toolkit/bamtools_resolve.cpp +++ b/src/toolkit/bamtools_resolve.cpp @@ -2,7 +2,7 @@ // bamtools_resolve.cpp (c) 2011 // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 14 October 2011 +// Last modified: 24 July 2013 (DB) // --------------------------------------------------------------------------- // Resolves paired-end reads (marking the IsProperPair flag as needed). 
// *************************************************************************** @@ -73,6 +73,20 @@ static const string OPTION_FORCEMARKREADGROUPS = "ForceMarkReadGroups"; static const string RG_FIELD_DESCRIPTION = "# "; +static const string MODEL_DESCRIPTION = + "# ------------- Model Types Description ---------------\n" + "#\n" + "# ID Position Orientation \n" + "# 1 mate1 < mate2 mate1:forward, mate2:forward \n" + "# 2 mate1 < mate2 mate1:forward, mate2:reverse \n" + "# 3 mate1 < mate2 mate1:reverse, mate2:forward \n" + "# 4 mate1 < mate2 mate1:reverse, mate2:reverse \n" + "# 5 mate2 < mate1 mate2:forward, mate1:forward \n" + "# 6 mate2 < mate1 mate2:forward, mate1:reverse \n" + "# 7 mate2 < mate1 mate2:reverse, mate1:forward \n" + "# 8 mate2 < mate1 mate2:reverse, mate1:reverse \n" + "# -----------------------------------------------------\n"; + // -------------------------------------------------------------------------- // unique readname file constants // -------------------------------------------------------------------------- @@ -731,9 +745,13 @@ void ResolveTool::StatsFileWriter::WriteHeader(void) { << BAMTOOLS_VERSION_BUILD; // # bamtools resolve (vX.Y.Z) + // # + // # MODEL DESCRIPTION - see above for actual text // \n m_stream << COMMENT_CHAR << " bamtools resolve (" << versionStream.str() << ")" << endl + << COMMENT_CHAR << endl + << MODEL_DESCRIPTION << endl; } diff --git a/src/toolkit/bamtools_split.cpp b/src/toolkit/bamtools_split.cpp index e6602a9..6425e95 100644 --- a/src/toolkit/bamtools_split.cpp +++ b/src/toolkit/bamtools_split.cpp @@ -2,7 +2,7 @@ // bamtools_split.cpp (c) 2010 Derek Barnett, Erik Garrison // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 8 December 2011 (DB) +// Last modified: 24 July 2013 (DB) // --------------------------------------------------------------------------- // Splits a BAM file on user-specified property, creating 
a new BAM output // file for each value found @@ -33,6 +33,7 @@ static const string SPLIT_UNMAPPED_TOKEN = ".UNMAPPED"; static const string SPLIT_PAIRED_TOKEN = ".PAIRED_END"; static const string SPLIT_SINGLE_TOKEN = ".SINGLE_END"; static const string SPLIT_REFERENCE_TOKEN = ".REF_"; +static const string SPLIT_TAG_TOKEN = ".TAG_"; string GetTimestampString(void) { @@ -70,6 +71,7 @@ struct SplitTool::SplitSettings { bool HasInputFilename; bool HasCustomOutputStub; bool HasCustomRefPrefix; + bool HasCustomTagPrefix; bool IsSplittingMapped; bool IsSplittingPaired; bool IsSplittingReference; @@ -78,6 +80,7 @@ struct SplitTool::SplitSettings { // string args string CustomOutputStub; string CustomRefPrefix; + string CustomTagPrefix; string InputFilename; string TagToSplit; @@ -86,12 +89,14 @@ struct SplitTool::SplitSettings { : HasInputFilename(false) , HasCustomOutputStub(false) , HasCustomRefPrefix(false) + , HasCustomTagPrefix(false) , IsSplittingMapped(false) , IsSplittingPaired(false) , IsSplittingReference(false) , IsSplittingTag(false) , CustomOutputStub("") , CustomRefPrefix("") + , CustomTagPrefix("") , InputFilename(Options::StandardIn()) , TagToSplit("") { } @@ -454,6 +459,16 @@ bool SplitTool::SplitToolPrivate::SplitTagImpl(BamAlignment& al) { WriterMap outputFiles; WriterMapIterator writerIter; + // determine tag prefix + string tagPrefix = SPLIT_TAG_TOKEN; + if ( m_settings->HasCustomTagPrefix ) + tagPrefix = m_settings->CustomTagPrefix; + + // make sure prefix starts with '.' 
+ const size_t dotFound = tagPrefix.find('.'); + if ( dotFound != 0 ) + tagPrefix = string(".") + tagPrefix; + // local variables const string tag = m_settings->TagToSplit; BamWriter* writer; @@ -464,7 +479,7 @@ bool SplitTool::SplitToolPrivate::SplitTagImpl(BamAlignment& al) { if ( al.GetTag(tag, currentValue) ) { // open new BamWriter, save first alignment - outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam"; + outputFilenameStream << m_outputFilenameStub << tagPrefix << tag << "_" << currentValue << ".bam"; writer = new BamWriter; if ( !writer->Open(outputFilenameStream.str(), m_header, m_references) ) { cerr << "bamtools split ERROR: could not open " << outputFilenameStream.str() @@ -493,7 +508,7 @@ bool SplitTool::SplitToolPrivate::SplitTagImpl(BamAlignment& al) { if ( writerIter == outputFiles.end() ) { // open new BamWriter - outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam"; + outputFilenameStream << m_outputFilenameStub << tagPrefix << tag << "_" << currentValue << ".bam"; writer = new BamWriter; if ( !writer->Open(outputFilenameStream.str(), m_header, m_references) ) { cerr << "bamtool split ERROR: could not open " << outputFilenameStream.str() @@ -542,6 +557,8 @@ SplitTool::SplitTool(void) Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInputFilename, m_settings->InputFilename, IO_Opts, Options::StandardIn()); Options::AddValueOption("-refPrefix", "string", "custom prefix for splitting by references. Currently files end with REF_.bam. This option allows you to replace \"REF_\" with a prefix of your choosing.", "", m_settings->HasCustomRefPrefix, m_settings->CustomRefPrefix, IO_Opts); + Options::AddValueOption("-tagPrefix", "string", "custom prefix for splitting by tags. Current files end with TAG__.bam. 
This option allows you to replace \"TAG_\" with a prefix of your choosing.", "", + m_settings->HasCustomTagPrefix, m_settings->CustomTagPrefix, IO_Opts); Options::AddValueOption("-stub", "filename stub", "prefix stub for output BAM files (default behavior is to use input filename, without .bam extension, as stub). If input is stdin and no stub provided, a timestamp is generated as the stub.", "", m_settings->HasCustomOutputStub, m_settings->CustomOutputStub, IO_Opts); diff --git a/src/toolkit/bamtools_stats.cpp b/src/toolkit/bamtools_stats.cpp index eb57a95..0035913 100644 --- a/src/toolkit/bamtools_stats.cpp +++ b/src/toolkit/bamtools_stats.cpp @@ -2,7 +2,7 @@ // bamtools_cpp (c) 2010 Derek Barnett, Erik Garrison // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 7 April 2011 +// Last modified: 10 December 2012 // --------------------------------------------------------------------------- // Prints general alignment statistics for BAM file(s). 
// *************************************************************************** @@ -15,6 +15,7 @@ using namespace BamTools; #include #include +#include #include #include #include @@ -29,14 +30,17 @@ struct StatsTool::StatsSettings { // flags bool HasInput; + bool HasInputFilelist; bool IsShowingInsertSizeSummary; // filenames vector InputFiles; + string InputFilelist; // constructor StatsSettings(void) : HasInput(false) + , HasInputFilelist(false) , IsShowingInsertSizeSummary(false) { } }; @@ -102,7 +106,8 @@ StatsTool::StatsToolPrivate::StatsToolPrivate(StatsTool::StatsSettings* settings bool StatsTool::StatsToolPrivate::CalculateMedian(vector& data, double& median) { // skip if data empty - if ( data.empty() ) return false; + if ( data.empty() ) + return false; // find middle element size_t middleIndex = data.size() / 2; @@ -202,7 +207,8 @@ void StatsTool::StatsToolPrivate::ProcessAlignment(const BamAlignment& al) { } // check for explicit proper pair flag - if ( al.IsProperPair() ) ++m_numProperPair; + if ( al.IsProperPair() ) + ++m_numProperPair; // store insert size for first mate if ( m_settings->IsShowingInsertSizeSummary && al.IsFirstMate() && (al.InsertSize != 0) ) { @@ -215,9 +221,23 @@ void StatsTool::StatsToolPrivate::ProcessAlignment(const BamAlignment& al) { bool StatsTool::StatsToolPrivate::Run() { // set to default input if none provided - if ( !m_settings->HasInput ) + if ( !m_settings->HasInput && !m_settings->HasInputFilelist ) m_settings->InputFiles.push_back(Options::StandardIn()); + // add files in the filelist to the input file list + if ( m_settings->HasInputFilelist ) { + + ifstream filelist(m_settings->InputFilelist.c_str(), ios::in); + if ( !filelist.is_open() ) { + cerr << "bamtools stats ERROR: could not open input BAM file list... Aborting." 
<< endl; + return false; + } + + string line; + while ( getline(filelist, line) ) + m_settings->InputFiles.push_back(line); + } + // open the BAM files BamMultiReader reader; if ( !reader.Open(m_settings->InputFiles) ) { @@ -246,11 +266,12 @@ StatsTool::StatsTool(void) , m_impl(0) { // set program details - Options::SetProgramInfo("bamtools stats", "prints general alignment statistics", "[-in -in ...] [statsOptions]"); + Options::SetProgramInfo("bamtools stats", "prints general alignment statistics", "[-in -in ... | -list ] [statsOptions]"); // set up options OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInput, m_settings->InputFiles, IO_Opts, Options::StandardIn()); + Options::AddValueOption("-list", "filename", "the input BAM file list, one line per file", "", m_settings->HasInputFilelist, m_settings->InputFilelist, IO_Opts); OptionGroup* AdditionalOpts = Options::CreateOptionGroup("Additional Stats"); Options::AddOption("-insert", "summarize insert size data", m_settings->IsShowingInsertSizeSummary, AdditionalOpts); diff --git a/src/utils/CMakeLists.txt b/src/utils/CMakeLists.txt index 2d91ca3..1c33f4b 100644 --- a/src/utils/CMakeLists.txt +++ b/src/utils/CMakeLists.txt @@ -13,7 +13,7 @@ add_definitions( -DBAMTOOLS_UTILS_LIBRARY ) # (for proper exporting of library s add_definitions( -fPIC ) # (attempt to force PIC compiling on CentOS, not being set on shared libs by CMake) # create BamTools utils library -add_library( BamTools-utils SHARED +add_library( BamTools-utils STATIC bamtools_fasta.cpp bamtools_options.cpp bamtools_pileup_engine.cpp @@ -25,6 +25,6 @@ target_link_libraries( BamTools-utils BamTools ) # set BamTools library properties set_target_properties( BamTools-utils PROPERTIES - SOVERSION 2.2.0 OUTPUT_NAME bamtools-utils + PREFIX "lib" ) diff --git a/src/utils/bamtools_filter_engine.h b/src/utils/bamtools_filter_engine.h index 
2ece5e7..9fb2f59 100644 --- a/src/utils/bamtools_filter_engine.h +++ b/src/utils/bamtools_filter_engine.h @@ -2,7 +2,7 @@ // bamtools_filter_engine.h (c) 2010 Derek Barnett, Erik Garrison // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 10 October 2011 +// Last modified: 3 May 2013 // --------------------------------------------------------------------------- // Provides a generic filter engine based on filter-sets of properties, // with possible "rules" (compound logical expressions) to create more complex @@ -294,7 +294,6 @@ bool FilterEngine::evaluateFilterRules(const T& query) { std::stack resultStack; FilterMap::const_iterator filterIter; - FilterMap::const_iterator filterEnd = m_filters.end(); std::queue ruleQueueCopy = m_ruleQueue; while ( !ruleQueueCopy.empty() ) { const std::string& token = ruleQueueCopy.front(); @@ -325,7 +324,7 @@ bool FilterEngine::evaluateFilterRules(const T& query) { else { // look up PropertyFilter that matches this token filterIter = m_filters.find(token); - BAMTOOLS_ASSERT_MESSAGE( (filterIter != filterEnd), "Filter mentioned in rule, not found in FilterEngine" ); + BAMTOOLS_ASSERT_MESSAGE( (filterIter != m_filters.end() ), "Filter mentioned in rule, not found in FilterEngine" ); const PropertyFilter& filter = (*filterIter).second; bool result = m_checker.check(filter, query); resultStack.push( result ); -- 2.39.2