From 1b1108713c795c05fc845ba6373efcf4166acf43 Mon Sep 17 00:00:00 2001 From: derek Date: Thu, 13 Oct 2011 10:03:22 -0400 Subject: [PATCH] Added GetSoftClips() method to BamAlignment --- CMakeLists.txt | 2 +- docs/Doxyfile | 2 +- src/api/BamAlignment.cpp | 90 +++++++++++++++++++++++++++++++++++++++- src/api/BamAlignment.h | 8 +++- src/api/CMakeLists.txt | 2 +- 5 files changed, 99 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7ee7f83..31bc980 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,7 +32,7 @@ ensure_out_of_source_build (" # set BamTools version information set (BamTools_VERSION_MAJOR 2) set (BamTools_VERSION_MINOR 0) -set (BamTools_VERSION_BUILD 1) +set (BamTools_VERSION_BUILD 2) # set our library and executable destination dirs set (EXECUTABLE_OUTPUT_PATH "${CMAKE_SOURCE_DIR}/bin") diff --git a/docs/Doxyfile b/docs/Doxyfile index fb500ad..da1bf4e 100644 --- a/docs/Doxyfile +++ b/docs/Doxyfile @@ -31,7 +31,7 @@ PROJECT_NAME = BamTools # This could be handy for archiving the generated documentation or # if some version control system is used. -PROJECT_NUMBER = 2.0.1 +PROJECT_NUMBER = 2.0.2 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) # base path where the generated documentation will be put. diff --git a/src/api/BamAlignment.cpp b/src/api/BamAlignment.cpp index 78d7d6b..013937a 100644 --- a/src/api/BamAlignment.cpp +++ b/src/api/BamAlignment.cpp @@ -2,7 +2,7 @@ // BamAlignment.cpp (c) 2009 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 10 October 2011 (DB) +// Last modified: 13 October 2011 (DB) // --------------------------------------------------------------------------- // Provides the BamAlignment data structure // *************************************************************************** @@ -443,6 +443,94 @@ std::string BamAlignment::GetErrorString(void) const { return ErrorString; } +/*! \fn bool BamAlignment::GetSoftClips(std::vector& clipSizes, std::vector& readPositions, std::vector& genomePositions, bool usePadded = false) const + \brief Identifies if an alignment has a soft clip. If so, identifies the + sizes of the soft clips, as well as their positions in the read and reference. + + \param[out] clipSizes vector of the sizes of each soft clip in the alignment + \param[out] readPositions vector of the 0-based read locations of each soft clip in the alignment. + These positions are basically indexes within the read, not genomic positions. + \param[out] genomePositions vector of the 0-based genome locations of each soft clip in the alignment + \param[in] usePadded inserted bases affect reported position. Default is false, so that + reported position stays 'sync-ed' with reference coordinates. + + \return \c true if any soft clips were found in the alignment +*/ +bool BamAlignment::GetSoftClips(vector& clipSizes, + vector& readPositions, + vector& genomePositions, + bool usePadded) const +{ + // initialize positions & flags + int refPosition = Position; + int readPosition = 0; + bool softClipFound = false; + bool firstCigarOp = true; + + // iterate over cigar operations + vector::const_iterator cigarIter = CigarData.begin(); + vector::const_iterator cigarEnd = CigarData.end(); + for ( ; cigarIter != cigarEnd; ++cigarIter) { + const CigarOp& op = (*cigarIter); + + switch ( op.Type ) { + + // increase both read & genome positions on CIGAR chars [DMXN=] + case Constants::BAM_CIGAR_DEL_CHAR : + case Constants::BAM_CIGAR_MATCH_CHAR : + case Constants::BAM_CIGAR_MISMATCH_CHAR : + case Constants::BAM_CIGAR_REFSKIP_CHAR : + case Constants::BAM_CIGAR_SEQMATCH_CHAR : + refPosition += op.Length; + readPosition += op.Length; + break; + + // increase read position on insertion, genome position only if @usePadded is true + case Constants::BAM_CIGAR_INS_CHAR : + readPosition += op.Length; + if ( usePadded ) + refPosition += op.Length; + break; + + case Constants::BAM_CIGAR_SOFTCLIP_CHAR : + + softClipFound = true; + + ////////////////////////////////////////////////////////////////////////////// + // if we are dealing with the *first* CIGAR operation + // for this alignment, we increment the read position so that + // the read and genome position of the clip are referring to the same base. + // For example, in the alignment below, the ref position would be 4, yet + // the read position would be 0. Thus, to "sync" the two, + // we need to increment the read position by the length of the + // soft clip. + // Read: ATCGTTTCGTCCCTGC + // Ref: GGGATTTCGTCCCTGC + // Cigar: SSSSMMMMMMMMMMMM + // + // NOTE: This only needs to be done if the soft clip is the _first_ CIGAR op. + ////////////////////////////////////////////////////////////////////////////// + if ( firstCigarOp ) + readPosition += op.Length; + + // track the soft clip's size, read position, and genome position + clipSizes.push_back(op.Length); + readPositions.push_back(readPosition); + genomePositions.push_back(refPosition); + + // any other CIGAR operations have no effect + default : + break; + } + + // clear our "first pass" flag + firstCigarOp = false; + } + + // return whether any soft clips found + return softClipFound; +} + /*! \fn bool BamAlignment::GetTagType(const std::string& tag, char& type) const \brief Retrieves the BAM tag type-code associated with requested tag name. diff --git a/src/api/BamAlignment.h b/src/api/BamAlignment.h index 9a8e7be..22535c9 100644 --- a/src/api/BamAlignment.h +++ b/src/api/BamAlignment.h @@ -2,7 +2,7 @@ // BamAlignment.h (c) 2009 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 10 October 2011 (DB) +// Last modified: 12 October 2011 (DB) // --------------------------------------------------------------------------- // Provides the BamAlignment data structure // *************************************************************************** @@ -100,6 +100,12 @@ struct API_EXPORT BamAlignment { // returns a description of the last error that occurred std::string GetErrorString(void) const; + // retrieves the size, read locations and reference locations of soft-clip operations + bool GetSoftClips(std::vector& clipSizes, + std::vector& readPositions, + std::vector& genomePositions, + bool usePadded = false) const; + // public data fields public: std::string Name; // read name diff --git a/src/api/CMakeLists.txt b/src/api/CMakeLists.txt index fcd0961..bee347b 100644 --- a/src/api/CMakeLists.txt +++ b/src/api/CMakeLists.txt @@ -49,7 +49,7 @@ set( BamToolsAPISources # create main BamTools API shared library add_library( BamTools SHARED ${BamToolsAPISources} ) -set_target_properties( BamTools PROPERTIES SOVERSION "2.0.1" ) +set_target_properties( BamTools PROPERTIES SOVERSION "2.0.2" ) set_target_properties( BamTools PROPERTIES OUTPUT_NAME "bamtools" ) # create main BamTools API static library -- 2.39.2