From: derek Date: Wed, 4 Apr 2012 22:28:23 +0000 (-0400) Subject: Added proper handling for empty or ignored ("*") alignment fields: X-Git-Url: https://git.donarmstrong.com/?p=bamtools.git;a=commitdiff_plain;h=6bb3f902d3f8087112afd4dbf8a783b73b40db8d Added proper handling for empty or ignored ("*") alignment fields: QueryBases & Qualities --- diff --git a/src/api/BamAlignment.cpp b/src/api/BamAlignment.cpp index 3a508c4..251c5e0 100644 --- a/src/api/BamAlignment.cpp +++ b/src/api/BamAlignment.cpp @@ -2,7 +2,7 @@ // BamAlignment.cpp (c) 2009 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 22 February 2012 (DB) +// Last modified: 4 April 2012 (DB) // --------------------------------------------------------------------------- // Provides the BamAlignment data structure // *************************************************************************** @@ -25,12 +25,22 @@ using namespace std; */ /*! \var BamAlignment::QueryBases \brief 'original' sequence (as reported from sequencing machine) + + \note Setting this field to "*" indicates that the sequence is not to be stored on output. + In this case, the contents of the Qualities field should be invalidated as well (cleared or marked as "*"). */ /*! \var BamAlignment::AlignedBases \brief 'aligned' sequence (includes any indels, padding, clipping) + + This field will be completely empty after reading from BamReader/BamMultiReader when + QueryBases is empty. */ /*! \var BamAlignment::Qualities \brief FASTQ qualities (ASCII characters, not numeric values) + + \note Setting this field to "*" indicates to BamWriter that the quality scores are not to be stored, + but instead will be output as a sequence of '0xFF'. Otherwise, QueryBases must not be a "*" and + the length of this field should equal the length of QueryBases. */ /*! \var BamAlignment::TagData \brief tag data (use the provided methods to query/modify) @@ -138,22 +148,17 @@ bool BamAlignment::BuildCharData(void) { const unsigned int tagDataLength = dataLength - tagDataOffset; // check offsets to see what char data exists - const bool hasSeqData = ( seqDataOffset < dataLength ); - const bool hasQualData = ( qualDataOffset < dataLength ); + const bool hasSeqData = ( seqDataOffset < qualDataOffset ); + const bool hasQualData = ( qualDataOffset < tagDataOffset ); const bool hasTagData = ( tagDataOffset < dataLength ); - // set up char buffers - const char* allCharData = SupportData.AllCharData.data(); - const char* seqData = ( hasSeqData ? (((const char*)allCharData) + seqDataOffset) : (const char*)0 ); - const char* qualData = ( hasQualData ? (((const char*)allCharData) + qualDataOffset) : (const char*)0 ); - char* tagData = ( hasTagData ? (((char*)allCharData) + tagDataOffset) : (char*)0 ); - // store alignment name (relies on null char in name as terminator) - Name.assign((const char*)(allCharData)); + Name.assign(SupportData.AllCharData.data()); // save query sequence QueryBases.clear(); if ( hasSeqData ) { + const char* seqData = SupportData.AllCharData.data() + seqDataOffset; QueryBases.reserve(SupportData.QuerySequenceLength); for ( size_t i = 0; i < SupportData.QuerySequenceLength; ++i ) { const char singleBase = Constants::BAM_DNA_LOOKUP[ ( (seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ]; @@ -161,13 +166,21 @@ bool BamAlignment::BuildCharData(void) { } } - // save qualities, converting from numeric QV to 'FASTQ-style' ASCII character + // save qualities + Qualities.clear(); if ( hasQualData ) { - Qualities.reserve(SupportData.QuerySequenceLength); - for ( size_t i = 0; i < SupportData.QuerySequenceLength; ++i ) { - const char singleQuality = static_cast(qualData[i]+33); - Qualities.append(1, singleQuality); + const char* qualData = SupportData.AllCharData.data() + qualDataOffset; + + // if marked as unstored (sequence of 0xFF) - don't do conversion, just fill with 0xFFs + if ( qualData[0] == (char)0xFF ) + Qualities.resize(SupportData.QuerySequenceLength, (char)0xFF); + + // otherwise convert from numeric QV to 'FASTQ-style' ASCII character + else { + Qualities.reserve(SupportData.QuerySequenceLength); + for ( size_t i = 0; i < SupportData.QuerySequenceLength; ++i ) + Qualities.append(1, qualData[i]+33); } } @@ -176,7 +189,7 @@ bool BamAlignment::BuildCharData(void) { // if QueryBases has data, build AlignedBases using CIGAR data // otherwise, AlignedBases will remain empty (this case IS allowed) - if ( !QueryBases.empty() ) { + if ( !QueryBases.empty() && QueryBases != "*" ) { // resize AlignedBases AlignedBases.reserve(SupportData.QuerySequenceLength); @@ -235,6 +248,9 @@ bool BamAlignment::BuildCharData(void) { // save tag data TagData.clear(); if ( hasTagData ) { + + char* tagData = (((char*)SupportData.AllCharData.data()) + tagDataOffset); + if ( IsBigEndian ) { size_t i = 0; while ( i < tagDataLength ) { diff --git a/src/api/internal/bam/BamWriter_p.cpp b/src/api/internal/bam/BamWriter_p.cpp index 97a9a12..8877800 100644 --- a/src/api/internal/bam/BamWriter_p.cpp +++ b/src/api/internal/bam/BamWriter_p.cpp @@ -2,7 +2,7 @@ // BamWriter_p.cpp (c) 2010 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 13 February 2012 (DB) +// Last modified: 4 April 2012 (DB) // --------------------------------------------------------------------------- // Provides the basic functionality for producing BAM files // *************************************************************************** @@ -210,7 +210,7 @@ void BamWriterPrivate::WriteAlignment(const BamAlignment& al) { // calculate char lengths const unsigned int nameLength = al.Name.size() + 1; const unsigned int numCigarOperations = al.CigarData.size(); - const unsigned int queryLength = al.QueryBases.size(); + const unsigned int queryLength = ( (al.QueryBases == "*") ? 0 : al.QueryBases.size() ); const unsigned int tagDataLength = al.TagData.size(); // no way to tell if alignment's bin is already defined (there is no default, invalid value) @@ -223,15 +223,18 @@ void BamWriterPrivate::WriteAlignment(const BamAlignment& al) { const unsigned int packedCigarLength = packedCigar.size(); // encode the query + unsigned int encodedQueryLength = 0; string encodedQuery; - EncodeQuerySequence(al.QueryBases, encodedQuery); - const unsigned int encodedQueryLength = encodedQuery.size(); + if ( queryLength > 0 ) { + EncodeQuerySequence(al.QueryBases, encodedQuery); + encodedQueryLength = encodedQuery.size(); + } // write the block size const unsigned int dataBlockSize = nameLength + packedCigarLength + encodedQueryLength + - queryLength + + queryLength + // here referring to quality length tagDataLength; unsigned int blockSize = Constants::BAM_CORE_SIZE + dataBlockSize; if ( m_isBigEndian ) BamTools::SwapEndian_32(blockSize); @@ -274,14 +277,22 @@ void BamWriterPrivate::WriteAlignment(const BamAlignment& al) { else m_stream.Write(packedCigar.data(), packedCigarLength); - // write the encoded query sequence - m_stream.Write(encodedQuery.data(), encodedQueryLength); + if ( queryLength > 0 ) { + + // write the encoded query sequence + m_stream.Write(encodedQuery.data(), encodedQueryLength); - // write the base qualities - char* pBaseQualities = (char*)al.Qualities.data(); - for ( size_t i = 0; i < queryLength; ++i ) - pBaseQualities[i] -= 33; // FASTQ conversion - m_stream.Write(pBaseQualities, queryLength); + // write the base qualities + char* pBaseQualities = new char[queryLength](); + if ( al.Qualities.empty() || al.Qualities == "*" ) + memset(pBaseQualities, 0xFF, queryLength); // if missing or '*', fill with invalid qual + else { + for ( size_t i = 0; i < queryLength; ++i ) + pBaseQualities[i] = al.Qualities.at(i) - 33; // FASTQ ASCII -> phred score conversion + } + m_stream.Write(pBaseQualities, queryLength); + delete[] pBaseQualities; + } // write the tag data if ( m_isBigEndian ) { diff --git a/src/toolkit/bamtools_convert.cpp b/src/toolkit/bamtools_convert.cpp index 907c4ba..8896882 100644 --- a/src/toolkit/bamtools_convert.cpp +++ b/src/toolkit/bamtools_convert.cpp @@ -396,7 +396,7 @@ void ConvertTool::ConvertToolPrivate::PrintJson(const BamAlignment& a) { m_out << "\"queryBases\":\"" << a.QueryBases << "\","; // write qualities - if ( !a.Qualities.empty() ) { + if ( !a.Qualities.empty() && a.Qualities.at(0) != (char)0xFF ) { string::const_iterator s = a.Qualities.begin(); m_out << "\"qualities\":[" << static_cast(*s) - 33; ++s; @@ -539,7 +539,7 @@ void ConvertTool::ConvertToolPrivate::PrintSam(const BamAlignment& a) { m_out << a.QueryBases << "\t"; // write qualities - if ( a.Qualities.empty() ) + if ( a.Qualities.empty() || (a.Qualities.at(0) == (char)0xFF) ) m_out << "*"; else m_out << a.Qualities;