// BamAlignment.cpp (c) 2009 Derek Barnett
// Marth Lab, Department of Biology, Boston College
// ---------------------------------------------------------------------------
-// Last modified: 22 February 2012 (DB)
+// Last modified: 4 April 2012 (DB)
// ---------------------------------------------------------------------------
// Provides the BamAlignment data structure
// ***************************************************************************
*/
/*! \var BamAlignment::QueryBases
\brief 'original' sequence (as reported from sequencing machine)
+
+ \note Setting this field to "*" indicates that the sequence is not to be stored on output.
+ In this case, the contents of the Qualities field should be invalidated as well (cleared or marked as "*").
*/
/*! \var BamAlignment::AlignedBases
\brief 'aligned' sequence (includes any indels, padding, clipping)
+
+ This field will be completely empty after reading from BamReader/BamMultiReader when
+ QueryBases is empty.
*/
/*! \var BamAlignment::Qualities
\brief FASTQ qualities (ASCII characters, not numeric values)
+
+ \note Setting this field to "*" indicates to BamWriter that the quality scores are not to be stored,
+ but instead will be output as a sequence of '0xFF'. Otherwise, QueryBases must not be a "*" and
+ the length of this field should equal the length of QueryBases.
*/
/*! \var BamAlignment::TagData
\brief tag data (use the provided methods to query/modify)
const unsigned int tagDataLength = dataLength - tagDataOffset;
// check offsets to see what char data exists
- const bool hasSeqData = ( seqDataOffset < dataLength );
- const bool hasQualData = ( qualDataOffset < dataLength );
+ const bool hasSeqData = ( seqDataOffset < qualDataOffset );
+ const bool hasQualData = ( qualDataOffset < tagDataOffset );
const bool hasTagData = ( tagDataOffset < dataLength );
- // set up char buffers
- const char* allCharData = SupportData.AllCharData.data();
- const char* seqData = ( hasSeqData ? (((const char*)allCharData) + seqDataOffset) : (const char*)0 );
- const char* qualData = ( hasQualData ? (((const char*)allCharData) + qualDataOffset) : (const char*)0 );
- char* tagData = ( hasTagData ? (((char*)allCharData) + tagDataOffset) : (char*)0 );
-
// store alignment name (relies on null char in name as terminator)
- Name.assign((const char*)(allCharData));
+ Name.assign(SupportData.AllCharData.data());
// save query sequence
QueryBases.clear();
if ( hasSeqData ) {
+ const char* seqData = SupportData.AllCharData.data() + seqDataOffset;
QueryBases.reserve(SupportData.QuerySequenceLength);
for ( size_t i = 0; i < SupportData.QuerySequenceLength; ++i ) {
const char singleBase = Constants::BAM_DNA_LOOKUP[ ( (seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ];
}
}
- // save qualities, converting from numeric QV to 'FASTQ-style' ASCII character
+ // save qualities
+
Qualities.clear();
if ( hasQualData ) {
- Qualities.reserve(SupportData.QuerySequenceLength);
- for ( size_t i = 0; i < SupportData.QuerySequenceLength; ++i ) {
- const char singleQuality = static_cast<const char>(qualData[i]+33);
- Qualities.append(1, singleQuality);
+ const char* qualData = SupportData.AllCharData.data() + qualDataOffset;
+
+ // if marked as unstored (sequence of 0xFF) - don't do conversion, just fill with 0xFFs
+ if ( qualData[0] == (char)0xFF )
+ Qualities.resize(SupportData.QuerySequenceLength, (char)0xFF);
+
+ // otherwise convert from numeric QV to 'FASTQ-style' ASCII character
+ else {
+ Qualities.reserve(SupportData.QuerySequenceLength);
+ for ( size_t i = 0; i < SupportData.QuerySequenceLength; ++i )
+ Qualities.append(1, qualData[i]+33);
}
}
// if QueryBases has data, build AlignedBases using CIGAR data
// otherwise, AlignedBases will remain empty (this case IS allowed)
- if ( !QueryBases.empty() ) {
+ if ( !QueryBases.empty() && QueryBases != "*" ) {
// resize AlignedBases
AlignedBases.reserve(SupportData.QuerySequenceLength);
// save tag data
TagData.clear();
if ( hasTagData ) {
+
+ char* tagData = (((char*)SupportData.AllCharData.data()) + tagDataOffset);
+
if ( IsBigEndian ) {
size_t i = 0;
while ( i < tagDataLength ) {
// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
-// Last modified: 10 October 2011 (DB)
+// Last modified: 4 April 2012 (DB)
// ---------------------------------------------------------------------------
// Provides sorting functionality.
// ***************************************************************************
// data members
private:
- const Sort::Order& m_order;
+ const Sort::Order m_order;
};
/*! \struct BamTools::Algorithms::Sort::ByPosition
// data members
private:
- Sort::Order m_order;
+ const Sort::Order m_order;
};
/*! \struct BamTools::Algorithms::Sort::ByTag
// data members
private:
- std::string m_tag;
- Sort::Order m_order;
+ const std::string m_tag;
+ const Sort::Order m_order;
};
/*! \struct BamTools::Algorithms::Sort::Unsorted
// BamWriter_p.cpp (c) 2010 Derek Barnett
// Marth Lab, Department of Biology, Boston College
// ---------------------------------------------------------------------------
-// Last modified: 13 February 2012 (DB)
+// Last modified: 4 April 2012 (DB)
// ---------------------------------------------------------------------------
// Provides the basic functionality for producing BAM files
// ***************************************************************************
// calculate char lengths
const unsigned int nameLength = al.Name.size() + 1;
const unsigned int numCigarOperations = al.CigarData.size();
- const unsigned int queryLength = al.QueryBases.size();
+ const unsigned int queryLength = ( (al.QueryBases == "*") ? 0 : al.QueryBases.size() );
const unsigned int tagDataLength = al.TagData.size();
// no way to tell if alignment's bin is already defined (there is no default, invalid value)
const unsigned int packedCigarLength = packedCigar.size();
// encode the query
+ unsigned int encodedQueryLength = 0;
string encodedQuery;
- EncodeQuerySequence(al.QueryBases, encodedQuery);
- const unsigned int encodedQueryLength = encodedQuery.size();
+ if ( queryLength > 0 ) {
+ EncodeQuerySequence(al.QueryBases, encodedQuery);
+ encodedQueryLength = encodedQuery.size();
+ }
// write the block size
const unsigned int dataBlockSize = nameLength +
packedCigarLength +
encodedQueryLength +
- queryLength +
+ queryLength + // here referring to quality length
tagDataLength;
unsigned int blockSize = Constants::BAM_CORE_SIZE + dataBlockSize;
if ( m_isBigEndian ) BamTools::SwapEndian_32(blockSize);
else
m_stream.Write(packedCigar.data(), packedCigarLength);
- // write the encoded query sequence
- m_stream.Write(encodedQuery.data(), encodedQueryLength);
+ if ( queryLength > 0 ) {
+
+ // write the encoded query sequence
+ m_stream.Write(encodedQuery.data(), encodedQueryLength);
- // write the base qualities
- char* pBaseQualities = (char*)al.Qualities.data();
- for ( size_t i = 0; i < queryLength; ++i )
- pBaseQualities[i] -= 33; // FASTQ conversion
- m_stream.Write(pBaseQualities, queryLength);
+ // write the base qualities
+ char* pBaseQualities = new char[queryLength]();
+ if ( al.Qualities.empty() || al.Qualities == "*" )
+ memset(pBaseQualities, 0xFF, queryLength); // if missing or '*', fill with invalid qual
+ else {
+ for ( size_t i = 0; i < queryLength; ++i )
+ pBaseQualities[i] = al.Qualities.at(i) - 33; // FASTQ ASCII -> phred score conversion
+ }
+ m_stream.Write(pBaseQualities, queryLength);
+ delete[] pBaseQualities;
+ }
// write the tag data
if ( m_isBigEndian ) {
m_out << "\"queryBases\":\"" << a.QueryBases << "\",";
// write qualities
- if ( !a.Qualities.empty() ) {
+ if ( !a.Qualities.empty() && a.Qualities.at(0) != (char)0xFF ) {
string::const_iterator s = a.Qualities.begin();
m_out << "\"qualities\":[" << static_cast<short>(*s) - 33;
++s;
m_out << a.QueryBases << "\t";
// write qualities
- if ( a.Qualities.empty() )
+ if ( a.Qualities.empty() || (a.Qualities.at(0) == (char)0xFF) )
m_out << "*";
else
m_out << a.Qualities;