From: Derek Date: Thu, 3 Jun 2010 03:52:07 +0000 (-0400) Subject: Implemented bamtools sam. Required new unpack methods in BGZF.h. Seems to pass all... X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=aba2821a7ffdc8d9f86e63fd1895c8e642c93ed4;p=bamtools.git Implemented bamtools sam. Required new unpack methods in BGZF.h. Seems to pass all files tested, compared against samtools view output --- diff --git a/BGZF.h b/BGZF.h index e1adf25..0cca24c 100644 --- a/BGZF.h +++ b/BGZF.h @@ -108,12 +108,32 @@ struct BgzfData { static inline void PackUnsignedInt(char* buffer, unsigned int value); // packs an unsigned short into the specified buffer static inline void PackUnsignedShort(char* buffer, unsigned short value); + // unpacks a buffer into a signed int static inline signed int UnpackSignedInt(char* buffer); - // unpacks a buffer into a unsigned int + // unpacks a buffer into an unsigned int static inline unsigned int UnpackUnsignedInt(char* buffer); - // unpacks a buffer into a unsigned short + // unpacks a buffer into a signed short + static inline signed short UnpackSignedShort(char* buffer); + // unpacks a buffer into an unsigned short static inline unsigned short UnpackUnsignedShort(char* buffer); + // unpacks a buffer into a double + static inline double UnpackDouble(char* buffer); + // unpacks a buffer into a float + static inline float UnpackFloat(char* buffer); + + // unpacks a buffer into a signed int + static inline signed int UnpackSignedInt(const char* buffer); + // unpacks a buffer into an unsigned int + static inline unsigned int UnpackUnsignedInt(const char* buffer); + // unpacks a buffer into a signed short + static inline signed short UnpackSignedShort(const char* buffer); + // unpacks a buffer into an unsigned short + static inline unsigned short UnpackUnsignedShort(const char* buffer); + // unpacks a buffer into a double + static inline double UnpackDouble(const char* buffer); + // unpacks a buffer into a float + static inline float UnpackFloat(const char* buffer); }; // ------------------------------------------------------------- @@ -170,6 +190,16 @@ unsigned int BgzfData::UnpackUnsignedInt(char* buffer) { return un.value; } +// 'unpacks' a buffer into a signed short +inline +signed short BgzfData::UnpackSignedShort(char* buffer) { + union { signed short value; unsigned char valueBuffer[sizeof(signed short)]; } un; + un.value = 0; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + return un.value; +} + // 'unpacks' a buffer into an unsigned short inline unsigned short BgzfData::UnpackUnsignedShort(char* buffer) { @@ -180,6 +210,108 @@ unsigned short BgzfData::UnpackUnsignedShort(char* buffer) { return un.value; } +// 'unpacks' a buffer into a double +inline +double BgzfData::UnpackDouble(char* buffer) { + union { double value; unsigned char valueBuffer[sizeof(double)]; } un; + un.value = 0; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + un.valueBuffer[2] = buffer[2]; + un.valueBuffer[3] = buffer[3]; + un.valueBuffer[4] = buffer[4]; + un.valueBuffer[5] = buffer[5]; + un.valueBuffer[6] = buffer[6]; + un.valueBuffer[7] = buffer[7]; + return un.value; +} + +// 'unpacks' a buffer into a float +inline +float BgzfData::UnpackFloat(char* buffer) { + union { float value; unsigned char valueBuffer[sizeof(float)]; } un; + un.value = 0; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + un.valueBuffer[2] = buffer[2]; + un.valueBuffer[3] = buffer[3]; + return un.value; +} + +// --------- + +// 'unpacks' a buffer into a signed int +inline +signed int BgzfData::UnpackSignedInt(const char* buffer) { + union { signed int value; unsigned char valueBuffer[sizeof(signed int)]; } un; + un.value = 0; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + un.valueBuffer[2] = buffer[2]; + un.valueBuffer[3] = buffer[3]; + return un.value; +} + +// 'unpacks' a buffer into an unsigned int +inline +unsigned int BgzfData::UnpackUnsignedInt(const char* buffer) { + union { unsigned int value; unsigned char valueBuffer[sizeof(unsigned int)]; } un; + un.value = 0; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + un.valueBuffer[2] = buffer[2]; + un.valueBuffer[3] = buffer[3]; + return un.value; +} + +// 'unpacks' a buffer into a signed short +inline +signed short BgzfData::UnpackSignedShort(const char* buffer) { + union { signed short value; unsigned char valueBuffer[sizeof(signed short)]; } un; + un.value = 0; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + return un.value; +} + +// 'unpacks' a buffer into an unsigned short +inline +unsigned short BgzfData::UnpackUnsignedShort(const char* buffer) { + union { unsigned short value; unsigned char valueBuffer[sizeof(unsigned short)]; } un; + un.value = 0; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + return un.value; +} + +// 'unpacks' a buffer into a double +inline +double BgzfData::UnpackDouble(const char* buffer) { + union { double value; unsigned char valueBuffer[sizeof(double)]; } un; + un.value = 0; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + un.valueBuffer[2] = buffer[2]; + un.valueBuffer[3] = buffer[3]; + un.valueBuffer[4] = buffer[4]; + un.valueBuffer[5] = buffer[5]; + un.valueBuffer[6] = buffer[6]; + un.valueBuffer[7] = buffer[7]; + return un.value; +} + +// 'unpacks' a buffer into a float +inline +float BgzfData::UnpackFloat(const char* buffer) { + union { float value; unsigned char valueBuffer[sizeof(float)]; } un; + un.value = 0; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + un.valueBuffer[2] = buffer[2]; + un.valueBuffer[3] = buffer[3]; + return un.value; +} + } // namespace BamTools #endif // BGZF_H diff --git a/bamtools_count.cpp b/bamtools_count.cpp index 7a2de0d..dad30e6 100644 --- a/bamtools_count.cpp +++ b/bamtools_count.cpp @@ -98,7 +98,10 @@ int CountTool::Run(int argc, char* argv[]) { BamAlignment al; while ( reader.GetNextAlignment(al) ) ++alignmentCount; - } else { + } + + // more complicated - region specified + else { // parse region string for desired region string startChrom; @@ -199,7 +202,7 @@ int CountTool::Run(int argc, char* argv[]) { } // ----------------------------- - // count alignments until + // count alignments until stop hit if ( !foundError ) { // while valid alignment AND diff --git a/bamtools_sam.cpp b/bamtools_sam.cpp index b14d128..89da132 100644 --- a/bamtools_sam.cpp +++ b/bamtools_sam.cpp @@ -1,25 +1,27 @@ // *************************************************************************** -// bamtools_sam.h (c) 2010 Derek Barnett, Erik Garrison +// bamtools_sam.cpp (c) 2010 Derek Barnett, Erik Garrison // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 1 June 2010 +// Last modified: 2 June 2010 // --------------------------------------------------------------------------- // Prints a BAM file in the text-based SAM format. // *************************************************************************** #include #include +#include #include #include "bamtools_sam.h" #include "bamtools_options.h" #include "BamReader.h" +#include "BGZF.h" using namespace std; using namespace BamTools; -RefVector references; +static RefVector references; // --------------------------------------------- // print BamAlignment in SAM format @@ -29,39 +31,121 @@ void PrintSAM(const BamAlignment& a) { // tab-delimited // [ :: [...] ] - // ******************************* // - // ** NOT FULLY IMPLEMENTED YET ** // - //******************************** // - // - // Todo : build CIGAR string - // build TAG string - // there are some quirks, per the spec, regarding when to use '=' or not - // - // ******************************* // - - // - // do validity check on RefID / MateRefID ?? - // - - // build CIGAR string - string cigarString("CIGAR:NOT YET"); - - // build TAG string - string tagString("TAG:NOT YET"); - - // print BamAlignment to stdout in SAM format - cout << a.Name << '\t' - << a.AlignmentFlag << '\t' - << references[a.RefID].RefName << '\t' - << a.Position << '\t' - << a.MapQuality << '\t' - << cigarString << '\t' - << ( a.IsPaired() ? references[a.MateRefID].RefName : "*" ) << '\t' - << ( a.IsPaired() ? a.MatePosition : 0 ) << '\t' - << ( a.IsPaired() ? a.InsertSize : 0 ) << '\t' - << a.QueryBases << '\t' - << a.Qualities << '\t' - << tagString << endl; + ostringstream sb(""); + + // write name & alignment flag + cout << a.Name << "\t" << a.AlignmentFlag << "\t"; + + // write reference name + if ( (a.RefID >= 0) && (a.RefID < (int)references.size()) ) cout << references[a.RefID].RefName << "\t"; + else cout << "*\t"; + + // write position & map quality + cout << a.Position+1 << "\t" << a.MapQuality << "\t"; + + // write CIGAR + const vector& cigarData = a.CigarData; + if ( cigarData.empty() ) cout << "*\t"; + else { + vector::const_iterator cigarIter = cigarData.begin(); + vector::const_iterator cigarEnd = cigarData.end(); + for ( ; cigarIter != cigarEnd; ++cigarIter ) { + const CigarOp& op = (*cigarIter); + cout << op.Length << op.Type; + } + cout << "\t"; + } + + // write mate reference name, mate position, & insert size + if ( a.IsPaired() && (a.MateRefID >= 0) && (a.MateRefID < (int)references.size()) ) { + if ( a.MateRefID == a.RefID ) cout << "=\t"; + else cout << references[a.MateRefID].RefName << "\t"; + cout << a.MatePosition+1 << "\t" << a.InsertSize << "\t"; + } + else cout << "*\t0\t0\t"; + + // write sequence + if ( a.QueryBases.empty() ) cout << "*\t"; + else cout << a.QueryBases << "\t"; + + // write qualities + if ( a.Qualities.empty() ) cout << "*"; + else cout << a.Qualities; + + // write tag data + const char* tagData = a.TagData.c_str(); + const size_t tagDataLength = a.TagData.length(); + size_t index = 0; + while ( index < tagDataLength ) { + + // write tag name + cout << "\t" << a.TagData.substr(index, 2) << ":"; + index += 2; + + // get data type + char type = a.TagData.at(index); + ++index; + + switch (type) { + case('A') : + cout << "A:" << tagData[index]; + ++index; + break; + + case('C') : + cout << "i:" << atoi(&tagData[index]); + ++index; + break; + + case('c') : + cout << "i:" << atoi(&tagData[index]); + ++index; + break; + + case('S') : + cout << "i:" << BgzfData::UnpackUnsignedShort(&tagData[index]); + index += 2; + break; + + case('s') : + cout << "i:" << BgzfData::UnpackSignedShort(&tagData[index]); + index += 2; + break; + + case('I') : + cout << "i:" << BgzfData::UnpackUnsignedInt(&tagData[index]); + index += 4; + break; + + case('i') : + cout << "i:" << BgzfData::UnpackSignedInt(&tagData[index]); + index += 4; + break; + + case('f') : + cout << "f:" << BgzfData::UnpackFloat(&tagData[index]); + index += 4; + break; + + case('d') : + cout << "d:" << BgzfData::UnpackDouble(&tagData[index]); + index += 8; + break; + + case('Z') : + case('H') : + cout << type << ":"; + while (tagData[index]) { + cout << tagData[index]; + ++index; + } + ++index; + break; + } + } + + // write stream to stdout + cout << sb.str() << endl; } // ---------------------------------------------