From a97aada254953e7ef30287f173c25a25e02c3815 Mon Sep 17 00:00:00 2001 From: Derek Date: Thu, 23 Sep 2010 15:56:43 -0400 Subject: [PATCH] Fixed lack of reverse complemented output in BAM -> FASTA/Q conversion. * Also appended '/1' or '/2' to paired-end FASTQ entries to indicate which mate it represents. --- src/toolkit/bamtools_convert.cpp | 46 +++++++++++++++++++++++--------- src/utils/bamtools_utilities.cpp | 24 ++++++++++++++++- src/utils/bamtools_utilities.h | 6 +++-- 3 files changed, 61 insertions(+), 15 deletions(-) diff --git a/src/toolkit/bamtools_convert.cpp b/src/toolkit/bamtools_convert.cpp index 1929da6..86a3f9e 100644 --- a/src/toolkit/bamtools_convert.cpp +++ b/src/toolkit/bamtools_convert.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 16 September 2010 +// Last modified: 23 September 2010 // --------------------------------------------------------------------------- // Converts between BAM and a number of other formats // *************************************************************************** @@ -56,8 +56,6 @@ namespace BamTools { // PileupVisitor interface implementation public: -// void Visit(const int& refId, const int& position, const vector& alignments); - void Visit(const PileupPosition& pileupData); // data members @@ -279,28 +277,35 @@ void ConvertTool::ConvertToolPrivate::PrintFasta(const BamAlignment& a) { // >BamAlignment.Name // BamAlignment.QueryBases (up to FASTA_LINE_MAX bases per line) // ... + // + // N.B. - QueryBases are reverse-complemented if aligned to reverse strand // print header m_out << "> " << a.Name << endl; + // handle reverse strand alignment - bases + string sequence = a.QueryBases; + if ( a.IsReverseStrand() ) + Utilities::ReverseComplement(sequence); + // if sequence fits on single line - if ( a.QueryBases.length() <= FASTA_LINE_MAX ) - m_out << a.QueryBases << endl; + if ( sequence.length() <= FASTA_LINE_MAX ) + m_out << sequence << endl; // else split over multiple lines else { size_t position = 0; - size_t seqLength = a.QueryBases.length(); + size_t seqLength = sequence.length(); // handle reverse strand alignment - bases & qualitiesth(); // write subsequences to each line while ( position < (seqLength - FASTA_LINE_MAX) ) { - m_out << a.QueryBases.substr(position, FASTA_LINE_MAX) << endl; + m_out << sequence.substr(position, FASTA_LINE_MAX) << endl; position += FASTA_LINE_MAX; } // write final subsequence - m_out << a.QueryBases.substr(position) << endl; + m_out << sequence.substr(position) << endl; } } @@ -312,11 +317,28 @@ void ConvertTool::ConvertToolPrivate::PrintFastq(const BamAlignment& a) { // BamAlignment.QueryBases // + // BamAlignment.Qualities + // + // N.B. - QueryBases are reverse-complemented (& Qualities reversed) if aligned to reverse strand . + // Name is appended "/1" or "/2" if paired-end, to reflect which mate this entry is. + + // handle paired-end alignments + string name = a.Name; + if ( a.IsPaired() ) + name.append( (a.IsFirstMate() ? "/1" : "/2") ); + + // handle reverse strand alignment - bases & qualities + string qualities = a.Qualities; + string sequence = a.QueryBases; + if ( a.IsReverseStrand() ) { + Utilities::Reverse(qualities); + Utilities::ReverseComplement(sequence); + } - m_out << "@" << a.Name << endl - << a.QueryBases << endl - << "+" << endl - << a.Qualities << endl; + // write to output stream + m_out << "@" << name << endl + << sequence << endl + << "+" << endl + << qualities << endl; } // print BamAlignment in JSON format diff --git a/src/utils/bamtools_utilities.cpp b/src/utils/bamtools_utilities.cpp index b39e60d..559fc8f 100644 --- a/src/utils/bamtools_utilities.cpp +++ b/src/utils/bamtools_utilities.cpp @@ -3,11 +3,12 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 3 September 2010 +// Last modified: 23 September 2010 // --------------------------------------------------------------------------- // Provides general utilities used by BamTools sub-tools. // *************************************************************************** +#include #include #include #include @@ -17,6 +18,12 @@ using namespace std; using namespace BamTools; +namespace BamTools { + +const char REVCOMP_LOOKUP[] = {'T', 0, 'G', 'H', 0, 0, 'C', 'D', 0, 0, 0, 0, 'K', 'N', 0, 0, 0, 'Y', 'W', 'A', 'A', 'B', 'S', 'X', 'R', 0 }; + +} // namespace BamTools + // check if a file exists bool Utilities::FileExists(const std::string& filename) { ifstream f(filename.c_str(), ifstream::in); @@ -238,3 +245,18 @@ bool Utilities::ParseRegionString(const std::string& regionString, const BamMult return true; } + +void Utilities::Reverse(string& sequence) { + reverse(sequence.begin(), sequence.end()); +} + +void Utilities::ReverseComplement(std::string& sequence) { + + // do complement + size_t seqLength = sequence.length(); + for ( size_t i = 0; i < seqLength; ++i ) + sequence.replace(i, 1, 1, REVCOMP_LOOKUP[(int)sequence.at(i) - 65]); + + // reverse it + Reverse(sequence); +} diff --git a/src/utils/bamtools_utilities.h b/src/utils/bamtools_utilities.h index f72897a..e31f63a 100644 --- a/src/utils/bamtools_utilities.h +++ b/src/utils/bamtools_utilities.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 30 August 2010 +// Last modified: 23 September 2010 // --------------------------------------------------------------------------- // Provides general utilities used by BamTools sub-tools. // *************************************************************************** @@ -36,7 +36,9 @@ class Utilities { // Same as above, but accepts a BamMultiReader static bool ParseRegionString(const std::string& regionString, const BamMultiReader& reader, BamRegion& region); - + // sequence utilities + static void Reverse(std::string& sequence); + static void ReverseComplement(std::string& sequence); }; } // namespace BamTools -- 2.39.2