1 // ***************************************************************************
2 // SamFormatParser.cpp (c) 2010 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 23 December 2010 (DB)
7 // ---------------------------------------------------------------------------
8 // Provides functionality for parsing SAM header text into SamHeader object
9 // ***************************************************************************
11 #include <api/SamConstants.h>
12 #include <api/SamHeader.h>
13 #include <api/internal/SamFormatParser_p.h>
14 using namespace BamTools;
15 using namespace BamTools::Internal;
// Constructor: takes a mutable reference to the SamHeader this parser works on.
// NOTE(review): the member-initializer list and body are not visible in this
// excerpt; presumably 'header' is bound to m_header, which the Parse* methods
// write to - confirm against the full file.
22 SamFormatParser::SamFormatParser(SamHeader& header)
26 SamFormatParser::~SamFormatParser(void) { }
28 void SamFormatParser::Parse(const string& headerText) {
30 // clear header's prior contents
33 // empty header is OK, but skip processing
34 if ( headerText.empty() )
37 // other wise parse SAM lines
38 istringstream headerStream(headerText);
39 string headerLine("");
40 while ( getline(headerStream, headerLine) )
41 ParseSamLine(headerLine);
45 void SamFormatParser::ParseSamLine(const string& line) {
47 // skip if line is not long enough to contain true values
48 if (line.length() < 5 ) return;
50 // determine token at beginning of line
51 const string firstToken = line.substr(0,3);
52 string restOfLine = line.substr(4);
53 if ( firstToken == Constants::SAM_HD_BEGIN_TOKEN) ParseHDLine(restOfLine);
54 else if ( firstToken == Constants::SAM_SQ_BEGIN_TOKEN) ParseSQLine(restOfLine);
55 else if ( firstToken == Constants::SAM_RG_BEGIN_TOKEN) ParseRGLine(restOfLine);
56 else if ( firstToken == Constants::SAM_PG_BEGIN_TOKEN) ParsePGLine(restOfLine);
57 else if ( firstToken == Constants::SAM_CO_BEGIN_TOKEN) ParseCOLine(restOfLine);
59 cerr << "SamFormatParser ERROR: unknown token: " << firstToken << endl;
64 void SamFormatParser::ParseHDLine(const string& line) {
66 // split HD lines into tokens
67 vector<string> tokens = Split(line, Constants::SAM_TAB);
69 // iterate over tokens
70 vector<string>::const_iterator tokenIter = tokens.begin();
71 vector<string>::const_iterator tokenEnd = tokens.end();
72 for ( ; tokenIter != tokenEnd; ++tokenIter ) {
75 const string tokenTag = (*tokenIter).substr(0,2);
76 const string tokenValue = (*tokenIter).substr(3);
78 // set header contents
79 if ( tokenTag == Constants::SAM_HD_VERSION_TAG ) m_header.Version = tokenValue;
80 else if ( tokenTag == Constants::SAM_HD_GROUPORDER_TAG ) m_header.GroupOrder = tokenValue;
81 else if ( tokenTag == Constants::SAM_HD_SORTORDER_TAG ) m_header.SortOrder = tokenValue;
83 cerr << "SamFormatParser ERROR: unknown HD tag: " << tokenTag << endl;
86 // if @HD line exists, VN must be provided
87 if ( !m_header.HasVersion() ) {
88 cerr << "SamFormatParser ERROR: @HD line is missing VN tag" << endl;
93 void SamFormatParser::ParseSQLine(const string& line) {
97 // split SQ line into tokens
98 vector<string> tokens = Split(line, Constants::SAM_TAB);
100 // iterate over tokens
101 vector<string>::const_iterator tokenIter = tokens.begin();
102 vector<string>::const_iterator tokenEnd = tokens.end();
103 for ( ; tokenIter != tokenEnd; ++tokenIter ) {
106 const string tokenTag = (*tokenIter).substr(0,2);
107 const string tokenValue = (*tokenIter).substr(3);
109 // set sequence contents
110 if ( tokenTag == Constants::SAM_SQ_NAME_TAG ) seq.Name = tokenValue;
111 else if ( tokenTag == Constants::SAM_SQ_LENGTH_TAG ) seq.Length = tokenValue;
112 else if ( tokenTag == Constants::SAM_SQ_ASSEMBLYID_TAG ) seq.AssemblyID = tokenValue;
113 else if ( tokenTag == Constants::SAM_SQ_URI_TAG ) seq.URI = tokenValue;
114 else if ( tokenTag == Constants::SAM_SQ_CHECKSUM_TAG ) seq.Checksum = tokenValue;
115 else if ( tokenTag == Constants::SAM_SQ_SPECIES_TAG ) seq.Species = tokenValue;
117 cerr << "SamFormatParser ERROR: unknown SQ tag: " << tokenTag << endl;
120 // if @SQ line exists, SN must be provided
121 if ( !seq.HasName() ) {
122 cerr << "SamFormatParser ERROR: @SQ line is missing SN tag" << endl;
126 // if @SQ line exists, LN must be provided
127 if ( !seq.HasLength() ) {
128 cerr << "SamFormatParser ERROR: @SQ line is missing LN tag" << endl;
132 // store SAM sequence entry
133 m_header.Sequences.Add(seq);
136 void SamFormatParser::ParseRGLine(const string& line) {
140 // split string into tokens
141 vector<string> tokens = Split(line, Constants::SAM_TAB);
143 // iterate over tokens
144 vector<string>::const_iterator tokenIter = tokens.begin();
145 vector<string>::const_iterator tokenEnd = tokens.end();
146 for ( ; tokenIter != tokenEnd; ++tokenIter ) {
148 // get token tag/value
149 const string tokenTag = (*tokenIter).substr(0,2);
150 const string tokenValue = (*tokenIter).substr(3);
152 // set read group contents
153 if ( tokenTag == Constants::SAM_RG_ID_TAG ) rg.ID = tokenValue;
154 else if ( tokenTag == Constants::SAM_RG_SAMPLE_TAG ) rg.Sample = tokenValue;
155 else if ( tokenTag == Constants::SAM_RG_LIBRARY_TAG ) rg.Library = tokenValue;
156 else if ( tokenTag == Constants::SAM_RG_DESCRIPTION_TAG ) rg.Description = tokenValue;
157 else if ( tokenTag == Constants::SAM_RG_PLATFORMUNIT_TAG ) rg.PlatformUnit = tokenValue;
158 else if ( tokenTag == Constants::SAM_RG_PREDICTEDINSERTSIZE_TAG ) rg.PredictedInsertSize = tokenValue;
159 else if ( tokenTag == Constants::SAM_RG_SEQCENTER_TAG ) rg.SequencingCenter = tokenValue;
160 else if ( tokenTag == Constants::SAM_RG_PRODUCTIONDATE_TAG ) rg.ProductionDate = tokenValue;
161 else if ( tokenTag == Constants::SAM_RG_SEQTECHNOLOGY_TAG ) rg.SequencingTechnology = tokenValue;
163 cerr << "SamFormatParser ERROR: unknown RG tag: " << tokenTag << endl;
166 // if @RG line exists, ID must be provided
168 cerr << "SamFormatParser ERROR: @RG line is missing ID tag" << endl;
172 // if @RG line exists, SM must be provided
173 if ( !rg.HasSample() ) {
174 cerr << "SamFormatParser ERROR: @RG line is missing SM tag" << endl;
178 // store SAM read group entry
179 m_header.ReadGroups.Add(rg);
182 void SamFormatParser::ParsePGLine(const string& line) {
184 // split string into tokens
185 vector<string> tokens = Split(line, Constants::SAM_TAB);
187 // iterate over tokens
188 vector<string>::const_iterator tokenIter = tokens.begin();
189 vector<string>::const_iterator tokenEnd = tokens.end();
190 for ( ; tokenIter != tokenEnd; ++tokenIter ) {
192 // get token tag/value
193 const string tokenTag = (*tokenIter).substr(0,2);
194 const string tokenValue = (*tokenIter).substr(3);
196 // set header contents
197 if ( tokenTag == Constants::SAM_PG_NAME_TAG ) m_header.ProgramName = tokenValue;
198 else if ( tokenTag == Constants::SAM_PG_VERSION_TAG ) m_header.ProgramVersion = tokenValue;
199 else if ( tokenTag == Constants::SAM_PG_COMMANDLINE_TAG ) m_header.ProgramCommandLine = tokenValue;
201 cerr << "SamFormatParser ERROR: unknown PG tag: " << tokenTag << endl;
204 // if @PG line exists, ID must be provided
205 if ( !m_header.HasProgramName() ) {
206 cerr << "SamFormatParser ERROR:- @PG line is missing ID tag" << endl;
211 void SamFormatParser::ParseCOLine(const string& line) {
212 // simply add line to comments list
213 m_header.Comments.push_back(line);
216 const vector<string> SamFormatParser::Split(const string& line, const char delim) {
217 vector<string> tokens;
218 stringstream lineStream(line);
220 while ( getline(lineStream, token, delim) )
221 tokens.push_back(token);