1 // ***************************************************************************
2 // SamFormatParser.cpp (c) 2010 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 19 April 2011 (DB)
6 // ---------------------------------------------------------------------------
7 // Provides functionality for parsing SAM header text into SamHeader object
8 // ***************************************************************************
10 #include <api/SamConstants.h>
11 #include <api/SamHeader.h>
12 #include <api/internal/SamFormatParser_p.h>
13 using namespace BamTools;
14 using namespace BamTools::Internal;
21 SamFormatParser::SamFormatParser(SamHeader& header)
25 SamFormatParser::~SamFormatParser(void) { }
27 void SamFormatParser::Parse(const string& headerText) {
29 // clear header's prior contents
32 // empty header is OK, but skip processing
33 if ( headerText.empty() )
36 // other wise parse SAM lines
37 istringstream headerStream(headerText);
38 string headerLine("");
39 while ( getline(headerStream, headerLine) )
40 ParseSamLine(headerLine);
43 void SamFormatParser::ParseSamLine(const string& line) {
45 // skip if line is not long enough to contain true values
46 if (line.length() < 5 ) return;
48 // determine token at beginning of line
49 const string firstToken = line.substr(0,3);
50 string restOfLine = line.substr(4);
51 if ( firstToken == Constants::SAM_HD_BEGIN_TOKEN) ParseHDLine(restOfLine);
52 else if ( firstToken == Constants::SAM_SQ_BEGIN_TOKEN) ParseSQLine(restOfLine);
53 else if ( firstToken == Constants::SAM_RG_BEGIN_TOKEN) ParseRGLine(restOfLine);
54 else if ( firstToken == Constants::SAM_PG_BEGIN_TOKEN) ParsePGLine(restOfLine);
55 else if ( firstToken == Constants::SAM_CO_BEGIN_TOKEN) ParseCOLine(restOfLine);
57 cerr << "SamFormatParser ERROR: unknown token: " << firstToken << endl;
60 void SamFormatParser::ParseHDLine(const string& line) {
62 // split HD lines into tokens
63 vector<string> tokens = Split(line, Constants::SAM_TAB);
65 // iterate over tokens
66 vector<string>::const_iterator tokenIter = tokens.begin();
67 vector<string>::const_iterator tokenEnd = tokens.end();
68 for ( ; tokenIter != tokenEnd; ++tokenIter ) {
71 const string tokenTag = (*tokenIter).substr(0,2);
72 const string tokenValue = (*tokenIter).substr(3);
74 // set header contents
75 if ( tokenTag == Constants::SAM_HD_VERSION_TAG ) m_header.Version = tokenValue;
76 else if ( tokenTag == Constants::SAM_HD_SORTORDER_TAG ) m_header.SortOrder = tokenValue;
77 else if ( tokenTag == Constants::SAM_HD_GROUPORDER_TAG ) m_header.GroupOrder = tokenValue;
79 cerr << "SamFormatParser ERROR: unknown HD tag: " << tokenTag << endl;
82 // if @HD line exists, VN must be provided
83 if ( !m_header.HasVersion() )
84 cerr << "SamFormatParser ERROR: @HD line is missing VN tag" << endl;
87 void SamFormatParser::ParseSQLine(const string& line) {
91 // split SQ line into tokens
92 vector<string> tokens = Split(line, Constants::SAM_TAB);
94 // iterate over tokens
95 vector<string>::const_iterator tokenIter = tokens.begin();
96 vector<string>::const_iterator tokenEnd = tokens.end();
97 for ( ; tokenIter != tokenEnd; ++tokenIter ) {
100 const string tokenTag = (*tokenIter).substr(0,2);
101 const string tokenValue = (*tokenIter).substr(3);
103 // set sequence contents
104 if ( tokenTag == Constants::SAM_SQ_NAME_TAG ) seq.Name = tokenValue;
105 else if ( tokenTag == Constants::SAM_SQ_LENGTH_TAG ) seq.Length = tokenValue;
106 else if ( tokenTag == Constants::SAM_SQ_ASSEMBLYID_TAG ) seq.AssemblyID = tokenValue;
107 else if ( tokenTag == Constants::SAM_SQ_CHECKSUM_TAG ) seq.Checksum = tokenValue;
108 else if ( tokenTag == Constants::SAM_SQ_SPECIES_TAG ) seq.Species = tokenValue;
109 else if ( tokenTag == Constants::SAM_SQ_URI_TAG ) seq.URI = tokenValue;
111 cerr << "SamFormatParser ERROR: unknown SQ tag: " << tokenTag << endl;
114 bool isMissingRequiredFields = false;
116 // if @SQ line exists, SN must be provided
117 if ( !seq.HasName() ) {
118 isMissingRequiredFields = true;
119 cerr << "SamFormatParser ERROR: @SQ line is missing SN tag" << endl;
122 // if @SQ line exists, LN must be provided
123 if ( !seq.HasLength() ) {
124 isMissingRequiredFields = true;
125 cerr << "SamFormatParser ERROR: @SQ line is missing LN tag" << endl;
128 // store SAM sequence entry
129 if ( !isMissingRequiredFields )
130 m_header.Sequences.Add(seq);
133 void SamFormatParser::ParseRGLine(const string& line) {
137 // split string into tokens
138 vector<string> tokens = Split(line, Constants::SAM_TAB);
140 // iterate over tokens
141 vector<string>::const_iterator tokenIter = tokens.begin();
142 vector<string>::const_iterator tokenEnd = tokens.end();
143 for ( ; tokenIter != tokenEnd; ++tokenIter ) {
145 // get token tag/value
146 const string tokenTag = (*tokenIter).substr(0,2);
147 const string tokenValue = (*tokenIter).substr(3);
149 // set read group contents
150 if ( tokenTag == Constants::SAM_RG_ID_TAG ) rg.ID = tokenValue;
151 else if ( tokenTag == Constants::SAM_RG_DESCRIPTION_TAG ) rg.Description = tokenValue;
152 else if ( tokenTag == Constants::SAM_RG_FLOWORDER_TAG ) rg.FlowOrder = tokenValue;
153 else if ( tokenTag == Constants::SAM_RG_KEYSEQUENCE_TAG ) rg.KeySequence = tokenValue;
154 else if ( tokenTag == Constants::SAM_RG_LIBRARY_TAG ) rg.Library = tokenValue;
155 else if ( tokenTag == Constants::SAM_RG_PLATFORMUNIT_TAG ) rg.PlatformUnit = tokenValue;
156 else if ( tokenTag == Constants::SAM_RG_PREDICTEDINSERTSIZE_TAG ) rg.PredictedInsertSize = tokenValue;
157 else if ( tokenTag == Constants::SAM_RG_PRODUCTIONDATE_TAG ) rg.ProductionDate = tokenValue;
158 else if ( tokenTag == Constants::SAM_RG_PROGRAM_TAG ) rg.Program = tokenValue;
159 else if ( tokenTag == Constants::SAM_RG_SAMPLE_TAG ) rg.Sample = tokenValue;
160 else if ( tokenTag == Constants::SAM_RG_SEQCENTER_TAG ) rg.SequencingCenter = tokenValue;
161 else if ( tokenTag == Constants::SAM_RG_SEQTECHNOLOGY_TAG ) rg.SequencingTechnology = tokenValue;
163 cerr << "SamFormatParser ERROR: unknown RG tag: " << tokenTag << endl;
166 bool isMissingRequiredFields = false;
168 // if @RG line exists, ID must be provided
170 isMissingRequiredFields = true;
171 cerr << "SamFormatParser ERROR: @RG line is missing ID tag" << endl;
174 // store SAM read group entry
175 if ( !isMissingRequiredFields )
176 m_header.ReadGroups.Add(rg);
179 void SamFormatParser::ParsePGLine(const string& line) {
183 // split string into tokens
184 vector<string> tokens = Split(line, Constants::SAM_TAB);
186 // iterate over tokens
187 vector<string>::const_iterator tokenIter = tokens.begin();
188 vector<string>::const_iterator tokenEnd = tokens.end();
189 for ( ; tokenIter != tokenEnd; ++tokenIter ) {
191 // get token tag/value
192 const string tokenTag = (*tokenIter).substr(0,2);
193 const string tokenValue = (*tokenIter).substr(3);
195 // set program record contents
196 if ( tokenTag == Constants::SAM_PG_ID_TAG ) pg.ID = tokenValue;
197 else if ( tokenTag == Constants::SAM_PG_NAME_TAG ) pg.Name = tokenValue;
198 else if ( tokenTag == Constants::SAM_PG_COMMANDLINE_TAG ) pg.CommandLine = tokenValue;
199 else if ( tokenTag == Constants::SAM_PG_PREVIOUSPROGRAM_TAG ) pg.PreviousProgramID = tokenValue;
200 else if ( tokenTag == Constants::SAM_PG_VERSION_TAG ) pg.Version = tokenValue;
202 cerr << "SamFormatParser ERROR: unknown PG tag: " << tokenTag << endl;
205 bool isMissingRequiredFields = false;
207 // if @PG line exists, ID must be provided
209 isMissingRequiredFields = true;
210 cerr << "SamFormatParser ERROR: @PG line is missing ID tag" << endl;
213 // store SAM program record
214 if ( !isMissingRequiredFields )
215 m_header.Programs.Add(pg);
218 void SamFormatParser::ParseCOLine(const string& line) {
219 // simply add line to comments list
220 m_header.Comments.push_back(line);
223 const vector<string> SamFormatParser::Split(const string& line, const char delim) {
224 vector<string> tokens;
225 stringstream lineStream(line);
227 while ( getline(lineStream, token, delim) )
228 tokens.push_back(token);