1 // ***************************************************************************
2 // SamFormatParser.cpp (c) 2010 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 6 October 2011 (DB)
6 // ---------------------------------------------------------------------------
7 // Provides functionality for parsing SAM header text into SamHeader object
8 // ***************************************************************************
10 #include <api/SamConstants.h>
11 #include <api/SamHeader.h>
12 #include <api/internal/BamException_p.h>
13 #include <api/internal/SamFormatParser_p.h>
14 using namespace BamTools;
15 using namespace BamTools::Internal;
// Constructor: takes a non-const reference to the SamHeader to work with.
// NOTE(review): only the signature is visible in this listing; the member
// initializer list and body are not shown. Presumably 'header' is bound to the
// m_header member used by the Parse* methods - confirm against the full file.
22 SamFormatParser::SamFormatParser(SamHeader& header)
// Destructor: empty body, performs no explicit cleanup.
26 SamFormatParser::~SamFormatParser(void) { }
// Parses SAM header text, one line at a time.
//
// headerText is wrapped in an istringstream and read with getline(); every
// line obtained that way is handed to ParseSamLine(), so any exception raised
// there propagates out of this method.
28 void SamFormatParser::Parse(const string& headerText) {
30 // clear header's prior contents
// NOTE(review): the statement that performs the clear is not visible in this
// listing - confirm against the full file.
33 // empty header is OK, but skip processing
34 if ( headerText.empty() )
// NOTE(review): the statement controlled by this 'if' is not visible here;
// per the comment above it is presumably an early return - confirm.
37 // otherwise parse SAM lines
38 istringstream headerStream(headerText);
39 string headerLine("");
// getline() with no explicit delimiter splits the text on '\n'
40 while ( getline(headerStream, headerLine) )
41 ParseSamLine(headerLine);
// Dispatches a single header line to the parser for its record type.
//
// The first three characters of the line are taken as the record token and
// compared against the Constants::SAM_*_BEGIN_TOKEN values (HD, SQ, RG, PG, CO).
// Everything from index 4 onward is passed to the matching Parse*Line() method.
// Throws BamException ("unknown token: <token>") from the fallback at the end.
44 void SamFormatParser::ParseSamLine(const string& line) {
46 // skip if line is not long enough to contain true values
// lines shorter than 5 characters are silently ignored; this also guarantees
// that substr(0,3) and substr(4) below stay within the string
47 if ( line.length() < 5 ) return;
49 // determine token at beginning of line
50 const string firstToken = line.substr(0,3);
// the character at index 3 is skipped - presumably the tab that separates the
// record token from its fields (TODO confirm)
51 string restOfLine = line.substr(4);
52 if ( firstToken == Constants::SAM_HD_BEGIN_TOKEN) ParseHDLine(restOfLine);
53 else if ( firstToken == Constants::SAM_SQ_BEGIN_TOKEN) ParseSQLine(restOfLine);
54 else if ( firstToken == Constants::SAM_RG_BEGIN_TOKEN) ParseRGLine(restOfLine);
55 else if ( firstToken == Constants::SAM_PG_BEGIN_TOKEN) ParsePGLine(restOfLine);
56 else if ( firstToken == Constants::SAM_CO_BEGIN_TOKEN) ParseCOLine(restOfLine);
// NOTE(review): the 'else' that should guard this fallback is not visible in
// this listing; as shown, the throw would run for every line - confirm against
// the full file.
58 const string message = string("unknown token: ") + firstToken;
59 throw BamException("SamFormatParser::ParseSamLine", message);
// Parses the fields of an @HD line into m_header.
//
// 'line' is split on Constants::SAM_TAB. For each resulting token the first two
// characters are the tag and everything from index 3 onward is the value.
// Recognized tags are stored in m_header.Version, m_header.SortOrder and
// m_header.GroupOrder.
// Throws BamException for an unknown tag, and when m_header.HasVersion() is
// false at the final check.
63 void SamFormatParser::ParseHDLine(const string& line) {
65 // split HD lines into tokens
66 vector<string> tokens = Split(line, Constants::SAM_TAB);
68 // iterate over tokens
69 vector<string>::const_iterator tokenIter = tokens.begin();
70 vector<string>::const_iterator tokenEnd = tokens.end();
71 for ( ; tokenIter != tokenEnd; ++tokenIter ) {
// index 2 is skipped - presumably the ':' between tag and value (TODO confirm)
// NOTE(review): substr(3) throws std::out_of_range (not BamException) when a
// token is shorter than 3 characters, e.g. an empty token between two tabs
74 const string tokenTag = (*tokenIter).substr(0,2);
75 const string tokenValue = (*tokenIter).substr(3);
77 // set header contents
78 if ( tokenTag == Constants::SAM_HD_VERSION_TAG ) m_header.Version = tokenValue;
79 else if ( tokenTag == Constants::SAM_HD_SORTORDER_TAG ) m_header.SortOrder = tokenValue;
80 else if ( tokenTag == Constants::SAM_HD_GROUPORDER_TAG ) m_header.GroupOrder = tokenValue;
// NOTE(review): the 'else' guarding this fallback and the loop's closing brace
// are not visible in this listing - confirm against the full file.
82 const string message = string("unknown HD tag: ") + tokenTag;
83 throw BamException("SamFormatParser::ParseHDLine", message);
87 // check for required tags
88 if ( !m_header.HasVersion() )
89 throw BamException("SamFormatParser::ParseHDLine", "@HD line is missing VN tag");
// Parses the fields of an @SQ line into a sequence entry and adds it to
// m_header.Sequences.
//
// 'line' is split on Constants::SAM_TAB. For each resulting token the first two
// characters are the tag and everything from index 3 onward is the value.
// Recognized tags fill Name, Length, AssemblyID, Checksum, Species and URI of
// 'seq'.
// Throws BamException for an unknown tag, and when seq.HasName() or
// seq.HasLength() is false at the final checks.
// NOTE(review): the declaration of 'seq' is not visible in this listing.
92 void SamFormatParser::ParseSQLine(const string& line) {
96 // split SQ line into tokens
97 vector<string> tokens = Split(line, Constants::SAM_TAB);
99 // iterate over tokens
100 vector<string>::const_iterator tokenIter = tokens.begin();
101 vector<string>::const_iterator tokenEnd = tokens.end();
102 for ( ; tokenIter != tokenEnd; ++tokenIter ) {
// index 2 is skipped - presumably the ':' between tag and value (TODO confirm)
// NOTE(review): substr(3) throws std::out_of_range (not BamException) when a
// token is shorter than 3 characters
105 const string tokenTag = (*tokenIter).substr(0,2);
106 const string tokenValue = (*tokenIter).substr(3);
108 // set sequence contents
109 if ( tokenTag == Constants::SAM_SQ_NAME_TAG ) seq.Name = tokenValue;
110 else if ( tokenTag == Constants::SAM_SQ_LENGTH_TAG ) seq.Length = tokenValue;
111 else if ( tokenTag == Constants::SAM_SQ_ASSEMBLYID_TAG ) seq.AssemblyID = tokenValue;
112 else if ( tokenTag == Constants::SAM_SQ_CHECKSUM_TAG ) seq.Checksum = tokenValue;
113 else if ( tokenTag == Constants::SAM_SQ_SPECIES_TAG ) seq.Species = tokenValue;
114 else if ( tokenTag == Constants::SAM_SQ_URI_TAG ) seq.URI = tokenValue;
// NOTE(review): the 'else' guarding this fallback and the loop's closing brace
// are not visible in this listing - confirm against the full file.
116 const string message = string("unknown SQ tag: ") + tokenTag;
117 throw BamException("SamFormatParser::ParseSQLine", message);
121 // check for required tags
122 if ( !seq.HasName() )
123 throw BamException("SamFormatParser::ParseSQLine", "@SQ line is missing SN tag");
124 if ( !seq.HasLength() )
125 throw BamException("SamFormatParser::ParseSQLine", "@SQ line is missing LN tag");
127 // store SAM sequence entry
128 m_header.Sequences.Add(seq);
// Parses the fields of an @RG line into a read group entry and adds it to
// m_header.ReadGroups.
//
// 'line' is split on Constants::SAM_TAB. For each resulting token the first two
// characters are the tag and everything from index 3 onward is the value.
// Each recognized tag is stored in the corresponding field of 'rg'.
// Throws BamException for an unknown tag and for a missing ID tag.
// NOTE(review): the declaration of 'rg' is not visible in this listing.
131 void SamFormatParser::ParseRGLine(const string& line) {
135 // split string into tokens
136 vector<string> tokens = Split(line, Constants::SAM_TAB);
138 // iterate over tokens
139 vector<string>::const_iterator tokenIter = tokens.begin();
140 vector<string>::const_iterator tokenEnd = tokens.end();
141 for ( ; tokenIter != tokenEnd; ++tokenIter ) {
143 // get token tag/value
// index 2 is skipped - presumably the ':' between tag and value (TODO confirm)
// NOTE(review): substr(3) throws std::out_of_range (not BamException) when a
// token is shorter than 3 characters
144 const string tokenTag = (*tokenIter).substr(0,2);
145 const string tokenValue = (*tokenIter).substr(3);
147 // set read group contents
148 if ( tokenTag == Constants::SAM_RG_ID_TAG ) rg.ID = tokenValue;
149 else if ( tokenTag == Constants::SAM_RG_DESCRIPTION_TAG ) rg.Description = tokenValue;
150 else if ( tokenTag == Constants::SAM_RG_FLOWORDER_TAG ) rg.FlowOrder = tokenValue;
151 else if ( tokenTag == Constants::SAM_RG_KEYSEQUENCE_TAG ) rg.KeySequence = tokenValue;
152 else if ( tokenTag == Constants::SAM_RG_LIBRARY_TAG ) rg.Library = tokenValue;
153 else if ( tokenTag == Constants::SAM_RG_PLATFORMUNIT_TAG ) rg.PlatformUnit = tokenValue;
154 else if ( tokenTag == Constants::SAM_RG_PREDICTEDINSERTSIZE_TAG ) rg.PredictedInsertSize = tokenValue;
155 else if ( tokenTag == Constants::SAM_RG_PRODUCTIONDATE_TAG ) rg.ProductionDate = tokenValue;
156 else if ( tokenTag == Constants::SAM_RG_PROGRAM_TAG ) rg.Program = tokenValue;
157 else if ( tokenTag == Constants::SAM_RG_SAMPLE_TAG ) rg.Sample = tokenValue;
158 else if ( tokenTag == Constants::SAM_RG_SEQCENTER_TAG ) rg.SequencingCenter = tokenValue;
159 else if ( tokenTag == Constants::SAM_RG_SEQTECHNOLOGY_TAG ) rg.SequencingTechnology = tokenValue;
// NOTE(review): the 'else' guarding this fallback and the loop's closing brace
// are not visible in this listing - confirm against the full file.
161 const string message = string("unknown RG tag: ") + tokenTag;
162 throw BamException("SamFormatParser::ParseRGLine", message);
166 // check for required tags
// NOTE(review): the condition guarding this throw is not visible in this
// listing; presumably it tests whether the read group has an ID - confirm.
168 throw BamException("SamFormatParser::ParseRGLine", "@RG line is missing ID tag");
170 // store SAM read group entry
171 m_header.ReadGroups.Add(rg);
// Parses the fields of an @PG line into a program record and adds it to
// m_header.Programs.
//
// 'line' is split on Constants::SAM_TAB. For each resulting token the first two
// characters are the tag and everything from index 3 onward is the value.
// Recognized tags fill ID, Name, CommandLine, PreviousProgramID and Version of
// 'pg'.
// Throws BamException for an unknown tag and for a missing ID tag.
// NOTE(review): the declaration of 'pg' is not visible in this listing.
174 void SamFormatParser::ParsePGLine(const string& line) {
178 // split string into tokens
179 vector<string> tokens = Split(line, Constants::SAM_TAB);
181 // iterate over tokens
182 vector<string>::const_iterator tokenIter = tokens.begin();
183 vector<string>::const_iterator tokenEnd = tokens.end();
184 for ( ; tokenIter != tokenEnd; ++tokenIter ) {
186 // get token tag/value
// index 2 is skipped - presumably the ':' between tag and value (TODO confirm)
// NOTE(review): substr(3) throws std::out_of_range (not BamException) when a
// token is shorter than 3 characters
187 const string tokenTag = (*tokenIter).substr(0,2);
188 const string tokenValue = (*tokenIter).substr(3);
190 // set program record contents
191 if ( tokenTag == Constants::SAM_PG_ID_TAG ) pg.ID = tokenValue;
192 else if ( tokenTag == Constants::SAM_PG_NAME_TAG ) pg.Name = tokenValue;
193 else if ( tokenTag == Constants::SAM_PG_COMMANDLINE_TAG ) pg.CommandLine = tokenValue;
194 else if ( tokenTag == Constants::SAM_PG_PREVIOUSPROGRAM_TAG ) pg.PreviousProgramID = tokenValue;
195 else if ( tokenTag == Constants::SAM_PG_VERSION_TAG ) pg.Version = tokenValue;
// NOTE(review): the 'else' guarding this fallback and the loop's closing brace
// are not visible in this listing - confirm against the full file.
197 const string message = string("unknown PG tag: ") + tokenTag;
198 throw BamException("SamFormatParser::ParsePGLine", message);
202 // check for required tags
// NOTE(review): the condition guarding this throw is not visible in this
// listing; presumably it tests whether the program record has an ID - confirm.
204 throw BamException("SamFormatParser::ParsePGLine", "@PG line is missing ID tag");
206 // store SAM program entry
207 m_header.Programs.Add(pg);
// Handles an @CO (comment) line: the text is stored verbatim, with no
// tag/value splitting, by appending it to m_header.Comments.
210 void SamFormatParser::ParseCOLine(const string& line) {
211 // simply add line to comments list
212 m_header.Comments.push_back(line);
215 const vector<string> SamFormatParser::Split(const string& line, const char delim) {
216 vector<string> tokens;
217 stringstream lineStream(line);
219 while ( getline(lineStream, token, delim) )
220 tokens.push_back(token);