1 // ***************************************************************************
2 // SamFormatParser.cpp (c) 2010 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 8 December 2011 (DB)
6 // ---------------------------------------------------------------------------
7 // Provides functionality for parsing SAM header text into SamHeader object
8 // ***************************************************************************
10 #include "api/SamConstants.h"
11 #include "api/SamHeader.h"
12 #include "api/internal/sam/SamFormatParser_p.h"
13 #include "api/internal/utils/BamException_p.h"
14 using namespace BamTools;
15 using namespace BamTools::Internal;
22 SamFormatParser::SamFormatParser(SamHeader& header)
26 SamFormatParser::~SamFormatParser(void) { }
28 void SamFormatParser::Parse(const string& headerText) {
30 // clear header's prior contents
33 // empty header is OK, but skip processing
34 if ( headerText.empty() )
37 // other wise parse SAM lines
38 istringstream headerStream(headerText);
39 string headerLine("");
40 while ( getline(headerStream, headerLine) )
41 ParseSamLine(headerLine);
44 void SamFormatParser::ParseSamLine(const string& line) {
46 // skip if line is not long enough to contain true values
47 if ( line.length() < 5 ) return;
49 // determine token at beginning of line
50 const string firstToken = line.substr(0,3);
51 const string restOfLine = line.substr(4);
52 if ( firstToken == Constants::SAM_HD_BEGIN_TOKEN) ParseHDLine(restOfLine);
53 else if ( firstToken == Constants::SAM_SQ_BEGIN_TOKEN) ParseSQLine(restOfLine);
54 else if ( firstToken == Constants::SAM_RG_BEGIN_TOKEN) ParseRGLine(restOfLine);
55 else if ( firstToken == Constants::SAM_PG_BEGIN_TOKEN) ParsePGLine(restOfLine);
56 else if ( firstToken == Constants::SAM_CO_BEGIN_TOKEN) ParseCOLine(restOfLine);
59 void SamFormatParser::ParseHDLine(const string& line) {
61 // split HD lines into tokens
62 vector<string> tokens = Split(line, Constants::SAM_TAB);
64 // iterate over tokens
65 vector<string>::const_iterator tokenIter = tokens.begin();
66 vector<string>::const_iterator tokenEnd = tokens.end();
67 for ( ; tokenIter != tokenEnd; ++tokenIter ) {
70 const string tokenTag = (*tokenIter).substr(0,2);
71 const string tokenValue = (*tokenIter).substr(3);
73 // set header contents
74 if ( tokenTag == Constants::SAM_HD_VERSION_TAG ) m_header.Version = tokenValue;
75 else if ( tokenTag == Constants::SAM_HD_SORTORDER_TAG ) m_header.SortOrder = tokenValue;
76 else if ( tokenTag == Constants::SAM_HD_GROUPORDER_TAG ) m_header.GroupOrder = tokenValue;
79 // check for required tags
80 if ( !m_header.HasVersion() )
81 throw BamException("SamFormatParser::ParseHDLine", "@HD line is missing VN tag");
84 void SamFormatParser::ParseSQLine(const string& line) {
88 // split SQ line into tokens
89 vector<string> tokens = Split(line, Constants::SAM_TAB);
91 // iterate over tokens
92 vector<string>::const_iterator tokenIter = tokens.begin();
93 vector<string>::const_iterator tokenEnd = tokens.end();
94 for ( ; tokenIter != tokenEnd; ++tokenIter ) {
97 const string tokenTag = (*tokenIter).substr(0,2);
98 const string tokenValue = (*tokenIter).substr(3);
100 // set sequence contents
101 if ( tokenTag == Constants::SAM_SQ_NAME_TAG ) seq.Name = tokenValue;
102 else if ( tokenTag == Constants::SAM_SQ_LENGTH_TAG ) seq.Length = tokenValue;
103 else if ( tokenTag == Constants::SAM_SQ_ASSEMBLYID_TAG ) seq.AssemblyID = tokenValue;
104 else if ( tokenTag == Constants::SAM_SQ_CHECKSUM_TAG ) seq.Checksum = tokenValue;
105 else if ( tokenTag == Constants::SAM_SQ_SPECIES_TAG ) seq.Species = tokenValue;
106 else if ( tokenTag == Constants::SAM_SQ_URI_TAG ) seq.URI = tokenValue;
109 // check for required tags
110 if ( !seq.HasName() )
111 throw BamException("SamFormatParser::ParseSQLine", "@SQ line is missing SN tag");
112 if ( !seq.HasLength() )
113 throw BamException("SamFormatParser::ParseSQLine", "@SQ line is missing LN tag");
115 // store SAM sequence entry
116 m_header.Sequences.Add(seq);
119 void SamFormatParser::ParseRGLine(const string& line) {
123 // split string into tokens
124 vector<string> tokens = Split(line, Constants::SAM_TAB);
126 // iterate over tokens
127 vector<string>::const_iterator tokenIter = tokens.begin();
128 vector<string>::const_iterator tokenEnd = tokens.end();
129 for ( ; tokenIter != tokenEnd; ++tokenIter ) {
131 // get token tag/value
132 const string tokenTag = (*tokenIter).substr(0,2);
133 const string tokenValue = (*tokenIter).substr(3);
135 // set read group contents
136 if ( tokenTag == Constants::SAM_RG_ID_TAG ) rg.ID = tokenValue;
137 else if ( tokenTag == Constants::SAM_RG_DESCRIPTION_TAG ) rg.Description = tokenValue;
138 else if ( tokenTag == Constants::SAM_RG_FLOWORDER_TAG ) rg.FlowOrder = tokenValue;
139 else if ( tokenTag == Constants::SAM_RG_KEYSEQUENCE_TAG ) rg.KeySequence = tokenValue;
140 else if ( tokenTag == Constants::SAM_RG_LIBRARY_TAG ) rg.Library = tokenValue;
141 else if ( tokenTag == Constants::SAM_RG_PLATFORMUNIT_TAG ) rg.PlatformUnit = tokenValue;
142 else if ( tokenTag == Constants::SAM_RG_PREDICTEDINSERTSIZE_TAG ) rg.PredictedInsertSize = tokenValue;
143 else if ( tokenTag == Constants::SAM_RG_PRODUCTIONDATE_TAG ) rg.ProductionDate = tokenValue;
144 else if ( tokenTag == Constants::SAM_RG_PROGRAM_TAG ) rg.Program = tokenValue;
145 else if ( tokenTag == Constants::SAM_RG_SAMPLE_TAG ) rg.Sample = tokenValue;
146 else if ( tokenTag == Constants::SAM_RG_SEQCENTER_TAG ) rg.SequencingCenter = tokenValue;
147 else if ( tokenTag == Constants::SAM_RG_SEQTECHNOLOGY_TAG ) rg.SequencingTechnology = tokenValue;
150 // check for required tags
152 throw BamException("SamFormatParser::ParseRGLine", "@RG line is missing ID tag");
154 // store SAM read group entry
155 m_header.ReadGroups.Add(rg);
158 void SamFormatParser::ParsePGLine(const string& line) {
162 // split string into tokens
163 vector<string> tokens = Split(line, Constants::SAM_TAB);
165 // iterate over tokens
166 vector<string>::const_iterator tokenIter = tokens.begin();
167 vector<string>::const_iterator tokenEnd = tokens.end();
168 for ( ; tokenIter != tokenEnd; ++tokenIter ) {
170 // get token tag/value
171 const string tokenTag = (*tokenIter).substr(0,2);
172 const string tokenValue = (*tokenIter).substr(3);
174 // set program record contents
175 if ( tokenTag == Constants::SAM_PG_ID_TAG ) pg.ID = tokenValue;
176 else if ( tokenTag == Constants::SAM_PG_NAME_TAG ) pg.Name = tokenValue;
177 else if ( tokenTag == Constants::SAM_PG_COMMANDLINE_TAG ) pg.CommandLine = tokenValue;
178 else if ( tokenTag == Constants::SAM_PG_PREVIOUSPROGRAM_TAG ) pg.PreviousProgramID = tokenValue;
179 else if ( tokenTag == Constants::SAM_PG_VERSION_TAG ) pg.Version = tokenValue;
182 // check for required tags
184 throw BamException("SamFormatParser::ParsePGLine", "@PG line is missing ID tag");
186 // store SAM program entry
187 m_header.Programs.Add(pg);
190 void SamFormatParser::ParseCOLine(const string& line) {
191 // simply add line to comments list
192 m_header.Comments.push_back(line);
195 const vector<string> SamFormatParser::Split(const string& line, const char delim) {
196 vector<string> tokens;
197 stringstream lineStream(line);
199 while ( getline(lineStream, token, delim) )
200 tokens.push_back(token);