5 * Created by westcott on 9/9/11.
6 * Copyright 2011 Schloss Lab. All rights reserved.
10 #include "sequenceparser.h"
13 /************************************************************/
14 SequenceParser::SequenceParser(string groupFile, string fastaFile, string nameFile) {
17 m = MothurOut::getInstance();
21 groupMap = new GroupMap(groupFile);
22 error = groupMap->readMap();
24 if (error == 1) { m->control_pressed = true; }
27 vector<string> namesOfGroups = groupMap->getNamesOfGroups();
28 for (int i = 0; i < namesOfGroups.size(); i++) {
29 vector<Sequence> temp;
30 map<string, string> tempMap;
31 seqs[namesOfGroups[i]] = temp;
32 nameMapPerGroup[namesOfGroups[i]] = tempMap;
35 //read fasta file making sure each sequence is in the group file
37 m->openInputFile(fastaFile, in);
39 map<string, string> seqName; //stores name -> sequence string so we can make new "unique" sequences when we parse the name file
43 if (m->control_pressed) { break; }
45 Sequence seq(in); m->gobble(in);
47 if (m->debug) { if((fastaCount) % 1000 == 0){ m->mothurOut("[DEBUG]: reading seq " + toString(fastaCount) + "\n."); } }
49 if (seq.getName() != "") {
51 string group = groupMap->getGroup(seq.getName());
52 if (group == "not found") { error = 1; m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your groupfile, please correct."); m->mothurOutEndLine(); }
54 seqs[group].push_back(seq);
55 seqName[seq.getName()] = seq.getAligned();
61 if (error == 1) { m->control_pressed = true; }
65 m->openInputFile(nameFile, inName);
67 //string first, second;
69 set<string> thisnames1;
73 bool pairDone = false;
74 bool columnOne = true;
75 string firstCol, secondCol;
77 while (!inName.eof()) {
78 if (m->control_pressed) { break; }
80 inName.read(buffer, 4096);
81 vector<string> pieces = m->splitWhiteSpace(rest, buffer, inName.gcount());
83 for (int i = 0; i < pieces.size(); i++) {
84 if (columnOne) { firstCol = pieces[i]; columnOne=false; }
85 else { secondCol = pieces[i]; pairDone = true; columnOne=true; }
87 if (pairDone) { //save one line
88 if (m->debug) { m->mothurOut("[DEBUG]: reading names: " + firstCol + '\t' + secondCol + ".\n"); }
90 m->splitAtChar(secondCol, names, ',');
92 //get aligned string for these seqs from the fasta file
93 string alignedString = "";
94 map<string, string>::iterator itAligned = seqName.find(names[0]);
95 if (itAligned == seqName.end()) {
96 error = 1; m->mothurOut("[ERROR]: " + names[0] + " is in your name file and not in your fasta file, please correct."); m->mothurOutEndLine();
98 alignedString = itAligned->second;
101 //separate by group - parse one line in name file
102 map<string, string> splitMap; //group -> name1,name2,...
103 map<string, string>::iterator it;
104 for (int i = 0; i < names.size(); i++) {
106 string group = groupMap->getGroup(names[i]);
107 if (group == "not found") { error = 1; m->mothurOut("[ERROR]: " + names[i] + " is in your name file and not in your groupfile, please correct."); m->mothurOutEndLine(); }
110 it = splitMap.find(group);
111 if (it != splitMap.end()) { //adding seqs to this group
112 (it->second) += "," + names[i];
113 thisnames1.insert(names[i]);
115 }else { //first sighting of this group
116 splitMap[group] = names[i];
118 thisnames1.insert(names[i]);
120 //is this seq in the fasta file?
121 if (i != 0) { //if not then we need to add a duplicate sequence to the seqs for this group so the new "fasta" and "name" files will match
122 Sequence tempSeq(names[i], alignedString); //get the first guys sequence string since he's in the fasta file.
123 seqs[group].push_back(tempSeq);
128 allSeqsMap[names[i]] = names[0];
132 //fill nameMapPerGroup - holds all lines in namefile separated by group
133 for (it = splitMap.begin(); it != splitMap.end(); it++) {
135 string firstName = "";
136 for(int i = 0; i < (it->second).length(); i++) {
137 if (((it->second)[i]) != ',') {
138 firstName += ((it->second)[i]);
142 //group1 -> seq1 -> seq1,seq2,seq3
143 nameMapPerGroup[it->first][firstName] = it->second;
152 //in case file does not end in white space
154 vector<string> pieces = m->splitWhiteSpace(rest);
156 for (int i = 0; i < pieces.size(); i++) {
157 if (columnOne) { firstCol = pieces[i]; columnOne=false; }
158 else { secondCol = pieces[i]; pairDone = true; columnOne=true; }
160 if (pairDone) { //save one line
161 if (m->debug) { m->mothurOut("[DEBUG]: reading names: " + firstCol + '\t' + secondCol + ".\n"); }
162 vector<string> names;
163 m->splitAtChar(secondCol, names, ',');
165 //get aligned string for these seqs from the fasta file
166 string alignedString = "";
167 map<string, string>::iterator itAligned = seqName.find(names[0]);
168 if (itAligned == seqName.end()) {
169 error = 1; m->mothurOut("[ERROR]: " + names[0] + " is in your name file and not in your fasta file, please correct."); m->mothurOutEndLine();
171 alignedString = itAligned->second;
174 //separate by group - parse one line in name file
175 map<string, string> splitMap; //group -> name1,name2,...
176 map<string, string>::iterator it;
177 for (int i = 0; i < names.size(); i++) {
179 string group = groupMap->getGroup(names[i]);
180 if (group == "not found") { error = 1; m->mothurOut("[ERROR]: " + names[i] + " is in your name file and not in your groupfile, please correct."); m->mothurOutEndLine(); }
183 it = splitMap.find(group);
184 if (it != splitMap.end()) { //adding seqs to this group
185 (it->second) += "," + names[i];
186 thisnames1.insert(names[i]);
188 }else { //first sighting of this group
189 splitMap[group] = names[i];
191 thisnames1.insert(names[i]);
193 //is this seq in the fasta file?
194 if (i != 0) { //if not then we need to add a duplicate sequence to the seqs for this group so the new "fasta" and "name" files will match
195 Sequence tempSeq(names[i], alignedString); //get the first guys sequence string since he's in the fasta file.
196 seqs[group].push_back(tempSeq);
201 allSeqsMap[names[i]] = names[0];
205 //fill nameMapPerGroup - holds all lines in namefile separated by group
206 for (it = splitMap.begin(); it != splitMap.end(); it++) {
208 string firstName = "";
209 for(int i = 0; i < (it->second).length(); i++) {
210 if (((it->second)[i]) != ',') {
211 firstName += ((it->second)[i]);
215 //group1 -> seq1 -> seq1,seq2,seq3
216 nameMapPerGroup[it->first][firstName] = it->second;
224 if (error == 1) { m->control_pressed = true; }
226 if (countName != (groupMap->getNumSeqs())) {
227 vector<string> groupseqsnames = groupMap->getNamesSeqs();
229 for (int i = 0; i < groupseqsnames.size(); i++) {
230 set<string>::iterator itnamesfile = thisnames1.find(groupseqsnames[i]);
231 if (itnamesfile == thisnames1.end()){
232 cout << "missing name " + groupseqsnames[i] << '\t' << allSeqsMap[groupseqsnames[i]] << endl;
236 m->mothurOutEndLine();
237 m->mothurOut("[ERROR]: Your name file contains " + toString(countName) + " valid sequences, and your groupfile contains " + toString(groupMap->getNumSeqs()) + ", please correct.");
238 m->mothurOutEndLine();
239 m->control_pressed = true;
243 catch(exception& e) {
244 m->errorOut(e, "SequenceParser", "SequenceParser");
248 /************************************************************/
249 SequenceParser::SequenceParser(string groupFile, string fastaFile) {
252 m = MothurOut::getInstance();
256 groupMap = new GroupMap(groupFile);
257 error = groupMap->readMap();
259 if (error == 1) { m->control_pressed = true; }
262 vector<string> namesOfGroups = groupMap->getNamesOfGroups();
263 for (int i = 0; i < namesOfGroups.size(); i++) {
264 vector<Sequence> temp;
265 seqs[namesOfGroups[i]] = temp;
268 //read fasta file making sure each sequence is in the group file
270 m->openInputFile(fastaFile, in);
275 if (m->control_pressed) { break; }
277 Sequence seq(in); m->gobble(in);
279 if (seq.getName() != "") {
281 string group = groupMap->getGroup(seq.getName());
282 if (group == "not found") { error = 1; m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your groupfile, please correct."); m->mothurOutEndLine(); }
283 else { seqs[group].push_back(seq); count++; }
288 if (error == 1) { m->control_pressed = true; }
290 if (count != (groupMap->getNumSeqs())) {
291 m->mothurOutEndLine();
292 m->mothurOut("[ERROR]: Your fasta file contains " + toString(count) + " valid sequences, and your groupfile contains " + toString(groupMap->getNumSeqs()) + ", please correct.");
293 if (count < (groupMap->getNumSeqs())) { m->mothurOut(" Did you forget to include the name file?"); }
294 m->mothurOutEndLine();
295 m->control_pressed = true;
299 catch(exception& e) {
300 m->errorOut(e, "SequenceParser", "SequenceParser");
304 /************************************************************/
305 SequenceParser::~SequenceParser(){ delete groupMap; }
306 /************************************************************/
307 int SequenceParser::getNumGroups(){ return groupMap->getNumGroups(); }
308 /************************************************************/
309 vector<string> SequenceParser::getNamesOfGroups(){ return groupMap->getNamesOfGroups(); }
310 /************************************************************/
311 bool SequenceParser::isValidGroup(string g){ return groupMap->isValidGroup(g); }
312 /************************************************************/
313 int SequenceParser::getNumSeqs(string g){
315 map<string, vector<Sequence> >::iterator it;
319 if(it == seqs.end()) {
320 m->mothurOut("[ERROR]: " + g + " is not a valid group, please correct."); m->mothurOutEndLine();
322 num = (it->second).size();
327 catch(exception& e) {
328 m->errorOut(e, "SequenceParser", "getNumSeqs");
332 /************************************************************/
333 vector<Sequence> SequenceParser::getSeqs(string g){
335 map<string, vector<Sequence> >::iterator it;
336 vector<Sequence> seqForThisGroup;
339 if(it == seqs.end()) {
340 m->mothurOut("[ERROR]: No sequences available for group " + g + ", please correct."); m->mothurOutEndLine();
342 seqForThisGroup = it->second;
343 if (m->debug) { m->mothurOut("[DEBUG]: group " + g + " fasta file has " + toString(seqForThisGroup.size()) + " sequences."); }
346 return seqForThisGroup;
348 catch(exception& e) {
349 m->errorOut(e, "SequenceParser", "getSeqs");
353 /************************************************************/
354 int SequenceParser::getSeqs(string g, string filename, bool uchimeFormat=false){
356 map<string, vector<Sequence> >::iterator it;
357 vector<Sequence> seqForThisGroup;
358 vector<seqPriorityNode> nameVector;
361 if(it == seqs.end()) {
362 m->mothurOut("[ERROR]: No sequences available for group " + g + ", please correct."); m->mothurOutEndLine();
366 m->openOutputFile(filename, out);
368 seqForThisGroup = it->second;
371 // format should look like
372 //>seqName /ab=numRedundantSeqs/
375 map<string, string> nameMapForThisGroup = getNameMap(g);
376 map<string, string>::iterator itNameMap;
379 for (int i = 0; i < seqForThisGroup.size(); i++) {
380 itNameMap = nameMapForThisGroup.find(seqForThisGroup[i].getName());
382 if (itNameMap == nameMapForThisGroup.end()){
384 m->mothurOut("[ERROR]: " + seqForThisGroup[i].getName() + " is in your fastafile, but is not in your namesfile, please correct."); m->mothurOutEndLine();
386 int num = m->getNumNames(itNameMap->second);
388 seqPriorityNode temp(num, seqForThisGroup[i].getAligned(), seqForThisGroup[i].getName());
389 nameVector.push_back(temp);
393 if (error == 1) { out.close(); m->mothurRemove(filename); return 1; }
395 //sort by num represented
396 sort(nameVector.begin(), nameVector.end(), compareSeqPriorityNodes);
398 //print new file in order of
399 for (int i = 0; i < nameVector.size(); i++) {
401 if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
403 out << ">" << nameVector[i].name << "/ab=" << nameVector[i].numIdentical << "/" << endl << nameVector[i].seq << endl; //
407 //m->mothurOut("Group " + g + " contains " + toString(seqForThisGroup.size()) + " unique seqs.\n");
408 for (int i = 0; i < seqForThisGroup.size(); i++) {
410 if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
412 seqForThisGroup[i].printSequence(out);
420 catch(exception& e) {
421 m->errorOut(e, "SequenceParser", "getSeqs");
426 /************************************************************/
427 map<string, string> SequenceParser::getNameMap(string g){
429 map<string, map<string, string> >::iterator it;
430 map<string, string> nameMapForThisGroup;
432 it = nameMapPerGroup.find(g);
433 if(it == nameMapPerGroup.end()) {
434 m->mothurOut("[ERROR]: No nameMap available for group " + g + ", please correct."); m->mothurOutEndLine();
436 nameMapForThisGroup = it->second;
437 if (m->debug) { m->mothurOut("[DEBUG]: group " + g + " name file has " + toString(nameMapForThisGroup.size()) + " unique sequences."); }
440 return nameMapForThisGroup;
442 catch(exception& e) {
443 m->errorOut(e, "SequenceParser", "getNameMap");
447 /************************************************************/
448 int SequenceParser::getNameMap(string g, string filename){
450 map<string, map<string, string> >::iterator it;
451 map<string, string> nameMapForThisGroup;
453 it = nameMapPerGroup.find(g);
454 if(it == nameMapPerGroup.end()) {
455 m->mothurOut("[ERROR]: No nameMap available for group " + g + ", please correct."); m->mothurOutEndLine();
457 nameMapForThisGroup = it->second;
460 m->openOutputFile(filename, out);
462 for (map<string, string>::iterator itFile = nameMapForThisGroup.begin(); itFile != nameMapForThisGroup.end(); itFile++) {
464 if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
466 out << itFile->first << '\t' << itFile->second << endl;
474 catch(exception& e) {
475 m->errorOut(e, "SequenceParser", "getNameMap");
479 /************************************************************/