5 * Created by westcott on 9/9/11.
6 * Copyright 2011 Schloss Lab. All rights reserved.
10 #include "sequenceparser.h"
13 /************************************************************/
14 SequenceParser::SequenceParser(string groupFile, string fastaFile, string nameFile) {
17 m = MothurOut::getInstance();
21 groupMap = new GroupMap(groupFile);
22 error = groupMap->readMap();
24 if (error == 1) { m->control_pressed = true; }
27 vector<string> namesOfGroups = groupMap->getNamesOfGroups();
28 for (int i = 0; i < namesOfGroups.size(); i++) {
29 vector<Sequence> temp;
30 map<string, string> tempMap;
31 seqs[namesOfGroups[i]] = temp;
32 nameMapPerGroup[namesOfGroups[i]] = tempMap;
35 //read fasta file making sure each sequence is in the group file
37 m->openInputFile(fastaFile, in);
39 map<string, string> seqName; //stores name -> sequence string so we can make new "unique" sequences when we parse the name file
42 if (m->control_pressed) { break; }
44 Sequence seq(in); m->gobble(in);
46 if (seq.getName() != "") {
48 string group = groupMap->getGroup(seq.getName());
49 if (group == "not found") { error = 1; m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your groupfile, please correct."); m->mothurOutEndLine(); }
51 seqs[group].push_back(seq);
52 seqName[seq.getName()] = seq.getAligned();
58 if (error == 1) { m->control_pressed = true; }
62 m->openInputFile(nameFile, inName);
66 set<string> thisnames1;
68 while(!inName.eof()) {
70 if (m->control_pressed) { break; }
72 inName >> first; m->gobble(inName);
73 inName >> second; m->gobble(inName);
76 m->splitAtChar(second, names, ',');
78 //get aligned string for these seqs from the fasta file
79 string alignedString = "";
80 map<string, string>::iterator itAligned = seqName.find(names[0]);
81 if (itAligned == seqName.end()) {
82 error = 1; m->mothurOut("[ERROR]: " + names[0] + " is in your name file and not in your fasta file, please correct."); m->mothurOutEndLine();
84 alignedString = itAligned->second;
87 //separate by group - parse one line in name file
88 map<string, string> splitMap; //group -> name1,name2,...
89 map<string, string>::iterator it;
90 for (int i = 0; i < names.size(); i++) {
92 string group = groupMap->getGroup(names[i]);
93 if (group == "not found") { error = 1; m->mothurOut("[ERROR]: " + names[i] + " is in your name file and not in your groupfile, please correct."); m->mothurOutEndLine(); }
96 it = splitMap.find(group);
97 if (it != splitMap.end()) { //adding seqs to this group
98 (it->second) += "," + names[i];
99 thisnames1.insert(names[i]);
101 }else { //first sighting of this group
102 splitMap[group] = names[i];
104 thisnames1.insert(names[i]);
106 //is this seq in the fasta file?
107 if (i != 0) { //if not then we need to add a duplicate sequence to the seqs for this group so the new "fasta" and "name" files will match
108 Sequence tempSeq(names[i], alignedString); //get the first guys sequence string since he's in the fasta file.
109 seqs[group].push_back(tempSeq);
114 allSeqsMap[names[i]] = names[0];
118 //fill nameMapPerGroup - holds all lines in namefile separated by group
119 for (it = splitMap.begin(); it != splitMap.end(); it++) {
121 string firstName = "";
122 for(int i = 0; i < (it->second).length(); i++) {
123 if (((it->second)[i]) != ',') {
124 firstName += ((it->second)[i]);
128 //group1 -> seq1 -> seq1,seq2,seq3
129 nameMapPerGroup[it->first][firstName] = it->second;
135 if (error == 1) { m->control_pressed = true; }
137 if (countName != (groupMap->getNumSeqs())) {
138 vector<string> groupseqsnames = groupMap->getNamesSeqs();
140 for (int i = 0; i < groupseqsnames.size(); i++) {
141 set<string>::iterator itnamesfile = thisnames1.find(groupseqsnames[i]);
142 if (itnamesfile == thisnames1.end()){
143 cout << "missing name " + groupseqsnames[i] << '\t' << allSeqsMap[groupseqsnames[i]] << endl;
147 m->mothurOutEndLine();
148 m->mothurOut("[ERROR]: Your name file contains " + toString(countName) + " valid sequences, and your groupfile contains " + toString(groupMap->getNumSeqs()) + ", please correct.");
149 m->mothurOutEndLine();
150 m->control_pressed = true;
154 catch(exception& e) {
155 m->errorOut(e, "SequenceParser", "SequenceParser");
159 /************************************************************/
160 SequenceParser::SequenceParser(string groupFile, string fastaFile) {
163 m = MothurOut::getInstance();
167 groupMap = new GroupMap(groupFile);
168 error = groupMap->readMap();
170 if (error == 1) { m->control_pressed = true; }
173 vector<string> namesOfGroups = groupMap->getNamesOfGroups();
174 for (int i = 0; i < namesOfGroups.size(); i++) {
175 vector<Sequence> temp;
176 seqs[namesOfGroups[i]] = temp;
179 //read fasta file making sure each sequence is in the group file
181 m->openInputFile(fastaFile, in);
186 if (m->control_pressed) { break; }
188 Sequence seq(in); m->gobble(in);
190 if (seq.getName() != "") {
192 string group = groupMap->getGroup(seq.getName());
193 if (group == "not found") { error = 1; m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your groupfile, please correct."); m->mothurOutEndLine(); }
194 else { seqs[group].push_back(seq); count++; }
199 if (error == 1) { m->control_pressed = true; }
201 if (count != (groupMap->getNumSeqs())) {
202 m->mothurOutEndLine();
203 m->mothurOut("[ERROR]: Your fasta file contains " + toString(count) + " valid sequences, and your groupfile contains " + toString(groupMap->getNumSeqs()) + ", please correct.");
204 if (count < (groupMap->getNumSeqs())) { m->mothurOut(" Did you forget to include the name file?"); }
205 m->mothurOutEndLine();
206 m->control_pressed = true;
210 catch(exception& e) {
211 m->errorOut(e, "SequenceParser", "SequenceParser");
215 /************************************************************/
216 SequenceParser::~SequenceParser(){ delete groupMap; }
217 /************************************************************/
218 int SequenceParser::getNumGroups(){ return groupMap->getNumGroups(); }
219 /************************************************************/
220 vector<string> SequenceParser::getNamesOfGroups(){ return groupMap->getNamesOfGroups(); }
221 /************************************************************/
222 bool SequenceParser::isValidGroup(string g){ return groupMap->isValidGroup(g); }
223 /************************************************************/
224 string SequenceParser::getGroup(string g){ return groupMap->getGroup(g); }
225 /************************************************************/
226 int SequenceParser::getNumSeqs(string g){
228 map<string, vector<Sequence> >::iterator it;
232 if(it == seqs.end()) {
233 m->mothurOut("[ERROR]: " + g + " is not a valid group, please correct."); m->mothurOutEndLine();
235 num = (it->second).size();
240 catch(exception& e) {
241 m->errorOut(e, "SequenceParser", "getNumSeqs");
245 /************************************************************/
246 vector<Sequence> SequenceParser::getSeqs(string g){
248 map<string, vector<Sequence> >::iterator it;
249 vector<Sequence> seqForThisGroup;
252 if(it == seqs.end()) {
253 m->mothurOut("[ERROR]: No sequences available for group " + g + ", please correct."); m->mothurOutEndLine();
255 seqForThisGroup = it->second;
258 return seqForThisGroup;
260 catch(exception& e) {
261 m->errorOut(e, "SequenceParser", "getSeqs");
265 /************************************************************/
266 int SequenceParser::getSeqs(string g, string filename, bool uchimeFormat=false){
268 map<string, vector<Sequence> >::iterator it;
269 vector<Sequence> seqForThisGroup;
270 vector<seqPriorityNode> nameVector;
273 if(it == seqs.end()) {
274 m->mothurOut("[ERROR]: No sequences available for group " + g + ", please correct."); m->mothurOutEndLine();
278 m->openOutputFile(filename, out);
280 seqForThisGroup = it->second;
283 // format should look like
284 //>seqName /ab=numRedundantSeqs/
287 map<string, string> nameMapForThisGroup = getNameMap(g);
288 map<string, string>::iterator itNameMap;
291 for (int i = 0; i < seqForThisGroup.size(); i++) {
292 itNameMap = nameMapForThisGroup.find(seqForThisGroup[i].getName());
294 if (itNameMap == nameMapForThisGroup.end()){
296 m->mothurOut("[ERROR]: " + seqForThisGroup[i].getName() + " is in your fastafile, but is not in your namesfile, please correct."); m->mothurOutEndLine();
298 int num = m->getNumNames(itNameMap->second);
300 seqPriorityNode temp(num, seqForThisGroup[i].getAligned(), seqForThisGroup[i].getName());
301 nameVector.push_back(temp);
305 if (error == 1) { out.close(); m->mothurRemove(filename); return 1; }
307 //sort by num represented
308 sort(nameVector.begin(), nameVector.end(), compareSeqPriorityNodes);
310 //print new file in order of
311 for (int i = 0; i < nameVector.size(); i++) {
313 if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
315 out << ">" << nameVector[i].name << "/ab=" << nameVector[i].numIdentical << "/" << endl << nameVector[i].seq << endl;
319 for (int i = 0; i < seqForThisGroup.size(); i++) {
321 if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
323 seqForThisGroup[i].printSequence(out);
331 catch(exception& e) {
332 m->errorOut(e, "SequenceParser", "getSeqs");
337 /************************************************************/
338 map<string, string> SequenceParser::getNameMap(string g){
340 map<string, map<string, string> >::iterator it;
341 map<string, string> nameMapForThisGroup;
343 it = nameMapPerGroup.find(g);
344 if(it == nameMapPerGroup.end()) {
345 m->mothurOut("[ERROR]: No nameMap available for group " + g + ", please correct."); m->mothurOutEndLine();
347 nameMapForThisGroup = it->second;
350 return nameMapForThisGroup;
352 catch(exception& e) {
353 m->errorOut(e, "SequenceParser", "getNameMap");
357 /************************************************************/
358 int SequenceParser::getNameMap(string g, string filename){
360 map<string, map<string, string> >::iterator it;
361 map<string, string> nameMapForThisGroup;
363 it = nameMapPerGroup.find(g);
364 if(it == nameMapPerGroup.end()) {
365 m->mothurOut("[ERROR]: No nameMap available for group " + g + ", please correct."); m->mothurOutEndLine();
367 nameMapForThisGroup = it->second;
370 m->openOutputFile(filename, out);
372 for (map<string, string>::iterator itFile = nameMapForThisGroup.begin(); itFile != nameMapForThisGroup.end(); itFile++) {
374 if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
376 out << itFile->first << '\t' << itFile->second << endl;
384 catch(exception& e) {
385 m->errorOut(e, "SequenceParser", "getNameMap");
389 /************************************************************/