]> git.donarmstrong.com Git - mothur.git/blob - sequenceparser.cpp
adds group parameter to chimera.uchime so you can check for chimeras with template...
[mothur.git] / sequenceparser.cpp
1 /*
2  *  sequenceParser.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 9/9/11.
6  *  Copyright 2011 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "sequenceParser.h"
11
12
13 /************************************************************/
14 SequenceParser::SequenceParser(string groupFile, string fastaFile, string nameFile) {
15         try {
16                 
17                 m = MothurOut::getInstance();
18                 int error;
19                 
20                 //read group file
21                 groupMap = new GroupMap(groupFile);
22                 error = groupMap->readMap();
23                 
24                 if (error == 1) { m->control_pressed = true; }
25                 
26                 //initialize maps
27                 vector<string> namesOfGroups = groupMap->getNamesOfGroups();
28                 for (int i = 0; i < namesOfGroups.size(); i++) {
29                         vector<Sequence> temp;
30                         map<string, string> tempMap;
31                         seqs[namesOfGroups[i]] = temp;
32                         nameMapPerGroup[namesOfGroups[i]] = tempMap;
33                 }
34                 
35                 //read fasta file making sure each sequence is in the group file
36                 ifstream in;
37                 m->openInputFile(fastaFile, in);
38                 
39                 map<string, string> seqName; //stores name -> sequence string so we can make new "unique" sequences when we parse the name file
40                 while (!in.eof()) {
41                         
42                         if (m->control_pressed) { break; }
43                         
44                         Sequence seq(in); m->gobble(in);
45                         
46                         if (seq.getName() != "") {
47                                 
48                                  string group = groupMap->getGroup(seq.getName());
49                                  if (group == "not found") {  error = 1; m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your groupfile, please correct."); m->mothurOutEndLine();  }
50                                  else { 
51                                          seqs[group].push_back(seq);    
52                                          seqName[seq.getName()] = seq.getAligned();
53                                  }
54                         }
55                 }
56                 in.close();
57                                  
58                 if (error == 1) { m->control_pressed = true; }
59                                  
60                 //read name file
61                 ifstream inName;
62                 m->openInputFile(nameFile, inName);
63                 
64                 string first, second;
65                 int countName = 0;
66                 while(!inName.eof()) {
67                         
68                         if (m->control_pressed) { break; }
69                         
70                         inName >> first; m->gobble(inName);
71                         inName >> second; m->gobble(inName);
72                         
73                         vector<string> names;
74                         m->splitAtChar(second, names, ',');
75                         
76                         //get aligned string for these seqs from the fasta file
77                         string alignedString = "";
78                         map<string, string>::iterator itAligned = seqName.find(names[0]);
79                         if (itAligned == seqName.end()) {
80                                 error = 1; m->mothurOut("[ERROR]: " + names[0] + " is in your name file and not in your fasta file, please correct."); m->mothurOutEndLine();
81                         }else {
82                                 alignedString = itAligned->second;
83                         }
84                         
85                         //separate by group - parse one line in name file
86                         map<string, string> splitMap; //group -> name1,name2,...
87                         map<string, string>::iterator it;
88                         for (int i = 0; i < names.size(); i++) {
89                                 
90                                 string group = groupMap->getGroup(names[i]);
91                                 if (group == "not found") {  error = 1; m->mothurOut("[ERROR]: " + names[i] + " is in your name file and not in your groupfile, please correct."); m->mothurOutEndLine();  }
92                                 else {  
93                                         
94                                         it = splitMap.find(group);
95                                         if (it != splitMap.end()) { //adding seqs to this group
96                                                 (it->second) += "," + names[i];
97                                                 countName++;
98                                         }else { //first sighting of this group
99                                                 splitMap[group] = names[i];
100                                                 countName++;
101                                                 
102                                                 //is this seq in the fasta file?
103                                                 if (i != 0) { //if not then we need to add a duplicate sequence to the seqs for this group so the new "fasta" and "name" files will match
104                                                         Sequence tempSeq(names[i], alignedString); //get the first guys sequence string since he's in the fasta file.
105                                                         seqs[group].push_back(tempSeq);
106                                                 }
107                                         }
108                                 }
109                                 
110                                 allSeqsMap[names[i]] = names[0];
111                         }
112                         
113                         
114                         //fill nameMapPerGroup - holds all lines in namefile separated by group
115                         for (it = splitMap.begin(); it != splitMap.end(); it++) {
116                                 //grab first name
117                                 string firstName = "";
118                                 for(int i = 0; i < (it->second).length(); i++) {
119                                         if (((it->second)[i]) != ',') {
120                                                 firstName += ((it->second)[i]);
121                                         }else { break; }
122                                 }
123                                 
124                                 //group1 -> seq1 -> seq1,seq2,seq3
125                                 nameMapPerGroup[it->first][firstName] = it->second;
126                         }
127                 }
128                 
129                 inName.close();
130                 
131                 if (error == 1) { m->control_pressed = true; }
132                 
133                 if (countName != (groupMap->getNumSeqs())) {
134                         m->mothurOutEndLine();
135                         m->mothurOut("[ERROR]: Your name file contains " + toString(countName) + " valid sequences, and your groupfile contains " + toString(groupMap->getNumSeqs()) + ", please correct.");
136                         m->mothurOutEndLine();
137                         m->control_pressed = true;
138                 }
139                 
140         }
141         catch(exception& e) {
142                 m->errorOut(e, "SequenceParser", "SequenceParser");
143                 exit(1);
144         }
145 }
146 /************************************************************/
147 SequenceParser::SequenceParser(string groupFile, string fastaFile) {
148         try {
149                 
150                 m = MothurOut::getInstance();
151                 int error;
152                 
153                 //read group file
154                 groupMap = new GroupMap(groupFile);
155                 error = groupMap->readMap();
156                 
157                 if (error == 1) { m->control_pressed = true; }
158                 
159                 //initialize maps
160                 vector<string> namesOfGroups = groupMap->getNamesOfGroups();
161                 for (int i = 0; i < namesOfGroups.size(); i++) {
162                         vector<Sequence> temp;
163                         seqs[namesOfGroups[i]] = temp;
164                 }
165                 
166                 //read fasta file making sure each sequence is in the group file
167                 ifstream in;
168                 m->openInputFile(fastaFile, in);
169                 int count = 0;
170                 
171                 while (!in.eof()) {
172                         
173                         if (m->control_pressed) { break; }
174                         
175                         Sequence seq(in); m->gobble(in);
176                         
177                         if (seq.getName() != "") {
178                                 
179                                 string group = groupMap->getGroup(seq.getName());
180                                 if (group == "not found") {  error = 1; m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your groupfile, please correct."); m->mothurOutEndLine();  }
181                                 else {  seqs[group].push_back(seq);     count++; }
182                         }
183                 }
184                 in.close();
185                 
186                 if (error == 1) { m->control_pressed = true; }
187                 
188                 if (count != (groupMap->getNumSeqs())) {
189                         m->mothurOutEndLine();
190                         m->mothurOut("[ERROR]: Your fasta file contains " + toString(count) + " valid sequences, and your groupfile contains " + toString(groupMap->getNumSeqs()) + ", please correct.");
191                         if (count < (groupMap->getNumSeqs())) { m->mothurOut(" Did you forget to include the name file?"); }
192                         m->mothurOutEndLine();
193                         m->control_pressed = true;
194                 }
195                 
196         }
197         catch(exception& e) {
198                 m->errorOut(e, "SequenceParser", "SequenceParser");
199                 exit(1);
200         }
201 }
202 /************************************************************/
203 SequenceParser::~SequenceParser(){ delete groupMap; }
204 /************************************************************/
205 int SequenceParser::getNumGroups(){ return groupMap->getNumGroups(); }
206 /************************************************************/
207 vector<string> SequenceParser::getNamesOfGroups(){ return groupMap->getNamesOfGroups(); }
208 /************************************************************/
209 bool SequenceParser::isValidGroup(string g){ return groupMap->isValidGroup(g); }
210 /************************************************************/
211 string SequenceParser::getGroup(string g){ return groupMap->getGroup(g); }
212 /************************************************************/
213 int SequenceParser::getNumSeqs(string g){ 
214         try {
215                 map<string, vector<Sequence> >::iterator it;
216                 int num = 0;
217                 
218                 it = seqs.find(g);
219                 if(it == seqs.end()) {
220                         m->mothurOut("[ERROR]: " + g + " is not a valid group, please correct."); m->mothurOutEndLine();
221                 }else {
222                         num = (it->second).size();
223                 }
224                 
225                 return num; 
226         }
227         catch(exception& e) {
228                 m->errorOut(e, "SequenceParser", "getNumSeqs");
229                 exit(1);
230         }
231 }
232 /************************************************************/
233 vector<Sequence> SequenceParser::getSeqs(string g){ 
234         try {
235                 map<string, vector<Sequence> >::iterator it;
236                 vector<Sequence> seqForThisGroup;
237                 
238                 it = seqs.find(g);
239                 if(it == seqs.end()) {
240                         m->mothurOut("[ERROR]: No sequences available for group " + g + ", please correct."); m->mothurOutEndLine();
241                 }else {
242                         seqForThisGroup = it->second;
243                 }
244                 
245                 return seqForThisGroup; 
246         }
247         catch(exception& e) {
248                 m->errorOut(e, "SequenceParser", "getSeqs");
249                 exit(1);
250         }
251 }
252 /************************************************************/
253 int SequenceParser::getSeqs(string g, string filename, bool uchimeFormat=false){ 
254         try {
255                 map<string, vector<Sequence> >::iterator it;
256                 vector<Sequence> seqForThisGroup;
257                 vector<seqPriorityNode> nameVector;
258                 
259                 it = seqs.find(g);
260                 if(it == seqs.end()) {
261                         m->mothurOut("[ERROR]: No sequences available for group " + g + ", please correct."); m->mothurOutEndLine();
262                 }else {
263                         
264                         ofstream out;
265                         m->openOutputFile(filename, out);
266                         
267                         seqForThisGroup = it->second;
268                         
269                         if (uchimeFormat) {
270                                 // format should look like 
271                                 //>seqName /ab=numRedundantSeqs/
272                                 //sequence
273                                 
274                                 map<string, string> nameMapForThisGroup = getNameMap(g);
275                                 map<string, string>::iterator itNameMap;
276                                 int error = 0;
277                                 
278                                 for (int i = 0; i < seqForThisGroup.size(); i++) {
279                                         itNameMap = nameMapForThisGroup.find(seqForThisGroup[i].getName());
280                                         
281                                         if (itNameMap == nameMapForThisGroup.end()){
282                                                 error = 1;
283                                                 m->mothurOut("[ERROR]: " + seqForThisGroup[i].getName() + " is in your fastafile, but is not in your namesfile, please correct."); m->mothurOutEndLine();
284                                         }else {
285                                                 int num = m->getNumNames(itNameMap->second);
286                                                 
287                                                 seqPriorityNode temp(num, seqForThisGroup[i].getAligned(), seqForThisGroup[i].getName());
288                                                 nameVector.push_back(temp);
289                                         }
290                                 }
291                                 
292                                 if (error == 1) { out.close(); m->mothurRemove(filename); return 1; }
293                                 
294                                 //sort by num represented
295                                 sort(nameVector.begin(), nameVector.end(), compareSeqPriorityNodes);
296
297                                 //print new file in order of
298                                 for (int i = 0; i < nameVector.size(); i++) {
299                                         
300                                         if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
301                                         
302                                         out << ">" << nameVector[i].name  << "/ab=" << nameVector[i].numIdentical << "/" << endl << nameVector[i].seq << endl;
303                                 }
304                                 
305                         }else { 
306                                 for (int i = 0; i < seqForThisGroup.size(); i++) {
307                                         
308                                         if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
309                                         
310                                         seqForThisGroup[i].printSequence(out);  
311                                 }
312                         }
313                         out.close();
314                 }
315                 
316                 return 0; 
317         }
318         catch(exception& e) {
319                 m->errorOut(e, "SequenceParser", "getSeqs");
320                 exit(1);
321         }
322 }
323
324 /************************************************************/
325 map<string, string> SequenceParser::getNameMap(string g){ 
326         try {
327                 map<string, map<string, string> >::iterator it;
328                 map<string, string> nameMapForThisGroup;
329                 
330                 it = nameMapPerGroup.find(g);
331                 if(it == nameMapPerGroup.end()) {
332                         m->mothurOut("[ERROR]: No nameMap available for group " + g + ", please correct."); m->mothurOutEndLine();
333                 }else {
334                         nameMapForThisGroup = it->second;
335                 }
336                 
337                 return nameMapForThisGroup; 
338         }
339         catch(exception& e) {
340                 m->errorOut(e, "SequenceParser", "getNameMap");
341                 exit(1);
342         }
343 }
344 /************************************************************/
345 int SequenceParser::getNameMap(string g, string filename){ 
346         try {
347                 map<string, map<string, string> >::iterator it;
348                 map<string, string> nameMapForThisGroup;
349                 
350                 it = nameMapPerGroup.find(g);
351                 if(it == nameMapPerGroup.end()) {
352                         m->mothurOut("[ERROR]: No nameMap available for group " + g + ", please correct."); m->mothurOutEndLine();
353                 }else {
354                         nameMapForThisGroup = it->second;
355                         
356                         ofstream out;
357                         m->openOutputFile(filename, out);
358                         
359                         for (map<string, string>::iterator itFile = nameMapForThisGroup.begin(); itFile != nameMapForThisGroup.end(); itFile++) {
360                                 
361                                 if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
362                                 
363                                 out << itFile->first << '\t' << itFile->second << endl;
364                         }
365                         
366                         out.close();
367                 }
368                 
369                 return 0; 
370         }
371         catch(exception& e) {
372                 m->errorOut(e, "SequenceParser", "getNameMap");
373                 exit(1);
374         }
375 }
376 /************************************************************/
377
378
379