]> git.donarmstrong.com Git - mothur.git/blob - sequenceparser.cpp
fixed label option in sens.spec and added smart distancing.
[mothur.git] / sequenceparser.cpp
1 /*
2  *  sequenceParser.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 9/9/11.
6  *  Copyright 2011 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "sequenceparser.h"
11
12
13 /************************************************************/
14 SequenceParser::SequenceParser(string groupFile, string fastaFile, string nameFile) {
15         try {
16                 
17                 m = MothurOut::getInstance();
18                 int error;
19                 
20                 //read group file
21                 groupMap = new GroupMap(groupFile);
22                 error = groupMap->readMap();
23                 
24                 if (error == 1) { m->control_pressed = true; }
25                 
26                 //initialize maps
27                 vector<string> namesOfGroups = groupMap->getNamesOfGroups();
28                 for (int i = 0; i < namesOfGroups.size(); i++) {
29                         vector<Sequence> temp;
30                         map<string, string> tempMap;
31                         seqs[namesOfGroups[i]] = temp;
32                         nameMapPerGroup[namesOfGroups[i]] = tempMap;
33                 }
34                 
35                 //read fasta file making sure each sequence is in the group file
36                 ifstream in;
37                 m->openInputFile(fastaFile, in);
38                 
39                 map<string, string> seqName; //stores name -> sequence string so we can make new "unique" sequences when we parse the name file
40                 while (!in.eof()) {
41                         
42                         if (m->control_pressed) { break; }
43                         
44                         Sequence seq(in); m->gobble(in);
45                         
46                         if (seq.getName() != "") {
47                                 
48                                  string group = groupMap->getGroup(seq.getName());
49                                  if (group == "not found") {  error = 1; m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your groupfile, please correct."); m->mothurOutEndLine();  }
50                                  else { 
51                                          seqs[group].push_back(seq);    
52                                          seqName[seq.getName()] = seq.getAligned();
53                                  }
54                         }
55                 }
56                 in.close();
57                                  
58                 if (error == 1) { m->control_pressed = true; }
59                                  
60                 //read name file
61                 ifstream inName;
62                 m->openInputFile(nameFile, inName);
63                 
64                 string first, second;
65                 int countName = 0;
66                 set<string> thisnames1;
67                 
68                 while(!inName.eof()) {
69                         
70                         if (m->control_pressed) { break; }
71                         
72                         inName >> first; m->gobble(inName);
73                         inName >> second; m->gobble(inName);
74                         
75                         vector<string> names;
76                         m->splitAtChar(second, names, ',');
77                         
78                         //get aligned string for these seqs from the fasta file
79                         string alignedString = "";
80                         map<string, string>::iterator itAligned = seqName.find(names[0]);
81                         if (itAligned == seqName.end()) {
82                                 error = 1; m->mothurOut("[ERROR]: " + names[0] + " is in your name file and not in your fasta file, please correct."); m->mothurOutEndLine();
83                         }else {
84                                 alignedString = itAligned->second;
85                         }
86                         
87                         //separate by group - parse one line in name file
88                         map<string, string> splitMap; //group -> name1,name2,...
89                         map<string, string>::iterator it;
90                         for (int i = 0; i < names.size(); i++) {
91                                 
92                                 string group = groupMap->getGroup(names[i]);
93                                 if (group == "not found") {  error = 1; m->mothurOut("[ERROR]: " + names[i] + " is in your name file and not in your groupfile, please correct."); m->mothurOutEndLine();  }
94                                 else {  
95                                         
96                                         it = splitMap.find(group);
97                                         if (it != splitMap.end()) { //adding seqs to this group
98                                                 (it->second) += "," + names[i];
99                                                 thisnames1.insert(names[i]);
100                                                 countName++;
101                                         }else { //first sighting of this group
102                                                 splitMap[group] = names[i];
103                                                 countName++;
104                                                 thisnames1.insert(names[i]);
105                                                 
106                                                 //is this seq in the fasta file?
107                                                 if (i != 0) { //if not then we need to add a duplicate sequence to the seqs for this group so the new "fasta" and "name" files will match
108                                                         Sequence tempSeq(names[i], alignedString); //get the first guys sequence string since he's in the fasta file.
109                                                         seqs[group].push_back(tempSeq);
110                                                 }
111                                         }
112                                 }
113                                 
114                                 allSeqsMap[names[i]] = names[0];
115                         }
116                         
117                         
118                         //fill nameMapPerGroup - holds all lines in namefile separated by group
119                         for (it = splitMap.begin(); it != splitMap.end(); it++) {
120                                 //grab first name
121                                 string firstName = "";
122                                 for(int i = 0; i < (it->second).length(); i++) {
123                                         if (((it->second)[i]) != ',') {
124                                                 firstName += ((it->second)[i]);
125                                         }else { break; }
126                                 }
127                                 
128                                 //group1 -> seq1 -> seq1,seq2,seq3
129                                 nameMapPerGroup[it->first][firstName] = it->second;
130                         }
131                 }
132                 
133                 inName.close();
134                 
135                 if (error == 1) { m->control_pressed = true; }
136                 
137                 if (countName != (groupMap->getNumSeqs())) {
138                         vector<string> groupseqsnames = groupMap->getNamesSeqs();
139                         
140                         for (int i = 0; i < groupseqsnames.size(); i++) {
141                                 set<string>::iterator itnamesfile = thisnames1.find(groupseqsnames[i]);
142                                 if (itnamesfile == thisnames1.end()){
143                                         cout << "missing name " + groupseqsnames[i] << '\t' << allSeqsMap[groupseqsnames[i]] << endl;
144                                 }
145                         }
146                         
147                         m->mothurOutEndLine();
148                         m->mothurOut("[ERROR]: Your name file contains " + toString(countName) + " valid sequences, and your groupfile contains " + toString(groupMap->getNumSeqs()) + ", please correct.");
149                         m->mothurOutEndLine();
150                         m->control_pressed = true;
151                 }
152                 
153         }
154         catch(exception& e) {
155                 m->errorOut(e, "SequenceParser", "SequenceParser");
156                 exit(1);
157         }
158 }
159 /************************************************************/
160 SequenceParser::SequenceParser(string groupFile, string fastaFile) {
161         try {
162                 
163                 m = MothurOut::getInstance();
164                 int error;
165                 
166                 //read group file
167                 groupMap = new GroupMap(groupFile);
168                 error = groupMap->readMap();
169                 
170                 if (error == 1) { m->control_pressed = true; }
171                 
172                 //initialize maps
173                 vector<string> namesOfGroups = groupMap->getNamesOfGroups();
174                 for (int i = 0; i < namesOfGroups.size(); i++) {
175                         vector<Sequence> temp;
176                         seqs[namesOfGroups[i]] = temp;
177                 }
178                 
179                 //read fasta file making sure each sequence is in the group file
180                 ifstream in;
181                 m->openInputFile(fastaFile, in);
182                 int count = 0;
183                 
184                 while (!in.eof()) {
185                         
186                         if (m->control_pressed) { break; }
187                         
188                         Sequence seq(in); m->gobble(in);
189                         
190                         if (seq.getName() != "") {
191                                 
192                                 string group = groupMap->getGroup(seq.getName());
193                                 if (group == "not found") {  error = 1; m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your groupfile, please correct."); m->mothurOutEndLine();  }
194                                 else {  seqs[group].push_back(seq);     count++; }
195                         }
196                 }
197                 in.close();
198                 
199                 if (error == 1) { m->control_pressed = true; }
200                 
201                 if (count != (groupMap->getNumSeqs())) {
202                         m->mothurOutEndLine();
203                         m->mothurOut("[ERROR]: Your fasta file contains " + toString(count) + " valid sequences, and your groupfile contains " + toString(groupMap->getNumSeqs()) + ", please correct.");
204                         if (count < (groupMap->getNumSeqs())) { m->mothurOut(" Did you forget to include the name file?"); }
205                         m->mothurOutEndLine();
206                         m->control_pressed = true;
207                 }
208                 
209         }
210         catch(exception& e) {
211                 m->errorOut(e, "SequenceParser", "SequenceParser");
212                 exit(1);
213         }
214 }
215 /************************************************************/
216 SequenceParser::~SequenceParser(){ delete groupMap; }
217 /************************************************************/
218 int SequenceParser::getNumGroups(){ return groupMap->getNumGroups(); }
219 /************************************************************/
220 vector<string> SequenceParser::getNamesOfGroups(){ return groupMap->getNamesOfGroups(); }
221 /************************************************************/
222 bool SequenceParser::isValidGroup(string g){ return groupMap->isValidGroup(g); }
223 /************************************************************/
224 string SequenceParser::getGroup(string g){ return groupMap->getGroup(g); }
225 /************************************************************/
226 int SequenceParser::getNumSeqs(string g){ 
227         try {
228                 map<string, vector<Sequence> >::iterator it;
229                 int num = 0;
230                 
231                 it = seqs.find(g);
232                 if(it == seqs.end()) {
233                         m->mothurOut("[ERROR]: " + g + " is not a valid group, please correct."); m->mothurOutEndLine();
234                 }else {
235                         num = (it->second).size();
236                 }
237                 
238                 return num; 
239         }
240         catch(exception& e) {
241                 m->errorOut(e, "SequenceParser", "getNumSeqs");
242                 exit(1);
243         }
244 }
245 /************************************************************/
246 vector<Sequence> SequenceParser::getSeqs(string g){ 
247         try {
248                 map<string, vector<Sequence> >::iterator it;
249                 vector<Sequence> seqForThisGroup;
250                 
251                 it = seqs.find(g);
252                 if(it == seqs.end()) {
253                         m->mothurOut("[ERROR]: No sequences available for group " + g + ", please correct."); m->mothurOutEndLine();
254                 }else {
255                         seqForThisGroup = it->second;
256                 }
257                 
258                 return seqForThisGroup; 
259         }
260         catch(exception& e) {
261                 m->errorOut(e, "SequenceParser", "getSeqs");
262                 exit(1);
263         }
264 }
265 /************************************************************/
266 int SequenceParser::getSeqs(string g, string filename, bool uchimeFormat=false){ 
267         try {
268                 map<string, vector<Sequence> >::iterator it;
269                 vector<Sequence> seqForThisGroup;
270                 vector<seqPriorityNode> nameVector;
271                 
272                 it = seqs.find(g);
273                 if(it == seqs.end()) {
274                         m->mothurOut("[ERROR]: No sequences available for group " + g + ", please correct."); m->mothurOutEndLine();
275                 }else {
276                         
277                         ofstream out;
278                         m->openOutputFile(filename, out);
279                         
280                         seqForThisGroup = it->second;
281                         
282                         if (uchimeFormat) {
283                                 // format should look like 
284                                 //>seqName /ab=numRedundantSeqs/
285                                 //sequence
286                                 
287                                 map<string, string> nameMapForThisGroup = getNameMap(g);
288                                 map<string, string>::iterator itNameMap;
289                                 int error = 0;
290                                 
291                                 for (int i = 0; i < seqForThisGroup.size(); i++) {
292                                         itNameMap = nameMapForThisGroup.find(seqForThisGroup[i].getName());
293                                         
294                                         if (itNameMap == nameMapForThisGroup.end()){
295                                                 error = 1;
296                                                 m->mothurOut("[ERROR]: " + seqForThisGroup[i].getName() + " is in your fastafile, but is not in your namesfile, please correct."); m->mothurOutEndLine();
297                                         }else {
298                                                 int num = m->getNumNames(itNameMap->second);
299                                                 
300                                                 seqPriorityNode temp(num, seqForThisGroup[i].getAligned(), seqForThisGroup[i].getName());
301                                                 nameVector.push_back(temp);
302                                         }
303                                 }
304                                 
305                                 if (error == 1) { out.close(); m->mothurRemove(filename); return 1; }
306                                 
307                                 //sort by num represented
308                                 sort(nameVector.begin(), nameVector.end(), compareSeqPriorityNodes);
309
310                                 //print new file in order of
311                                 for (int i = 0; i < nameVector.size(); i++) {
312                                         
313                                         if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
314                                         
315                                         out << ">" << nameVector[i].name  << "/ab=" << nameVector[i].numIdentical << "/" << endl << nameVector[i].seq << endl;
316                                 }
317                                 
318                         }else { 
319                                 for (int i = 0; i < seqForThisGroup.size(); i++) {
320                                         
321                                         if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
322                                         
323                                         seqForThisGroup[i].printSequence(out);  
324                                 }
325                         }
326                         out.close();
327                 }
328                 
329                 return 0; 
330         }
331         catch(exception& e) {
332                 m->errorOut(e, "SequenceParser", "getSeqs");
333                 exit(1);
334         }
335 }
336
337 /************************************************************/
338 map<string, string> SequenceParser::getNameMap(string g){ 
339         try {
340                 map<string, map<string, string> >::iterator it;
341                 map<string, string> nameMapForThisGroup;
342                 
343                 it = nameMapPerGroup.find(g);
344                 if(it == nameMapPerGroup.end()) {
345                         m->mothurOut("[ERROR]: No nameMap available for group " + g + ", please correct."); m->mothurOutEndLine();
346                 }else {
347                         nameMapForThisGroup = it->second;
348                 }
349                 
350                 return nameMapForThisGroup; 
351         }
352         catch(exception& e) {
353                 m->errorOut(e, "SequenceParser", "getNameMap");
354                 exit(1);
355         }
356 }
357 /************************************************************/
358 int SequenceParser::getNameMap(string g, string filename){ 
359         try {
360                 map<string, map<string, string> >::iterator it;
361                 map<string, string> nameMapForThisGroup;
362                 
363                 it = nameMapPerGroup.find(g);
364                 if(it == nameMapPerGroup.end()) {
365                         m->mothurOut("[ERROR]: No nameMap available for group " + g + ", please correct."); m->mothurOutEndLine();
366                 }else {
367                         nameMapForThisGroup = it->second;
368                         
369                         ofstream out;
370                         m->openOutputFile(filename, out);
371                         
372                         for (map<string, string>::iterator itFile = nameMapForThisGroup.begin(); itFile != nameMapForThisGroup.end(); itFile++) {
373                                 
374                                 if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
375                                 
376                                 out << itFile->first << '\t' << itFile->second << endl;
377                         }
378                         
379                         out.close();
380                 }
381                 
382                 return 0; 
383         }
384         catch(exception& e) {
385                 m->errorOut(e, "SequenceParser", "getNameMap");
386                 exit(1);
387         }
388 }
389 /************************************************************/
390
391
392