]> git.donarmstrong.com Git - mothur.git/blob - sequenceparser.cpp
testing 1.22.0
[mothur.git] / sequenceparser.cpp
1 /*
2  *  sequenceParser.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 9/9/11.
6  *  Copyright 2011 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "sequenceParser.h"
11
12
13 /************************************************************/
14 SequenceParser::SequenceParser(string groupFile, string fastaFile, string nameFile) {
15         try {
16                 
17                 m = MothurOut::getInstance();
18                 int error;
19                 
20                 //read group file
21                 groupMap = new GroupMap(groupFile);
22                 error = groupMap->readMap();
23                 
24                 if (error == 1) { m->control_pressed = true; }
25                 
26                 //initialize maps
27                 vector<string> namesOfGroups = groupMap->getNamesOfGroups();
28                 for (int i = 0; i < namesOfGroups.size(); i++) {
29                         vector<Sequence> temp;
30                         map<string, string> tempMap;
31                         seqs[namesOfGroups[i]] = temp;
32                         nameMapPerGroup[namesOfGroups[i]] = tempMap;
33                 }
34                 
35                 //read fasta file making sure each sequence is in the group file
36                 ifstream in;
37                 m->openInputFile(fastaFile, in);
38                 
39                 map<string, string> seqName; //stores name -> sequence string so we can make new "unique" sequences when we parse the name file
40                 while (!in.eof()) {
41                         
42                         if (m->control_pressed) { break; }
43                         
44                         Sequence seq(in); m->gobble(in);
45                         
46                         if (seq.getName() != "") {
47                                 
48                                  string group = groupMap->getGroup(seq.getName());
49                                  if (group == "not found") {  error = 1; m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your groupfile, please correct."); m->mothurOutEndLine();  }
50                                  else { 
51                                          seqs[group].push_back(seq);    
52                                          seqName[seq.getName()] = seq.getAligned();
53                                  }
54                         }
55                 }
56                 in.close();
57                                  
58                 if (error == 1) { m->control_pressed = true; }
59                                  
60                 //read name file
61                 ifstream inName;
62                 m->openInputFile(nameFile, inName);
63                 
64                 string first, second;
65                 int countName = 0;
66                 set<string> thisnames1;
67                 while(!inName.eof()) {
68                         
69                         if (m->control_pressed) { break; }
70                         
71                         inName >> first; m->gobble(inName);
72                         inName >> second; m->gobble(inName);
73                         
74                         vector<string> names;
75                         m->splitAtChar(second, names, ',');
76                         
77                         //get aligned string for these seqs from the fasta file
78                         string alignedString = "";
79                         map<string, string>::iterator itAligned = seqName.find(names[0]);
80                         if (itAligned == seqName.end()) {
81                                 error = 1; m->mothurOut("[ERROR]: " + names[0] + " is in your name file and not in your fasta file, please correct."); m->mothurOutEndLine();
82                         }else {
83                                 alignedString = itAligned->second;
84                         }
85                         
86                         //separate by group - parse one line in name file
87                         map<string, string> splitMap; //group -> name1,name2,...
88                         map<string, string>::iterator it;
89                         for (int i = 0; i < names.size(); i++) {
90                                 
91                                 string group = groupMap->getGroup(names[i]);
92                                 if (group == "not found") {  error = 1; m->mothurOut("[ERROR]: " + names[i] + " is in your name file and not in your groupfile, please correct."); m->mothurOutEndLine();  }
93                                 else {  
94                                         
95                                         it = splitMap.find(group);
96                                         if (it != splitMap.end()) { //adding seqs to this group
97                                                 (it->second) += "," + names[i];
98                                                 thisnames1.insert(names[i]);
99                                                 countName++;
100                                         }else { //first sighting of this group
101                                                 splitMap[group] = names[i];
102                                                 countName++;
103                                                 thisnames1.insert(names[i]);
104                                                 
105                                                 //is this seq in the fasta file?
106                                                 if (i != 0) { //if not then we need to add a duplicate sequence to the seqs for this group so the new "fasta" and "name" files will match
107                                                         Sequence tempSeq(names[i], alignedString); //get the first guys sequence string since he's in the fasta file.
108                                                         seqs[group].push_back(tempSeq);
109                                                 }
110                                         }
111                                 }
112                                 
113                                 allSeqsMap[names[i]] = names[0];
114                         }
115                         
116                         
117                         //fill nameMapPerGroup - holds all lines in namefile separated by group
118                         for (it = splitMap.begin(); it != splitMap.end(); it++) {
119                                 //grab first name
120                                 string firstName = "";
121                                 for(int i = 0; i < (it->second).length(); i++) {
122                                         if (((it->second)[i]) != ',') {
123                                                 firstName += ((it->second)[i]);
124                                         }else { break; }
125                                 }
126                                 
127                                 //group1 -> seq1 -> seq1,seq2,seq3
128                                 nameMapPerGroup[it->first][firstName] = it->second;
129                         }
130                 }
131                 
132                 inName.close();
133                 
134                 if (error == 1) { m->control_pressed = true; }
135                 
136                 if (countName != (groupMap->getNumSeqs())) {
137                         vector<string> groupseqsnames = groupMap->getNamesSeqs();
138                         for (int i = 0; i < groupseqsnames.size(); i++) {
139                                 set<string>::iterator itnamesfile = thisnames1.find(groupseqsnames[i]);
140                                 if (itnamesfile == thisnames1.end()){
141                                         cout << "missing name " + groupseqsnames[i] << '\t' << allSeqsMap[groupseqsnames[i]] << endl;
142                                 }
143                         }
144                         m->mothurOutEndLine();
145                         m->mothurOut("[ERROR]: Your name file contains " + toString(countName) + " valid sequences, and your groupfile contains " + toString(groupMap->getNumSeqs()) + ", please correct.");
146                         m->mothurOutEndLine();
147                         m->control_pressed = true;
148                 }
149                 
150         }
151         catch(exception& e) {
152                 m->errorOut(e, "SequenceParser", "SequenceParser");
153                 exit(1);
154         }
155 }
156 /************************************************************/
157 SequenceParser::SequenceParser(string groupFile, string fastaFile) {
158         try {
159                 
160                 m = MothurOut::getInstance();
161                 int error;
162                 
163                 //read group file
164                 groupMap = new GroupMap(groupFile);
165                 error = groupMap->readMap();
166                 
167                 if (error == 1) { m->control_pressed = true; }
168                 
169                 //initialize maps
170                 vector<string> namesOfGroups = groupMap->getNamesOfGroups();
171                 for (int i = 0; i < namesOfGroups.size(); i++) {
172                         vector<Sequence> temp;
173                         seqs[namesOfGroups[i]] = temp;
174                 }
175                 
176                 //read fasta file making sure each sequence is in the group file
177                 ifstream in;
178                 m->openInputFile(fastaFile, in);
179                 int count = 0;
180                 
181                 while (!in.eof()) {
182                         
183                         if (m->control_pressed) { break; }
184                         
185                         Sequence seq(in); m->gobble(in);
186                         
187                         if (seq.getName() != "") {
188                                 
189                                 string group = groupMap->getGroup(seq.getName());
190                                 if (group == "not found") {  error = 1; m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your groupfile, please correct."); m->mothurOutEndLine();  }
191                                 else {  seqs[group].push_back(seq);     count++; }
192                         }
193                 }
194                 in.close();
195                 
196                 if (error == 1) { m->control_pressed = true; }
197                 
198                 if (count != (groupMap->getNumSeqs())) {
199                         m->mothurOutEndLine();
200                         m->mothurOut("[ERROR]: Your fasta file contains " + toString(count) + " valid sequences, and your groupfile contains " + toString(groupMap->getNumSeqs()) + ", please correct.");
201                         if (count < (groupMap->getNumSeqs())) { m->mothurOut(" Did you forget to include the name file?"); }
202                         m->mothurOutEndLine();
203                         m->control_pressed = true;
204                 }
205                 
206         }
207         catch(exception& e) {
208                 m->errorOut(e, "SequenceParser", "SequenceParser");
209                 exit(1);
210         }
211 }
212 /************************************************************/
213 SequenceParser::~SequenceParser(){ delete groupMap; }
214 /************************************************************/
215 int SequenceParser::getNumGroups(){ return groupMap->getNumGroups(); }
216 /************************************************************/
217 vector<string> SequenceParser::getNamesOfGroups(){ return groupMap->getNamesOfGroups(); }
218 /************************************************************/
219 bool SequenceParser::isValidGroup(string g){ return groupMap->isValidGroup(g); }
220 /************************************************************/
221 string SequenceParser::getGroup(string g){ return groupMap->getGroup(g); }
222 /************************************************************/
223 int SequenceParser::getNumSeqs(string g){ 
224         try {
225                 map<string, vector<Sequence> >::iterator it;
226                 int num = 0;
227                 
228                 it = seqs.find(g);
229                 if(it == seqs.end()) {
230                         m->mothurOut("[ERROR]: " + g + " is not a valid group, please correct."); m->mothurOutEndLine();
231                 }else {
232                         num = (it->second).size();
233                 }
234                 
235                 return num; 
236         }
237         catch(exception& e) {
238                 m->errorOut(e, "SequenceParser", "getNumSeqs");
239                 exit(1);
240         }
241 }
242 /************************************************************/
243 vector<Sequence> SequenceParser::getSeqs(string g){ 
244         try {
245                 map<string, vector<Sequence> >::iterator it;
246                 vector<Sequence> seqForThisGroup;
247                 
248                 it = seqs.find(g);
249                 if(it == seqs.end()) {
250                         m->mothurOut("[ERROR]: No sequences available for group " + g + ", please correct."); m->mothurOutEndLine();
251                 }else {
252                         seqForThisGroup = it->second;
253                 }
254                 
255                 return seqForThisGroup; 
256         }
257         catch(exception& e) {
258                 m->errorOut(e, "SequenceParser", "getSeqs");
259                 exit(1);
260         }
261 }
262 /************************************************************/
263 int SequenceParser::getSeqs(string g, string filename, bool uchimeFormat=false){ 
264         try {
265                 map<string, vector<Sequence> >::iterator it;
266                 vector<Sequence> seqForThisGroup;
267                 vector<seqPriorityNode> nameVector;
268                 
269                 it = seqs.find(g);
270                 if(it == seqs.end()) {
271                         m->mothurOut("[ERROR]: No sequences available for group " + g + ", please correct."); m->mothurOutEndLine();
272                 }else {
273                         
274                         ofstream out;
275                         m->openOutputFile(filename, out);
276                         
277                         seqForThisGroup = it->second;
278                         
279                         if (uchimeFormat) {
280                                 // format should look like 
281                                 //>seqName /ab=numRedundantSeqs/
282                                 //sequence
283                                 
284                                 map<string, string> nameMapForThisGroup = getNameMap(g);
285                                 map<string, string>::iterator itNameMap;
286                                 int error = 0;
287                                 
288                                 for (int i = 0; i < seqForThisGroup.size(); i++) {
289                                         itNameMap = nameMapForThisGroup.find(seqForThisGroup[i].getName());
290                                         
291                                         if (itNameMap == nameMapForThisGroup.end()){
292                                                 error = 1;
293                                                 m->mothurOut("[ERROR]: " + seqForThisGroup[i].getName() + " is in your fastafile, but is not in your namesfile, please correct."); m->mothurOutEndLine();
294                                         }else {
295                                                 int num = m->getNumNames(itNameMap->second);
296                                                 
297                                                 seqPriorityNode temp(num, seqForThisGroup[i].getAligned(), seqForThisGroup[i].getName());
298                                                 nameVector.push_back(temp);
299                                         }
300                                 }
301                                 
302                                 if (error == 1) { out.close(); m->mothurRemove(filename); return 1; }
303                                 
304                                 //sort by num represented
305                                 sort(nameVector.begin(), nameVector.end(), compareSeqPriorityNodes);
306
307                                 //print new file in order of
308                                 for (int i = 0; i < nameVector.size(); i++) {
309                                         
310                                         if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
311                                         
312                                         out << ">" << nameVector[i].name  << "/ab=" << nameVector[i].numIdentical << "/" << endl << nameVector[i].seq << endl;
313                                 }
314                                 
315                         }else { 
316                                 for (int i = 0; i < seqForThisGroup.size(); i++) {
317                                         
318                                         if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
319                                         
320                                         seqForThisGroup[i].printSequence(out);  
321                                 }
322                         }
323                         out.close();
324                 }
325                 
326                 return 0; 
327         }
328         catch(exception& e) {
329                 m->errorOut(e, "SequenceParser", "getSeqs");
330                 exit(1);
331         }
332 }
333
334 /************************************************************/
335 map<string, string> SequenceParser::getNameMap(string g){ 
336         try {
337                 map<string, map<string, string> >::iterator it;
338                 map<string, string> nameMapForThisGroup;
339                 
340                 it = nameMapPerGroup.find(g);
341                 if(it == nameMapPerGroup.end()) {
342                         m->mothurOut("[ERROR]: No nameMap available for group " + g + ", please correct."); m->mothurOutEndLine();
343                 }else {
344                         nameMapForThisGroup = it->second;
345                 }
346                 
347                 return nameMapForThisGroup; 
348         }
349         catch(exception& e) {
350                 m->errorOut(e, "SequenceParser", "getNameMap");
351                 exit(1);
352         }
353 }
354 /************************************************************/
355 int SequenceParser::getNameMap(string g, string filename){ 
356         try {
357                 map<string, map<string, string> >::iterator it;
358                 map<string, string> nameMapForThisGroup;
359                 
360                 it = nameMapPerGroup.find(g);
361                 if(it == nameMapPerGroup.end()) {
362                         m->mothurOut("[ERROR]: No nameMap available for group " + g + ", please correct."); m->mothurOutEndLine();
363                 }else {
364                         nameMapForThisGroup = it->second;
365                         
366                         ofstream out;
367                         m->openOutputFile(filename, out);
368                         
369                         for (map<string, string>::iterator itFile = nameMapForThisGroup.begin(); itFile != nameMapForThisGroup.end(); itFile++) {
370                                 
371                                 if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
372                                 
373                                 out << itFile->first << '\t' << itFile->second << endl;
374                         }
375                         
376                         out.close();
377                 }
378                 
379                 return 0; 
380         }
381         catch(exception& e) {
382                 m->errorOut(e, "SequenceParser", "getNameMap");
383                 exit(1);
384         }
385 }
386 /************************************************************/
387
388
389