]> git.donarmstrong.com Git - mothur.git/blob - sequenceparser.cpp
added load.logfile command. changed summary.single output for subsample=t.
[mothur.git] / sequenceparser.cpp
1 /*
2  *  sequenceParser.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 9/9/11.
6  *  Copyright 2011 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "sequenceparser.h"
11
12
13 /************************************************************/
14 SequenceParser::SequenceParser(string groupFile, string fastaFile, string nameFile) {
15         try {
16                 
17                 m = MothurOut::getInstance();
18                 int error;
19                 
20                 //read group file
21                 groupMap = new GroupMap(groupFile);
22                 error = groupMap->readMap();
23                 
24                 if (error == 1) { m->control_pressed = true; }
25                 
26                 //initialize maps
27                 vector<string> namesOfGroups = groupMap->getNamesOfGroups();
28                 for (int i = 0; i < namesOfGroups.size(); i++) {
29                         vector<Sequence> temp;
30                         map<string, string> tempMap;
31                         seqs[namesOfGroups[i]] = temp;
32                         nameMapPerGroup[namesOfGroups[i]] = tempMap;
33                 }
34                 
35                 //read fasta file making sure each sequence is in the group file
36                 ifstream in;
37                 m->openInputFile(fastaFile, in);
38                 
39                 map<string, string> seqName; //stores name -> sequence string so we can make new "unique" sequences when we parse the name file
40         int fastaCount = 0;
41                 while (!in.eof()) {
42                         
43                         if (m->control_pressed) { break; }
44                         
45                         Sequence seq(in); m->gobble(in);
46             fastaCount++;
47             if (m->debug) { if((fastaCount) % 1000 == 0){       m->mothurOut("[DEBUG]: reading seq " + toString(fastaCount) + "\n.");   } }
48                         
49         if (seq.getName() != "") {
50                                 
51                                  string group = groupMap->getGroup(seq.getName());
52                                  if (group == "not found") {  error = 1; m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your groupfile, please correct."); m->mothurOutEndLine();  }
53                                  else { 
54                                          seqs[group].push_back(seq);    
55                                          seqName[seq.getName()] = seq.getAligned();
56                                  }
57                         }
58                 }
59                 in.close();
60                                  
61                 if (error == 1) { m->control_pressed = true; }
62                                  
63                 //read name file
64                 ifstream inName;
65                 m->openInputFile(nameFile, inName);
66                 
67                 string first, second;
68                 int countName = 0;
69                 set<string> thisnames1;
70                 
71         string rest = "";
72         char buffer[4096];
73         bool pairDone = false;
74         bool columnOne = true;
75         string firstCol, secondCol;
76         
77                 while (!inName.eof()) {
78                         if (m->control_pressed) { break; }
79                         
80             inName.read(buffer, 4096);
81             vector<string> pieces = m->splitWhiteSpace(rest, buffer, inName.gcount());
82             
83             for (int i = 0; i < pieces.size(); i++) {
84                 if (columnOne) {  firstCol = pieces[i]; columnOne=false; }
85                 else  { secondCol = pieces[i]; pairDone = true; columnOne=true; }
86                 
87                 if (pairDone) { //save one line
88                     vector<string> names;
89                     m->splitAtChar(second, names, ',');
90                     
91                     //get aligned string for these seqs from the fasta file
92                     string alignedString = "";
93                     map<string, string>::iterator itAligned = seqName.find(names[0]);
94                     if (itAligned == seqName.end()) {
95                         error = 1; m->mothurOut("[ERROR]: " + names[0] + " is in your name file and not in your fasta file, please correct."); m->mothurOutEndLine();
96                     }else {
97                         alignedString = itAligned->second;
98                     }
99                     
100                     //separate by group - parse one line in name file
101                     map<string, string> splitMap; //group -> name1,name2,...
102                     map<string, string>::iterator it;
103                     for (int i = 0; i < names.size(); i++) {
104                         
105                         string group = groupMap->getGroup(names[i]);
106                         if (group == "not found") {  error = 1; m->mothurOut("[ERROR]: " + names[i] + " is in your name file and not in your groupfile, please correct."); m->mothurOutEndLine();  }
107                         else {  
108                             
109                             it = splitMap.find(group);
110                             if (it != splitMap.end()) { //adding seqs to this group
111                                 (it->second) += "," + names[i];
112                                 thisnames1.insert(names[i]);
113                                 countName++;
114                             }else { //first sighting of this group
115                                 splitMap[group] = names[i];
116                                 countName++;
117                                 thisnames1.insert(names[i]);
118                                 
119                                 //is this seq in the fasta file?
120                                 if (i != 0) { //if not then we need to add a duplicate sequence to the seqs for this group so the new "fasta" and "name" files will match
121                                     Sequence tempSeq(names[i], alignedString); //get the first guys sequence string since he's in the fasta file.
122                                     seqs[group].push_back(tempSeq);
123                                 }
124                             }
125                         }
126                         
127                         allSeqsMap[names[i]] = names[0];
128                     }
129                     
130                     
131                     //fill nameMapPerGroup - holds all lines in namefile separated by group
132                     for (it = splitMap.begin(); it != splitMap.end(); it++) {
133                         //grab first name
134                         string firstName = "";
135                         for(int i = 0; i < (it->second).length(); i++) {
136                             if (((it->second)[i]) != ',') {
137                                 firstName += ((it->second)[i]);
138                             }else { break; }
139                         }
140                         
141                         //group1 -> seq1 -> seq1,seq2,seq3
142                         nameMapPerGroup[it->first][firstName] = it->second;
143                     }
144
145                     pairDone = false; 
146                 }
147             }
148                 }
149                 inName.close();
150                 
151                 if (error == 1) { m->control_pressed = true; }
152                         
153                 if (countName != (groupMap->getNumSeqs())) {
154                         vector<string> groupseqsnames = groupMap->getNamesSeqs();
155                         
156                         for (int i = 0; i < groupseqsnames.size(); i++) {
157                                 set<string>::iterator itnamesfile = thisnames1.find(groupseqsnames[i]);
158                                 if (itnamesfile == thisnames1.end()){
159                                         cout << "missing name " + groupseqsnames[i] << '\t' << allSeqsMap[groupseqsnames[i]] << endl;
160                                 }
161                         }
162                         
163                         m->mothurOutEndLine();
164                         m->mothurOut("[ERROR]: Your name file contains " + toString(countName) + " valid sequences, and your groupfile contains " + toString(groupMap->getNumSeqs()) + ", please correct.");
165                         m->mothurOutEndLine();
166                         m->control_pressed = true;
167                 }
168                 
169         }
170         catch(exception& e) {
171                 m->errorOut(e, "SequenceParser", "SequenceParser");
172                 exit(1);
173         }
174 }
175 /************************************************************/
176 SequenceParser::SequenceParser(string groupFile, string fastaFile) {
177         try {
178                 
179                 m = MothurOut::getInstance();
180                 int error;
181                 
182                 //read group file
183                 groupMap = new GroupMap(groupFile);
184                 error = groupMap->readMap();
185                 
186                 if (error == 1) { m->control_pressed = true; }
187                 
188                 //initialize maps
189                 vector<string> namesOfGroups = groupMap->getNamesOfGroups();
190                 for (int i = 0; i < namesOfGroups.size(); i++) {
191                         vector<Sequence> temp;
192                         seqs[namesOfGroups[i]] = temp;
193                 }
194                 
195                 //read fasta file making sure each sequence is in the group file
196                 ifstream in;
197                 m->openInputFile(fastaFile, in);
198                 int count = 0;
199                 
200                 while (!in.eof()) {
201                         
202                         if (m->control_pressed) { break; }
203                         
204                         Sequence seq(in); m->gobble(in);
205                         
206                         if (seq.getName() != "") {
207                                 
208                                 string group = groupMap->getGroup(seq.getName());
209                                 if (group == "not found") {  error = 1; m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your groupfile, please correct."); m->mothurOutEndLine();  }
210                                 else {  seqs[group].push_back(seq);     count++; }
211                         }
212                 }
213                 in.close();
214                 
215                 if (error == 1) { m->control_pressed = true; }
216                 
217                 if (count != (groupMap->getNumSeqs())) {
218                         m->mothurOutEndLine();
219                         m->mothurOut("[ERROR]: Your fasta file contains " + toString(count) + " valid sequences, and your groupfile contains " + toString(groupMap->getNumSeqs()) + ", please correct.");
220                         if (count < (groupMap->getNumSeqs())) { m->mothurOut(" Did you forget to include the name file?"); }
221                         m->mothurOutEndLine();
222                         m->control_pressed = true;
223                 }
224                 
225         }
226         catch(exception& e) {
227                 m->errorOut(e, "SequenceParser", "SequenceParser");
228                 exit(1);
229         }
230 }
231 /************************************************************/
232 SequenceParser::~SequenceParser(){ delete groupMap; }
233 /************************************************************/
234 int SequenceParser::getNumGroups(){ return groupMap->getNumGroups(); }
235 /************************************************************/
236 vector<string> SequenceParser::getNamesOfGroups(){ return groupMap->getNamesOfGroups(); }
237 /************************************************************/
238 bool SequenceParser::isValidGroup(string g){ return groupMap->isValidGroup(g); }
239 /************************************************************/
240 string SequenceParser::getGroup(string g){ return groupMap->getGroup(g); }
241 /************************************************************/
242 int SequenceParser::getNumSeqs(string g){ 
243         try {
244                 map<string, vector<Sequence> >::iterator it;
245                 int num = 0;
246                 
247                 it = seqs.find(g);
248                 if(it == seqs.end()) {
249                         m->mothurOut("[ERROR]: " + g + " is not a valid group, please correct."); m->mothurOutEndLine();
250                 }else {
251                         num = (it->second).size();
252                 }
253                 
254                 return num; 
255         }
256         catch(exception& e) {
257                 m->errorOut(e, "SequenceParser", "getNumSeqs");
258                 exit(1);
259         }
260 }
261 /************************************************************/
262 vector<Sequence> SequenceParser::getSeqs(string g){ 
263         try {
264                 map<string, vector<Sequence> >::iterator it;
265                 vector<Sequence> seqForThisGroup;
266                 
267                 it = seqs.find(g);
268                 if(it == seqs.end()) {
269                         m->mothurOut("[ERROR]: No sequences available for group " + g + ", please correct."); m->mothurOutEndLine();
270                 }else {
271                         seqForThisGroup = it->second;
272             if (m->debug) {  m->mothurOut("[DEBUG]: group " + g + " fasta file has " + toString(seqForThisGroup.size()) + " sequences.");  }
273                 }
274                 
275                 return seqForThisGroup; 
276         }
277         catch(exception& e) {
278                 m->errorOut(e, "SequenceParser", "getSeqs");
279                 exit(1);
280         }
281 }
282 /************************************************************/
283 int SequenceParser::getSeqs(string g, string filename, bool uchimeFormat=false){ 
284         try {
285                 map<string, vector<Sequence> >::iterator it;
286                 vector<Sequence> seqForThisGroup;
287                 vector<seqPriorityNode> nameVector;
288                 
289                 it = seqs.find(g);
290                 if(it == seqs.end()) {
291                         m->mothurOut("[ERROR]: No sequences available for group " + g + ", please correct."); m->mothurOutEndLine();
292                 }else {
293                         
294                         ofstream out;
295                         m->openOutputFile(filename, out);
296                         
297                         seqForThisGroup = it->second;
298                         
299                         if (uchimeFormat) {
300                                 // format should look like 
301                                 //>seqName /ab=numRedundantSeqs/
302                                 //sequence
303                                 
304                                 map<string, string> nameMapForThisGroup = getNameMap(g);
305                                 map<string, string>::iterator itNameMap;
306                                 int error = 0;
307                                 
308                                 for (int i = 0; i < seqForThisGroup.size(); i++) {
309                                         itNameMap = nameMapForThisGroup.find(seqForThisGroup[i].getName());
310                                         
311                                         if (itNameMap == nameMapForThisGroup.end()){
312                                                 error = 1;
313                                                 m->mothurOut("[ERROR]: " + seqForThisGroup[i].getName() + " is in your fastafile, but is not in your namesfile, please correct."); m->mothurOutEndLine();
314                                         }else {
315                                                 int num = m->getNumNames(itNameMap->second);
316                                                 
317                                                 seqPriorityNode temp(num, seqForThisGroup[i].getAligned(), seqForThisGroup[i].getName());
318                                                 nameVector.push_back(temp);
319                                         }
320                                 }
321                                 
322                                 if (error == 1) { out.close(); m->mothurRemove(filename); return 1; }
323                                 
324                                 //sort by num represented
325                                 sort(nameVector.begin(), nameVector.end(), compareSeqPriorityNodes);
326
327                                 //print new file in order of
328                                 for (int i = 0; i < nameVector.size(); i++) {
329                                         
330                                         if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
331                                         
332                                         out << ">" << nameVector[i].name  << "/ab=" << nameVector[i].numIdentical << "/" << endl << nameVector[i].seq << endl;
333                                 }
334                                 
335                         }else { 
336                 //m->mothurOut("Group " + g +  " contains " + toString(seqForThisGroup.size()) + " unique seqs.\n");
337                                 for (int i = 0; i < seqForThisGroup.size(); i++) {
338                                         
339                                         if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
340                                         
341                                         seqForThisGroup[i].printSequence(out);  
342                                 }
343                         }
344                         out.close();
345                 }
346                 
347                 return 0; 
348         }
349         catch(exception& e) {
350                 m->errorOut(e, "SequenceParser", "getSeqs");
351                 exit(1);
352         }
353 }
354
355 /************************************************************/
356 map<string, string> SequenceParser::getNameMap(string g){ 
357         try {
358                 map<string, map<string, string> >::iterator it;
359                 map<string, string> nameMapForThisGroup;
360                 
361                 it = nameMapPerGroup.find(g);
362                 if(it == nameMapPerGroup.end()) {
363                         m->mothurOut("[ERROR]: No nameMap available for group " + g + ", please correct."); m->mothurOutEndLine();
364                 }else {
365                         nameMapForThisGroup = it->second;
366             if (m->debug) {  m->mothurOut("[DEBUG]: group " + g + " name file has " + toString(nameMapForThisGroup.size()) + " unique sequences.");  }
367                 }
368                 
369                 return nameMapForThisGroup; 
370         }
371         catch(exception& e) {
372                 m->errorOut(e, "SequenceParser", "getNameMap");
373                 exit(1);
374         }
375 }
376 /************************************************************/
377 int SequenceParser::getNameMap(string g, string filename){ 
378         try {
379                 map<string, map<string, string> >::iterator it;
380                 map<string, string> nameMapForThisGroup;
381                 
382                 it = nameMapPerGroup.find(g);
383                 if(it == nameMapPerGroup.end()) {
384                         m->mothurOut("[ERROR]: No nameMap available for group " + g + ", please correct."); m->mothurOutEndLine();
385                 }else {
386                         nameMapForThisGroup = it->second;
387                         
388                         ofstream out;
389                         m->openOutputFile(filename, out);
390                         
391                         for (map<string, string>::iterator itFile = nameMapForThisGroup.begin(); itFile != nameMapForThisGroup.end(); itFile++) {
392                                 
393                                 if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
394                                 
395                                 out << itFile->first << '\t' << itFile->second << endl;
396                         }
397                         
398                         out.close();
399                 }
400                 
401                 return 0; 
402         }
403         catch(exception& e) {
404                 m->errorOut(e, "SequenceParser", "getNameMap");
405                 exit(1);
406         }
407 }
408 /************************************************************/
409
410
411