]> git.donarmstrong.com Git - mothur.git/blob - sequenceparser.cpp
changes while testing 1.27
[mothur.git] / sequenceparser.cpp
1 /*
2  *  sequenceParser.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 9/9/11.
6  *  Copyright 2011 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "sequenceparser.h"
11
12
13 /************************************************************/
14 SequenceParser::SequenceParser(string groupFile, string fastaFile, string nameFile) {
15         try {
16                 
17                 m = MothurOut::getInstance();
18                 int error;
19                 
20                 //read group file
21                 groupMap = new GroupMap(groupFile);
22                 error = groupMap->readMap();
23                 
24                 if (error == 1) { m->control_pressed = true; }
25                 
26                 //initialize maps
27                 vector<string> namesOfGroups = groupMap->getNamesOfGroups();
28                 for (int i = 0; i < namesOfGroups.size(); i++) {
29                         vector<Sequence> temp;
30                         map<string, string> tempMap;
31                         seqs[namesOfGroups[i]] = temp;
32                         nameMapPerGroup[namesOfGroups[i]] = tempMap;
33                 }
34                 
35                 //read fasta file making sure each sequence is in the group file
36                 ifstream in;
37                 m->openInputFile(fastaFile, in);
38                 
39                 map<string, string> seqName; //stores name -> sequence string so we can make new "unique" sequences when we parse the name file
40         int fastaCount = 0;
41                 while (!in.eof()) {
42                         
43                         if (m->control_pressed) { break; }
44                         
45                         Sequence seq(in); m->gobble(in);
46             fastaCount++;
47             if (m->debug) { if((fastaCount) % 1000 == 0){       m->mothurOut("[DEBUG]: reading seq " + toString(fastaCount) + "\n.");   } }
48                         
49         if (seq.getName() != "") {
50                                 
51                                  string group = groupMap->getGroup(seq.getName());
52                                  if (group == "not found") {  error = 1; m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your groupfile, please correct."); m->mothurOutEndLine();  }
53                                  else { 
54                                          seqs[group].push_back(seq);    
55                                          seqName[seq.getName()] = seq.getAligned();
56                                  }
57                         }
58                 }
59                 in.close();
60                                  
61                 if (error == 1) { m->control_pressed = true; }
62                 
63                 //read name file
64                 ifstream inName;
65                 m->openInputFile(nameFile, inName);
66                 
67                 //string first, second;
68                 int countName = 0;
69                 set<string> thisnames1;
70                 
71         string rest = "";
72         char buffer[4096];
73         bool pairDone = false;
74         bool columnOne = true;
75         string firstCol, secondCol;
76         
77                 while (!inName.eof()) {
78                         if (m->control_pressed) { break; }
79                         
80             inName.read(buffer, 4096);
81             vector<string> pieces = m->splitWhiteSpace(rest, buffer, inName.gcount());
82             
83             for (int i = 0; i < pieces.size(); i++) {
84                 if (columnOne) {  firstCol = pieces[i]; columnOne=false; }
85                 else  { secondCol = pieces[i]; pairDone = true; columnOne=true; }
86                 
87                 if (pairDone) { //save one line
88                     if (m->debug) { m->mothurOut("[DEBUG]: reading names: " + firstCol + '\t' + secondCol + ".\n"); }
89                     vector<string> names;
90                     m->splitAtChar(secondCol, names, ',');
91                     
92                     //get aligned string for these seqs from the fasta file
93                     string alignedString = "";
94                     map<string, string>::iterator itAligned = seqName.find(names[0]);
95                     if (itAligned == seqName.end()) {
96                         error = 1; m->mothurOut("[ERROR]: " + names[0] + " is in your name file and not in your fasta file, please correct."); m->mothurOutEndLine();
97                     }else {
98                         alignedString = itAligned->second;
99                     }
100                     
101                     //separate by group - parse one line in name file
102                     map<string, string> splitMap; //group -> name1,name2,...
103                     map<string, string>::iterator it;
104                     for (int i = 0; i < names.size(); i++) {
105                         
106                         string group = groupMap->getGroup(names[i]);
107                         if (group == "not found") {  error = 1; m->mothurOut("[ERROR]: " + names[i] + " is in your name file and not in your groupfile, please correct."); m->mothurOutEndLine();  }
108                         else {  
109                             
110                             it = splitMap.find(group);
111                             if (it != splitMap.end()) { //adding seqs to this group
112                                 (it->second) += "," + names[i];
113                                 thisnames1.insert(names[i]);
114                                 countName++;
115                             }else { //first sighting of this group
116                                 splitMap[group] = names[i];
117                                 countName++;
118                                 thisnames1.insert(names[i]);
119                                 
120                                 //is this seq in the fasta file?
121                                 if (i != 0) { //if not then we need to add a duplicate sequence to the seqs for this group so the new "fasta" and "name" files will match
122                                     Sequence tempSeq(names[i], alignedString); //get the first guys sequence string since he's in the fasta file.
123                                     seqs[group].push_back(tempSeq);
124                                 }
125                             }
126                         }
127                         
128                         allSeqsMap[names[i]] = names[0];
129                     }
130                     
131                     
132                     //fill nameMapPerGroup - holds all lines in namefile separated by group
133                     for (it = splitMap.begin(); it != splitMap.end(); it++) {
134                         //grab first name
135                         string firstName = "";
136                         for(int i = 0; i < (it->second).length(); i++) {
137                             if (((it->second)[i]) != ',') {
138                                 firstName += ((it->second)[i]);
139                             }else { break; }
140                         }
141                         
142                         //group1 -> seq1 -> seq1,seq2,seq3
143                         nameMapPerGroup[it->first][firstName] = it->second;
144                     }
145
146                     pairDone = false; 
147                 }
148             }
149                 }
150                 inName.close();
151         
152         //in case file does not end in white space
153         if (rest != "") {
154             vector<string> pieces = m->splitWhiteSpace(rest);
155             
156             for (int i = 0; i < pieces.size(); i++) {
157                 if (columnOne) {  firstCol = pieces[i]; columnOne=false; }
158                 else  { secondCol = pieces[i]; pairDone = true; columnOne=true; }
159                 
160                 if (pairDone) { //save one line
161                     if (m->debug) { m->mothurOut("[DEBUG]: reading names: " + firstCol + '\t' + secondCol + ".\n"); }
162                     vector<string> names;
163                     m->splitAtChar(secondCol, names, ',');
164                     
165                     //get aligned string for these seqs from the fasta file
166                     string alignedString = "";
167                     map<string, string>::iterator itAligned = seqName.find(names[0]);
168                     if (itAligned == seqName.end()) {
169                         error = 1; m->mothurOut("[ERROR]: " + names[0] + " is in your name file and not in your fasta file, please correct."); m->mothurOutEndLine();
170                     }else {
171                         alignedString = itAligned->second;
172                     }
173                     
174                     //separate by group - parse one line in name file
175                     map<string, string> splitMap; //group -> name1,name2,...
176                     map<string, string>::iterator it;
177                     for (int i = 0; i < names.size(); i++) {
178                         
179                         string group = groupMap->getGroup(names[i]);
180                         if (group == "not found") {  error = 1; m->mothurOut("[ERROR]: " + names[i] + " is in your name file and not in your groupfile, please correct."); m->mothurOutEndLine();  }
181                         else {  
182                             
183                             it = splitMap.find(group);
184                             if (it != splitMap.end()) { //adding seqs to this group
185                                 (it->second) += "," + names[i];
186                                 thisnames1.insert(names[i]);
187                                 countName++;
188                             }else { //first sighting of this group
189                                 splitMap[group] = names[i];
190                                 countName++;
191                                 thisnames1.insert(names[i]);
192                                 
193                                 //is this seq in the fasta file?
194                                 if (i != 0) { //if not then we need to add a duplicate sequence to the seqs for this group so the new "fasta" and "name" files will match
195                                     Sequence tempSeq(names[i], alignedString); //get the first guys sequence string since he's in the fasta file.
196                                     seqs[group].push_back(tempSeq);
197                                 }
198                             }
199                         }
200                         
201                         allSeqsMap[names[i]] = names[0];
202                     }
203                     
204                     
205                     //fill nameMapPerGroup - holds all lines in namefile separated by group
206                     for (it = splitMap.begin(); it != splitMap.end(); it++) {
207                         //grab first name
208                         string firstName = "";
209                         for(int i = 0; i < (it->second).length(); i++) {
210                             if (((it->second)[i]) != ',') {
211                                 firstName += ((it->second)[i]);
212                             }else { break; }
213                         }
214                         
215                         //group1 -> seq1 -> seq1,seq2,seq3
216                         nameMapPerGroup[it->first][firstName] = it->second;
217                     }
218                     
219                     pairDone = false; 
220                 }
221             }
222         }
223                 
224                 if (error == 1) { m->control_pressed = true; }
225                         
226                 if (countName != (groupMap->getNumSeqs())) {
227                         vector<string> groupseqsnames = groupMap->getNamesSeqs();
228                         
229                         for (int i = 0; i < groupseqsnames.size(); i++) {
230                                 set<string>::iterator itnamesfile = thisnames1.find(groupseqsnames[i]);
231                                 if (itnamesfile == thisnames1.end()){
232                                         cout << "missing name " + groupseqsnames[i] << '\t' << allSeqsMap[groupseqsnames[i]] << endl;
233                                 }
234                         }
235                         
236                         m->mothurOutEndLine();
237                         m->mothurOut("[ERROR]: Your name file contains " + toString(countName) + " valid sequences, and your groupfile contains " + toString(groupMap->getNumSeqs()) + ", please correct.");
238                         m->mothurOutEndLine();
239                         m->control_pressed = true;
240                 }
241                 
242         }
243         catch(exception& e) {
244                 m->errorOut(e, "SequenceParser", "SequenceParser");
245                 exit(1);
246         }
247 }
248 /************************************************************/
249 SequenceParser::SequenceParser(string groupFile, string fastaFile) {
250         try {
251                 
252                 m = MothurOut::getInstance();
253                 int error;
254                 
255                 //read group file
256                 groupMap = new GroupMap(groupFile);
257                 error = groupMap->readMap();
258                 
259                 if (error == 1) { m->control_pressed = true; }
260                 
261                 //initialize maps
262                 vector<string> namesOfGroups = groupMap->getNamesOfGroups();
263                 for (int i = 0; i < namesOfGroups.size(); i++) {
264                         vector<Sequence> temp;
265                         seqs[namesOfGroups[i]] = temp;
266                 }
267                 
268                 //read fasta file making sure each sequence is in the group file
269                 ifstream in;
270                 m->openInputFile(fastaFile, in);
271                 int count = 0;
272                 
273                 while (!in.eof()) {
274                         
275                         if (m->control_pressed) { break; }
276                         
277                         Sequence seq(in); m->gobble(in);
278                         
279                         if (seq.getName() != "") {
280                                 
281                                 string group = groupMap->getGroup(seq.getName());
282                                 if (group == "not found") {  error = 1; m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your groupfile, please correct."); m->mothurOutEndLine();  }
283                                 else {  seqs[group].push_back(seq);     count++; }
284                         }
285                 }
286                 in.close();
287                 
288                 if (error == 1) { m->control_pressed = true; }
289                 
290                 if (count != (groupMap->getNumSeqs())) {
291                         m->mothurOutEndLine();
292                         m->mothurOut("[ERROR]: Your fasta file contains " + toString(count) + " valid sequences, and your groupfile contains " + toString(groupMap->getNumSeqs()) + ", please correct.");
293                         if (count < (groupMap->getNumSeqs())) { m->mothurOut(" Did you forget to include the name file?"); }
294                         m->mothurOutEndLine();
295                         m->control_pressed = true;
296                 }
297                 
298         }
299         catch(exception& e) {
300                 m->errorOut(e, "SequenceParser", "SequenceParser");
301                 exit(1);
302         }
303 }
304 /************************************************************/
305 SequenceParser::~SequenceParser(){ delete groupMap; }
306 /************************************************************/
307 int SequenceParser::getNumGroups(){ return groupMap->getNumGroups(); }
308 /************************************************************/
309 vector<string> SequenceParser::getNamesOfGroups(){ return groupMap->getNamesOfGroups(); }
310 /************************************************************/
311 bool SequenceParser::isValidGroup(string g){ return groupMap->isValidGroup(g); }
312 /************************************************************/
313 string SequenceParser::getGroup(string g){ return groupMap->getGroup(g); }
314 /************************************************************/
315 int SequenceParser::getNumSeqs(string g){ 
316         try {
317                 map<string, vector<Sequence> >::iterator it;
318                 int num = 0;
319                 
320                 it = seqs.find(g);
321                 if(it == seqs.end()) {
322                         m->mothurOut("[ERROR]: " + g + " is not a valid group, please correct."); m->mothurOutEndLine();
323                 }else {
324                         num = (it->second).size();
325                 }
326                 
327                 return num; 
328         }
329         catch(exception& e) {
330                 m->errorOut(e, "SequenceParser", "getNumSeqs");
331                 exit(1);
332         }
333 }
334 /************************************************************/
335 vector<Sequence> SequenceParser::getSeqs(string g){ 
336         try {
337                 map<string, vector<Sequence> >::iterator it;
338                 vector<Sequence> seqForThisGroup;
339                 
340                 it = seqs.find(g);
341                 if(it == seqs.end()) {
342                         m->mothurOut("[ERROR]: No sequences available for group " + g + ", please correct."); m->mothurOutEndLine();
343                 }else {
344                         seqForThisGroup = it->second;
345             if (m->debug) {  m->mothurOut("[DEBUG]: group " + g + " fasta file has " + toString(seqForThisGroup.size()) + " sequences.");  }
346                 }
347                 
348                 return seqForThisGroup; 
349         }
350         catch(exception& e) {
351                 m->errorOut(e, "SequenceParser", "getSeqs");
352                 exit(1);
353         }
354 }
355 /************************************************************/
356 int SequenceParser::getSeqs(string g, string filename, bool uchimeFormat=false){ 
357         try {
358                 map<string, vector<Sequence> >::iterator it;
359                 vector<Sequence> seqForThisGroup;
360                 vector<seqPriorityNode> nameVector;
361                 
362                 it = seqs.find(g);
363                 if(it == seqs.end()) {
364                         m->mothurOut("[ERROR]: No sequences available for group " + g + ", please correct."); m->mothurOutEndLine();
365                 }else {
366                         
367                         ofstream out;
368                         m->openOutputFile(filename, out);
369                         
370                         seqForThisGroup = it->second;
371                         
372                         if (uchimeFormat) {
373                                 // format should look like 
374                                 //>seqName /ab=numRedundantSeqs/
375                                 //sequence
376                                 
377                                 map<string, string> nameMapForThisGroup = getNameMap(g);
378                                 map<string, string>::iterator itNameMap;
379                                 int error = 0;
380                                 
381                                 for (int i = 0; i < seqForThisGroup.size(); i++) {
382                                         itNameMap = nameMapForThisGroup.find(seqForThisGroup[i].getName());
383                                         
384                                         if (itNameMap == nameMapForThisGroup.end()){
385                                                 error = 1;
386                                                 m->mothurOut("[ERROR]: " + seqForThisGroup[i].getName() + " is in your fastafile, but is not in your namesfile, please correct."); m->mothurOutEndLine();
387                                         }else {
388                                                 int num = m->getNumNames(itNameMap->second);
389                                                 
390                                                 seqPriorityNode temp(num, seqForThisGroup[i].getAligned(), seqForThisGroup[i].getName());
391                                                 nameVector.push_back(temp);
392                                         }
393                                 }
394                                 
395                                 if (error == 1) { out.close(); m->mothurRemove(filename); return 1; }
396                                 
397                                 //sort by num represented
398                                 sort(nameVector.begin(), nameVector.end(), compareSeqPriorityNodes);
399
400                                 //print new file in order of
401                                 for (int i = 0; i < nameVector.size(); i++) {
402                                         
403                                         if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
404                                         
405                                         out << ">" << nameVector[i].name  << "/ab=" << nameVector[i].numIdentical << "/" << endl << nameVector[i].seq << endl;
406                                 }
407                                 
408                         }else { 
409                 //m->mothurOut("Group " + g +  " contains " + toString(seqForThisGroup.size()) + " unique seqs.\n");
410                                 for (int i = 0; i < seqForThisGroup.size(); i++) {
411                                         
412                                         if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
413                                         
414                                         seqForThisGroup[i].printSequence(out);  
415                                 }
416                         }
417                         out.close();
418                 }
419                 
420                 return 0; 
421         }
422         catch(exception& e) {
423                 m->errorOut(e, "SequenceParser", "getSeqs");
424                 exit(1);
425         }
426 }
427
428 /************************************************************/
429 map<string, string> SequenceParser::getNameMap(string g){ 
430         try {
431                 map<string, map<string, string> >::iterator it;
432                 map<string, string> nameMapForThisGroup;
433                 
434                 it = nameMapPerGroup.find(g);
435                 if(it == nameMapPerGroup.end()) {
436                         m->mothurOut("[ERROR]: No nameMap available for group " + g + ", please correct."); m->mothurOutEndLine();
437                 }else {
438                         nameMapForThisGroup = it->second;
439             if (m->debug) {  m->mothurOut("[DEBUG]: group " + g + " name file has " + toString(nameMapForThisGroup.size()) + " unique sequences.");  }
440                 }
441                 
442                 return nameMapForThisGroup; 
443         }
444         catch(exception& e) {
445                 m->errorOut(e, "SequenceParser", "getNameMap");
446                 exit(1);
447         }
448 }
449 /************************************************************/
450 int SequenceParser::getNameMap(string g, string filename){ 
451         try {
452                 map<string, map<string, string> >::iterator it;
453                 map<string, string> nameMapForThisGroup;
454                 
455                 it = nameMapPerGroup.find(g);
456                 if(it == nameMapPerGroup.end()) {
457                         m->mothurOut("[ERROR]: No nameMap available for group " + g + ", please correct."); m->mothurOutEndLine();
458                 }else {
459                         nameMapForThisGroup = it->second;
460                         
461                         ofstream out;
462                         m->openOutputFile(filename, out);
463                         
464                         for (map<string, string>::iterator itFile = nameMapForThisGroup.begin(); itFile != nameMapForThisGroup.end(); itFile++) {
465                                 
466                                 if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
467                                 
468                                 out << itFile->first << '\t' << itFile->second << endl;
469                         }
470                         
471                         out.close();
472                 }
473                 
474                 return 0; 
475         }
476         catch(exception& e) {
477                 m->errorOut(e, "SequenceParser", "getNameMap");
478                 exit(1);
479         }
480 }
481 /************************************************************/
482
483
484