]> git.donarmstrong.com Git - mothur.git/blob - sequenceparser.cpp
fixed bug with dist.shared subsampling. added mode parameter to dist.shared so...
[mothur.git] / sequenceparser.cpp
1 /*
2  *  sequenceParser.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 9/9/11.
6  *  Copyright 2011 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "sequenceparser.h"
11
12
13 /************************************************************/
14 SequenceParser::SequenceParser(string groupFile, string fastaFile, string nameFile) {
15         try {
16                 
17                 m = MothurOut::getInstance();
18                 int error;
19                 
20                 //read group file
21                 groupMap = new GroupMap(groupFile);
22                 error = groupMap->readMap();
23                 
24                 if (error == 1) { m->control_pressed = true; }
25                 
26                 //initialize maps
27                 vector<string> namesOfGroups = groupMap->getNamesOfGroups();
28                 for (int i = 0; i < namesOfGroups.size(); i++) {
29                         vector<Sequence> temp;
30                         map<string, string> tempMap;
31                         seqs[namesOfGroups[i]] = temp;
32                         nameMapPerGroup[namesOfGroups[i]] = tempMap;
33                 }
34                 
35                 //read fasta file making sure each sequence is in the group file
36                 ifstream in;
37                 m->openInputFile(fastaFile, in);
38                 
39                 map<string, string> seqName; //stores name -> sequence string so we can make new "unique" sequences when we parse the name file
40         int fastaCount = 0;
41                 while (!in.eof()) {
42                         
43                         if (m->control_pressed) { break; }
44                         
45                         Sequence seq(in); m->gobble(in);
46             fastaCount++;
47             if (m->debug) { if((fastaCount) % 1000 == 0){       m->mothurOut("[DEBUG]: reading seq " + toString(fastaCount) + "\n.");   } }
48                         
49         if (seq.getName() != "") {
50                                 
51                                  string group = groupMap->getGroup(seq.getName());
52                                  if (group == "not found") {  error = 1; m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your groupfile, please correct."); m->mothurOutEndLine();  }
53                                  else { 
54                                          seqs[group].push_back(seq);    
55                                          seqName[seq.getName()] = seq.getAligned();
56                                  }
57                         }
58                 }
59                 in.close();
60                                  
61                 if (error == 1) { m->control_pressed = true; }
62                                  
63                 //read name file
64                 ifstream inName;
65                 m->openInputFile(nameFile, inName);
66                 
67                 string first, second;
68                 int countName = 0;
69                 set<string> thisnames1;
70                 
71                 while(!inName.eof()) {
72                         
73                         if (m->control_pressed) { break; }
74                         
75                         inName >> first; m->gobble(inName);
76                         inName >> second; m->gobble(inName);
77                         
78                         vector<string> names;
79                         m->splitAtChar(second, names, ',');
80                         
81                         //get aligned string for these seqs from the fasta file
82                         string alignedString = "";
83                         map<string, string>::iterator itAligned = seqName.find(names[0]);
84                         if (itAligned == seqName.end()) {
85                                 error = 1; m->mothurOut("[ERROR]: " + names[0] + " is in your name file and not in your fasta file, please correct."); m->mothurOutEndLine();
86                         }else {
87                                 alignedString = itAligned->second;
88                         }
89                         
90                         //separate by group - parse one line in name file
91                         map<string, string> splitMap; //group -> name1,name2,...
92                         map<string, string>::iterator it;
93                         for (int i = 0; i < names.size(); i++) {
94                                 
95                                 string group = groupMap->getGroup(names[i]);
96                                 if (group == "not found") {  error = 1; m->mothurOut("[ERROR]: " + names[i] + " is in your name file and not in your groupfile, please correct."); m->mothurOutEndLine();  }
97                                 else {  
98                                         
99                                         it = splitMap.find(group);
100                                         if (it != splitMap.end()) { //adding seqs to this group
101                                                 (it->second) += "," + names[i];
102                                                 thisnames1.insert(names[i]);
103                                                 countName++;
104                                         }else { //first sighting of this group
105                                                 splitMap[group] = names[i];
106                                                 countName++;
107                                                 thisnames1.insert(names[i]);
108                                                 
109                                                 //is this seq in the fasta file?
110                                                 if (i != 0) { //if not then we need to add a duplicate sequence to the seqs for this group so the new "fasta" and "name" files will match
111                                                         Sequence tempSeq(names[i], alignedString); //get the first guys sequence string since he's in the fasta file.
112                                                         seqs[group].push_back(tempSeq);
113                                                 }
114                                         }
115                                 }
116                                 
117                                 allSeqsMap[names[i]] = names[0];
118                         }
119                         
120                         
121                         //fill nameMapPerGroup - holds all lines in namefile separated by group
122                         for (it = splitMap.begin(); it != splitMap.end(); it++) {
123                                 //grab first name
124                                 string firstName = "";
125                                 for(int i = 0; i < (it->second).length(); i++) {
126                                         if (((it->second)[i]) != ',') {
127                                                 firstName += ((it->second)[i]);
128                                         }else { break; }
129                                 }
130                                 
131                                 //group1 -> seq1 -> seq1,seq2,seq3
132                                 nameMapPerGroup[it->first][firstName] = it->second;
133                         }
134                 }
135                 
136                 inName.close();
137                 
138                 if (error == 1) { m->control_pressed = true; }
139                         
140                 if (countName != (groupMap->getNumSeqs())) {
141                         vector<string> groupseqsnames = groupMap->getNamesSeqs();
142                         
143                         for (int i = 0; i < groupseqsnames.size(); i++) {
144                                 set<string>::iterator itnamesfile = thisnames1.find(groupseqsnames[i]);
145                                 if (itnamesfile == thisnames1.end()){
146                                         cout << "missing name " + groupseqsnames[i] << '\t' << allSeqsMap[groupseqsnames[i]] << endl;
147                                 }
148                         }
149                         
150                         m->mothurOutEndLine();
151                         m->mothurOut("[ERROR]: Your name file contains " + toString(countName) + " valid sequences, and your groupfile contains " + toString(groupMap->getNumSeqs()) + ", please correct.");
152                         m->mothurOutEndLine();
153                         m->control_pressed = true;
154                 }
155                 
156         }
157         catch(exception& e) {
158                 m->errorOut(e, "SequenceParser", "SequenceParser");
159                 exit(1);
160         }
161 }
162 /************************************************************/
163 SequenceParser::SequenceParser(string groupFile, string fastaFile) {
164         try {
165                 
166                 m = MothurOut::getInstance();
167                 int error;
168                 
169                 //read group file
170                 groupMap = new GroupMap(groupFile);
171                 error = groupMap->readMap();
172                 
173                 if (error == 1) { m->control_pressed = true; }
174                 
175                 //initialize maps
176                 vector<string> namesOfGroups = groupMap->getNamesOfGroups();
177                 for (int i = 0; i < namesOfGroups.size(); i++) {
178                         vector<Sequence> temp;
179                         seqs[namesOfGroups[i]] = temp;
180                 }
181                 
182                 //read fasta file making sure each sequence is in the group file
183                 ifstream in;
184                 m->openInputFile(fastaFile, in);
185                 int count = 0;
186                 
187                 while (!in.eof()) {
188                         
189                         if (m->control_pressed) { break; }
190                         
191                         Sequence seq(in); m->gobble(in);
192                         
193                         if (seq.getName() != "") {
194                                 
195                                 string group = groupMap->getGroup(seq.getName());
196                                 if (group == "not found") {  error = 1; m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your groupfile, please correct."); m->mothurOutEndLine();  }
197                                 else {  seqs[group].push_back(seq);     count++; }
198                         }
199                 }
200                 in.close();
201                 
202                 if (error == 1) { m->control_pressed = true; }
203                 
204                 if (count != (groupMap->getNumSeqs())) {
205                         m->mothurOutEndLine();
206                         m->mothurOut("[ERROR]: Your fasta file contains " + toString(count) + " valid sequences, and your groupfile contains " + toString(groupMap->getNumSeqs()) + ", please correct.");
207                         if (count < (groupMap->getNumSeqs())) { m->mothurOut(" Did you forget to include the name file?"); }
208                         m->mothurOutEndLine();
209                         m->control_pressed = true;
210                 }
211                 
212         }
213         catch(exception& e) {
214                 m->errorOut(e, "SequenceParser", "SequenceParser");
215                 exit(1);
216         }
217 }
218 /************************************************************/
219 SequenceParser::~SequenceParser(){ delete groupMap; }
220 /************************************************************/
221 int SequenceParser::getNumGroups(){ return groupMap->getNumGroups(); }
222 /************************************************************/
223 vector<string> SequenceParser::getNamesOfGroups(){ return groupMap->getNamesOfGroups(); }
224 /************************************************************/
225 bool SequenceParser::isValidGroup(string g){ return groupMap->isValidGroup(g); }
226 /************************************************************/
227 string SequenceParser::getGroup(string g){ return groupMap->getGroup(g); }
228 /************************************************************/
229 int SequenceParser::getNumSeqs(string g){ 
230         try {
231                 map<string, vector<Sequence> >::iterator it;
232                 int num = 0;
233                 
234                 it = seqs.find(g);
235                 if(it == seqs.end()) {
236                         m->mothurOut("[ERROR]: " + g + " is not a valid group, please correct."); m->mothurOutEndLine();
237                 }else {
238                         num = (it->second).size();
239                 }
240                 
241                 return num; 
242         }
243         catch(exception& e) {
244                 m->errorOut(e, "SequenceParser", "getNumSeqs");
245                 exit(1);
246         }
247 }
248 /************************************************************/
249 vector<Sequence> SequenceParser::getSeqs(string g){ 
250         try {
251                 map<string, vector<Sequence> >::iterator it;
252                 vector<Sequence> seqForThisGroup;
253                 
254                 it = seqs.find(g);
255                 if(it == seqs.end()) {
256                         m->mothurOut("[ERROR]: No sequences available for group " + g + ", please correct."); m->mothurOutEndLine();
257                 }else {
258                         seqForThisGroup = it->second;
259             if (m->debug) {  m->mothurOut("[DEBUG]: group " + g + " fasta file has " + toString(seqForThisGroup.size()) + " sequences.");  }
260                 }
261                 
262                 return seqForThisGroup; 
263         }
264         catch(exception& e) {
265                 m->errorOut(e, "SequenceParser", "getSeqs");
266                 exit(1);
267         }
268 }
269 /************************************************************/
270 int SequenceParser::getSeqs(string g, string filename, bool uchimeFormat=false){ 
271         try {
272                 map<string, vector<Sequence> >::iterator it;
273                 vector<Sequence> seqForThisGroup;
274                 vector<seqPriorityNode> nameVector;
275                 
276                 it = seqs.find(g);
277                 if(it == seqs.end()) {
278                         m->mothurOut("[ERROR]: No sequences available for group " + g + ", please correct."); m->mothurOutEndLine();
279                 }else {
280                         
281                         ofstream out;
282                         m->openOutputFile(filename, out);
283                         
284                         seqForThisGroup = it->second;
285                         
286                         if (uchimeFormat) {
287                                 // format should look like 
288                                 //>seqName /ab=numRedundantSeqs/
289                                 //sequence
290                                 
291                                 map<string, string> nameMapForThisGroup = getNameMap(g);
292                                 map<string, string>::iterator itNameMap;
293                                 int error = 0;
294                                 
295                                 for (int i = 0; i < seqForThisGroup.size(); i++) {
296                                         itNameMap = nameMapForThisGroup.find(seqForThisGroup[i].getName());
297                                         
298                                         if (itNameMap == nameMapForThisGroup.end()){
299                                                 error = 1;
300                                                 m->mothurOut("[ERROR]: " + seqForThisGroup[i].getName() + " is in your fastafile, but is not in your namesfile, please correct."); m->mothurOutEndLine();
301                                         }else {
302                                                 int num = m->getNumNames(itNameMap->second);
303                                                 
304                                                 seqPriorityNode temp(num, seqForThisGroup[i].getAligned(), seqForThisGroup[i].getName());
305                                                 nameVector.push_back(temp);
306                                         }
307                                 }
308                                 
309                                 if (error == 1) { out.close(); m->mothurRemove(filename); return 1; }
310                                 
311                                 //sort by num represented
312                                 sort(nameVector.begin(), nameVector.end(), compareSeqPriorityNodes);
313
314                                 //print new file in order of
315                                 for (int i = 0; i < nameVector.size(); i++) {
316                                         
317                                         if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
318                                         
319                                         out << ">" << nameVector[i].name  << "/ab=" << nameVector[i].numIdentical << "/" << endl << nameVector[i].seq << endl;
320                                 }
321                                 
322                         }else { 
323                 //m->mothurOut("Group " + g +  " contains " + toString(seqForThisGroup.size()) + " unique seqs.\n");
324                                 for (int i = 0; i < seqForThisGroup.size(); i++) {
325                                         
326                                         if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
327                                         
328                                         seqForThisGroup[i].printSequence(out);  
329                                 }
330                         }
331                         out.close();
332                 }
333                 
334                 return 0; 
335         }
336         catch(exception& e) {
337                 m->errorOut(e, "SequenceParser", "getSeqs");
338                 exit(1);
339         }
340 }
341
342 /************************************************************/
343 map<string, string> SequenceParser::getNameMap(string g){ 
344         try {
345                 map<string, map<string, string> >::iterator it;
346                 map<string, string> nameMapForThisGroup;
347                 
348                 it = nameMapPerGroup.find(g);
349                 if(it == nameMapPerGroup.end()) {
350                         m->mothurOut("[ERROR]: No nameMap available for group " + g + ", please correct."); m->mothurOutEndLine();
351                 }else {
352                         nameMapForThisGroup = it->second;
353             if (m->debug) {  m->mothurOut("[DEBUG]: group " + g + " name file has " + toString(nameMapForThisGroup.size()) + " unique sequences.");  }
354                 }
355                 
356                 return nameMapForThisGroup; 
357         }
358         catch(exception& e) {
359                 m->errorOut(e, "SequenceParser", "getNameMap");
360                 exit(1);
361         }
362 }
363 /************************************************************/
364 int SequenceParser::getNameMap(string g, string filename){ 
365         try {
366                 map<string, map<string, string> >::iterator it;
367                 map<string, string> nameMapForThisGroup;
368                 
369                 it = nameMapPerGroup.find(g);
370                 if(it == nameMapPerGroup.end()) {
371                         m->mothurOut("[ERROR]: No nameMap available for group " + g + ", please correct."); m->mothurOutEndLine();
372                 }else {
373                         nameMapForThisGroup = it->second;
374                         
375                         ofstream out;
376                         m->openOutputFile(filename, out);
377                         
378                         for (map<string, string>::iterator itFile = nameMapForThisGroup.begin(); itFile != nameMapForThisGroup.end(); itFile++) {
379                                 
380                                 if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
381                                 
382                                 out << itFile->first << '\t' << itFile->second << endl;
383                         }
384                         
385                         out.close();
386                 }
387                 
388                 return 0; 
389         }
390         catch(exception& e) {
391                 m->errorOut(e, "SequenceParser", "getNameMap");
392                 exit(1);
393         }
394 }
395 /************************************************************/
396
397
398