]> git.donarmstrong.com Git - mothur.git/blob - sequencecountparser.cpp
changed random forest output filename
[mothur.git] / sequencecountparser.cpp
1 //
2 //  sequencecountparser.cpp
3 //  Mothur
4 //
5 //  Created by Sarah Westcott on 8/7/12.
6 //  Copyright (c) 2012 Schloss Lab. All rights reserved.
7 //
8
9 #include "sequencecountparser.h"
10
11 /************************************************************/
12 SequenceCountParser::SequenceCountParser(string countfile, string fastafile) {
13         try {
14                 
15                 m = MothurOut::getInstance();
16                 
17                 //read count file
18                 CountTable countTable;
19                 countTable.readTable(countfile, true);
20                 
21                 //initialize maps
22                 namesOfGroups = countTable.getNamesOfGroups();
23                 for (int i = 0; i < namesOfGroups.size(); i++) {
24                         vector<Sequence> temp;
25                         map<string, int> tempMap;
26                         seqs[namesOfGroups[i]] = temp;
27                         countTablePerGroup[namesOfGroups[i]] = tempMap;
28                 }
29                 
30                 //read fasta file making sure each sequence is in the group file
31                 ifstream in;
32                 m->openInputFile(fastafile, in);
33                 
34         int fastaCount = 0;
35                 while (!in.eof()) {
36                         
37                         if (m->control_pressed) { break; }
38                         
39                         Sequence seq(in); m->gobble(in);
40             fastaCount++;
41             if (m->debug) { if((fastaCount) % 1000 == 0){       m->mothurOut("[DEBUG]: reading seq " + toString(fastaCount) + "\n.");   } }
42                         
43             if (seq.getName() != "") {
44                                 
45                 allSeqsMap[seq.getName()] = seq.getName();
46                 vector<int> groupCounts = countTable.getGroupCounts(seq.getName());
47                 
48                 for (int i = 0; i < namesOfGroups.size(); i++) {
49                     if (groupCounts[i] != 0) {
50                         seqs[namesOfGroups[i]].push_back(seq);  
51                         countTablePerGroup[namesOfGroups[i]][seq.getName()] = groupCounts[i];
52                     }
53                 }
54                         }
55                 }
56                 in.close();                                     
57         }
58         catch(exception& e) {
59                 m->errorOut(e, "SequenceCountParser", "SequenceCountParser");
60                 exit(1);
61         }
62 }
63 /************************************************************/
64 SequenceCountParser::SequenceCountParser(string fastafile, CountTable& countTable) {
65         try {
66                 
67                 m = MothurOut::getInstance();
68                                 
69                 //initialize maps
70         if (countTable.hasGroupInfo()) {
71             namesOfGroups = countTable.getNamesOfGroups();
72             for (int i = 0; i < namesOfGroups.size(); i++) {
73                 vector<Sequence> temp;
74                 map<string, int> tempMap;
75                 seqs[namesOfGroups[i]] = temp;
76                 countTablePerGroup[namesOfGroups[i]] = tempMap;
77             }
78             
79             //read fasta file making sure each sequence is in the group file
80             ifstream in;
81             m->openInputFile(fastafile, in);
82             
83             int fastaCount = 0;
84             while (!in.eof()) {
85                 
86                 if (m->control_pressed) { break; }
87                 
88                 Sequence seq(in); m->gobble(in);
89                 fastaCount++;
90                 if (m->debug) { if((fastaCount) % 1000 == 0){   m->mothurOut("[DEBUG]: reading seq " + toString(fastaCount) + "\n.");   } }
91                 
92                 if (seq.getName() != "") {
93                     
94                     allSeqsMap[seq.getName()] = seq.getName();
95                     vector<int> groupCounts = countTable.getGroupCounts(seq.getName());
96                     
97                     for (int i = 0; i < namesOfGroups.size(); i++) {
98                         if (groupCounts[i] != 0) {
99                             seqs[namesOfGroups[i]].push_back(seq);      
100                             countTablePerGroup[namesOfGroups[i]][seq.getName()] = groupCounts[i];
101                         }
102                     }
103                 }
104             }
105             in.close(); 
106         }else {  m->control_pressed = true;  m->mothurOut("[ERROR]: cannot parse fasta file by group with a count table that does not include group data, please correct.\n"); }
107         
108         }
109         catch(exception& e) {
110                 m->errorOut(e, "SequenceCountParser", "SequenceCountParser");
111                 exit(1);
112         }
113 }
114 /************************************************************/
115 SequenceCountParser::~SequenceCountParser(){  }
116 /************************************************************/
117 int SequenceCountParser::getNumGroups(){ return namesOfGroups.size(); }
118 /************************************************************/
119 vector<string> SequenceCountParser::getNamesOfGroups(){ return namesOfGroups; }
120 /************************************************************/
121 int SequenceCountParser::getNumSeqs(string g){ 
122         try {
123                 map<string, vector<Sequence> >::iterator it;
124                 int num = 0;
125                 
126                 it = seqs.find(g);
127                 if(it == seqs.end()) {
128                         m->mothurOut("[ERROR]: " + g + " is not a valid group, please correct."); m->mothurOutEndLine();
129                 }else {
130                         num = (it->second).size();
131                 }
132                 
133                 return num; 
134         }
135         catch(exception& e) {
136                 m->errorOut(e, "SequenceCountParser", "getNumSeqs");
137                 exit(1);
138         }
139 }
140 /************************************************************/
141 vector<Sequence> SequenceCountParser::getSeqs(string g){ 
142         try {
143                 map<string, vector<Sequence> >::iterator it;
144                 vector<Sequence> seqForThisGroup;
145                 
146                 it = seqs.find(g);
147                 if(it == seqs.end()) {
148                         m->mothurOut("[ERROR]: No sequences available for group " + g + ", please correct."); m->mothurOutEndLine();
149                 }else {
150                         seqForThisGroup = it->second;
151             if (m->debug) {  m->mothurOut("[DEBUG]: group " + g + " fasta file has " + toString(seqForThisGroup.size()) + " sequences.");  }
152                 }
153                 
154                 return seqForThisGroup; 
155         }
156         catch(exception& e) {
157                 m->errorOut(e, "SequenceCountParser", "getSeqs");
158                 exit(1);
159         }
160 }
161 /************************************************************/
162 int SequenceCountParser::getSeqs(string g, string filename, bool uchimeFormat=false){ 
163         try {
164                 map<string, vector<Sequence> >::iterator it;
165                 vector<Sequence> seqForThisGroup;
166                 vector<seqPriorityNode> nameVector;
167                 
168                 it = seqs.find(g);
169                 if(it == seqs.end()) {
170                         m->mothurOut("[ERROR]: No sequences available for group " + g + ", please correct."); m->mothurOutEndLine();
171                 }else {
172                         
173                         ofstream out;
174                         m->openOutputFile(filename, out);
175                         
176                         seqForThisGroup = it->second;
177                         
178                         if (uchimeFormat) {
179                                 // format should look like 
180                                 //>seqName /ab=numRedundantSeqs/
181                                 //sequence
182                                 
183                                 map<string, int> countForThisGroup = getCountTable(g);
184                                 map<string, int>::iterator itCount;
185                                 int error = 0;
186                                 
187                                 for (int i = 0; i < seqForThisGroup.size(); i++) {
188                                         itCount = countForThisGroup.find(seqForThisGroup[i].getName());
189                                         
190                                         if (itCount == countForThisGroup.end()){
191                                                 error = 1;
192                                                 m->mothurOut("[ERROR]: " + seqForThisGroup[i].getName() + " is in your fastafile, but is not in your count file, please correct."); m->mothurOutEndLine();
193                                         }else {
194                         seqPriorityNode temp(itCount->second, seqForThisGroup[i].getAligned(), seqForThisGroup[i].getName());
195                                                 nameVector.push_back(temp);
196                                         }
197                                 }
198                                 
199                                 if (error == 1) { out.close(); m->mothurRemove(filename); return 1; }
200                                 
201                                 //sort by num represented
202                                 sort(nameVector.begin(), nameVector.end(), compareSeqPriorityNodes);
203                 
204                                 //print new file in order of
205                                 for (int i = 0; i < nameVector.size(); i++) {
206                                         
207                                         if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
208                                         
209                                         out << ">" << nameVector[i].name << "/ab=" << nameVector[i].numIdentical << "/" << endl << nameVector[i].seq << endl; //
210                                 }
211                                 
212                         }else { 
213                 //m->mothurOut("Group " + g +  " contains " + toString(seqForThisGroup.size()) + " unique seqs.\n");
214                                 for (int i = 0; i < seqForThisGroup.size(); i++) {
215                                         
216                                         if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
217                                         
218                                         seqForThisGroup[i].printSequence(out);  
219                                 }
220                         }
221                         out.close();
222                 }
223                 
224                 return 0; 
225         }
226         catch(exception& e) {
227                 m->errorOut(e, "SequenceCountParser", "getSeqs");
228                 exit(1);
229         }
230 }
231
232 /************************************************************/
233 map<string, int> SequenceCountParser::getCountTable(string g){ 
234         try {
235                 map<string, map<string, int> >::iterator it;
236                 map<string, int> countForThisGroup;
237                 
238                 it = countTablePerGroup.find(g);
239                 if(it == countTablePerGroup.end()) {
240                         m->mothurOut("[ERROR]: No countTable available for group " + g + ", please correct."); m->mothurOutEndLine();
241                 }else {
242                         countForThisGroup = it->second;
243             if (m->debug) {  m->mothurOut("[DEBUG]: group " + g + " count file has " + toString(countForThisGroup.size()) + " unique sequences.");  }
244                 }
245                 
246                 return countForThisGroup; 
247         }
248         catch(exception& e) {
249                 m->errorOut(e, "SequenceCountParser", "getCountTable");
250                 exit(1);
251         }
252 }
253 /************************************************************/
254 int SequenceCountParser::getCountTable(string g, string filename){ 
255         try {
256                 map<string, map<string, int> >::iterator it;
257                 map<string, int> countForThisGroup;
258                 
259                 it = countTablePerGroup.find(g);
260                 if(it == countTablePerGroup.end()) {
261                         m->mothurOut("[ERROR]: No countTable available for group " + g + ", please correct."); m->mothurOutEndLine();
262                 }else {
263                         countForThisGroup = it->second;
264                         
265                         ofstream out;
266                         m->openOutputFile(filename, out);
267             out << "Representative_Sequence\ttotal\t" << g << endl;
268             
269                         for (map<string, int>::iterator itFile = countForThisGroup.begin(); itFile != countForThisGroup.end(); itFile++) {
270                                 
271                                 if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
272                                 
273                                 out << itFile->first << '\t' << itFile->second << '\t' << itFile->second << endl;
274                         }
275                         
276                         out.close();
277                 }
278                 
279                 return 0; 
280         }
281         catch(exception& e) {
282                 m->errorOut(e, "SequenceParser", "getCountTable");
283                 exit(1);
284         }
285 }
286 /************************************************************/
287
288
289