]> git.donarmstrong.com Git - mothur.git/blob - getoturepcommand.cpp
added get.rabund and get.sabund command and fixed bug introduced by line by line...
[mothur.git] / getoturepcommand.cpp
1 /*
2  *  getoturepcommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 4/6/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "getoturepcommand.h"
11
12 //**********************************************************************************************************************
13 GetOTURepCommand::GetOTURepCommand(){
14         try{
15                 globaldata = GlobalData::getInstance();
16         
17                 if(globaldata->gSparseMatrix != NULL)   {       matrix = new SparseMatrix(*globaldata->gSparseMatrix);          }
18                 
19                 //listOfNames bin 0 = first name read in distance matrix, listOfNames bin 1 = second name read in distance matrix
20                 if(globaldata->gListVector != NULL)             {       
21                         listOfNames = new ListVector(*globaldata->gListVector); 
22                         
23                         vector<string> names;
24                         string binnames;
25                         //map names to rows in sparsematrix
26                         for (int i = 0; i < listOfNames->size(); i++) {
27                                 names.clear();
28                                 binnames = listOfNames->get(i);
29                                 splitAtComma(binnames, names);
30                                 
31                                 for (int j = 0; j < names.size(); j++) {
32                                         nameToIndex[names[j]] = i;
33                                 }
34                         }
35                 }else { cout << "error, no listvector." << endl; }
36
37                 
38                 fastafile = globaldata->getFastaFile();
39                 namesfile = globaldata->getNameFile();
40                 groupfile = globaldata->getGroupFile();
41                 
42                 if (groupfile != "") {
43                         //read in group map info.
44                         groupMap = new GroupMap(groupfile);
45                         groupMap->readMap();
46                 }
47
48                 openInputFile(fastafile, in);
49                 
50                 fasta = new FastaMap();
51
52         }
53         catch(exception& e) {
54                 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function GetOTURepCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
55                 exit(1);
56         }
57         catch(...) {
58                 cout << "An unknown error has occurred in the GetOTURepCommand class function GetOTURepCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
59                 exit(1);
60         }
61 }
62 //**********************************************************************************************************************
63
64 GetOTURepCommand::~GetOTURepCommand(){
65         delete input;
66         delete read;
67         delete fasta;
68         if (groupfile != "") {
69                 delete groupMap;
70         }
71 }
72
73 //**********************************************************************************************************************
74
75 int GetOTURepCommand::execute(){
76         try {
77                 int count = 1;
78                 int error;
79                 
80                 //read fastafile
81                 fasta->readFastaFile(in);
82                 
83                 //set format to list so input can get listvector
84                 globaldata->setFormat("list");
85                 
86                 //if user gave a namesfile then use it
87                 if (namesfile != "") {
88                         readNamesFile();
89                 }
90                 
91                 //read list file
92                 read = new ReadOTUFile(globaldata->getListFile());      
93                 read->read(&*globaldata); 
94                 
95                 input = globaldata->ginput;
96                 list = globaldata->gListVector;
97                 ListVector* lastList = list;
98                 
99                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
100                 set<string> processedLabels;
101                 set<string> userLabels = globaldata->labels;
102                 set<int> userLines = globaldata->lines;
103
104                 
105                 while((list != NULL) && ((globaldata->allLines == 1) || (userLabels.size() != 0) || (userLines.size() != 0))) {
106                         
107                         if(globaldata->allLines == 1 || globaldata->lines.count(count) == 1 || globaldata->labels.count(list->getLabel()) == 1){
108                                         cout << list->getLabel() << '\t' << count << endl;
109                                         error = process(list);
110                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
111                                         
112                                         processedLabels.insert(list->getLabel());
113                                         userLabels.erase(list->getLabel());
114                                         userLines.erase(count);
115                         }
116                         
117                         if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastList->getLabel()) != 1)) {
118                                         cout << lastList->getLabel() << '\t' << count << endl;
119                                         error = process(lastList);
120                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
121                                         
122                                         processedLabels.insert(lastList->getLabel());
123                                         userLabels.erase(lastList->getLabel());
124                         }
125                         
126                         if (count != 1) { delete lastList; }
127                         lastList = list;                        
128                         
129                         list = input->getListVector();
130                         count++;
131                 }
132                 
133                 //output error messages about any remaining user labels
134                 set<string>::iterator it;
135                 bool needToRun = false;
136                 for (it = userLabels.begin(); it != userLabels.end(); it++) {  
137                         cout << "Your file does not include the label "<< *it; 
138                         if (processedLabels.count(lastList->getLabel()) != 1) {
139                                 cout << ". I will use " << lastList->getLabel() << "." << endl;
140                                 needToRun = true;
141                         }else {
142                                 cout << ". Please refer to " << lastList->getLabel() << "." << endl;
143                         }
144                 }
145                 
146                 //run last line if you need to
147                 if (needToRun == true)  {
148                         cout << lastList->getLabel() << '\t' << count << endl;
149                         error = process(lastList);
150                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
151                 }
152                 delete lastList;
153                 
154                 delete matrix;
155                 globaldata->gSparseMatrix = NULL;
156                 delete list;
157                 globaldata->gListVector = NULL;
158
159                 return 0;
160         }
161         catch(exception& e) {
162                 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
163                 exit(1);
164         }
165         catch(...) {
166                 cout << "An unknown error has occurred in the GetOTURepCommand class function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
167                 exit(1);
168         }
169
170 }
171
172 //**********************************************************************************************************************
173 void GetOTURepCommand::readNamesFile() {
174         try {
175                 vector<string> dupNames;
176                 openInputFile(namesfile, inNames);
177                 
178                 string name, names, sequence;
179         
180                 while(inNames){
181                         inNames >> name;                        //read from first column  A
182                         inNames >> names;               //read from second column  A,B,C,D
183                         
184                         dupNames.clear();
185                         
186                         //parse names into vector
187                         splitAtComma(names, dupNames);
188                         
189                         //store names in fasta map
190                         sequence = fasta->getSequence(name);
191                         for (int i = 0; i < dupNames.size(); i++) {
192                                 fasta->push_back(dupNames[i], sequence);
193                         }
194                 
195                         gobble(inNames);
196                 }
197                 inNames.close();
198
199         }
200         catch(exception& e) {
201                 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function readNamesFile. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
202                 exit(1);
203         }
204         catch(...) {
205                 cout << "An unknown error has occurred in the GetOTURepCommand class function readNamesFile. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
206                 exit(1);
207         }       
208 }
209 //**********************************************************************************************************************
210 string GetOTURepCommand::FindRep(int bin, string& group, ListVector* thisList) {
211         try{
212                 vector<string> names;
213                 map<string, float> sums;
214                 map<string, float>::iterator it4;
215                 map<int, string> binMap; //subset of namesToIndex - just member of this bin
216                 string binnames;
217                 float min = 10000;
218                 string minName;
219                 map<string, string> groups;
220                 map<string, string>::iterator groupIt;
221                 
222                 binnames = thisList->get(bin);
223         
224                 //parse names into vector
225                 splitAtComma(binnames, names);
226                 
227                 //if you have a groupfile
228                 if(groupfile != "") {
229                         //find the groups that are in this bin
230                         for (int i = 0; i < names.size(); i++) {
231                                 string groupName = groupMap->getGroup(names[i]);
232                                 if (groupName == "not found") {  
233                                         cout << names[i] << " is missing from your group file. Please correct. " << endl;
234                                         groupError = true;
235                                 }else{
236                                         groups[groupName] = groupName;
237                                 }
238                         }
239                         
240                         //turn the groups into a string
241                         for(groupIt = groups.begin(); groupIt != groups.end(); groupIt++) { group += groupIt->first + "-"; }
242                         
243                         //rip off last dash
244                         group = group.substr(0, group.length()-1);
245                 }
246                 
247                 //if only 1 sequence in bin then that's the rep
248                 if (names.size() == 1) { return names[0]; }
249                 else {
250                         //fill binMap
251                         for (int i = 0; i < names.size(); i++) {
252                                 for (it3 = nameToIndex.begin(); it3 != nameToIndex.end(); it3++) {
253
254                                         if (it3->first == names[i]) {  
255                                                 binMap[it3->second] = it3->first;
256
257                                                 //initialize sums map
258                                                 sums[it3->first] = 0.0;
259                                                 break;
260                                         }
261                                 }
262                         }
263                         
264                         //go through each cell in the sparsematrix
265                         for(MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++){
266                                 //is this a distance between 2 members of this bin
267                                 it = binMap.find(currentCell->row);
268                                 it2 = binMap.find(currentCell->column);
269                                 
270                                 //sum the distance of the sequences in the bin to eachother
271                                 if ((it != binMap.end()) && (it2 != binMap.end())) {
272                                         //this is a cell that repesents the distance between to of this bins members
273                                         sums[it->second] += currentCell->dist;
274                                         sums[it2->second] += currentCell->dist;
275                                 }
276                         }
277                         
278                         //smallest sum is the representative
279                         for (it4 = sums.begin(); it4 != sums.end(); it4++) {
280                                 if (it4->second < min) {
281                                         min = it4->second;
282                                         minName = it4->first;
283                                 }
284
285                         }
286                         
287                         return minName;
288                 }
289         
290         }
291         catch(exception& e) {
292                 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function FindRep. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
293                 exit(1);
294         }
295         catch(...) {
296                 cout << "An unknown error has occurred in the GetOTURepCommand class function FindRep. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
297                 exit(1);
298         }       
299 }
300
301 //**********************************************************************************************************************
302 int GetOTURepCommand::process(ListVector* processList) {
303         try{
304                                 string nameRep, name, sequence;
305
306                                 //create output file
307                                 string outputFileName = getRootName(globaldata->getListFile()) + processList->getLabel() + ".rep.fasta";
308                                 openOutputFile(outputFileName, out);
309                                 
310                                 //for each bin in the list vector
311                                 for (int i = 0; i < processList->size(); i++) {
312                                         string groups;
313                                         nameRep = FindRep(i, groups, processList);
314                                         
315                                         //print out name and sequence for that bin
316                                         sequence = fasta->getSequence(nameRep);
317
318                                         if (sequence != "not found") {
319                                                 if (groupfile == "") {
320                                                         nameRep = nameRep + "|" + toString(i+1);
321                                                         out << ">" << nameRep << endl;
322                                                         out << sequence << endl;
323                                                 }else {
324                                                         nameRep = nameRep + "|" + groups + "|" + toString(i+1);
325                                                         out << ">" << nameRep << endl;
326                                                         out << sequence << endl;
327                                                 }
328                                         }else { 
329                                                 cout << nameRep << " is missing from your fasta or name file. Please correct. " << endl; 
330                                                 remove(outputFileName.c_str());
331                                                 return 1;
332                                         }
333                                 }
334                                 
335                                 out.close();
336                                 return 0;
337         
338         }
339         catch(exception& e) {
340                 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function process. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
341                 exit(1);
342         }
343         catch(...) {
344                 cout << "An unknown error has occurred in the GetOTURepCommand class function process. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
345                 exit(1);
346         }       
347 }
348
349 //**********************************************************************************************************************
350
351
352
353
354