]> git.donarmstrong.com Git - mothur.git/blob - filtersharedcommand.cpp
5b5fcb1a5ed36578ebb70b3e8f3ac1048ec1ad4f
[mothur.git] / filtersharedcommand.cpp
1 //
2 //  filtersharedcommand.cpp
3 //  Mothur
4 //
5 //  Created by Sarah Westcott on 1/4/13.
6 //  Copyright (c) 2013 Schloss Lab. All rights reserved.
7 //
8
9 #include "filtersharedcommand.h"
10
11 //**********************************************************************************************************************
12 vector<string> FilterSharedCommand::setParameters(){    
13         try {           
14                 CommandParameter pshared("shared", "InputTypes", "", "", "none", "none", "none","shared",false,true,true); parameters.push_back(pshared);
15                 CommandParameter plabel("label", "String", "", "", "", "", "","",false,false); parameters.push_back(plabel);
16                 CommandParameter pgroups("groups", "String", "", "", "", "", "","",false,false); parameters.push_back(pgroups);
17                 CommandParameter pminpercent("minpercent", "Number", "", "-1", "", "", "","",false,false,true); parameters.push_back(pminpercent);
18         CommandParameter prarepercent("rarepercent", "Number", "", "-1", "", "", "","",false,false,true); parameters.push_back(prarepercent);
19         CommandParameter pminabund("minabund", "Number", "", "-1", "", "", "","",false,false,true); parameters.push_back(pminabund);
20         CommandParameter pmintotal("mintotal", "Number", "", "-1", "", "", "","",false,false,true); parameters.push_back(pmintotal);
21         CommandParameter pminnumsamples("minnumsamples", "Number", "", "-1", "", "", "","",false,false,true); parameters.push_back(pminnumsamples);
22         CommandParameter pminpercentsamples("minpercentsamples", "Number", "", "-1", "", "", "","",false,false,true); parameters.push_back(pminpercentsamples);
23         CommandParameter pkeepties("keepties", "Boolean", "", "T", "", "", "","",false,false,true); parameters.push_back(pkeepties);
24                 CommandParameter pmakerare("makerare", "Boolean", "", "T", "", "", "","",false,false,true); parameters.push_back(pmakerare);
25                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
26                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
27                 
28                 vector<string> myArray;
29                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
30                 return myArray;
31         }
32         catch(exception& e) {
33                 m->errorOut(e, "FilterSharedCommand", "setParameters");
34                 exit(1);
35         }
36 }
37 //**********************************************************************************************************************
38 string FilterSharedCommand::getHelpString(){    
39         try {
40                 string helpString = "";
41                 helpString += "The filter.shared command is used to remove OTUs based on various critieria.\n";
42                 helpString += "The filter.shared command parameters are shared, minpercent, minabund, mintotal, minnumsamples, minpercentsamples, rarepercent, makerare, keepties, groups and label.  You must provide a shared file.\n";
43                 helpString += "The groups parameter allows you to specify which of the groups you would like included. The group names are separated by dashes.\n";
44                 helpString += "The label parameter allows you to select what distance levels you would like, and are also separated by dashes.\n";
45         
46                 helpString += "The minabund parameter allows you indicate the minimum abundance required for each sample in a given OTU.  If any samples abundance falls below the minimum, the OTU is removed. Default=0\n";
47         helpString += "The minpercent parameter allows you indicate the minimum relative abundance of an OTU. For example, if the OTUs total abundance across all samples is 8, and the total abundance across all OTUs is 1000, and minpercent=1. The OTU's relative abundance is 0.008, the minimum is 0.01, so the OTU will be removed. Default=0.\n";
48         helpString += "The rarepercent parameter allows you indicate the percentage of otus to remove.  The OTUs chosen to be removed are the rarest.  For example if you have 1000 OTUs, rarepercent=20 would remove the 200 OTUs with the lowest abundance. Default=0.\n";
49         helpString += "The keepties parameter is used with the rarepercent parameter.  It allows you indicate you want to keep the OTUs with the same abundance as the first 'not rare' OTU. For example if you have 10 OTUs, rarepercent=20 abundances of 20, 18, 15, 15, 10, 5, 3, 3, 3, 1. keepties=t, would remove the 10th OTU, but keep the 9th because its abundance ties the 8th OTU. keepties=f would remove OTUs 9 and 10.  Default=T\n";   
50         helpString += "The minnumsamples parameter allows you indicate the minimum number of samples present in an OTU. If the number of samples present falls below the minimum, the OTU is removed. Default=0.\n";
51         helpString += "The minpercentsamples parameter allows you indicate the minimum percent of sample present in an OTU. For example, if the total number of samples is 10, the number present is 3, and the minpercentsamples=50. The OTU's precent of samples is 0.333, the minimum is 0.50, so the OTU will be removed. Default=0.\n";
52         helpString += "The mintotal parameter allows you indicate the minimum abundance required for a given OTU.  If abundance across all samples falls below the minimum, the OTU is removed. Default=0.\n";
53         
54                 helpString += "The makerare parameter allows you indicate you want the abundances of any removed OTUs to be saved and a new \"rare\" OTU created with its abundances equal to the sum of the OTUs removed.  This will preserve the number of reads in your dataset. Default=T\n";
55                 helpString += "The filter.shared command should be in the following format: filter.shared(shared=yourSharedFile, minabund=yourMinAbund, groups=yourGroups, label=yourLabels).\n";
56                 helpString += "Example filter.shared(shared=final.an.shared, minabund=10).\n";
57                 helpString += "The default value for groups is all the groups in your sharedfile, and all labels in your inputfile will be used.\n";
58                 helpString += "The filter.shared command outputs a .filter.shared file.\n";
59                 helpString += "Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n";
60                 return helpString;
61         }
62         catch(exception& e) {
63                 m->errorOut(e, "FilterSharedCommand", "getHelpString");
64                 exit(1);
65         }
66 }
67 //**********************************************************************************************************************
68 string FilterSharedCommand::getOutputPattern(string type) {
69     try {
70         string pattern = "";
71         
72         if (type == "shared")      {   pattern = "[filename],[distance],filter,[extension]";    }
73         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
74         
75         return pattern;
76     }
77     catch(exception& e) {
78         m->errorOut(e, "FilterSharedCommand", "getOutputPattern");
79         exit(1);
80     }
81 }
82 //**********************************************************************************************************************
83 FilterSharedCommand::FilterSharedCommand(){     
84         try {
85                 abort = true; calledHelp = true; 
86                 setParameters();
87                 vector<string> tempOutNames;
88                 outputTypes["shared"] = tempOutNames;
89         }
90         catch(exception& e) {
91                 m->errorOut(e, "FilterSharedCommand", "GetRelAbundCommand");
92                 exit(1);
93         }
94 }
95 //**********************************************************************************************************************
96 FilterSharedCommand::FilterSharedCommand(string option) {
97         try {
98                 abort = false; calledHelp = false;   
99                 allLines = 1;
100                 
101                 //allow user to run help
102                 if(option == "help") { help(); abort = true; calledHelp = true; }
103                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
104                 
105                 else {
106                         vector<string> myArray = setParameters();
107                         
108                         OptionParser parser(option);
109                         map<string,string> parameters = parser.getParameters();
110                         
111                         ValidParameters validParameter;
112                         
113                         //check to make sure all parameters are valid for command
114                         map<string,string>::iterator it;
115                         for (it = parameters.begin(); it != parameters.end(); it++) { 
116                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
117                         }
118                         
119                         //initialize outputTypes
120                         vector<string> tempOutNames;
121                         outputTypes["shared"] = tempOutNames;
122                                     
123                         //if the user changes the input directory command factory will send this info to us in the output parameter 
124                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
125                         if (inputDir == "not found"){   inputDir = "";          }
126                         else {
127                                 string path;
128                 it = parameters.find("shared");
129                                 //user has given a template file
130                                 if(it != parameters.end()){ 
131                                         path = m->hasPath(it->second);
132                                         //if the user has not given a path then, add inputdir. else leave path alone.
133                                         if (path == "") {       parameters["shared"] = inputDir + it->second;           }
134                                 }
135             }
136                                         
137                         sharedfile = validParameter.validFile(parameters, "shared", true);
138                         if (sharedfile == "not open") { sharedfile = ""; abort = true; }        
139                         else if (sharedfile == "not found") { 
140                                 //if there is a current shared file, use it
141                                 sharedfile = m->getSharedFile(); 
142                                 if (sharedfile != "") { m->mothurOut("Using " + sharedfile + " as input file for the shared parameter."); m->mothurOutEndLine(); }
143                                 else {  m->mothurOut("You have no current sharedfile and the shared parameter is required."); m->mothurOutEndLine(); abort = true; }
144                         }else { m->setSharedFile(sharedfile); }
145                         
146             //if the user changes the output directory command factory will send this info to us in the output parameter 
147                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = m->hasPath(sharedfile);     }
148             
149                         //check for optional parameter and set defaults
150                         // ...at some point should added some additional type checking...
151                         label = validParameter.validFile(parameters, "label", false);                   
152                         if (label == "not found") { label = ""; }
153                         else { 
154                                 if(label != "all") {  m->splitAtDash(label, labels);  allLines = 0;  }
155                                 else { allLines = 1;  }
156                         }
157                         
158                         groups = validParameter.validFile(parameters, "groups", false);                 
159                         if (groups == "not found") { groups = ""; pickedGroups = false; }
160                         else { 
161                                 pickedGroups = true;
162                                 m->splitAtDash(groups, Groups);
163                                 m->setGroups(Groups);
164                         }
165                         
166             bool setSomething = false;
167                         string temp = validParameter.validFile(parameters, "minabund", false);          
168             if (temp == "not found"){   temp = "-1";            }
169             else { setSomething = true; }
170                         m->mothurConvert(temp, minAbund);  
171             
172             temp = validParameter.validFile(parameters, "mintotal", false);             
173             if (temp == "not found"){   temp = "-1";            }
174             else { setSomething = true; }
175                         m->mothurConvert(temp, minTotal);
176             
177             temp = validParameter.validFile(parameters, "minnumsamples", false);                
178             if (temp == "not found"){   temp = "-1";            }
179             else { setSomething = true; }
180                         m->mothurConvert(temp, minSamples);
181             
182             temp = validParameter.validFile(parameters, "minpercent", false);           
183             if (temp == "not found"){   temp = "-1";            }
184             else { setSomething = true; }
185             
186                         m->mothurConvert(temp, minPercent);
187             if (minPercent < 1) {} //already in percent form
188             else {  minPercent = minPercent / 100.0; } //user gave us a whole number version so convert to %
189             
190             temp = validParameter.validFile(parameters, "rarepercent", false);
191             if (temp == "not found"){   temp = "-1";            }
192             else { setSomething = true; }
193             
194                         m->mothurConvert(temp, rarePercent);
195             if (rarePercent < 1) {} //already in percent form
196             else {  rarePercent = rarePercent / 100.0; } //user gave us a whole number version so convert to %
197             
198             temp = validParameter.validFile(parameters, "minpercentsamples", false);            
199             if (temp == "not found"){   temp = "-1";            }
200             else { setSomething = true; }
201             
202                         m->mothurConvert(temp, minPercentSamples);
203             if (minPercentSamples < 1) {} //already in percent form
204             else {  minPercentSamples = minPercentSamples / 100.0; } //user gave us a whole number version so convert to %
205                         
206                         temp = validParameter.validFile(parameters, "makerare", false);         if (temp == "not found"){       temp = "T";             }
207                         makeRare = m->isTrue(temp);
208             
209             temp = validParameter.validFile(parameters, "keepties", false);             if (temp == "not found"){       temp = "T";             }
210                         keepties = m->isTrue(temp);
211             
212             if (!setSomething) { m->mothurOut("\nYou did not set any parameters. I will filter using minabund=1.\n\n"); minAbund = 1; }
213         }
214         
215         }
216         catch(exception& e) {
217                 m->errorOut(e, "FilterSharedCommand", "FilterSharedCommand");
218                 exit(1);
219         }
220 }
221 //**********************************************************************************************************************
222
223 int FilterSharedCommand::execute(){
224         try {
225         
226                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
227                 
228         InputData input(sharedfile, "sharedfile");
229                 vector<SharedRAbundVector*> lookup = input.getSharedRAbundVectors();
230                 string lastLabel = lookup[0]->getLabel();
231                 
232                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
233                 set<string> processedLabels;
234                 set<string> userLabels = labels;
235                 
236                 //as long as you are not at the end of the file or done wih the lines you want
237                 while((lookup[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
238                         if (m->control_pressed) {   for (int i = 0; i < lookup.size(); i++) {  delete lookup[i]; lookup[i] = NULL; }  
239                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]); } return 0;  }
240                         
241                         if(allLines == 1 || labels.count(lookup[0]->getLabel()) == 1){                  
242                                 
243                                 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
244                                 
245                                 processShared(lookup);
246                                 
247                                 processedLabels.insert(lookup[0]->getLabel());
248                                 userLabels.erase(lookup[0]->getLabel());
249                         }
250                         
251                         if ((m->anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
252                                 string saveLabel = lookup[0]->getLabel();
253                                 
254                                 for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }  
255                                 
256                                 lookup = input.getSharedRAbundVectors(lastLabel);
257                                 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
258                                 
259                                 processShared(lookup);
260                                 
261                                 processedLabels.insert(lookup[0]->getLabel());
262                                 userLabels.erase(lookup[0]->getLabel());
263                                 
264                                 //restore real lastlabel to save below
265                                 lookup[0]->setLabel(saveLabel);
266                         }
267                         
268                         lastLabel = lookup[0]->getLabel();
269                         //prevent memory leak
270                         for (int i = 0; i < lookup.size(); i++) {  delete lookup[i]; lookup[i] = NULL; }
271                         
272                         //get next line to process
273                         lookup = input.getSharedRAbundVectors();                                
274                 }
275                 
276                 
277                 if (m->control_pressed) {   return 0;  }
278                 
279                 //output error messages about any remaining user labels
280                 set<string>::iterator it;
281                 bool needToRun = false;
282                 for (it = userLabels.begin(); it != userLabels.end(); it++) {  
283                         m->mothurOut("Your file does not include the label " + *it); 
284                         if (processedLabels.count(lastLabel) != 1) {
285                                 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
286                                 needToRun = true;
287                         }else {
288                                 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
289                         }
290                 }
291                 
292                 //run last label if you need to
293                 if (needToRun == true)  {
294                         for (int i = 0; i < lookup.size(); i++) { if (lookup[i] != NULL) { delete lookup[i]; } }  
295                         lookup = input.getSharedRAbundVectors(lastLabel);
296                         
297                         m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
298                         
299                         processShared(lookup);
300                         
301                         for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }
302                 }
303         
304                 //set shared file as new current sharedfile
305                 string current = "";
306         itTypes = outputTypes.find("shared");
307                 if (itTypes != outputTypes.end()) {
308                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSharedFile(current); }
309                 }
310                 
311                 m->mothurOutEndLine();
312                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
313                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
314                 m->mothurOutEndLine();
315                 
316                 return 0;
317         }
318         catch(exception& e) {
319                 m->errorOut(e, "FilterSharedCommand", "execute");
320                 exit(1);
321         }
322 }
323 //**********************************************************************************************************************
324 int FilterSharedCommand::processShared(vector<SharedRAbundVector*>& thislookup) {
325         try {
326                 
327                 //save mothurOut's binLabels to restore for next label
328                 vector<string> saveBinLabels = m->currentBinLabels;
329                 
330         map<string, string> variables; 
331         variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(sharedfile));
332         variables["[extension]"] = m->getExtension(sharedfile);
333         variables["[distance]"] = thislookup[0]->getLabel();
334                 string outputFileName = getOutputFileName("shared", variables);        
335         
336         if (m->control_pressed) {  return 0; }
337         
338         vector<string> filteredLabels;
339         vector<int> rareCounts; rareCounts.resize(m->getGroups().size(), 0);
340         
341         //create new "filtered" lookup
342         vector<SharedRAbundVector*> filteredLookup;
343         for (int i = 0; i < thislookup.size(); i++) {
344             SharedRAbundVector* temp = new SharedRAbundVector();
345                         temp->setLabel(thislookup[i]->getLabel());
346                         temp->setGroup(thislookup[i]->getGroup());
347                         filteredLookup.push_back(temp);
348         }
349         
350         //you want to remove a percentage of OTUs
351         set<string> removeLabels;
352         if (rarePercent != -0.01) {
353             vector<spearmanRank> otus;
354             //rank otus by abundance
355             for (int i = 0; i < thislookup[0]->getNumBins(); i++) {
356                 float otuTotal = 0.0;
357                 for (int j = 0; j < thislookup.size(); j++) {
358                     otuTotal += thislookup[j]->getAbundance(i);
359                 }
360                 spearmanRank temp(saveBinLabels[i], otuTotal);
361                 otus.push_back(temp);
362             }
363             
364             //sort by abundance
365             sort(otus.begin(), otus.end(), compareSpearman);
366             
367             //find index of cutoff
368             int indexFirstNotRare = ceil(rarePercent * (float)thislookup[0]->getNumBins());
369             
370             //handle ties
371             if (keepties) { //adjust indexFirstNotRare if needed
372                 if (indexFirstNotRare != 0) { //not out of bounds
373                     if (otus[indexFirstNotRare].score == otus[indexFirstNotRare-1].score) { //you have a tie
374                         bool tie = true;
375                         for (int i = indexFirstNotRare-1; i >=0; i--) {
376                             if (otus[indexFirstNotRare].score != otus[i].score) { //found value below tie
377                                 indexFirstNotRare = i+1; tie = false; break;
378                             }
379                         }
380                         if (tie) { if (m->debug) { m->mothurOut("For distance " + thislookup[0]->getLabel() + " all rare OTUs abundance tie with first 'non rare' OTU, not removing any for rarepercent parameter.\n"); }indexFirstNotRare = 0; }
381                     }
382                 }
383             }
384             
385             //saved labels for OTUs above rarepercent
386             for (int i = 0; i < indexFirstNotRare; i++) { removeLabels.insert(otus[i].name); }
387         }
388         
389         bool filteredSomething = false;
390         int numRemoved = 0;
391         for (int i = 0; i < thislookup[0]->getNumBins(); i++) {
392             
393             if (m->control_pressed) { for (int j = 0; j < filteredLookup.size(); j++) { delete filteredLookup[j]; } return 0; }
394             
395             bool okay = true; //innocent until proven guilty
396             if (minAbund != -1) {
397                 for (int j = 0; j < thislookup.size(); j++) { 
398                     if (thislookup[j]->getAbundance(i) < minAbund) { okay = false; break; }
399                 }
400             }
401             
402             if (okay && (minTotal != -1)) {
403                 int otuTotal = 0;
404                 for (int j = 0; j < thislookup.size(); j++) { 
405                     otuTotal += thislookup[j]->getAbundance(i);
406                 }
407                 if (otuTotal < minTotal) { okay = false; }
408             }
409             
410             if (okay && (minPercent != -0.01)) {
411                 double otuTotal = 0;
412                 double total = 0;
413                 for (int j = 0; j < thislookup.size(); j++) { 
414                     otuTotal += thislookup[j]->getAbundance(i);
415                     total += thislookup[j]->getNumSeqs();
416                 }
417                 double percent = otuTotal / total; 
418                 if (percent < minPercent) { okay = false; }
419             }
420             
421             
422             if (okay && (minSamples != -1)) {
423                 int samples = 0;
424                 for (int j = 0; j < thislookup.size(); j++) { 
425                     if (thislookup[j]->getAbundance(i) != 0) { samples++; }
426                 }
427                 if (samples < minSamples) { okay = false; }
428             }
429             
430             if (okay && (minPercentSamples != -0.01)) {
431                 double samples = 0;
432                 double total = thislookup.size();
433                 for (int j = 0; j < thislookup.size(); j++) { 
434                     if (thislookup[j]->getAbundance(i) != 0) { samples++; }
435                 }
436                 double percent = samples / total; 
437                 if (percent < minPercentSamples) { okay = false; }
438             }
439             
440             if (okay && (rarePercent != -0.01)) {
441                 if (removeLabels.count(saveBinLabels[i]) != 0) { //are we on the 'bad' list
442                     okay = false;
443                 }
444             }
445             
446             //did this OTU pass the filter criteria
447             if (okay) {
448                 filteredLabels.push_back(saveBinLabels[i]);
449                 for (int j = 0; j < filteredLookup.size(); j++) { //add this OTU to the filtered lookup
450                     filteredLookup[j]->push_back(thislookup[j]->getAbundance(i), thislookup[j]->getGroup());
451                 }
452             }else { //if not, do we want to save the counts
453                 filteredSomething = true;
454                 if (makeRare) {
455                     for (int j = 0; j < rareCounts.size(); j++) {  rareCounts[j] += thislookup[j]->getAbundance(i); }
456                 }
457                 numRemoved++;
458             }
459             
460         }
461         
462         //if we are saving the counts add a "rare" OTU if anything was filtered
463         if (makeRare) {
464             if (filteredSomething) {
465                 for (int j = 0; j < rareCounts.size(); j++) { //add "rare" OTU to the filtered lookup
466                     filteredLookup[j]->push_back(rareCounts[j], thislookup[j]->getGroup());
467                 }
468                 //create label for rare OTUs
469                 filteredLabels.push_back("rareOTUs");
470             }
471         }
472         
473         ofstream out;
474                 m->openOutputFile(outputFileName, out);
475                 outputTypes["shared"].push_back(outputFileName);  outputNames.push_back(outputFileName);
476                 
477         m->currentBinLabels = filteredLabels;
478         
479                 filteredLookup[0]->printHeaders(out);
480                 
481                 for (int i = 0; i < filteredLookup.size(); i++) {
482                         out << filteredLookup[i]->getLabel() << '\t' << filteredLookup[i]->getGroup() << '\t';
483                         filteredLookup[i]->print(out);
484                 }
485                 out.close();
486         
487         
488         //save mothurOut's binLabels to restore for next label
489                 m->currentBinLabels = saveBinLabels;
490         
491         for (int j = 0; j < filteredLookup.size(); j++) { delete filteredLookup[j]; }
492                 
493         m->mothurOut("\nRemoved " + toString(numRemoved) + " OTUs.\n");
494         
495                 return 0;
496                 
497         }
498         catch(exception& e) {
499                 m->errorOut(e, "FilterSharedCommand", "processShared");
500                 exit(1);
501         }
502 }                       
503 //**********************************************************************************************************************
504
505