]> git.donarmstrong.com Git - mothur.git/blob - classifyotucommand.cpp
bug fixes
[mothur.git] / classifyotucommand.cpp
1 /*
2  *  classifyotucommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 6/1/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "classifyotucommand.h"
11 #include "phylotree.h"
12
13
14 //**********************************************************************************************************************
15 ClassifyOtuCommand::ClassifyOtuCommand(string option)  {
16         try{
17                 abort = false;
18                 allLines = 1;
19                 labels.clear();
20                                 
21                 //allow user to run help
22                 if (option == "help") { 
23                         help(); abort = true;
24                 } else {
25                         //valid paramters for this command
26                         string Array[] =  {"list","label","name","taxonomy","cutoff","probs","outputdir","inputdir"};
27                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
28                         
29                         OptionParser parser(option);
30                         map<string, string> parameters = parser.getParameters();
31                         
32                         ValidParameters validParameter;
33                         map<string, string>::iterator it;
34                 
35                         //check to make sure all parameters are valid for command
36                         for (it = parameters.begin(); it != parameters.end(); it++) { 
37                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
38                         }
39                         
40                         //if the user changes the input directory command factory will send this info to us in the output parameter 
41                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
42                         if (inputDir == "not found"){   inputDir = "";          }
43                         else {
44                                 string path;
45                                 it = parameters.find("list");
46                                 //user has given a template file
47                                 if(it != parameters.end()){ 
48                                         path = m->hasPath(it->second);
49                                         //if the user has not given a path then, add inputdir. else leave path alone.
50                                         if (path == "") {       parameters["list"] = inputDir + it->second;             }
51                                 }
52                                 
53                                 it = parameters.find("name");
54                                 //user has given a template file
55                                 if(it != parameters.end()){ 
56                                         path = m->hasPath(it->second);
57                                         //if the user has not given a path then, add inputdir. else leave path alone.
58                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
59                                 }
60                                 
61                                 it = parameters.find("taxonomy");
62                                 //user has given a template file
63                                 if(it != parameters.end()){ 
64                                         path = m->hasPath(it->second);
65                                         //if the user has not given a path then, add inputdir. else leave path alone.
66                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
67                                 }
68                         }
69
70                         
71                         //if the user changes the output directory command factory will send this info to us in the output parameter 
72                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
73                         
74                         //check for required parameters
75                         listfile = validParameter.validFile(parameters, "list", true);
76                         if (listfile == "not found") { m->mothurOut("list is a required parameter for the classify.otu command."); m->mothurOutEndLine(); abort = true; }
77                         else if (listfile == "not open") { abort = true; }      
78                         
79                         taxfile = validParameter.validFile(parameters, "taxonomy", true);
80                         if (taxfile == "not found") {  m->mothurOut("taxonomy is a required parameter for the classify.otu command."); m->mothurOutEndLine(); abort = true; }
81                         else if (taxfile == "not open") { abort = true; }       
82         
83                         namefile = validParameter.validFile(parameters, "name", true);
84                         if (namefile == "not open") { abort = true; }   
85                         else if (namefile == "not found") { namefile = ""; }
86                         
87                         //check for optional parameter and set defaults
88                         // ...at some point should added some additional type checking...
89                         label = validParameter.validFile(parameters, "label", false);                   
90                         if (label == "not found") { label = ""; allLines = 1;  }
91                         else { 
92                                 if(label != "all") {  m->splitAtDash(label, labels);  allLines = 0;  }
93                                 else { allLines = 1;  }
94                         }
95                         
96                         string temp = validParameter.validFile(parameters, "cutoff", false);                    if (temp == "not found") { temp = "51"; }
97                         convert(temp, cutoff); 
98                         
99                         temp = validParameter.validFile(parameters, "probs", false);                                    if (temp == "not found"){       temp = "true";                  }
100                         probs = m->isTrue(temp);
101                         
102                         
103                         if ((cutoff < 51) || (cutoff > 100)) { m->mothurOut("cutoff must be above 50, and no greater than 100."); m->mothurOutEndLine(); abort = true;  }
104                         
105                 }
106         }
107         catch(exception& e) {
108                 m->errorOut(e, "ClassifyOtuCommand", "ClassifyOtuCommand");
109                 exit(1);
110         }
111 }
112
113 //**********************************************************************************************************************
114
115 void ClassifyOtuCommand::help(){
116         try {
117                 m->mothurOut("The classify.otu command parameters are list, taxonomy, name, cutoff, label and probs.  The taxonomy and list parameters are required.\n");
118                 m->mothurOut("The name parameter allows you add a names file with your taxonomy file.\n");
119                 m->mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n");
120                 m->mothurOut("The default value for label is all labels in your inputfile.\n");
121                 m->mothurOut("The cutoff parameter allows you to specify a consensus confidence threshold for your taxonomy.  The default is 51, meaning 51%. Cutoff cannot be below 51.\n");
122                 m->mothurOut("The probs parameter shuts off the outputting of the consensus confidence results. The default is true, meaning you want the confidence to be shown.\n");
123                 m->mothurOut("The classify.otu command should be in the following format: classify.otu(taxonomy=yourTaxonomyFile, list=yourListFile, name=yourNamesFile, label=yourLabels).\n");
124                 m->mothurOut("Example classify.otu(taxonomy=abrecovery.silva.full.taxonomy, list=abrecovery.fn.list, label=0.10).\n");
125                 m->mothurOut("Note: No spaces between parameter labels (i.e. list), '=' and parameters (i.e.yourListFile).\n\n");
126         }
127         catch(exception& e) {
128                 m->errorOut(e, "ClassifyOtuCommand", "help");
129                 exit(1);
130         }
131 }
132
133 //**********************************************************************************************************************
134
135 ClassifyOtuCommand::~ClassifyOtuCommand(){}
136
137 //**********************************************************************************************************************
138
139 int ClassifyOtuCommand::execute(){
140         try {
141         
142                 if (abort == true) { return 0; }
143                 
144                 //if user gave a namesfile then use it
145                 if (namefile != "") {   readNamesFile();        }
146                 
147                 //read taxonomy file and save in map for easy access in building bin trees
148                 readTaxonomyFile();
149                 
150                 if (m->control_pressed) { return 0; }
151                 
152                 input = new InputData(listfile, "list");
153                 list = input->getListVector();
154                 string lastLabel = list->getLabel();
155
156                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
157                 set<string> processedLabels;
158                 set<string> userLabels = labels;
159                 
160                 if (m->control_pressed) { delete input; delete list; for (int i = 0; i < outputNames.size(); i++) {     remove(outputNames[i].c_str());  }  return 0; }
161         
162                 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
163                         
164                         if (allLines == 1 || labels.count(list->getLabel()) == 1){
165                         
166                                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
167                                         process(list);
168                                         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        remove(outputNames[i].c_str());  } delete input; delete list; return 0; }
169                                                                                 
170                                         processedLabels.insert(list->getLabel());
171                                         userLabels.erase(list->getLabel());
172                         }
173                         
174                         if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
175                                         string saveLabel = list->getLabel();
176                                         
177                                         delete list;
178                                         list = input->getListVector(lastLabel);
179                                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
180                                         process(list);
181                                 
182                                         
183                                         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        remove(outputNames[i].c_str());  } delete input; delete list; return 0; }
184                                                                                 
185                                         processedLabels.insert(list->getLabel());
186                                         userLabels.erase(list->getLabel());
187                                         
188                                         //restore real lastlabel to save below
189                                         list->setLabel(saveLabel);
190                         }
191                         
192                         lastLabel = list->getLabel();
193         
194                         delete list;
195                         list = input->getListVector();
196                 }
197                 
198                 //output error messages about any remaining user labels
199                 bool needToRun = false;
200                 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {  
201                         m->mothurOut("Your file does not include the label " + (*it)); 
202                         if (processedLabels.count(lastLabel) != 1) {
203                                 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
204                                 needToRun = true;
205                         }else {
206                                 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
207                         }
208                 }
209                 
210                 //run last label if you need to
211                 if (needToRun == true)  {
212                         if (list != NULL) {     delete list;    }
213                         list = input->getListVector(lastLabel);
214                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
215                         
216                         process(list);
217                         delete list;
218                         
219                         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        remove(outputNames[i].c_str());  } delete input; delete list; return 0; }
220                 }
221                 
222                 delete input;  
223                                 
224                 if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) {       remove(outputNames[i].c_str());  } return 0; }
225                 
226                 m->mothurOutEndLine();
227                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
228                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
229                 m->mothurOutEndLine();
230                 
231                 return 0;
232         }
233         catch(exception& e) {
234                 m->errorOut(e, "ClassifyOtuCommand", "execute");
235                 exit(1);
236         }
237 }
238
239 //**********************************************************************************************************************
240 int ClassifyOtuCommand::readNamesFile() {
241         try {
242                 
243                 ifstream inNames;
244                 m->openInputFile(namefile, inNames);
245                 
246                 string name, names;
247         
248                 while(!inNames.eof()){
249                         inNames >> name;                        //read from first column  A
250                         inNames >> names;               //read from second column  A,B,C,D
251                         m->gobble(inNames);
252                         
253                         //parse names into vector
254                         vector<string> theseNames;
255                         m->splitAtComma(names, theseNames);
256
257                         for (int i = 0; i < theseNames.size(); i++) {  nameMap[theseNames[i]] = name;  }
258                         
259                         if (m->control_pressed) { inNames.close(); nameMap.clear(); return 0; }
260                 }
261                 inNames.close();
262                 
263                 return 0;
264         }
265         catch(exception& e) {
266                 m->errorOut(e, "ClassifyOtuCommand", "readNamesFile");
267                 exit(1);
268         }
269 }
270 //**********************************************************************************************************************
271 int ClassifyOtuCommand::readTaxonomyFile() {
272         try {
273                 
274                 ifstream in;
275                 m->openInputFile(taxfile, in);
276                 
277                 string name, tax;
278         
279                 while(!in.eof()){
280                         in >> name >> tax;              
281                         m->gobble(in);
282                         
283                         //are there confidence scores, if so remove them
284                         if (tax.find_first_of('(') != -1) {  removeConfidences(tax);    }
285                         
286                         taxMap[name] = tax;
287                         
288                         if (m->control_pressed) { in.close(); taxMap.clear(); return 0; }
289                 }
290                 in.close();
291                 
292                 return 0;
293         }
294         catch(exception& e) {
295                 m->errorOut(e, "ClassifyOtuCommand", "readTaxonomyFile");
296                 exit(1);
297         }
298 }
299 //**********************************************************************************************************************
300 string ClassifyOtuCommand::findConsensusTaxonomy(int bin, ListVector* thisList, int& size) {
301         try{
302                 string conTax = "";
303                 vector<string> names;
304                 map<string, string>::iterator it;
305                 map<string, string>::iterator it2;
306
307                 //parse names into vector
308                 string binnames = thisList->get(bin);
309                 m->splitAtComma(binnames, names);
310
311                 //create a tree containing sequences from this bin
312                 PhyloTree* phylo = new PhyloTree();
313                 
314                 size = 0;
315                 for (int i = 0; i < names.size(); i++) {
316         
317                         //if namesfile include the names
318                         if (namefile != "") {
319         
320                                 //is this sequence in the name file - namemap maps seqName -> repSeqName
321                                 it2 = nameMap.find(names[i]);
322                                 
323                                 if (it2 == nameMap.end()) { //this name is not in name file, skip it
324                                         m->mothurOut(names[i] + " is not in your name file.  I will not include it in the consensus."); m->mothurOutEndLine();
325                                 }else{
326                                         
327                                         //is this sequence in the taxonomy file - look for repSeqName since we are assuming the taxonomy file is unique
328                                         it = taxMap.find(it2->second);
329                         
330                                         if (it == taxMap.end()) { //this name is not in taxonomy file, skip it
331                                         
332                                                 if (names[i] != it2->second) { m->mothurOut(names[i] + " is represented by " +  it2->second + " and is not in your taxonomy file.  I will not include it in the consensus."); m->mothurOutEndLine(); }
333                                                 else {  m->mothurOut(names[i] + " is not in your taxonomy file.  I will not include it in the consensus."); m->mothurOutEndLine(); }
334                                         }else{
335                                 
336                                                 //add seq to tree
337                                                 phylo->addSeqToTree(names[i], it->second);
338                                                 size++;
339                                         }
340                                 }
341                                 
342                         }else{
343                                 //is this sequence in the taxonomy file - look for repSeqName since we are assuming the taxonomy file is unique
344                                 it = taxMap.find(names[i]);
345                 
346                                 if (it == taxMap.end()) { //this name is not in taxonomy file, skip it
347                                         m->mothurOut(names[i] + " is not in your taxonomy file.  I will not include it in the consensus."); m->mothurOutEndLine();
348                                 }else{
349                                         //add seq to tree
350                                         phylo->addSeqToTree(names[i], it->second);
351                                         size++;
352                                 }
353                         }
354
355                         
356                         if (m->control_pressed) { delete phylo; return conTax; }
357                         
358                 }
359                 
360                 //build tree
361                 phylo->assignHeirarchyIDs(0);
362                 
363                 TaxNode currentNode = phylo->get(0);
364                 
365                 //at each level
366                 while (currentNode.children.size() != 0) { //you still have more to explore
367                 
368                         TaxNode bestChild;
369                         int bestChildSize = 0;
370                         
371                         //go through children
372                         for (map<string, int>::iterator itChild = currentNode.children.begin(); itChild != currentNode.children.end(); itChild++) {
373                                 
374                                 TaxNode temp = phylo->get(itChild->second);
375                                 
376                                 //select child with largest accesions - most seqs assigned to it
377                                 if (temp.accessions.size() > bestChildSize) {
378                                         bestChild = phylo->get(itChild->second);
379                                         bestChildSize = temp.accessions.size();
380                                 }
381                                 
382                         }
383                                 
384                         //is this taxonomy above cutoff
385                         int consensusConfidence = ceil((bestChildSize / (float) size) * 100);
386                         
387                         if (consensusConfidence >= cutoff) { //if yes, add it
388                                 if (probs) {
389                                         conTax += bestChild.name + "(" + toString(consensusConfidence) + ");";
390                                 }else{
391                                         conTax += bestChild.name + ";";
392                                 }
393                         }else{ //if no, quit
394                                 break;
395                         }
396                         
397                         //move down a level
398                         currentNode = bestChild;
399                 }
400                 
401                                 
402                 if (conTax == "") {  conTax = "unclassified;";  }
403                 
404                 delete phylo;   
405                 
406                 return conTax;
407                         
408         }
409         catch(exception& e) {
410                 m->errorOut(e, "ClassifyOtuCommand", "findConsensusTaxonomy");
411                 exit(1);
412         }
413 }
414
415 //**********************************************************************************************************************
416 int ClassifyOtuCommand::process(ListVector* processList) {
417         try{
418                 string conTax;
419                 int size;
420                 
421                 //create output file
422                 if (outputDir == "") { outputDir += m->hasPath(listfile); }
423                                 
424                 ofstream out;
425                 string outputFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + ".cons.taxonomy";
426                 m->openOutputFile(outputFile, out);
427                 outputNames.push_back(outputFile);
428                 
429                 //for each bin in the list vector
430                 for (int i = 0; i < processList->getNumBins(); i++) {
431         
432                         conTax  = findConsensusTaxonomy(i, processList, size);
433                 
434                         if (m->control_pressed) { out.close();  return 0; }
435                         
436                         //output to new names file
437                         out << (i+1) << '\t' << size << '\t' << conTax << endl;
438                 }
439
440                 out.close();
441                 
442                 return 0;
443
444         }
445         catch(exception& e) {
446                 m->errorOut(e, "ClassifyOtuCommand", "process");
447                 exit(1);
448         }
449 }
450 /**************************************************************************************************/
451 void ClassifyOtuCommand::removeConfidences(string& tax) {
452         try {
453                 
454                 string taxon;
455                 string newTax = "";
456                 
457                 while (tax.find_first_of(';') != -1) {
458                         //get taxon
459                         taxon = tax.substr(0,tax.find_first_of(';'));
460                         
461                         int pos = taxon.find_first_of('(');
462                         if (pos != -1) {
463                                 taxon = taxon.substr(0, pos); //rip off confidence 
464                         }
465                         
466                         taxon += ";";
467                         
468                         tax = tax.substr(tax.find_first_of(';')+1, tax.length());
469                         newTax += taxon;
470                 }
471                 
472                 tax = newTax;
473         }
474         catch(exception& e) {
475                 m->errorOut(e, "ClassifyOtuCommand", "removeConfidences");
476                 exit(1);
477         }
478 }
479 //**********************************************************************************************************************
480
481