]> git.donarmstrong.com Git - mothur.git/blob - sensspeccommand.cpp
removed read.dist, read.otu, read.tree and globaldata. added current to defaults...
[mothur.git] / sensspeccommand.cpp
1 /*
2  *  sensspeccommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 7/6/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "sensspeccommand.h"
11
12 //**********************************************************************************************************************
13 vector<string> SensSpecCommand::setParameters(){        
14         try {
15                 CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist);
16                 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "none",false,false); parameters.push_back(pphylip);
17                 //CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName",false,false); parameters.push_back(pname);
18                 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "none",false,false); parameters.push_back(pcolumn);
19                 CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
20                 CommandParameter pcutoff("cutoff", "Number", "", "-1.00", "", "", "",false,false); parameters.push_back(pcutoff);
21                 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
22                 CommandParameter phard("hard", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(phard);
23                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
24                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
25                 
26                 vector<string> myArray;
27                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
28                 return myArray;
29         }
30         catch(exception& e) {
31                 m->errorOut(e, "SensSpecCommand", "setParameters");
32                 exit(1);
33         }
34 }
35 //**********************************************************************************************************************
36 string SensSpecCommand::getHelpString(){        
37         try {
38                 string helpString = "";
39                 helpString += "The get.oturep command parameters are phylip, column, list, fasta, name, group, large, weighted, cutoff, precision, groups, sorted and label.  The fasta and list parameters are required, as well as phylip or column and name, unless you have valid current files.\n";
40                 helpString += "The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n";
41                 helpString += "The phylip or column parameter is required, but only one may be used.  If you use a column file the name filename is required. \n";
42                 helpString += "If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n";
43                 helpString += "The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n";
44                 helpString += "Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n";
45                 helpString += "The default value for label is all labels in your inputfile.\n";
46                 helpString += "The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n";
47                 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n";
48                 helpString += "The weighted parameter allows you to indicate that want to find the weighted representative. You must provide a namesfile to set weighted to true.  The default value is false.\n";
49                 helpString += "The representative is found by selecting the sequence that has the smallest total distance to all other sequences in the OTU. If a tie occurs the smallest average distance is used.\n";
50                 helpString += "For weighted = false, mothur assumes the distance file contains only unique sequences, the list file may contain all sequences, but only the uniques are considered to become the representative. If your distance file contains all the sequences it would become weighted=true.\n";
51                 helpString += "For weighted = true, mothur assumes the distance file contains only unique sequences, the list file must contain all sequences, all sequences are considered to become the representative, but unique name will be used in the output for consistency.\n";
52                 helpString += "If your distance file contains all the sequence and you do not provide a name file, the weighted representative will be given, unless your listfile is unique. If you provide a namefile, then you can select weighted or unweighted.\n";
53                 helpString += "The group parameter allows you provide a group file.\n";
54                 helpString += "The groups parameter allows you to indicate that you want representative sequences for each group specified for each OTU, group name should be separated by dashes. ex. groups=A-B-C.\n";
55                 helpString += "The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n";
56                 helpString += "If you provide a groupfile, then it also appends the names of the groups present in that bin.\n";
57                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n";
58                 return helpString;
59         }
60         catch(exception& e) {
61                 m->errorOut(e, "SensSpecCommand", "getHelpString");
62                 exit(1);
63         }
64 }
65 //**********************************************************************************************************************
66 SensSpecCommand::SensSpecCommand(){     
67         try {
68                 abort = true; calledHelp = true; 
69                 setParameters();
70                 vector<string> tempOutNames;
71                 outputTypes["sensspec"] = tempOutNames;
72         }
73         catch(exception& e) {
74                 m->errorOut(e, "SensSpecCommand", "SensSpecCommand");
75                 exit(1);
76         }
77 }
78 //***************************************************************************************************************
79
80 SensSpecCommand::SensSpecCommand(string option)  {
81         try {
82                 
83                 abort = false; calledHelp = false;   
84                 
85                 //allow user to run help
86                 if(option == "help") { help(); abort = true; calledHelp = true; }
87                 
88                 else {
89                         string temp;
90
91                         //valid paramters for this command
92                         string AlignArray[] =  {"list", "phylip", "column", "hard", "label", "cutoff", "precision", "outputdir", "inputdir"};
93                         
94                         vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
95                         
96                         OptionParser parser(option);
97                         map<string,string> parameters = parser.getParameters();
98                         
99                         ValidParameters validParameter;
100                         map<string,string>::iterator it;
101                         
102                         //check to make sure all parameters are valid for command
103                         for (it = parameters.begin(); it != parameters.end(); it++) { 
104                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
105                         }
106                         
107                         //initialize outputTypes
108                         vector<string> tempOutNames;
109                         outputTypes["sensspec"] = tempOutNames;
110                         
111                         //if the user changes the input directory command factory will send this info to us in the output parameter 
112                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
113                         if (inputDir == "not found"){   inputDir = "";          }
114                         else {
115                                 string path;
116                                 it = parameters.find("list");
117                                 //user has given a template file
118                                 if(it != parameters.end()){ 
119                                         path = m->hasPath(it->second);
120                                         //if the user has not given a path then, add inputdir. else leave path alone.
121                                         if (path == "") {       parameters["list"] = inputDir + it->second;             }
122                                 }
123                                 
124                                 it = parameters.find("phylip");
125                                 //user has given a template file
126                                 if(it != parameters.end()){ 
127                                         path = m->hasPath(it->second);
128                                         //if the user has not given a path then, add inputdir. else leave path alone.
129                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
130                                 }
131                                 
132                                 it = parameters.find("column");
133                                 //user has given a template file
134                                 if(it != parameters.end()){ 
135                                         path = m->hasPath(it->second);
136                                         //if the user has not given a path then, add inputdir. else leave path alone.
137                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
138                                 }
139                                 
140                                 //it = parameters.find("name");
141                                 //user has given a template file
142                                 //if(it != parameters.end()){ 
143                                         //path = m->hasPath(it->second);
144                                         //if the user has not given a path then, add inputdir. else leave path alone.
145                                         //if (path == "") {     parameters["name"] = inputDir + it->second;             }
146                                 //}
147                                 
148                         }
149                         //check for required parameters
150                         listFile = validParameter.validFile(parameters, "list", true);
151                         if (listFile == "not found") {          
152                                 listFile = m->getListFile(); 
153                                 if (listFile != "") { m->mothurOut("Using " + listFile + " as input file for the list parameter."); m->mothurOutEndLine(); }
154                                 else {  m->mothurOut("You have no current list file and the list parameter is required."); m->mothurOutEndLine(); abort = true; }
155                         }
156                         else if (listFile == "not open") { abort = true; }      
157                         
158                         phylipfile = validParameter.validFile(parameters, "phylip", true);
159                         if (phylipfile == "not found") { phylipfile = "";  }
160                         else if (phylipfile == "not open") { abort = true; }    
161                         else { distFile = phylipfile; format = "phylip";   }
162                         
163                         columnfile = validParameter.validFile(parameters, "column", true);
164                         if (columnfile == "not found") { columnfile = ""; }
165                         else if (columnfile == "not open") { abort = true; }    
166                         else { distFile = columnfile; format = "column";   }
167                         
168                         if ((phylipfile == "") && (columnfile == "")) { //is there are current file available for either of these?
169                                 //give priority to column, then phylip
170                                 columnfile = m->getColumnFile(); 
171                                 if (columnfile != "") {  distFile = columnfile; format = "column";  m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
172                                 else { 
173                                         phylipfile = m->getPhylipFile(); 
174                                         if (phylipfile != "") {  distFile = phylipfile; format = "phylip"; m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
175                                         else { 
176                                                 m->mothurOut("No valid current files. You must provide a phylip or column file."); m->mothurOutEndLine(); 
177                                                 abort = true;
178                                         }
179                                 }
180                         }else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a sens.spec command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
181                         
182                         
183                         //if the user changes the output directory command factory will send this info to us in the output parameter 
184                         outputDir = validParameter.validFile(parameters, "outputdir", false);
185                         if (outputDir == "not found"){  
186                                 outputDir = ""; 
187                                 outputDir += m->hasPath(listFile); //if user entered a file with a path then preserve it        
188                         }
189
190                         //check for optional parameter and set defaults
191                         // ...at some point should added some additional type checking...
192                         temp = validParameter.validFile(parameters, "hard", false);
193                         if (temp == "not found"){       hard = 0;       }
194                         else if(!m->isTrue(temp))       {       hard = 0;       }
195                         else if(m->isTrue(temp))        {       hard = 1;       }
196                         
197 //                      temp = validParameter.validFile(parameters, "name", true);
198 //                      if (temp == "not found")        {       nameFile = "";          }
199 //                      else if(temp == "not open")     {       abort = true;           }
200 //                      else                                            {       nameFile = temp;        }
201 //                      cout << "name:\t" << nameFile << endl;
202
203                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found") { temp = "-1.00"; }
204                         convert(temp, cutoff);  
205 //                      cout << cutoff << endl;
206                         
207                         temp = validParameter.validFile(parameters, "precision", false);        if (temp == "not found") { temp = "100"; }
208                         convert(temp, precision);  
209 //                      cout << precision << endl;
210                         
211                         lineLabel = validParameter.validFile(parameters, "label", false);       if (lineLabel == "not found") { lineLabel = ""; }
212                         
213                         sensSpecFileName = listFile.substr(0,listFile.find_last_of('.')) + ".sensspec";
214                 }
215         }
216         catch(exception& e) {
217                 m->errorOut(e, "SensSpecCommand", "SensSpecCommand");
218                 exit(1);
219         }
220 }
221 //***************************************************************************************************************
222
223 int SensSpecCommand::execute(){
224         try{
225                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
226
227                 setUpOutput();
228                 outputNames.push_back(sensSpecFileName); outputTypes["sensspec"].push_back(sensSpecFileName);
229                 if(format == "phylip")          {       processPhylip();        }
230                 else if(format == "column")     {       processColumn();        }
231                 
232                 
233                 return 0;       
234         }
235         catch(exception& e) {
236                 m->errorOut(e, "SensSpecCommand", "execute");
237                 exit(1);
238         }
239 }
240
241 //***************************************************************************************************************
242
243 void SensSpecCommand::processPhylip(){
244         try{
245                 //probably need some checking to confirm that the names in the distance matrix are the same as those in the list file
246                 
247                 ifstream inputListFile;
248                 m->openInputFile(listFile, inputListFile);
249                 
250                 string origCutoff = "";
251                 bool getCutoff = 0;
252                 if(cutoff == -1.00)     {       getCutoff = 1;                                                                                                                  }
253                 else                            {       origCutoff = toString(cutoff);  cutoff += (0.49 / double(precision));   }
254                 
255                 string label;
256                 int numOTUs;
257
258                 map<string, int> seqMap;
259                 string seqList;
260                 
261                 while(inputListFile){
262                         inputListFile >> label >> numOTUs;
263                         for(int i=0;i<numOTUs;i++){
264                                 inputListFile >> seqList;
265                                 int seqListLength = seqList.length();
266                                 string seqName = "";
267                                 for(int j=0;j<seqListLength;j++){
268                                         
269                                         if(seqList[j] == ','){
270                                                 seqMap[seqName] = i;
271                                                 seqName = "";
272                                         }
273                                         else{
274                                                 seqName += seqList[j];
275                                         }
276                                         
277                                 }
278                                 seqMap[seqName] = i;
279                         }
280                         m->gobble(inputListFile);
281                 
282                         int lNumSeqs = seqMap.size();
283                         int pNumSeqs = 0;
284
285                         ifstream phylipFile;
286                         m->openInputFile(distFile, phylipFile);
287                         phylipFile >> pNumSeqs;
288                         if(pNumSeqs != lNumSeqs){       cout << "numSeq mismatch!" << endl;     }
289                         
290                         string seqName;
291                         double distance;
292                         vector<int> otuIndices(lNumSeqs, -1);
293                                 
294                         truePositives = 0;
295                         falsePositives = 0;
296                         trueNegatives = 0;
297                         falseNegatives = 0;
298                         
299                         if(getCutoff == 1){
300                                 if(label != "unique"){
301                                         origCutoff = label;
302                                         convert(label, cutoff);
303                                         if(hard == 0){  cutoff += (0.49 / double(precision));   }               
304                                 }
305                                 else{
306                                         origCutoff = "unique";
307                                         cutoff = 0.0000;
308                                 }
309                         }
310                                    
311                         cout << label << endl;
312                         
313                         for(int i=0;i<lNumSeqs;i++){
314                                 phylipFile >> seqName;
315                                 otuIndices[i] = seqMap[seqName];
316                                 
317                                 for(int j=0;j<i;j++){
318                                         phylipFile >> distance;
319                                         
320                                         if(distance <= cutoff){
321                                                 if(otuIndices[i] == otuIndices[j])      {       truePositives++;        }
322                                                 else                                                            {       falseNegatives++;       }
323                                         }
324                                         else{
325                                                 if(otuIndices[i] == otuIndices[j])      {       falsePositives++;       }
326                                                 else                                                            {       trueNegatives++;        }
327                                         }
328                                 }
329                         }
330                         phylipFile.close();
331                         
332                         outputStatistics(label, origCutoff);
333                 }
334                 inputListFile.close();
335
336         }
337         catch(exception& e) {
338                 m->errorOut(e, "SensSpecCommand", "processPhylip");
339                 exit(1);
340         }
341 }
342
343 //***************************************************************************************************************
344
345 void SensSpecCommand::processColumn(){
346         try{            
347                 ifstream inputListFile;
348                 m->openInputFile(listFile, inputListFile);
349                 
350                 string origCutoff = "";
351                 bool getCutoff = 0;
352                 if(cutoff == -1.00)     {       getCutoff = 1;                                                                                                                  }
353                 else                            {       origCutoff = toString(cutoff);  cutoff += (0.49 / double(precision));   }
354                 
355                 set<string> seqPairSet;
356                 
357                 string label, seqList;
358                 int numOTUs;
359                 int numSeqs;
360                 
361                 while(inputListFile){
362                         numSeqs = 0;
363                         
364                         inputListFile >> label >> numOTUs;
365                         for(int i=0;i<numOTUs;i++){
366                                 
367                                 vector<string> seqNameVector;
368                                 
369                                 inputListFile >> seqList;
370                                 int seqListLength = seqList.length();
371                                 string seqName = "";
372                                 for(int j=0;j<seqListLength;j++){
373                                         
374                                         if(seqList[j] == ','){
375                                                 seqNameVector.push_back(seqName);
376                                                 seqName = "";
377                                         }
378                                         else{
379                                                 seqName += seqList[j];
380                                         }
381                                 }
382                                 seqNameVector.push_back(seqName);
383
384                                 numSeqs += seqNameVector.size();
385                                 
386                                 int numSeqsInOTU = seqNameVector.size();
387                                 for(int j=0;j<numSeqsInOTU;j++){
388                                         string seqPairString = "";                              
389                                         for(int k=0;k<j;k++){
390                                                 if(seqNameVector[j] < seqNameVector[k]) {       seqPairString = seqNameVector[j] + '\t' + seqNameVector[k];     }
391                                                 else                                                                    {       seqPairString = seqNameVector[k] + '\t' + seqNameVector[j];     }
392                                                 seqPairSet.insert(seqPairString);
393                                         }
394                                 }
395                         }
396                         m->gobble(inputListFile);
397                         
398                         int numDists = (numSeqs * (numSeqs-1) / 2);
399
400                         ifstream columnFile;
401                         m->openInputFile(distFile, columnFile);
402                         string seqNameA, seqNameB, seqPairString;
403                         double distance;
404                         
405                         truePositives = 0;
406                         falsePositives = 0;
407                         trueNegatives = numDists;
408                         falseNegatives = 0;
409                         
410                         if(getCutoff == 1){
411                                 if(label != "unique"){
412                                         origCutoff = label;
413                                         convert(label, cutoff);
414                                         if(hard == 0){  cutoff += (0.49 / double(precision));   }               
415                                 }
416                                 else{
417                                         origCutoff = "unique";
418                                         cutoff = 0.0000;
419                                 }
420                         }
421                         
422                         cout << label << endl;
423                         
424                         while(columnFile){
425                                 columnFile >> seqNameA >> seqNameB >> distance;
426                                 if(seqNameA < seqNameB) {       seqPairString = seqNameA + '\t' + seqNameB;     }
427                                 else                                    {       seqPairString = seqNameB + '\t' + seqNameA;     }
428
429                                 set<string>::iterator it = seqPairSet.find(seqPairString);
430                         
431                                 if(distance <= cutoff){
432                                         if(it != seqPairSet.end()){
433                                                 truePositives++;
434                                                 seqPairSet.erase(it);   
435                                         }
436                                         else{
437                                                 falseNegatives++;
438                                         }
439                                         trueNegatives--;
440                                 }
441                                 else if(it != seqPairSet.end()){        
442                                         falsePositives++;
443                                         trueNegatives--;
444                                         seqPairSet.erase(it);   
445                                 }
446                                 
447                                 m->gobble(columnFile);
448                         }
449                         falsePositives += seqPairSet.size();
450                         
451                         outputStatistics(label, origCutoff);
452                 }
453         }
454         catch(exception& e) {
455                 m->errorOut(e, "SensSpecCommand", "processColumn");
456                 exit(1);
457         }
458 }
459
460 //***************************************************************************************************************
461
462 void SensSpecCommand::setUpOutput(){
463         try{            
464                 ofstream sensSpecFile;
465                 m->openOutputFile(sensSpecFileName, sensSpecFile);
466                 
467                 sensSpecFile << "label\tcutoff\ttp\ttn\tfp\tfn\tsensitivity\tspecificity\tppv\tnpv\tfdr\taccuracy\tmcc\tf1score\n";
468
469                 sensSpecFile.close();
470         }
471         catch(exception& e) {
472                 m->errorOut(e, "SensSpecCommand", "setUpOutput");
473                 exit(1);
474         }
475 }
476
477 //***************************************************************************************************************
478
479 void SensSpecCommand::outputStatistics(string label, string cutoff){
480         try{            
481                 double tp = (double) truePositives;
482                 double fp = (double) falsePositives;
483                 double tn = (double) trueNegatives;
484                 double fn = (double) falseNegatives;
485                 
486                 double p = tp + fn;
487                 double n = fp + tn;
488                 double pPrime = tp + fp;
489                 double nPrime = tn + fn;
490                 
491                 double sensitivity = tp / p;    
492                 double specificity = tn / n;
493                 double positivePredictiveValue = tp / pPrime;
494                 double negativePredictiveValue = tn / nPrime;
495                 double falseDiscoveryRate = fp / pPrime;
496                 
497                 double accuracy = (tp + tn) / (p + n);
498                 double matthewsCorrCoef = (tp * tn - fp * fn) / sqrt(p * n * pPrime * nPrime);  if(p == 0 || n == 0){   matthewsCorrCoef = 0;   }
499                 double f1Score = 2.0 * tp / (p + pPrime);
500                 
501                 
502                 if(p == 0)                      {       sensitivity = 0;        matthewsCorrCoef = 0;   }
503                 if(n == 0)                      {       specificity = 0;        matthewsCorrCoef = 0;   }
504                 if(p + n == 0)          {       accuracy = 0;                                                           }
505                 if(p + pPrime == 0)     {       f1Score = 0;                                                            }
506                 if(pPrime == 0)         {       positivePredictiveValue = 0;    falseDiscoveryRate = 0; matthewsCorrCoef = 0;   }
507                 if(nPrime == 0)         {       negativePredictiveValue = 0;    matthewsCorrCoef = 0;                                                   }
508                 
509                 ofstream sensSpecFile;
510                 m->openOutputFileAppend(sensSpecFileName, sensSpecFile);
511                 
512                 sensSpecFile << label << '\t' << cutoff << '\t';
513                 sensSpecFile << truePositives << '\t' << trueNegatives << '\t' << falsePositives << '\t' << falseNegatives << '\t';
514                 sensSpecFile << setprecision(4);
515                 sensSpecFile << sensitivity << '\t' << specificity << '\t' << positivePredictiveValue << '\t' << negativePredictiveValue << '\t';
516                 sensSpecFile << falseDiscoveryRate << '\t' << accuracy << '\t' << matthewsCorrCoef << '\t' << f1Score << endl;
517                 
518                 sensSpecFile.close();
519         }
520         catch(exception& e) {
521                 m->errorOut(e, "SensSpecCommand", "outputStatistics");
522                 exit(1);
523         }
524 }
525
526 //***************************************************************************************************************
527
528
529