5 * Created by Pat Schloss on 7/6/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "sensspeccommand.h"
12 //**********************************************************************************************************************
13 vector<string> SensSpecCommand::setParameters(){
15 CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist);
16 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "none",false,false); parameters.push_back(pphylip);
17 //CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName",false,false); parameters.push_back(pname);
18 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "none",false,false); parameters.push_back(pcolumn);
19 CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
20 CommandParameter pcutoff("cutoff", "Number", "", "-1.00", "", "", "",false,false); parameters.push_back(pcutoff);
21 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
22 CommandParameter phard("hard", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(phard);
23 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
24 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
26 vector<string> myArray;
27 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
31 m->errorOut(e, "SensSpecCommand", "setParameters");
35 //**********************************************************************************************************************
36 string SensSpecCommand::getHelpString(){
38 string helpString = "";
39 helpString += "The get.oturep command parameters are phylip, column, list, fasta, name, group, large, weighted, cutoff, precision, groups, sorted and label. The fasta and list parameters are required, as well as phylip or column and name, unless you have valid current files.\n";
40 helpString += "The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n";
41 helpString += "The phylip or column parameter is required, but only one may be used. If you use a column file the name filename is required. \n";
42 helpString += "If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n";
43 helpString += "The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n";
44 helpString += "Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n";
45 helpString += "The default value for label is all labels in your inputfile.\n";
46 helpString += "The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n";
47 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n";
48 helpString += "The weighted parameter allows you to indicate that want to find the weighted representative. You must provide a namesfile to set weighted to true. The default value is false.\n";
49 helpString += "The representative is found by selecting the sequence that has the smallest total distance to all other sequences in the OTU. If a tie occurs the smallest average distance is used.\n";
50 helpString += "For weighted = false, mothur assumes the distance file contains only unique sequences, the list file may contain all sequences, but only the uniques are considered to become the representative. If your distance file contains all the sequences it would become weighted=true.\n";
51 helpString += "For weighted = true, mothur assumes the distance file contains only unique sequences, the list file must contain all sequences, all sequences are considered to become the representative, but unique name will be used in the output for consistency.\n";
52 helpString += "If your distance file contains all the sequence and you do not provide a name file, the weighted representative will be given, unless your listfile is unique. If you provide a namefile, then you can select weighted or unweighted.\n";
53 helpString += "The group parameter allows you provide a group file.\n";
54 helpString += "The groups parameter allows you to indicate that you want representative sequences for each group specified for each OTU, group name should be separated by dashes. ex. groups=A-B-C.\n";
55 helpString += "The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n";
56 helpString += "If you provide a groupfile, then it also appends the names of the groups present in that bin.\n";
57 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n";
61 m->errorOut(e, "SensSpecCommand", "getHelpString");
65 //**********************************************************************************************************************
66 SensSpecCommand::SensSpecCommand(){
68 abort = true; calledHelp = true;
70 vector<string> tempOutNames;
71 outputTypes["sensspec"] = tempOutNames;
74 m->errorOut(e, "SensSpecCommand", "SensSpecCommand");
78 //***************************************************************************************************************
80 SensSpecCommand::SensSpecCommand(string option) {
83 abort = false; calledHelp = false;
85 //allow user to run help
86 if(option == "help") { help(); abort = true; calledHelp = true; }
91 //valid paramters for this command
92 string AlignArray[] = {"list", "phylip", "column", "hard", "label", "cutoff", "precision", "outputdir", "inputdir"};
94 vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
96 OptionParser parser(option);
97 map<string,string> parameters = parser.getParameters();
99 ValidParameters validParameter;
100 map<string,string>::iterator it;
102 //check to make sure all parameters are valid for command
103 for (it = parameters.begin(); it != parameters.end(); it++) {
104 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
107 //initialize outputTypes
108 vector<string> tempOutNames;
109 outputTypes["sensspec"] = tempOutNames;
111 //if the user changes the input directory command factory will send this info to us in the output parameter
112 string inputDir = validParameter.validFile(parameters, "inputdir", false);
113 if (inputDir == "not found"){ inputDir = ""; }
116 it = parameters.find("list");
117 //user has given a template file
118 if(it != parameters.end()){
119 path = m->hasPath(it->second);
120 //if the user has not given a path then, add inputdir. else leave path alone.
121 if (path == "") { parameters["list"] = inputDir + it->second; }
124 it = parameters.find("phylip");
125 //user has given a template file
126 if(it != parameters.end()){
127 path = m->hasPath(it->second);
128 //if the user has not given a path then, add inputdir. else leave path alone.
129 if (path == "") { parameters["phylip"] = inputDir + it->second; }
132 it = parameters.find("column");
133 //user has given a template file
134 if(it != parameters.end()){
135 path = m->hasPath(it->second);
136 //if the user has not given a path then, add inputdir. else leave path alone.
137 if (path == "") { parameters["column"] = inputDir + it->second; }
140 //it = parameters.find("name");
141 //user has given a template file
142 //if(it != parameters.end()){
143 //path = m->hasPath(it->second);
144 //if the user has not given a path then, add inputdir. else leave path alone.
145 //if (path == "") { parameters["name"] = inputDir + it->second; }
149 //check for required parameters
150 listFile = validParameter.validFile(parameters, "list", true);
151 if (listFile == "not found") {
152 listFile = m->getListFile();
153 if (listFile != "") { m->mothurOut("Using " + listFile + " as input file for the list parameter."); m->mothurOutEndLine(); }
154 else { m->mothurOut("You have no current list file and the list parameter is required."); m->mothurOutEndLine(); abort = true; }
156 else if (listFile == "not open") { abort = true; }
158 phylipfile = validParameter.validFile(parameters, "phylip", true);
159 if (phylipfile == "not found") { phylipfile = ""; }
160 else if (phylipfile == "not open") { abort = true; }
161 else { distFile = phylipfile; format = "phylip"; }
163 columnfile = validParameter.validFile(parameters, "column", true);
164 if (columnfile == "not found") { columnfile = ""; }
165 else if (columnfile == "not open") { abort = true; }
166 else { distFile = columnfile; format = "column"; }
168 if ((phylipfile == "") && (columnfile == "")) { //is there are current file available for either of these?
169 //give priority to column, then phylip
170 columnfile = m->getColumnFile();
171 if (columnfile != "") { distFile = columnfile; format = "column"; m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
173 phylipfile = m->getPhylipFile();
174 if (phylipfile != "") { distFile = phylipfile; format = "phylip"; m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
176 m->mothurOut("No valid current files. You must provide a phylip or column file."); m->mothurOutEndLine();
180 }else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a sens.spec command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
183 //if the user changes the output directory command factory will send this info to us in the output parameter
184 outputDir = validParameter.validFile(parameters, "outputdir", false);
185 if (outputDir == "not found"){
187 outputDir += m->hasPath(listFile); //if user entered a file with a path then preserve it
190 //check for optional parameter and set defaults
191 // ...at some point should added some additional type checking...
192 temp = validParameter.validFile(parameters, "hard", false);
193 if (temp == "not found"){ hard = 0; }
194 else if(!m->isTrue(temp)) { hard = 0; }
195 else if(m->isTrue(temp)) { hard = 1; }
197 // temp = validParameter.validFile(parameters, "name", true);
198 // if (temp == "not found") { nameFile = ""; }
199 // else if(temp == "not open") { abort = true; }
200 // else { nameFile = temp; }
201 // cout << "name:\t" << nameFile << endl;
203 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "-1.00"; }
204 convert(temp, cutoff);
205 // cout << cutoff << endl;
207 temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
208 convert(temp, precision);
209 // cout << precision << endl;
211 lineLabel = validParameter.validFile(parameters, "label", false); if (lineLabel == "not found") { lineLabel = ""; }
213 sensSpecFileName = listFile.substr(0,listFile.find_last_of('.')) + ".sensspec";
216 catch(exception& e) {
217 m->errorOut(e, "SensSpecCommand", "SensSpecCommand");
221 //***************************************************************************************************************
223 int SensSpecCommand::execute(){
225 if (abort == true) { if (calledHelp) { return 0; } return 2; }
228 outputNames.push_back(sensSpecFileName); outputTypes["sensspec"].push_back(sensSpecFileName);
229 if(format == "phylip") { processPhylip(); }
230 else if(format == "column") { processColumn(); }
235 catch(exception& e) {
236 m->errorOut(e, "SensSpecCommand", "execute");
241 //***************************************************************************************************************
243 void SensSpecCommand::processPhylip(){
245 //probably need some checking to confirm that the names in the distance matrix are the same as those in the list file
247 ifstream inputListFile;
248 m->openInputFile(listFile, inputListFile);
250 string origCutoff = "";
252 if(cutoff == -1.00) { getCutoff = 1; }
253 else { origCutoff = toString(cutoff); cutoff += (0.49 / double(precision)); }
258 map<string, int> seqMap;
261 while(inputListFile){
262 inputListFile >> label >> numOTUs;
263 for(int i=0;i<numOTUs;i++){
264 inputListFile >> seqList;
265 int seqListLength = seqList.length();
267 for(int j=0;j<seqListLength;j++){
269 if(seqList[j] == ','){
274 seqName += seqList[j];
280 m->gobble(inputListFile);
282 int lNumSeqs = seqMap.size();
286 m->openInputFile(distFile, phylipFile);
287 phylipFile >> pNumSeqs;
288 if(pNumSeqs != lNumSeqs){ cout << "numSeq mismatch!" << endl; }
292 vector<int> otuIndices(lNumSeqs, -1);
300 if(label != "unique"){
302 convert(label, cutoff);
303 if(hard == 0){ cutoff += (0.49 / double(precision)); }
306 origCutoff = "unique";
311 cout << label << endl;
313 for(int i=0;i<lNumSeqs;i++){
314 phylipFile >> seqName;
315 otuIndices[i] = seqMap[seqName];
317 for(int j=0;j<i;j++){
318 phylipFile >> distance;
320 if(distance <= cutoff){
321 if(otuIndices[i] == otuIndices[j]) { truePositives++; }
322 else { falseNegatives++; }
325 if(otuIndices[i] == otuIndices[j]) { falsePositives++; }
326 else { trueNegatives++; }
332 outputStatistics(label, origCutoff);
334 inputListFile.close();
337 catch(exception& e) {
338 m->errorOut(e, "SensSpecCommand", "processPhylip");
343 //***************************************************************************************************************
345 void SensSpecCommand::processColumn(){
347 ifstream inputListFile;
348 m->openInputFile(listFile, inputListFile);
350 string origCutoff = "";
352 if(cutoff == -1.00) { getCutoff = 1; }
353 else { origCutoff = toString(cutoff); cutoff += (0.49 / double(precision)); }
355 set<string> seqPairSet;
357 string label, seqList;
361 while(inputListFile){
364 inputListFile >> label >> numOTUs;
365 for(int i=0;i<numOTUs;i++){
367 vector<string> seqNameVector;
369 inputListFile >> seqList;
370 int seqListLength = seqList.length();
372 for(int j=0;j<seqListLength;j++){
374 if(seqList[j] == ','){
375 seqNameVector.push_back(seqName);
379 seqName += seqList[j];
382 seqNameVector.push_back(seqName);
384 numSeqs += seqNameVector.size();
386 int numSeqsInOTU = seqNameVector.size();
387 for(int j=0;j<numSeqsInOTU;j++){
388 string seqPairString = "";
389 for(int k=0;k<j;k++){
390 if(seqNameVector[j] < seqNameVector[k]) { seqPairString = seqNameVector[j] + '\t' + seqNameVector[k]; }
391 else { seqPairString = seqNameVector[k] + '\t' + seqNameVector[j]; }
392 seqPairSet.insert(seqPairString);
396 m->gobble(inputListFile);
398 int numDists = (numSeqs * (numSeqs-1) / 2);
401 m->openInputFile(distFile, columnFile);
402 string seqNameA, seqNameB, seqPairString;
407 trueNegatives = numDists;
411 if(label != "unique"){
413 convert(label, cutoff);
414 if(hard == 0){ cutoff += (0.49 / double(precision)); }
417 origCutoff = "unique";
422 cout << label << endl;
425 columnFile >> seqNameA >> seqNameB >> distance;
426 if(seqNameA < seqNameB) { seqPairString = seqNameA + '\t' + seqNameB; }
427 else { seqPairString = seqNameB + '\t' + seqNameA; }
429 set<string>::iterator it = seqPairSet.find(seqPairString);
431 if(distance <= cutoff){
432 if(it != seqPairSet.end()){
434 seqPairSet.erase(it);
441 else if(it != seqPairSet.end()){
444 seqPairSet.erase(it);
447 m->gobble(columnFile);
449 falsePositives += seqPairSet.size();
451 outputStatistics(label, origCutoff);
454 catch(exception& e) {
455 m->errorOut(e, "SensSpecCommand", "processColumn");
460 //***************************************************************************************************************
462 void SensSpecCommand::setUpOutput(){
464 ofstream sensSpecFile;
465 m->openOutputFile(sensSpecFileName, sensSpecFile);
467 sensSpecFile << "label\tcutoff\ttp\ttn\tfp\tfn\tsensitivity\tspecificity\tppv\tnpv\tfdr\taccuracy\tmcc\tf1score\n";
469 sensSpecFile.close();
471 catch(exception& e) {
472 m->errorOut(e, "SensSpecCommand", "setUpOutput");
477 //***************************************************************************************************************
479 void SensSpecCommand::outputStatistics(string label, string cutoff){
481 double tp = (double) truePositives;
482 double fp = (double) falsePositives;
483 double tn = (double) trueNegatives;
484 double fn = (double) falseNegatives;
488 double pPrime = tp + fp;
489 double nPrime = tn + fn;
491 double sensitivity = tp / p;
492 double specificity = tn / n;
493 double positivePredictiveValue = tp / pPrime;
494 double negativePredictiveValue = tn / nPrime;
495 double falseDiscoveryRate = fp / pPrime;
497 double accuracy = (tp + tn) / (p + n);
498 double matthewsCorrCoef = (tp * tn - fp * fn) / sqrt(p * n * pPrime * nPrime); if(p == 0 || n == 0){ matthewsCorrCoef = 0; }
499 double f1Score = 2.0 * tp / (p + pPrime);
502 if(p == 0) { sensitivity = 0; matthewsCorrCoef = 0; }
503 if(n == 0) { specificity = 0; matthewsCorrCoef = 0; }
504 if(p + n == 0) { accuracy = 0; }
505 if(p + pPrime == 0) { f1Score = 0; }
506 if(pPrime == 0) { positivePredictiveValue = 0; falseDiscoveryRate = 0; matthewsCorrCoef = 0; }
507 if(nPrime == 0) { negativePredictiveValue = 0; matthewsCorrCoef = 0; }
509 ofstream sensSpecFile;
510 m->openOutputFileAppend(sensSpecFileName, sensSpecFile);
512 sensSpecFile << label << '\t' << cutoff << '\t';
513 sensSpecFile << truePositives << '\t' << trueNegatives << '\t' << falsePositives << '\t' << falseNegatives << '\t';
514 sensSpecFile << setprecision(4);
515 sensSpecFile << sensitivity << '\t' << specificity << '\t' << positivePredictiveValue << '\t' << negativePredictiveValue << '\t';
516 sensSpecFile << falseDiscoveryRate << '\t' << accuracy << '\t' << matthewsCorrCoef << '\t' << f1Score << endl;
518 sensSpecFile.close();
520 catch(exception& e) {
521 m->errorOut(e, "SensSpecCommand", "outputStatistics");
526 //***************************************************************************************************************