5 * Created by Pat Schloss on 7/6/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "sensspeccommand.h"
12 //**********************************************************************************************************************
// Declares the options accepted by sens.spec and collects their names.
// Every CommandParameter is pushed onto `parameters`; the loop at the end copies each
// entry's `name` into `myArray`.
// NOTE(review): this listing omits some original lines (the embedded numbering jumps, e.g.
// 26 -> 30), so the try/catch framing and the return statement are not visible here.
// Presumably `myArray` is what gets returned - confirm against the full file.
13 vector<string> SensSpecCommand::setParameters(){
// Input files: a list file plus a phylip or column distance file.
15 CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none","sensspec",false,true,true); parameters.push_back(plist);
16 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "none","",false,false); parameters.push_back(pphylip);
17 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "none","",false,false); parameters.push_back(pcolumn);
// Label selection and numeric/boolean options.
// NOTE(review): "-1.00", "100" and "T" look like the declared defaults for cutoff, precision
// and hard - confirm against CommandParameter's constructor. The option-parsing constructor
// in this file sets hard = 0 when the option is absent, which would disagree with "T".
18 CommandParameter plabel("label", "String", "", "", "", "", "","",false,false); parameters.push_back(plabel);
19 CommandParameter pcutoff("cutoff", "Number", "", "-1.00", "", "", "","",false,false); parameters.push_back(pcutoff);
20 CommandParameter pprecision("precision", "Number", "", "100", "", "", "","",false,false); parameters.push_back(pprecision);
21 CommandParameter phard("hard", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(phard);
// Directory overrides.
22 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
23 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
// Collect the parameter names in declaration order.
25 vector<string> myArray;
26 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
// Exception path: report the failure tagged with this class and method name.
30 m->errorOut(e, "SensSpecCommand", "setParameters");
34 //**********************************************************************************************************************
// Builds the help text for sens.spec.
// The only text appended on the lines visible here is a one-line placeholder
// ("The sens.spec command....\n"); no option is described.
// NOTE(review): the return statement and try/catch framing are not visible in this listing.
35 string SensSpecCommand::getHelpString(){
37 string helpString = "";
38 helpString += "The sens.spec command....\n";
// Exception path: report the failure tagged with this class and method name.
42 m->errorOut(e, "SensSpecCommand", "getHelpString");
46 //**********************************************************************************************************************
// Maps an output type name to its file-name pattern.
// Only "sensspec" is recognised and yields "[filename],sensspec". Any other type logs an
// error and sets m->control_pressed to true.
// NOTE(review): the declaration of `pattern` and the return statement are not visible in
// this listing.
47 string SensSpecCommand::getOutputPattern(string type) {
51 if (type == "sensspec") { pattern = "[filename],sensspec"; }
52 else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
// Exception path: report the failure tagged with this class and method name.
57 m->errorOut(e, "SensSpecCommand", "getOutputPattern");
61 //**********************************************************************************************************************
// Default constructor: builds an instance that will not run.
// It sets both `abort` and `calledHelp` to true (execute() returns 0 in that state) and
// registers an empty "sensspec" entry in `outputTypes`.
62 SensSpecCommand::SensSpecCommand(){
64 abort = true; calledHelp = true;
66 vector<string> tempOutNames;
67 outputTypes["sensspec"] = tempOutNames;
// Exception path: report the failure tagged with this class and method name.
70 m->errorOut(e, "SensSpecCommand", "SensSpecCommand");
74 //***************************************************************************************************************
// Option-parsing constructor.
// Parses the option string, validates parameter names, resolves the list / phylip / column
// input files (falling back to mothur's "current" files), reads hard, cutoff, precision and
// label, and derives the output file name. Any validation failure sets `abort = true`
// instead of throwing.
// NOTE(review): this listing omits some original lines (closing braces, `else {` framing and
// a few declarations such as `path` and `temp`), so block nesting below cannot be confirmed
// from here.
76 SensSpecCommand::SensSpecCommand(string option) {
79 abort = false; calledHelp = false;
82 //allow user to run help
83 if(option == "help") { help(); abort = true; calledHelp = true; }
84 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
// Names of the accepted parameters, used to validate what the user typed.
89 vector<string> myArray = setParameters();
91 OptionParser parser(option);
// Local map of option name -> value as parsed from the option string.
92 map<string,string> parameters = parser.getParameters();
94 ValidParameters validParameter;
95 map<string,string>::iterator it;
97 //check to make sure all parameters are valid for command
98 for (it = parameters.begin(); it != parameters.end(); it++) {
99 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
102 //initialize outputTypes
103 vector<string> tempOutNames;
104 outputTypes["sensspec"] = tempOutNames;
106 //if the user changes the input directory command factory will send this info to us in the output parameter
107 string inputDir = validParameter.validFile(parameters, "inputdir", false);
108 if (inputDir == "not found"){ inputDir = ""; }
// For each input-file option given without a path, prefix it with inputDir.
111 it = parameters.find("list");
112 //user has given a list file
113 if(it != parameters.end()){
114 path = m->hasPath(it->second);
115 //if the user has not given a path then, add inputdir. else leave path alone.
116 if (path == "") { parameters["list"] = inputDir + it->second; }
119 it = parameters.find("phylip");
120 //user has given a phylip file
121 if(it != parameters.end()){
122 path = m->hasPath(it->second);
123 //if the user has not given a path then, add inputdir. else leave path alone.
124 if (path == "") { parameters["phylip"] = inputDir + it->second; }
127 it = parameters.find("column");
128 //user has given a column file
129 if(it != parameters.end()){
130 path = m->hasPath(it->second);
131 //if the user has not given a path then, add inputdir. else leave path alone.
132 if (path == "") { parameters["column"] = inputDir + it->second; }
135 //check for required parameters
// list is required: fall back to the current list file, otherwise abort.
136 listFile = validParameter.validFile(parameters, "list", true);
137 if (listFile == "not found") {
138 listFile = m->getListFile();
139 if (listFile != "") { m->mothurOut("Using " + listFile + " as input file for the list parameter."); m->mothurOutEndLine(); }
140 else { m->mothurOut("You have no current list file and the list parameter is required."); m->mothurOutEndLine(); abort = true; }
142 else if (listFile == "not open") { abort = true; }
143 else { m->setListFile(listFile); }
// A supplied phylip or column file becomes distFile and fixes `format`.
145 phylipfile = validParameter.validFile(parameters, "phylip", true);
146 if (phylipfile == "not found") { phylipfile = ""; }
147 else if (phylipfile == "not open") { abort = true; }
148 else { distFile = phylipfile; format = "phylip"; m->setPhylipFile(phylipfile); }
150 columnfile = validParameter.validFile(parameters, "column", true);
151 if (columnfile == "not found") { columnfile = ""; }
152 else if (columnfile == "not open") { abort = true; }
153 else { distFile = columnfile; format = "column"; m->setColumnFile(columnfile); }
155 if ((phylipfile == "") && (columnfile == "")) { //is there a current file available for either of these?
156 //give priority to column, then phylip
// NOTE(review): the `else` framing between the next statements is not visible in this
// listing; presumably the phylip lookup and the "No valid current files" message run only
// when the preceding lookup came back empty - confirm against the full file.
157 columnfile = m->getColumnFile();
158 if (columnfile != "") { distFile = columnfile; format = "column"; m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
160 phylipfile = m->getPhylipFile();
161 if (phylipfile != "") { distFile = phylipfile; format = "phylip"; m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
163 m->mothurOut("No valid current files. You must provide a phylip or column file."); m->mothurOutEndLine();
// Supplying both distance formats is rejected.
167 }else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a sens.spec command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
170 //if the user changes the output directory command factory will send this info to us in the output parameter
171 outputDir = validParameter.validFile(parameters, "outputdir", false);
172 if (outputDir == "not found"){
174 outputDir += m->hasPath(listFile); //if user entered a file with a path then preserve it
177 //check for optional parameter and set defaults
178 // ...at some point should add some additional type checking...
// hard: an absent option yields 0 here. The third branch re-tests m->isTrue(temp), which is
// the exact negation of the second branch's condition.
179 temp = validParameter.validFile(parameters, "hard", false);
180 if (temp == "not found"){ hard = 0; }
181 else if(!m->isTrue(temp)) { hard = 0; }
182 else if(m->isTrue(temp)) { hard = 1; }
// cutoff: "-1.00" is used when absent; processPhylip()/processColumn() test for -1.00.
184 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "-1.00"; }
185 m->mothurConvert(temp, cutoff);
186 // cout << cutoff << endl;
// precision: "100" is used when absent; it divides 0.49 where the cutoff is padded.
188 temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
189 m->mothurConvert(temp, precision);
190 // cout << precision << endl;
// label: "all" selects every label; anything else is split at dashes into `labels`.
192 string label = validParameter.validFile(parameters, "label", false);
193 if (label == "not found") { label = ""; }
195 if(label != "all") { m->splitAtDash(label, labels); allLines = 0; }
196 else { allLines = 1; }
// Output name: outputDir + root of the list file's simple name, expanded via the "sensspec" pattern.
199 map<string, string> variables;
200 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(listFile));
201 sensSpecFileName = getOutputFileName("sensspec",variables);
// Exception path: report the failure tagged with this class and method name.
204 catch(exception& e) {
205 m->errorOut(e, "SensSpecCommand", "SensSpecCommand");
209 //***************************************************************************************************************
// Runs the command: pre-filters the list file, dispatches on the distance-file format,
// removes the temporary list file, then reports the elapsed time and the output file.
// Returns 0 when help was requested or the run was interrupted, and 2 when construction
// flagged an abort.
// NOTE(review): this listing omits some original lines; the call that writes the output
// header (setUpOutput) and the final return are not visible here - confirm they exist.
211 int SensSpecCommand::execute(){
213 if (abort == true) { if (calledHelp) { return 0; } return 2; }
// time(NULL) is stored in an int here.
215 int startTime = time(NULL);
217 //create list file with only unique names, saves time and memory by removing redundant names from list file that are not in the distance file.
// preProcessList() returns "" when no filtered copy was produced; otherwise use the copy.
218 string newListFile = preProcessList();
219 if (newListFile != "") { listFile = newListFile; }
222 outputNames.push_back(sensSpecFileName); outputTypes["sensspec"].push_back(sensSpecFileName);
223 if(format == "phylip") { processPhylip(); }
224 else if(format == "column") { processColumn(); }
226 //remove temp file if created
227 if (newListFile != "") { m->mothurRemove(newListFile); }
// An interrupted run discards the partial output.
229 if (m->control_pressed) { m->mothurRemove(sensSpecFileName); return 0; }
// NOTE(review): the message prints the elapsed value with no unit.
231 m->mothurOut("It took " + toString(time(NULL) - startTime) + " to run sens.spec."); m->mothurOutEndLine();
233 m->mothurOutEndLine();
234 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
235 m->mothurOut(sensSpecFileName); m->mothurOutEndLine();
236 m->mothurOutEndLine();
// Exception path: report the failure tagged with this class and method name.
241 catch(exception& e) {
242 m->errorOut(e, "SensSpecCommand", "execute");
246 //***************************************************************************************************************
// Inspects the phylip file: reads the leading count and the first name, then scans the
// stream character by character until EOF.
// NOTE(review): most of this function's body is missing from this listing (embedded
// numbering jumps 259 -> 273), so what the scan decides and what is returned cannot be
// stated from here. process() consults a `square` flag - presumably this test feeds it.
247 bool SensSpecCommand::testFile(){
250 m->openInputFile(phylipfile, fileHandle);
253 string numTest, name;
254 fileHandle >> numTest >> name;
// A non-numeric header terminates the whole process via exit(1).
256 if (!m->isContainingOnlyDigits(numTest)) { m->mothurOut("[ERROR]: expected a number and got " + numTest + ", quitting."); m->mothurOutEndLine(); exit(1); }
// NOTE(review): the declaration of `d` is not visible; it needs to be an int for the
// comparison against EOF to be reliable - confirm.
259 while((d=fileHandle.get()) != EOF){
// Exception path: report the failure tagged with this class and method name.
273 catch(exception& e) {
274 m->errorOut(e, "SensSpecCommand", "testFile");
279 //***************************************************************************************************************
// Drives the phylip-format run: walks the list file label by label and, for each label to
// process, builds the name -> OTU map (fillSeqMap) and scores it against the distance
// matrix (process).
// NOTE(review): this listing omits some original lines (closing braces, `else` framing,
// `delete list` calls, the `getCutoff` declaration and the return), so ownership handling
// and exact nesting cannot be confirmed from here.
281 int SensSpecCommand::processPhylip(){
283 //probably need some checking to confirm that the names in the distance matrix are the same as those in the list file
285 string origCutoff = "";
// cutoff == -1.00 means "derive the cutoff from each label"; otherwise remember the
// requested value as text and pad the numeric cutoff by 0.49/precision.
// NOTE(review): `hard` is not consulted on these visible lines, whereas process() pads only
// when hard == 0 - confirm this is intended.
287 if(cutoff == -1.00) { getCutoff = 1; }
288 else { origCutoff = toString(cutoff); cutoff += (0.49 / double(precision)); }
290 map<string, int> seqMap;
293 InputData input(listFile, "list");
294 ListVector* list = input.getListVector();
// NOTE(review): `list` is dereferenced here before the NULL test in the loop condition.
295 string lastLabel = list->getLabel();
297 //if the user enters label "0.06" and there is no "0.06" in their file use the next lowest label.
298 set<string> processedLabels;
299 set<string> userLabels = labels;
// Continue while list vectors remain and either all labels are wanted or some are unmatched.
301 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
// On interruption remove every registered output file and stop.
303 if(m->control_pressed){
304 for (int i = 0; i < outputNames.size(); i++){ m->mothurRemove(outputNames[i]); } delete list; return 0;
// Direct hit: this label was requested (or every label is wanted).
307 if(allLines == 1 || labels.count(list->getLabel()) == 1){
309 processedLabels.insert(list->getLabel());
310 userLabels.erase(list->getLabel());
313 fillSeqMap(seqMap, list);
314 process(seqMap, list->getLabel(), getCutoff, origCutoff);
// Fallback: presumably anyLabelsToProcess reports that a requested label was passed without
// a match (verify against its definition); if the previous label was not processed, re-read
// and score that one instead.
317 if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
318 string saveLabel = list->getLabel();
// NOTE(review): no `delete list` is visible before this reassignment (embedded line 320 is
// omitted from this listing) - confirm the previous vector is freed.
321 list = input.getListVector(lastLabel);
323 processedLabels.insert(list->getLabel());
324 userLabels.erase(list->getLabel());
327 fillSeqMap(seqMap, list);
328 process(seqMap, list->getLabel(), getCutoff, origCutoff);
330 //restore real lastlabel to save below
331 list->setLabel(saveLabel);
334 lastLabel = list->getLabel();
337 list = input.getListVector();
341 //output error messages about any remaining user labels
342 set<string>::iterator it;
343 bool needToRun = false;
// NOTE(review): the statement that sets needToRun is not visible in this listing; presumably
// it is set in the "I will use" branch - confirm.
344 for (it = userLabels.begin(); it != userLabels.end(); it++) {
345 m->mothurOut("Your file does not include the label " + *it);
346 if (processedLabels.count(lastLabel) != 1) {
347 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
350 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
354 //run last label if you need to
355 if (needToRun == true) {
356 if (list != NULL) { delete list; }
357 list = input.getListVector(lastLabel);
360 fillSeqMap(seqMap, list);
361 process(seqMap, list->getLabel(), getCutoff, origCutoff);
// Exception path: report the failure tagged with this class and method name.
368 catch(exception& e) {
369 m->errorOut(e, "SensSpecCommand", "processPhylip");
374 //***************************************************************************************************************
// Walks every bin of `list` and scans its comma-separated name string one character at a
// time, accumulating characters into `seqName`.
// Returns 0 early when the run is interrupted.
// NOTE(review): the statements that store entries into `seqMap`, reset `seqName`, handle the
// last name of a bin, and the final return are missing from this listing. Per the existing
// comment below, each name is mapped to its OTU number; process() reads the map that way -
// confirm against the full file.
376 int SensSpecCommand::fillSeqMap(map<string, int>& seqMap, ListVector*& list){
379 for(int i=0;i<list->getNumBins();i++){
381 if (m->control_pressed) { return 0; }
383 string seqList = list->get(i);
384 int seqListLength = seqList.length();
387 //parse bin by name, mapping each name to its otu number
388 for(int j=0;j<seqListLength;j++){
// A comma ends the current name; any other character extends it.
390 if(seqList[j] == ','){
395 seqName += seqList[j];
// Exception path: report the failure tagged with this class and method name.
404 catch(exception& e) {
405 m->errorOut(e, "SensSpecCommand", "fillSeqMap");
409 //***************************************************************************************************************
// Fills `seqPairSet` with one key per unordered pair of names that share a bin.
// Each key is "<smaller name>\t<larger name>" (plain string comparison), the same
// normalisation process() applies to column-file records. `numSeqs` accumulates the number
// of names across all bins. Returns 0 early when the run is interrupted.
// NOTE(review): the declaration of `numSeqs` and the final return are not visible in this
// listing; processColumn() assigns this function's result to its own numSeqs, so presumably
// the total is returned - confirm.
// The pair count grows quadratically with bin size, and each pair is stored as a string.
410 int SensSpecCommand::fillSeqPairSet(set<string>& seqPairSet, ListVector*& list){
415 for(int i=0;i<list->getNumBins();i++){
417 if (m->control_pressed) { return 0; }
// Split this bin's comma-separated names.
419 vector<string> seqNameVector;
420 string bin = list->get(i);
421 m->splitAtComma(bin, seqNameVector);
423 numSeqs += seqNameVector.size();
// Visit every pair (k < j) exactly once.
425 for(int j=0;j<seqNameVector.size();j++){
426 string seqPairString = "";
427 for(int k=0;k<j;k++){
428 if(seqNameVector[j] < seqNameVector[k]) { seqPairString = seqNameVector[j] + '\t' + seqNameVector[k]; }
429 else { seqPairString = seqNameVector[k] + '\t' + seqNameVector[j]; }
430 seqPairSet.insert(seqPairString);
// Exception path: report the failure tagged with this class and method name.
437 catch(exception& e) {
438 m->errorOut(e, "SensSpecCommand", "fillSeqPairSet");
442 //***************************************************************************************************************
// Scores one list label against a phylip distance matrix.
// For every pair (i, j < i) read from the matrix, the distance is compared with `cutoff` and
// the two sequences' OTU indices are compared, incrementing exactly one of truePositives,
// falseNegatives, falsePositives or trueNegatives. The totals are then written with
// outputStatistics(). Returns 0 early when the run is interrupted.
// NOTE(review): this listing omits some original lines (stream/local declarations, counter
// resets, `else` framing, and any use of `getCutoff`), so those parts cannot be confirmed
// from here.
443 int SensSpecCommand::process(map<string, int>& seqMap, string label, bool& getCutoff, string& origCutoff){
446 int lNumSeqs = seqMap.size();
450 m->openInputFile(distFile, phylipFile);
451 phylipFile >> pNumSeqs;
// A count mismatch between list and matrix only prints a message; processing continues.
452 if(pNumSeqs != lNumSeqs){ m->mothurOut("numSeq mismatch!\n"); /*m->control_pressed = true;*/ }
// NOTE(review): sized by the list count but indexed up to pNumSeqs in the loop below; if the
// matrix has more rows than the list has names, otuIndices[i] is out of range.
456 vector<int> otuIndices(lNumSeqs, -1);
// For a non-"unique" label the cutoff is parsed from the label text and padded by
// 0.49/precision when hard == 0. Presumably the origCutoff = "unique" assignment belongs to
// the else branch, whose framing is not visible here - confirm.
464 if(label != "unique"){
466 convert(label, cutoff);
467 if(hard == 0){ cutoff += (0.49 / double(precision)); }
470 origCutoff = "unique";
475 m->mothurOut(label); m->mothurOutEndLine();
477 for(int i=0;i<pNumSeqs;i++){
479 if (m->control_pressed) { return 0; }
481 phylipFile >> seqName;
// std::map::operator[] inserts a value-initialised entry (0) for a name missing from the
// map, so an unknown name is treated as OTU 0 and the map grows.
482 otuIndices[i] = seqMap[seqName];
484 for(int j=0;j<i;j++){
485 phylipFile >> distance;
// Within the cutoff: same OTU -> true positive, different OTU -> false negative.
487 if(distance <= cutoff){
488 if(otuIndices[i] == otuIndices[j]) { truePositives++; }
489 else { falseNegatives++; }
// Beyond the cutoff: same OTU -> false positive, different OTU -> true negative.
492 if(otuIndices[i] == otuIndices[j]) { falsePositives++; }
493 else { trueNegatives++; }
497 if (square) { m->getline(phylipFile); } //get rest of line - redundant distances
498 m->gobble(phylipFile);
502 outputStatistics(label, origCutoff);
// Exception path: report the failure tagged with this class and method name.
506 catch(exception& e) {
507 m->errorOut(e, "SensSpecCommand", "process");
511 //***************************************************************************************************************
// Scores one list label against a column-format distance file.
// `seqPairSet` holds the same-OTU pairs built by fillSeqPairSet(). Each record's pair is
// normalised to "<smaller>\t<larger>" and looked up; matched pairs are erased so that
// whatever remains at the end was never listed in the distance file. The totals are then
// written with outputStatistics().
// NOTE(review): this listing omits some original lines (the read-loop header, counter
// updates inside the branches, `else` framing, and the return), so the exact bookkeeping of
// tp/fn/fp/tn cannot be confirmed from here.
512 int SensSpecCommand::process(set<string>& seqPairSet, string label, bool& getCutoff, string& origCutoff, int numSeqs){
// Total number of unordered pairs.
// NOTE(review): computed in int, so the product overflows once numSeqs exceeds roughly
// 46,000 (with a 32-bit int).
514 int numDists = (numSeqs * (numSeqs-1) / 2);
517 m->openInputFile(distFile, columnFile);
518 string seqNameA, seqNameB, seqPairString;
// Start from "every pair is a true negative"; presumably classified pairs are subtracted as
// they are encountered (those statements are not visible here).
523 trueNegatives = numDists;
// For a non-"unique" label the cutoff is parsed from the label text and padded by
// 0.49/precision when hard == 0. Presumably the origCutoff = "unique" assignment belongs to
// the else branch, whose framing is not visible here - confirm.
527 if(label != "unique"){
529 convert(label, cutoff);
530 if(hard == 0){ cutoff += (0.49 / double(precision)); }
533 origCutoff = "unique";
538 m->mothurOut(label); m->mothurOutEndLine();
// One record: two names and their distance.
541 columnFile >> seqNameA >> seqNameB >> distance;
542 if(seqNameA < seqNameB) { seqPairString = seqNameA + '\t' + seqNameB; }
543 else { seqPairString = seqNameB + '\t' + seqNameA; }
545 set<string>::iterator it = seqPairSet.find(seqPairString);
// Within the cutoff and clustered together: the pair is consumed from the set.
547 if(distance <= cutoff){
548 if(it != seqPairSet.end()){
550 seqPairSet.erase(it);
// Beyond the cutoff but clustered together: the pair is consumed from the set as well.
557 else if(it != seqPairSet.end()){
560 seqPairSet.erase(it);
563 m->gobble(columnFile);
// Same-OTU pairs that never appeared in the distance file are counted as false positives.
565 falsePositives += seqPairSet.size();
567 outputStatistics(label, origCutoff);
// Exception path: report the failure tagged with this class and method name.
572 catch(exception& e) {
573 m->errorOut(e, "SensSpecCommand", "process");
577 //***************************************************************************************************************
// Drives the column-format run: walks the list file label by label and, for each label to
// process, builds the same-OTU pair set (fillSeqPairSet) and scores it against the distance
// file (process). The control flow mirrors processPhylip().
// NOTE(review): this listing omits some original lines (closing braces, `else` framing,
// `delete list` calls, the `getCutoff` / `numSeqs` declarations and the return), so
// ownership handling and exact nesting cannot be confirmed from here.
579 int SensSpecCommand::processColumn(){
581 string origCutoff = "";
// cutoff == -1.00 means "derive the cutoff from each label"; otherwise remember the
// requested value as text and pad the numeric cutoff by 0.49/precision.
// NOTE(review): `hard` is not consulted on these visible lines, whereas process() pads only
// when hard == 0 - confirm this is intended.
583 if(cutoff == -1.00) { getCutoff = 1; }
584 else { origCutoff = toString(cutoff); cutoff += (0.49 / double(precision)); }
586 set<string> seqPairSet;
589 InputData input(listFile, "list");
590 ListVector* list = input.getListVector();
// NOTE(review): `list` is dereferenced here before the NULL test in the loop condition.
591 string lastLabel = list->getLabel();
593 //if the user enters label "0.06" and there is no "0.06" in their file use the next lowest label.
594 set<string> processedLabels;
595 set<string> userLabels = labels;
// Continue while list vectors remain and either all labels are wanted or some are unmatched.
598 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
// On interruption remove every registered output file and stop.
600 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete list; return 0; }
// Direct hit: this label was requested (or every label is wanted).
602 if(allLines == 1 || labels.count(list->getLabel()) == 1){
604 processedLabels.insert(list->getLabel());
605 userLabels.erase(list->getLabel());
608 numSeqs = fillSeqPairSet(seqPairSet, list);
609 process(seqPairSet, list->getLabel(), getCutoff, origCutoff, numSeqs);
// Fallback: presumably anyLabelsToProcess reports that a requested label was passed without
// a match (verify against its definition); if the previous label was not processed, re-read
// and score that one instead.
612 if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
613 string saveLabel = list->getLabel();
// NOTE(review): no `delete list` is visible before this reassignment (embedded line 615 is
// omitted from this listing) - confirm the previous vector is freed.
616 list = input.getListVector(lastLabel);
618 processedLabels.insert(list->getLabel());
619 userLabels.erase(list->getLabel());
622 numSeqs = fillSeqPairSet(seqPairSet, list);
623 process(seqPairSet, list->getLabel(), getCutoff, origCutoff, numSeqs);
625 //restore real lastlabel to save below
626 list->setLabel(saveLabel);
629 lastLabel = list->getLabel();
632 list = input.getListVector();
636 //output error messages about any remaining user labels
637 set<string>::iterator it;
638 bool needToRun = false;
// NOTE(review): the statement that sets needToRun is not visible in this listing; presumably
// it is set in the "I will use" branch - confirm.
639 for (it = userLabels.begin(); it != userLabels.end(); it++) {
640 m->mothurOut("Your file does not include the label " + *it);
641 if (processedLabels.count(lastLabel) != 1) {
642 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
645 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
649 //run last label if you need to
650 if (needToRun == true) {
651 if (list != NULL) { delete list; }
652 list = input.getListVector(lastLabel);
655 numSeqs = fillSeqPairSet(seqPairSet, list);
657 process(seqPairSet, list->getLabel(), getCutoff, origCutoff, numSeqs);
// Exception path: report the failure tagged with this class and method name.
662 catch(exception& e) {
663 m->errorOut(e, "SensSpecCommand", "processColumn");
668 //***************************************************************************************************************
// Opens the sens.spec output file and writes the tab-separated header row.
// The column order matches what outputStatistics() later appends: label, cutoff, the four
// raw counts, then the derived ratios.
// NOTE(review): presumably openOutputFile truncates an existing file while
// outputStatistics() appends - confirm against the helper's definition.
670 void SensSpecCommand::setUpOutput(){
672 ofstream sensSpecFile;
673 m->openOutputFile(sensSpecFileName, sensSpecFile);
675 sensSpecFile << "label\tcutoff\ttp\ttn\tfp\tfn\tsensitivity\tspecificity\tppv\tnpv\tfdr\taccuracy\tmcc\tf1score\n";
677 sensSpecFile.close();
// Exception path: report the failure tagged with this class and method name.
679 catch(exception& e) {
680 m->errorOut(e, "SensSpecCommand", "setUpOutput");
685 //***************************************************************************************************************
// Derives the summary statistics from the four counters (truePositives, falsePositives,
// trueNegatives, falseNegatives) and appends one tab-separated row to the output file.
// `label` and `cutoff` are written verbatim as the first two columns. The `cutoff` parameter
// here is a string and is distinct from the numeric cutoff used by process().
// NOTE(review): the declarations of `p` and `n` are not visible in this listing (embedded
// lines 694-695 are omitted). Presumably p = tp + fn (actual positives) and n = fp + tn
// (actual negatives) - confirm against the full file.
687 void SensSpecCommand::outputStatistics(string label, string cutoff){
689 double tp = (double) truePositives;
690 double fp = (double) falsePositives;
691 double tn = (double) trueNegatives;
692 double fn = (double) falseNegatives;
// Predicted positives and predicted negatives.
696 double pPrime = tp + fp;
697 double nPrime = tn + fn;
// The ratios are computed first and patched afterwards: a zero denominator produces a
// floating-point inf/NaN here, which the guards further down overwrite with 0.
699 double sensitivity = tp / p;
700 double specificity = tn / n;
701 double positivePredictiveValue = tp / pPrime;
702 double negativePredictiveValue = tn / nPrime;
703 double falseDiscoveryRate = fp / pPrime;
705 double accuracy = (tp + tn) / (p + n);
// The inline guard on the next line repeats what the p == 0 and n == 0 guards below also do.
706 double matthewsCorrCoef = (tp * tn - fp * fn) / sqrt(p * n * pPrime * nPrime); if(p == 0 || n == 0){ matthewsCorrCoef = 0; }
707 double f1Score = 2.0 * tp / (p + pPrime);
// Zero-denominator guards: report 0 rather than inf/NaN.
710 if(p == 0) { sensitivity = 0; matthewsCorrCoef = 0; }
711 if(n == 0) { specificity = 0; matthewsCorrCoef = 0; }
712 if(p + n == 0) { accuracy = 0; }
713 if(p + pPrime == 0) { f1Score = 0; }
714 if(pPrime == 0) { positivePredictiveValue = 0; falseDiscoveryRate = 0; matthewsCorrCoef = 0; }
715 if(nPrime == 0) { negativePredictiveValue = 0; matthewsCorrCoef = 0; }
// Append (not overwrite) so rows accumulate across labels.
717 ofstream sensSpecFile;
718 m->openOutputFileAppend(sensSpecFileName, sensSpecFile);
720 sensSpecFile << label << '\t' << cutoff << '\t';
// Raw counters are written before setprecision(4) is applied to the stream.
721 sensSpecFile << truePositives << '\t' << trueNegatives << '\t' << falsePositives << '\t' << falseNegatives << '\t';
722 sensSpecFile << setprecision(4);
723 sensSpecFile << sensitivity << '\t' << specificity << '\t' << positivePredictiveValue << '\t' << negativePredictiveValue << '\t';
724 sensSpecFile << falseDiscoveryRate << '\t' << accuracy << '\t' << matthewsCorrCoef << '\t' << f1Score << endl;
726 sensSpecFile.close();
// Exception path: report the failure tagged with this class and method name.
728 catch(exception& e) {
729 m->errorOut(e, "SensSpecCommand", "outputStatistics");
733 //***************************************************************************************************************
// Writes a filtered copy of the list file that keeps only names present in the distance
// file.
// Step 1 collects the distinct names from the distance file (phylip: first token of each
// row; column: both names of each record). Step 2 reads the list file and rebuilds each bin
// from the names found in step 1. Returns the temp file name (listFile + ".temp") when
// something was written, and "" when the run was interrupted or the list already matches.
// NOTE(review): this listing omits some original lines (stream/local declarations, loop
// headers, the `else` for the column branch, the code that prints `newList`, and the final
// return), so those parts cannot be confirmed from here.
735 string SensSpecCommand::preProcessList(){
737 set<string> uniqueNames;
738 //get unique names from distance file
739 if (format == "phylip") {
742 m->openInputFile(distFile, phylipFile);
745 phylipFile >> numTest; m->gobble(phylipFile);
// A non-numeric header terminates the whole process via exit(1).
747 if (!m->isContainingOnlyDigits(numTest)) { m->mothurOut("[ERROR]: expected a number and got " + numTest + ", quitting."); m->mothurOutEndLine(); exit(1); }
749 m->mothurConvert(numTest, pNumSeqs);
// Keep the name at the start of each row and skip the rest of the line.
753 for(int i=0;i<pNumSeqs;i++){
754 if (m->control_pressed) { return ""; }
755 phylipFile >> seqName; m->getline(phylipFile); m->gobble(phylipFile);
756 uniqueNames.insert(seqName);
// Column format: every record contributes both of its names.
761 m->openInputFile(distFile, columnFile);
762 string seqNameA, seqNameB;
766 if (m->control_pressed) { return ""; }
767 columnFile >> seqNameA >> seqNameB >> distance;
768 uniqueNames.insert(seqNameA); uniqueNames.insert(seqNameB);
769 m->gobble(columnFile);
774 //read list file, if numSeqs > unique names then remove redundant names
775 string newListFile = listFile + ".temp";
777 m->openOutputFile(newListFile, out);
779 m->openInputFile(listFile, in);
781 bool wroteSomething = false;
// On interruption close both streams and discard the partial temp file.
785 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(newListFile); return ""; }
787 //read in list vector
790 //listfile is already unique
// Equal counts are taken to mean nothing needs filtering: clean up and signal "no new file".
791 if (list.getNumSeqs() == uniqueNames.size()) { in.close(); out.close(); m->mothurRemove(newListFile); return ""; }
793 //make a new list vector
795 newList.setLabel(list.getLabel());
798 for (int i = 0; i < list.getNumBins(); i++) {
800 //keep only the names that appear in the distance file
801 string binnames = list.get(i);
802 vector<string> bnames;
803 m->splitAtComma(binnames, bnames);
805 string newNames = "";
806 for (int j = 0; j < bnames.size(); j++) {
807 string name = bnames[j];
808 //if that name was seen in the distance file, add it
809 if (uniqueNames.count(name) != 0) { newNames += name + ","; }
812 //if there are names in this bin add to new list
813 if (newNames != "") {
814 newNames = newNames.substr(0, newNames.length()-1); //rip off extra comma
815 newList.push_back(newNames);
819 //print new listvector
820 if (newList.getNumBins() != 0) {
821 wroteSomething = true;
// Hand back the temp file only when it has content; otherwise delete it.
830 if (wroteSomething) { return newListFile; }
831 else { m->mothurRemove(newListFile); }
// Exception path: report the failure tagged with this class and method name.
835 catch(exception& e) {
836 m->errorOut(e, "SensSpecCommand", "preProcessList");
842 //***************************************************************************************************************