From 30910bfac62a1299cd20d99683c4fae68bfc2c1b Mon Sep 17 00:00:00 2001 From: westcott Date: Wed, 30 Nov 2011 13:28:25 +0000 Subject: [PATCH] fixed label option in sens.spec and added smart distancing. --- sensspeccommand.cpp | 501 +++++++++++++++++++++++++++++--------------- sensspeccommand.h | 17 +- sequenceparser.cpp | 3 + 3 files changed, 352 insertions(+), 169 deletions(-) diff --git a/sensspeccommand.cpp b/sensspeccommand.cpp index 726c904..2fb2efb 100644 --- a/sensspeccommand.cpp +++ b/sensspeccommand.cpp @@ -62,7 +62,8 @@ SensSpecCommand::SensSpecCommand(){ SensSpecCommand::SensSpecCommand(string option) { try { - abort = false; calledHelp = false; + abort = false; calledHelp = false; + allLines = 1; //allow user to run help if(option == "help") { help(); abort = true; calledHelp = true; } @@ -189,9 +190,14 @@ SensSpecCommand::SensSpecCommand(string option) { convert(temp, precision); // cout << precision << endl; - lineLabel = validParameter.validFile(parameters, "label", false); if (lineLabel == "not found") { lineLabel = ""; } + string label = validParameter.validFile(parameters, "label", false); + if (label == "not found") { label = ""; } + else { + if(label != "all") { m->splitAtDash(label, labels); allLines = 0; } + else { allLines = 1; } + } - sensSpecFileName = outputDir + m->getRootName(m->getSimpleName(listFile)) + ".sensspec"; + sensSpecFileName = outputDir + m->getRootName(m->getSimpleName(listFile)) + "sensspec"; } } catch(exception& e) { @@ -210,6 +216,8 @@ int SensSpecCommand::execute(){ if(format == "phylip") { processPhylip(); } else if(format == "column") { processColumn(); } + if (m->control_pressed) { m->mothurRemove(sensSpecFileName); return 0; } + m->mothurOutEndLine(); m->mothurOut("Output File Name: "); m->mothurOutEndLine(); m->mothurOut(sensSpecFileName); m->mothurOutEndLine(); @@ -226,216 +234,379 @@ int SensSpecCommand::execute(){ //*************************************************************************************************************** -void SensSpecCommand::processPhylip(){ +int SensSpecCommand::processPhylip(){ try{ //probably need some checking to confirm that the names in the distance matrix are the same as those in the list file - - ifstream inputListFile; - m->openInputFile(listFile, inputListFile); - string origCutoff = ""; bool getCutoff = 0; if(cutoff == -1.00) { getCutoff = 1; } - else { origCutoff = toString(cutoff); cutoff += (0.49 / double(precision)); } + else { origCutoff = toString(cutoff); cutoff += (0.49 / double(precision)); } - string label; - int numOTUs; - map seqMap; string seqList; - while(inputListFile){ - inputListFile >> label >> numOTUs; - for(int i=0;i> seqList; - int seqListLength = seqList.length(); - string seqName = ""; - for(int j=0;jgetLabel(); + + //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label. + set processedLabels; + set userLabels = labels; + + + while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) { + + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete list; return 0; } + + if(allLines == 1 || labels.count(list->getLabel()) == 1){ + + processedLabels.insert(list->getLabel()); + userLabels.erase(list->getLabel()); + + //process + fillSeqMap(seqMap, list); + process(seqMap, list->getLabel(), getCutoff, origCutoff); } - m->gobble(inputListFile); + + if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) { + string saveLabel = list->getLabel(); + + delete list; + list = input.getListVector(lastLabel); + + processedLabels.insert(list->getLabel()); + userLabels.erase(list->getLabel()); + + //process + fillSeqMap(seqMap, list); + process(seqMap, list->getLabel(), getCutoff, origCutoff); + + //restore real lastlabel to save below + list->setLabel(saveLabel); + } + + lastLabel = list->getLabel(); + + delete list; + list = input.getListVector(); + } - int lNumSeqs = seqMap.size(); - int pNumSeqs = 0; - - ifstream phylipFile; - m->openInputFile(distFile, phylipFile); - phylipFile >> pNumSeqs; - if(pNumSeqs != lNumSeqs){ cout << "numSeq mismatch!" << endl; } - - string seqName; - double distance; - vector otuIndices(lNumSeqs, -1); + + //output error messages about any remaining user labels + set::iterator it; + bool needToRun = false; + for (it = userLabels.begin(); it != userLabels.end(); it++) { + m->mothurOut("Your file does not include the label " + *it); + if (processedLabels.count(lastLabel) != 1) { + m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine(); + needToRun = true; + }else { + m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine(); + } + } + + //run last label if you need to + if (needToRun == true) { + if (list != NULL) { delete list; } + list = input.getListVector(lastLabel); + + //process + fillSeqMap(seqMap, list); + process(seqMap, list->getLabel(), getCutoff, origCutoff); + + delete list; + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SensSpecCommand", "processPhylip"); + exit(1); + } +} +//*************************************************************************************************************** +int SensSpecCommand::fillSeqMap(map& seqMap, ListVector*& list){ + try { + //for each otu + for(int i=0;igetNumBins();i++){ + + if (m->control_pressed) { return 0; } + + string seqList = list->get(i); + int seqListLength = seqList.length(); + string seqName = ""; + + //parse bin by name, mapping each name to its otu number + for(int j=0;jerrorOut(e, "SensSpecCommand", "fillSeqMap"); + exit(1); + } +} +//*************************************************************************************************************** +int SensSpecCommand::fillSeqPairSet(set& seqPairSet, ListVector*& list){ + try { + int numSeqs = 0; + + //for each otu + for(int i=0;igetNumBins();i++){ + + if (m->control_pressed) { return 0; } + + vector seqNameVector; + string bin = list->get(i); + m->splitAtComma(bin, seqNameVector); + + numSeqs += seqNameVector.size(); - for(int i=0;i> seqName; - otuIndices[i] = seqMap[seqName]; + for(int j=0;jerrorOut(e, "SensSpecCommand", "fillSeqMap"); + exit(1); + } +} +//*************************************************************************************************************** +int SensSpecCommand::process(map& seqMap, string label, bool& getCutoff, string& origCutoff){ + try { + + int lNumSeqs = seqMap.size(); + int pNumSeqs = 0; + + ifstream phylipFile; + m->openInputFile(distFile, phylipFile); + phylipFile >> pNumSeqs; + if(pNumSeqs != lNumSeqs){ m->mothurOut("numSeq mismatch!\n"); m->control_pressed = true; } + + string seqName; + double distance; + vector otuIndices(lNumSeqs, -1); + + truePositives = 0; + falsePositives = 0; + trueNegatives = 0; + falseNegatives = 0; + + if(getCutoff == 1){ + if(label != "unique"){ + origCutoff = label; + convert(label, cutoff); + if(hard == 0){ cutoff += (0.49 / double(precision)); } + } + else{ + origCutoff = "unique"; + cutoff = 0.0000; + } + } + + m->mothurOut(label); m->mothurOutEndLine(); + + for(int i=0;icontrol_pressed) { return 0; } + + phylipFile >> seqName; + otuIndices[i] = seqMap[seqName]; + + for(int j=0;j> distance; - for(int j=0;j> distance; - - if(distance <= cutoff){ - if(otuIndices[i] == otuIndices[j]) { truePositives++; } - else { falseNegatives++; } - } - else{ - if(otuIndices[i] == otuIndices[j]) { falsePositives++; } - else { trueNegatives++; } - } + if(distance <= cutoff){ + if(otuIndices[i] == otuIndices[j]) { truePositives++; } + else { falseNegatives++; } + } + else{ + if(otuIndices[i] == otuIndices[j]) { falsePositives++; } + else { trueNegatives++; } + } + } + } + phylipFile.close(); + + outputStatistics(label, origCutoff); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SensSpecCommand", "process"); + exit(1); + } +} +//*************************************************************************************************************** +int SensSpecCommand::process(set& seqPairSet, string label, bool& getCutoff, string& origCutoff, int numSeqs){ + try { + int numDists = (numSeqs * (numSeqs-1) / 2); + + ifstream columnFile; + m->openInputFile(distFile, columnFile); + string seqNameA, seqNameB, seqPairString; + double distance; + + truePositives = 0; + falsePositives = 0; + trueNegatives = numDists; + falseNegatives = 0; + + if(getCutoff == 1){ + if(label != "unique"){ + origCutoff = label; + convert(label, cutoff); + if(hard == 0){ cutoff += (0.49 / double(precision)); } + } + else{ + origCutoff = "unique"; + cutoff = 0.0000; + } + } + + m->mothurOut(label); m->mothurOutEndLine(); + + while(columnFile){ + columnFile >> seqNameA >> seqNameB >> distance; + if(seqNameA < seqNameB) { seqPairString = seqNameA + '\t' + seqNameB; } + else { seqPairString = seqNameB + '\t' + seqNameA; } + + set::iterator it = seqPairSet.find(seqPairString); + + if(distance <= cutoff){ + if(it != seqPairSet.end()){ + truePositives++; + seqPairSet.erase(it); + } + else{ + falseNegatives++; } + trueNegatives--; + } + else if(it != seqPairSet.end()){ + falsePositives++; + trueNegatives--; + seqPairSet.erase(it); } - phylipFile.close(); - outputStatistics(label, origCutoff); + m->gobble(columnFile); } - inputListFile.close(); - + falsePositives += seqPairSet.size(); + + outputStatistics(label, origCutoff); + + + return 0; } catch(exception& e) { - m->errorOut(e, "SensSpecCommand", "processPhylip"); + m->errorOut(e, "SensSpecCommand", "process"); exit(1); } } - //*************************************************************************************************************** -void SensSpecCommand::processColumn(){ +int SensSpecCommand::processColumn(){ try{ - ifstream inputListFile; - m->openInputFile(listFile, inputListFile); - string origCutoff = ""; bool getCutoff = 0; if(cutoff == -1.00) { getCutoff = 1; } else { origCutoff = toString(cutoff); cutoff += (0.49 / double(precision)); } set seqPairSet; + int numSeqs = 0; + + InputData input(listFile, "list"); + ListVector* list = input.getListVector(); + string lastLabel = list->getLabel(); - string label, seqList; - int numOTUs; - int numSeqs; + //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label. + set processedLabels; + set userLabels = labels; - while(inputListFile){ - numSeqs = 0; + + while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) { + + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete list; return 0; } - inputListFile >> label >> numOTUs; - for(int i=0;igetLabel()) == 1){ - vector seqNameVector; + processedLabels.insert(list->getLabel()); + userLabels.erase(list->getLabel()); - inputListFile >> seqList; - int seqListLength = seqList.length(); - string seqName = ""; - for(int j=0;jgobble(inputListFile); - - int numDists = (numSeqs * (numSeqs-1) / 2); - - ifstream columnFile; - m->openInputFile(distFile, columnFile); - string seqNameA, seqNameB, seqPairString; - double distance; - - truePositives = 0; - falsePositives = 0; - trueNegatives = numDists; - falseNegatives = 0; - - if(getCutoff == 1){ - if(label != "unique"){ - origCutoff = label; - convert(label, cutoff); - if(hard == 0){ cutoff += (0.49 / double(precision)); } - } - else{ - origCutoff = "unique"; - cutoff = 0.0000; - } + //process + numSeqs = fillSeqPairSet(seqPairSet, list); + process(seqPairSet, list->getLabel(), getCutoff, origCutoff, numSeqs); } - cout << label << endl; + if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) { + string saveLabel = list->getLabel(); + + delete list; + list = input.getListVector(lastLabel); + + processedLabels.insert(list->getLabel()); + userLabels.erase(list->getLabel()); + + //process + numSeqs = fillSeqPairSet(seqPairSet, list); + process(seqPairSet, list->getLabel(), getCutoff, origCutoff, numSeqs); + + //restore real lastlabel to save below + list->setLabel(saveLabel); + } - while(columnFile){ - columnFile >> seqNameA >> seqNameB >> distance; - if(seqNameA < seqNameB) { seqPairString = seqNameA + '\t' + seqNameB; } - else { seqPairString = seqNameB + '\t' + seqNameA; } - - set::iterator it = seqPairSet.find(seqPairString); + lastLabel = list->getLabel(); - if(distance <= cutoff){ - if(it != seqPairSet.end()){ - truePositives++; - seqPairSet.erase(it); - } - else{ - falseNegatives++; - } - trueNegatives--; - } - else if(it != seqPairSet.end()){ - falsePositives++; - trueNegatives--; - seqPairSet.erase(it); - } - - m->gobble(columnFile); + delete list; + list = input.getListVector(); + } + + + //output error messages about any remaining user labels + set::iterator it; + bool needToRun = false; + for (it = userLabels.begin(); it != userLabels.end(); it++) { + m->mothurOut("Your file does not include the label " + *it); + if (processedLabels.count(lastLabel) != 1) { + m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine(); + needToRun = true; + }else { + m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine(); } - falsePositives += seqPairSet.size(); + } + + //run last label if you need to + if (needToRun == true) { + if (list != NULL) { delete list; } + list = input.getListVector(lastLabel); - outputStatistics(label, origCutoff); + //process + numSeqs = fillSeqPairSet(seqPairSet, list); + delete list; + process(seqPairSet, list->getLabel(), getCutoff, origCutoff, numSeqs); } + + return 0; } catch(exception& e) { m->errorOut(e, "SensSpecCommand", "processColumn"); diff --git a/sensspeccommand.h b/sensspeccommand.h index 1631e74..02e84c0 100644 --- a/sensspeccommand.h +++ b/sensspeccommand.h @@ -13,6 +13,8 @@ #include "mothur.h" #include "command.hpp" +#include "listvector.hpp" +#include "inputdata.h" class SensSpecCommand : public Command { @@ -32,8 +34,8 @@ public: void help() { m->mothurOut(getHelpString()); } private: - void processPhylip(); - void processColumn(); + int processPhylip(); + int processColumn(); void setUpOutput(); void outputStatistics(string, string); @@ -41,13 +43,20 @@ private: string outputDir; string format; vector outputNames; + set labels; //holds labels to be used long int truePositives, falsePositives, trueNegatives, falseNegatives; - bool abort; + bool abort, allLines; bool hard; - string lineLabel; + //string lineLabel; double cutoff; int precision; + + int fillSeqMap(map&, ListVector*&); + int fillSeqPairSet(set&, ListVector*&); + int process(map&, string, bool&, string&); + int process(set&, string, bool&, string&, int); + }; #endif diff --git a/sequenceparser.cpp b/sequenceparser.cpp index 105e0a9..6c98c04 100644 --- a/sequenceparser.cpp +++ b/sequenceparser.cpp @@ -64,6 +64,7 @@ SequenceParser::SequenceParser(string groupFile, string fastaFile, string nameFi string first, second; int countName = 0; set thisnames1; + while(!inName.eof()) { if (m->control_pressed) { break; } @@ -135,12 +136,14 @@ SequenceParser::SequenceParser(string groupFile, string fastaFile, string nameFi if (countName != (groupMap->getNumSeqs())) { vector groupseqsnames = groupMap->getNamesSeqs(); + for (int i = 0; i < groupseqsnames.size(); i++) { set::iterator itnamesfile = thisnames1.find(groupseqsnames[i]); if (itnamesfile == thisnames1.end()){ cout << "missing name " + groupseqsnames[i] << '\t' << allSeqsMap[groupseqsnames[i]] << endl; } } + m->mothurOutEndLine(); m->mothurOut("[ERROR]: Your name file contains " + toString(countName) + " valid sequences, and your groupfile contains " + toString(groupMap->getNumSeqs()) + ", please correct."); m->mothurOutEndLine(); -- 2.39.2