X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=shhhercommand.cpp;h=74669ed0d9dba0425c81e74852482a62e4f29b46;hb=643cdf74ccd4fc2d565168716fb5645917dee834;hp=63df681923c0a91e87c3507f861fcd519253f332;hpb=259b6adf51ef0639cafd88cf862e4ffd5e0c7576;p=mothur.git diff --git a/shhhercommand.cpp b/shhhercommand.cpp index 63df681..74669ed 100644 --- a/shhhercommand.cpp +++ b/shhhercommand.cpp @@ -26,64 +26,56 @@ #define MIN_WEIGHT 0.1 #define MIN_TAU 0.0001 #define MIN_ITER 10 - //********************************************************************************************************************** - -vector ShhherCommand::getValidParameters(){ +vector ShhherCommand::setParameters(){ try { - string Array[] = { - "file", "flow", "lookup", "cutoff", "sigma", "outputdir","inputdir", "processors" - }; + CommandParameter pflow("flow", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pflow); + CommandParameter pfile("file", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pfile); + CommandParameter plookup("lookup", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(plookup); + CommandParameter pcutoff("cutoff", "Number", "", "0.01", "", "", "",false,false); parameters.push_back(pcutoff); + CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); + CommandParameter pmaxiter("maxiter", "Number", "", "1000", "", "", "",false,false); parameters.push_back(pmaxiter); + CommandParameter psigma("sigma", "Number", "", "60", "", "", "",false,false); parameters.push_back(psigma); + CommandParameter pmindelta("mindelta", "Number", "", "0.000001", "", "", "",false,false); parameters.push_back(pmindelta); + CommandParameter porder("order", "String", "", "", "", "", "",false,false); parameters.push_back(porder); + CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); + CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); - vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); + vector myArray; + for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } return myArray; } catch(exception& e) { - m->errorOut(e, "ShhherCommand", "getValidParameters"); + m->errorOut(e, "ShhherCommand", "setParameters"); exit(1); } } - //********************************************************************************************************************** - -ShhherCommand::ShhherCommand(){ +string ShhherCommand::getHelpString(){ try { - abort = true; - - //initialize outputTypes - vector tempOutNames; - outputTypes["pn.dist"] = tempOutNames; - + string helpString = ""; + helpString += "The shhh.seqs command reads a file containing flowgrams and creates a file of corrected sequences.\n"; + return helpString; } catch(exception& e) { - m->errorOut(e, "ShhherCommand", "ShhherCommand"); + m->errorOut(e, "ShhherCommand", "getHelpString"); exit(1); } } - //********************************************************************************************************************** -vector ShhherCommand::getRequiredParameters(){ +ShhherCommand::ShhherCommand(){ try { - string Array[] = {"flow"}; - vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); - return myArray; - } - catch(exception& e) { - m->errorOut(e, "ShhherCommand", "getRequiredParameters"); - exit(1); - } -} - -//********************************************************************************************************************** + abort = true; calledHelp = true; + setParameters(); + + //initialize outputTypes + vector tempOutNames; + outputTypes["pn.dist"] = tempOutNames; -vector ShhherCommand::getRequiredFiles(){ - try { - vector myArray; - return myArray; } catch(exception& e) { - m->errorOut(e, "ShhherCommand", "getRequiredFiles"); + m->errorOut(e, "ShhherCommand", "ShhherCommand"); exit(1); } } @@ -101,20 +93,14 @@ ShhherCommand::ShhherCommand(string option) { #endif - abort = false; + abort = false; calledHelp = false; //allow user to run help - if(option == "help") { help(); abort = true; } + if(option == "help") { help(); abort = true; calledHelp = true; } else { - - //valid paramters for this command - string AlignArray[] = { - "file", "flow", "lookup", "cutoff", "sigma", "outputdir","inputdir", "processors" - }; - - vector myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string))); + vector myArray = setParameters(); OptionParser parser(option); map parameters = parser.getParameters(); @@ -171,7 +157,15 @@ ShhherCommand::ShhherCommand(string option) { m->mothurOutEndLine(); abort = true; } - else if (flowFileName == "not open" || flowFilesFileName == "not open") { abort = true; } + else if (flowFileName == "not open" || flowFilesFileName == "not open") { abort = true; } + + if(flowFileName != "not found"){ compositeFASTAFileName = ""; } + else{ + compositeFASTAFileName = flowFilesFileName.substr(0, flowFilesFileName.length()-10) + "pn.fasta"; + ofstream temp; + m->openOutputFile(compositeFASTAFileName, temp); + temp.close(); + } //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ @@ -188,8 +182,9 @@ ShhherCommand::ShhherCommand(string option) { else if(temp == "not open") { abort = true; } else { lookupFileName = temp; } - temp = validParameter.validFile(parameters, "processors", false);if (temp == "not found"){ temp = "1"; } - convert(temp, processors); + temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } + m->setProcessors(temp); + convert(temp, processors); temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found"){ temp = "0.01"; } convert(temp, cutoff); @@ -203,7 +198,12 @@ ShhherCommand::ShhherCommand(string option) { temp = validParameter.validFile(parameters, "sigma", false);if (temp == "not found") { temp = "60"; } convert(temp, sigma); - globaldata = GlobalData::getInstance(); + flowOrder = validParameter.validFile(parameters, "order", false); + if (flowOrder == "not found"){ flowOrder = "TACG"; } + else if(flowOrder.length() != 4){ + m->mothurOut("The value of the order option must be four bases long\n"); + } + } #ifdef USE_MPI @@ -216,38 +216,15 @@ ShhherCommand::ShhherCommand(string option) { exit(1); } } - -//********************************************************************************************************************** - -ShhherCommand::~ShhherCommand(){} - -//********************************************************************************************************************** - -void ShhherCommand::help(){ - try { - m->mothurOut("The shhher command reads a file containing flowgrams and creates a file of corrected sequences.\n"); - } - catch(exception& e) { - m->errorOut(e, "ShhherCommand", "help"); - exit(1); - } -} - //********************************************************************************************************************** #ifdef USE_MPI int ShhherCommand::execute(){ try { + if (abort == true) { if (calledHelp) { return 0; } return 2; } + int tag = 1976; MPI_Status status; - double begClock = clock(); - unsigned long int begTime = time(NULL); - - cout.setf(ios::fixed, ios::floatfield); - cout.setf(ios::showpoint); - cout << setprecision(2); - - if(pid == 0){ for(int i=1;imothurOut("\nGetting preliminary data...\n"); getSingleLookUp(); getJointLookUp(); @@ -283,15 +260,19 @@ int ShhherCommand::execute(){ } for(int i=0;i>>>>\tProcessing " << flowFileName << " (file " << i+1 << " of " << numFiles << ")\t<<<<<" << endl; - cout << "Reading flowgrams..." << endl; + + m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n"); + m->mothurOut("Reading flowgrams...\n"); getFlowData(); - cout << "Identifying unique flowgrams..." << endl; + + m->mothurOut("Identifying unique flowgrams...\n"); getUniques(); - cout << "Calculating distances between flowgrams..." << endl; + m->mothurOut("Calculating distances between flowgrams...\n"); char fileName[1024]; strcpy(fileName, flowFileName.c_str()); @@ -322,11 +303,9 @@ int ShhherCommand::execute(){ string namesFileName = createNamesFile(); - cout << "\nClustering flowgrams..." << endl; + m->mothurOut("\nClustering flowgrams...\n"); string listFileName = cluster(distFileName, namesFileName); - // string listFileName = "PriestPot_C7.pn.list"; - // string listFileName = "test.mock_rep3.v69.pn.list"; - + getOTUData(listFileName); initPyroCluster(); @@ -343,9 +322,8 @@ int ShhherCommand::execute(){ int numOTUsOnCPU = numOTUs / ncpus; int numSeqsOnCPU = numSeqs / ncpus; - - cout << "\nDenoising flowgrams..." << endl; - cout << "iter\tmaxDelta\tnLL\t\tcycletime" << endl; + m->mothurOut("\nDenoising flowgrams...\n"); + m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n"); while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){ @@ -365,7 +343,7 @@ int ShhherCommand::execute(){ MPI_Send(&nSeqsPerOTU[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD); MPI_Send(&cumNumSeqs[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD); } - + calcCentroidsDriver(0, numOTUsOnCPU); for(int i=1;imothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n'); if((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){ int live = 1; @@ -463,7 +441,7 @@ int ShhherCommand::execute(){ } - cout << "\nFinalizing..." << endl; + m->mothurOut("\nFinalizing...\n"); fill(); setOTUs(); vector otuCounts(numOTUs, 0); @@ -478,14 +456,12 @@ int ShhherCommand::execute(){ remove(distFileName.c_str()); remove(namesFileName.c_str()); remove(listFileName.c_str()); - - cout << "Total time to process " << flowFileName << ":\t" << time(NULL) - begTime << '\t' << setprecision(6) << (clock() - begClock)/(double)CLOCKS_PER_SEC << endl; + + m->mothurOut("Total time to process " + toString(flowFileName) + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n'); } - } else{ int abort = 1; - bool live = 1; MPI_Recv(&abort, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status); if(abort){ return 0; } @@ -495,6 +471,8 @@ int ShhherCommand::execute(){ for(int i=0;imothurOut(toString(i) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n'); } } - cout << stopSeq << "\t" << (time(NULL) - begTime) << "\t" << (clock()-begClock)/CLOCKS_PER_SEC << endl; + + m->mothurOut(toString(stopSeq) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n'); string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.dist"; if(pid != 0){ fDistFileName += ".temp." + toString(pid); } @@ -651,9 +629,6 @@ int ShhherCommand::execute(){ try { if (abort == true) { return 0; } - cout.setf(ios::fixed, ios::floatfield); - cout.setf(ios::showpoint); - getSingleLookUp(); getJointLookUp(); @@ -678,18 +653,19 @@ int ShhherCommand::execute(){ for(int i=0;i>>>>\tProcessing " << flowFileName << " (file " << i+1 << " of " << numFiles << ")\t<<<<<" << endl; - cout << "Reading flowgrams..." << endl; + m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n"); + m->mothurOut("Reading flowgrams...\n"); getFlowData(); - cout << "Identifying unique flowgrams..." << endl; + + m->mothurOut("Identifying unique flowgrams...\n"); getUniques(); - cout << "Calculating distances between flowgrams..." << endl; + m->mothurOut("Calculating distances between flowgrams...\n"); string distFileName = createDistFile(processors); string namesFileName = createNamesFile(); - - cout << "\nClustering flowgrams..." << endl; + + m->mothurOut("\nClustering flowgrams...\n"); string listFileName = cluster(distFileName, namesFileName); getOTUData(listFileName); @@ -701,8 +677,8 @@ int ShhherCommand::execute(){ double begClock = clock(); unsigned long int begTime = time(NULL); - cout << "\nDenoising flowgrams..." << endl; - cout << "iter\tmaxDelta\tnLL\t\tcycletime" << endl; + m->mothurOut("\nDenoising flowgrams...\n"); + m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n"); while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){ @@ -720,10 +696,11 @@ int ShhherCommand::execute(){ iter++; - cout << iter << '\t' << maxDelta << '\t' << setprecision(2) << nLL << '\t' << time(NULL) - cycTime << '\t' << setprecision(6) << (clock() - cycClock)/(double)CLOCKS_PER_SEC << endl; + m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n'); + } - cout << "\nFinalizing..." << endl; + m->mothurOut("\nFinalizing...\n"); fill(); setOTUs(); @@ -741,7 +718,7 @@ int ShhherCommand::execute(){ remove(namesFileName.c_str()); remove(listFileName.c_str()); - cout << "Total time to process " << flowFileName << ":\t" << time(NULL) - begTime << '\t' << setprecision(6) << (clock() - begClock)/(double)CLOCKS_PER_SEC << endl; + m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n'); } return 0; } @@ -759,6 +736,11 @@ void ShhherCommand::getFlowData(){ m->openInputFile(flowFileName, flowFile); string seqName; + seqNameVector.clear(); + lengths.clear(); + flowDataIntI.clear(); + nameMap.clear(); + int currentNumFlowCells; @@ -838,7 +820,7 @@ void ShhherCommand::getJointLookUp(){ for(int i=0;i current(numFlowCells); - for(int j=0;j uniqueLengths[j]) { uniqueLengths[j] = lengths[i]; } break; } index++; @@ -932,7 +923,7 @@ void ShhherCommand::getUniques(){ uniqueFlowDataIntI.resize(numFlowCells * numUniques); uniqueLengths.resize(numUniques); - flowDataPrI.assign(numSeqs * numFlowCells, 0); + flowDataPrI.resize(numSeqs * numFlowCells, 0); for(int i=0;imothurOutEndLine(); - cout << "Total time: " << (time(NULL) - begTime) << "\t" << (clock() - begClock)/CLOCKS_PER_SEC << endl;; + m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n'); + return fDistFileName; } @@ -1131,14 +1123,6 @@ string ShhherCommand::createNamesFile(){ string ShhherCommand::cluster(string distFileName, string namesFileName){ try { - SparseMatrix* matrix; - ListVector* list; - RAbundVector* rabund; - - globaldata->setNameFile(namesFileName); - globaldata->setColumnFile(distFileName); - globaldata->setFormat("column"); - ReadMatrix* read = new ReadColumnMatrix(distFileName); read->setCutoff(cutoff); @@ -1146,13 +1130,13 @@ string ShhherCommand::cluster(string distFileName, string namesFileName){ clusterNameMap->readMap(); read->read(clusterNameMap); - list = read->getListVector(); - matrix = read->getMatrix(); + ListVector* list = read->getListVector(); + SparseMatrix* matrix = read->getMatrix(); delete read; delete clusterNameMap; - rabund = new RAbundVector(list->getRAbundVector()); + RAbundVector* rabund = new RAbundVector(list->getRAbundVector()); Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); string tag = cluster->getTag(); @@ -1194,7 +1178,11 @@ void ShhherCommand::getOTUData(string listFileName){ otuData.assign(numSeqs, 0); cumNumSeqs.assign(numOTUs, 0); nSeqsPerOTU.assign(numOTUs, 0); - aaP.resize(numOTUs); + aaP.clear();aaP.resize(numOTUs); + + seqNumber.clear(); + aaI.clear(); + seqIndex.clear(); string singleOTU = ""; @@ -1246,6 +1234,8 @@ void ShhherCommand::getOTUData(string listFileName){ for(int j=nSeqsPerOTU[i];jerrorOut(e, "ShhherCommand", "getOTUData"); @@ -1373,6 +1364,7 @@ void ShhherCommand::calcCentroidsDriver(int start, int finish){ try{ + for(int i=start;i 0 && count > MIN_COUNT){ vector adF(nSeqsPerOTU[i]); vector anL(nSeqsPerOTU[i]); @@ -1664,9 +1656,9 @@ void ShhherCommand::calcNewDistancesChildMPI(int startSeq, int stopSeq, vector newTau(numOTUs,0); vector norms(numSeqs, 0); - otuIndex.resize(0); - seqIndex.resize(0); - singleTau.resize(0); + otuIndex.clear(); + seqIndex.clear(); + singleTau.clear(); @@ -1989,8 +1981,6 @@ void ShhherCommand::writeQualities(vector otuCounts){ void ShhherCommand::writeSequences(vector otuCounts){ try { - string bases = "TACG"; - string fastaFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.fasta"; ofstream fastaFile; m->openOutputFile(fastaFileName, fastaFile); @@ -2003,17 +1993,24 @@ void ShhherCommand::writeSequences(vector otuCounts){ if(otuCounts[i] > 0){ fastaFile << '>' << seqNameVector[aaI[i][0]] << endl; - for(int j=8;jappendFiles(fastaFileName, compositeFASTAFileName); + } } catch(exception& e) { m->errorOut(e, "ShhherCommand", "writeSequences"); @@ -2076,7 +2073,7 @@ void ShhherCommand::writeClusters(vector otuCounts){ ofstream otuCountsFile; m->openOutputFile(otuCountsFileName, otuCountsFile); - string bases = "TACG"; + string bases = flowOrder; for(int i=0;i otuCounts){ int sequence = aaI[i][j]; otuCountsFile << seqNameVector[sequence] << '\t'; - for(int k=8;k