5 * Created by Pat Schloss on 12/27/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "shhhercommand.h"
12 //**********************************************************************************************************************
13 vector<string> ShhherCommand::setParameters(){
15 CommandParameter pflow("flow", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pflow);
16 CommandParameter pfile("file", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pfile);
17 CommandParameter plookup("lookup", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(plookup);
18 CommandParameter pcutoff("cutoff", "Number", "", "0.01", "", "", "",false,false); parameters.push_back(pcutoff);
19 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
20 CommandParameter pmaxiter("maxiter", "Number", "", "1000", "", "", "",false,false); parameters.push_back(pmaxiter);
21 CommandParameter plarge("large", "Number", "", "-1", "", "", "",false,false); parameters.push_back(plarge);
22 CommandParameter psigma("sigma", "Number", "", "60", "", "", "",false,false); parameters.push_back(psigma);
23 CommandParameter pmindelta("mindelta", "Number", "", "0.000001", "", "", "",false,false); parameters.push_back(pmindelta);
24 CommandParameter porder("order", "String", "", "", "", "", "",false,false); parameters.push_back(porder);
25 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
26 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
28 vector<string> myArray;
29 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
33 m->errorOut(e, "ShhherCommand", "setParameters");
37 //**********************************************************************************************************************
38 string ShhherCommand::getHelpString(){
40 string helpString = "";
41 helpString += "The shhh.flows command reads a file containing flowgrams and creates a file of corrected sequences.\n";
45 m->errorOut(e, "ShhherCommand", "getHelpString");
49 //**********************************************************************************************************************
51 ShhherCommand::ShhherCommand(){
53 abort = true; calledHelp = true;
56 //initialize outputTypes
57 // vector<string> tempOutNames;
58 // outputTypes["pn.dist"] = tempOutNames;
62 m->errorOut(e, "ShhherCommand", "ShhherCommand");
67 //**********************************************************************************************************************
69 ShhherCommand::ShhherCommand(string option) {
73 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
74 MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
78 abort = false; calledHelp = false;
80 //allow user to run help
81 if(option == "help") { help(); abort = true; calledHelp = true; }
82 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
85 vector<string> myArray = setParameters();
87 OptionParser parser(option);
88 map<string,string> parameters = parser.getParameters();
90 ValidParameters validParameter;
91 map<string,string>::iterator it;
93 //check to make sure all parameters are valid for command
94 for (it = parameters.begin(); it != parameters.end(); it++) {
95 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
98 //initialize outputTypes
99 vector<string> tempOutNames;
100 // outputTypes["pn.dist"] = tempOutNames;
101 // outputTypes["fasta"] = tempOutNames;
103 //if the user changes the input directory command factory will send this info to us in the output parameter
104 string inputDir = validParameter.validFile(parameters, "inputdir", false);
105 if (inputDir == "not found"){ inputDir = ""; }
108 it = parameters.find("flow");
109 //user has given a template file
110 if(it != parameters.end()){
111 path = m->hasPath(it->second);
112 //if the user has not given a path then, add inputdir. else leave path alone.
113 if (path == "") { parameters["flow"] = inputDir + it->second; }
116 it = parameters.find("lookup");
117 //user has given a template file
118 if(it != parameters.end()){
119 path = m->hasPath(it->second);
120 //if the user has not given a path then, add inputdir. else leave path alone.
121 if (path == "") { parameters["lookup"] = inputDir + it->second; }
124 it = parameters.find("file");
125 //user has given a template file
126 if(it != parameters.end()){
127 path = m->hasPath(it->second);
128 //if the user has not given a path then, add inputdir. else leave path alone.
129 if (path == "") { parameters["file"] = inputDir + it->second; }
133 //if the user changes the output directory command factory will send this info to us in the output parameter
134 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
136 //check for required parameters
137 flowFileName = validParameter.validFile(parameters, "flow", true);
138 flowFilesFileName = validParameter.validFile(parameters, "file", true);
139 if (flowFileName == "not found" && flowFilesFileName == "not found") {
140 m->mothurOut("values for either flow or file must be provided for the shhh.flows command.");
141 m->mothurOutEndLine();
144 else if (flowFileName == "not open" || flowFilesFileName == "not open") { abort = true; }
146 if(flowFileName != "not found"){
147 compositeFASTAFileName = "";
148 compositeNamesFileName = "";
153 string thisoutputDir = m->hasPath(flowFilesFileName); //if user entered a file with a path then preserve it
155 //flow.files = 9 character offset
156 compositeFASTAFileName = thisoutputDir + m->getRootName(m->getSimpleName(flowFilesFileName)) + "shhh.fasta";
157 m->openOutputFile(compositeFASTAFileName, temp);
160 compositeNamesFileName = thisoutputDir + m->getRootName(m->getSimpleName(flowFilesFileName)) + "shhh.names";
161 m->openOutputFile(compositeNamesFileName, temp);
165 if(flowFilesFileName != "not found"){
168 ifstream flowFilesFile;
169 m->openInputFile(flowFilesFileName, flowFilesFile);
170 while(flowFilesFile){
171 fName = m->getline(flowFilesFile);
173 //test if file is valid
175 int ableToOpen = m->openInputFile(fName, in, "noerror");
177 if (ableToOpen == 1) {
178 if (inputDir != "") { //default path is set
179 string tryPath = inputDir + fName;
180 m->mothurOut("Unable to open " + fName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
182 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
188 if (ableToOpen == 1) {
189 if (m->getDefaultPath() != "") { //default path is set
190 string tryPath = m->getDefaultPath() + m->getSimpleName(fName);
191 m->mothurOut("Unable to open " + fName + ". Trying default " + tryPath); m->mothurOutEndLine();
193 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
199 //if you can't open it its not in current working directory or inputDir, try mothur excutable location
200 if (ableToOpen == 1) {
201 string exepath = m->argv;
202 string tempPath = exepath;
203 for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
204 exepath = exepath.substr(0, (tempPath.find_last_of('m')));
206 string tryPath = m->getFullPathName(exepath) + m->getSimpleName(fName);
207 m->mothurOut("Unable to open " + fName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
209 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
214 if (ableToOpen == 1) { m->mothurOut("Unable to open " + fName + ". Disregarding. "); m->mothurOutEndLine(); }
215 else { flowFileVector.push_back(fName); }
216 m->gobble(flowFilesFile);
218 flowFilesFile.close();
219 if (flowFileVector.size() == 0) { m->mothurOut("[ERROR]: no valid files."); m->mothurOutEndLine(); abort = true; }
222 if (outputDir == "") { outputDir = m->hasPath(flowFileName); }
223 flowFileVector.push_back(flowFileName);
226 //check for optional parameter and set defaults
227 // ...at some point should added some additional type checking...
229 temp = validParameter.validFile(parameters, "lookup", true);
230 if (temp == "not found") {
231 lookupFileName = "LookUp_Titanium.pat";
235 ableToOpen = m->openInputFile(lookupFileName, in, "noerror");
238 //if you can't open it, try input location
239 if (ableToOpen == 1) {
240 if (inputDir != "") { //default path is set
241 string tryPath = inputDir + lookupFileName;
242 m->mothurOut("Unable to open " + lookupFileName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
244 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
246 lookupFileName = tryPath;
250 //if you can't open it, try default location
251 if (ableToOpen == 1) {
252 if (m->getDefaultPath() != "") { //default path is set
253 string tryPath = m->getDefaultPath() + m->getSimpleName(lookupFileName);
254 m->mothurOut("Unable to open " + lookupFileName + ". Trying default " + tryPath); m->mothurOutEndLine();
256 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
258 lookupFileName = tryPath;
262 //if you can't open it its not in current working directory or inputDir, try mothur excutable location
263 if (ableToOpen == 1) {
264 string exepath = m->argv;
265 string tempPath = exepath;
266 for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
267 exepath = exepath.substr(0, (tempPath.find_last_of('m')));
269 string tryPath = m->getFullPathName(exepath) + m->getSimpleName(lookupFileName);
270 m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
272 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
274 lookupFileName = tryPath;
277 if (ableToOpen == 1) { m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true; }
279 else if(temp == "not open") {
281 lookupFileName = validParameter.validFile(parameters, "lookup", false);
283 //if you can't open it its not inputDir, try mothur excutable location
284 string exepath = m->argv;
285 string tempPath = exepath;
286 for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
287 exepath = exepath.substr(0, (tempPath.find_last_of('m')));
289 string tryPath = m->getFullPathName(exepath) + lookupFileName;
290 m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
292 int ableToOpen = m->openInputFile(tryPath, in2, "noerror");
294 lookupFileName = tryPath;
296 if (ableToOpen == 1) { m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true; }
297 }else { lookupFileName = temp; }
299 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
300 m->setProcessors(temp);
301 m->mothurConvert(temp, processors);
303 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found"){ temp = "0.01"; }
304 m->mothurConvert(temp, cutoff);
306 temp = validParameter.validFile(parameters, "mindelta", false); if (temp == "not found"){ temp = "0.000001"; }
307 m->mothurConvert(temp, minDelta);
309 temp = validParameter.validFile(parameters, "maxiter", false); if (temp == "not found"){ temp = "1000"; }
310 m->mothurConvert(temp, maxIters);
312 temp = validParameter.validFile(parameters, "large", false); if (temp == "not found"){ temp = "0"; }
313 m->mothurConvert(temp, largeSize);
314 if (largeSize != 0) { large = true; }
315 else { large = false; }
316 if (largeSize < 0) { m->mothurOut("The value of the large cannot be negative.\n"); }
319 if (large) { m->mothurOut("The large parameter is not available with the MPI-Enabled version.\n"); large=false; }
323 temp = validParameter.validFile(parameters, "sigma", false);if (temp == "not found") { temp = "60"; }
324 m->mothurConvert(temp, sigma);
326 flowOrder = validParameter.validFile(parameters, "order", false);
327 if (flowOrder == "not found"){ flowOrder = "TACG"; }
328 else if(flowOrder.length() != 4){
329 m->mothurOut("The value of the order option must be four bases long\n");
337 catch(exception& e) {
338 m->errorOut(e, "ShhherCommand", "ShhherCommand");
342 //**********************************************************************************************************************
344 int ShhherCommand::execute(){
346 if (abort == true) { if (calledHelp) { return 0; } return 2; }
353 for(int i=1;i<ncpus;i++){
354 MPI_Send(&abort, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
356 if(abort == 1){ return 0; }
360 m->mothurOut("\nGetting preliminary data...\n");
361 getSingleLookUp(); if (m->control_pressed) { return 0; }
362 getJointLookUp(); if (m->control_pressed) { return 0; }
364 vector<string> flowFileVector;
365 if(flowFilesFileName != "not found"){
368 ifstream flowFilesFile;
369 m->openInputFile(flowFilesFileName, flowFilesFile);
370 while(flowFilesFile){
371 fName = m->getline(flowFilesFile);
372 flowFileVector.push_back(fName);
373 m->gobble(flowFilesFile);
377 flowFileVector.push_back(flowFileName);
380 int numFiles = flowFileVector.size();
382 for(int i=1;i<ncpus;i++){
383 MPI_Send(&numFiles, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
386 for(int i=0;i<numFiles;i++){
388 if (m->control_pressed) { break; }
390 double begClock = clock();
391 unsigned long long begTime = time(NULL);
393 flowFileName = flowFileVector[i];
395 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
396 m->mothurOut("Reading flowgrams...\n");
399 if (m->control_pressed) { break; }
401 m->mothurOut("Identifying unique flowgrams...\n");
404 if (m->control_pressed) { break; }
406 m->mothurOut("Calculating distances between flowgrams...\n");
408 strcpy(fileName, flowFileName.c_str());
410 for(int i=1;i<ncpus;i++){
411 MPI_Send(&fileName[0], 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
413 MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
414 MPI_Send(&numUniques, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
415 MPI_Send(&numFlowCells, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
416 MPI_Send(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, i, tag, MPI_COMM_WORLD);
417 MPI_Send(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
418 MPI_Send(&mapUniqueToSeq[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
419 MPI_Send(&mapSeqToUnique[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
420 MPI_Send(&lengths[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
421 MPI_Send(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
422 MPI_Send(&cutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
425 string distFileName = flowDistMPI(0, int(sqrt(1.0/float(ncpus)) * numUniques));
427 if (m->control_pressed) { break; }
430 for(int i=1;i<ncpus;i++){
431 MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
433 m->appendFiles((distFileName + ".temp." + toString(i)), distFileName);
434 m->mothurRemove((distFileName + ".temp." + toString(i)));
437 string namesFileName = createNamesFile();
439 if (m->control_pressed) { break; }
441 m->mothurOut("\nClustering flowgrams...\n");
442 string listFileName = cluster(distFileName, namesFileName);
444 if (m->control_pressed) { break; }
448 getOTUData(listFileName);
450 m->mothurRemove(distFileName);
451 m->mothurRemove(namesFileName);
452 m->mothurRemove(listFileName);
454 if (m->control_pressed) { break; }
458 if (m->control_pressed) { break; }
461 for(int i=1;i<ncpus;i++){
462 MPI_Send(&numOTUs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
463 MPI_Send(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
464 MPI_Send(&uniqueFlowgrams[0], numFlowCells * numUniques, MPI_SHORT, i, tag, MPI_COMM_WORLD);
465 MPI_Send(&sigma, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
468 if (m->control_pressed) { break; }
473 int numOTUsOnCPU = numOTUs / ncpus;
474 int numSeqsOnCPU = numSeqs / ncpus;
475 m->mothurOut("\nDenoising flowgrams...\n");
476 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
478 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
480 double cycClock = clock();
481 unsigned long long cycTime = time(NULL);
484 if (m->control_pressed) { break; }
486 int total = singleTau.size();
487 for(int i=1;i<ncpus;i++){
488 MPI_Send(&total, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
489 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
490 MPI_Send(¢roids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
492 MPI_Send(&singleTau[0], total, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
493 MPI_Send(&seqNumber[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
494 MPI_Send(&seqIndex[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
495 MPI_Send(&nSeqsPerOTU[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
496 MPI_Send(&cumNumSeqs[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
499 calcCentroidsDriver(0, numOTUsOnCPU);
501 for(int i=1;i<ncpus;i++){
502 int otuStart = i * numOTUs / ncpus;
503 int otuStop = (i + 1) * numOTUs / ncpus;
505 vector<int> tempCentroids(numOTUs, 0);
506 vector<short> tempChange(numOTUs, 0);
508 MPI_Recv(&tempCentroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
509 MPI_Recv(&tempChange[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD, &status);
511 for(int j=otuStart;j<otuStop;j++){
512 centroids[j] = tempCentroids[j];
513 change[j] = tempChange[j];
517 maxDelta = getNewWeights(); if (m->control_pressed) { break; }
518 double nLL = getLikelihood(); if (m->control_pressed) { break; }
519 checkCentroids(); if (m->control_pressed) { break; }
521 for(int i=1;i<ncpus;i++){
522 MPI_Send(¢roids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
523 MPI_Send(&weight[0], numOTUs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
524 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
527 calcNewDistancesParent(0, numSeqsOnCPU);
529 total = singleTau.size();
531 for(int i=1;i<ncpus;i++){
533 int seqStart = i * numSeqs / ncpus;
534 int seqStop = (i + 1) * numSeqs / ncpus;
536 MPI_Recv(&childTotal, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
538 vector<int> childSeqIndex(childTotal, 0);
539 vector<double> childSingleTau(childTotal, 0);
540 vector<double> childDist(numSeqs * numOTUs, 0);
541 vector<int> otuIndex(childTotal, 0);
543 MPI_Recv(&childSeqIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
544 MPI_Recv(&childSingleTau[0], childTotal, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
545 MPI_Recv(&childDist[0], numOTUs * numSeqs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
546 MPI_Recv(&otuIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
548 int oldTotal = total;
550 singleTau.resize(total, 0);
551 seqIndex.resize(total, 0);
552 seqNumber.resize(total, 0);
556 for(int j=oldTotal;j<total;j++){
557 int otuI = otuIndex[childIndex];
558 int seqI = childSeqIndex[childIndex];
560 singleTau[j] = childSingleTau[childIndex];
562 aaP[otuI][nSeqsPerOTU[otuI]] = j;
563 aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
568 int index = seqStart * numOTUs;
569 for(int j=seqStart;j<seqStop;j++){
570 for(int k=0;k<numOTUs;k++){
571 dist[index] = childDist[index];
579 m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
581 if((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
583 for(int i=1;i<ncpus;i++){
584 MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
589 for(int i=1;i<ncpus;i++){
590 MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD); //send kill command
596 if (m->control_pressed) { break; }
598 m->mothurOut("\nFinalizing...\n");
601 if (m->control_pressed) { break; }
605 vector<int> otuCounts(numOTUs, 0);
606 for(int i=0;i<numSeqs;i++) { otuCounts[otuData[i]]++; }
607 calcCentroidsDriver(0, numOTUs);
609 if (m->control_pressed) { break; }
611 writeQualities(otuCounts); if (m->control_pressed) { break; }
612 writeSequences(otuCounts); if (m->control_pressed) { break; }
613 writeNames(otuCounts); if (m->control_pressed) { break; }
614 writeClusters(otuCounts); if (m->control_pressed) { break; }
615 writeGroups(); if (m->control_pressed) { break; }
618 m->mothurOut("Total time to process " + toString(flowFileName) + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
624 MPI_Recv(&abort, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
625 if(abort){ return 0; }
628 MPI_Recv(&numFiles, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
630 for(int i=0;i<numFiles;i++){
632 if (m->control_pressed) { break; }
634 //Now into the pyrodist part
638 MPI_Recv(&fileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
639 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
640 MPI_Recv(&numUniques, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
641 MPI_Recv(&numFlowCells, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
643 flowDataIntI.resize(numSeqs * numFlowCells);
644 flowDataPrI.resize(numSeqs * numFlowCells);
645 mapUniqueToSeq.resize(numSeqs);
646 mapSeqToUnique.resize(numSeqs);
647 lengths.resize(numSeqs);
648 jointLookUp.resize(NUMBINS * NUMBINS);
650 MPI_Recv(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
651 MPI_Recv(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
652 MPI_Recv(&mapUniqueToSeq[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
653 MPI_Recv(&mapSeqToUnique[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
654 MPI_Recv(&lengths[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
655 MPI_Recv(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
656 MPI_Recv(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
658 flowFileName = string(fileName);
659 int flowDistStart = int(sqrt(float(pid)/float(ncpus)) * numUniques);
660 int flowDistEnd = int(sqrt(float(pid+1)/float(ncpus)) * numUniques);
662 string distanceStringChild = flowDistMPI(flowDistStart, flowDistEnd);
664 if (m->control_pressed) { break; }
667 MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
669 //Now into the pyronoise part
670 MPI_Recv(&numOTUs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
672 singleLookUp.resize(HOMOPS * NUMBINS);
673 uniqueFlowgrams.resize(numUniques * numFlowCells);
674 weight.resize(numOTUs);
675 centroids.resize(numOTUs);
676 change.resize(numOTUs);
677 dist.assign(numOTUs * numSeqs, 0);
678 nSeqsPerOTU.resize(numOTUs);
679 cumNumSeqs.resize(numOTUs);
681 MPI_Recv(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
682 MPI_Recv(&uniqueFlowgrams[0], uniqueFlowgrams.size(), MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
683 MPI_Recv(&sigma, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
685 int startOTU = pid * numOTUs / ncpus;
686 int endOTU = (pid + 1) * numOTUs / ncpus;
688 int startSeq = pid * numSeqs / ncpus;
689 int endSeq = (pid + 1) * numSeqs /ncpus;
695 if (m->control_pressed) { break; }
697 MPI_Recv(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
698 singleTau.assign(total, 0.0000);
699 seqNumber.assign(total, 0);
700 seqIndex.assign(total, 0);
702 MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
703 MPI_Recv(¢roids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
704 MPI_Recv(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
705 MPI_Recv(&seqNumber[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
706 MPI_Recv(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
707 MPI_Recv(&nSeqsPerOTU[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
708 MPI_Recv(&cumNumSeqs[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
710 calcCentroidsDriver(startOTU, endOTU);
712 MPI_Send(¢roids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD);
713 MPI_Send(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD);
715 MPI_Recv(¢roids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
716 MPI_Recv(&weight[0], numOTUs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
717 MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
719 vector<int> otuIndex(total, 0);
720 calcNewDistancesChildMPI(startSeq, endSeq, otuIndex);
721 total = otuIndex.size();
723 MPI_Send(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
724 MPI_Send(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
725 MPI_Send(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
726 MPI_Send(&dist[0], numOTUs * numSeqs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
727 MPI_Send(&otuIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
729 MPI_Recv(&live, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
734 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
736 MPI_Barrier(MPI_COMM_WORLD);
739 if(compositeFASTAFileName != ""){
740 outputNames.push_back(compositeFASTAFileName);
741 outputNames.push_back(compositeNamesFileName);
744 m->mothurOutEndLine();
745 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
746 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
747 m->mothurOutEndLine();
752 catch(exception& e) {
753 m->errorOut(e, "ShhherCommand", "execute");
757 /**************************************************************************************************/
758 string ShhherCommand::createNamesFile(){
761 vector<string> duplicateNames(numUniques, "");
762 for(int i=0;i<numSeqs;i++){
763 duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
766 string nameFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
769 m->openOutputFile(nameFileName, nameFile);
771 for(int i=0;i<numUniques;i++){
773 if (m->control_pressed) { break; }
775 // nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
776 nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
782 catch(exception& e) {
783 m->errorOut(e, "ShhherCommand", "createNamesFile");
787 /**************************************************************************************************/
789 string ShhherCommand::flowDistMPI(int startSeq, int stopSeq){
791 ostringstream outStream;
792 outStream.setf(ios::fixed, ios::floatfield);
793 outStream.setf(ios::dec, ios::basefield);
794 outStream.setf(ios::showpoint);
795 outStream.precision(6);
797 int begTime = time(NULL);
798 double begClock = clock();
800 for(int i=startSeq;i<stopSeq;i++){
802 if (m->control_pressed) { break; }
804 for(int j=0;j<i;j++){
805 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
807 if(flowDistance < 1e-6){
808 outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
810 else if(flowDistance <= cutoff){
811 outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
815 m->mothurOut(toString(i) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
819 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
820 if(pid != 0){ fDistFileName += ".temp." + toString(pid); }
822 if (m->control_pressed) { return fDistFileName; }
824 m->mothurOut(toString(stopSeq) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
826 ofstream distFile(fDistFileName.c_str());
827 distFile << outStream.str();
830 return fDistFileName;
832 catch(exception& e) {
833 m->errorOut(e, "ShhherCommand", "flowDistMPI");
837 /**************************************************************************************************/
839 void ShhherCommand::getOTUData(string listFileName){
843 m->openInputFile(listFileName, listFile);
846 listFile >> label >> numOTUs;
848 otuData.assign(numSeqs, 0);
849 cumNumSeqs.assign(numOTUs, 0);
850 nSeqsPerOTU.assign(numOTUs, 0);
851 aaP.clear();aaP.resize(numOTUs);
857 string singleOTU = "";
859 for(int i=0;i<numOTUs;i++){
861 if (m->control_pressed) { break; }
863 listFile >> singleOTU;
865 istringstream otuString(singleOTU);
871 for(int j=0;j<singleOTU.length();j++){
872 char letter = otuString.get();
878 map<string,int>::iterator nmIt = nameMap.find(seqName);
879 int index = nmIt->second;
885 aaP[i].push_back(index);
890 map<string,int>::iterator nmIt = nameMap.find(seqName);
892 int index = nmIt->second;
897 aaP[i].push_back(index);
902 sort(aaP[i].begin(), aaP[i].end());
903 for(int j=0;j<nSeqsPerOTU[i];j++){
904 seqNumber.push_back(aaP[i][j]);
906 for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
913 for(int i=1;i<numOTUs;i++){
914 cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
917 seqIndex = seqNumber;
922 catch(exception& e) {
923 m->errorOut(e, "ShhherCommand", "getOTUData");
928 /**************************************************************************************************/
930 void ShhherCommand::initPyroCluster(){
932 if (numOTUs < processors) { processors = 1; }
934 dist.assign(numSeqs * numOTUs, 0);
935 change.assign(numOTUs, 1);
936 centroids.assign(numOTUs, -1);
937 weight.assign(numOTUs, 0);
938 singleTau.assign(numSeqs, 1.0);
940 nSeqsBreaks.assign(processors+1, 0);
941 nOTUsBreaks.assign(processors+1, 0);
944 for(int i=0;i<processors;i++){
945 nSeqsBreaks[i+1] = nSeqsBreaks[i] + (int)((double) numSeqs / (double) processors);
946 nOTUsBreaks[i+1] = nOTUsBreaks[i] + (int)((double) numOTUs / (double) processors);
948 nSeqsBreaks[processors] = numSeqs;
949 nOTUsBreaks[processors] = numOTUs;
951 catch(exception& e) {
952 m->errorOut(e, "ShhherCommand", "initPyroCluster");
957 /**************************************************************************************************/
959 void ShhherCommand::fill(){
962 for(int i=0;i<numOTUs;i++){
964 if (m->control_pressed) { break; }
966 cumNumSeqs[i] = index;
967 for(int j=0;j<nSeqsPerOTU[i];j++){
968 seqNumber[index] = aaP[i][j];
969 seqIndex[index] = aaI[i][j];
975 catch(exception& e) {
976 m->errorOut(e, "ShhherCommand", "fill");
981 /**************************************************************************************************/
983 void ShhherCommand::getFlowData(){
986 m->openInputFile(flowFileName, flowFile);
989 seqNameVector.clear();
991 flowDataIntI.clear();
995 int currentNumFlowCells;
999 flowFile >> numFlowCells;
1000 int index = 0;//pcluster
1001 while(!flowFile.eof()){
1003 if (m->control_pressed) { break; }
1005 flowFile >> seqName >> currentNumFlowCells;
1006 lengths.push_back(currentNumFlowCells);
1008 seqNameVector.push_back(seqName);
1009 nameMap[seqName] = index++;//pcluster
1011 for(int i=0;i<numFlowCells;i++){
1012 flowFile >> intensity;
1013 if(intensity > 9.99) { intensity = 9.99; }
1014 int intI = int(100 * intensity + 0.0001);
1015 flowDataIntI.push_back(intI);
1017 m->gobble(flowFile);
1021 numSeqs = seqNameVector.size();
1023 for(int i=0;i<numSeqs;i++){
1025 if (m->control_pressed) { break; }
1027 int iNumFlowCells = i * numFlowCells;
1028 for(int j=lengths[i];j<numFlowCells;j++){
1029 flowDataIntI[iNumFlowCells + j] = 0;
1034 catch(exception& e) {
1035 m->errorOut(e, "ShhherCommand", "getFlowData");
1039 /**************************************************************************************************/
1040 void ShhherCommand::calcNewDistancesChildMPI(int startSeq, int stopSeq, vector<int>& otuIndex){
1043 vector<double> newTau(numOTUs,0);
1044 vector<double> norms(numSeqs, 0);
1049 for(int i=startSeq;i<stopSeq;i++){
1051 if (m->control_pressed) { break; }
1053 double offset = 1e8;
1054 int indexOffset = i * numOTUs;
1056 for(int j=0;j<numOTUs;j++){
1058 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1059 dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1061 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1062 offset = dist[indexOffset + j];
1066 for(int j=0;j<numOTUs;j++){
1067 if(weight[j] > MIN_WEIGHT){
1068 newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1069 norms[i] += newTau[j];
1076 for(int j=0;j<numOTUs;j++){
1078 newTau[j] /= norms[i];
1080 if(newTau[j] > MIN_TAU){
1081 otuIndex.push_back(j);
1082 seqIndex.push_back(i);
1083 singleTau.push_back(newTau[j]);
1089 catch(exception& e) {
1090 m->errorOut(e, "ShhherCommand", "calcNewDistancesChildMPI");
1095 /**************************************************************************************************/
1097 void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){
1102 vector<double> newTau(numOTUs,0);
1103 vector<double> norms(numSeqs, 0);
1104 nSeqsPerOTU.assign(numOTUs, 0);
1106 for(int i=startSeq;i<stopSeq;i++){
1108 if (m->control_pressed) { break; }
1110 int indexOffset = i * numOTUs;
1112 double offset = 1e8;
1114 for(int j=0;j<numOTUs;j++){
1116 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1117 dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1120 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1121 offset = dist[indexOffset + j];
1125 for(int j=0;j<numOTUs;j++){
1126 if(weight[j] > MIN_WEIGHT){
1127 newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1128 norms[i] += newTau[j];
1135 for(int j=0;j<numOTUs;j++){
1136 newTau[j] /= norms[i];
1139 for(int j=0;j<numOTUs;j++){
1140 if(newTau[j] > MIN_TAU){
1142 int oldTotal = total;
1146 singleTau.resize(total, 0);
1147 seqNumber.resize(total, 0);
1148 seqIndex.resize(total, 0);
1150 singleTau[oldTotal] = newTau[j];
1152 aaP[j][nSeqsPerOTU[j]] = oldTotal;
1153 aaI[j][nSeqsPerOTU[j]] = i;
1161 catch(exception& e) {
1162 m->errorOut(e, "ShhherCommand", "calcNewDistancesParent");
1167 /**************************************************************************************************/
1169 void ShhherCommand::setOTUs(){
1172 vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
1174 for(int i=0;i<numOTUs;i++){
1176 if (m->control_pressed) { break; }
1178 for(int j=0;j<nSeqsPerOTU[i];j++){
1179 int index = cumNumSeqs[i] + j;
1180 double tauValue = singleTau[seqNumber[index]];
1181 int sIndex = seqIndex[index];
1182 bigTauMatrix[sIndex * numOTUs + i] = tauValue;
1186 for(int i=0;i<numSeqs;i++){
1187 double maxTau = -1.0000;
1189 for(int j=0;j<numOTUs;j++){
1190 if(bigTauMatrix[i * numOTUs + j] > maxTau){
1191 maxTau = bigTauMatrix[i * numOTUs + j];
1196 otuData[i] = maxOTU;
1199 nSeqsPerOTU.assign(numOTUs, 0);
1201 for(int i=0;i<numSeqs;i++){
1202 int index = otuData[i];
1204 singleTau[i] = 1.0000;
1207 aaP[index][nSeqsPerOTU[index]] = i;
1208 aaI[index][nSeqsPerOTU[index]] = i;
1210 nSeqsPerOTU[index]++;
1214 catch(exception& e) {
1215 m->errorOut(e, "ShhherCommand", "setOTUs");
1220 /**************************************************************************************************/
1222 void ShhherCommand::getUniques(){
1227 uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
1228 uniqueCount.assign(numSeqs, 0); // anWeights
1229 uniqueLengths.assign(numSeqs, 0);
1230 mapSeqToUnique.assign(numSeqs, -1);
1231 mapUniqueToSeq.assign(numSeqs, -1);
1233 vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
1235 for(int i=0;i<numSeqs;i++){
1237 if (m->control_pressed) { break; }
1241 vector<short> current(numFlowCells);
1242 for(int j=0;j<numFlowCells;j++){
1243 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
1246 for(int j=0;j<numUniques;j++){
1247 int offset = j * numFlowCells;
1251 if(lengths[i] < uniqueLengths[j]) { shorterLength = lengths[i]; }
1252 else { shorterLength = uniqueLengths[j]; }
1254 for(int k=0;k<shorterLength;k++){
1255 if(current[k] != uniqueFlowgrams[offset + k]){
1262 mapSeqToUnique[i] = j;
1265 if(lengths[i] > uniqueLengths[j]) { uniqueLengths[j] = lengths[i]; }
1271 if(index == numUniques){
1272 uniqueLengths[numUniques] = lengths[i];
1273 uniqueCount[numUniques] = 1;
1274 mapSeqToUnique[i] = numUniques;//anMap
1275 mapUniqueToSeq[numUniques] = i;//anF
1277 for(int k=0;k<numFlowCells;k++){
1278 uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
1279 uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
1285 uniqueFlowDataIntI.resize(numFlowCells * numUniques);
1286 uniqueLengths.resize(numUniques);
1288 flowDataPrI.resize(numSeqs * numFlowCells, 0);
1289 for(int i=0;i<flowDataPrI.size();i++) { if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]); }
1291 catch(exception& e) {
1292 m->errorOut(e, "ShhherCommand", "getUniques");
1297 /**************************************************************************************************/
1299 float ShhherCommand::calcPairwiseDist(int seqA, int seqB){
1301 int minLength = lengths[mapSeqToUnique[seqA]];
1302 if(lengths[seqB] < minLength){ minLength = lengths[mapSeqToUnique[seqB]]; }
1304 int ANumFlowCells = seqA * numFlowCells;
1305 int BNumFlowCells = seqB * numFlowCells;
1309 for(int i=0;i<minLength;i++){
1311 if (m->control_pressed) { break; }
1313 int flowAIntI = flowDataIntI[ANumFlowCells + i];
1314 float flowAPrI = flowDataPrI[ANumFlowCells + i];
1316 int flowBIntI = flowDataIntI[BNumFlowCells + i];
1317 float flowBPrI = flowDataPrI[BNumFlowCells + i];
1318 dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
1321 dist /= (float) minLength;
1324 catch(exception& e) {
1325 m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
1330 //**********************************************************************************************************************/
1332 string ShhherCommand::cluster(string distFileName, string namesFileName){
1335 ReadMatrix* read = new ReadColumnMatrix(distFileName);
1336 read->setCutoff(cutoff);
1338 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
1339 clusterNameMap->readMap();
1340 read->read(clusterNameMap);
1342 ListVector* list = read->getListVector();
1343 SparseMatrix* matrix = read->getMatrix();
1346 delete clusterNameMap;
1348 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
1350 Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest");
1351 string tag = cluster->getTag();
1353 double clusterCutoff = cutoff;
1354 while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
1356 if (m->control_pressed) { break; }
1358 cluster->update(clusterCutoff);
1361 list->setLabel(toString(cutoff));
1363 string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
1365 m->openOutputFile(listFileName, listFile);
1366 list->print(listFile);
1369 delete matrix; delete cluster; delete rabund; delete list;
1371 return listFileName;
1373 catch(exception& e) {
1374 m->errorOut(e, "ShhherCommand", "cluster");
1379 /**************************************************************************************************/
1381 void ShhherCommand::calcCentroidsDriver(int start, int finish){
1383 //this function gets the most likely homopolymer length at a flow position for a group of sequences
1388 for(int i=start;i<finish;i++){
1390 if (m->control_pressed) { break; }
1394 int minFlowGram = 100000000;
1395 double minFlowValue = 1e8;
1396 change[i] = 0; //FALSE
1398 for(int j=0;j<nSeqsPerOTU[i];j++){
1399 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
1402 if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
1403 vector<double> adF(nSeqsPerOTU[i]);
1404 vector<int> anL(nSeqsPerOTU[i]);
1406 for(int j=0;j<nSeqsPerOTU[i];j++){
1407 int index = cumNumSeqs[i] + j;
1408 int nI = seqIndex[index];
1409 int nIU = mapSeqToUnique[nI];
1412 for(k=0;k<position;k++){
1418 anL[position] = nIU;
1419 adF[position] = 0.0000;
1424 for(int j=0;j<nSeqsPerOTU[i];j++){
1425 int index = cumNumSeqs[i] + j;
1426 int nI = seqIndex[index];
1428 double tauValue = singleTau[seqNumber[index]];
1430 for(int k=0;k<position;k++){
1431 double dist = getDistToCentroid(anL[k], nI, lengths[nI]);
1432 adF[k] += dist * tauValue;
1436 for(int j=0;j<position;j++){
1437 if(adF[j] < minFlowValue){
1439 minFlowValue = adF[j];
1443 if(centroids[i] != anL[minFlowGram]){
1445 centroids[i] = anL[minFlowGram];
1448 else if(centroids[i] != -1){
1454 catch(exception& e) {
1455 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1460 /**************************************************************************************************/
1462 double ShhherCommand::getDistToCentroid(int cent, int flow, int length){
1465 int flowAValue = cent * numFlowCells;
1466 int flowBValue = flow * numFlowCells;
1470 for(int i=0;i<length;i++){
1471 dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
1476 return dist / (double)length;
1478 catch(exception& e) {
1479 m->errorOut(e, "ShhherCommand", "getDistToCentroid");
1484 /**************************************************************************************************/
1486 double ShhherCommand::getNewWeights(){
1489 double maxChange = 0;
1491 for(int i=0;i<numOTUs;i++){
1493 if (m->control_pressed) { break; }
1495 double difference = weight[i];
1498 for(int j=0;j<nSeqsPerOTU[i];j++){
1499 int index = cumNumSeqs[i] + j;
1500 double tauValue = singleTau[seqNumber[index]];
1501 weight[i] += tauValue;
1504 difference = fabs(weight[i] - difference);
1505 if(difference > maxChange){ maxChange = difference; }
1509 catch(exception& e) {
1510 m->errorOut(e, "ShhherCommand", "getNewWeights");
1515 /**************************************************************************************************/
1517 double ShhherCommand::getLikelihood(){
1521 vector<long double> P(numSeqs, 0);
1524 for(int i=0;i<numOTUs;i++){
1525 if(weight[i] > MIN_WEIGHT){
1531 for(int i=0;i<numOTUs;i++){
1533 if (m->control_pressed) { break; }
1535 for(int j=0;j<nSeqsPerOTU[i];j++){
1536 int index = cumNumSeqs[i] + j;
1537 int nI = seqIndex[index];
1538 double singleDist = dist[seqNumber[index]];
1540 P[nI] += weight[i] * exp(-singleDist * sigma);
1544 for(int i=0;i<numSeqs;i++){
1545 if(P[i] == 0){ P[i] = DBL_EPSILON; }
1550 nLL = nLL -(double)numSeqs * log(sigma);
1554 catch(exception& e) {
1555 m->errorOut(e, "ShhherCommand", "getNewWeights");
1560 /**************************************************************************************************/
1562 void ShhherCommand::checkCentroids(){
1564 vector<int> unique(numOTUs, 1);
1566 for(int i=0;i<numOTUs;i++){
1567 if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
1572 for(int i=0;i<numOTUs;i++){
1574 if (m->control_pressed) { break; }
1577 for(int j=i+1;j<numOTUs;j++){
1580 if(centroids[j] == centroids[i]){
1584 weight[i] += weight[j];
1592 catch(exception& e) {
1593 m->errorOut(e, "ShhherCommand", "checkCentroids");
1597 /**************************************************************************************************/
1601 void ShhherCommand::writeQualities(vector<int> otuCounts){
1604 string thisOutputDir = outputDir;
1605 if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
1606 string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.qual";
1608 ofstream qualityFile;
1609 m->openOutputFile(qualityFileName, qualityFile);
1611 qualityFile.setf(ios::fixed, ios::floatfield);
1612 qualityFile.setf(ios::showpoint);
1613 qualityFile << setprecision(6);
1615 vector<vector<int> > qualities(numOTUs);
1616 vector<double> pr(HOMOPS, 0);
1619 for(int i=0;i<numOTUs;i++){
1621 if (m->control_pressed) { break; }
1626 if(nSeqsPerOTU[i] > 0){
1627 qualities[i].assign(1024, -1);
1629 while(index < numFlowCells){
1630 double maxPrValue = 1e8;
1631 short maxPrIndex = -1;
1632 double count = 0.0000;
1634 pr.assign(HOMOPS, 0);
1636 for(int j=0;j<nSeqsPerOTU[i];j++){
1637 int lIndex = cumNumSeqs[i] + j;
1638 double tauValue = singleTau[seqNumber[lIndex]];
1639 int sequenceIndex = aaI[i][j];
1640 short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
1644 for(int s=0;s<HOMOPS;s++){
1645 pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
1649 maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
1650 maxPrValue = pr[maxPrIndex];
1652 if(count > MIN_COUNT){
1654 double norm = 0.0000;
1656 for(int s=0;s<HOMOPS;s++){
1657 norm += exp(-(pr[s] - maxPrValue));
1660 for(int s=1;s<=maxPrIndex;s++){
1662 double temp = 0.0000;
1664 U += exp(-(pr[s-1]-maxPrValue))/norm;
1672 temp = floor(-10 * temp);
1673 value = (int)floor(temp);
1674 if(value > 100){ value = 100; }
1676 qualities[i][base] = (int)value;
1686 if(otuCounts[i] > 0){
1687 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
1689 int j=4; //need to get past the first four bases
1690 while(qualities[i][j] != -1){
1691 qualityFile << qualities[i][j] << ' ';
1694 qualityFile << endl;
1697 qualityFile.close();
1698 outputNames.push_back(qualityFileName);
1701 catch(exception& e) {
1702 m->errorOut(e, "ShhherCommand", "writeQualities");
1707 /**************************************************************************************************/
1709 void ShhherCommand::writeSequences(vector<int> otuCounts){
1711 string thisOutputDir = outputDir;
1712 if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
1713 string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.fasta";
1715 m->openOutputFile(fastaFileName, fastaFile);
1717 vector<string> names(numOTUs, "");
1719 for(int i=0;i<numOTUs;i++){
1721 if (m->control_pressed) { break; }
1723 int index = centroids[i];
1725 if(otuCounts[i] > 0){
1726 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
1730 for(int j=0;j<numFlowCells;j++){
1732 char base = flowOrder[j % 4];
1733 for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
1738 fastaFile << newSeq.substr(4) << endl;
1743 outputNames.push_back(fastaFileName);
1745 if(compositeFASTAFileName != ""){
1746 m->appendFiles(fastaFileName, compositeFASTAFileName);
1749 catch(exception& e) {
1750 m->errorOut(e, "ShhherCommand", "writeSequences");
1755 /**************************************************************************************************/
1757 void ShhherCommand::writeNames(vector<int> otuCounts){
1759 string thisOutputDir = outputDir;
1760 if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
1761 string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.names";
1763 m->openOutputFile(nameFileName, nameFile);
1765 for(int i=0;i<numOTUs;i++){
1767 if (m->control_pressed) { break; }
1769 if(otuCounts[i] > 0){
1770 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
1772 for(int j=1;j<nSeqsPerOTU[i];j++){
1773 nameFile << ',' << seqNameVector[aaI[i][j]];
1780 outputNames.push_back(nameFileName);
1783 if(compositeNamesFileName != ""){
1784 m->appendFiles(nameFileName, compositeNamesFileName);
1787 catch(exception& e) {
1788 m->errorOut(e, "ShhherCommand", "writeNames");
1793 /**************************************************************************************************/
1795 void ShhherCommand::writeGroups(){
1797 string thisOutputDir = outputDir;
1798 if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
1799 string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
1800 string groupFileName = fileRoot + "shhh.groups";
1802 m->openOutputFile(groupFileName, groupFile);
1804 for(int i=0;i<numSeqs;i++){
1805 if (m->control_pressed) { break; }
1806 groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
1809 outputNames.push_back(groupFileName);
1812 catch(exception& e) {
1813 m->errorOut(e, "ShhherCommand", "writeGroups");
1818 /**************************************************************************************************/
1820 void ShhherCommand::writeClusters(vector<int> otuCounts){
1822 string thisOutputDir = outputDir;
1823 if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
1824 string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.counts";
1825 ofstream otuCountsFile;
1826 m->openOutputFile(otuCountsFileName, otuCountsFile);
1828 string bases = flowOrder;
1830 for(int i=0;i<numOTUs;i++){
1832 if (m->control_pressed) {
1835 //output the translated version of the centroid sequence for the otu
1836 if(otuCounts[i] > 0){
1837 int index = centroids[i];
1839 otuCountsFile << "ideal\t";
1840 for(int j=8;j<numFlowCells;j++){
1841 char base = bases[j % 4];
1842 for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
1843 otuCountsFile << base;
1846 otuCountsFile << endl;
1848 for(int j=0;j<nSeqsPerOTU[i];j++){
1849 int sequence = aaI[i][j];
1850 otuCountsFile << seqNameVector[sequence] << '\t';
1854 for(int k=0;k<lengths[sequence];k++){
1855 char base = bases[k % 4];
1856 int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
1858 for(int s=0;s<freq;s++){
1860 //otuCountsFile << base;
1863 otuCountsFile << newSeq.substr(4) << endl;
1865 otuCountsFile << endl;
1868 otuCountsFile.close();
1869 outputNames.push_back(otuCountsFileName);
1872 catch(exception& e) {
1873 m->errorOut(e, "ShhherCommand", "writeClusters");
1879 //**********************************************************************************************************************
1881 int ShhherCommand::execute(){
1883 if (abort == true) { return 0; }
1885 getSingleLookUp(); if (m->control_pressed) { return 0; }
1886 getJointLookUp(); if (m->control_pressed) { return 0; }
1888 int numFiles = flowFileVector.size();
1890 if (numFiles < processors) { processors = numFiles; }
1892 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1893 if (processors == 1) { driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName, 0, flowFileVector.size()); }
1894 else { createProcesses(flowFileVector); } //each processor processes one file
1896 driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName, 0, flowFileVector.size());
1899 if(compositeFASTAFileName != ""){
1900 outputNames.push_back(compositeFASTAFileName);
1901 outputNames.push_back(compositeNamesFileName);
1904 m->mothurOutEndLine();
1905 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
1906 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
1907 m->mothurOutEndLine();
1911 catch(exception& e) {
1912 m->errorOut(e, "ShhherCommand", "execute");
1917 /**************************************************************************************************/
1919 int ShhherCommand::createProcesses(vector<string> filenames){
1921 vector<int> processIDS;
1926 if (filenames.size() < processors) { processors = filenames.size(); }
1928 //divide the groups between the processors
1929 vector<linePair> lines;
1930 int numFilesPerProcessor = filenames.size() / processors;
1931 for (int i = 0; i < processors; i++) {
1932 int startIndex = i * numFilesPerProcessor;
1933 int endIndex = (i+1) * numFilesPerProcessor;
1934 if(i == (processors - 1)){ endIndex = filenames.size(); }
1935 lines.push_back(linePair(startIndex, endIndex));
1938 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1940 //loop through and create all the processes you want
1941 while (process != processors) {
1945 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
1947 }else if (pid == 0){
1948 num = driver(filenames, compositeFASTAFileName + toString(getpid()) + ".temp", compositeNamesFileName + toString(getpid()) + ".temp", lines[process].start, lines[process].end);
1951 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
1952 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1958 driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[0].start, lines[0].end);
1960 //force parent to wait until all the processes are done
1961 for (int i=0;i<processIDS.size();i++) {
1962 int temp = processIDS[i];
1968 //////////////////////////////////////////////////////////////////////////////////////////////////////
1970 /////////////////////// NOT WORKING, ACCESS VIOLATION ON READ OF FLOWGRAMS IN THREAD /////////////////
1972 //////////////////////////////////////////////////////////////////////////////////////////////////////
1973 //Windows version shared memory, so be careful when passing variables through the shhhFlowsData struct.
1974 //Above fork() will clone, so memory is separate, but that's not the case with windows,
1975 //////////////////////////////////////////////////////////////////////////////////////////////////////
1977 vector<shhhFlowsData*> pDataArray;
1978 DWORD dwThreadIdArray[processors-1];
1979 HANDLE hThreadArray[processors-1];
1981 //Create processor worker threads.
1982 for( int i=0; i<processors-1; i++ ){
1983 // Allocate memory for thread data.
1984 string extension = "";
1985 if (i != 0) { extension = toString(i) + ".temp"; }
1987 shhhFlowsData* tempFlow = new shhhFlowsData(filenames, (compositeFASTAFileName + extension), (compositeNamesFileName + extension), outputDir, flowOrder, jointLookUp, singleLookUp, m, lines[i].start, lines[i].end, cutoff, sigma, minDelta, maxIters, i);
1988 pDataArray.push_back(tempFlow);
1989 processIDS.push_back(i);
1991 hThreadArray[i] = CreateThread(NULL, 0, ShhhFlowsThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
1994 //using the main process as a worker saves time and memory
1996 driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[processors-1].start, lines[processors-1].end);
1998 //Wait until all threads have terminated.
1999 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
2001 //Close all thread handles and free memory allocations.
2002 for(int i=0; i < pDataArray.size(); i++){
2003 for(int j=0; j < pDataArray[i]->outputNames.size(); j++){ outputNames.push_back(pDataArray[i]->outputNames[j]); }
2004 CloseHandle(hThreadArray[i]);
2005 delete pDataArray[i];
2010 for (int i=0;i<processIDS.size();i++) {
2011 if (compositeFASTAFileName != "") {
2012 m->appendFiles((compositeFASTAFileName + toString(processIDS[i]) + ".temp"), compositeFASTAFileName);
2013 m->appendFiles((compositeNamesFileName + toString(processIDS[i]) + ".temp"), compositeNamesFileName);
2014 m->mothurRemove((compositeFASTAFileName + toString(processIDS[i]) + ".temp"));
2015 m->mothurRemove((compositeNamesFileName + toString(processIDS[i]) + ".temp"));
2022 catch(exception& e) {
2023 m->errorOut(e, "ShhherCommand", "createProcesses");
2027 /**************************************************************************************************/
2029 vector<string> ShhherCommand::parseFlowFiles(string filename){
2031 vector<string> files;
2035 m->openInputFile(filename, in);
2037 int thisNumFLows = 0;
2038 in >> thisNumFLows; m->gobble(in);
2041 if (m->control_pressed) { break; }
2044 string outputFileName = filename + toString(count) + ".temp";
2045 m->openOutputFile(outputFileName, out);
2046 out << thisNumFLows << endl;
2047 files.push_back(outputFileName);
2049 for (int i = 0; i < largeSize; i++) {
2050 if (in.eof()) { break; }
2051 string line = m->getline(in);
2052 out << line << endl;
2059 if (m->control_pressed) { for (int i = 0; i < files.size(); i++) { m->mothurRemove(files[i]); } files.clear(); }
2063 catch(exception& e) {
2064 m->errorOut(e, "ShhherCommand", "parseFlowFiles");
2068 /**************************************************************************************************/
2070 int ShhherCommand::driver(vector<string> filenames, string thisCompositeFASTAFileName, string thisCompositeNamesFileName, int start, int end){
2073 for(int i=start;i<end;i++){
2075 if (m->control_pressed) { break; }
2077 vector<string> theseFlowFileNames; theseFlowFileNames.push_back(filenames[i]);
2078 if (large) { theseFlowFileNames = parseFlowFiles(filenames[i]); }
2080 if (m->control_pressed) { break; }
2082 double begClock = clock();
2083 unsigned long long begTime;
2085 for (int g = 0; g < theseFlowFileNames.size(); g++) {
2087 string flowFileName = theseFlowFileNames[g];
2088 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(filenames.size()) + ")\t<<<<<\n");
2089 m->mothurOut("Reading flowgrams...\n");
2091 vector<string> seqNameVector;
2092 vector<int> lengths;
2093 vector<short> flowDataIntI;
2094 vector<double> flowDataPrI;
2095 map<string, int> nameMap;
2096 vector<short> uniqueFlowgrams;
2097 vector<int> uniqueCount;
2098 vector<int> mapSeqToUnique;
2099 vector<int> mapUniqueToSeq;
2100 vector<int> uniqueLengths;
2103 int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells);
2105 if (m->control_pressed) { break; }
2107 m->mothurOut("Identifying unique flowgrams...\n");
2108 int numUniques = getUniques(numSeqs, numFlowCells, uniqueFlowgrams, uniqueCount, uniqueLengths, mapSeqToUnique, mapUniqueToSeq, lengths, flowDataPrI, flowDataIntI);
2110 if (m->control_pressed) { break; }
2112 m->mothurOut("Calculating distances between flowgrams...\n");
2113 string distFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
2114 begTime = time(NULL);
2117 flowDistParentFork(numFlowCells, distFileName, numUniques, mapUniqueToSeq, mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
2119 m->mothurOutEndLine();
2120 m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
2123 string namesFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
2124 createNamesFile(numSeqs, numUniques, namesFileName, seqNameVector, mapSeqToUnique, mapUniqueToSeq);
2126 if (m->control_pressed) { break; }
2128 m->mothurOut("\nClustering flowgrams...\n");
2129 string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
2130 cluster(listFileName, distFileName, namesFileName);
2132 if (m->control_pressed) { break; }
2134 vector<int> otuData;
2135 vector<int> cumNumSeqs;
2136 vector<int> nSeqsPerOTU;
2137 vector<vector<int> > aaP; //tMaster->aanP: each row is a different otu / each col contains the sequence indices
2138 vector<vector<int> > aaI; //tMaster->aanI: that are in each otu - can't differentiate between aaP and aaI
2139 vector<int> seqNumber; //tMaster->anP: the sequence id number sorted by OTU
2140 vector<int> seqIndex; //tMaster->anI; the index that corresponds to seqNumber
2143 int numOTUs = getOTUData(numSeqs, listFileName, otuData, cumNumSeqs, nSeqsPerOTU, aaP, aaI, seqNumber, seqIndex, nameMap);
2145 if (m->control_pressed) { break; }
2147 m->mothurRemove(distFileName);
2148 m->mothurRemove(namesFileName);
2149 m->mothurRemove(listFileName);
2151 vector<double> dist; //adDist - distance of sequences to centroids
2152 vector<short> change; //did the centroid sequence change? 0 = no; 1 = yes
2153 vector<int> centroids; //the representative flowgram for each cluster m
2154 vector<double> weight;
2155 vector<double> singleTau; //tMaster->adTau: 1-D Tau vector (1xnumSeqs)
2156 vector<int> nSeqsBreaks;
2157 vector<int> nOTUsBreaks;
2159 dist.assign(numSeqs * numOTUs, 0);
2160 change.assign(numOTUs, 1);
2161 centroids.assign(numOTUs, -1);
2162 weight.assign(numOTUs, 0);
2163 singleTau.assign(numSeqs, 1.0);
2165 nSeqsBreaks.assign(2, 0);
2166 nOTUsBreaks.assign(2, 0);
2169 nSeqsBreaks[1] = numSeqs;
2170 nOTUsBreaks[1] = numOTUs;
2172 if (m->control_pressed) { break; }
2174 double maxDelta = 0;
2178 begTime = time(NULL);
2180 m->mothurOut("\nDenoising flowgrams...\n");
2181 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
2183 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
2185 if (m->control_pressed) { break; }
2187 double cycClock = clock();
2188 unsigned long long cycTime = time(NULL);
2189 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2191 if (m->control_pressed) { break; }
2193 calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
2195 if (m->control_pressed) { break; }
2197 maxDelta = getNewWeights(numOTUs, cumNumSeqs, nSeqsPerOTU, singleTau, seqNumber, weight);
2199 if (m->control_pressed) { break; }
2201 double nLL = getLikelihood(numSeqs, numOTUs, nSeqsPerOTU, seqNumber, cumNumSeqs, seqIndex, dist, weight);
2203 if (m->control_pressed) { break; }
2205 checkCentroids(numOTUs, centroids, weight);
2207 if (m->control_pressed) { break; }
2209 calcNewDistances(numSeqs, numOTUs, nSeqsPerOTU, dist, weight, change, centroids, aaP, singleTau, aaI, seqNumber, seqIndex, uniqueFlowgrams, flowDataIntI, numFlowCells, lengths);
2211 if (m->control_pressed) { break; }
2215 m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
2219 if (m->control_pressed) { break; }
2221 m->mothurOut("\nFinalizing...\n");
2222 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2224 if (m->control_pressed) { break; }
2226 setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI);
2228 if (m->control_pressed) { break; }
2230 vector<int> otuCounts(numOTUs, 0);
2231 for(int i=0;i<numSeqs;i++) { otuCounts[otuData[i]]++; }
2233 calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
2235 if (m->control_pressed) { break; }
2237 if ((large) && (g == 0)) { flowFileName = filenames[i]; theseFlowFileNames[0] = filenames[i]; }
2238 string thisOutputDir = outputDir;
2239 if (outputDir == "") { thisOutputDir = m->hasPath(flowFileName); }
2240 string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.qual";
2241 string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.fasta";
2242 string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.names";
2243 string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.counts";
2244 string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
2245 string groupFileName = fileRoot + "shhh.groups";
2248 writeQualities(numOTUs, numFlowCells, qualityFileName, otuCounts, nSeqsPerOTU, seqNumber, singleTau, flowDataIntI, uniqueFlowgrams, cumNumSeqs, mapUniqueToSeq, seqNameVector, centroids, aaI); if (m->control_pressed) { break; }
2249 writeSequences(thisCompositeFASTAFileName, numOTUs, numFlowCells, fastaFileName, otuCounts, uniqueFlowgrams, seqNameVector, aaI, centroids);if (m->control_pressed) { break; }
2250 writeNames(thisCompositeNamesFileName, numOTUs, nameFileName, otuCounts, seqNameVector, aaI, nSeqsPerOTU); if (m->control_pressed) { break; }
2251 writeClusters(otuCountsFileName, numOTUs, numFlowCells,otuCounts, centroids, uniqueFlowgrams, seqNameVector, aaI, nSeqsPerOTU, lengths, flowDataIntI); if (m->control_pressed) { break; }
2252 writeGroups(groupFileName, fileRoot, numSeqs, seqNameVector); if (m->control_pressed) { break; }
2256 m->appendFiles(qualityFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + "shhh.qual"));
2257 m->mothurRemove(qualityFileName);
2258 m->appendFiles(fastaFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + "shhh.fasta"));
2259 m->mothurRemove(fastaFileName);
2260 m->appendFiles(nameFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + "shhh.names"));
2261 m->mothurRemove(nameFileName);
2262 m->appendFiles(otuCountsFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + "shhh.counts"));
2263 m->mothurRemove(otuCountsFileName);
2264 m->appendFiles(groupFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + "shhh.groups"));
2265 m->mothurRemove(groupFileName);
2266 m->mothurRemove(theseFlowFileNames[g]);
2271 m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
2274 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
2278 }catch(exception& e) {
2279 m->errorOut(e, "ShhherCommand", "driver");
2284 /**************************************************************************************************/
2285 int ShhherCommand::getFlowData(string filename, vector<string>& thisSeqNameVector, vector<int>& thisLengths, vector<short>& thisFlowDataIntI, map<string, int>& thisNameMap, int& numFlowCells){
2290 m->openInputFile(filename, flowFile);
2293 int currentNumFlowCells;
2295 thisSeqNameVector.clear();
2296 thisLengths.clear();
2297 thisFlowDataIntI.clear();
2298 thisNameMap.clear();
2300 flowFile >> numFlowCells;
2301 int index = 0;//pcluster
2302 while(!flowFile.eof()){
2304 if (m->control_pressed) { break; }
2306 flowFile >> seqName >> currentNumFlowCells;
2307 thisLengths.push_back(currentNumFlowCells);
2309 thisSeqNameVector.push_back(seqName);
2310 thisNameMap[seqName] = index++;//pcluster
2312 for(int i=0;i<numFlowCells;i++){
2313 flowFile >> intensity;
2314 if(intensity > 9.99) { intensity = 9.99; }
2315 int intI = int(100 * intensity + 0.0001);
2316 thisFlowDataIntI.push_back(intI);
2318 m->gobble(flowFile);
2322 int numSeqs = thisSeqNameVector.size();
2324 for(int i=0;i<numSeqs;i++){
2326 if (m->control_pressed) { break; }
2328 int iNumFlowCells = i * numFlowCells;
2329 for(int j=thisLengths[i];j<numFlowCells;j++){
2330 thisFlowDataIntI[iNumFlowCells + j] = 0;
2337 catch(exception& e) {
2338 m->errorOut(e, "ShhherCommand", "getFlowData");
2342 /**************************************************************************************************/
2344 int ShhherCommand::flowDistParentFork(int numFlowCells, string distFileName, int stopSeq, vector<int>& mapUniqueToSeq, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2347 ostringstream outStream;
2348 outStream.setf(ios::fixed, ios::floatfield);
2349 outStream.setf(ios::dec, ios::basefield);
2350 outStream.setf(ios::showpoint);
2351 outStream.precision(6);
2353 int begTime = time(NULL);
2354 double begClock = clock();
2356 for(int i=0;i<stopSeq;i++){
2358 if (m->control_pressed) { break; }
2360 for(int j=0;j<i;j++){
2361 float flowDistance = calcPairwiseDist(numFlowCells, mapUniqueToSeq[i], mapUniqueToSeq[j], mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
2363 if(flowDistance < 1e-6){
2364 outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
2366 else if(flowDistance <= cutoff){
2367 outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
2371 m->mothurOut(toString(i) + "\t" + toString(time(NULL) - begTime));
2372 m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
2373 m->mothurOutEndLine();
2377 ofstream distFile(distFileName.c_str());
2378 distFile << outStream.str();
2381 if (m->control_pressed) {}
2383 m->mothurOut(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
2384 m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
2385 m->mothurOutEndLine();
2390 catch(exception& e) {
2391 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
2395 /**************************************************************************************************/
2397 float ShhherCommand::calcPairwiseDist(int numFlowCells, int seqA, int seqB, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2399 int minLength = lengths[mapSeqToUnique[seqA]];
2400 if(lengths[seqB] < minLength){ minLength = lengths[mapSeqToUnique[seqB]]; }
2402 int ANumFlowCells = seqA * numFlowCells;
2403 int BNumFlowCells = seqB * numFlowCells;
2407 for(int i=0;i<minLength;i++){
2409 if (m->control_pressed) { break; }
2411 int flowAIntI = flowDataIntI[ANumFlowCells + i];
2412 float flowAPrI = flowDataPrI[ANumFlowCells + i];
2414 int flowBIntI = flowDataIntI[BNumFlowCells + i];
2415 float flowBPrI = flowDataPrI[BNumFlowCells + i];
2416 dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
2419 dist /= (float) minLength;
2422 catch(exception& e) {
2423 m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
2428 /**************************************************************************************************/
2430 int ShhherCommand::getUniques(int numSeqs, int numFlowCells, vector<short>& uniqueFlowgrams, vector<int>& uniqueCount, vector<int>& uniqueLengths, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2433 uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
2434 uniqueCount.assign(numSeqs, 0); // anWeights
2435 uniqueLengths.assign(numSeqs, 0);
2436 mapSeqToUnique.assign(numSeqs, -1);
2437 mapUniqueToSeq.assign(numSeqs, -1);
2439 vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
2441 for(int i=0;i<numSeqs;i++){
2443 if (m->control_pressed) { break; }
2447 vector<short> current(numFlowCells);
2448 for(int j=0;j<numFlowCells;j++){
2449 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
2452 for(int j=0;j<numUniques;j++){
2453 int offset = j * numFlowCells;
2457 if(lengths[i] < uniqueLengths[j]) { shorterLength = lengths[i]; }
2458 else { shorterLength = uniqueLengths[j]; }
2460 for(int k=0;k<shorterLength;k++){
2461 if(current[k] != uniqueFlowgrams[offset + k]){
2468 mapSeqToUnique[i] = j;
2471 if(lengths[i] > uniqueLengths[j]) { uniqueLengths[j] = lengths[i]; }
2477 if(index == numUniques){
2478 uniqueLengths[numUniques] = lengths[i];
2479 uniqueCount[numUniques] = 1;
2480 mapSeqToUnique[i] = numUniques;//anMap
2481 mapUniqueToSeq[numUniques] = i;//anF
2483 for(int k=0;k<numFlowCells;k++){
2484 uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
2485 uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
2491 uniqueFlowDataIntI.resize(numFlowCells * numUniques);
2492 uniqueLengths.resize(numUniques);
2494 flowDataPrI.resize(numSeqs * numFlowCells, 0);
2495 for(int i=0;i<flowDataPrI.size();i++) { if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]); }
2499 catch(exception& e) {
2500 m->errorOut(e, "ShhherCommand", "getUniques");
2504 /**************************************************************************************************/
2505 int ShhherCommand::createNamesFile(int numSeqs, int numUniques, string filename, vector<string>& seqNameVector, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq){
2508 vector<string> duplicateNames(numUniques, "");
2509 for(int i=0;i<numSeqs;i++){
2510 duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
2514 m->openOutputFile(filename, nameFile);
2516 for(int i=0;i<numUniques;i++){
2518 if (m->control_pressed) { break; }
2520 // nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2521 nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2528 catch(exception& e) {
2529 m->errorOut(e, "ShhherCommand", "createNamesFile");
2533 //**********************************************************************************************************************
2535 int ShhherCommand::cluster(string filename, string distFileName, string namesFileName){
2538 ReadMatrix* read = new ReadColumnMatrix(distFileName);
2539 read->setCutoff(cutoff);
2541 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
2542 clusterNameMap->readMap();
2543 read->read(clusterNameMap);
2545 ListVector* list = read->getListVector();
2546 SparseMatrix* matrix = read->getMatrix();
2549 delete clusterNameMap;
2551 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
2553 Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest");
2554 string tag = cluster->getTag();
2556 double clusterCutoff = cutoff;
2557 while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
2559 if (m->control_pressed) { break; }
2561 cluster->update(clusterCutoff);
2564 list->setLabel(toString(cutoff));
2567 m->openOutputFile(filename, listFile);
2568 list->print(listFile);
2571 delete matrix; delete cluster; delete rabund; delete list;
2575 catch(exception& e) {
2576 m->errorOut(e, "ShhherCommand", "cluster");
2580 /**************************************************************************************************/
2582 int ShhherCommand::getOTUData(int numSeqs, string fileName, vector<int>& otuData,
2583 vector<int>& cumNumSeqs,
2584 vector<int>& nSeqsPerOTU,
2585 vector<vector<int> >& aaP, //tMaster->aanP: each row is a different otu / each col contains the sequence indices
2586 vector<vector<int> >& aaI, //tMaster->aanI: that are in each otu - can't differentiate between aaP and aaI
2587 vector<int>& seqNumber, //tMaster->anP: the sequence id number sorted by OTU
2588 vector<int>& seqIndex,
2589 map<string, int>& nameMap){
2593 m->openInputFile(fileName, listFile);
2597 listFile >> label >> numOTUs;
2599 otuData.assign(numSeqs, 0);
2600 cumNumSeqs.assign(numOTUs, 0);
2601 nSeqsPerOTU.assign(numOTUs, 0);
2602 aaP.clear();aaP.resize(numOTUs);
2608 string singleOTU = "";
2610 for(int i=0;i<numOTUs;i++){
2612 if (m->control_pressed) { break; }
2614 listFile >> singleOTU;
2616 istringstream otuString(singleOTU);
2620 string seqName = "";
2622 for(int j=0;j<singleOTU.length();j++){
2623 char letter = otuString.get();
2629 map<string,int>::iterator nmIt = nameMap.find(seqName);
2630 int index = nmIt->second;
2632 nameMap.erase(nmIt);
2636 aaP[i].push_back(index);
2641 map<string,int>::iterator nmIt = nameMap.find(seqName);
2643 int index = nmIt->second;
2644 nameMap.erase(nmIt);
2648 aaP[i].push_back(index);
2653 sort(aaP[i].begin(), aaP[i].end());
2654 for(int j=0;j<nSeqsPerOTU[i];j++){
2655 seqNumber.push_back(aaP[i][j]);
2657 for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
2658 aaP[i].push_back(0);
2664 for(int i=1;i<numOTUs;i++){
2665 cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
2668 seqIndex = seqNumber;
2675 catch(exception& e) {
2676 m->errorOut(e, "ShhherCommand", "getOTUData");
2680 /**************************************************************************************************/
2682 int ShhherCommand::calcCentroidsDriver(int numOTUs,
2683 vector<int>& cumNumSeqs,
2684 vector<int>& nSeqsPerOTU,
2685 vector<int>& seqIndex,
2686 vector<short>& change, //did the centroid sequence change? 0 = no; 1 = yes
2687 vector<int>& centroids, //the representative flowgram for each cluster m
2688 vector<double>& singleTau, //tMaster->adTau: 1-D Tau vector (1xnumSeqs)
2689 vector<int>& mapSeqToUnique,
2690 vector<short>& uniqueFlowgrams,
2691 vector<short>& flowDataIntI,
2692 vector<int>& lengths,
2694 vector<int>& seqNumber){
2696 //this function gets the most likely homopolymer length at a flow position for a group of sequences
2701 for(int i=0;i<numOTUs;i++){
2703 if (m->control_pressed) { break; }
2707 int minFlowGram = 100000000;
2708 double minFlowValue = 1e8;
2709 change[i] = 0; //FALSE
2711 for(int j=0;j<nSeqsPerOTU[i];j++){
2712 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
2715 if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
2716 vector<double> adF(nSeqsPerOTU[i]);
2717 vector<int> anL(nSeqsPerOTU[i]);
2719 for(int j=0;j<nSeqsPerOTU[i];j++){
2720 int index = cumNumSeqs[i] + j;
2721 int nI = seqIndex[index];
2722 int nIU = mapSeqToUnique[nI];
2725 for(k=0;k<position;k++){
2731 anL[position] = nIU;
2732 adF[position] = 0.0000;
2737 for(int j=0;j<nSeqsPerOTU[i];j++){
2738 int index = cumNumSeqs[i] + j;
2739 int nI = seqIndex[index];
2741 double tauValue = singleTau[seqNumber[index]];
2743 for(int k=0;k<position;k++){
2744 double dist = getDistToCentroid(anL[k], nI, lengths[nI], uniqueFlowgrams, flowDataIntI, numFlowCells);
2745 adF[k] += dist * tauValue;
2749 for(int j=0;j<position;j++){
2750 if(adF[j] < minFlowValue){
2752 minFlowValue = adF[j];
2756 if(centroids[i] != anL[minFlowGram]){
2758 centroids[i] = anL[minFlowGram];
2761 else if(centroids[i] != -1){
2769 catch(exception& e) {
2770 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
2774 /**************************************************************************************************/
2776 double ShhherCommand::getDistToCentroid(int cent, int flow, int length, vector<short>& uniqueFlowgrams,
2777 vector<short>& flowDataIntI, int numFlowCells){
2780 int flowAValue = cent * numFlowCells;
2781 int flowBValue = flow * numFlowCells;
2785 for(int i=0;i<length;i++){
2786 dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
2791 return dist / (double)length;
2793 catch(exception& e) {
2794 m->errorOut(e, "ShhherCommand", "getDistToCentroid");
2798 /**************************************************************************************************/
2800 double ShhherCommand::getNewWeights(int numOTUs, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<double>& singleTau, vector<int>& seqNumber, vector<double>& weight){
2803 double maxChange = 0;
2805 for(int i=0;i<numOTUs;i++){
2807 if (m->control_pressed) { break; }
2809 double difference = weight[i];
2812 for(int j=0;j<nSeqsPerOTU[i];j++){
2813 int index = cumNumSeqs[i] + j;
2814 double tauValue = singleTau[seqNumber[index]];
2815 weight[i] += tauValue;
2818 difference = fabs(weight[i] - difference);
2819 if(difference > maxChange){ maxChange = difference; }
2823 catch(exception& e) {
2824 m->errorOut(e, "ShhherCommand", "getNewWeights");
2829 /**************************************************************************************************/
2831 double ShhherCommand::getLikelihood(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<int>& seqNumber, vector<int>& cumNumSeqs, vector<int>& seqIndex, vector<double>& dist, vector<double>& weight){
2835 vector<long double> P(numSeqs, 0);
2838 for(int i=0;i<numOTUs;i++){
2839 if(weight[i] > MIN_WEIGHT){
2845 for(int i=0;i<numOTUs;i++){
2847 if (m->control_pressed) { break; }
2849 for(int j=0;j<nSeqsPerOTU[i];j++){
2850 int index = cumNumSeqs[i] + j;
2851 int nI = seqIndex[index];
2852 double singleDist = dist[seqNumber[index]];
2854 P[nI] += weight[i] * exp(-singleDist * sigma);
2858 for(int i=0;i<numSeqs;i++){
2859 if(P[i] == 0){ P[i] = DBL_EPSILON; }
2864 nLL = nLL -(double)numSeqs * log(sigma);
2868 catch(exception& e) {
2869 m->errorOut(e, "ShhherCommand", "getNewWeights");
2874 /**************************************************************************************************/
2876 int ShhherCommand::checkCentroids(int numOTUs, vector<int>& centroids, vector<double>& weight){
2878 vector<int> unique(numOTUs, 1);
2880 for(int i=0;i<numOTUs;i++){
2881 if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
2886 for(int i=0;i<numOTUs;i++){
2888 if (m->control_pressed) { break; }
2891 for(int j=i+1;j<numOTUs;j++){
2894 if(centroids[j] == centroids[i]){
2898 weight[i] += weight[j];
2908 catch(exception& e) {
2909 m->errorOut(e, "ShhherCommand", "checkCentroids");
2913 /**************************************************************************************************/
2915 void ShhherCommand::calcNewDistances(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<double>& dist,
2916 vector<double>& weight, vector<short>& change, vector<int>& centroids,
2917 vector<vector<int> >& aaP, vector<double>& singleTau, vector<vector<int> >& aaI,
2918 vector<int>& seqNumber, vector<int>& seqIndex,
2919 vector<short>& uniqueFlowgrams,
2920 vector<short>& flowDataIntI, int numFlowCells, vector<int>& lengths){
2925 vector<double> newTau(numOTUs,0);
2926 vector<double> norms(numSeqs, 0);
2927 nSeqsPerOTU.assign(numOTUs, 0);
2929 for(int i=0;i<numSeqs;i++){
2931 if (m->control_pressed) { break; }
2933 int indexOffset = i * numOTUs;
2935 double offset = 1e8;
2937 for(int j=0;j<numOTUs;j++){
2939 if(weight[j] > MIN_WEIGHT && change[j] == 1){
2940 dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i], uniqueFlowgrams, flowDataIntI, numFlowCells);
2943 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
2944 offset = dist[indexOffset + j];
2948 for(int j=0;j<numOTUs;j++){
2949 if(weight[j] > MIN_WEIGHT){
2950 newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
2951 norms[i] += newTau[j];
2958 for(int j=0;j<numOTUs;j++){
2959 newTau[j] /= norms[i];
2962 for(int j=0;j<numOTUs;j++){
2963 if(newTau[j] > MIN_TAU){
2965 int oldTotal = total;
2969 singleTau.resize(total, 0);
2970 seqNumber.resize(total, 0);
2971 seqIndex.resize(total, 0);
2973 singleTau[oldTotal] = newTau[j];
2975 aaP[j][nSeqsPerOTU[j]] = oldTotal;
2976 aaI[j][nSeqsPerOTU[j]] = i;
2984 catch(exception& e) {
2985 m->errorOut(e, "ShhherCommand", "calcNewDistances");
2989 /**************************************************************************************************/
2991 int ShhherCommand::fill(int numOTUs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
2994 for(int i=0;i<numOTUs;i++){
2996 if (m->control_pressed) { return 0; }
2998 cumNumSeqs[i] = index;
2999 for(int j=0;j<nSeqsPerOTU[i];j++){
3000 seqNumber[index] = aaP[i][j];
3001 seqIndex[index] = aaI[i][j];
3009 catch(exception& e) {
3010 m->errorOut(e, "ShhherCommand", "fill");
3014 /**************************************************************************************************/
3016 void ShhherCommand::setOTUs(int numOTUs, int numSeqs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU,
3017 vector<int>& otuData, vector<double>& singleTau, vector<double>& dist, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
3020 vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
3022 for(int i=0;i<numOTUs;i++){
3024 if (m->control_pressed) { break; }
3026 for(int j=0;j<nSeqsPerOTU[i];j++){
3027 int index = cumNumSeqs[i] + j;
3028 double tauValue = singleTau[seqNumber[index]];
3029 int sIndex = seqIndex[index];
3030 bigTauMatrix[sIndex * numOTUs + i] = tauValue;
3034 for(int i=0;i<numSeqs;i++){
3035 double maxTau = -1.0000;
3037 for(int j=0;j<numOTUs;j++){
3038 if(bigTauMatrix[i * numOTUs + j] > maxTau){
3039 maxTau = bigTauMatrix[i * numOTUs + j];
3044 otuData[i] = maxOTU;
3047 nSeqsPerOTU.assign(numOTUs, 0);
3049 for(int i=0;i<numSeqs;i++){
3050 int index = otuData[i];
3052 singleTau[i] = 1.0000;
3055 aaP[index][nSeqsPerOTU[index]] = i;
3056 aaI[index][nSeqsPerOTU[index]] = i;
3058 nSeqsPerOTU[index]++;
3061 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
3063 catch(exception& e) {
3064 m->errorOut(e, "ShhherCommand", "setOTUs");
3068 /**************************************************************************************************/
3070 void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string qualityFileName, vector<int> otuCounts, vector<int>& nSeqsPerOTU, vector<int>& seqNumber,
3071 vector<double>& singleTau, vector<short>& flowDataIntI, vector<short>& uniqueFlowgrams, vector<int>& cumNumSeqs,
3072 vector<int>& mapUniqueToSeq, vector<string>& seqNameVector, vector<int>& centroids, vector<vector<int> >& aaI){
3076 ofstream qualityFile;
3077 m->openOutputFile(qualityFileName, qualityFile);
3079 qualityFile.setf(ios::fixed, ios::floatfield);
3080 qualityFile.setf(ios::showpoint);
3081 qualityFile << setprecision(6);
3083 vector<vector<int> > qualities(numOTUs);
3084 vector<double> pr(HOMOPS, 0);
3087 for(int i=0;i<numOTUs;i++){
3089 if (m->control_pressed) { break; }
3094 if(nSeqsPerOTU[i] > 0){
3095 qualities[i].assign(1024, -1);
3097 while(index < numFlowCells){
3098 double maxPrValue = 1e8;
3099 short maxPrIndex = -1;
3100 double count = 0.0000;
3102 pr.assign(HOMOPS, 0);
3104 for(int j=0;j<nSeqsPerOTU[i];j++){
3105 int lIndex = cumNumSeqs[i] + j;
3106 double tauValue = singleTau[seqNumber[lIndex]];
3107 int sequenceIndex = aaI[i][j];
3108 short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
3112 for(int s=0;s<HOMOPS;s++){
3113 pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
3117 maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
3118 maxPrValue = pr[maxPrIndex];
3120 if(count > MIN_COUNT){
3122 double norm = 0.0000;
3124 for(int s=0;s<HOMOPS;s++){
3125 norm += exp(-(pr[s] - maxPrValue));
3128 for(int s=1;s<=maxPrIndex;s++){
3130 double temp = 0.0000;
3132 U += exp(-(pr[s-1]-maxPrValue))/norm;
3140 temp = floor(-10 * temp);
3141 value = (int)floor(temp);
3142 if(value > 100){ value = 100; }
3144 qualities[i][base] = (int)value;
3154 if(otuCounts[i] > 0){
3155 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
3157 int j=4; //need to get past the first four bases
3158 while(qualities[i][j] != -1){
3159 qualityFile << qualities[i][j] << ' ';
3160 if (j > qualities[i].size()) { break; }
3163 qualityFile << endl;
3166 qualityFile.close();
3167 outputNames.push_back(qualityFileName);
3170 catch(exception& e) {
3171 m->errorOut(e, "ShhherCommand", "writeQualities");
3176 /**************************************************************************************************/
3178 void ShhherCommand::writeSequences(string thisCompositeFASTAFileName, int numOTUs, int numFlowCells, string fastaFileName, vector<int> otuCounts, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& centroids){
3182 m->openOutputFile(fastaFileName, fastaFile);
3184 vector<string> names(numOTUs, "");
3186 for(int i=0;i<numOTUs;i++){
3188 if (m->control_pressed) { break; }
3190 int index = centroids[i];
3192 if(otuCounts[i] > 0){
3193 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
3197 for(int j=0;j<numFlowCells;j++){
3199 char base = flowOrder[j % 4];
3200 for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
3205 if (newSeq.length() >= 4) { fastaFile << newSeq.substr(4) << endl; }
3206 else { fastaFile << "NNNN" << endl; }
3211 outputNames.push_back(fastaFileName);
3213 if(thisCompositeFASTAFileName != ""){
3214 m->appendFiles(fastaFileName, thisCompositeFASTAFileName);
3217 catch(exception& e) {
3218 m->errorOut(e, "ShhherCommand", "writeSequences");
3223 /**************************************************************************************************/
3225 void ShhherCommand::writeNames(string thisCompositeNamesFileName, int numOTUs, string nameFileName, vector<int> otuCounts, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU){
3229 m->openOutputFile(nameFileName, nameFile);
3231 for(int i=0;i<numOTUs;i++){
3233 if (m->control_pressed) { break; }
3235 if(otuCounts[i] > 0){
3236 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
3238 for(int j=1;j<nSeqsPerOTU[i];j++){
3239 nameFile << ',' << seqNameVector[aaI[i][j]];
3246 outputNames.push_back(nameFileName);
3249 if(thisCompositeNamesFileName != ""){
3250 m->appendFiles(nameFileName, thisCompositeNamesFileName);
3253 catch(exception& e) {
3254 m->errorOut(e, "ShhherCommand", "writeNames");
3259 /**************************************************************************************************/
3261 void ShhherCommand::writeGroups(string groupFileName, string fileRoot, int numSeqs, vector<string>& seqNameVector){
3264 m->openOutputFile(groupFileName, groupFile);
3266 for(int i=0;i<numSeqs;i++){
3267 if (m->control_pressed) { break; }
3268 groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
3271 outputNames.push_back(groupFileName);
3274 catch(exception& e) {
3275 m->errorOut(e, "ShhherCommand", "writeGroups");
3280 /**************************************************************************************************/
3282 void ShhherCommand::writeClusters(string otuCountsFileName, int numOTUs, int numFlowCells, vector<int> otuCounts, vector<int>& centroids, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU, vector<int>& lengths, vector<short>& flowDataIntI){
3284 ofstream otuCountsFile;
3285 m->openOutputFile(otuCountsFileName, otuCountsFile);
3287 string bases = flowOrder;
3289 for(int i=0;i<numOTUs;i++){
3291 if (m->control_pressed) {
3294 //output the translated version of the centroid sequence for the otu
3295 if(otuCounts[i] > 0){
3296 int index = centroids[i];
3298 otuCountsFile << "ideal\t";
3299 for(int j=8;j<numFlowCells;j++){
3300 char base = bases[j % 4];
3301 for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
3302 otuCountsFile << base;
3305 otuCountsFile << endl;
3307 for(int j=0;j<nSeqsPerOTU[i];j++){
3308 int sequence = aaI[i][j];
3309 otuCountsFile << seqNameVector[sequence] << '\t';
3313 for(int k=0;k<lengths[sequence];k++){
3314 char base = bases[k % 4];
3315 int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
3317 for(int s=0;s<freq;s++){
3319 //otuCountsFile << base;
3322 otuCountsFile << newSeq.substr(4) << endl;
3324 otuCountsFile << endl;
3327 otuCountsFile.close();
3328 outputNames.push_back(otuCountsFileName);
3331 catch(exception& e) {
3332 m->errorOut(e, "ShhherCommand", "writeClusters");
3337 /**************************************************************************************************/
3339 void ShhherCommand::getSingleLookUp(){
3341 // these are the -log probabilities that a signal corresponds to a particular homopolymer length
3342 singleLookUp.assign(HOMOPS * NUMBINS, 0);
3345 ifstream lookUpFile;
3346 m->openInputFile(lookupFileName, lookUpFile);
3348 for(int i=0;i<HOMOPS;i++){
3350 if (m->control_pressed) { break; }
3353 lookUpFile >> logFracFreq;
3355 for(int j=0;j<NUMBINS;j++) {
3356 lookUpFile >> singleLookUp[index];
3362 catch(exception& e) {
3363 m->errorOut(e, "ShhherCommand", "getSingleLookUp");
3368 /**************************************************************************************************/
3370 void ShhherCommand::getJointLookUp(){
3373 // the most likely joint probability (-log) that two intenities have the same polymer length
3374 jointLookUp.resize(NUMBINS * NUMBINS, 0);
3376 for(int i=0;i<NUMBINS;i++){
3378 if (m->control_pressed) { break; }
3380 for(int j=0;j<NUMBINS;j++){
3382 double minSum = 100000000;
3384 for(int k=0;k<HOMOPS;k++){
3385 double sum = singleLookUp[k * NUMBINS + i] + singleLookUp[k * NUMBINS + j];
3387 if(sum < minSum) { minSum = sum; }
3389 jointLookUp[i * NUMBINS + j] = minSum;
3393 catch(exception& e) {
3394 m->errorOut(e, "ShhherCommand", "getJointLookUp");
3399 /**************************************************************************************************/
3401 double ShhherCommand::getProbIntensity(int intIntensity){
3404 double minNegLogProb = 100000000;
3407 for(int i=0;i<HOMOPS;i++){//loop signal strength
3409 if (m->control_pressed) { break; }
3411 float negLogProb = singleLookUp[i * NUMBINS + intIntensity];
3412 if(negLogProb < minNegLogProb) { minNegLogProb = negLogProb; }
3415 return minNegLogProb;
3417 catch(exception& e) {
3418 m->errorOut(e, "ShhherCommand", "getProbIntensity");