#include "shhhercommand.h"
-#include "readcolumn.h"
-#include "readmatrix.hpp"
-#include "rabundvector.hpp"
-#include "sabundvector.hpp"
-#include "listvector.hpp"
-#include "cluster.hpp"
-#include "sparsematrix.hpp"
-#include <cfloat>
-
-//**********************************************************************************************************************
-
-#define NUMBINS 1000
-#define HOMOPS 10
-#define MIN_COUNT 0.1
-#define MIN_WEIGHT 0.1
-#define MIN_TAU 0.0001
-#define MIN_ITER 10
//**********************************************************************************************************************
vector<string> ShhherCommand::setParameters(){
try {
- CommandParameter pflow("flow", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pflow);
- CommandParameter pfile("file", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pfile);
- CommandParameter plookup("lookup", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(plookup);
- CommandParameter pcutoff("cutoff", "Number", "", "0.01", "", "", "",false,false); parameters.push_back(pcutoff);
- CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
- CommandParameter pmaxiter("maxiter", "Number", "", "1000", "", "", "",false,false); parameters.push_back(pmaxiter);
- CommandParameter psigma("sigma", "Number", "", "60", "", "", "",false,false); parameters.push_back(psigma);
- CommandParameter pmindelta("mindelta", "Number", "", "0.000001", "", "", "",false,false); parameters.push_back(pmindelta);
- CommandParameter porder("order", "String", "", "", "", "", "",false,false); parameters.push_back(porder);
- CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
- CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
+ CommandParameter pflow("flow", "InputTypes", "", "", "none", "fileflow", "none","fasta-name-group-counts-qfile",false,false,true); parameters.push_back(pflow);
+ CommandParameter pfile("file", "InputTypes", "", "", "none", "fileflow", "none","fasta-name-group-counts-qfile",false,false,true); parameters.push_back(pfile);
+ CommandParameter plookup("lookup", "InputTypes", "", "", "none", "none", "none","",false,false,true); parameters.push_back(plookup);
+ CommandParameter pcutoff("cutoff", "Number", "", "0.01", "", "", "","",false,false); parameters.push_back(pcutoff);
+ CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
+ CommandParameter pmaxiter("maxiter", "Number", "", "1000", "", "", "","",false,false); parameters.push_back(pmaxiter);
+ CommandParameter plarge("large", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(plarge);
+ CommandParameter psigma("sigma", "Number", "", "60", "", "", "","",false,false); parameters.push_back(psigma);
+ CommandParameter pmindelta("mindelta", "Number", "", "0.000001", "", "", "","",false,false); parameters.push_back(pmindelta);
+ CommandParameter porder("order", "Multiple", "A-B-I", "A", "", "", "","",false,false, true); parameters.push_back(porder); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
+ CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
vector<string> myArray;
for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
string ShhherCommand::getHelpString(){
try {
string helpString = "";
- helpString += "The shhh.seqs command reads a file containing flowgrams and creates a file of corrected sequences.\n";
+ helpString += "The shhh.flows command reads a file containing flowgrams and creates a file of corrected sequences.\n";
+ helpString += "The shhh.flows command parameters are flow, file, lookup, cutoff, processors, large, maxiter, sigma, mindelta and order.\n";
+ helpString += "The flow parameter is used to input your flow file.\n";
+ helpString += "The file parameter is used to input the *flow.files file created by trim.flows.\n";
+ helpString += "The lookup parameter is used specify the lookup file you would like to use. http://www.mothur.org/wiki/Lookup_files.\n";
+ helpString += "The order parameter options are A, B or I. Default=A. A = TACG and B = TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC and I = 
TACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGC.\n";
return helpString;
}
catch(exception& e) {
}
}
//**********************************************************************************************************************
+string ShhherCommand::getOutputPattern(string type) {
+ try {
+ string pattern = "";
+
+ if (type == "fasta") { pattern = "[filename],shhh.fasta"; }
+ else if (type == "name") { pattern = "[filename],shhh.names"; }
+ else if (type == "group") { pattern = "[filename],shhh.groups"; }
+ else if (type == "counts") { pattern = "[filename],shhh.counts"; }
+ else if (type == "qfile") { pattern = "[filename],shhh.qual"; }
+ else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
+
+ return pattern;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "getOutputPattern");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
ShhherCommand::ShhherCommand(){
try {
//initialize outputTypes
vector<string> tempOutNames;
- outputTypes["pn.dist"] = tempOutNames;
+ outputTypes["fasta"] = tempOutNames;
+ outputTypes["name"] = tempOutNames;
+ outputTypes["group"] = tempOutNames;
+ outputTypes["counts"] = tempOutNames;
+ outputTypes["qfile"] = tempOutNames;
}
catch(exception& e) {
ShhherCommand::ShhherCommand(string option) {
try {
-
+
#ifdef USE_MPI
MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
-
+
if(pid == 0){
#endif
-
-
abort = false; calledHelp = false;
-
//allow user to run help
if(option == "help") { help(); abort = true; calledHelp = true; }
else if(option == "citation") { citation(); abort = true; calledHelp = true;}
}
//initialize outputTypes
- vector<string> tempOutNames;
- outputTypes["pn.dist"] = tempOutNames;
- // outputTypes["fasta"] = tempOutNames;
+ vector<string> tempOutNames;
+ outputTypes["fasta"] = tempOutNames;
+ outputTypes["name"] = tempOutNames;
+ outputTypes["group"] = tempOutNames;
+ outputTypes["counts"] = tempOutNames;
+ outputTypes["qfile"] = tempOutNames;
+
//if the user changes the input directory command factory will send this info to us in the output parameter
string inputDir = validParameter.validFile(parameters, "inputdir", false);
if (path == "") { parameters["file"] = inputDir + it->second; }
}
}
-
-
+
+ //if the user changes the output directory command factory will send this info to us in the output parameter
+ outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
+
//check for required parameters
flowFileName = validParameter.validFile(parameters, "flow", true);
flowFilesFileName = validParameter.validFile(parameters, "file", true);
if (flowFileName == "not found" && flowFilesFileName == "not found") {
- m->mothurOut("values for either flow or file must be provided for the shhh.seqs command.");
+ m->mothurOut("values for either flow or file must be provided for the shhh.flows command.");
m->mothurOutEndLine();
abort = true;
}
else if (flowFileName == "not open" || flowFilesFileName == "not open") { abort = true; }
- if(flowFileName != "not found"){ compositeFASTAFileName = ""; }
+ if(flowFileName != "not found"){
+ compositeFASTAFileName = "";
+ compositeNamesFileName = "";
+ }
else{
- compositeFASTAFileName = flowFilesFileName.substr(0, flowFilesFileName.length()-10) + "pn.fasta";
ofstream temp;
+
+ string thisoutputDir = outputDir;
+ if (outputDir == "") { thisoutputDir = m->hasPath(flowFilesFileName); } //if user entered a file with a path then preserve it
+
+ //we want to rip off .files, and also .flow if its there
+ string fileroot = m->getRootName(m->getSimpleName(flowFilesFileName));
+ if (fileroot[fileroot.length()-1] == '.') { fileroot = fileroot.substr(0, fileroot.length()-1); } //rip off dot
+ string extension = m->getExtension(fileroot);
+ if (extension == ".flow") { fileroot = m->getRootName(fileroot); }
+ else { fileroot += "."; } //add back if needed
+
+ compositeFASTAFileName = thisoutputDir + fileroot + "shhh.fasta";
m->openOutputFile(compositeFASTAFileName, temp);
temp.close();
+
+ compositeNamesFileName = thisoutputDir + fileroot + "shhh.names";
+ m->openOutputFile(compositeNamesFileName, temp);
+ temp.close();
}
-
- //if the user changes the output directory command factory will send this info to us in the output parameter
- outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
- outputDir = "";
- outputDir += m->hasPath(flowFileName); //if user entered a file with a path then preserve it
- }
-
-
+
+ if(flowFilesFileName != "not found"){
+ string fName;
+
+ ifstream flowFilesFile;
+ m->openInputFile(flowFilesFileName, flowFilesFile);
+ while(flowFilesFile){
+ fName = m->getline(flowFilesFile);
+
+ //test if file is valid
+ ifstream in;
+ int ableToOpen = m->openInputFile(fName, in, "noerror");
+ in.close();
+ if (ableToOpen == 1) {
+ if (inputDir != "") { //default path is set
+ string tryPath = inputDir + fName;
+ m->mothurOut("Unable to open " + fName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
+ ifstream in2;
+ ableToOpen = m->openInputFile(tryPath, in2, "noerror");
+ in2.close();
+ fName = tryPath;
+ }
+ }
+
+ if (ableToOpen == 1) {
+ if (m->getDefaultPath() != "") { //default path is set
+ string tryPath = m->getDefaultPath() + m->getSimpleName(fName);
+ m->mothurOut("Unable to open " + fName + ". Trying default " + tryPath); m->mothurOutEndLine();
+ ifstream in2;
+ ableToOpen = m->openInputFile(tryPath, in2, "noerror");
+ in2.close();
+ fName = tryPath;
+ }
+ }
+
+ //if you can't open it its not in current working directory or inputDir, try mothur excutable location
+ if (ableToOpen == 1) {
+ string exepath = m->argv;
+ string tempPath = exepath;
+ for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
+ exepath = exepath.substr(0, (tempPath.find_last_of('m')));
+
+ string tryPath = m->getFullPathName(exepath) + m->getSimpleName(fName);
+ m->mothurOut("Unable to open " + fName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
+ ifstream in2;
+ ableToOpen = m->openInputFile(tryPath, in2, "noerror");
+ in2.close();
+ fName = tryPath;
+ }
+
+ if (ableToOpen == 1) { m->mothurOut("Unable to open " + fName + ". Disregarding. "); m->mothurOutEndLine(); }
+ else { flowFileVector.push_back(fName); }
+ m->gobble(flowFilesFile);
+ }
+ flowFilesFile.close();
+ if (flowFileVector.size() == 0) { m->mothurOut("[ERROR]: no valid files."); m->mothurOutEndLine(); abort = true; }
+ }
+ else{
+ if (outputDir == "") { outputDir = m->hasPath(flowFileName); }
+ flowFileVector.push_back(flowFileName);
+ }
+
//check for optional parameter and set defaults
// ...at some point should added some additional type checking...
string temp;
temp = validParameter.validFile(parameters, "lookup", true);
- if (temp == "not found") { lookupFileName = "LookUp_Titanium.pat"; }
- else if(temp == "not open") { abort = true; }
- else { lookupFileName = temp; }
+ if (temp == "not found") {
+ string path = m->argv;
+ string tempPath = path;
+ for (int i = 0; i < path.length(); i++) { tempPath[i] = tolower(path[i]); }
+ path = path.substr(0, (tempPath.find_last_of('m')));
+
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+ path += "lookupFiles/";
+#else
+ path += "lookupFiles\\";
+#endif
+ lookupFileName = m->getFullPathName(path) + "LookUp_Titanium.pat";
+
+ int ableToOpen;
+ ifstream in;
+ ableToOpen = m->openInputFile(lookupFileName, in, "noerror");
+ in.close();
+
+ //if you can't open it, try input location
+ if (ableToOpen == 1) {
+ if (inputDir != "") { //default path is set
+ string tryPath = inputDir + m->getSimpleName(lookupFileName);
+ m->mothurOut("Unable to open " + lookupFileName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
+ ifstream in2;
+ ableToOpen = m->openInputFile(tryPath, in2, "noerror");
+ in2.close();
+ lookupFileName = tryPath;
+ }
+ }
+
+ //if you can't open it, try default location
+ if (ableToOpen == 1) {
+ if (m->getDefaultPath() != "") { //default path is set
+ string tryPath = m->getDefaultPath() + m->getSimpleName(lookupFileName);
+ m->mothurOut("Unable to open " + lookupFileName + ". Trying default " + tryPath); m->mothurOutEndLine();
+ ifstream in2;
+ ableToOpen = m->openInputFile(tryPath, in2, "noerror");
+ in2.close();
+ lookupFileName = tryPath;
+ }
+ }
+
+ //if you can't open it its not in current working directory or inputDir, try mothur excutable location
+ if (ableToOpen == 1) {
+ string exepath = m->argv;
+ string tempPath = exepath;
+ for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
+ exepath = exepath.substr(0, (tempPath.find_last_of('m')));
+
+ string tryPath = m->getFullPathName(exepath) + m->getSimpleName(lookupFileName);
+ m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
+ ifstream in2;
+ ableToOpen = m->openInputFile(tryPath, in2, "noerror");
+ in2.close();
+ lookupFileName = tryPath;
+ }
+
+ if (ableToOpen == 1) { m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true; }
+ }
+ else if(temp == "not open") {
+
+ lookupFileName = validParameter.validFile(parameters, "lookup", false);
+
+ //if you can't open it its not inputDir, try mothur excutable location
+ string exepath = m->argv;
+ string tempPath = exepath;
+ for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
+ exepath = exepath.substr(0, (tempPath.find_last_of('m')));
+
+ string tryPath = m->getFullPathName(exepath) + m->getSimpleName(lookupFileName);
+ m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
+ ifstream in2;
+ int ableToOpen = m->openInputFile(tryPath, in2, "noerror");
+ in2.close();
+ lookupFileName = tryPath;
+
+ if (ableToOpen == 1) { m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true; }
+ }else { lookupFileName = temp; }
temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
m->setProcessors(temp);
- convert(temp, processors);
+ m->mothurConvert(temp, processors);
temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found"){ temp = "0.01"; }
- convert(temp, cutoff);
+ m->mothurConvert(temp, cutoff);
temp = validParameter.validFile(parameters, "mindelta", false); if (temp == "not found"){ temp = "0.000001"; }
- convert(temp, minDelta);
+ m->mothurConvert(temp, minDelta);
temp = validParameter.validFile(parameters, "maxiter", false); if (temp == "not found"){ temp = "1000"; }
- convert(temp, maxIters);
+ m->mothurConvert(temp, maxIters);
+
+ temp = validParameter.validFile(parameters, "large", false); if (temp == "not found"){ temp = "0"; }
+ m->mothurConvert(temp, largeSize);
+ if (largeSize != 0) { large = true; }
+ else { large = false; }
+ if (largeSize < 0) { m->mothurOut("The value of the large cannot be negative.\n"); }
+
+#ifdef USE_MPI
+ if (large) { m->mothurOut("The large parameter is not available with the MPI-Enabled version.\n"); large=false; }
+#endif
+
temp = validParameter.validFile(parameters, "sigma", false);if (temp == "not found") { temp = "60"; }
- convert(temp, sigma);
-
- flowOrder = validParameter.validFile(parameters, "order", false);
- if (flowOrder == "not found"){ flowOrder = "TACG"; }
- else if(flowOrder.length() != 4){
- m->mothurOut("The value of the order option must be four bases long\n");
- }
+ m->mothurConvert(temp, sigma);
+
+ temp = validParameter.validFile(parameters, "order", false); if (temp == "not found"){ temp = "A"; }
+ if (temp.length() > 1) { m->mothurOut("[ERROR]: " + temp + " is not a valid option for order. order options are A, B, or I. A = TACG, B = TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC, and I = 
TACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGC.\n"); abort=true;
+ }
+ else {
+ if (toupper(temp[0]) == 'A') { flowOrder = "TACG"; }
+ else if(toupper(temp[0]) == 'B'){
+ flowOrder = "TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC"; }
+ else if(toupper(temp[0]) == 'I'){
+ flowOrder = "TACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGC"; }
+ else {
+ m->mothurOut("[ERROR]: " + temp + " is not a valid option for order. order options are A, B, or I. A = TACG, B = TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC, and I = 
TACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGC.\n"); abort=true;
+ }
+ }
+
}
-
#ifdef USE_MPI
}
#endif
-
}
catch(exception& e) {
m->errorOut(e, "ShhherCommand", "ShhherCommand");
int tag = 1976;
MPI_Status status;
-
+
if(pid == 0){
for(int i=1;i<ncpus;i++){
processors = ncpus;
m->mothurOut("\nGetting preliminary data...\n");
- getSingleLookUp();
- getJointLookUp();
+ getSingleLookUp(); if (m->control_pressed) { return 0; }
+ getJointLookUp(); if (m->control_pressed) { return 0; }
- vector<string> flowFileVector;
+ vector<string> flowFileVector;
if(flowFilesFileName != "not found"){
string fName;
-
+
ifstream flowFilesFile;
m->openInputFile(flowFilesFileName, flowFilesFile);
while(flowFilesFile){
- flowFilesFile >> fName;
+ fName = m->getline(flowFilesFile);
flowFileVector.push_back(fName);
m->gobble(flowFilesFile);
}
else{
flowFileVector.push_back(flowFileName);
}
+
int numFiles = flowFileVector.size();
for(int i=1;i<ncpus;i++){
}
for(int i=0;i<numFiles;i++){
+
+ if (m->control_pressed) { break; }
+
double begClock = clock();
- unsigned long int begTime = time(NULL);
+ unsigned long long begTime = time(NULL);
flowFileName = flowFileVector[i];
m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
m->mothurOut("Reading flowgrams...\n");
getFlowData();
+
+ if (m->control_pressed) { break; }
m->mothurOut("Identifying unique flowgrams...\n");
getUniques();
+
+ if (m->control_pressed) { break; }
m->mothurOut("Calculating distances between flowgrams...\n");
char fileName[1024];
string distFileName = flowDistMPI(0, int(sqrt(1.0/float(ncpus)) * numUniques));
+ if (m->control_pressed) { break; }
+
int done;
for(int i=1;i<ncpus;i++){
MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
m->appendFiles((distFileName + ".temp." + toString(i)), distFileName);
- remove((distFileName + ".temp." + toString(i)).c_str());
+ m->mothurRemove((distFileName + ".temp." + toString(i)));
}
string namesFileName = createNamesFile();
+ if (m->control_pressed) { break; }
+
m->mothurOut("\nClustering flowgrams...\n");
string listFileName = cluster(distFileName, namesFileName);
-
+
+ if (m->control_pressed) { break; }
+
+
+
getOTUData(listFileName);
+
+ m->mothurRemove(distFileName);
+ m->mothurRemove(namesFileName);
+ m->mothurRemove(listFileName);
+
+ if (m->control_pressed) { break; }
+
initPyroCluster();
+ if (m->control_pressed) { break; }
+
+
for(int i=1;i<ncpus;i++){
MPI_Send(&numOTUs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
MPI_Send(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
MPI_Send(&sigma, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
}
+ if (m->control_pressed) { break; }
double maxDelta = 0;
int iter = 0;
m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
-
+
double cycClock = clock();
- unsigned long int cycTime = time(NULL);
+ unsigned long long cycTime = time(NULL);
fill();
+
+ if (m->control_pressed) { break; }
int total = singleTau.size();
for(int i=1;i<ncpus;i++){
}
}
- maxDelta = getNewWeights();
- double nLL = getLikelihood();
- checkCentroids();
+ maxDelta = getNewWeights(); if (m->control_pressed) { break; }
+ double nLL = getLikelihood(); if (m->control_pressed) { break; }
+ checkCentroids(); if (m->control_pressed) { break; }
for(int i=1;i<ncpus;i++){
MPI_Send(¢roids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
}
+ if (m->control_pressed) { break; }
+
m->mothurOut("\nFinalizing...\n");
fill();
+
+ if (m->control_pressed) { break; }
+
setOTUs();
+
vector<int> otuCounts(numOTUs, 0);
for(int i=0;i<numSeqs;i++) { otuCounts[otuData[i]]++; }
calcCentroidsDriver(0, numOTUs);
- writeQualities(otuCounts);
- writeSequences(otuCounts);
- writeNames(otuCounts);
- writeClusters(otuCounts);
- writeGroups();
-
- remove(distFileName.c_str());
- remove(namesFileName.c_str());
- remove(listFileName.c_str());
+
+ if (m->control_pressed) { break; }
+
+ writeQualities(otuCounts); if (m->control_pressed) { break; }
+ writeSequences(otuCounts); if (m->control_pressed) { break; }
+ writeNames(otuCounts); if (m->control_pressed) { break; }
+ writeClusters(otuCounts); if (m->control_pressed) { break; }
+ writeGroups(); if (m->control_pressed) { break; }
+
m->mothurOut("Total time to process " + toString(flowFileName) + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
}
MPI_Recv(&numFiles, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
for(int i=0;i<numFiles;i++){
+
+ if (m->control_pressed) { break; }
+
//Now into the pyrodist part
bool live = 1;
int flowDistEnd = int(sqrt(float(pid+1)/float(ncpus)) * numUniques);
string distanceStringChild = flowDistMPI(flowDistStart, flowDistEnd);
-
+
+ if (m->control_pressed) { break; }
+
int done = 1;
MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
while(live){
+ if (m->control_pressed) { break; }
+
MPI_Recv(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
singleTau.assign(total, 0.0000);
seqNumber.assign(total, 0);
MPI_Recv(&live, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
}
}
- }
+ }
+
+ if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
+
MPI_Barrier(MPI_COMM_WORLD);
+
+ if(compositeFASTAFileName != ""){
+ outputNames.push_back(compositeFASTAFileName); outputTypes["fasta"].push_back(compositeFASTAFileName);
+ outputNames.push_back(compositeNamesFileName); outputTypes["name"].push_back(compositeNamesFileName);
+ }
+
+ m->mothurOutEndLine();
+ m->mothurOut("Output File Names: "); m->mothurOutEndLine();
+ for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
+ m->mothurOutEndLine();
+
return 0;
}
exit(1);
}
}
-
+/**************************************************************************************************/
+// Writes a mothur name file mapping each unique flowgram to the comma-separated
+// list of redundant sequence names it represents; returns the file path.
+string ShhherCommand::createNamesFile(){
+ try{
+
+ vector<string> duplicateNames(numUniques, "");
+ for(int i=0;i<numSeqs;i++){
+ duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
+ }
+ map<string, string> variables;
+ variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(flowFileName));
+ string nameFileName = getOutputFileName("name",variables);
+
+ ofstream nameFile;
+ m->openOutputFile(nameFileName, nameFile);
+
+ for(int i=0;i<numUniques;i++){
+
+ if (m->control_pressed) { break; }
+
+ // NOTE(review): column 1 is the numeric unique index (mapUniqueToSeq[i]), not the
+ // representative sequence name (see commented-out line below) -- confirm downstream
+ // readers of this name file expect the index form.
+ // nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
+ nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
+ }
+
+ nameFile.close();
+ return nameFileName;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "createNamesFile");
+ exit(1);
+ }
+}
/**************************************************************************************************/
+// Computes pairwise flowgram distances for unique sequences [startSeq, stopSeq)
+// and writes them to a ".shhh.dist" file (suffixed ".temp.<pid>" on non-root MPI
+// ranks); returns the distance file name. Progress is echoed to screen only.
+// (Some unchanged context lines of this function are elided by the patch.)
string ShhherCommand::flowDistMPI(int startSeq, int stopSeq){
double begClock = clock();
for(int i=startSeq;i<stopSeq;i++){
+
+ if (m->control_pressed) { break; }
+
for(int j=0;j<i;j++){
float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
}
}
if(i % 100 == 0){
- m->mothurOut(toString(i) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
+ m->mothurOutJustToScreen(toString(i) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
}
}
- m->mothurOut(toString(stopSeq) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
-
- string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.dist";
+ string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
if(pid != 0){ fDistFileName += ".temp." + toString(pid); }
+
+ if (m->control_pressed) { return fDistFileName; }
+
+ m->mothurOutJustToScreen(toString(stopSeq) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
ofstream distFile(fDistFileName.c_str());
distFile << outStream.str();
return fDistFileName;
}
catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "flowDistParentFork");
+ m->errorOut(e, "ShhherCommand", "flowDistMPI");
exit(1);
}
}
+/**************************************************************************************************/
+
+// Parses the cluster list file: reads the label and OTU count, then for each OTU
+// splits its comma-separated member names, resolves each name through nameMap
+// (erasing it as consumed), and fills otuData, nSeqsPerOTU, aaP/aaI, cumNumSeqs,
+// seqNumber/seqIndex.
+void ShhherCommand::getOTUData(string listFileName){
+ try {
+
+ ifstream listFile;
+ m->openInputFile(listFileName, listFile);
+ string label;
+
+ listFile >> label >> numOTUs;
+
+ otuData.assign(numSeqs, 0);
+ cumNumSeqs.assign(numOTUs, 0);
+ nSeqsPerOTU.assign(numOTUs, 0);
+ aaP.clear();aaP.resize(numOTUs);
+
+ seqNumber.clear();
+ aaI.clear();
+ seqIndex.clear();
+
+ string singleOTU = "";
+
+ for(int i=0;i<numOTUs;i++){
+
+ if (m->control_pressed) { break; }
+
+ listFile >> singleOTU;
+
+ istringstream otuString(singleOTU);
+
+ while(otuString){
+
+ string seqName = "";
+
+ for(int j=0;j<singleOTU.length();j++){
+ char letter = otuString.get();
+
+ if(letter != ','){
+ seqName += letter;
+ }
+ else{
+ // NOTE(review): assumes every name in the list file exists in nameMap;
+ // nmIt is dereferenced without an end() check -- confirm inputs guarantee this.
+ map<string,int>::iterator nmIt = nameMap.find(seqName);
+ int index = nmIt->second;
+
+ nameMap.erase(nmIt);
+
+ otuData[index] = i;
+ nSeqsPerOTU[i]++;
+ aaP[i].push_back(index);
+ seqName = "";
+ }
+ }
+
+ // last name in the OTU has no trailing comma
+ map<string,int>::iterator nmIt = nameMap.find(seqName);
+
+ int index = nmIt->second;
+ nameMap.erase(nmIt);
+
+ otuData[index] = i;
+ nSeqsPerOTU[i]++;
+ aaP[i].push_back(index);
+
+ otuString.get();
+ }
+
+ sort(aaP[i].begin(), aaP[i].end());
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ seqNumber.push_back(aaP[i][j]);
+ }
+ // pad aaP[i] out to numSeqs entries so later passes can index freely
+ for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
+ aaP[i].push_back(0);
+ }
+
+
+ }
+
+ // prefix sums: cumNumSeqs[i] = start offset of OTU i in the flattened arrays
+ for(int i=1;i<numOTUs;i++){
+ cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
+ }
+ aaI = aaP;
+ seqIndex = seqNumber;
+
+ listFile.close();
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "getOTUData");
+ exit(1);
+ }
+}
-#else
-//**********************************************************************************************************************
+/**************************************************************************************************/
-int ShhherCommand::execute(){
- try {
- if (abort == true) { return 0; }
-
- getSingleLookUp();
- getJointLookUp();
-
- vector<string> flowFileVector;
- if(flowFilesFileName != "not found"){
- string fName;
-
- ifstream flowFilesFile;
- m->openInputFile(flowFilesFileName, flowFilesFile);
- while(flowFilesFile){
- flowFilesFile >> fName;
- flowFileVector.push_back(fName);
- m->gobble(flowFilesFile);
- }
- }
- else{
- flowFileVector.push_back(flowFileName);
- }
- int numFiles = flowFileVector.size();
-
-
- for(int i=0;i<numFiles;i++){
- flowFileName = flowFileVector[i];
+// Allocates the per-sequence/per-OTU working arrays for the EM denoising loop
+// and computes even per-processor partition breakpoints over sequences and OTUs.
+void ShhherCommand::initPyroCluster(){
+ try{
+ // fewer OTUs than processors would leave empty partitions; fall back to serial
+ if (numOTUs < processors) { processors = 1; }
+
+ if (m->debug) { m->mothurOut("[DEBUG]: numSeqs = " + toString(numSeqs) + " numOTUS = " + toString(numOTUs) + " about to alloc a dist vector with size = " + toString((numSeqs * numOTUs)) + ".\n"); }
+
+ dist.assign(numSeqs * numOTUs, 0);
+ change.assign(numOTUs, 1);
+ centroids.assign(numOTUs, -1);
+ weight.assign(numOTUs, 0);
+ singleTau.assign(numSeqs, 1.0);
+
+ nSeqsBreaks.assign(processors+1, 0);
+ nOTUsBreaks.assign(processors+1, 0);
+
+ if (m->debug) { m->mothurOut("[DEBUG]: made it through the memory allocation.\n"); }
+
+ nSeqsBreaks[0] = 0;
+ for(int i=0;i<processors;i++){
+ nSeqsBreaks[i+1] = nSeqsBreaks[i] + (int)((double) numSeqs / (double) processors);
+ nOTUsBreaks[i+1] = nOTUsBreaks[i] + (int)((double) numOTUs / (double) processors);
+ }
+ // last breakpoints absorb integer-division rounding
+ nSeqsBreaks[processors] = numSeqs;
+ nOTUsBreaks[processors] = numOTUs;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "initPyroCluster");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+// Rebuilds the flattened seqNumber/seqIndex arrays (and cumNumSeqs offsets)
+// from the per-OTU membership lists aaP/aaI.
+void ShhherCommand::fill(){
+ try {
+ int index = 0;
+ for(int i=0;i<numOTUs;i++){
+
+ if (m->control_pressed) { break; }
+
+ cumNumSeqs[i] = index;
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ seqNumber[index] = aaP[i][j];
+ seqIndex[index] = aaI[i][j];
+
+ index++;
+ }
+ }
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "fill");
+ exit(1);
+ }
+}
- m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
- m->mothurOut("Reading flowgrams...\n");
- getFlowData();
+/**************************************************************************************************/
+
+// Reads the .flow file: first token is the flow-cell count per read; each record
+// is a name, a usable-length, and numFlowCells intensities. Intensities are capped
+// at 9.99 and stored as ints scaled by 100; positions past each read's usable
+// length are zeroed. Fills seqNameVector, lengths, flowDataIntI, nameMap, numSeqs.
+void ShhherCommand::getFlowData(){
+ try{
+ ifstream flowFile;
+ m->openInputFile(flowFileName, flowFile);
+
+ string seqName;
+ seqNameVector.clear();
+ lengths.clear();
+ flowDataIntI.clear();
+ nameMap.clear();
+
+
+ int currentNumFlowCells;
+
+ float intensity;
+
+ string numFlowTest;
+ flowFile >> numFlowTest;
+
+ // guard against being handed a .flow.files file instead of a .flow file
+ if (!m->isContainingOnlyDigits(numFlowTest)) { m->mothurOut("[ERROR]: expected a number and got " + numFlowTest + ", quitting. Did you use the flow parameter instead of the file parameter?"); m->mothurOutEndLine(); exit(1); }
+ else { convert(numFlowTest, numFlowCells); }
+
+ int index = 0;//pcluster
+ while(!flowFile.eof()){
+
+ if (m->control_pressed) { break; }
+
+ flowFile >> seqName >> currentNumFlowCells;
+ lengths.push_back(currentNumFlowCells);
+
+ seqNameVector.push_back(seqName);
+ nameMap[seqName] = index++;//pcluster
+
+ for(int i=0;i<numFlowCells;i++){
+ flowFile >> intensity;
+ if(intensity > 9.99) { intensity = 9.99; }
+ // scale to int hundredths; +0.0001 guards against float truncation
+ int intI = int(100 * intensity + 0.0001);
+ flowDataIntI.push_back(intI);
+ }
+ m->gobble(flowFile);
+ }
+ flowFile.close();
+
+ numSeqs = seqNameVector.size();
+
+ // zero out intensities beyond each read's usable length
+ for(int i=0;i<numSeqs;i++){
+
+ if (m->control_pressed) { break; }
+
+ int iNumFlowCells = i * numFlowCells;
+ for(int j=lengths[i];j<numFlowCells;j++){
+ flowDataIntI[iNumFlowCells + j] = 0;
+ }
+ }
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "getFlowData");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+// Child-rank E-step: for sequences [startSeq, stopSeq) computes the distance to
+// each live centroid (weight > MIN_WEIGHT; recomputed only if the centroid changed),
+// converts distances to responsibilities (tau, stabilized by subtracting the row
+// minimum before exponentiation), and records (otu, seq, tau) triples for taus
+// above MIN_TAU in otuIndex/seqIndex/singleTau.
+void ShhherCommand::calcNewDistancesChildMPI(int startSeq, int stopSeq, vector<int>& otuIndex){
+
+ try{
+ vector<double> newTau(numOTUs,0);
+ vector<double> norms(numSeqs, 0);
+ otuIndex.clear();
+ seqIndex.clear();
+ singleTau.clear();
+
+ for(int i=startSeq;i<stopSeq;i++){
- m->mothurOut("Identifying unique flowgrams...\n");
- getUniques();
+ if (m->control_pressed) { break; }
+ double offset = 1e8;
+ int indexOffset = i * numOTUs;
- m->mothurOut("Calculating distances between flowgrams...\n");
- string distFileName = createDistFile(processors);
- string namesFileName = createNamesFile();
+ for(int j=0;j<numOTUs;j++){
- m->mothurOut("\nClustering flowgrams...\n");
- string listFileName = cluster(distFileName, namesFileName);
- getOTUData(listFileName);
-
- initPyroCluster();
-
- double maxDelta = 0;
- int iter = 0;
+ if(weight[j] > MIN_WEIGHT && change[j] == 1){
+ dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
+ }
+ if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
+ offset = dist[indexOffset + j];
+ }
+ }
- double begClock = clock();
- unsigned long int begTime = time(NULL);
-
- m->mothurOut("\nDenoising flowgrams...\n");
- m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
+ for(int j=0;j<numOTUs;j++){
+ if(weight[j] > MIN_WEIGHT){
+ newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
+ norms[i] += newTau[j];
+ }
+ else{
+ newTau[j] = 0.0;
+ }
+ }
- while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
-
- double cycClock = clock();
- unsigned long int cycTime = time(NULL);
- fill();
-
- calcCentroids();
-
- maxDelta = getNewWeights();
- double nLL = getLikelihood();
- checkCentroids();
-
- calcNewDistances();
-
- iter++;
+ for(int j=0;j<numOTUs;j++){
+
+ newTau[j] /= norms[i];
- m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
-
- }
-
- m->mothurOut("\nFinalizing...\n");
- fill();
- setOTUs();
-
- vector<int> otuCounts(numOTUs, 0);
- for(int i=0;i<numSeqs;i++) { otuCounts[otuData[i]]++; }
-
- calcCentroidsDriver(0, numOTUs);
- writeQualities(otuCounts);
- writeSequences(otuCounts);
- writeNames(otuCounts);
- writeClusters(otuCounts);
- writeGroups();
-
- remove(distFileName.c_str());
- remove(namesFileName.c_str());
- remove(listFileName.c_str());
+ if(newTau[j] > MIN_TAU){
+ otuIndex.push_back(j);
+ seqIndex.push_back(i);
+ singleTau.push_back(newTau[j]);
+ }
+ }
- m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
}
- return 0;
}
catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "execute");
- exit(1);
- }
+ m->errorOut(e, "ShhherCommand", "calcNewDistancesChildMPI");
+ exit(1);
+ }
}
-#endif
+
/**************************************************************************************************/
-void ShhherCommand::getFlowData(){
- try{
- ifstream flowFile;
- m->openInputFile(flowFileName, flowFile);
-
- string seqName;
- seqNameVector.clear();
- lengths.clear();
- flowDataIntI.clear();
- nameMap.clear();
-
-
- int currentNumFlowCells;
-
- float intensity;
-
- flowFile >> numFlowCells;
- int index = 0;//pcluster
- while(!flowFile.eof()){
- flowFile >> seqName >> currentNumFlowCells;
- lengths.push_back(currentNumFlowCells);
+// Parent-rank E-step: same tau computation as the child version, but additionally
+// rebuilds the per-OTU membership arrays (aaP/aaI/nSeqsPerOTU) and grows the
+// flattened singleTau/seqNumber/seqIndex arrays as qualifying taus are found.
+void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){
+
+ try{
+
+ int total = 0;
+ vector<double> newTau(numOTUs,0);
+ vector<double> norms(numSeqs, 0);
+ nSeqsPerOTU.assign(numOTUs, 0);
+
+ for(int i=startSeq;i<stopSeq;i++){
+
+ if (m->control_pressed) { break; }
+
+ int indexOffset = i * numOTUs;
+
+ double offset = 1e8;
+
+ // refresh distances to changed live centroids; track the row minimum
+ for(int j=0;j<numOTUs;j++){
+
+ if(weight[j] > MIN_WEIGHT && change[j] == 1){
+ dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
+ }
+
+ if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
+ offset = dist[indexOffset + j];
+ }
+ }
+
+ // unnormalized responsibilities, stabilized by the row minimum
+ for(int j=0;j<numOTUs;j++){
+ if(weight[j] > MIN_WEIGHT){
+ newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
+ norms[i] += newTau[j];
+ }
+ else{
+ newTau[j] = 0.0;
+ }
+ }
+
+ for(int j=0;j<numOTUs;j++){
+ newTau[j] /= norms[i];
+ }
+
+ // keep only responsibilities above MIN_TAU
+ for(int j=0;j<numOTUs;j++){
+ if(newTau[j] > MIN_TAU){
+
+ int oldTotal = total;
+
+ total++;
+
+ singleTau.resize(total, 0);
+ seqNumber.resize(total, 0);
+ seqIndex.resize(total, 0);
+
+ singleTau[oldTotal] = newTau[j];
+
+ aaP[j][nSeqsPerOTU[j]] = oldTotal;
+ aaI[j][nSeqsPerOTU[j]] = i;
+ nSeqsPerOTU[j]++;
+ }
+ }
+
+ }
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "calcNewDistancesParent");
+ exit(1);
+ }
+}
- seqNameVector.push_back(seqName);
- nameMap[seqName] = index++;//pcluster
+/**************************************************************************************************/
- for(int i=0;i<numFlowCells;i++){
- flowFile >> intensity;
- if(intensity > 9.99) { intensity = 9.99; }
- int intI = int(100 * intensity + 0.0001);
- flowDataIntI.push_back(intI);
- }
- m->gobble(flowFile);
- }
- flowFile.close();
-
- numSeqs = seqNameVector.size();
-
- for(int i=0;i<numSeqs;i++){
- int iNumFlowCells = i * numFlowCells;
- for(int j=lengths[i];j<numFlowCells;j++){
- flowDataIntI[iNumFlowCells + j] = 0;
- }
- }
-
- }
- catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "getFlowData");
- exit(1);
- }
+// Final hard assignment: gives every sequence to the OTU with its maximum tau,
+// resets singleTau/dist for the hard assignment, rebuilds aaP/aaI/nSeqsPerOTU,
+// and refreshes the flattened arrays via fill().
+void ShhherCommand::setOTUs(){
+
+ try {
+ vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
+
+ // scatter sparse taus into a dense (seq x OTU) matrix
+ for(int i=0;i<numOTUs;i++){
+
+ if (m->control_pressed) { break; }
+
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int index = cumNumSeqs[i] + j;
+ double tauValue = singleTau[seqNumber[index]];
+ int sIndex = seqIndex[index];
+ bigTauMatrix[sIndex * numOTUs + i] = tauValue;
+ }
+ }
+
+ for(int i=0;i<numSeqs;i++){
+ double maxTau = -1.0000;
+ int maxOTU = -1;
+ for(int j=0;j<numOTUs;j++){
+ if(bigTauMatrix[i * numOTUs + j] > maxTau){
+ maxTau = bigTauMatrix[i * numOTUs + j];
+ maxOTU = j;
+ }
+ }
+
+ otuData[i] = maxOTU;
+ }
+
+ nSeqsPerOTU.assign(numOTUs, 0);
+
+ for(int i=0;i<numSeqs;i++){
+ int index = otuData[i];
+
+ singleTau[i] = 1.0000;
+ dist[i] = 0.0000;
+
+ aaP[index][nSeqsPerOTU[index]] = i;
+ aaI[index][nSeqsPerOTU[index]] = i;
+
+ nSeqsPerOTU[index]++;
+ }
+ fill();
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "setOTUs");
+ exit(1);
+ }
}
/**************************************************************************************************/
+
+// Dereplicates flowgrams: rounds each intensity to its nearest homopolymer count
+// and linearly scans existing uniques for a prefix match over the shorter usable
+// length. Builds mapSeqToUnique/mapUniqueToSeq, uniqueCount, uniqueLengths and
+// uniqueFlowgrams, then precomputes per-cell -log probabilities in flowDataPrI.
+void ShhherCommand::getUniques(){
+ try{
+
+
+ numUniques = 0;
+ uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
+ uniqueCount.assign(numSeqs, 0); // anWeights
+ uniqueLengths.assign(numSeqs, 0);
+ mapSeqToUnique.assign(numSeqs, -1);
+ mapUniqueToSeq.assign(numSeqs, -1);
+
+ vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
+
+ for(int i=0;i<numSeqs;i++){
+
+ if (m->control_pressed) { break; }
+
+ int index = 0;
+
+ // round scaled intensities (hundredths) to integer homopolymer counts
+ vector<short> current(numFlowCells);
+ for(int j=0;j<numFlowCells;j++){
+ current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
+ }
+
+ for(int j=0;j<numUniques;j++){
+ int offset = j * numFlowCells;
+ bool toEnd = 1;
+
+ int shorterLength;
+ if(lengths[i] < uniqueLengths[j]) { shorterLength = lengths[i]; }
+ else { shorterLength = uniqueLengths[j]; }
+
+ for(int k=0;k<shorterLength;k++){
+ if(current[k] != uniqueFlowgrams[offset + k]){
+ toEnd = 0;
+ break;
+ }
+ }
+
+ if(toEnd){
+ mapSeqToUnique[i] = j;
+ uniqueCount[j]++;
+ index = j;
+ // a longer matching read extends the unique's recorded length
+ if(lengths[i] > uniqueLengths[j]) { uniqueLengths[j] = lengths[i]; }
+ break;
+ }
+ index++;
+ }
+
+ // no match found: register a new unique flowgram
+ if(index == numUniques){
+ uniqueLengths[numUniques] = lengths[i];
+ uniqueCount[numUniques] = 1;
+ mapSeqToUnique[i] = numUniques;//anMap
+ mapUniqueToSeq[numUniques] = i;//anF
+
+ for(int k=0;k<numFlowCells;k++){
+ uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
+ uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
+ }
+
+ numUniques++;
+ }
+ }
+ uniqueFlowDataIntI.resize(numFlowCells * numUniques);
+ uniqueLengths.resize(numUniques);
+
+ flowDataPrI.resize(numSeqs * numFlowCells, 0);
+ for(int i=0;i<flowDataPrI.size();i++) { if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]); }
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "getUniques");
+ exit(1);
+ }
+}
-void ShhherCommand::getSingleLookUp(){
- try{
- // these are the -log probabilities that a signal corresponds to a particular homopolymer length
- singleLookUp.assign(HOMOPS * NUMBINS, 0);
-
- int index = 0;
- ifstream lookUpFile;
- m->openInputFile(lookupFileName, lookUpFile);
-
- for(int i=0;i<HOMOPS;i++){
- float logFracFreq;
- lookUpFile >> logFracFreq;
-
- for(int j=0;j<NUMBINS;j++) {
- lookUpFile >> singleLookUp[index];
- index++;
- }
- }
- lookUpFile.close();
- }
- catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "getSingleLookUp");
- exit(1);
- }
+/**************************************************************************************************/
+
+// Length-normalized flowgram distance between two sequences, summed from the
+// precomputed joint and single -log probability lookup tables.
+float ShhherCommand::calcPairwiseDist(int seqA, int seqB){
+ try{
+ // NOTE(review): the comparison reads lengths[seqB] but the assignment reads
+ // lengths[mapSeqToUnique[seqB]] (seqA uses the mapped form in both places) --
+ // looks asymmetric; confirm whether seqB should also be mapped in the comparison.
+ int minLength = lengths[mapSeqToUnique[seqA]];
+ if(lengths[seqB] < minLength){ minLength = lengths[mapSeqToUnique[seqB]]; }
+
+ int ANumFlowCells = seqA * numFlowCells;
+ int BNumFlowCells = seqB * numFlowCells;
+
+ float dist = 0;
+
+ for(int i=0;i<minLength;i++){
+
+ if (m->control_pressed) { break; }
+
+ int flowAIntI = flowDataIntI[ANumFlowCells + i];
+ float flowAPrI = flowDataPrI[ANumFlowCells + i];
+
+ int flowBIntI = flowDataIntI[BNumFlowCells + i];
+ float flowBPrI = flowDataPrI[BNumFlowCells + i];
+ dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
+ }
+
+ dist /= (float) minLength;
+ return dist;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
+ exit(1);
+ }
}
-/**************************************************************************************************/
+//**********************************************************************************************************************/
-void ShhherCommand::getJointLookUp(){
- try{
+// Reads the column-format distance file and name file, runs furthest-neighbor
+// (complete linkage) clustering up to the cutoff, writes the resulting list to
+// "<flow root>.shhh.list", and returns that file name.
+string ShhherCommand::cluster(string distFileName, string namesFileName){
+ try {
+
+ ReadMatrix* read = new ReadColumnMatrix(distFileName);
+ read->setCutoff(cutoff);
- // the most likely joint probability (-log) that two intenities have the same polymer length
- jointLookUp.resize(NUMBINS * NUMBINS, 0);
+ NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
+ clusterNameMap->readMap();
+ read->read(clusterNameMap);
+
+ ListVector* list = read->getListVector();
+ SparseDistanceMatrix* matrix = read->getDMatrix();
- for(int i=0;i<NUMBINS;i++){
- for(int j=0;j<NUMBINS;j++){
-
- double minSum = 100000000;
-
- for(int k=0;k<HOMOPS;k++){
- double sum = singleLookUp[k * NUMBINS + i] + singleLookUp[k * NUMBINS + j];
-
- if(sum < minSum) { minSum = sum; }
- }
- jointLookUp[i * NUMBINS + j] = minSum;
- }
- }
- }
- catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "getJointLookUp");
- exit(1);
- }
+ delete read;
+ delete clusterNameMap;
+
+ RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
+
+ Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest");
+ string tag = cluster->getTag();
+
+ double clusterCutoff = cutoff;
+ while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
+
+ if (m->control_pressed) { break; }
+
+ cluster->update(clusterCutoff);
+ }
+
+ list->setLabel(toString(cutoff));
+
+ string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
+ ofstream listFile;
+ m->openOutputFile(listFileName, listFile);
+ list->print(listFile);
+ listFile.close();
+
+ delete matrix; delete cluster; delete rabund; delete list;
+
+ return listFileName;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "cluster");
+ exit(1);
+ }
}
/**************************************************************************************************/
-double ShhherCommand::getProbIntensity(int intIntensity){
- try{
+// M-step centroid selection for OTUs [start, finish): among the unique flowgrams
+// belonging to an OTU, picks the one minimizing the tau-weighted distance to all
+// members; sets change[i] when the centroid moves (or is retired for light OTUs).
+void ShhherCommand::calcCentroidsDriver(int start, int finish){
+
+ //this function gets the most likely homopolymer length at a flow position for a group of sequences
+ //within an otu
+
+ try{
+
+ for(int i=start;i<finish;i++){
+
+ if (m->control_pressed) { break; }
+
+ double count = 0;
+ int position = 0;
+ int minFlowGram = 100000000;
+ double minFlowValue = 1e8;
+ change[i] = 0; //FALSE
+
+ // total responsibility mass of this OTU
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ count += singleTau[seqNumber[cumNumSeqs[i] + j]];
+ }
+
+ if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
+ vector<double> adF(nSeqsPerOTU[i]);
+ vector<int> anL(nSeqsPerOTU[i]);
+
+ // collect the distinct unique flowgrams among members (candidate centroids)
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int index = cumNumSeqs[i] + j;
+ int nI = seqIndex[index];
+ int nIU = mapSeqToUnique[nI];
+
+ int k;
+ for(k=0;k<position;k++){
+ if(nIU == anL[k]){
+ break;
+ }
+ }
+ if(k == position){
+ anL[position] = nIU;
+ adF[position] = 0.0000;
+ position++;
+ }
+ }
+
+ // accumulate tau-weighted distance of every member to every candidate
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int index = cumNumSeqs[i] + j;
+ int nI = seqIndex[index];
+
+ double tauValue = singleTau[seqNumber[index]];
+
+ for(int k=0;k<position;k++){
+ double dist = getDistToCentroid(anL[k], nI, lengths[nI]);
+ adF[k] += dist * tauValue;
+ }
+ }
+
+ for(int j=0;j<position;j++){
+ if(adF[j] < minFlowValue){
+ minFlowGram = j;
+ minFlowValue = adF[j];
+ }
+ }
+
+ if(centroids[i] != anL[minFlowGram]){
+ change[i] = 1;
+ centroids[i] = anL[minFlowGram];
+ }
+ }
+ else if(centroids[i] != -1){
+ // OTU has lost its mass: retire its centroid
+ change[i] = 1;
+ centroids[i] = -1;
+ }
+ }
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
+ exit(1);
+ }
+}
- double minNegLogProb = 100000000;
+/**************************************************************************************************/
+
+// Mean per-flow -log likelihood (via singleLookUp) between centroid flowgram
+// `cent` and the first `length` intensity cells of sequence `flow`.
+double ShhherCommand::getDistToCentroid(int cent, int flow, int length){
+ try{
+
+ int flowAValue = cent * numFlowCells;
+ int flowBValue = flow * numFlowCells;
+
+ double dist = 0;
+
+ for(int i=0;i<length;i++){
+ dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
+ flowAValue++;
+ flowBValue++;
+ }
+
+ return dist / (double)length;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "getDistToCentroid");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+// M-step weight update: weight[i] becomes the sum of its members' taus.
+// Returns the largest absolute per-OTU weight change (the EM convergence metric).
+double ShhherCommand::getNewWeights(){
+ try{
+
+ double maxChange = 0;
+
+ for(int i=0;i<numOTUs;i++){
+
+ if (m->control_pressed) { break; }
+
+ double difference = weight[i];
+ weight[i] = 0;
+
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int index = cumNumSeqs[i] + j;
+ double tauValue = singleTau[seqNumber[index]];
+ weight[i] += tauValue;
+ }
+
+ difference = fabs(weight[i] - difference);
+ if(difference > maxChange){ maxChange = difference; }
+ }
+ return maxChange;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "getNewWeights");
+ exit(1);
+ }
+}
+
+ /**************************************************************************************************/
+
+// Negative log-likelihood of the data under the current mixture:
+// P[seq] = sum over OTUs of weight * exp(-dist * sigma);
+// nLL = -sum(log P[seq]) - numSeqs * log(sigma).
+double ShhherCommand::getLikelihood(){
+
+ try{
+
+ vector<long double> P(numSeqs, 0);
+ int effNumOTUs = 0;
+
+ // count OTUs still carrying non-trivial weight
+ for(int i=0;i<numOTUs;i++){
+ if(weight[i] > MIN_WEIGHT){
+ effNumOTUs++;
+ }
+ }
+
+ for(int i=0;i<numOTUs;i++){
+
+ if (m->control_pressed) { break; }
+
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int index = cumNumSeqs[i] + j;
+ int nI = seqIndex[index];
+ double singleDist = dist[seqNumber[index]];
+
+ P[nI] += weight[i] * exp(-singleDist * sigma);
+ }
+ }
+ double nLL = 0.00;
+ for(int i=0;i<numSeqs;i++){
+ if(P[i] == 0){ P[i] = DBL_EPSILON; } // guard against log(0)
+
+ nLL += -log(P[i]);
+ }
+
+ nLL = nLL -(double)numSeqs * log(sigma);
+
+ return nLL;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "getLikelihood");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+// Deduplicates centroids: when two live OTUs converge on the same centroid
+// flowgram, the later OTU's weight is folded into the earlier one and the
+// later OTU is retired (centroid -1, weight 0).
+void ShhherCommand::checkCentroids(){
+ try{
+ vector<int> unique(numOTUs, 1);
+
+ // mark retired / under-weight OTUs so they are skipped below
+ for(int i=0;i<numOTUs;i++){
+ if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
+ unique[i] = -1;
+ }
+ }
+
+ for(int i=0;i<numOTUs;i++){
+
+ if (m->control_pressed) { break; }
+
+ if(unique[i] == 1){
+ for(int j=i+1;j<numOTUs;j++){
+ if(unique[j] == 1){
+
+ if(centroids[j] == centroids[i]){
+ unique[j] = 0;
+ centroids[j] = -1;
+
+ weight[i] += weight[j];
+ weight[j] = 0.0;
+ }
+ }
+ }
+ }
+ }
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "checkCentroids");
+ exit(1);
+ }
+}
+ /**************************************************************************************************/
+
+
+
+// Writes per-OTU consensus quality scores (mothur .qfile): for each flow position
+// of each populated OTU, converts tau-weighted homopolymer likelihoods into
+// phred-style values (capped at 100), skipping the first four key bases.
+// Fix: use thisOutputDir (which falls back to the flow file's path when
+// outputDir is empty) instead of outputDir, matching writeSequences/writeNames.
+void ShhherCommand::writeQualities(vector<int> otuCounts){
+
+ try {
+ string thisOutputDir = outputDir;
+ if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
+ map<string, string> variables;
+ variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
+ string qualityFileName = getOutputFileName("qfile",variables);
+
+ ofstream qualityFile;
+ m->openOutputFile(qualityFileName, qualityFile);
+
+ qualityFile.setf(ios::fixed, ios::floatfield);
+ qualityFile.setf(ios::showpoint);
+ qualityFile << setprecision(6);
+
+ vector<vector<int> > qualities(numOTUs);
+ vector<double> pr(HOMOPS, 0);
+
+
+ for(int i=0;i<numOTUs;i++){
+
+ if (m->control_pressed) { break; }
+
+ int index = 0;
+ int base = 0;
+
+ if(nSeqsPerOTU[i] > 0){
+ qualities[i].assign(1024, -1);
+
+ while(index < numFlowCells){
+ double maxPrValue = 1e8;
+ short maxPrIndex = -1;
+ double count = 0.0000;
+
+ pr.assign(HOMOPS, 0);
+
+ // tau-weighted -log likelihood of each homopolymer length at this flow
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int lIndex = cumNumSeqs[i] + j;
+ double tauValue = singleTau[seqNumber[lIndex]];
+ int sequenceIndex = aaI[i][j];
+ short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
+
+ count += tauValue;
+
+ for(int s=0;s<HOMOPS;s++){
+ pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
+ }
+ }
+
+ maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
+ maxPrValue = pr[maxPrIndex];
+
+ if(count > MIN_COUNT){
+ double U = 0.0000;
+ double norm = 0.0000;
+
+ for(int s=0;s<HOMOPS;s++){
+ norm += exp(-(pr[s] - maxPrValue));
+ }
+
+ // cumulative probability -> phred-style score per emitted base
+ for(int s=1;s<=maxPrIndex;s++){
+ int value = 0;
+ double temp = 0.0000;
+
+ U += exp(-(pr[s-1]-maxPrValue))/norm;
+
+ if(U>0.00){
+ temp = log10(U);
+ }
+ else{
+ temp = -10.1;
+ }
+ temp = floor(-10 * temp);
+ value = (int)floor(temp);
+ if(value > 100){ value = 100; }
+
+ qualities[i][base] = (int)value;
+ base++;
+ }
+ }
+
+ index++;
+ }
+ }
+
+
+ if(otuCounts[i] > 0){
+ // NOTE(review): header uses seqNameVector[mapUniqueToSeq[i]] where i is an OTU
+ // index, while writeSequences headers use seqNameVector[aaI[i][0]] -- confirm
+ // the fasta and qfile records are meant to share names.
+ qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
+
+ int j=4; //need to get past the first four bases
+ while(qualities[i][j] != -1){
+ qualityFile << qualities[i][j] << ' ';
+ j++;
+ }
+ qualityFile << endl;
+ }
+ }
+ qualityFile.close();
+ outputNames.push_back(qualityFileName); outputTypes["qfile"].push_back(qualityFileName);
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "writeQualities");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+// Writes each populated OTU's centroid flowgram, translated through flowOrder
+// into bases (first four key bases trimmed), to a .fasta file; appends to the
+// composite fasta when running over multiple flow files.
+void ShhherCommand::writeSequences(vector<int> otuCounts){
+ try {
+ string thisOutputDir = outputDir;
+ if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
+ map<string, string> variables;
+ variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
+ string fastaFileName = getOutputFileName("fasta",variables);
+ ofstream fastaFile;
+ m->openOutputFile(fastaFileName, fastaFile);
+
+ vector<string> names(numOTUs, "");
+
+ for(int i=0;i<numOTUs;i++){
+
+ if (m->control_pressed) { break; }
+
+ int index = centroids[i];
+
+ if(otuCounts[i] > 0){
+ fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
+
+ string newSeq = "";
+
+ // emit each flow's base repeated by its homopolymer count
+ for(int j=0;j<numFlowCells;j++){
+
+ char base = flowOrder[j % flowOrder.length()];
+ for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
+ newSeq += base;
+ }
+ }
+
+ fastaFile << newSeq.substr(4) << endl;
+ }
+ }
+ fastaFile.close();
+
+ outputNames.push_back(fastaFileName); outputTypes["fasta"].push_back(fastaFileName);
+
+ if(compositeFASTAFileName != ""){
+ m->appendFiles(fastaFileName, compositeFASTAFileName);
+ }
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "writeSequences");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+// Writes the denoised name file: one line per populated OTU mapping its
+// representative (first member) to the comma-separated list of all members;
+// appends to the composite name file when processing multiple flow files.
+void ShhherCommand::writeNames(vector<int> otuCounts){
+ try {
+ string thisOutputDir = outputDir;
+ if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
+ map<string, string> variables;
+ variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
+ string nameFileName = getOutputFileName("name",variables);
+ ofstream nameFile;
+ m->openOutputFile(nameFileName, nameFile);
+
+ for(int i=0;i<numOTUs;i++){
+
+ if (m->control_pressed) { break; }
+
+ if(otuCounts[i] > 0){
+ // representative appears in both columns, then the remaining members
+ nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
+
+ for(int j=1;j<nSeqsPerOTU[i];j++){
+ nameFile << ',' << seqNameVector[aaI[i][j]];
+ }
+
+ nameFile << endl;
+ }
+ }
+ nameFile.close();
+ outputNames.push_back(nameFileName); outputTypes["name"].push_back(nameFileName);
+
+
+ if(compositeNamesFileName != ""){
+ m->appendFiles(nameFileName, compositeNamesFileName);
+ }
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "writeNames");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+// Writes a group file assigning every sequence the group name derived from the
+// flow file's root name (text after its first '.').
+// Fix: pos is now string::size_type -- find_first_of returns size_type and the
+// previous int truncated it before the npos comparison (signed/unsigned mix).
+void ShhherCommand::writeGroups(){
+ try {
+ string thisOutputDir = outputDir;
+ if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
+ string fileRoot = m->getRootName(m->getSimpleName(flowFileName));
+ string::size_type pos = fileRoot.find_first_of('.');
+ string fileGroup = fileRoot;
+ if (pos != string::npos) { fileGroup = fileRoot.substr(pos+1, (fileRoot.length()-1-(pos+1))); }
+ map<string, string> variables;
+ variables["[filename]"] = thisOutputDir + fileRoot;
+ string groupFileName = getOutputFileName("group",variables);
+ ofstream groupFile;
+ m->openOutputFile(groupFileName, groupFile);
+
+ for(int i=0;i<numSeqs;i++){
+ if (m->control_pressed) { break; }
+ groupFile << seqNameVector[i] << '\t' << fileGroup << endl;
+ }
+ groupFile.close();
+ outputNames.push_back(groupFileName); outputTypes["group"].push_back(groupFileName);
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "writeGroups");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+// Writes the .counts file: for each populated OTU, the translated centroid
+// ("ideal") sequence followed by each member's own translated flowgram
+// (rounded intensities, first four key bases trimmed).
+void ShhherCommand::writeClusters(vector<int> otuCounts){
+ try {
+ string thisOutputDir = outputDir;
+ if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
+ map<string, string> variables;
+ variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
+ string otuCountsFileName = getOutputFileName("counts",variables);
+ ofstream otuCountsFile;
+ m->openOutputFile(otuCountsFileName, otuCountsFile);
+
+ string bases = flowOrder;
+
+ for(int i=0;i<numOTUs;i++){
+
+ if (m->control_pressed) {
+ break;
+ }
+ //output the translated version of the centroid sequence for the otu
+ if(otuCounts[i] > 0){
+ int index = centroids[i];
+
+ otuCountsFile << "ideal\t";
+ for(int j=8;j<numFlowCells;j++){
+ char base = bases[j % bases.length()];
+ for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
+ otuCountsFile << base;
+ }
+ }
+ otuCountsFile << endl;
+
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int sequence = aaI[i][j];
+ otuCountsFile << seqNameVector[sequence] << '\t';
+
+ string newSeq = "";
+
+ for(int k=0;k<lengths[sequence];k++){
+ char base = bases[k % bases.length()];
+ // round the scaled intensity (hundredths) back to a homopolymer count
+ int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
+
+ for(int s=0;s<freq;s++){
+ newSeq += base;
+ //otuCountsFile << base;
+ }
+ }
+ otuCountsFile << newSeq.substr(4) << endl;
+ }
+ otuCountsFile << endl;
+ }
+ }
+ otuCountsFile.close();
+ outputNames.push_back(otuCountsFileName); outputTypes["counts"].push_back(otuCountsFileName);
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "writeClusters");
+ exit(1);
+ }
+}
+
+#else
+//**********************************************************************************************************************
+
+int ShhherCommand::execute(){
+	try {
+		if (abort == true) { if (calledHelp) { return 0; } return 2; }
-		for(int i=0;i<HOMOPS;i++){//loop signal strength
-			float negLogProb = singleLookUp[i * NUMBINS + intIntensity];
-			if(negLogProb < minNegLogProb)	{	minNegLogProb = negLogProb;	}
+		// Precompute the error-model lookup tables once, before any file work.
+		getSingleLookUp();	if (m->control_pressed) { return 0; }
+		getJointLookUp();	if (m->control_pressed) { return 0; }
+		
+		int numFiles = flowFileVector.size();
+		
+		if (numFiles < processors) { processors = numFiles; }
+		
+		// On unix-like systems we parallelize by forking one process per group
+		// of files; elsewhere (Windows) everything runs in this process.
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+		if (processors == 1) { driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName); }
+		else { createProcesses(flowFileVector); } //each processor processes one file
+#else
+		driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName);
+#endif
+		
+		if(compositeFASTAFileName != ""){
+			outputNames.push_back(compositeFASTAFileName); outputTypes["fasta"].push_back(compositeFASTAFileName);
+			outputNames.push_back(compositeNamesFileName); outputTypes["name"].push_back(compositeNamesFileName);
 		}
+		
+		m->mothurOutEndLine();
+		m->mothurOut("Output File Names: "); m->mothurOutEndLine();
+		for (int i = 0; i < outputNames.size(); i++) {	m->mothurOut(outputNames[i]); m->mothurOutEndLine();	}
+		m->mothurOutEndLine();
-		return minNegLogProb;
+		return 0;
 	}
 	catch(exception& e) {
-		m->errorOut(e, "ShhherCommand", "getProbIntensity");
+		m->errorOut(e, "ShhherCommand", "execute");
 		exit(1);
 	}
 }
-
+#endif
+//********************************************************************************************************************
+//sorts biggest to smallest
+inline bool compareFileSizes(string left, string right){
+	
+	// Comparator for sort(): orders file names by on-disk size, biggest
+	// first, so large files can be spread across worker processes.
+	// An unopenable file counts as size 0 (perror is printed, not fatal).
+	FILE * pFile;
+	long leftsize = 0;
+		
+	//get num bytes in file
+	string filename = left;
+	pFile = fopen (filename.c_str(),"rb");
+	string error = "Error opening " + filename;
+	if (pFile==NULL) perror (error.c_str());
+	else{
+		// Seek to the end and read the offset back as the byte count.
+		fseek (pFile, 0, SEEK_END);
+		leftsize=ftell (pFile);
+		fclose (pFile);
+	}
+	
+	FILE * pFile2;
+	long rightsize = 0;
+		
+	//get num bytes in file
+	filename = right;
+	pFile2 = fopen (filename.c_str(),"rb");
+	error = "Error opening " + filename;
+	if (pFile2==NULL) perror (error.c_str());
+	else{
+		fseek (pFile2, 0, SEEK_END);
+		rightsize=ftell (pFile2);
+		fclose (pFile2);
+	}
+	
+	// strict-weak ordering: true when left should come before right (larger).
+	return (leftsize > rightsize);	
+}
/**************************************************************************************************/
-void ShhherCommand::getUniques(){
-	try{
+int ShhherCommand::createProcesses(vector<string> filenames){
+	try {
+		// Fan the input flow files out across `processors` forked workers.
+		// Each child runs driver() on its share, writes its results to
+		// pid-suffixed temp files, and the parent appends them back into the
+		// composite outputs in process order.
+		vector<int> processIDS;
+		int process = 1;
+		int num = 0;
+		
+		//sanity check
+		if (filenames.size() < processors) { processors = filenames.size(); }
+		
+		//sort file names by size to divide load better
+		sort(filenames.begin(), filenames.end(), compareFileSizes);
+		
+		vector < vector <string> > dividedFiles; //dividedFiles[1] = vector of filenames for process 1...
+		dividedFiles.resize(processors);
+		
+		//for each file, figure out which process will complete it
+		//want to divide the load intelligently so the big files are spread between processes
+		for (int i = 0; i < filenames.size(); i++) {
+			// round-robin assignment: file i goes to process ((i+1) mod P),
+			// with 0 mapped back to process P.
+			int processToAssign = (i+1) % processors; 
+			if (processToAssign == 0) { processToAssign = processors; }
+			
+			dividedFiles[(processToAssign-1)].push_back(filenames[i]);
+		}
+		
+		//now let's reverse the order of every other process, so we balance big files running with little ones
+		for (int i = 0; i < processors; i++) {
+			// NOTE(review): this reverses every process whose (i+1) is not a
+			// multiple of `processors` — i.e. all but the last — not strictly
+			// "every other" one; confirm the intent.
+			int remainder = ((i+1) % processors);
+			if (remainder) {  reverse(dividedFiles[i].begin(), dividedFiles[i].end());  }
+		}
+		
+		
+		//divide the groups between the processors
+		/*vector<linePair> lines;
+		vector<int> numFilesToComplete;
+		int numFilesPerProcessor = filenames.size() / processors;
+		for (int i = 0; i < processors; i++) {
+			int startIndex =  i * numFilesPerProcessor;
+			int endIndex = (i+1) * numFilesPerProcessor;
+			if(i == (processors - 1)){	endIndex = filenames.size(); 	}
+			lines.push_back(linePair(startIndex, endIndex));
+			numFilesToComplete.push_back((endIndex-startIndex));
+		}*/
+		
+	#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+		
+		//loop through and create all the processes you want
+		while (process != processors) {
+			int pid = fork();
+			
+			if (pid > 0) {
+				processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
+				process++;
+			}else if (pid == 0){
+				// Child: process its slice and report its completed-file
+				// count back through a pid-named temp file.
+				num = driver(dividedFiles[process], compositeFASTAFileName + toString(getpid()) + ".temp", compositeNamesFileName  + toString(getpid()) + ".temp");
+				
+				//pass numSeqs to parent
+				ofstream out;
+				string tempFile = compositeFASTAFileName + toString(getpid()) + ".num.temp";
+				m->openOutputFile(tempFile, out);
+				out << num << endl;
+				out.close();
+				
+				exit(0);
+			}else { 
+				m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
+				for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
+				exit(0);
+			}
+		}
+		//do my part
+		driver(dividedFiles[0], compositeFASTAFileName, compositeNamesFileName);
-		numUniques = 0;
-		uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
-		uniqueCount.assign(numSeqs, 0);							//	anWeights
-		uniqueLengths.assign(numSeqs, 0);
-		mapSeqToUnique.assign(numSeqs, -1);
-		mapUniqueToSeq.assign(numSeqs, -1);
+		//force parent to wait until all the processes are done
+		for (int i=0;i<processIDS.size();i++) { 
+			// NOTE(review): wait(&temp) treats temp as the OUT status word and
+			// waits for ANY child, not for pid processIDS[i]; waitpid(pid,...)
+			// would be the precise call. Looping size() times still reaps all
+			// children, so this works, but the pid value passed in is ignored.
+			int temp = processIDS[i];
+			wait(&temp);
+		}
+		
+	#else
+		
+		//////////////////////////////////////////////////////////////////////////////////////////////////////
+		
+		/////////////////////// NOT WORKING, ACCESS VIOLATION ON READ OF FLOWGRAMS IN THREAD /////////////////
+		
+		//////////////////////////////////////////////////////////////////////////////////////////////////////
+		//Windows version shared memory, so be careful when passing variables through the shhhFlowsData struct. 
+		//Above fork() will clone, so memory is separate, but that's not the case with windows, 
+		//////////////////////////////////////////////////////////////////////////////////////////////////////
+		/*
+		vector<shhhFlowsData*> pDataArray; 
+		DWORD   dwThreadIdArray[processors-1];
+		HANDLE  hThreadArray[processors-1]; 
+		
+		//Create processor worker threads.
+		for( int i=0; i<processors-1; i++ ){
+			// Allocate memory for thread data.
+			string extension = "";
+			if (i != 0) { extension = toString(i) + ".temp"; }
+			
+			shhhFlowsData* tempFlow = new shhhFlowsData(filenames, (compositeFASTAFileName + extension), (compositeNamesFileName + extension), outputDir, flowOrder, jointLookUp, singleLookUp, m, lines[i].start, lines[i].end, cutoff, sigma, minDelta, maxIters, i);
+			pDataArray.push_back(tempFlow);
+			processIDS.push_back(i);
+			
+			hThreadArray[i] = CreateThread(NULL, 0, ShhhFlowsThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);   
+		}
-		vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
+		//using the main process as a worker saves time and memory
+		//do my part
+		driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[processors-1].start, lines[processors-1].end);
-		for(int i=0;i<numSeqs;i++){
-			int index = 0;
-			
-			vector<short> current(numFlowCells);
-			for(int j=0;j<numFlowCells;j++){
-				current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
-			}
-			
-			for(int j=0;j<numUniques;j++){
-				int offset = j * numFlowCells;
-				bool toEnd = 1;
-				
-				int shorterLength;
-				if(lengths[i] < uniqueLengths[j])	{	shorterLength = lengths[i];			}
-				else								{	shorterLength = uniqueLengths[j];	}
-				
-				for(int k=0;k<shorterLength;k++){
-					if(current[k] != uniqueFlowgrams[offset + k]){
-						toEnd = 0;
-						break;
-					}
-				}
-				
-				if(toEnd){
-					mapSeqToUnique[i] = j;
-					uniqueCount[j]++;
-					index = j;
-					if(lengths[i] > uniqueLengths[j])	{	uniqueLengths[j] = lengths[i];	}
-					break;
-				}
-				index++;
-			}
-			
-			if(index == numUniques){
-				uniqueLengths[numUniques] = lengths[i];
-				uniqueCount[numUniques] = 1;
-				mapSeqToUnique[i] = numUniques;//anMap
-				mapUniqueToSeq[numUniques] = i;//anF
-				
-				for(int k=0;k<numFlowCells;k++){
-					uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
-					uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
-				}
-				
-				numUniques++;
-			}
-		}
-		uniqueFlowDataIntI.resize(numFlowCells * numUniques);
-		uniqueLengths.resize(numUniques);	
+		//Wait until all threads have terminated.
+		WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
-		flowDataPrI.resize(numSeqs * numFlowCells, 0);
-		for(int i=0;i<flowDataPrI.size();i++)	{	flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);		}
-	}
+		//Close all thread handles and free memory allocations.
+		for(int i=0; i < pDataArray.size(); i++){
+			for(int j=0; j < pDataArray[i]->outputNames.size(); j++){ outputNames.push_back(pDataArray[i]->outputNames[j]); }
+			CloseHandle(hThreadArray[i]);
+			delete pDataArray[i];
+		}
+		*/
+	#endif
+		
+		// Parent: collect each child's completed-file count, sanity-check it
+		// against the number of files that child was assigned, then append
+		// the child's fasta/name temp output onto the composite files.
+		for (int i=0;i<processIDS.size();i++) { 
+			ifstream in;
+			string tempFile =  compositeFASTAFileName + toString(processIDS[i]) + ".num.temp";
+			m->openInputFile(tempFile, in);
+			if (!in.eof()) { 
+				int tempNum = 0; 
+				in >> tempNum; 
+				// dividedFiles[i+1] because slot 0 was handled by the parent.
+				if (tempNum != dividedFiles[i+1].size()) {
+					m->mothurOut("[ERROR]: main process expected " + toString(processIDS[i]) + " to complete " + toString(dividedFiles[i+1].size()) + " files, and it only reported completing " + toString(tempNum) + ". This will cause file mismatches. The flow files may be too large to process with multiple processors. \n");
+				}
+			}
+			in.close(); m->mothurRemove(tempFile);
+			
+			if (compositeFASTAFileName != "") {
+				m->appendFiles((compositeFASTAFileName + toString(processIDS[i]) + ".temp"), compositeFASTAFileName);
+				m->appendFiles((compositeNamesFileName + toString(processIDS[i]) + ".temp"), compositeNamesFileName);
+				m->mothurRemove((compositeFASTAFileName + toString(processIDS[i]) + ".temp"));
+				m->mothurRemove((compositeNamesFileName + toString(processIDS[i]) + ".temp"));
+			}
+		}
+		
+		return 0;
+		
+	}
 	catch(exception& e) {
-		m->errorOut(e, "ShhherCommand", "getUniques");
+		m->errorOut(e, "ShhherCommand", "createProcesses");
 		exit(1);
 	}
 }
+/**************************************************************************************************/
+vector<string> ShhherCommand::parseFlowFiles(string filename){
+	try {
+		// Split one oversized flow file into chunks of at most `largeSize`
+		// data lines each, so that "large" mode can process them one piece
+		// at a time.  Each chunk is re-headed with the flow-cell count and
+		// named <filename><chunkIndex>.temp.  Returns the chunk file names.
+		vector<string> files;
+		int count = 0;
+		
+		ifstream in;
+		m->openInputFile(filename, in);
+		
+		// First token of a flow file is the number of flow cells per read.
+		int thisNumFLows = 0;
+		in >> thisNumFLows; m->gobble(in);
+		
+		while (!in.eof()) {
+			if (m->control_pressed) { break; }
+			
+			ofstream out;
+			string outputFileName = filename + toString(count) + ".temp";
+			m->openOutputFile(outputFileName, out);
+			out << thisNumFLows << endl;
+			files.push_back(outputFileName);
+			
+			int numLinesWrote = 0;
+			for (int i = 0; i < largeSize; i++) {
+				if (in.eof()) { break; }
+				string line = m->getline(in); m->gobble(in);
+				out << line << endl;
+				numLinesWrote++;
+			}
+			out.close();
+			
+			// A trailing empty chunk (header only) is discarded.
+			if (numLinesWrote == 0) {  m->mothurRemove(outputFileName); files.pop_back();  }
+			count++;
+		}
+		in.close();
+		
+		// On ctrl-c, clean up every partial chunk we created.
+		if (m->control_pressed) { for (int i = 0; i < files.size(); i++) {	m->mothurRemove(files[i]); } files.clear(); }
+		
+		m->mothurOut("\nDivided " + filename + " into " + toString(files.size()) + " files.\n\n"); 
+		
+		return files;
+	}
+	catch(exception& e) {
+		m->errorOut(e, "ShhherCommand", "parseFlowFiles");
+		exit(1);
+	}
+}
/**************************************************************************************************/
-float ShhherCommand::calcPairwiseDist(int seqA, int seqB){
+int ShhherCommand::driver(vector<string> filenames, string thisCompositeFASTAFileName, string thisCompositeNamesFileName){
+	try {
+		// Full shhh.flows pipeline for a batch of flow files: read the
+		// flowgrams, dereplicate, compute pairwise distances, cluster,
+		// run the EM denoising loop, then write the per-file outputs.
+		// Returns the number of input files completed (reported to the
+		// parent in multi-process mode).
+		int numCompleted = 0;
+		
+		for(int i=0;i<filenames.size();i++){
+			
+			if (m->control_pressed) { break; }
+			
+			// In "large" mode each input is pre-split into chunk files that
+			// are processed sequentially and stitched back together below.
+			vector<string> theseFlowFileNames; theseFlowFileNames.push_back(filenames[i]);
+			if (large) {  theseFlowFileNames = parseFlowFiles(filenames[i]);  }
+			
+			if (m->control_pressed) { break; }
+			
+			double begClock = clock();
+			unsigned long long begTime;
+			
+			string fileNameForOutput = filenames[i];
+			
+			for (int g = 0; g < theseFlowFileNames.size(); g++) {
+				
+				string flowFileName = theseFlowFileNames[g];
+				m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(filenames.size()) + ")\t<<<<<\n");
+				m->mothurOut("Reading flowgrams...\n");
+				
+				vector<string> seqNameVector;
+				vector<int> lengths;
+				vector<short> flowDataIntI;
+				vector<double> flowDataPrI;
+				map<string, int> nameMap;
+				vector<short> uniqueFlowgrams;
+				vector<int> uniqueCount;
+				vector<int> mapSeqToUnique;
+				vector<int> mapUniqueToSeq;
+				vector<int> uniqueLengths;
+				int numFlowCells;
+				
+				if (m->debug) { m->mothurOut("[DEBUG]: About to read flowgrams.\n"); }
+				int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells);
+				
+				if (m->control_pressed) { break; }
+				
+				m->mothurOut("Identifying unique flowgrams...\n");
+				int numUniques = getUniques(numSeqs, numFlowCells, uniqueFlowgrams, uniqueCount, uniqueLengths, mapSeqToUnique, mapUniqueToSeq, lengths, flowDataPrI, flowDataIntI);
+				
+				if (m->control_pressed) { break; }
+				
+				m->mothurOut("Calculating distances between flowgrams...\n");
+				string distFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
+				begTime = time(NULL);
+				
+				
+				flowDistParentFork(numFlowCells, distFileName, numUniques, mapUniqueToSeq, mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);	
+				
+				m->mothurOutEndLine();
+				m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
+				
+				
+				string namesFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
+				createNamesFile(numSeqs, numUniques, namesFileName, seqNameVector, mapSeqToUnique, mapUniqueToSeq);
+				
+				if (m->control_pressed) { break; }
+				
+				m->mothurOut("\nClustering flowgrams...\n");
+				string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
+				cluster(listFileName, distFileName, namesFileName);
+				
+				if (m->control_pressed) { break; }
+				
+				vector<int> otuData;
+				vector<int> cumNumSeqs;
+				vector<int> nSeqsPerOTU;
+				vector<vector<int> > aaP;	//tMaster->aanP:	each row is a different otu / each col contains the sequence indices
+				vector<vector<int> > aaI;	//tMaster->aanI:	that are in each otu - can't differentiate between aaP and aaI 
+				vector<int> seqNumber;		//tMaster->anP:		the sequence id number sorted by OTU
+				vector<int> seqIndex;		//tMaster->anI;		the index that corresponds to seqNumber
+				
+				
+				int numOTUs = getOTUData(numSeqs, listFileName, otuData, cumNumSeqs, nSeqsPerOTU, aaP, aaI, seqNumber, seqIndex, nameMap);
+				
+				if (m->control_pressed) { break; }
+				
+				// Intermediate clustering files are no longer needed.
+				m->mothurRemove(distFileName);
+				m->mothurRemove(namesFileName);
+				m->mothurRemove(listFileName);
+				
+				vector<double> dist;		//adDist - distance of sequences to centroids
+				vector<short> change;		//did the centroid sequence change? 0 = no; 1 = yes
+				vector<int> centroids;		//the representative flowgram for each cluster m
+				vector<double> weight;
+				vector<double> singleTau;	//tMaster->adTau:	1-D Tau vector (1xnumSeqs)
+				vector<int> nSeqsBreaks;
+				vector<int> nOTUsBreaks;
+				
+				if (m->debug) { m->mothurOut("[DEBUG]: numSeqs = " + toString(numSeqs) + " numOTUS = " + toString(numOTUs) + " about to alloc a dist vector with size = " + toString((numSeqs * numOTUs)) + ".\n"); }
+				
+				dist.assign(numSeqs * numOTUs, 0);
+				change.assign(numOTUs, 1);
+				centroids.assign(numOTUs, -1);
+				weight.assign(numOTUs, 0);
+				singleTau.assign(numSeqs, 1.0);
+				
+				nSeqsBreaks.assign(2, 0);
+				nOTUsBreaks.assign(2, 0);
+				
+				nSeqsBreaks[0] = 0;
+				nSeqsBreaks[1] = numSeqs;
+				nOTUsBreaks[1] = numOTUs;
+				
+				if (m->debug) { m->mothurOut("[DEBUG]: done allocating memory, about to denoise.\n"); }
+				
+				if (m->control_pressed) { break; }
+				
+				// EM denoising loop: iterate expectation (tau/weights) and
+				// maximization (centroids/distances) until the largest weight
+				// change drops below minDelta, running at least MIN_ITER and
+				// at most maxIters iterations (maxIters == 0 means unbounded).
+				double maxDelta = 0;
+				int iter = 0;
+				
+				begClock = clock();
+				begTime = time(NULL);
+				
+				m->mothurOut("\nDenoising flowgrams...\n");
+				m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
+				
+				while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
+					
+					if (m->control_pressed) { break; }
+					
+					double cycClock = clock();
+					unsigned long long cycTime = time(NULL);
+					fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
+					
+					if (m->control_pressed) { break; }
+					
+					calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
+					
+					if (m->control_pressed) { break; }
+					
+					maxDelta = getNewWeights(numOTUs, cumNumSeqs, nSeqsPerOTU, singleTau, seqNumber, weight);  
+					
+					if (m->control_pressed) { break; }
+					
+					double nLL = getLikelihood(numSeqs, numOTUs, nSeqsPerOTU, seqNumber, cumNumSeqs, seqIndex, dist, weight); 
+					
+					if (m->control_pressed) { break; }
+					
+					checkCentroids(numOTUs, centroids, weight);
+					
+					if (m->control_pressed) { break; }
+					
+					calcNewDistances(numSeqs, numOTUs, nSeqsPerOTU, dist, weight, change, centroids, aaP, singleTau, aaI, seqNumber, seqIndex, uniqueFlowgrams, flowDataIntI, numFlowCells, lengths);
+					
+					if (m->control_pressed) { break; }
+					
+					iter++;
+					
+					m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
+					
+				}	
+				
+				if (m->control_pressed) { break; }
+				
+				m->mothurOut("\nFinalizing...\n");
+				fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
+				
+				if (m->control_pressed) { break; }
+				
+				setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI);
+				
+				if (m->control_pressed) { break; }
+				
+				vector<int> otuCounts(numOTUs, 0);
+				for(int j=0;j<numSeqs;j++)	{	otuCounts[otuData[j]]++;	}
+				
+				calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);	
+				
+				if (m->control_pressed) { break; }
+				
+				// For large mode, the first chunk's outputs are named after
+				// the ORIGINAL file so later chunks can be appended onto them.
+				if ((large) && (g == 0)) {  flowFileName = filenames[i]; theseFlowFileNames[0] = filenames[i]; }
+				string thisOutputDir = outputDir;
+				if (outputDir == "") {  thisOutputDir = m->hasPath(flowFileName);  }
+				map<string, string> variables; 
+				variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
+				string qualityFileName = getOutputFileName("qfile",variables);
+				string fastaFileName = getOutputFileName("fasta",variables);
+				string nameFileName = getOutputFileName("name",variables);
+				string otuCountsFileName = getOutputFileName("counts",variables);
+				string fileRoot = m->getRootName(m->getSimpleName(flowFileName));
+				int pos = fileRoot.find_first_of('.');
+				string fileGroup = fileRoot;
+				if (pos != string::npos) {  fileGroup = fileRoot.substr(pos+1, (fileRoot.length()-1-(pos+1)));  }
+				string groupFileName = getOutputFileName("group",variables);
+				
+				
+				writeQualities(numOTUs, numFlowCells, qualityFileName, otuCounts, nSeqsPerOTU, seqNumber, singleTau, flowDataIntI, uniqueFlowgrams, cumNumSeqs, mapUniqueToSeq, seqNameVector, centroids, aaI); if (m->control_pressed) { break; }
+				writeSequences(thisCompositeFASTAFileName, numOTUs, numFlowCells, fastaFileName, otuCounts, uniqueFlowgrams, seqNameVector, aaI, centroids);if (m->control_pressed) { break; }
+				writeNames(thisCompositeNamesFileName, numOTUs, nameFileName, otuCounts, seqNameVector, aaI, nSeqsPerOTU);				if (m->control_pressed) { break; }
+				writeClusters(otuCountsFileName, numOTUs, numFlowCells,otuCounts, centroids, uniqueFlowgrams, seqNameVector, aaI, nSeqsPerOTU, lengths, flowDataIntI);			if (m->control_pressed) { break; }
+				writeGroups(groupFileName, fileGroup, numSeqs, seqNameVector);						if (m->control_pressed) { break; }
+				
+				if (large) {
+					// Append every chunk after the first onto the chunk-0
+					// output files, then delete the per-chunk temporaries.
+					if (g > 0) {
+						variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0]));
+						m->appendFiles(qualityFileName, getOutputFileName("qfile",variables));
+						m->mothurRemove(qualityFileName);
+						m->appendFiles(fastaFileName, getOutputFileName("fasta",variables));
+						m->mothurRemove(fastaFileName);
+						m->appendFiles(nameFileName, getOutputFileName("name",variables));
+						m->mothurRemove(nameFileName);
+						m->appendFiles(otuCountsFileName, getOutputFileName("counts",variables));
+						m->mothurRemove(otuCountsFileName);
+						m->appendFiles(groupFileName, getOutputFileName("group",variables));
+						m->mothurRemove(groupFileName);
+					}
+					m->mothurRemove(theseFlowFileNames[g]);
+				}
+			}
+			
+			numCompleted++;
+			m->mothurOut("Total time to process " + fileNameForOutput + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
+		}
+		
+		if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
+		
+		return numCompleted;
+		
+	}catch(exception& e) {
+		m->errorOut(e, "ShhherCommand", "driver");
+		exit(1);
+	}
+}
+
+/**************************************************************************************************/
+int ShhherCommand::getFlowData(string filename, vector<string>& thisSeqNameVector, vector<int>& thisLengths, vector<short>& thisFlowDataIntI, map<string, int>& thisNameMap, int& numFlowCells){
 	try{
-		int minLength = lengths[mapSeqToUnique[seqA]];
-		if(lengths[seqB] < minLength){	minLength = lengths[mapSeqToUnique[seqB]];	}
+		// Parse one flow file: header = flow cells per read; then per read a
+		// name, a usable-flow count, and numFlowCells float intensities.
+		// Intensities are clamped to 9.99 and stored as ints scaled by 100.
+		// Fills the by-reference out-params and returns the sequence count.
+		ifstream flowFile;
+		
+		m->openInputFile(filename, flowFile);
+		
-		int ANumFlowCells = seqA * numFlowCells;
-		int BNumFlowCells = seqB * numFlowCells;
+		string seqName;
+		int currentNumFlowCells;
+		float intensity;
+		thisSeqNameVector.clear();
+		thisLengths.clear();
+		thisFlowDataIntI.clear();
+		thisNameMap.clear();
+		
+		// Guard against the user passing a file list where a flow file was
+		// expected: the first token must be numeric.
+		string numFlowTest;
+		flowFile >> numFlowTest;
+		
+		if (!m->isContainingOnlyDigits(numFlowTest)) { m->mothurOut("[ERROR]: expected a number and got " + numFlowTest + ", quitting. Did you use the flow parameter instead of the file parameter?"); m->mothurOutEndLine(); exit(1); }
+		else { convert(numFlowTest, numFlowCells); }
+		
+		if (m->debug) { m->mothurOut("[DEBUG]: numFlowCells = " + toString(numFlowCells) + ".\n"); }
+		int index = 0;//pcluster
+		while(!flowFile.eof()){
+			
+			if (m->control_pressed) { break; }
+			
+			flowFile >> seqName >> currentNumFlowCells;
+			
+			thisLengths.push_back(currentNumFlowCells);
+			
+			thisSeqNameVector.push_back(seqName);
+			thisNameMap[seqName] = index++;//pcluster
+			
+			if (m->debug) { m->mothurOut("[DEBUG]: seqName = " + seqName + " length = " + toString(currentNumFlowCells) + " index = " + toString(index) + "\n"); }
+			
+			for(int i=0;i<numFlowCells;i++){
+				flowFile >> intensity;
+				if(intensity > 9.99)	{	intensity = 9.99;	}
+				int intI = int(100 * intensity + 0.0001);
+				thisFlowDataIntI.push_back(intI);
+			}
+			m->gobble(flowFile);
+		}
+		flowFile.close();
+		
-		float dist = 0;
+		int numSeqs = thisSeqNameVector.size();		
+		
-		for(int i=0;i<minLength;i++){
-			int flowAIntI = flowDataIntI[ANumFlowCells + i];
-			float flowAPrI = flowDataPrI[ANumFlowCells + i];
+		// Zero out intensities past each read's usable length so downstream
+		// code can treat the full numFlowCells row uniformly.
+		for(int i=0;i<numSeqs;i++){
-			int flowBIntI = flowDataIntI[BNumFlowCells + i];
-			float flowBPrI = flowDataPrI[BNumFlowCells + i];
-			dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
+			
+			if (m->control_pressed) { break; }
+			
+			int iNumFlowCells = i * numFlowCells;
+			for(int j=thisLengths[i];j<numFlowCells;j++){
+				thisFlowDataIntI[iNumFlowCells + j] = 0;
+			}
 		}
+		
+		return numSeqs;
-		dist /= (float) minLength;
-		return dist;
 	}
 	catch(exception& e) {
-		m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
+		m->errorOut(e, "ShhherCommand", "getFlowData");
 		exit(1);
 	}
 }
-
/**************************************************************************************************/
-void ShhherCommand::flowDistParentFork(string distFileName, int startSeq, int stopSeq){
+int ShhherCommand::flowDistParentFork(int numFlowCells, string distFileName, int stopSeq, vector<int>& mapUniqueToSeq, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
 	try{		
-
+		// Compute all pairwise flowgram distances among the unique reads
+		// (lower triangle, j < i) and write qualifying pairs to distFileName.
+		// NOTE(review): only the near-zero branch is visible in this hunk —
+		// presumably an else-branch writing distances below `cutoff` exists
+		// in the unchanged context lines; confirm against the full file.
 		ostringstream outStream;
 		outStream.setf(ios::fixed, ios::floatfield);
 		outStream.setf(ios::dec, ios::basefield);
 		int begTime = time(NULL);
 		double begClock = clock();
-
-		for(int i=startSeq;i<stopSeq;i++){
+		
+		for(int i=0;i<stopSeq;i++){
+			
+			if (m->control_pressed) { break; }
+			
 			for(int j=0;j<i;j++){
-				float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
-
+				float flowDistance = calcPairwiseDist(numFlowCells, mapUniqueToSeq[i], mapUniqueToSeq[j], mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
+				
 				if(flowDistance < 1e-6){
 					outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
 				}
 			}
 		}
 			if(i % 100 == 0){
-				m->mothurOut(toString(i) + "\t" + toString(time(NULL) - begTime));
-				m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
-				m->mothurOutEndLine();
+				// Progress goes to the screen only, not the logfile.
+				m->mothurOutJustToScreen(toString(i) + "\t" + toString(time(NULL) - begTime));
+				m->mothurOutJustToScreen("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC)+"\n");
 			}
 		}
-		m->mothurOut(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
-		m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
-		m->mothurOutEndLine();
 		ofstream distFile(distFileName.c_str());
 		distFile << outStream.str();		
 		distFile.close();
+		
+		if (m->control_pressed) {}
+		else {
+			m->mothurOutJustToScreen(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
+			m->mothurOutJustToScreen("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC)+"\n");
+		}
+		
+		return 0;
 	}
 	catch(exception& e) {
 		m->errorOut(e, "ShhherCommand", "flowDistParentFork");
 		exit(1);
 	}
 }
-
/**************************************************************************************************/
-string ShhherCommand::createDistFile(int processors){
+float ShhherCommand::calcPairwiseDist(int numFlowCells, int seqA, int seqB, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
 	try{
-		string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.dist";
-		
-		unsigned long int begTime = time(NULL);
-		double begClock = clock();
-
-		vector<int> start;
-		vector<int> end;
+		// Distance between two flowgrams: average (over the shorter usable
+		// length) of the joint negative-log-probability of the paired
+		// intensities minus each flow's individual negative-log-probability.
+		// NOTE(review): minLength mixes lengths[mapSeqToUnique[seqA]] with a
+		// lengths[seqB] test — the comparison and the assignment index
+		// differently; confirm whether seqA/seqB are already unique indices.
+		int minLength = lengths[mapSeqToUnique[seqA]];
+		if(lengths[seqB] < minLength){	minLength = lengths[mapSeqToUnique[seqB]];	}
+		
+		int ANumFlowCells = seqA * numFlowCells;
+		int BNumFlowCells = seqB * numFlowCells;
+		
+		float dist = 0;
+		
 		for(int i=0;i<minLength;i++){
+			
+			if (m->control_pressed) { break; }
+			
+			int flowAIntI = flowDataIntI[ANumFlowCells + i];
+			float flowAPrI = flowDataPrI[ANumFlowCells + i];
+			
+			int flowBIntI = flowDataIntI[BNumFlowCells + i];
+			float flowBPrI = flowDataPrI[BNumFlowCells + i];
+			dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
+		}
+		
+		dist /= (float) minLength;
+		return dist;
+	}
+	catch(exception& e) {
+		m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
+		exit(1);
+	}
+}
+
+/**************************************************************************************************/
+
+int ShhherCommand::getUniques(int numSeqs, int numFlowCells, vector<short>& uniqueFlowgrams, vector<int>& uniqueCount, vector<int>& uniqueLengths, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
 	try{
+		// Dereplicate flowgrams: two reads are "identical" if their rounded
+		// homopolymer calls agree over the shorter read's length.  Builds the
+		// seq<->unique index maps and per-unique counts/lengths, precomputes
+		// the per-flow probability table, and returns the unique count.
+		int numUniques = 0;
+		uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
+		uniqueCount.assign(numSeqs, 0);							//	anWeights
+		uniqueLengths.assign(numSeqs, 0);
+		mapSeqToUnique.assign(numSeqs, -1);
+		mapUniqueToSeq.assign(numSeqs, -1);
+		
+		vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
+		
+		for(int i=0;i<numSeqs;i++){
-		int process = 1;
-		vector<int> processIDs;
-		
-		//loop through and create all the processes you want
-		while (process != processors) {
-			int pid = fork();
-			
-			if (pid > 0) {
-				processIDs.push_back(pid);  //create map from line number to pid so you can append files in correct order later
-				process++;
-			}else if (pid == 0){
-				flowDistParentFork(fDistFileName + toString(getpid()) + ".temp", start[process], end[process]);
-				exit(0);
-			}else { 
-				m->mothurOut("[ERROR]: unable to spawn the necessary processes. Error code: " + toString(pid)); m->mothurOutEndLine();
-				perror(" : ");
-				for (int i=0;i<processIDs.size();i++) {  int temp = processIDs[i]; kill (temp, SIGINT); }
-				exit(0);
-			}
-		}
+			if (m->control_pressed) { break; }
-		//parent does its part
-		flowDistParentFork(fDistFileName, start[0], end[0]);
+			int index = 0;
-		//force parent to wait until all the processes are done
-		for (int i=0;i<processIDs.size();i++) { 
-			int temp = processIDs[i];
-			wait(&temp);
+			// Round the 100x-scaled intensity to its nearest integer call.
+			vector<short> current(numFlowCells);
+			for(int j=0;j<numFlowCells;j++){
+				current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
 			}
-		
-		//append and remove temp files
-		for (int i=0;i<processIDs.size();i++) { 
-			m->appendFiles((fDistFileName + toString(processIDs[i]) + ".temp"), fDistFileName);
-			remove((fDistFileName + toString(processIDs[i]) + ".temp").c_str());
+			
+			// Linear scan of the uniques found so far for a prefix match.
+			for(int j=0;j<numUniques;j++){
+				int offset = j * numFlowCells;
+				bool toEnd = 1;
+				
+				int shorterLength;
+				if(lengths[i] < uniqueLengths[j])	{	shorterLength = lengths[i];			}
+				else								{	shorterLength = uniqueLengths[j];	}
+				
+				for(int k=0;k<shorterLength;k++){
+					if(current[k] != uniqueFlowgrams[offset + k]){
+						toEnd = 0;
+						break;
+					}
+				}
+				
+				if(toEnd){
+					mapSeqToUnique[i] = j;
+					uniqueCount[j]++;
+					index = j;
+					// Keep the longest member's length as the unique's length.
+					if(lengths[i] > uniqueLengths[j])	{	uniqueLengths[j] = lengths[i];	}
+					break;
+				}
+				index++;
+			}
+			// No match found: register read i as a new unique flowgram.
+			if(index == numUniques){
+				uniqueLengths[numUniques] = lengths[i];
+				uniqueCount[numUniques] = 1;
+				mapSeqToUnique[i] = numUniques;//anMap
+				mapUniqueToSeq[numUniques] = i;//anF
+				
+				for(int k=0;k<numFlowCells;k++){
+					uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
+					uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
+				}
+				
+				numUniques++;
+			}
 		}
+		uniqueFlowDataIntI.resize(numFlowCells * numUniques);
+		uniqueLengths.resize(numUniques);	
-#else
-		flowDistParentFork(fDistFileName, 0, numUniques);
-#endif
-		
-		m->mothurOutEndLine();
-		
-		m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
-		
-		
-		return fDistFileName;
+		// Precompute per-flow negative-log-probabilities for ALL reads (not
+		// just uniques) so distance calculations can index by raw read.
+		flowDataPrI.resize(numSeqs * numFlowCells, 0);
+		for(int i=0;i<flowDataPrI.size();i++)	{	if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);	}
+		
+		return numUniques;
 	}
 	catch(exception& e) {
-		m->errorOut(e, "ShhherCommand", "createDistFile");
+		m->errorOut(e, "ShhherCommand", "getUniques");
 		exit(1);
 	}
-	
 }
-
/**************************************************************************************************/
-
-string ShhherCommand::createNamesFile(){
+int ShhherCommand::createNamesFile(int numSeqs, int numUniques, string filename, vector<string>& seqNameVector, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq){
 	try{
+		// Write a names file mapping each unique index to the comma-joined
+		// names of its duplicate reads.
+		// NOTE(review): the loop header that populates duplicateNames (over
+		// i = 0..numSeqs) is not visible in this hunk — it is unchanged
+		// context elided from the diff; confirm against the full file.
 		vector<string> duplicateNames(numUniques, "");
 			duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
 		}
-		string nameFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.names";
-		
 		ofstream nameFile;
-		m->openOutputFile(nameFileName, nameFile);
+		m->openOutputFile(filename, nameFile);
 		
 		for(int i=0;i<numUniques;i++){
-//			nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
+			
+			if (m->control_pressed) { break; }
+			
+			// First column is the unique's index (not its name); trailing
+			// comma from the join above is trimmed with find_last_of.
+			//			nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
 			nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
 		}
 		nameFile.close();
-		return  nameFileName;
+		
+		return 0;
 	}
 	catch(exception& e) {
 		m->errorOut(e, "ShhherCommand", "createNamesFile");
 		exit(1);
 	}
 }
-
//**********************************************************************************************************************
-string ShhherCommand::cluster(string distFileName, string namesFileName){
+int ShhherCommand::cluster(string filename, string distFileName, string namesFileName){
try {
ReadMatrix* read = new ReadColumnMatrix(distFileName);
NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
clusterNameMap->readMap();
read->read(clusterNameMap);
-
+
ListVector* list = read->getListVector();
- SparseMatrix* matrix = read->getMatrix();
+ SparseDistanceMatrix* matrix = read->getDMatrix();
delete read;
delete clusterNameMap;
-
+
RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
- Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest");
+ float adjust = -1.0;
+ Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest", adjust);
string tag = cluster->getTag();
double clusterCutoff = cutoff;
while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
+
+ if (m->control_pressed) { break; }
+
cluster->update(clusterCutoff);
}
list->setLabel(toString(cutoff));
- string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.list";
ofstream listFile;
- m->openOutputFile(listFileName, listFile);
+ m->openOutputFile(filename, listFile);
list->print(listFile);
listFile.close();
delete matrix; delete cluster; delete rabund; delete list;
-
- return listFileName;
+
+ return 0;
}
catch(exception& e) {
m->errorOut(e, "ShhherCommand", "cluster");
exit(1);
}
}
-
/**************************************************************************************************/
// Parses a mothur .list clustering file: reads the label and OTU count, then for
// each OTU records which sequences belong to it (otuData, nSeqsPerOTU, aaP).
// Returns the number of OTUs read.
// NOTE(review): diff fragment — the body of the while(otuString) loop that
// actually extracts each seqName (and the aaI bookkeeping) is elided here.
-void ShhherCommand::getOTUData(string listFileName){
+int ShhherCommand::getOTUData(int numSeqs, string fileName,  vector<int>& otuData,
+                               vector<int>& cumNumSeqs,
+                               vector<int>& nSeqsPerOTU,
+                               vector<vector<int> >& aaP,	//tMaster->aanP:	each row is a different otu / each col contains the sequence indices
+                               vector<vector<int> >& aaI,	//tMaster->aanI:	that are in each otu - can't differentiate between aaP and aaI
+                               vector<int>& seqNumber,		//tMaster->anP:		the sequence id number sorted by OTU
+                               vector<int>& seqIndex,
+                               map<string, int>& nameMap){
	try {
-
+
		ifstream listFile;
-		m->openInputFile(listFileName, listFile);
+		m->openInputFile(fileName, listFile);
		string label;
+		int numOTUs;
		listFile >> label >> numOTUs;
-
+
+		if (m->debug) { m->mothurOut("[DEBUG]: Getting OTU Data...\n"); }
+
		otuData.assign(numSeqs, 0);
		cumNumSeqs.assign(numOTUs, 0);
		nSeqsPerOTU.assign(numOTUs, 0);
		string singleOTU = "";
		for(int i=0;i<numOTUs;i++){
-
+
+			if (m->control_pressed) { break; }
+			if (m->debug) { m->mothurOut("[DEBUG]: processing OTU " + toString(i) + ".\n"); }
+
			listFile >> singleOTU;
			istringstream otuString(singleOTU);
-
			while(otuString){
				string seqName = "";
			}
			// Each parsed name is resolved to its sequence index and consumed
			// from nameMap so leftovers can be detected by the caller.
			map<string,int>::iterator nmIt = nameMap.find(seqName);
-
+
			int index = nmIt->second;
			nameMap.erase(nmIt);
-
+
			otuData[index] = i;
			nSeqsPerOTU[i]++;
			aaP[i].push_back(index);
		seqIndex = seqNumber;
		listFile.close();
+
+		return numOTUs;
	}
	catch(exception& e) {
		// NOTE(review): this catch exits without calling m->errorOut, unlike every
		// sibling function — consider reporting before exit.
		exit(1);
	}
}
-
// NOTE(review): block below is removed by this patch ('-' lines). It initialized the
// member-state clustering vectors and per-processor chunk boundaries used by the
// old fork-based parallel code.
-/**************************************************************************************************/
-
-void ShhherCommand::initPyroCluster(){
-	try{
-		dist.assign(numSeqs * numOTUs, 0);
-		change.assign(numOTUs, 1);
-		centroids.assign(numOTUs, -1);
-		weight.assign(numOTUs, 0);
-		singleTau.assign(numSeqs, 1.0);
-
-		nSeqsBreaks.assign(processors+1, 0);
-		nOTUsBreaks.assign(processors+1, 0);
-
-		nSeqsBreaks[0] = 0;
-		for(int i=0;i<processors;i++){
-			nSeqsBreaks[i+1] = nSeqsBreaks[i] + (int)((double) numSeqs / (double) processors);
-			nOTUsBreaks[i+1] = nOTUsBreaks[i] + (int)((double) numOTUs / (double) processors);
-		}
-		nSeqsBreaks[processors] = numSeqs;
-		nOTUsBreaks[processors] = numOTUs;
-	}
-	catch(exception& e) {
-		m->errorOut(e, "ShhherCommand", "initPyroCluster");
-		exit(1);
-	}
-}
-
// NOTE(review): removed by this patch — superseded by the parameterized
// fill(numOTUs, seqNumber, ...) overload added later in this chunk.
-/**************************************************************************************************/
-
-void ShhherCommand::fill(){
-	try {
-		int index = 0;
-		for(int i=0;i<numOTUs;i++){
-			cumNumSeqs[i] = index;
-			for(int j=0;j<nSeqsPerOTU[i];j++){
-				seqNumber[index] = aaP[i][j];
-				seqIndex[index] = aaI[i][j];
-
-				index++;
-			}
-		}
-	}
-	catch(exception& e) {
-		m->errorOut(e, "ShhherCommand", "fill");
-		exit(1);
-	}
-}
-
// NOTE(review): removed by this patch — old process-parallel wrapper around
// calcCentroidsDriver. Two latent issues worth recording: (1) it used vfork()
// but then ran arbitrary code (calcCentroidsDriver) in the child, which POSIX
// does not permit (child may only _exit or exec after vfork); (2) wait(&temp)
// passes the child pid variable as the *status* out-parameter — it waits for
// any child and clobbers temp with the exit status, not the intended pid wait.
-/**************************************************************************************************/
-
-void ShhherCommand::calcCentroids(){
-	try{
-
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
-
-		if(processors == 1)	{
-			calcCentroidsDriver(0, numOTUs);
-		}
-		else{ //you have multiple processors
-			if (numOTUs < processors){ processors = 1; }
-
-			int process = 1;
-			vector<int> processIDs;
-
-			//loop through and create all the processes you want
-			while (process != processors) {
-				int pid = vfork();
-
-				if (pid > 0) {
-					processIDs.push_back(pid);  //create map from line number to pid so you can append files in correct order later
-					process++;
-				}else if (pid == 0){
-					calcCentroidsDriver(nOTUsBreaks[process], nOTUsBreaks[process+1]);
-					exit(0);
-				}else {
-					m->mothurOut("[ERROR]: unable to spawn the necessary processes. Error code: " + toString(pid)); m->mothurOutEndLine();
-					perror(" : ");
-					for (int i=0;i<processIDs.size();i++) {  int temp = processIDs[i]; kill (temp, SIGINT); }
-					exit(0);
-				}
-			}
-
-			//parent does its part
-			calcCentroidsDriver(nOTUsBreaks[0], nOTUsBreaks[1]);
-
-			//force parent to wait until all the processes are done
-			for (int i=0;i<processIDs.size();i++) {
-				int temp = processIDs[i];
-				wait(&temp);
-			}
-		}
-
-#else
-		calcCentroidsDriver(0, numOTUs);
-#endif
-	}
-	catch(exception& e) {
-		m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
-		exit(1);
-	}
-}
-
/**************************************************************************************************/
// For every OTU, picks the member flowgram that minimizes the tau-weighted
// distance to all other members (the centroid). Returns 0.
// NOTE(review): diff fragment — the loop that fills adF/anL (and the definitions
// of `index` and `nI` used below) plus the argmin selection are elided here.
-void ShhherCommand::calcCentroidsDriver(int start, int finish){
+int ShhherCommand::calcCentroidsDriver(int numOTUs,
+                                       vector<int>& cumNumSeqs,
+                                       vector<int>& nSeqsPerOTU,
+                                       vector<int>& seqIndex,
+                                       vector<short>& change,		//did the centroid sequence change? 0 = no; 1 = yes
+                                       vector<int>& centroids,		//the representative flowgram for each cluster m
+                                       vector<double>& singleTau,	//tMaster->adTau:	1-D Tau vector (1xnumSeqs)
+                                       vector<int>& mapSeqToUnique,
+                                       vector<short>& uniqueFlowgrams,
+                                       vector<short>& flowDataIntI,
+                                       vector<int>& lengths,
+                                       int numFlowCells,
+                                       vector<int>& seqNumber){
	//this function gets the most likely homopolymer length at a flow position for a group of sequences
	//within an otu
	try{
-
-		for(int i=start;i<finish;i++){
+		for(int i=0;i<numOTUs;i++){
+
+			if (m->control_pressed) { break; }
			double count = 0;
			int position = 0;
			for(int j=0;j<nSeqsPerOTU[i];j++){
				count += singleTau[seqNumber[cumNumSeqs[i] + j]];
			}
-
+
			// Only OTUs with members and non-negligible total tau get a centroid.
			if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
				vector<double> adF(nSeqsPerOTU[i]);
				vector<int> anL(nSeqsPerOTU[i]);
					double tauValue = singleTau[seqNumber[index]];
					for(int k=0;k<position;k++){
-						double dist = getDistToCentroid(anL[k], nI, lengths[nI]);
+						double dist = getDistToCentroid(anL[k], nI, lengths[nI], uniqueFlowgrams, flowDataIntI, numFlowCells);
						adF[k] += dist * tauValue;
					}
				}
				centroids[i] = -1;
			}
		}
+
+		return 0;
	}
	catch(exception& e) {
		m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
		exit(1);
	}
}
-
/**************************************************************************************************/
// Accumulates the lookup-table -log probability distance between a centroid
// flowgram (`cent`) and a read flowgram (`flow`) over the first `length` flows.
// NOTE(review): diff fragment — flowBValue++ / loop close / return dist and the
// catch header are elided below; only the accumulation line is visible.
-double ShhherCommand::getDistToCentroid(int cent, int flow, int length){
+double ShhherCommand::getDistToCentroid(int cent, int flow, int length, vector<short>& uniqueFlowgrams,
+                                        vector<short>& flowDataIntI, int numFlowCells){
	try{
		int flowAValue = cent * numFlowCells;
		int flowBValue = flow * numFlowCells;
		double dist = 0;
-
+
		for(int i=0;i<length;i++){
			dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
			flowAValue++;
		exit(1);
	}
}
-
/**************************************************************************************************/
-double ShhherCommand::getNewWeights(){
+double ShhherCommand::getNewWeights(int numOTUs, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<double>& singleTau, vector<int>& seqNumber, vector<double>& weight){
try{
double maxChange = 0;
for(int i=0;i<numOTUs;i++){
- double difference = weight[i];
- weight[i] = 0;
-
- for(int j=0;j<nSeqsPerOTU[i];j++){
- int index = cumNumSeqs[i] + j;
- double tauValue = singleTau[seqNumber[index]];
- weight[i] += tauValue;
- }
-
- difference = fabs(weight[i] - difference);
- if(difference > maxChange){ maxChange = difference; }
- }
- return maxChange;
- }
- catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "getNewWeights");
- exit(1);
- }
-}
-
-/**************************************************************************************************/
-
-double ShhherCommand::getLikelihood(){
-
- try{
-
- vector<long double> P(numSeqs, 0);
- int effNumOTUs = 0;
-
- for(int i=0;i<numOTUs;i++){
- if(weight[i] > MIN_WEIGHT){
- effNumOTUs++;
- }
- }
-
- string hold;
- for(int i=0;i<numOTUs;i++){
- for(int j=0;j<nSeqsPerOTU[i];j++){
- int index = cumNumSeqs[i] + j;
- int nI = seqIndex[index];
- double singleDist = dist[seqNumber[index]];
-
- P[nI] += weight[i] * exp(-singleDist * sigma);
- }
- }
- double nLL = 0.00;
- for(int i=0;i<numSeqs;i++){
- if(P[i] == 0){ P[i] = DBL_EPSILON; }
-
- nLL += -log(P[i]);
- }
-
- nLL = nLL -(double)numSeqs * log(sigma);
-
- return nLL;
- }
- catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "getNewWeights");
- exit(1);
- }
-}
-
-/**************************************************************************************************/
-
-void ShhherCommand::checkCentroids(){
- try{
- vector<int> unique(numOTUs, 1);
-
- for(int i=0;i<numOTUs;i++){
- if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
- unique[i] = -1;
- }
- }
-
- for(int i=0;i<numOTUs;i++){
- if(unique[i] == 1){
- for(int j=i+1;j<numOTUs;j++){
- if(unique[j] == 1){
-
- if(centroids[j] == centroids[i]){
- unique[j] = 0;
- centroids[j] = -1;
-
- weight[i] += weight[j];
- weight[j] = 0.0;
- }
- }
- }
- }
- }
- }
- catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "checkCentroids");
- exit(1);
- }
-}
-
-/**************************************************************************************************/
-
-void ShhherCommand::calcNewDistances(){
- try{
-
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
-
- if(processors == 1) {
- calcNewDistancesParent(0, numSeqs);
- }
- else{ //you have multiple processors
- if (numSeqs < processors){ processors = 1; }
-
- vector<vector<int> > child_otuIndex(processors);
- vector<vector<int> > child_seqIndex(processors);
- vector<vector<double> > child_singleTau(processors);
- vector<int> totals(processors);
-
- int process = 1;
- vector<int> processIDs;
-
- //loop through and create all the processes you want
- while (process != processors) {
- int pid = vfork();
-
- if (pid > 0) {
- processIDs.push_back(pid); //create map from line number to pid so you can append files in correct order later
- process++;
- }else if (pid == 0){
- calcNewDistancesChild(nSeqsBreaks[process], nSeqsBreaks[process+1], child_otuIndex[process], child_seqIndex[process], child_singleTau[process]);
- totals[process] = child_otuIndex[process].size();
-
- exit(0);
- }else {
- m->mothurOut("[ERROR]: unable to spawn the necessary processes. Error code: " + toString(pid)); m->mothurOutEndLine();
- perror(" : ");
- for (int i=0;i<processIDs.size();i++) { int temp = processIDs[i]; kill (temp, SIGINT); }
- exit(0);
- }
- }
-
- //parent does its part
- calcNewDistancesParent(nSeqsBreaks[0], nSeqsBreaks[1]);
- int total = seqIndex.size();
-
- //force parent to wait until all the processes are done
- for (int i=0;i<processIDs.size();i++) {
- int temp = processIDs[i];
- wait(&temp);
- }
-
- for(int i=1;i<processors;i++){
- int oldTotal = total;
- total += totals[i];
-
- singleTau.resize(total, 0);
- seqIndex.resize(total, 0);
- seqNumber.resize(total, 0);
-
- int childIndex = 0;
-
- for(int j=oldTotal;j<total;j++){
- int otuI = child_otuIndex[i][childIndex];
- int seqI = child_seqIndex[i][childIndex];
-
- singleTau[j] = child_singleTau[i][childIndex];
- aaP[otuI][nSeqsPerOTU[otuI]] = j;
- aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
- nSeqsPerOTU[otuI]++;
-
- childIndex++;
- }
- }
- }
-#else
- calcNewDistancesParent(0, numSeqs);
-#endif
- }
- catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "calcNewDistances");
- exit(1);
- }
-}
-
-/**************************************************************************************************/
-#ifdef USE_MPI
-void ShhherCommand::calcNewDistancesChildMPI(int startSeq, int stopSeq, vector<int>& otuIndex){
-
- try{
- vector<double> newTau(numOTUs,0);
- vector<double> norms(numSeqs, 0);
- otuIndex.clear();
- seqIndex.clear();
- singleTau.clear();
-
-
-
- for(int i=startSeq;i<stopSeq;i++){
- double offset = 1e8;
- int indexOffset = i * numOTUs;
-
- for(int j=0;j<numOTUs;j++){
-
- if(weight[j] > MIN_WEIGHT && change[j] == 1){
- dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
- }
- if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
- offset = dist[indexOffset + j];
- }
- }
-
- for(int j=0;j<numOTUs;j++){
- if(weight[j] > MIN_WEIGHT){
- newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
- norms[i] += newTau[j];
- }
- else{
- newTau[j] = 0.0;
- }
- }
+ if (m->control_pressed) { break; }
- for(int j=0;j<numOTUs;j++){
-
- newTau[j] /= norms[i];
-
- if(newTau[j] > MIN_TAU){
- otuIndex.push_back(j);
- seqIndex.push_back(i);
- singleTau.push_back(newTau[j]);
- }
+ double difference = weight[i];
+ weight[i] = 0;
+
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int index = cumNumSeqs[i] + j;
+ double tauValue = singleTau[seqNumber[index]];
+ weight[i] += tauValue;
}
+ difference = fabs(weight[i] - difference);
+ if(difference > maxChange){ maxChange = difference; }
}
+ return maxChange;
}
catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "calcNewDistancesChildMPI");
+ m->errorOut(e, "ShhherCommand", "getNewWeights");
exit(1);
}
}
-#endif
+
/**************************************************************************************************/
-void ShhherCommand::calcNewDistancesChild(int startSeq, int stopSeq, vector<int>& child_otuIndex, vector<int>& child_seqIndex, vector<double>& child_singleTau){
+double ShhherCommand::getLikelihood(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<int>& seqNumber, vector<int>& cumNumSeqs, vector<int>& seqIndex, vector<double>& dist, vector<double>& weight){
try{
- vector<double> newTau(numOTUs,0);
- vector<double> norms(numSeqs, 0);
- child_otuIndex.resize(0);
- child_seqIndex.resize(0);
- child_singleTau.resize(0);
- for(int i=startSeq;i<stopSeq;i++){
- double offset = 1e8;
- int indexOffset = i * numOTUs;
+ vector<long double> P(numSeqs, 0);
+ int effNumOTUs = 0;
+
+ for(int i=0;i<numOTUs;i++){
+ if(weight[i] > MIN_WEIGHT){
+ effNumOTUs++;
+ }
+ }
+
+ string hold;
+ for(int i=0;i<numOTUs;i++){
+ if (m->control_pressed) { break; }
- for(int j=0;j<numOTUs;j++){
- if(weight[j] > MIN_WEIGHT && change[j] == 1){
- dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
- }
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int index = cumNumSeqs[i] + j;
+ int nI = seqIndex[index];
+ double singleDist = dist[seqNumber[index]];
- if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
- offset = dist[indexOffset + j];
- }
+ P[nI] += weight[i] * exp(-singleDist * sigma);
}
-
- for(int j=0;j<numOTUs;j++){
- if(weight[j] > MIN_WEIGHT){
- newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
- norms[i] += newTau[j];
- }
- else{
- newTau[j] = 0.0;
- }
+ }
+ double nLL = 0.00;
+ for(int i=0;i<numSeqs;i++){
+ if(P[i] == 0){ P[i] = DBL_EPSILON; }
+
+ nLL += -log(P[i]);
+ }
+
+ nLL = nLL -(double)numSeqs * log(sigma);
+
+ return nLL;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "getNewWeights");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+int ShhherCommand::checkCentroids(int numOTUs, vector<int>& centroids, vector<double>& weight){
+ try{
+ vector<int> unique(numOTUs, 1);
+
+ for(int i=0;i<numOTUs;i++){
+ if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
+ unique[i] = -1;
}
+ }
+
+ for(int i=0;i<numOTUs;i++){
- for(int j=0;j<numOTUs;j++){
- newTau[j] /= norms[i];
-
- if(newTau[j] > MIN_TAU){
- child_otuIndex.push_back(j);
- child_seqIndex.push_back(i);
- child_singleTau.push_back(newTau[j]);
+ if (m->control_pressed) { break; }
+
+ if(unique[i] == 1){
+ for(int j=i+1;j<numOTUs;j++){
+ if(unique[j] == 1){
+
+ if(centroids[j] == centroids[i]){
+ unique[j] = 0;
+ centroids[j] = -1;
+
+ weight[i] += weight[j];
+ weight[j] = 0.0;
+ }
+ }
}
}
}
+
+ return 0;
}
catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "calcNewDistancesChild");
+ m->errorOut(e, "ShhherCommand", "checkCentroids");
exit(1);
}
}
-
/**************************************************************************************************/
// E-step: recomputes each read's distance to every live OTU centroid, converts
// distances to responsibilities (newTau), and rebuilds the OTU membership counts.
// NOTE(review): diff fragment — the norms[i] accumulation / else branch inside
// the second j-loop and the aaP/aaI/singleTau updates in the final j-loop are
// elided here; as written the visible second loop would overwrite newTau with 0.
-void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){
+void ShhherCommand::calcNewDistances(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU,  vector<double>& dist,
+                                     vector<double>& weight, vector<short>& change, vector<int>& centroids,
+                                     vector<vector<int> >& aaP, vector<double>& singleTau, vector<vector<int> >& aaI,
+                                     vector<int>& seqNumber, vector<int>& seqIndex,
+                                     vector<short>& uniqueFlowgrams,
+                                     vector<short>& flowDataIntI, int numFlowCells, vector<int>& lengths){
	try{
		vector<double> newTau(numOTUs,0);
		vector<double> norms(numSeqs, 0);
		nSeqsPerOTU.assign(numOTUs, 0);
-
-		for(int i=startSeq;i<stopSeq;i++){
-			int indexOffset = i * numOTUs;
+
+		for(int i=0;i<numSeqs;i++){
+
+			if (m->control_pressed) { break; }
+			int indexOffset = i * numOTUs;
+
			// offset tracks the minimum distance, used to stabilize the exponentials.
			double offset = 1e8;
			for(int j=0;j<numOTUs;j++){
+
				if(weight[j] > MIN_WEIGHT && change[j] == 1){
-					dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
+					dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i], uniqueFlowgrams, flowDataIntI, numFlowCells);
				}
-
+
				if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
					offset = dist[indexOffset + j];
				}
			}
-
+
			for(int j=0;j<numOTUs;j++){
				if(weight[j] > MIN_WEIGHT){
					newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
					newTau[j] = 0.0;
				}
			}
-
+
			for(int j=0;j<numOTUs;j++){
				newTau[j] /= norms[i];
			}
-
+
			for(int j=0;j<numOTUs;j++){
				if(newTau[j] > MIN_TAU){
					nSeqsPerOTU[j]++;
				}
			}
+
		}
+
	}
	catch(exception& e) {
-		m->errorOut(e, "ShhherCommand", "calcNewDistancesParent");
+		m->errorOut(e, "ShhherCommand", "calcNewDistances");
		exit(1);
	}
}
+/**************************************************************************************************/
+int ShhherCommand::fill(int numOTUs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
+ try {
+ int index = 0;
+ for(int i=0;i<numOTUs;i++){
+
+ if (m->control_pressed) { return 0; }
+
+ cumNumSeqs[i] = index;
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ seqNumber[index] = aaP[i][j];
+ seqIndex[index] = aaI[i][j];
+
+ index++;
+ }
+ }
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "fill");
+ exit(1);
+ }
+}
/**************************************************************************************************/
// Hard-assigns each read to its best OTU using the tau responsibilities, then
// rebuilds the flattened membership arrays via fill().
// NOTE(review): diff fragment — the bigTauMatrix population, the per-read argmax
// pass, and the aaP/aaI rebuild that define `index` below are elided here.
-void ShhherCommand::setOTUs(){
+void ShhherCommand::setOTUs(int numOTUs, int numSeqs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU,
+                            vector<int>& otuData, vector<double>& singleTau, vector<double>& dist, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
	try {
		vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
		for(int i=0;i<numOTUs;i++){
+
+			if (m->control_pressed) { break; }
+
			for(int j=0;j<nSeqsPerOTU[i];j++){
				int index = cumNumSeqs[i] + j;
				double tauValue = singleTau[seqNumber[index]];
			nSeqsPerOTU[index]++;
		}
-		fill();
+
+		fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
	}
	catch(exception& e) {
-		m->errorOut(e, "ShhherCommand", "calcNewDistances");
+		m->errorOut(e, "ShhherCommand", "setOTUs");
		exit(1);
	}
}
-
/**************************************************************************************************/
// Writes a .qual file: one quality record per non-empty OTU, headed by the name
// of the OTU's representative read.
// NOTE(review): diff fragment — the code that fills the `qualities` table is
// elided. Visible defect: the new bounds guard `j > qualities[i].size()` runs
// AFTER qualities[i][j] has already been read (while condition and output), and
// uses `>` where `>=` is needed; it also compares signed j to an unsigned size.
-void ShhherCommand::writeQualities(vector<int> otuCounts){
+void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string qualityFileName, vector<int> otuCounts, vector<int>& nSeqsPerOTU, vector<int>& seqNumber,
+                                   vector<double>& singleTau, vector<short>& flowDataIntI, vector<short>& uniqueFlowgrams, vector<int>& cumNumSeqs,
+                                   vector<int>& mapUniqueToSeq, vector<string>& seqNameVector, vector<int>& centroids, vector<vector<int> >& aaI){
	try {
-		string qualityFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.qual";
-
+
		ofstream qualityFile;
		m->openOutputFile(qualityFileName, qualityFile);
-
+
		qualityFile.setf(ios::fixed, ios::floatfield);
		qualityFile.setf(ios::showpoint);
		qualityFile << setprecision(6);
		for(int i=0;i<numOTUs;i++){
+
+			if (m->control_pressed) { break; }
+
			int index = 0;
			int base = 0;
			if(otuCounts[i] > 0){
				qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
-
+
				int j=4;	//need to get past the first four bases
				while(qualities[i][j] != -1){
-					qualityFile << qualities[i][j] << ' ';
-					j++;
+					qualityFile << qualities[i][j] << ' ';
+					if (j > qualities[i].size()) { break; }
+					j++;
				}
				qualityFile << endl;
			}
		}
		qualityFile.close();
-
+		outputNames.push_back(qualityFileName); outputTypes["qfile"].push_back(qualityFileName);
+
	}
	catch(exception& e) {
		m->errorOut(e, "ShhherCommand", "writeQualities");
/**************************************************************************************************/
// Writes the denoised FASTA: each non-empty OTU's centroid flowgram is
// translated back to bases (flow order repeated, homopolymer count per flow),
// the 4-base key is trimmed, and the file is appended to the composite FASTA.
// NOTE(review): diff fragment — the `string newSeq` declaration and the '>'
// header line that should precede each sequence are elided here.
-void ShhherCommand::writeSequences(vector<int> otuCounts){
+void ShhherCommand::writeSequences(string thisCompositeFASTAFileName, int numOTUs, int numFlowCells, string fastaFileName, vector<int> otuCounts, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& centroids){
	try {
-		string fastaFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.fasta";
		ofstream fastaFile;
		m->openOutputFile(fastaFileName, fastaFile);
		vector<string> names(numOTUs, "");
		for(int i=0;i<numOTUs;i++){
+
+			if (m->control_pressed) { break; }
+
			int index = centroids[i];
			if(otuCounts[i] > 0){
				for(int j=0;j<numFlowCells;j++){
-					char base = flowOrder[j % 4];
+					char base = flowOrder[j % flowOrder.length()];
					for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
						newSeq += base;
					}
				}
-				fastaFile << newSeq.substr(4) << endl;
+				if (newSeq.length() >= 4) { fastaFile << newSeq.substr(4) << endl; }
+				else { fastaFile << "NNNN" << endl; }
			}
		}
		fastaFile.close();
-
-		if(compositeFASTAFileName != ""){
-			m->appendFiles(fastaFileName, compositeFASTAFileName);
+
+		outputNames.push_back(fastaFileName); outputTypes["fasta"].push_back(fastaFileName);
+
+		if(thisCompositeFASTAFileName != ""){
+			m->appendFiles(fastaFileName, thisCompositeFASTAFileName);
		}
	}
	catch(exception& e) {
/**************************************************************************************************/
// Writes the final .names file (representative name -> member names) and
// appends it to the composite names file.
// NOTE(review): diff fragment — the loop over the remaining OTU members and the
// trailing newline are elided; only the duplicated first-name output is visible.
-void ShhherCommand::writeNames(vector<int> otuCounts){
+void ShhherCommand::writeNames(string thisCompositeNamesFileName, int numOTUs, string nameFileName, vector<int> otuCounts, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU){
	try {
-		string nameFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.final.names";
+
		ofstream nameFile;
		m->openOutputFile(nameFileName, nameFile);
		for(int i=0;i<numOTUs;i++){
+
+			if (m->control_pressed) { break; }
+
			if(otuCounts[i] > 0){
				nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
			}
		}
		nameFile.close();
+		outputNames.push_back(nameFileName); outputTypes["name"].push_back(nameFileName);
+
+
+		if(thisCompositeNamesFileName != ""){
+			m->appendFiles(nameFileName, thisCompositeNamesFileName);
+		}
	}
	catch(exception& e) {
		m->errorOut(e, "ShhherCommand", "writeNames");
/**************************************************************************************************/
// Writes a .groups file assigning every sequence to the group named by
// fileRoot (one "name<TAB>group" line per read).
// NOTE(review): diff fragment — the closing exit(1)/braces of the catch are
// elided past the visible window.
-void ShhherCommand::writeGroups(){
+void ShhherCommand::writeGroups(string groupFileName, string fileRoot, int numSeqs, vector<string>& seqNameVector){
	try {
-		string fileRoot = flowFileName.substr(0,flowFileName.find_last_of('.'));
-		string groupFileName = fileRoot + ".pn.groups";
-		ofstream groupFile;
+		ofstream groupFile;
		m->openOutputFile(groupFileName, groupFile);
		for(int i=0;i<numSeqs;i++){
+			if (m->control_pressed) { break; }
			groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
		}
		groupFile.close();
+		outputNames.push_back(groupFileName); outputTypes["group"].push_back(groupFileName);
+
	}
	catch(exception& e) {
		m->errorOut(e, "ShhherCommand", "writeGroups");
/**************************************************************************************************/
// Writes the .counts diagnostic file: for each non-empty OTU, the base-space
// "ideal" centroid sequence followed by each member read re-translated from
// its flowgram intensities.
// NOTE(review): diff fragment — the member loop that defines `sequence` (and
// the per-read name output) is elided between the centroid and read sections.
-void ShhherCommand::writeClusters(vector<int> otuCounts){
+void ShhherCommand::writeClusters(string otuCountsFileName, int numOTUs, int numFlowCells, vector<int> otuCounts, vector<int>& centroids, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU, vector<int>& lengths, vector<short>& flowDataIntI){
	try {
-		string otuCountsFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.counts";
		ofstream otuCountsFile;
		m->openOutputFile(otuCountsFileName, otuCountsFile);
		string bases = flowOrder;
		for(int i=0;i<numOTUs;i++){
+
+			if (m->control_pressed) {
+				break;
+			}
			//output the translated version of the centroid sequence for the otu
			if(otuCounts[i] > 0){
				int index = centroids[i];
				otuCountsFile << "ideal\t";
				for(int j=8;j<numFlowCells;j++){
-					char base = bases[j % 4];
+					char base = bases[j % bases.length()];
					for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
						otuCountsFile << base;
					}
				string newSeq = "";
				for(int k=0;k<lengths[sequence];k++){
-					char base = bases[k % 4];
+					char base = bases[k % bases.length()];
					// flow intensities are stored x100; round back to a homopolymer count
					int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
-
+
					for(int s=0;s<freq;s++){
						newSeq += base;
						//otuCountsFile << base;
					}
				}
-				otuCountsFile << newSeq.substr(4) << endl;
+
+				if (newSeq.length() >= 4) { otuCountsFile << newSeq.substr(4) << endl; }
+				else { otuCountsFile << "NNNN" << endl; }
			}
			otuCountsFile << endl;
		}
	}
		otuCountsFile.close();
+		outputNames.push_back(otuCountsFileName); outputTypes["counts"].push_back(otuCountsFileName);
+
	}
	catch(exception& e) {
		m->errorOut(e, "ShhherCommand", "writeClusters");
	}
}
-//**********************************************************************************************************************
+/**************************************************************************************************/
+
+void ShhherCommand::getSingleLookUp(){
+ try{
+ // these are the -log probabilities that a signal corresponds to a particular homopolymer length
+ singleLookUp.assign(HOMOPS * NUMBINS, 0);
+
+ int index = 0;
+ ifstream lookUpFile;
+ m->openInputFile(lookupFileName, lookUpFile);
+
+ for(int i=0;i<HOMOPS;i++){
+
+ if (m->control_pressed) { break; }
+
+ float logFracFreq;
+ lookUpFile >> logFracFreq;
+
+ for(int j=0;j<NUMBINS;j++) {
+ lookUpFile >> singleLookUp[index];
+ index++;
+ }
+ }
+ lookUpFile.close();
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "getSingleLookUp");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+void ShhherCommand::getJointLookUp(){
+ try{
+
+ // the most likely joint probability (-log) that two intenities have the same polymer length
+ jointLookUp.resize(NUMBINS * NUMBINS, 0);
+
+ for(int i=0;i<NUMBINS;i++){
+
+ if (m->control_pressed) { break; }
+
+ for(int j=0;j<NUMBINS;j++){
+
+ double minSum = 100000000;
+
+ for(int k=0;k<HOMOPS;k++){
+ double sum = singleLookUp[k * NUMBINS + i] + singleLookUp[k * NUMBINS + j];
+
+ if(sum < minSum) { minSum = sum; }
+ }
+ jointLookUp[i * NUMBINS + j] = minSum;
+ }
+ }
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "getJointLookUp");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+double ShhherCommand::getProbIntensity(int intIntensity){
+ try{
+
+ double minNegLogProb = 100000000;
+
+
+ for(int i=0;i<HOMOPS;i++){//loop signal strength
+
+ if (m->control_pressed) { break; }
+
+ float negLogProb = singleLookUp[i * NUMBINS + intIntensity];
+ if(negLogProb < minNegLogProb) { minNegLogProb = negLogProb; }
+ }
+
+ return minNegLogProb;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "getProbIntensity");
+ exit(1);
+ }
+}
+
+
+
+