]> git.donarmstrong.com Git - mothur.git/blob - shhhercommand.cpp
changing command name classify.shared to classifyrf.shared
[mothur.git] / shhhercommand.cpp
1 /*
2  *  shhher.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 12/27/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "shhhercommand.h"
11
12 //**********************************************************************************************************************
13 vector<string> ShhherCommand::setParameters(){  
14         try {
15                 CommandParameter pflow("flow", "InputTypes", "", "", "none", "fileflow", "none","fasta-name-group-counts-qfile",false,false,true); parameters.push_back(pflow);
16                 CommandParameter pfile("file", "InputTypes", "", "", "none", "fileflow", "none","fasta-name-group-counts-qfile",false,false,true); parameters.push_back(pfile);
17                 CommandParameter plookup("lookup", "InputTypes", "", "", "none", "none", "none","",false,false,true); parameters.push_back(plookup);
18                 CommandParameter pcutoff("cutoff", "Number", "", "0.01", "", "", "","",false,false); parameters.push_back(pcutoff);
19                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
20                 CommandParameter pmaxiter("maxiter", "Number", "", "1000", "", "", "","",false,false); parameters.push_back(pmaxiter);
21         CommandParameter plarge("large", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(plarge);
22                 CommandParameter psigma("sigma", "Number", "", "60", "", "", "","",false,false); parameters.push_back(psigma);
23                 CommandParameter pmindelta("mindelta", "Number", "", "0.000001", "", "", "","",false,false); parameters.push_back(pmindelta);
24         CommandParameter porder("order", "Multiple", "A-B-I", "A", "", "", "","",false,false, true); parameters.push_back(porder);              CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
25                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
26                 
27                 vector<string> myArray;
28                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
29                 return myArray;
30         }
31         catch(exception& e) {
32                 m->errorOut(e, "ShhherCommand", "setParameters");
33                 exit(1);
34         }
35 }
36 //**********************************************************************************************************************
37 string ShhherCommand::getHelpString(){  
38         try {
39                 string helpString = "";
40                 helpString += "The shhh.flows command reads a file containing flowgrams and creates a file of corrected sequences.\n";
41         helpString += "The shhh.flows command parameters are flow, file, lookup, cutoff, processors, large, maxiter, sigma, mindelta and order.\n";
42         helpString += "The flow parameter is used to input your flow file.\n";
43         helpString += "The file parameter is used to input the *flow.files file created by trim.flows.\n";
44         helpString += "The lookup parameter is used specify the lookup file you would like to use. http://www.mothur.org/wiki/Lookup_files.\n";
45         helpString += "The order parameter options are A, B or I.  Default=A. A = TACG and B = TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC and I = TACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGC.\n";
46                 return helpString;
47         }
48         catch(exception& e) {
49                 m->errorOut(e, "ShhherCommand", "getHelpString");
50                 exit(1);
51         }
52 }
53 //**********************************************************************************************************************
54 string ShhherCommand::getOutputPattern(string type) {
55     try {
56         string pattern = "";
57         
58         if (type == "fasta")            {   pattern = "[filename],shhh.fasta";   }
59         else if (type == "name")    {   pattern = "[filename],shhh.names";   }
60         else if (type == "group")        {   pattern = "[filename],shhh.groups";   }
61         else if (type == "counts")        {   pattern = "[filename],shhh.counts";   }
62         else if (type == "qfile")        {   pattern = "[filename],shhh.qual";   }
63         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
64         
65         return pattern;
66     }
67     catch(exception& e) {
68         m->errorOut(e, "ShhherCommand", "getOutputPattern");
69         exit(1);
70     }
71 }
72 //**********************************************************************************************************************
73
74 ShhherCommand::ShhherCommand(){ 
75         try {
76                 abort = true; calledHelp = true;
77                 setParameters();
78                 
79                 //initialize outputTypes
80                 vector<string> tempOutNames;
81                 outputTypes["fasta"] = tempOutNames;
82         outputTypes["name"] = tempOutNames;
83         outputTypes["group"] = tempOutNames;
84         outputTypes["counts"] = tempOutNames;
85         outputTypes["qfile"] = tempOutNames;
86
87         }
88         catch(exception& e) {
89                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
90                 exit(1);
91         }
92 }
93
94 //**********************************************************************************************************************
95
96 ShhherCommand::ShhherCommand(string option) {
97         try {
98         
99 #ifdef USE_MPI
100                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
101                 MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
102         
103                 if(pid == 0){
104 #endif
105                 abort = false; calledHelp = false;   
106                 
107                 //allow user to run help
108                 if(option == "help") { help(); abort = true; calledHelp = true; }
109                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
110                 
111                 else {
112                         vector<string> myArray = setParameters();
113                         
114                         OptionParser parser(option);
115                         map<string,string> parameters = parser.getParameters();
116                         
117                         ValidParameters validParameter;
118                         map<string,string>::iterator it;
119                         
120                         //check to make sure all parameters are valid for command
121                         for (it = parameters.begin(); it != parameters.end(); it++) { 
122                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
123                         }
124                         
125                         //initialize outputTypes
126             vector<string> tempOutNames;
127             outputTypes["fasta"] = tempOutNames;
128             outputTypes["name"] = tempOutNames;
129             outputTypes["group"] = tempOutNames;
130             outputTypes["counts"] = tempOutNames;
131             outputTypes["qfile"] = tempOutNames;
132
133                         
134                         //if the user changes the input directory command factory will send this info to us in the output parameter 
135                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
136                         if (inputDir == "not found"){   inputDir = "";          }
137                         else {
138                                 string path;
139                                 it = parameters.find("flow");
140                                 //user has given a template file
141                                 if(it != parameters.end()){ 
142                                         path = m->hasPath(it->second);
143                                         //if the user has not given a path then, add inputdir. else leave path alone.
144                                         if (path == "") {       parameters["flow"] = inputDir + it->second;             }
145                                 }
146                                 
147                                 it = parameters.find("lookup");
148                                 //user has given a template file
149                                 if(it != parameters.end()){ 
150                                         path = m->hasPath(it->second);
151                                         //if the user has not given a path then, add inputdir. else leave path alone.
152                                         if (path == "") {       parameters["lookup"] = inputDir + it->second;           }
153                                 }
154
155                                 it = parameters.find("file");
156                                 //user has given a template file
157                                 if(it != parameters.end()){ 
158                                         path = m->hasPath(it->second);
159                                         //if the user has not given a path then, add inputdir. else leave path alone.
160                                         if (path == "") {       parameters["file"] = inputDir + it->second;             }
161                                 }
162                         }
163             
164             //if the user changes the output directory command factory will send this info to us in the output parameter 
165                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = ""; }
166             
167                         //check for required parameters
168                         flowFileName = validParameter.validFile(parameters, "flow", true);
169                         flowFilesFileName = validParameter.validFile(parameters, "file", true);
170                         if (flowFileName == "not found" && flowFilesFileName == "not found") {
171                                 m->mothurOut("values for either flow or file must be provided for the shhh.flows command.");
172                                 m->mothurOutEndLine();
173                                 abort = true; 
174                         }
175                         else if (flowFileName == "not open" || flowFilesFileName == "not open") { abort = true; }
176                         
177                         if(flowFileName != "not found"){
178                                 compositeFASTAFileName = "";    
179                                 compositeNamesFileName = "";    
180                         }
181                         else{
182                                 ofstream temp;
183                 
184                 string thisoutputDir = outputDir;
185                 if (outputDir == "") {  thisoutputDir =  m->hasPath(flowFilesFileName); } //if user entered a file with a path then preserve it
186                 
187                                 //we want to rip off .files, and also .flow if its there
188                 string fileroot = m->getRootName(m->getSimpleName(flowFilesFileName));
189                 if (fileroot[fileroot.length()-1] == '.') {  fileroot = fileroot.substr(0, fileroot.length()-1); } //rip off dot
190                 string extension = m->getExtension(fileroot);
191                 if (extension == ".flow") { fileroot = m->getRootName(fileroot);  }
192                 else { fileroot += "."; } //add back if needed
193                 
194                                 compositeFASTAFileName = thisoutputDir + fileroot + "shhh.fasta";
195                                 m->openOutputFile(compositeFASTAFileName, temp);
196                                 temp.close();
197                                 
198                                 compositeNamesFileName = thisoutputDir + fileroot + "shhh.names";
199                                 m->openOutputFile(compositeNamesFileName, temp);
200                                 temp.close();
201                         }
202             
203             if(flowFilesFileName != "not found"){
204                 string fName;
205                 
206                 ifstream flowFilesFile;
207                 m->openInputFile(flowFilesFileName, flowFilesFile);
208                 while(flowFilesFile){
209                     fName = m->getline(flowFilesFile);
210                     
211                     //test if file is valid
212                     ifstream in;
213                     int ableToOpen = m->openInputFile(fName, in, "noerror");
214                     in.close(); 
215                     if (ableToOpen == 1) {
216                         if (inputDir != "") { //default path is set
217                             string tryPath = inputDir + fName;
218                             m->mothurOut("Unable to open " + fName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
219                             ifstream in2;
220                             ableToOpen = m->openInputFile(tryPath, in2, "noerror");
221                             in2.close();
222                             fName = tryPath;
223                         }
224                     }
225                     
226                     if (ableToOpen == 1) {
227                         if (m->getDefaultPath() != "") { //default path is set
228                             string tryPath = m->getDefaultPath() + m->getSimpleName(fName);
229                             m->mothurOut("Unable to open " + fName + ". Trying default " + tryPath); m->mothurOutEndLine();
230                             ifstream in2;
231                             ableToOpen = m->openInputFile(tryPath, in2, "noerror");
232                             in2.close();
233                             fName = tryPath;
234                         }
235                     }
236                     
237                     //if you can't open it its not in current working directory or inputDir, try mothur excutable location
238                     if (ableToOpen == 1) {
239                         string exepath = m->argv;
240                         string tempPath = exepath;
241                         for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
242                         exepath = exepath.substr(0, (tempPath.find_last_of('m')));
243                         
244                         string tryPath = m->getFullPathName(exepath) + m->getSimpleName(fName);
245                         m->mothurOut("Unable to open " + fName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
246                         ifstream in2;
247                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
248                         in2.close();
249                         fName = tryPath;
250                     }
251                     
252                     if (ableToOpen == 1) {  m->mothurOut("Unable to open " + fName + ". Disregarding. "); m->mothurOutEndLine();  }
253                     else { flowFileVector.push_back(fName); }
254                     m->gobble(flowFilesFile);
255                 }
256                 flowFilesFile.close();
257                 if (flowFileVector.size() == 0) {  m->mothurOut("[ERROR]: no valid files."); m->mothurOutEndLine(); abort = true; }
258             }
259             else{
260                 if (outputDir == "") { outputDir = m->hasPath(flowFileName); }
261                 flowFileVector.push_back(flowFileName);
262             }
263                 
264                         //check for optional parameter and set defaults
265                         // ...at some point should added some additional type checking...
266                         string temp;
267                         temp = validParameter.validFile(parameters, "lookup", true);
268                         if (temp == "not found")        {       
269                                 string path = m->argv;
270                 string tempPath = path;
271                 for (int i = 0; i < path.length(); i++) { tempPath[i] = tolower(path[i]); }
272                 path = path.substr(0, (tempPath.find_last_of('m')));
273                 
274 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
275                 path += "lookupFiles/";
276 #else
277                 path += "lookupFiles\\";
278 #endif
279                                 lookupFileName = m->getFullPathName(path) + "LookUp_Titanium.pat";
280                                 
281                                 int ableToOpen;
282                                 ifstream in;
283                                 ableToOpen = m->openInputFile(lookupFileName, in, "noerror");
284                                 in.close();     
285                                 
286                                 //if you can't open it, try input location
287                                 if (ableToOpen == 1) {
288                                         if (inputDir != "") { //default path is set
289                                                 string tryPath = inputDir + m->getSimpleName(lookupFileName);
290                                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
291                                                 ifstream in2;
292                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
293                                                 in2.close();
294                                                 lookupFileName = tryPath;
295                                         }
296                                 }
297                                 
298                                 //if you can't open it, try default location
299                                 if (ableToOpen == 1) {
300                                         if (m->getDefaultPath() != "") { //default path is set
301                                                 string tryPath = m->getDefaultPath() + m->getSimpleName(lookupFileName);
302                                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying default " + tryPath); m->mothurOutEndLine();
303                                                 ifstream in2;
304                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
305                                                 in2.close();
306                                                 lookupFileName = tryPath;
307                                         }
308                                 }
309                                 
310                                 //if you can't open it its not in current working directory or inputDir, try mothur excutable location
311                                 if (ableToOpen == 1) {
312                                         string exepath = m->argv;
313                                         string tempPath = exepath;
314                                         for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
315                                         exepath = exepath.substr(0, (tempPath.find_last_of('m')));
316                                         
317                                         string tryPath = m->getFullPathName(exepath) + m->getSimpleName(lookupFileName);
318                                         m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
319                                         ifstream in2;
320                                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
321                                         in2.close();
322                                         lookupFileName = tryPath;
323                                 }
324                                 
325                                 if (ableToOpen == 1) {  m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true;  }
326                         }
327                         else if(temp == "not open")     {       
328                                 
329                                 lookupFileName = validParameter.validFile(parameters, "lookup", false);
330                                 
331                                 //if you can't open it its not inputDir, try mothur excutable location
332                                 string exepath = m->argv;
333                                 string tempPath = exepath;
334                                 for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
335                                 exepath = exepath.substr(0, (tempPath.find_last_of('m')));
336                                         
337                                 string tryPath = m->getFullPathName(exepath) + m->getSimpleName(lookupFileName);
338                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
339                                 ifstream in2;
340                                 int ableToOpen = m->openInputFile(tryPath, in2, "noerror");
341                                 in2.close();
342                                 lookupFileName = tryPath;
343                                 
344                                 if (ableToOpen == 1) {  m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true;  }
345                         }else                                           {       lookupFileName = temp;  }
346                         
347                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
348                         m->setProcessors(temp);
349                         m->mothurConvert(temp, processors);
350
351                         temp = validParameter.validFile(parameters, "cutoff", false);   if (temp == "not found"){       temp = "0.01";          }
352                         m->mothurConvert(temp, cutoff); 
353                         
354                         temp = validParameter.validFile(parameters, "mindelta", false); if (temp == "not found"){       temp = "0.000001";      }
355                         m->mothurConvert(temp, minDelta); 
356
357                         temp = validParameter.validFile(parameters, "maxiter", false);  if (temp == "not found"){       temp = "1000";          }
358                         m->mothurConvert(temp, maxIters); 
359             
360             temp = validParameter.validFile(parameters, "large", false);        if (temp == "not found"){       temp = "0";             }
361                         m->mothurConvert(temp, largeSize); 
362             if (largeSize != 0) { large = true; }
363             else { large = false;  }
364             if (largeSize < 0) {  m->mothurOut("The value of the large cannot be negative.\n"); }
365             
366 #ifdef USE_MPI
367             if (large) { m->mothurOut("The large parameter is not available with the MPI-Enabled version.\n"); large=false; }                           
368 #endif
369             
370
371                         temp = validParameter.validFile(parameters, "sigma", false);if (temp == "not found")    {       temp = "60";            }
372                         m->mothurConvert(temp, sigma); 
373                         
374                         temp = validParameter.validFile(parameters, "order", false);  if (temp == "not found"){         temp = "A";     }
375             if (temp.length() > 1) {  m->mothurOut("[ERROR]: " + temp + " is not a valid option for order. order options are A, B, or I. A = TACG, B = TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC, and I = TACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGC.\n");  abort=true;
376             }
377             else {
378                 if (toupper(temp[0]) == 'A') {  flowOrder = "TACG";   }
379                 else if(toupper(temp[0]) == 'B'){
380                     flowOrder = "TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC";   }
381                 else if(toupper(temp[0]) == 'I'){
382                     flowOrder = "TACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGC";   }
383                 else {
384                     m->mothurOut("[ERROR]: " + temp + " is not a valid option for order. order options are A, B, or I. A = TACG, B = TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC, and I = TACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGC.\n");  abort=true;
385                 }
386             }
387
388                         
389                 }
390 #ifdef USE_MPI
391                 }                               
392 #endif
393         }
394         catch(exception& e) {
395                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
396                 exit(1);
397         }
398 }
399 //**********************************************************************************************************************
400 #ifdef USE_MPI
401 int ShhherCommand::execute(){
402         try {
403                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
404                 
405                 int tag = 1976;
406                 MPI_Status status; 
407                         
408                 if(pid == 0){
409
410                         for(int i=1;i<ncpus;i++){
411                                 MPI_Send(&abort, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
412                         }
413                         if(abort == 1){ return 0;       }
414
415                         processors = ncpus;
416                         
417                         m->mothurOut("\nGetting preliminary data...\n");
418                         getSingleLookUp();      if (m->control_pressed) { return 0; }
419                         getJointLookUp();       if (m->control_pressed) { return 0; }
420                         
421             vector<string> flowFileVector;
422                         if(flowFilesFileName != "not found"){
423                                 string fName;
424                 
425                                 ifstream flowFilesFile;
426                                 m->openInputFile(flowFilesFileName, flowFilesFile);
427                                 while(flowFilesFile){
428                                         fName = m->getline(flowFilesFile);
429                                         flowFileVector.push_back(fName);
430                                         m->gobble(flowFilesFile);
431                                 }
432                         }
433                         else{
434                                 flowFileVector.push_back(flowFileName);
435                         }
436             
437                         int numFiles = flowFileVector.size();
438
439                         for(int i=1;i<ncpus;i++){
440                                 MPI_Send(&numFiles, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
441                         }
442                         
443                         for(int i=0;i<numFiles;i++){
444                                 
445                                 if (m->control_pressed) { break; }
446                                 
447                                 double begClock = clock();
448                                 unsigned long long begTime = time(NULL);
449
450                                 flowFileName = flowFileVector[i];
451                                 
452                                 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
453                                 m->mothurOut("Reading flowgrams...\n");
454                                 getFlowData();
455                                 
456                                 if (m->control_pressed) { break; }
457
458                                 m->mothurOut("Identifying unique flowgrams...\n");
459                                 getUniques();
460                                 
461                                 if (m->control_pressed) { break; }
462
463                                 m->mothurOut("Calculating distances between flowgrams...\n");
464                                 char fileName[1024];
465                                 strcpy(fileName, flowFileName.c_str());
466
467                                 for(int i=1;i<ncpus;i++){
468                                         MPI_Send(&fileName[0], 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
469
470                                         MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
471                                         MPI_Send(&numUniques, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
472                                         MPI_Send(&numFlowCells, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
473                                         MPI_Send(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, i, tag, MPI_COMM_WORLD);
474                                         MPI_Send(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
475                                         MPI_Send(&mapUniqueToSeq[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
476                                         MPI_Send(&mapSeqToUnique[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
477                                         MPI_Send(&lengths[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
478                                         MPI_Send(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
479                                         MPI_Send(&cutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
480                                 }                               
481                                                         
482                                 string distFileName = flowDistMPI(0, int(sqrt(1.0/float(ncpus)) * numUniques));
483                                 
484                                 if (m->control_pressed) { break; }
485                                 
486                                 int done;
487                                 for(int i=1;i<ncpus;i++){
488                                         MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
489                                         
490                                         m->appendFiles((distFileName + ".temp." + toString(i)), distFileName);
491                                         m->mothurRemove((distFileName + ".temp." + toString(i)));
492                                 }
493
494                                 string namesFileName = createNamesFile();
495                                 
496                                 if (m->control_pressed) { break; }
497                                 
498                                 m->mothurOut("\nClustering flowgrams...\n");
499                                 string listFileName = cluster(distFileName, namesFileName);
500                                 
501                                 if (m->control_pressed) { break; }
502                                 
503                 
504                 
505                                 getOTUData(listFileName);
506
507                                 m->mothurRemove(distFileName);
508                                 m->mothurRemove(namesFileName);
509                                 m->mothurRemove(listFileName);
510                                 
511                                 if (m->control_pressed) { break; }
512                                 
513                                 initPyroCluster();
514
515                                 if (m->control_pressed) { break; }
516                                 
517             
518                                 for(int i=1;i<ncpus;i++){
519                                         MPI_Send(&numOTUs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
520                                         MPI_Send(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
521                                         MPI_Send(&uniqueFlowgrams[0], numFlowCells * numUniques, MPI_SHORT, i, tag, MPI_COMM_WORLD);
522                                         MPI_Send(&sigma, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
523                                 }
524                                 
525                                 if (m->control_pressed) { break; }
526                                 
527                                 double maxDelta = 0;
528                                 int iter = 0;
529                                 
530                                 int numOTUsOnCPU = numOTUs / ncpus;
531                                 int numSeqsOnCPU = numSeqs / ncpus;
532                                 m->mothurOut("\nDenoising flowgrams...\n");
533                                 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
534                                 
535                                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
536                                         
537                                         double cycClock = clock();
538                                         unsigned long long cycTime = time(NULL);
539                                         fill();
540                                         
541                                         if (m->control_pressed) { break; }
542
543                                         int total = singleTau.size();
544                                         for(int i=1;i<ncpus;i++){
545                                                 MPI_Send(&total, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
546                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
547                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
548                                                 
549                                                 MPI_Send(&singleTau[0], total, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
550                                                 MPI_Send(&seqNumber[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
551                                                 MPI_Send(&seqIndex[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
552                                                 MPI_Send(&nSeqsPerOTU[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
553                                                 MPI_Send(&cumNumSeqs[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
554                                         }
555                                         
556                                         calcCentroidsDriver(0, numOTUsOnCPU);
557                                         
558                                         for(int i=1;i<ncpus;i++){
559                                                 int otuStart = i * numOTUs / ncpus;
560                                                 int otuStop = (i + 1) * numOTUs / ncpus;
561                                                 
562                                                 vector<int> tempCentroids(numOTUs, 0);
563                                                 vector<short> tempChange(numOTUs, 0);
564                                                 
565                                                 MPI_Recv(&tempCentroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
566                                                 MPI_Recv(&tempChange[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD, &status);
567                                                 
568                                                 for(int j=otuStart;j<otuStop;j++){
569                                                         centroids[j] = tempCentroids[j];
570                                                         change[j] = tempChange[j];
571                                                 }
572                                         }
573                                                                         
574                                         maxDelta = getNewWeights(); if (m->control_pressed) { break; }
575                                         double nLL = getLikelihood(); if (m->control_pressed) { break; }
576                                         checkCentroids(); if (m->control_pressed) { break; }
577                                         
578                                         for(int i=1;i<ncpus;i++){
579                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
580                                                 MPI_Send(&weight[0], numOTUs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
581                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
582                                         }
583                                         
584                                         calcNewDistancesParent(0, numSeqsOnCPU);
585
586                                         total = singleTau.size();
587
588                                         for(int i=1;i<ncpus;i++){
589                                                 int childTotal;
590                                                 int seqStart = i * numSeqs / ncpus;
591                                                 int seqStop = (i + 1) * numSeqs / ncpus;
592                                                 
593                                                 MPI_Recv(&childTotal, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
594
595                                                 vector<int> childSeqIndex(childTotal, 0);
596                                                 vector<double> childSingleTau(childTotal, 0);
597                                                 vector<double> childDist(numSeqs * numOTUs, 0);
598                                                 vector<int> otuIndex(childTotal, 0);
599                                                 
600                                                 MPI_Recv(&childSeqIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
601                                                 MPI_Recv(&childSingleTau[0], childTotal, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
602                                                 MPI_Recv(&childDist[0], numOTUs * numSeqs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
603                                                 MPI_Recv(&otuIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
604
605                                                 int oldTotal = total;
606                                                 total += childTotal;
607                                                 singleTau.resize(total, 0);
608                                                 seqIndex.resize(total, 0);
609                                                 seqNumber.resize(total, 0);
610                                                 
611                                                 int childIndex = 0;
612                                                 
613                                                 for(int j=oldTotal;j<total;j++){
614                                                         int otuI = otuIndex[childIndex];
615                                                         int seqI = childSeqIndex[childIndex];
616                                                         
617                                                         singleTau[j] = childSingleTau[childIndex];
618                                                         
619                                                         aaP[otuI][nSeqsPerOTU[otuI]] = j;
620                                                         aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
621                                                         nSeqsPerOTU[otuI]++;
622                                                         childIndex++;
623                                                 }
624                                                 
625                                                 int index = seqStart * numOTUs;
626                                                 for(int j=seqStart;j<seqStop;j++){
627                                                         for(int k=0;k<numOTUs;k++){
628                                                                 dist[index] = childDist[index];
629                                                                 index++;
630                                                         }
631                                                 }                                       
632                                         }
633                                         
634                                         iter++;
635                                         
636                                         m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');                  
637
638                                         if((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
639                                                 int live = 1;
640                                                 for(int i=1;i<ncpus;i++){
641                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
642                                                 }
643                                         }
644                                         else{
645                                                 int live = 0;
646                                                 for(int i=1;i<ncpus;i++){
647                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD); //send kill command
648                                                 }
649                                         }
650                                         
651                                 }       
652                                 
653                                 if (m->control_pressed) { break; }
654                                 
655                                 m->mothurOut("\nFinalizing...\n");
656                                 fill();
657                                 
658                                 if (m->control_pressed) { break; }
659                                 
660                                 setOTUs();
661                                 
662                                 vector<int> otuCounts(numOTUs, 0);
663                                 for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
664                                 calcCentroidsDriver(0, numOTUs);
665                                 
666                                 if (m->control_pressed) { break; }
667                                 
668                                 writeQualities(otuCounts);      if (m->control_pressed) { break; }
669                                 writeSequences(otuCounts);      if (m->control_pressed) { break; }
670                                 writeNames(otuCounts);          if (m->control_pressed) { break; }
671                                 writeClusters(otuCounts);       if (m->control_pressed) { break; }
672                                 writeGroups();                          if (m->control_pressed) { break; }
673                                 
674                                                                  
675                                 m->mothurOut("Total time to process " + toString(flowFileName) + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');                 
676                         }
677                 }
678                 else{
679                         int abort = 1;
680
681                         MPI_Recv(&abort, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
682                         if(abort){      return 0;       }
683
684                         int numFiles;
685                         MPI_Recv(&numFiles, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
686
687                         for(int i=0;i<numFiles;i++){
688                                 
689                                 if (m->control_pressed) { break; }
690                                 
691                                 //Now into the pyrodist part
692                                 bool live = 1;
693
694                                 char fileName[1024];
695                                 MPI_Recv(&fileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
696                                 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
697                                 MPI_Recv(&numUniques, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
698                                 MPI_Recv(&numFlowCells, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
699                                 
700                                 flowDataIntI.resize(numSeqs * numFlowCells);
701                                 flowDataPrI.resize(numSeqs * numFlowCells);
702                                 mapUniqueToSeq.resize(numSeqs);
703                                 mapSeqToUnique.resize(numSeqs);
704                                 lengths.resize(numSeqs);
705                                 jointLookUp.resize(NUMBINS * NUMBINS);
706
707                                 MPI_Recv(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
708                                 MPI_Recv(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
709                                 MPI_Recv(&mapUniqueToSeq[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
710                                 MPI_Recv(&mapSeqToUnique[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
711                                 MPI_Recv(&lengths[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
712                                 MPI_Recv(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
713                                 MPI_Recv(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
714
715                                 flowFileName = string(fileName);
716                                 int flowDistStart = int(sqrt(float(pid)/float(ncpus)) * numUniques);
717                                 int flowDistEnd = int(sqrt(float(pid+1)/float(ncpus)) * numUniques);
718                                 
719                                 string distanceStringChild = flowDistMPI(flowDistStart, flowDistEnd);
720                                 
721                                 if (m->control_pressed) { break; }
722                                 
723                                 int done = 1;
724                                 MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
725
726                                 //Now into the pyronoise part
727                                 MPI_Recv(&numOTUs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
728                                 
729                                 singleLookUp.resize(HOMOPS * NUMBINS);
730                                 uniqueFlowgrams.resize(numUniques * numFlowCells);
731                                 weight.resize(numOTUs);
732                                 centroids.resize(numOTUs);
733                                 change.resize(numOTUs);
734                                 dist.assign(numOTUs * numSeqs, 0);
735                                 nSeqsPerOTU.resize(numOTUs);
736                                 cumNumSeqs.resize(numOTUs);
737
738                                 MPI_Recv(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
739                                 MPI_Recv(&uniqueFlowgrams[0], uniqueFlowgrams.size(), MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
740                                 MPI_Recv(&sigma, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
741                                 
742                                 int startOTU = pid * numOTUs / ncpus;
743                                 int endOTU = (pid + 1) * numOTUs / ncpus;
744
745                                 int startSeq = pid * numSeqs / ncpus;
746                                 int endSeq = (pid + 1) * numSeqs /ncpus;
747                                 
748                                 int total;
749
750                                 while(live){
751                                         
752                                         if (m->control_pressed) { break; }
753                                         
754                                         MPI_Recv(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
755                                         singleTau.assign(total, 0.0000);
756                                         seqNumber.assign(total, 0);
757                                         seqIndex.assign(total, 0);
758
759                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
760                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
761                                         MPI_Recv(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
762                                         MPI_Recv(&seqNumber[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
763                                         MPI_Recv(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
764                                         MPI_Recv(&nSeqsPerOTU[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
765                                         MPI_Recv(&cumNumSeqs[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
766
767                                         calcCentroidsDriver(startOTU, endOTU);
768
769                                         MPI_Send(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD);
770                                         MPI_Send(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD);
771
772                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
773                                         MPI_Recv(&weight[0], numOTUs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
774                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
775
776                                         vector<int> otuIndex(total, 0);
777                                         calcNewDistancesChildMPI(startSeq, endSeq, otuIndex);
778                                         total = otuIndex.size();
779                                         
780                                         MPI_Send(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
781                                         MPI_Send(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
782                                         MPI_Send(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
783                                         MPI_Send(&dist[0], numOTUs * numSeqs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
784                                         MPI_Send(&otuIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
785                                 
786                                         MPI_Recv(&live, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
787                                 }
788                         }
789                 }
790                 
791                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
792                 
793                 MPI_Barrier(MPI_COMM_WORLD);
794
795                 
796                 if(compositeFASTAFileName != ""){
797                         outputNames.push_back(compositeFASTAFileName); outputTypes["fasta"].push_back(compositeFASTAFileName);
798                         outputNames.push_back(compositeNamesFileName); outputTypes["name"].push_back(compositeNamesFileName); 
799                 }
800
801                 m->mothurOutEndLine();
802                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
803                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
804                 m->mothurOutEndLine();
805                 
806                 return 0;
807
808         }
809         catch(exception& e) {
810                 m->errorOut(e, "ShhherCommand", "execute");
811                 exit(1);
812         }
813 }
814 /**************************************************************************************************/
815 string ShhherCommand::createNamesFile(){
816         try{
817                 
818                 vector<string> duplicateNames(numUniques, "");
819                 for(int i=0;i<numSeqs;i++){
820                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
821                 }
822                 map<string, string> variables; 
823                 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(flowFileName));
824                 string nameFileName = getOutputFileName("name",variables);
825                 
826                 ofstream nameFile;
827                 m->openOutputFile(nameFileName, nameFile);
828                 
829                 for(int i=0;i<numUniques;i++){
830                         
831                         if (m->control_pressed) { break; }
832                         
833             //                  nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
834                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
835                 }
836                 
837                 nameFile.close();
838                 return  nameFileName;
839         }
840         catch(exception& e) {
841                 m->errorOut(e, "ShhherCommand", "createNamesFile");
842                 exit(1);
843         }
844 }
845 /**************************************************************************************************/
846
847 string ShhherCommand::flowDistMPI(int startSeq, int stopSeq){
848         try{            
849                 ostringstream outStream;
850                 outStream.setf(ios::fixed, ios::floatfield);
851                 outStream.setf(ios::dec, ios::basefield);
852                 outStream.setf(ios::showpoint);
853                 outStream.precision(6);
854                 
855                 int begTime = time(NULL);
856                 double begClock = clock();
857                 
858                 for(int i=startSeq;i<stopSeq;i++){
859                         
860                         if (m->control_pressed) { break; }
861                         
862                         for(int j=0;j<i;j++){
863                                 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
864                                 
865                                 if(flowDistance < 1e-6){
866                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
867                                 }
868                                 else if(flowDistance <= cutoff){
869                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
870                                 }
871                         }
872                         if(i % 100 == 0){
873                                 m->mothurOutJustToScreen(toString(i) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
874                         }
875                 }
876                 
877                 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
878                 if(pid != 0){   fDistFileName += ".temp." + toString(pid);      }
879                 
880                 if (m->control_pressed) { return fDistFileName; }
881                 
882                 m->mothurOutJustToScreen(toString(stopSeq) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
883
884                 ofstream distFile(fDistFileName.c_str());
885                 distFile << outStream.str();            
886                 distFile.close();
887                 
888                 return fDistFileName;
889         }
890         catch(exception& e) {
891                 m->errorOut(e, "ShhherCommand", "flowDistMPI");
892                 exit(1);
893         }
894 }
895 /**************************************************************************************************/
896  
897 void ShhherCommand::getOTUData(string listFileName){
898     try {
899         
900         ifstream listFile;
901         m->openInputFile(listFileName, listFile);
902         string label;
903         
904         listFile >> label >> numOTUs;
905         
906         otuData.assign(numSeqs, 0);
907         cumNumSeqs.assign(numOTUs, 0);
908         nSeqsPerOTU.assign(numOTUs, 0);
909         aaP.clear();aaP.resize(numOTUs);
910         
911         seqNumber.clear();
912         aaI.clear();
913         seqIndex.clear();
914         
915         string singleOTU = "";
916         
917         for(int i=0;i<numOTUs;i++){
918             
919             if (m->control_pressed) { break; }
920             
921             listFile >> singleOTU;
922             
923             istringstream otuString(singleOTU);
924             
925             while(otuString){
926                 
927                 string seqName = "";
928                 
929                 for(int j=0;j<singleOTU.length();j++){
930                     char letter = otuString.get();
931                     
932                     if(letter != ','){
933                         seqName += letter;
934                     }
935                     else{
936                         map<string,int>::iterator nmIt = nameMap.find(seqName);
937                         int index = nmIt->second;
938                         
939                         nameMap.erase(nmIt);
940                         
941                         otuData[index] = i;
942                         nSeqsPerOTU[i]++;
943                         aaP[i].push_back(index);
944                         seqName = "";
945                     }
946                 }
947                 
948                 map<string,int>::iterator nmIt = nameMap.find(seqName);
949                 
950                 int index = nmIt->second;
951                 nameMap.erase(nmIt);
952                 
953                 otuData[index] = i;
954                 nSeqsPerOTU[i]++;
955                 aaP[i].push_back(index);        
956                 
957                 otuString.get();
958             }
959             
960             sort(aaP[i].begin(), aaP[i].end());
961             for(int j=0;j<nSeqsPerOTU[i];j++){
962                 seqNumber.push_back(aaP[i][j]);
963             }
964             for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
965                 aaP[i].push_back(0);
966             }
967             
968             
969         }
970         
971         for(int i=1;i<numOTUs;i++){
972             cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
973         }
974         aaI = aaP;
975         seqIndex = seqNumber;
976         
977         listFile.close();       
978         
979     }
980     catch(exception& e) {
981         m->errorOut(e, "ShhherCommand", "getOTUData");
982         exit(1);        
983     }           
984 }
985
986 /**************************************************************************************************/
987
988 void ShhherCommand::initPyroCluster(){                          
989     try{
990         if (numOTUs < processors) { processors = 1; }
991         
992         if (m->debug) { m->mothurOut("[DEBUG]: numSeqs = " + toString(numSeqs) + " numOTUS = " + toString(numOTUs) + " about to alloc a dist vector with size = " + toString((numSeqs * numOTUs)) + ".\n"); }
993         
994         dist.assign(numSeqs * numOTUs, 0);
995         change.assign(numOTUs, 1);
996         centroids.assign(numOTUs, -1);
997         weight.assign(numOTUs, 0);
998         singleTau.assign(numSeqs, 1.0);
999         
1000         nSeqsBreaks.assign(processors+1, 0);
1001         nOTUsBreaks.assign(processors+1, 0);
1002         
1003         if (m->debug) { m->mothurOut("[DEBUG]: made it through the memory allocation.\n"); }
1004         
1005         nSeqsBreaks[0] = 0;
1006         for(int i=0;i<processors;i++){
1007             nSeqsBreaks[i+1] = nSeqsBreaks[i] + (int)((double) numSeqs / (double) processors);
1008             nOTUsBreaks[i+1] = nOTUsBreaks[i] + (int)((double) numOTUs / (double) processors);
1009         }
1010         nSeqsBreaks[processors] = numSeqs;
1011         nOTUsBreaks[processors] = numOTUs;
1012     }
1013     catch(exception& e) {
1014         m->errorOut(e, "ShhherCommand", "initPyroCluster");
1015         exit(1);        
1016     }
1017 }
1018
1019 /**************************************************************************************************/
1020
1021 void ShhherCommand::fill(){
1022     try {
1023         int index = 0;
1024         for(int i=0;i<numOTUs;i++){
1025             
1026             if (m->control_pressed) { break; }
1027             
1028             cumNumSeqs[i] = index;
1029             for(int j=0;j<nSeqsPerOTU[i];j++){
1030                 seqNumber[index] = aaP[i][j];
1031                 seqIndex[index] = aaI[i][j];
1032                 
1033                 index++;
1034             }
1035         }
1036     }
1037     catch(exception& e) {
1038         m->errorOut(e, "ShhherCommand", "fill");
1039         exit(1);        
1040     }           
1041 }
1042
1043 /**************************************************************************************************/
1044  
1045 void ShhherCommand::getFlowData(){
1046     try{
1047         ifstream flowFile;
1048         m->openInputFile(flowFileName, flowFile);
1049         
1050         string seqName;
1051         seqNameVector.clear();
1052         lengths.clear();
1053         flowDataIntI.clear();
1054         nameMap.clear();
1055         
1056         
1057         int currentNumFlowCells;
1058         
1059         float intensity;
1060         
1061         string numFlowTest;
1062         flowFile >> numFlowTest;
1063         
1064         if (!m->isContainingOnlyDigits(numFlowTest)) { m->mothurOut("[ERROR]: expected a number and got " + numFlowTest + ", quitting. Did you use the flow parameter instead of the file parameter?"); m->mothurOutEndLine(); exit(1); }
1065         else { convert(numFlowTest, numFlowCells); }
1066         
1067         int index = 0;//pcluster
1068         while(!flowFile.eof()){
1069             
1070             if (m->control_pressed) { break; }
1071             
1072             flowFile >> seqName >> currentNumFlowCells;
1073             lengths.push_back(currentNumFlowCells);
1074             
1075             seqNameVector.push_back(seqName);
1076             nameMap[seqName] = index++;//pcluster
1077             
1078             for(int i=0;i<numFlowCells;i++){
1079                 flowFile >> intensity;
1080                 if(intensity > 9.99)    {       intensity = 9.99;       }
1081                 int intI = int(100 * intensity + 0.0001);
1082                 flowDataIntI.push_back(intI);
1083             }
1084             m->gobble(flowFile);
1085         }
1086         flowFile.close();
1087         
1088         numSeqs = seqNameVector.size();         
1089         
1090         for(int i=0;i<numSeqs;i++){
1091             
1092             if (m->control_pressed) { break; }
1093             
1094             int iNumFlowCells = i * numFlowCells;
1095             for(int j=lengths[i];j<numFlowCells;j++){
1096                 flowDataIntI[iNumFlowCells + j] = 0;
1097             }
1098         }
1099         
1100     }
1101     catch(exception& e) {
1102         m->errorOut(e, "ShhherCommand", "getFlowData");
1103         exit(1);
1104     }
1105 }
1106 /**************************************************************************************************/
1107 void ShhherCommand::calcNewDistancesChildMPI(int startSeq, int stopSeq, vector<int>& otuIndex){
1108         
1109         try{
1110                 vector<double> newTau(numOTUs,0);
1111                 vector<double> norms(numSeqs, 0);
1112                 otuIndex.clear();
1113                 seqIndex.clear();
1114                 singleTau.clear();
1115                 
1116                 for(int i=startSeq;i<stopSeq;i++){
1117                         
1118                         if (m->control_pressed) { break; }
1119                         
1120                         double offset = 1e8;
1121                         int indexOffset = i * numOTUs;
1122                         
1123                         for(int j=0;j<numOTUs;j++){
1124                                 
1125                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1126                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1127                                 }
1128                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1129                                         offset = dist[indexOffset + j];
1130                                 }
1131                         }
1132                         
1133                         for(int j=0;j<numOTUs;j++){
1134                                 if(weight[j] > MIN_WEIGHT){
1135                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1136                                         norms[i] += newTau[j];
1137                                 }
1138                                 else{
1139                                         newTau[j] = 0.0;
1140                                 }
1141                         }
1142                         
1143                         for(int j=0;j<numOTUs;j++){
1144                 
1145                                 newTau[j] /= norms[i];
1146                                 
1147                                 if(newTau[j] > MIN_TAU){
1148                                         otuIndex.push_back(j);
1149                                         seqIndex.push_back(i);
1150                                         singleTau.push_back(newTau[j]);
1151                                 }
1152                         }
1153                         
1154                 }
1155         }
1156         catch(exception& e) {
1157                 m->errorOut(e, "ShhherCommand", "calcNewDistancesChildMPI");
1158                 exit(1);        
1159         }               
1160 }
1161
1162 /**************************************************************************************************/
1163
1164 void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){
1165     
1166     try{
1167         
1168         int total = 0;
1169         vector<double> newTau(numOTUs,0);
1170         vector<double> norms(numSeqs, 0);
1171         nSeqsPerOTU.assign(numOTUs, 0);
1172         
1173         for(int i=startSeq;i<stopSeq;i++){
1174             
1175             if (m->control_pressed) { break; }
1176             
1177             int indexOffset = i * numOTUs;
1178             
1179             double offset = 1e8;
1180             
1181             for(int j=0;j<numOTUs;j++){
1182                 
1183                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1184                     dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1185                 }
1186                 
1187                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1188                     offset = dist[indexOffset + j];
1189                 }
1190             }
1191             
1192             for(int j=0;j<numOTUs;j++){
1193                 if(weight[j] > MIN_WEIGHT){
1194                     newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1195                     norms[i] += newTau[j];
1196                 }
1197                 else{
1198                     newTau[j] = 0.0;
1199                 }
1200             }
1201             
1202             for(int j=0;j<numOTUs;j++){
1203                 newTau[j] /= norms[i];
1204             }
1205             
1206             for(int j=0;j<numOTUs;j++){
1207                 if(newTau[j] > MIN_TAU){
1208                     
1209                     int oldTotal = total;
1210                     
1211                     total++;
1212                     
1213                     singleTau.resize(total, 0);
1214                     seqNumber.resize(total, 0);
1215                     seqIndex.resize(total, 0);
1216                     
1217                     singleTau[oldTotal] = newTau[j];
1218                     
1219                     aaP[j][nSeqsPerOTU[j]] = oldTotal;
1220                     aaI[j][nSeqsPerOTU[j]] = i;
1221                     nSeqsPerOTU[j]++;
1222                 }
1223             }
1224             
1225         }
1226         
1227     }
1228     catch(exception& e) {
1229         m->errorOut(e, "ShhherCommand", "calcNewDistancesParent");
1230         exit(1);        
1231     }           
1232 }
1233
1234 /**************************************************************************************************/
1235
1236 void ShhherCommand::setOTUs(){
1237     
1238     try {
1239         vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
1240         
1241         for(int i=0;i<numOTUs;i++){
1242             
1243             if (m->control_pressed) { break; }
1244             
1245             for(int j=0;j<nSeqsPerOTU[i];j++){
1246                 int index = cumNumSeqs[i] + j;
1247                 double tauValue = singleTau[seqNumber[index]];
1248                 int sIndex = seqIndex[index];
1249                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
1250             }
1251         }
1252         
1253         for(int i=0;i<numSeqs;i++){
1254             double maxTau = -1.0000;
1255             int maxOTU = -1;
1256             for(int j=0;j<numOTUs;j++){
1257                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
1258                     maxTau = bigTauMatrix[i * numOTUs + j];
1259                     maxOTU = j;
1260                 }
1261             }
1262             
1263             otuData[i] = maxOTU;
1264         }
1265         
1266         nSeqsPerOTU.assign(numOTUs, 0);         
1267         
1268         for(int i=0;i<numSeqs;i++){
1269             int index = otuData[i];
1270             
1271             singleTau[i] = 1.0000;
1272             dist[i] = 0.0000;
1273             
1274             aaP[index][nSeqsPerOTU[index]] = i;
1275             aaI[index][nSeqsPerOTU[index]] = i;
1276             
1277             nSeqsPerOTU[index]++;
1278         }
1279         fill(); 
1280     }
1281     catch(exception& e) {
1282         m->errorOut(e, "ShhherCommand", "setOTUs");
1283         exit(1);        
1284     }           
1285 }
1286
1287 /**************************************************************************************************/
1288  
1289 void ShhherCommand::getUniques(){
1290     try{
1291         
1292         
1293         numUniques = 0;
1294         uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
1295         uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
1296         uniqueLengths.assign(numSeqs, 0);
1297         mapSeqToUnique.assign(numSeqs, -1);
1298         mapUniqueToSeq.assign(numSeqs, -1);
1299         
1300         vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
1301         
1302         for(int i=0;i<numSeqs;i++){
1303             
1304             if (m->control_pressed) { break; }
1305             
1306             int index = 0;
1307             
1308             vector<short> current(numFlowCells);
1309             for(int j=0;j<numFlowCells;j++){
1310                 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
1311             }
1312             
1313             for(int j=0;j<numUniques;j++){
1314                 int offset = j * numFlowCells;
1315                 bool toEnd = 1;
1316                 
1317                 int shorterLength;
1318                 if(lengths[i] < uniqueLengths[j])       {       shorterLength = lengths[i];                     }
1319                 else                                                            {       shorterLength = uniqueLengths[j];       }
1320                 
1321                 for(int k=0;k<shorterLength;k++){
1322                     if(current[k] != uniqueFlowgrams[offset + k]){
1323                         toEnd = 0;
1324                         break;
1325                     }
1326                 }
1327                 
1328                 if(toEnd){
1329                     mapSeqToUnique[i] = j;
1330                     uniqueCount[j]++;
1331                     index = j;
1332                     if(lengths[i] > uniqueLengths[j])   {       uniqueLengths[j] = lengths[i];  }
1333                     break;
1334                 }
1335                 index++;
1336             }
1337             
1338             if(index == numUniques){
1339                 uniqueLengths[numUniques] = lengths[i];
1340                 uniqueCount[numUniques] = 1;
1341                 mapSeqToUnique[i] = numUniques;//anMap
1342                 mapUniqueToSeq[numUniques] = i;//anF
1343                 
1344                 for(int k=0;k<numFlowCells;k++){
1345                     uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
1346                     uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
1347                 }
1348                 
1349                 numUniques++;
1350             }
1351         }
1352         uniqueFlowDataIntI.resize(numFlowCells * numUniques);
1353         uniqueLengths.resize(numUniques);       
1354         
1355         flowDataPrI.resize(numSeqs * numFlowCells, 0);
1356         for(int i=0;i<flowDataPrI.size();i++)   {       if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);          }
1357     }
1358     catch(exception& e) {
1359         m->errorOut(e, "ShhherCommand", "getUniques");
1360         exit(1);
1361     }
1362 }
1363
1364 /**************************************************************************************************/
1365
1366 float ShhherCommand::calcPairwiseDist(int seqA, int seqB){
1367     try{
1368         int minLength = lengths[mapSeqToUnique[seqA]];
1369         if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
1370         
1371         int ANumFlowCells = seqA * numFlowCells;
1372         int BNumFlowCells = seqB * numFlowCells;
1373         
1374         float dist = 0;
1375         
1376         for(int i=0;i<minLength;i++){
1377             
1378             if (m->control_pressed) { break; }
1379             
1380             int flowAIntI = flowDataIntI[ANumFlowCells + i];
1381             float flowAPrI = flowDataPrI[ANumFlowCells + i];
1382             
1383             int flowBIntI = flowDataIntI[BNumFlowCells + i];
1384             float flowBPrI = flowDataPrI[BNumFlowCells + i];
1385             dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
1386         }
1387         
1388         dist /= (float) minLength;
1389         return dist;
1390     }
1391     catch(exception& e) {
1392         m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
1393         exit(1);
1394     }
1395 }
1396
1397 //**********************************************************************************************************************/
1398
1399 string ShhherCommand::cluster(string distFileName, string namesFileName){
1400     try {
1401         
1402         ReadMatrix* read = new ReadColumnMatrix(distFileName);  
1403                 read->setCutoff(cutoff);
1404                 
1405                 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
1406                 clusterNameMap->readMap();
1407                 read->read(clusterNameMap);
1408         
1409                 ListVector* list = read->getListVector();
1410                 SparseDistanceMatrix* matrix = read->getDMatrix();
1411                 
1412                 delete read; 
1413                 delete clusterNameMap; 
1414         
1415         RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
1416         
1417         Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); 
1418         string tag = cluster->getTag();
1419         
1420         double clusterCutoff = cutoff;
1421         while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
1422             
1423             if (m->control_pressed) { break; }
1424             
1425             cluster->update(clusterCutoff);
1426         }
1427         
1428         list->setLabel(toString(cutoff));
1429         
1430         string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
1431         ofstream listFile;
1432         m->openOutputFile(listFileName, listFile);
1433         list->print(listFile);
1434         listFile.close();
1435         
1436         delete matrix;  delete cluster; delete rabund; delete list;
1437         
1438         return listFileName;
1439     }
1440     catch(exception& e) {
1441         m->errorOut(e, "ShhherCommand", "cluster");
1442         exit(1);        
1443     }           
1444 }
1445
1446 /**************************************************************************************************/
1447
1448 void ShhherCommand::calcCentroidsDriver(int start, int finish){                          
1449     
1450     //this function gets the most likely homopolymer length at a flow position for a group of sequences
1451     //within an otu
1452     
1453     try{
1454         
1455         for(int i=start;i<finish;i++){
1456             
1457             if (m->control_pressed) { break; }
1458             
1459             double count = 0;
1460             int position = 0;
1461             int minFlowGram = 100000000;
1462             double minFlowValue = 1e8;
1463             change[i] = 0; //FALSE
1464             
1465             for(int j=0;j<nSeqsPerOTU[i];j++){
1466                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
1467             }
1468             
1469             if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
1470                 vector<double> adF(nSeqsPerOTU[i]);
1471                 vector<int> anL(nSeqsPerOTU[i]);
1472                 
1473                 for(int j=0;j<nSeqsPerOTU[i];j++){
1474                     int index = cumNumSeqs[i] + j;
1475                     int nI = seqIndex[index];
1476                     int nIU = mapSeqToUnique[nI];
1477                     
1478                     int k;
1479                     for(k=0;k<position;k++){
1480                         if(nIU == anL[k]){
1481                             break;
1482                         }
1483                     }
1484                     if(k == position){
1485                         anL[position] = nIU;
1486                         adF[position] = 0.0000;
1487                         position++;
1488                     }                                           
1489                 }
1490                 
1491                 for(int j=0;j<nSeqsPerOTU[i];j++){
1492                     int index = cumNumSeqs[i] + j;
1493                     int nI = seqIndex[index];
1494                     
1495                     double tauValue = singleTau[seqNumber[index]];
1496                     
1497                     for(int k=0;k<position;k++){
1498                         double dist = getDistToCentroid(anL[k], nI, lengths[nI]);
1499                         adF[k] += dist * tauValue;
1500                     }
1501                 }
1502                 
1503                 for(int j=0;j<position;j++){
1504                     if(adF[j] < minFlowValue){
1505                         minFlowGram = j;
1506                         minFlowValue = adF[j];
1507                     }
1508                 }
1509                 
1510                 if(centroids[i] != anL[minFlowGram]){
1511                     change[i] = 1;
1512                     centroids[i] = anL[minFlowGram];
1513                 }
1514             }
1515             else if(centroids[i] != -1){
1516                 change[i] = 1;
1517                 centroids[i] = -1;                      
1518             }
1519         }
1520     }
1521     catch(exception& e) {
1522         m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1523         exit(1);        
1524     }           
1525 }
1526
1527 /**************************************************************************************************/
1528
1529 double ShhherCommand::getDistToCentroid(int cent, int flow, int length){
1530     try{
1531         
1532         int flowAValue = cent * numFlowCells;
1533         int flowBValue = flow * numFlowCells;
1534         
1535         double dist = 0;
1536         
1537         for(int i=0;i<length;i++){
1538             dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
1539             flowAValue++;
1540             flowBValue++;
1541         }
1542         
1543         return dist / (double)length;
1544     }
1545     catch(exception& e) {
1546         m->errorOut(e, "ShhherCommand", "getDistToCentroid");
1547         exit(1);        
1548     }           
1549 }
1550
1551 /**************************************************************************************************/
1552
1553 double ShhherCommand::getNewWeights(){
1554     try{
1555         
1556         double maxChange = 0;
1557         
1558         for(int i=0;i<numOTUs;i++){
1559             
1560             if (m->control_pressed) { break; }
1561             
1562             double difference = weight[i];
1563             weight[i] = 0;
1564             
1565             for(int j=0;j<nSeqsPerOTU[i];j++){
1566                 int index = cumNumSeqs[i] + j;
1567                 double tauValue = singleTau[seqNumber[index]];
1568                 weight[i] += tauValue;
1569             }
1570             
1571             difference = fabs(weight[i] - difference);
1572             if(difference > maxChange){ maxChange = difference; }
1573         }
1574         return maxChange;
1575     }
1576     catch(exception& e) {
1577         m->errorOut(e, "ShhherCommand", "getNewWeights");
1578         exit(1);        
1579     }           
1580 }
1581  
1582  /**************************************************************************************************/
1583  
1584 double ShhherCommand::getLikelihood(){
1585     
1586     try{
1587         
1588         vector<long double> P(numSeqs, 0);
1589         int effNumOTUs = 0;
1590         
1591         for(int i=0;i<numOTUs;i++){
1592             if(weight[i] > MIN_WEIGHT){
1593                 effNumOTUs++;
1594             }
1595         }
1596         
1597         string hold;
1598         for(int i=0;i<numOTUs;i++){
1599             
1600             if (m->control_pressed) { break; }
1601             
1602             for(int j=0;j<nSeqsPerOTU[i];j++){
1603                 int index = cumNumSeqs[i] + j;
1604                 int nI = seqIndex[index];
1605                 double singleDist = dist[seqNumber[index]];
1606                 
1607                 P[nI] += weight[i] * exp(-singleDist * sigma);
1608             }
1609         }
1610         double nLL = 0.00;
1611         for(int i=0;i<numSeqs;i++){
1612             if(P[i] == 0){      P[i] = DBL_EPSILON;     }
1613             
1614             nLL += -log(P[i]);
1615         }
1616         
1617         nLL = nLL -(double)numSeqs * log(sigma);
1618         
1619         return nLL; 
1620     }
1621     catch(exception& e) {
1622         m->errorOut(e, "ShhherCommand", "getNewWeights");
1623         exit(1);        
1624     }           
1625 }
1626
1627 /**************************************************************************************************/
1628
1629 void ShhherCommand::checkCentroids(){
1630     try{
1631         vector<int> unique(numOTUs, 1);
1632         
1633         for(int i=0;i<numOTUs;i++){
1634             if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
1635                 unique[i] = -1;
1636             }
1637         }
1638         
1639         for(int i=0;i<numOTUs;i++){
1640             
1641             if (m->control_pressed) { break; }
1642             
1643             if(unique[i] == 1){
1644                 for(int j=i+1;j<numOTUs;j++){
1645                     if(unique[j] == 1){
1646                         
1647                         if(centroids[j] == centroids[i]){
1648                             unique[j] = 0;
1649                             centroids[j] = -1;
1650                             
1651                             weight[i] += weight[j];
1652                             weight[j] = 0.0;
1653                         }
1654                     }
1655                 }
1656             }
1657         }
1658     }
1659     catch(exception& e) {
1660         m->errorOut(e, "ShhherCommand", "checkCentroids");
1661         exit(1);        
1662     }           
1663 }
1664  /**************************************************************************************************/
1665
1666
1667  
1668 void ShhherCommand::writeQualities(vector<int> otuCounts){
1669     
1670     try {
1671         string thisOutputDir = outputDir;
1672         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1673         map<string, string> variables; 
1674                 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(flowFileName));
1675         string qualityFileName = getOutputFileName("qfile",variables);
1676         
1677         ofstream qualityFile;
1678         m->openOutputFile(qualityFileName, qualityFile);
1679         
1680         qualityFile.setf(ios::fixed, ios::floatfield);
1681         qualityFile.setf(ios::showpoint);
1682         qualityFile << setprecision(6);
1683         
1684         vector<vector<int> > qualities(numOTUs);
1685         vector<double> pr(HOMOPS, 0);
1686         
1687         
1688         for(int i=0;i<numOTUs;i++){
1689             
1690             if (m->control_pressed) { break; }
1691             
1692             int index = 0;
1693             int base = 0;
1694             
1695             if(nSeqsPerOTU[i] > 0){
1696                 qualities[i].assign(1024, -1);
1697                 
1698                 while(index < numFlowCells){
1699                     double maxPrValue = 1e8;
1700                     short maxPrIndex = -1;
1701                     double count = 0.0000;
1702                     
1703                     pr.assign(HOMOPS, 0);
1704                     
1705                     for(int j=0;j<nSeqsPerOTU[i];j++){
1706                         int lIndex = cumNumSeqs[i] + j;
1707                         double tauValue = singleTau[seqNumber[lIndex]];
1708                         int sequenceIndex = aaI[i][j];
1709                         short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
1710                         
1711                         count += tauValue;
1712                         
1713                         for(int s=0;s<HOMOPS;s++){
1714                             pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
1715                         }
1716                     }
1717                     
1718                     maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
1719                     maxPrValue = pr[maxPrIndex];
1720                     
1721                     if(count > MIN_COUNT){
1722                         double U = 0.0000;
1723                         double norm = 0.0000;
1724                         
1725                         for(int s=0;s<HOMOPS;s++){
1726                             norm += exp(-(pr[s] - maxPrValue));
1727                         }
1728                         
1729                         for(int s=1;s<=maxPrIndex;s++){
1730                             int value = 0;
1731                             double temp = 0.0000;
1732                             
1733                             U += exp(-(pr[s-1]-maxPrValue))/norm;
1734                             
1735                             if(U>0.00){
1736                                 temp = log10(U);
1737                             }
1738                             else{
1739                                 temp = -10.1;
1740                             }
1741                             temp = floor(-10 * temp);
1742                             value = (int)floor(temp);
1743                             if(value > 100){    value = 100;    }
1744                             
1745                             qualities[i][base] = (int)value;
1746                             base++;
1747                         }
1748                     }
1749                     
1750                     index++;
1751                 }
1752             }
1753             
1754             
1755             if(otuCounts[i] > 0){
1756                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
1757                 
1758                 int j=4;        //need to get past the first four bases
1759                 while(qualities[i][j] != -1){
1760                     qualityFile << qualities[i][j] << ' ';
1761                     j++;
1762                 }
1763                 qualityFile << endl;
1764             }
1765         }
1766         qualityFile.close();
1767         outputNames.push_back(qualityFileName); outputTypes["qfile"].push_back(qualityFileName);
1768         
1769     }
1770     catch(exception& e) {
1771         m->errorOut(e, "ShhherCommand", "writeQualities");
1772         exit(1);        
1773     }           
1774 }
1775
1776 /**************************************************************************************************/
1777
1778 void ShhherCommand::writeSequences(vector<int> otuCounts){
1779     try {
1780         string thisOutputDir = outputDir;
1781         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1782         map<string, string> variables; 
1783                 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
1784         string fastaFileName = getOutputFileName("fasta",variables);
1785         ofstream fastaFile;
1786         m->openOutputFile(fastaFileName, fastaFile);
1787         
1788         vector<string> names(numOTUs, "");
1789         
1790         for(int i=0;i<numOTUs;i++){
1791             
1792             if (m->control_pressed) { break; }
1793             
1794             int index = centroids[i];
1795             
1796             if(otuCounts[i] > 0){
1797                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
1798                 
1799                 string newSeq = "";
1800                 
1801                 for(int j=0;j<numFlowCells;j++){
1802                     
1803                     char base = flowOrder[j % flowOrder.length()];
1804                     for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
1805                         newSeq += base;
1806                     }
1807                 }
1808                 
1809                 fastaFile << newSeq.substr(4) << endl;
1810             }
1811         }
1812         fastaFile.close();
1813         
1814         outputNames.push_back(fastaFileName); outputTypes["fasta"].push_back(fastaFileName);
1815         
1816         if(compositeFASTAFileName != ""){
1817             m->appendFiles(fastaFileName, compositeFASTAFileName);
1818         }
1819     }
1820     catch(exception& e) {
1821         m->errorOut(e, "ShhherCommand", "writeSequences");
1822         exit(1);        
1823     }           
1824 }
1825
1826 /**************************************************************************************************/
1827
1828 void ShhherCommand::writeNames(vector<int> otuCounts){
1829     try {
1830         string thisOutputDir = outputDir;
1831         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1832         map<string, string> variables; 
1833                 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
1834         string nameFileName = getOutputFileName("name",variables);
1835         ofstream nameFile;
1836         m->openOutputFile(nameFileName, nameFile);
1837         
1838         for(int i=0;i<numOTUs;i++){
1839             
1840             if (m->control_pressed) { break; }
1841             
1842             if(otuCounts[i] > 0){
1843                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
1844                 
1845                 for(int j=1;j<nSeqsPerOTU[i];j++){
1846                     nameFile << ',' << seqNameVector[aaI[i][j]];
1847                 }
1848                 
1849                 nameFile << endl;
1850             }
1851         }
1852         nameFile.close();
1853         outputNames.push_back(nameFileName); outputTypes["name"].push_back(nameFileName);
1854         
1855         
1856         if(compositeNamesFileName != ""){
1857             m->appendFiles(nameFileName, compositeNamesFileName);
1858         }               
1859     }
1860     catch(exception& e) {
1861         m->errorOut(e, "ShhherCommand", "writeNames");
1862         exit(1);        
1863     }           
1864 }
1865
1866 /**************************************************************************************************/
1867
1868 void ShhherCommand::writeGroups(){
1869     try {
1870         string thisOutputDir = outputDir;
1871         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1872         string fileRoot = m->getRootName(m->getSimpleName(flowFileName));
1873         int pos = fileRoot.find_first_of('.');
1874         string fileGroup = fileRoot;
1875         if (pos != string::npos) {  fileGroup = fileRoot.substr(pos+1, (fileRoot.length()-1-(pos+1)));  }
1876         map<string, string> variables; 
1877                 variables["[filename]"] = thisOutputDir + fileRoot;
1878         string groupFileName = getOutputFileName("group",variables);
1879         ofstream groupFile;
1880         m->openOutputFile(groupFileName, groupFile);
1881         
1882         for(int i=0;i<numSeqs;i++){
1883             if (m->control_pressed) { break; }
1884             groupFile << seqNameVector[i] << '\t' << fileGroup << endl;
1885         }
1886         groupFile.close();
1887         outputNames.push_back(groupFileName); outputTypes["group"].push_back(groupFileName);
1888         
1889     }
1890     catch(exception& e) {
1891         m->errorOut(e, "ShhherCommand", "writeGroups");
1892         exit(1);        
1893     }           
1894 }
1895
1896 /**************************************************************************************************/
1897
1898 void ShhherCommand::writeClusters(vector<int> otuCounts){
1899     try {
1900         string thisOutputDir = outputDir;
1901         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1902         map<string, string> variables; 
1903                 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
1904         string otuCountsFileName = getOutputFileName("counts",variables);
1905         ofstream otuCountsFile;
1906         m->openOutputFile(otuCountsFileName, otuCountsFile);
1907         
1908         string bases = flowOrder;
1909         
1910         for(int i=0;i<numOTUs;i++){
1911             
1912             if (m->control_pressed) {
1913                 break;
1914             }
1915             //output the translated version of the centroid sequence for the otu
1916             if(otuCounts[i] > 0){
1917                 int index = centroids[i];
1918                 
1919                 otuCountsFile << "ideal\t";
1920                 for(int j=8;j<numFlowCells;j++){
1921                     char base = bases[j % bases.length()];
1922                     for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
1923                         otuCountsFile << base;
1924                     }
1925                 }
1926                 otuCountsFile << endl;
1927                 
1928                 for(int j=0;j<nSeqsPerOTU[i];j++){
1929                     int sequence = aaI[i][j];
1930                     otuCountsFile << seqNameVector[sequence] << '\t';
1931                     
1932                     string newSeq = "";
1933                     
1934                     for(int k=0;k<lengths[sequence];k++){
1935                         char base = bases[k % bases.length()];
1936                         int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
1937                         
1938                         for(int s=0;s<freq;s++){
1939                             newSeq += base;
1940                             //otuCountsFile << base;
1941                         }
1942                     }
1943                     otuCountsFile << newSeq.substr(4) << endl;
1944                 }
1945                 otuCountsFile << endl;
1946             }
1947         }
1948         otuCountsFile.close();
1949         outputNames.push_back(otuCountsFileName); outputTypes["counts"].push_back(otuCountsFileName);
1950         
1951     }
1952     catch(exception& e) {
1953         m->errorOut(e, "ShhherCommand", "writeClusters");
1954         exit(1);        
1955     }           
1956 }
1957
1958 #else
1959 //**********************************************************************************************************************
1960
1961 int ShhherCommand::execute(){
1962         try {
1963                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
1964                 
1965                 getSingleLookUp();      if (m->control_pressed) { return 0; }
1966                 getJointLookUp();       if (m->control_pressed) { return 0; }
1967                 
1968         int numFiles = flowFileVector.size();
1969                 
1970         if (numFiles < processors) { processors = numFiles; }
1971         
1972 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1973         if (processors == 1) { driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName); }
1974         else { createProcesses(flowFileVector); } //each processor processes one file
1975 #else
1976         driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName);
1977 #endif
1978         
1979                 if(compositeFASTAFileName != ""){
1980                         outputNames.push_back(compositeFASTAFileName); outputTypes["fasta"].push_back(compositeFASTAFileName);
1981                         outputNames.push_back(compositeNamesFileName); outputTypes["name"].push_back(compositeNamesFileName);
1982                 }
1983
1984                 m->mothurOutEndLine();
1985                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
1986                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
1987                 m->mothurOutEndLine();
1988                 
1989                 return 0;
1990         }
1991         catch(exception& e) {
1992                 m->errorOut(e, "ShhherCommand", "execute");
1993                 exit(1);
1994         }
1995 }
1996 #endif
1997 //********************************************************************************************************************
1998 //sorts biggest to smallest
1999 inline bool compareFileSizes(string left, string right){
2000     
2001     FILE * pFile;
2002     long leftsize = 0;
2003     
2004     //get num bytes in file
2005     string filename = left;
2006     pFile = fopen (filename.c_str(),"rb");
2007     string error = "Error opening " + filename;
2008     if (pFile==NULL) perror (error.c_str());
2009     else{
2010         fseek (pFile, 0, SEEK_END);
2011         leftsize=ftell (pFile);
2012         fclose (pFile);
2013     }
2014     
2015     FILE * pFile2;
2016     long rightsize = 0;
2017     
2018     //get num bytes in file
2019     filename = right;
2020     pFile2 = fopen (filename.c_str(),"rb");
2021     error = "Error opening " + filename;
2022     if (pFile2==NULL) perror (error.c_str());
2023     else{
2024         fseek (pFile2, 0, SEEK_END);
2025         rightsize=ftell (pFile2);
2026         fclose (pFile2);
2027     }
2028     
2029     return (leftsize > rightsize);      
2030
2031 /**************************************************************************************************/
2032
2033 int ShhherCommand::createProcesses(vector<string> filenames){
2034     try {
2035         vector<int> processIDS;
2036                 int process = 1;
2037                 int num = 0;
2038                 
2039                 //sanity check
2040                 if (filenames.size() < processors) { processors = filenames.size(); }
2041         
2042         //sort file names by size to divide load better
2043         sort(filenames.begin(), filenames.end(), compareFileSizes);
2044         
2045         vector < vector <string> > dividedFiles; //dividedFiles[1] = vector of filenames for process 1...
2046         dividedFiles.resize(processors);
2047         
2048         //for each file, figure out which process will complete it
2049         //want to divide the load intelligently so the big files are spread between processes
2050         for (int i = 0; i < filenames.size(); i++) { 
2051             int processToAssign = (i+1) % processors; 
2052             if (processToAssign == 0) { processToAssign = processors; }
2053             
2054             dividedFiles[(processToAssign-1)].push_back(filenames[i]);
2055         }
2056         
2057         //now lets reverse the order of ever other process, so we balance big files running with little ones
2058         for (int i = 0; i < processors; i++) {
2059             int remainder = ((i+1) % processors);
2060             if (remainder) {  reverse(dividedFiles[i].begin(), dividedFiles[i].end());  }
2061         }
2062         
2063                         
2064                 //divide the groups between the processors
2065                 /*vector<linePair> lines;
2066         vector<int> numFilesToComplete;
2067                 int numFilesPerProcessor = filenames.size() / processors;
2068                 for (int i = 0; i < processors; i++) {
2069                         int startIndex =  i * numFilesPerProcessor;
2070                         int endIndex = (i+1) * numFilesPerProcessor;
2071                         if(i == (processors - 1)){      endIndex = filenames.size();    }
2072                         lines.push_back(linePair(startIndex, endIndex));
2073             numFilesToComplete.push_back((endIndex-startIndex));
2074                 }*/
2075                 
2076         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)          
2077                 
2078                 //loop through and create all the processes you want
2079                 while (process != processors) {
2080                         int pid = fork();
2081                         
2082                         if (pid > 0) {
2083                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
2084                                 process++;
2085                         }else if (pid == 0){
2086                                 num = driver(dividedFiles[process], compositeFASTAFileName + toString(getpid()) + ".temp", compositeNamesFileName  + toString(getpid()) + ".temp");
2087                 
2088                 //pass numSeqs to parent
2089                                 ofstream out;
2090                                 string tempFile = compositeFASTAFileName + toString(getpid()) + ".num.temp";
2091                                 m->openOutputFile(tempFile, out);
2092                                 out << num << endl;
2093                                 out.close();
2094                 
2095                                 exit(0);
2096                         }else { 
2097                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
2098                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
2099                                 exit(0);
2100                         }
2101                 }
2102                 
2103                 //do my part
2104                 driver(dividedFiles[0], compositeFASTAFileName, compositeNamesFileName);
2105                 
2106                 //force parent to wait until all the processes are done
2107                 for (int i=0;i<processIDS.size();i++) { 
2108                         int temp = processIDS[i];
2109                         wait(&temp);
2110                 }
2111         
2112         #else
2113         
2114         //////////////////////////////////////////////////////////////////////////////////////////////////////
2115         
2116         /////////////////////// NOT WORKING, ACCESS VIOLATION ON READ OF FLOWGRAMS IN THREAD /////////////////
2117         
2118         //////////////////////////////////////////////////////////////////////////////////////////////////////
2119                 //Windows version shared memory, so be careful when passing variables through the shhhFlowsData struct. 
2120                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
2121                 //////////////////////////////////////////////////////////////////////////////////////////////////////
2122                 /*
2123                 vector<shhhFlowsData*> pDataArray; 
2124                 DWORD   dwThreadIdArray[processors-1];
2125                 HANDLE  hThreadArray[processors-1]; 
2126                 
2127                 //Create processor worker threads.
2128                 for( int i=0; i<processors-1; i++ ){
2129                         // Allocate memory for thread data.
2130                         string extension = "";
2131                         if (i != 0) { extension = toString(i) + ".temp"; }
2132                         
2133             shhhFlowsData* tempFlow = new shhhFlowsData(filenames, (compositeFASTAFileName + extension), (compositeNamesFileName + extension), outputDir, flowOrder, jointLookUp, singleLookUp, m, lines[i].start, lines[i].end, cutoff, sigma, minDelta, maxIters, i);
2134                         pDataArray.push_back(tempFlow);
2135                         processIDS.push_back(i);
2136             
2137                         hThreadArray[i] = CreateThread(NULL, 0, ShhhFlowsThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);   
2138                 }
2139                 
2140                 //using the main process as a worker saves time and memory
2141                 //do my part
2142                 driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[processors-1].start, lines[processors-1].end);
2143                 
2144                 //Wait until all threads have terminated.
2145                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
2146                 
2147                 //Close all thread handles and free memory allocations.
2148                 for(int i=0; i < pDataArray.size(); i++){
2149                         for(int j=0; j < pDataArray[i]->outputNames.size(); j++){ outputNames.push_back(pDataArray[i]->outputNames[j]); }
2150                         CloseHandle(hThreadArray[i]);
2151                         delete pDataArray[i];
2152                 }
2153                 */
2154         #endif
2155         
2156         for (int i=0;i<processIDS.size();i++) { 
2157             ifstream in;
2158                         string tempFile =  compositeFASTAFileName + toString(processIDS[i]) + ".num.temp";
2159                         m->openInputFile(tempFile, in);
2160                         if (!in.eof()) { 
2161                 int tempNum = 0; 
2162                 in >> tempNum; 
2163                 if (tempNum != dividedFiles[i+1].size()) {
2164                     m->mothurOut("[ERROR]: main process expected " + toString(processIDS[i]) + " to complete " + toString(dividedFiles[i+1].size()) + " files, and it only reported completing " + toString(tempNum) + ". This will cause file mismatches.  The flow files may be too large to process with multiple processors. \n");
2165                 }
2166             }
2167                         in.close(); m->mothurRemove(tempFile);
2168             
2169             if (compositeFASTAFileName != "") {
2170                 m->appendFiles((compositeFASTAFileName + toString(processIDS[i]) + ".temp"), compositeFASTAFileName);
2171                 m->appendFiles((compositeNamesFileName + toString(processIDS[i]) + ".temp"), compositeNamesFileName);
2172                 m->mothurRemove((compositeFASTAFileName + toString(processIDS[i]) + ".temp"));
2173                 m->mothurRemove((compositeNamesFileName + toString(processIDS[i]) + ".temp"));
2174             }
2175         }
2176         
2177         return 0;
2178         
2179     }
2180         catch(exception& e) {
2181                 m->errorOut(e, "ShhherCommand", "createProcesses");
2182                 exit(1);
2183         }
2184 }
2185 /**************************************************************************************************/
2186
2187 vector<string> ShhherCommand::parseFlowFiles(string filename){
2188     try {
2189         vector<string> files;
2190         int count = 0;
2191         
2192         ifstream in;
2193         m->openInputFile(filename, in);
2194         
2195         int thisNumFLows = 0;
2196         in >> thisNumFLows; m->gobble(in);
2197         
2198         while (!in.eof()) {
2199             if (m->control_pressed) { break; }
2200             
2201             ofstream out;
2202             string outputFileName = filename + toString(count) + ".temp";
2203             m->openOutputFile(outputFileName, out);
2204             out << thisNumFLows << endl;
2205             files.push_back(outputFileName);
2206             
2207             int numLinesWrote = 0;
2208             for (int i = 0; i < largeSize; i++) {
2209                 if (in.eof()) { break; }
2210                 string line = m->getline(in); m->gobble(in);
2211                 out << line << endl;
2212                 numLinesWrote++;
2213             }
2214             out.close();
2215             
2216             if (numLinesWrote == 0) {  m->mothurRemove(outputFileName); files.pop_back();  }
2217             count++;
2218         }
2219         in.close();
2220         
2221         if (m->control_pressed) { for (int i = 0; i < files.size(); i++) { m->mothurRemove(files[i]); }  files.clear(); }
2222         
2223         m->mothurOut("\nDivided " + filename + " into " + toString(files.size()) + " files.\n\n"); 
2224         
2225         return files;
2226     }
2227         catch(exception& e) {
2228                 m->errorOut(e, "ShhherCommand", "parseFlowFiles");
2229                 exit(1);
2230         }
2231 }
2232 /**************************************************************************************************/
2233
2234 int ShhherCommand::driver(vector<string> filenames, string thisCompositeFASTAFileName, string thisCompositeNamesFileName){
2235     try {
2236         
2237         int numCompleted = 0;
2238         
2239         for(int i=0;i<filenames.size();i++){
2240                         
2241                         if (m->control_pressed) { break; }
2242                         
2243             vector<string> theseFlowFileNames; theseFlowFileNames.push_back(filenames[i]);
2244             if (large) {  theseFlowFileNames = parseFlowFiles(filenames[i]);  }
2245             
2246             if (m->control_pressed) { break; }
2247             
2248             double begClock = clock();
2249             unsigned long long begTime;
2250             
2251             string fileNameForOutput = filenames[i];
2252             
2253             for (int g = 0; g < theseFlowFileNames.size(); g++) {
2254                 
2255                 string flowFileName = theseFlowFileNames[g];
2256                 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(filenames.size()) + ")\t<<<<<\n");
2257                 m->mothurOut("Reading flowgrams...\n");
2258                 
2259                 vector<string> seqNameVector;
2260                 vector<int> lengths;
2261                 vector<short> flowDataIntI;
2262                 vector<double> flowDataPrI;
2263                 map<string, int> nameMap;
2264                 vector<short> uniqueFlowgrams;
2265                 vector<int> uniqueCount;
2266                 vector<int> mapSeqToUnique;
2267                 vector<int> mapUniqueToSeq;
2268                 vector<int> uniqueLengths;
2269                 int numFlowCells;
2270                 
2271                 if (m->debug) { m->mothurOut("[DEBUG]: About to read flowgrams.\n"); }
2272                 int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells);
2273                 
2274                 if (m->control_pressed) { break; }
2275                 
2276                 m->mothurOut("Identifying unique flowgrams...\n");
2277                 int numUniques = getUniques(numSeqs, numFlowCells, uniqueFlowgrams, uniqueCount, uniqueLengths, mapSeqToUnique, mapUniqueToSeq, lengths, flowDataPrI, flowDataIntI);
2278                 
2279                 if (m->control_pressed) { break; }
2280                 
2281                 m->mothurOut("Calculating distances between flowgrams...\n");
2282                 string distFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
2283                 begTime = time(NULL);
2284                
2285                 
2286                 flowDistParentFork(numFlowCells, distFileName, numUniques, mapUniqueToSeq, mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
2287                 
2288                 m->mothurOutEndLine();
2289                 m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
2290                 
2291                 
2292                 string namesFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
2293                 createNamesFile(numSeqs, numUniques, namesFileName, seqNameVector, mapSeqToUnique, mapUniqueToSeq);
2294                 
2295                 if (m->control_pressed) { break; }
2296                 
2297                 m->mothurOut("\nClustering flowgrams...\n");
2298                 string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
2299                 cluster(listFileName, distFileName, namesFileName);
2300                 
2301                 if (m->control_pressed) { break; }
2302                 
2303                 vector<int> otuData;
2304                 vector<int> cumNumSeqs;
2305                 vector<int> nSeqsPerOTU;
2306                 vector<vector<int> > aaP;       //tMaster->aanP:        each row is a different otu / each col contains the sequence indices
2307                 vector<vector<int> > aaI;       //tMaster->aanI:        that are in each otu - can't differentiate between aaP and aaI 
2308                 vector<int> seqNumber;          //tMaster->anP:         the sequence id number sorted by OTU
2309                 vector<int> seqIndex;           //tMaster->anI;         the index that corresponds to seqNumber
2310                 
2311                 
2312                 int numOTUs = getOTUData(numSeqs, listFileName, otuData, cumNumSeqs, nSeqsPerOTU, aaP, aaI, seqNumber, seqIndex, nameMap);
2313                 
2314                 if (m->control_pressed) { break; }
2315                 
2316                 m->mothurRemove(distFileName);
2317                 m->mothurRemove(namesFileName);
2318                 m->mothurRemove(listFileName);
2319                 
2320                 vector<double> dist;            //adDist - distance of sequences to centroids
2321                 vector<short> change;           //did the centroid sequence change? 0 = no; 1 = yes
2322                 vector<int> centroids;          //the representative flowgram for each cluster m
2323                 vector<double> weight;
2324                 vector<double> singleTau;       //tMaster->adTau:       1-D Tau vector (1xnumSeqs)
2325                 vector<int> nSeqsBreaks;
2326                 vector<int> nOTUsBreaks;
2327                 
2328                 if (m->debug) { m->mothurOut("[DEBUG]: numSeqs = " + toString(numSeqs) + " numOTUS = " + toString(numOTUs) + " about to alloc a dist vector with size = " + toString((numSeqs * numOTUs)) + ".\n"); }
2329                 
2330                 dist.assign(numSeqs * numOTUs, 0);
2331                 change.assign(numOTUs, 1);
2332                 centroids.assign(numOTUs, -1);
2333                 weight.assign(numOTUs, 0);
2334                 singleTau.assign(numSeqs, 1.0);
2335                 
2336                 nSeqsBreaks.assign(2, 0);
2337                 nOTUsBreaks.assign(2, 0);
2338                 
2339                 nSeqsBreaks[0] = 0;
2340                 nSeqsBreaks[1] = numSeqs;
2341                 nOTUsBreaks[1] = numOTUs;
2342                 
2343                 if (m->debug) { m->mothurOut("[DEBUG]: done allocating memory, about to denoise.\n"); }
2344                 
2345                 if (m->control_pressed) { break; }
2346                 
2347                 double maxDelta = 0;
2348                 int iter = 0;
2349                 
2350                 begClock = clock();
2351                 begTime = time(NULL);
2352                 
2353                 m->mothurOut("\nDenoising flowgrams...\n");
2354                 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
2355                 
2356                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
2357                     
2358                     if (m->control_pressed) { break; }
2359                     
2360                     double cycClock = clock();
2361                     unsigned long long cycTime = time(NULL);
2362                     fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2363                     
2364                     if (m->control_pressed) { break; }
2365                     
2366                     calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
2367                     
2368                     if (m->control_pressed) { break; }
2369                     
2370                     maxDelta = getNewWeights(numOTUs, cumNumSeqs, nSeqsPerOTU, singleTau, seqNumber, weight);  
2371                     
2372                     if (m->control_pressed) { break; }
2373                     
2374                     double nLL = getLikelihood(numSeqs, numOTUs, nSeqsPerOTU, seqNumber, cumNumSeqs, seqIndex, dist, weight); 
2375                     
2376                     if (m->control_pressed) { break; }
2377                     
2378                     checkCentroids(numOTUs, centroids, weight);
2379                     
2380                     if (m->control_pressed) { break; }
2381                     
2382                     calcNewDistances(numSeqs, numOTUs, nSeqsPerOTU,  dist, weight, change, centroids, aaP, singleTau, aaI, seqNumber, seqIndex, uniqueFlowgrams, flowDataIntI, numFlowCells, lengths);
2383                     
2384                     if (m->control_pressed) { break; }
2385                     
2386                     iter++;
2387                     
2388                     m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
2389                     
2390                 }       
2391                 
2392                 if (m->control_pressed) { break; }
2393                 
2394                 m->mothurOut("\nFinalizing...\n");
2395                 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2396                 
2397                 if (m->control_pressed) { break; }
2398                 
2399                 setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI);
2400                 
2401                 if (m->control_pressed) { break; }
2402                 
2403                 vector<int> otuCounts(numOTUs, 0);
2404                 for(int j=0;j<numSeqs;j++)      {       otuCounts[otuData[j]]++;        }
2405                 
2406                 calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber); 
2407                 
2408                 if (m->control_pressed) { break; }
2409                 
2410                 if ((large) && (g == 0)) {  flowFileName = filenames[i]; theseFlowFileNames[0] = filenames[i]; }
2411                 string thisOutputDir = outputDir;
2412                 if (outputDir == "") {  thisOutputDir = m->hasPath(flowFileName);  }
2413                 map<string, string> variables; 
2414                 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
2415                 string qualityFileName = getOutputFileName("qfile",variables);
2416                 string fastaFileName = getOutputFileName("fasta",variables);
2417                 string nameFileName = getOutputFileName("name",variables);
2418                 string otuCountsFileName = getOutputFileName("counts",variables);
2419                 string fileRoot = m->getRootName(m->getSimpleName(flowFileName));
2420                 int pos = fileRoot.find_first_of('.');
2421                 string fileGroup = fileRoot;
2422                 if (pos != string::npos) {  fileGroup = fileRoot.substr(pos+1, (fileRoot.length()-1-(pos+1)));  }
2423                 string groupFileName = getOutputFileName("group",variables);
2424
2425                 
2426                 writeQualities(numOTUs, numFlowCells, qualityFileName, otuCounts, nSeqsPerOTU, seqNumber, singleTau, flowDataIntI, uniqueFlowgrams, cumNumSeqs, mapUniqueToSeq, seqNameVector, centroids, aaI); if (m->control_pressed) { break; }
2427                 writeSequences(thisCompositeFASTAFileName, numOTUs, numFlowCells, fastaFileName, otuCounts, uniqueFlowgrams, seqNameVector, aaI, centroids);if (m->control_pressed) { break; }
2428                 writeNames(thisCompositeNamesFileName, numOTUs, nameFileName, otuCounts, seqNameVector, aaI, nSeqsPerOTU);                              if (m->control_pressed) { break; }
2429                 writeClusters(otuCountsFileName, numOTUs, numFlowCells,otuCounts, centroids, uniqueFlowgrams, seqNameVector, aaI, nSeqsPerOTU, lengths, flowDataIntI);                  if (m->control_pressed) { break; }
2430                 writeGroups(groupFileName, fileGroup, numSeqs, seqNameVector);                                          if (m->control_pressed) { break; }
2431                 
2432                 if (large) {
2433                     if (g > 0) {
2434                         variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0]));
2435                         m->appendFiles(qualityFileName, getOutputFileName("qfile",variables));
2436                         m->mothurRemove(qualityFileName);
2437                         m->appendFiles(fastaFileName, getOutputFileName("fasta",variables));
2438                         m->mothurRemove(fastaFileName);
2439                         m->appendFiles(nameFileName, getOutputFileName("name",variables));
2440                         m->mothurRemove(nameFileName);
2441                         m->appendFiles(otuCountsFileName, getOutputFileName("counts",variables));
2442                         m->mothurRemove(otuCountsFileName);
2443                         m->appendFiles(groupFileName, getOutputFileName("group",variables));
2444                         m->mothurRemove(groupFileName);
2445                     }
2446                     m->mothurRemove(theseFlowFileNames[g]);
2447                 }
2448                         }
2449             
2450             numCompleted++;
2451                         m->mothurOut("Total time to process " + fileNameForOutput + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
2452                 }
2453                 
2454         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
2455         
2456         return numCompleted;
2457         
2458     }catch(exception& e) {
2459             m->errorOut(e, "ShhherCommand", "driver");
2460             exit(1);
2461     }
2462 }
2463
2464 /**************************************************************************************************/
2465 int ShhherCommand::getFlowData(string filename, vector<string>& thisSeqNameVector, vector<int>& thisLengths, vector<short>& thisFlowDataIntI, map<string, int>& thisNameMap, int& numFlowCells){
2466         try{
2467        
2468                 ifstream flowFile;
2469        
2470                 m->openInputFile(filename, flowFile);
2471                 
2472                 string seqName;
2473                 int currentNumFlowCells;
2474                 float intensity;
2475         thisSeqNameVector.clear();
2476                 thisLengths.clear();
2477                 thisFlowDataIntI.clear();
2478                 thisNameMap.clear();
2479                 
2480                 string numFlowTest;
2481         flowFile >> numFlowTest;
2482         
2483         if (!m->isContainingOnlyDigits(numFlowTest)) { m->mothurOut("[ERROR]: expected a number and got " + numFlowTest + ", quitting. Did you use the flow parameter instead of the file parameter?"); m->mothurOutEndLine(); exit(1); }
2484         else { convert(numFlowTest, numFlowCells); }
2485         
2486         if (m->debug) { m->mothurOut("[DEBUG]: numFlowCells = " + toString(numFlowCells) + ".\n"); }
2487                 int index = 0;//pcluster
2488                 while(!flowFile.eof()){
2489                         
2490                         if (m->control_pressed) { break; }
2491                         
2492                         flowFile >> seqName >> currentNumFlowCells;
2493             
2494                         thisLengths.push_back(currentNumFlowCells);
2495            
2496                         thisSeqNameVector.push_back(seqName);
2497                         thisNameMap[seqName] = index++;//pcluster
2498             
2499             if (m->debug) { m->mothurOut("[DEBUG]: seqName = " + seqName + " length = " + toString(currentNumFlowCells) + " index = " + toString(index) + "\n"); }
2500             
2501                         for(int i=0;i<numFlowCells;i++){
2502                                 flowFile >> intensity;
2503                                 if(intensity > 9.99)    {       intensity = 9.99;       }
2504                                 int intI = int(100 * intensity + 0.0001);
2505                                 thisFlowDataIntI.push_back(intI);
2506                         }
2507                         m->gobble(flowFile);
2508                 }
2509                 flowFile.close();
2510                 
2511                 int numSeqs = thisSeqNameVector.size();         
2512                 
2513                 for(int i=0;i<numSeqs;i++){
2514                         
2515                         if (m->control_pressed) { break; }
2516                         
2517                         int iNumFlowCells = i * numFlowCells;
2518                         for(int j=thisLengths[i];j<numFlowCells;j++){
2519                                 thisFlowDataIntI[iNumFlowCells + j] = 0;
2520                         }
2521                 }
2522         
2523         return numSeqs;
2524                 
2525         }
2526         catch(exception& e) {
2527                 m->errorOut(e, "ShhherCommand", "getFlowData");
2528                 exit(1);
2529         }
2530 }
2531 /**************************************************************************************************/
2532
2533 int ShhherCommand::flowDistParentFork(int numFlowCells, string distFileName, int stopSeq, vector<int>& mapUniqueToSeq, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2534         try{            
2535         
2536                 ostringstream outStream;
2537                 outStream.setf(ios::fixed, ios::floatfield);
2538                 outStream.setf(ios::dec, ios::basefield);
2539                 outStream.setf(ios::showpoint);
2540                 outStream.precision(6);
2541                 
2542                 int begTime = time(NULL);
2543                 double begClock = clock();
2544         
2545                 for(int i=0;i<stopSeq;i++){
2546                         
2547                         if (m->control_pressed) { break; }
2548                         
2549                         for(int j=0;j<i;j++){
2550                                 float flowDistance = calcPairwiseDist(numFlowCells, mapUniqueToSeq[i], mapUniqueToSeq[j], mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
2551                 
2552                                 if(flowDistance < 1e-6){
2553                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
2554                                 }
2555                                 else if(flowDistance <= cutoff){
2556                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
2557                                 }
2558                         }
2559                         if(i % 100 == 0){
2560                                 m->mothurOutJustToScreen(toString(i) + "\t" + toString(time(NULL) - begTime));
2561                                 m->mothurOutJustToScreen("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC)+"\n");
2562                         }
2563                 }
2564                 
2565                 ofstream distFile(distFileName.c_str());
2566                 distFile << outStream.str();            
2567                 distFile.close();
2568                 
2569                 if (m->control_pressed) {}
2570                 else {
2571                         m->mothurOutJustToScreen(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
2572                         m->mothurOutJustToScreen("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC)+"\n");
2573                 }
2574         
2575         return 0;
2576         }
2577         catch(exception& e) {
2578                 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
2579                 exit(1);
2580         }
2581 }
2582 /**************************************************************************************************/
2583
2584 float ShhherCommand::calcPairwiseDist(int numFlowCells, int seqA, int seqB, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2585         try{
2586                 int minLength = lengths[mapSeqToUnique[seqA]];
2587                 if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
2588                 
2589                 int ANumFlowCells = seqA * numFlowCells;
2590                 int BNumFlowCells = seqB * numFlowCells;
2591                 
2592                 float dist = 0;
2593                 
2594                 for(int i=0;i<minLength;i++){
2595                         
2596                         if (m->control_pressed) { break; }
2597                         
2598                         int flowAIntI = flowDataIntI[ANumFlowCells + i];
2599                         float flowAPrI = flowDataPrI[ANumFlowCells + i];
2600                         
2601                         int flowBIntI = flowDataIntI[BNumFlowCells + i];
2602                         float flowBPrI = flowDataPrI[BNumFlowCells + i];
2603                         dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
2604                 }
2605                 
2606                 dist /= (float) minLength;
2607                 return dist;
2608         }
2609         catch(exception& e) {
2610                 m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
2611                 exit(1);
2612         }
2613 }
2614
2615 /**************************************************************************************************/
2616
2617 int ShhherCommand::getUniques(int numSeqs, int numFlowCells, vector<short>& uniqueFlowgrams, vector<int>& uniqueCount, vector<int>& uniqueLengths, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2618         try{
2619                 int numUniques = 0;
2620                 uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
2621                 uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
2622                 uniqueLengths.assign(numSeqs, 0);
2623                 mapSeqToUnique.assign(numSeqs, -1);
2624                 mapUniqueToSeq.assign(numSeqs, -1);
2625                 
2626                 vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
2627                 
2628                 for(int i=0;i<numSeqs;i++){
2629                         
2630                         if (m->control_pressed) { break; }
2631                         
2632                         int index = 0;
2633                         
2634                         vector<short> current(numFlowCells);
2635                         for(int j=0;j<numFlowCells;j++){
2636                                 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
2637                         }
2638             
2639                         for(int j=0;j<numUniques;j++){
2640                                 int offset = j * numFlowCells;
2641                                 bool toEnd = 1;
2642                                 
2643                                 int shorterLength;
2644                                 if(lengths[i] < uniqueLengths[j])       {       shorterLength = lengths[i];                     }
2645                                 else                                                            {       shorterLength = uniqueLengths[j];       }
2646                 
2647                                 for(int k=0;k<shorterLength;k++){
2648                                         if(current[k] != uniqueFlowgrams[offset + k]){
2649                                                 toEnd = 0;
2650                                                 break;
2651                                         }
2652                                 }
2653                                 
2654                                 if(toEnd){
2655                                         mapSeqToUnique[i] = j;
2656                                         uniqueCount[j]++;
2657                                         index = j;
2658                                         if(lengths[i] > uniqueLengths[j])       {       uniqueLengths[j] = lengths[i];  }
2659                                         break;
2660                                 }
2661                                 index++;
2662                         }
2663                         
2664                         if(index == numUniques){
2665                                 uniqueLengths[numUniques] = lengths[i];
2666                                 uniqueCount[numUniques] = 1;
2667                                 mapSeqToUnique[i] = numUniques;//anMap
2668                                 mapUniqueToSeq[numUniques] = i;//anF
2669                                 
2670                                 for(int k=0;k<numFlowCells;k++){
2671                                         uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
2672                                         uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
2673                                 }
2674                                 
2675                                 numUniques++;
2676                         }
2677                 }
2678                 uniqueFlowDataIntI.resize(numFlowCells * numUniques);
2679                 uniqueLengths.resize(numUniques);       
2680                 
2681                 flowDataPrI.resize(numSeqs * numFlowCells, 0);
2682                 for(int i=0;i<flowDataPrI.size();i++)   {       if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);          }
2683         
2684         return numUniques;
2685         }
2686         catch(exception& e) {
2687                 m->errorOut(e, "ShhherCommand", "getUniques");
2688                 exit(1);
2689         }
2690 }
2691 /**************************************************************************************************/
2692 int ShhherCommand::createNamesFile(int numSeqs, int numUniques, string filename, vector<string>& seqNameVector, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq){
2693         try{
2694                 
2695                 vector<string> duplicateNames(numUniques, "");
2696                 for(int i=0;i<numSeqs;i++){
2697                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
2698                 }
2699                 
2700                 ofstream nameFile;
2701                 m->openOutputFile(filename, nameFile);
2702                 
2703                 for(int i=0;i<numUniques;i++){
2704                         
2705                         if (m->control_pressed) { break; }
2706                         
2707             //                  nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2708                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2709                 }
2710                 
2711                 nameFile.close();
2712         
2713                 return 0;
2714         }
2715         catch(exception& e) {
2716                 m->errorOut(e, "ShhherCommand", "createNamesFile");
2717                 exit(1);
2718         }
2719 }
2720 //**********************************************************************************************************************
2721
2722 int ShhherCommand::cluster(string filename, string distFileName, string namesFileName){
2723         try {
2724                 
2725                 ReadMatrix* read = new ReadColumnMatrix(distFileName);  
2726                 read->setCutoff(cutoff);
2727                 
2728                 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
2729                 clusterNameMap->readMap();
2730                 read->read(clusterNameMap);
2731         
2732                 ListVector* list = read->getListVector();
2733                 SparseDistanceMatrix* matrix = read->getDMatrix();
2734                 
2735                 delete read; 
2736                 delete clusterNameMap; 
2737         
2738                 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
2739                 
2740         float adjust = -1.0;
2741                 Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest", adjust);
2742                 string tag = cluster->getTag();
2743                 
2744                 double clusterCutoff = cutoff;
2745                 while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
2746                         
2747                         if (m->control_pressed) { break; }
2748                         
2749                         cluster->update(clusterCutoff);
2750                 }
2751                 
2752                 list->setLabel(toString(cutoff));
2753                 
2754                 ofstream listFile;
2755                 m->openOutputFile(filename, listFile);
2756                 list->print(listFile);
2757                 listFile.close();
2758                 
2759                 delete matrix;  delete cluster; delete rabund; delete list;
2760         
2761                 return 0;
2762         }
2763         catch(exception& e) {
2764                 m->errorOut(e, "ShhherCommand", "cluster");
2765                 exit(1);        
2766         }               
2767 }
2768 /**************************************************************************************************/
2769
2770 int ShhherCommand::getOTUData(int numSeqs, string fileName,  vector<int>& otuData,
2771                                vector<int>& cumNumSeqs,
2772                                vector<int>& nSeqsPerOTU,
2773                                vector<vector<int> >& aaP,       //tMaster->aanP:        each row is a different otu / each col contains the sequence indices
2774                                vector<vector<int> >& aaI,       //tMaster->aanI:        that are in each otu - can't differentiate between aaP and aaI 
2775                                vector<int>& seqNumber,          //tMaster->anP:         the sequence id number sorted by OTU
2776                                vector<int>& seqIndex,
2777                                map<string, int>& nameMap){
2778         try {
2779         
2780                 ifstream listFile;
2781                 m->openInputFile(fileName, listFile);
2782                 string label;
2783         int numOTUs;
2784                 
2785                 listFile >> label >> numOTUs;
2786         
2787         if (m->debug) { m->mothurOut("[DEBUG]: Getting OTU Data...\n"); }
2788         
2789                 otuData.assign(numSeqs, 0);
2790                 cumNumSeqs.assign(numOTUs, 0);
2791                 nSeqsPerOTU.assign(numOTUs, 0);
2792                 aaP.clear();aaP.resize(numOTUs);
2793                 
2794                 seqNumber.clear();
2795                 aaI.clear();
2796                 seqIndex.clear();
2797                 
2798                 string singleOTU = "";
2799                 
2800                 for(int i=0;i<numOTUs;i++){
2801                         
2802                         if (m->control_pressed) { break; }
2803             if (m->debug) { m->mothurOut("[DEBUG]: processing OTU " + toString(i) + ".\n"); }
2804             
2805                         listFile >> singleOTU;
2806                         
2807                         istringstream otuString(singleOTU);
2808             
2809                         while(otuString){
2810                                 
2811                                 string seqName = "";
2812                                 
2813                                 for(int j=0;j<singleOTU.length();j++){
2814                                         char letter = otuString.get();
2815                                         
2816                                         if(letter != ','){
2817                                                 seqName += letter;
2818                                         }
2819                                         else{
2820                                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
2821                                                 int index = nmIt->second;
2822                                                 
2823                                                 nameMap.erase(nmIt);
2824                                                 
2825                                                 otuData[index] = i;
2826                                                 nSeqsPerOTU[i]++;
2827                                                 aaP[i].push_back(index);
2828                                                 seqName = "";
2829                                         }
2830                                 }
2831                                 
2832                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
2833                 
2834                                 int index = nmIt->second;
2835                                 nameMap.erase(nmIt);
2836                 
2837                                 otuData[index] = i;
2838                                 nSeqsPerOTU[i]++;
2839                                 aaP[i].push_back(index);        
2840                                 
2841                                 otuString.get();
2842                         }
2843                         
2844                         sort(aaP[i].begin(), aaP[i].end());
2845                         for(int j=0;j<nSeqsPerOTU[i];j++){
2846                                 seqNumber.push_back(aaP[i][j]);
2847                         }
2848                         for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
2849                                 aaP[i].push_back(0);
2850                         }
2851                         
2852                         
2853                 }
2854                 
2855                 for(int i=1;i<numOTUs;i++){
2856                         cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
2857                 }
2858                 aaI = aaP;
2859                 seqIndex = seqNumber;
2860                 
2861                 listFile.close();       
2862         
2863         return numOTUs;
2864                 
2865         }
2866         catch(exception& e) {
2867                 m->errorOut(e, "ShhherCommand", "getOTUData");
2868                 exit(1);        
2869         }               
2870 }
2871 /**************************************************************************************************/
2872
2873 int ShhherCommand::calcCentroidsDriver(int numOTUs, 
2874                                           vector<int>& cumNumSeqs,
2875                                           vector<int>& nSeqsPerOTU,
2876                                           vector<int>& seqIndex,
2877                                           vector<short>& change,                //did the centroid sequence change? 0 = no; 1 = yes
2878                                           vector<int>& centroids,               //the representative flowgram for each cluster m
2879                                           vector<double>& singleTau,    //tMaster->adTau:       1-D Tau vector (1xnumSeqs)
2880                                           vector<int>& mapSeqToUnique,
2881                                           vector<short>& uniqueFlowgrams,
2882                                           vector<short>& flowDataIntI,
2883                                           vector<int>& lengths,
2884                                           int numFlowCells,
2885                                           vector<int>& seqNumber){                          
2886         
2887         //this function gets the most likely homopolymer length at a flow position for a group of sequences
2888         //within an otu
2889         
2890         try{
2891                 
2892                 for(int i=0;i<numOTUs;i++){
2893                         
2894                         if (m->control_pressed) { break; }
2895                         
2896                         double count = 0;
2897                         int position = 0;
2898                         int minFlowGram = 100000000;
2899                         double minFlowValue = 1e8;
2900                         change[i] = 0; //FALSE
2901                         
2902                         for(int j=0;j<nSeqsPerOTU[i];j++){
2903                                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
2904                         }
2905             
2906                         if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
2907                                 vector<double> adF(nSeqsPerOTU[i]);
2908                                 vector<int> anL(nSeqsPerOTU[i]);
2909                                 
2910                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2911                                         int index = cumNumSeqs[i] + j;
2912                                         int nI = seqIndex[index];
2913                                         int nIU = mapSeqToUnique[nI];
2914                                         
2915                                         int k;
2916                                         for(k=0;k<position;k++){
2917                                                 if(nIU == anL[k]){
2918                                                         break;
2919                                                 }
2920                                         }
2921                                         if(k == position){
2922                                                 anL[position] = nIU;
2923                                                 adF[position] = 0.0000;
2924                                                 position++;
2925                                         }                                               
2926                                 }
2927                                 
2928                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2929                                         int index = cumNumSeqs[i] + j;
2930                                         int nI = seqIndex[index];
2931                                         
2932                                         double tauValue = singleTau[seqNumber[index]];
2933                                         
2934                                         for(int k=0;k<position;k++){
2935                                                 double dist = getDistToCentroid(anL[k], nI, lengths[nI], uniqueFlowgrams, flowDataIntI, numFlowCells);
2936                                                 adF[k] += dist * tauValue;
2937                                         }
2938                                 }
2939                                 
2940                                 for(int j=0;j<position;j++){
2941                                         if(adF[j] < minFlowValue){
2942                                                 minFlowGram = j;
2943                                                 minFlowValue = adF[j];
2944                                         }
2945                                 }
2946                                 
2947                                 if(centroids[i] != anL[minFlowGram]){
2948                                         change[i] = 1;
2949                                         centroids[i] = anL[minFlowGram];
2950                                 }
2951                         }
2952                         else if(centroids[i] != -1){
2953                                 change[i] = 1;
2954                                 centroids[i] = -1;                      
2955                         }
2956                 }
2957         
2958         return 0;
2959         }
2960         catch(exception& e) {
2961                 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
2962                 exit(1);        
2963         }               
2964 }
2965 /**************************************************************************************************/
2966
2967 double ShhherCommand::getDistToCentroid(int cent, int flow, int length, vector<short>& uniqueFlowgrams,
2968                                         vector<short>& flowDataIntI, int numFlowCells){
2969         try{
2970                 
2971                 int flowAValue = cent * numFlowCells;
2972                 int flowBValue = flow * numFlowCells;
2973                 
2974                 double dist = 0;
2975         
2976                 for(int i=0;i<length;i++){
2977                         dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
2978                         flowAValue++;
2979                         flowBValue++;
2980                 }
2981                 
2982                 return dist / (double)length;
2983         }
2984         catch(exception& e) {
2985                 m->errorOut(e, "ShhherCommand", "getDistToCentroid");
2986                 exit(1);        
2987         }               
2988 }
2989 /**************************************************************************************************/
2990
2991 double ShhherCommand::getNewWeights(int numOTUs, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<double>& singleTau, vector<int>& seqNumber, vector<double>& weight){
2992         try{
2993                 
2994                 double maxChange = 0;
2995                 
2996                 for(int i=0;i<numOTUs;i++){
2997                         
2998                         if (m->control_pressed) { break; }
2999                         
3000                         double difference = weight[i];
3001                         weight[i] = 0;
3002                         
3003                         for(int j=0;j<nSeqsPerOTU[i];j++){
3004                                 int index = cumNumSeqs[i] + j;
3005                                 double tauValue = singleTau[seqNumber[index]];
3006                                 weight[i] += tauValue;
3007                         }
3008                         
3009                         difference = fabs(weight[i] - difference);
3010                         if(difference > maxChange){     maxChange = difference; }
3011                 }
3012                 return maxChange;
3013         }
3014         catch(exception& e) {
3015                 m->errorOut(e, "ShhherCommand", "getNewWeights");
3016                 exit(1);        
3017         }               
3018 }
3019
3020 /**************************************************************************************************/
3021
3022 double ShhherCommand::getLikelihood(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<int>& seqNumber, vector<int>& cumNumSeqs, vector<int>& seqIndex, vector<double>& dist, vector<double>& weight){
3023         
3024         try{
3025                 
3026                 vector<long double> P(numSeqs, 0);
3027                 int effNumOTUs = 0;
3028                 
3029                 for(int i=0;i<numOTUs;i++){
3030                         if(weight[i] > MIN_WEIGHT){
3031                                 effNumOTUs++;
3032                         }
3033                 }
3034                 
3035                 string hold;
3036                 for(int i=0;i<numOTUs;i++){
3037                         
3038                         if (m->control_pressed) { break; }
3039                         
3040                         for(int j=0;j<nSeqsPerOTU[i];j++){
3041                                 int index = cumNumSeqs[i] + j;
3042                                 int nI = seqIndex[index];
3043                                 double singleDist = dist[seqNumber[index]];
3044                                 
3045                                 P[nI] += weight[i] * exp(-singleDist * sigma);
3046                         }
3047                 }
3048                 double nLL = 0.00;
3049                 for(int i=0;i<numSeqs;i++){
3050                         if(P[i] == 0){  P[i] = DBL_EPSILON;     }
3051             
3052                         nLL += -log(P[i]);
3053                 }
3054                 
3055                 nLL = nLL -(double)numSeqs * log(sigma);
3056         
3057                 return nLL; 
3058         }
3059         catch(exception& e) {
3060                 m->errorOut(e, "ShhherCommand", "getNewWeights");
3061                 exit(1);        
3062         }               
3063 }
3064
3065 /**************************************************************************************************/
3066
3067 int ShhherCommand::checkCentroids(int numOTUs, vector<int>& centroids, vector<double>& weight){
3068         try{
3069                 vector<int> unique(numOTUs, 1);
3070                 
3071                 for(int i=0;i<numOTUs;i++){
3072                         if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
3073                                 unique[i] = -1;
3074                         }
3075                 }
3076                 
3077                 for(int i=0;i<numOTUs;i++){
3078                         
3079                         if (m->control_pressed) { break; }
3080                         
3081                         if(unique[i] == 1){
3082                                 for(int j=i+1;j<numOTUs;j++){
3083                                         if(unique[j] == 1){
3084                                                 
3085                                                 if(centroids[j] == centroids[i]){
3086                                                         unique[j] = 0;
3087                                                         centroids[j] = -1;
3088                                                         
3089                                                         weight[i] += weight[j];
3090                                                         weight[j] = 0.0;
3091                                                 }
3092                                         }
3093                                 }
3094                         }
3095                 }
3096         
3097         return 0;
3098         }
3099         catch(exception& e) {
3100                 m->errorOut(e, "ShhherCommand", "checkCentroids");
3101                 exit(1);        
3102         }               
3103 }
3104 /**************************************************************************************************/
3105
3106 void ShhherCommand::calcNewDistances(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<double>& dist, 
3107                                      vector<double>& weight, vector<short>& change, vector<int>& centroids,
3108                                      vector<vector<int> >& aaP, vector<double>& singleTau, vector<vector<int> >& aaI,   
3109                                      vector<int>& seqNumber, vector<int>& seqIndex,
3110                                      vector<short>& uniqueFlowgrams,
3111                                      vector<short>& flowDataIntI, int numFlowCells, vector<int>& lengths){
3112         
3113         try{
3114                 
3115                 int total = 0;
3116                 vector<double> newTau(numOTUs,0);
3117                 vector<double> norms(numSeqs, 0);
3118                 nSeqsPerOTU.assign(numOTUs, 0);
3119         
3120                 for(int i=0;i<numSeqs;i++){
3121                         
3122                         if (m->control_pressed) { break; }
3123                         
3124                         int indexOffset = i * numOTUs;
3125             
3126                         double offset = 1e8;
3127                         
3128                         for(int j=0;j<numOTUs;j++){
3129                 
3130                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
3131                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i], uniqueFlowgrams, flowDataIntI, numFlowCells);
3132                                 }
3133                 
3134                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
3135                                         offset = dist[indexOffset + j];
3136                                 }
3137                         }
3138             
3139                         for(int j=0;j<numOTUs;j++){
3140                                 if(weight[j] > MIN_WEIGHT){
3141                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
3142                                         norms[i] += newTau[j];
3143                                 }
3144                                 else{
3145                                         newTau[j] = 0.0;
3146                                 }
3147                         }
3148             
3149                         for(int j=0;j<numOTUs;j++){
3150                                 newTau[j] /= norms[i];
3151                         }
3152             
3153                         for(int j=0;j<numOTUs;j++){
3154                                 if(newTau[j] > MIN_TAU){
3155                                         
3156                                         int oldTotal = total;
3157                                         
3158                                         total++;
3159                                         
3160                                         singleTau.resize(total, 0);
3161                                         seqNumber.resize(total, 0);
3162                                         seqIndex.resize(total, 0);
3163                                         
3164                                         singleTau[oldTotal] = newTau[j];
3165                                         
3166                                         aaP[j][nSeqsPerOTU[j]] = oldTotal;
3167                                         aaI[j][nSeqsPerOTU[j]] = i;
3168                                         nSeqsPerOTU[j]++;
3169                                 }
3170                         }
3171             
3172                 }
3173         
3174         }
3175         catch(exception& e) {
3176                 m->errorOut(e, "ShhherCommand", "calcNewDistances");
3177                 exit(1);        
3178         }               
3179 }
3180 /**************************************************************************************************/
3181
3182 int ShhherCommand::fill(int numOTUs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
3183         try {
3184                 int index = 0;
3185                 for(int i=0;i<numOTUs;i++){
3186                         
3187                         if (m->control_pressed) { return 0; }
3188                         
3189                         cumNumSeqs[i] = index;
3190                         for(int j=0;j<nSeqsPerOTU[i];j++){
3191                                 seqNumber[index] = aaP[i][j];
3192                                 seqIndex[index] = aaI[i][j];
3193                                 
3194                                 index++;
3195                         }
3196                 }
3197         
3198         return 0; 
3199         }
3200         catch(exception& e) {
3201                 m->errorOut(e, "ShhherCommand", "fill");
3202                 exit(1);        
3203         }               
3204 }
3205 /**************************************************************************************************/
3206
3207 void ShhherCommand::setOTUs(int numOTUs, int numSeqs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU,
3208                             vector<int>& otuData, vector<double>& singleTau, vector<double>& dist, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
3209         
3210         try {
3211                 vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
3212                 
3213                 for(int i=0;i<numOTUs;i++){
3214                         
3215                         if (m->control_pressed) { break; }
3216                         
3217                         for(int j=0;j<nSeqsPerOTU[i];j++){
3218                                 int index = cumNumSeqs[i] + j;
3219                                 double tauValue = singleTau[seqNumber[index]];
3220                                 int sIndex = seqIndex[index];
3221                                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
3222                         }
3223                 }
3224                 
3225                 for(int i=0;i<numSeqs;i++){
3226                         double maxTau = -1.0000;
3227                         int maxOTU = -1;
3228                         for(int j=0;j<numOTUs;j++){
3229                                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
3230                                         maxTau = bigTauMatrix[i * numOTUs + j];
3231                                         maxOTU = j;
3232                                 }
3233                         }
3234                         
3235                         otuData[i] = maxOTU;
3236                 }
3237                 
3238                 nSeqsPerOTU.assign(numOTUs, 0);         
3239                 
3240                 for(int i=0;i<numSeqs;i++){
3241                         int index = otuData[i];
3242                         
3243                         singleTau[i] = 1.0000;
3244                         dist[i] = 0.0000;
3245                         
3246                         aaP[index][nSeqsPerOTU[index]] = i;
3247                         aaI[index][nSeqsPerOTU[index]] = i;
3248                         
3249                         nSeqsPerOTU[index]++;
3250                 }
3251         
3252                 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);  
3253         }
3254         catch(exception& e) {
3255                 m->errorOut(e, "ShhherCommand", "setOTUs");
3256                 exit(1);        
3257         }               
3258 }
3259 /**************************************************************************************************/
3260
3261 void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string qualityFileName, vector<int> otuCounts, vector<int>& nSeqsPerOTU, vector<int>& seqNumber,
3262                                    vector<double>& singleTau, vector<short>& flowDataIntI, vector<short>& uniqueFlowgrams, vector<int>& cumNumSeqs,
3263                                    vector<int>& mapUniqueToSeq, vector<string>& seqNameVector, vector<int>& centroids, vector<vector<int> >& aaI){
3264         
3265         try {
3266         
3267                 ofstream qualityFile;
3268                 m->openOutputFile(qualityFileName, qualityFile);
3269         
3270                 qualityFile.setf(ios::fixed, ios::floatfield);
3271                 qualityFile.setf(ios::showpoint);
3272                 qualityFile << setprecision(6);
3273                 
3274                 vector<vector<int> > qualities(numOTUs);
3275                 vector<double> pr(HOMOPS, 0);
3276                 
3277                 
3278                 for(int i=0;i<numOTUs;i++){
3279                         
3280                         if (m->control_pressed) { break; }
3281                         
3282                         int index = 0;
3283                         int base = 0;
3284                         
3285                         if(nSeqsPerOTU[i] > 0){
3286                                 qualities[i].assign(1024, -1);
3287                                 
3288                                 while(index < numFlowCells){
3289                                         double maxPrValue = 1e8;
3290                                         short maxPrIndex = -1;
3291                                         double count = 0.0000;
3292                                         
3293                                         pr.assign(HOMOPS, 0);
3294                                         
3295                                         for(int j=0;j<nSeqsPerOTU[i];j++){
3296                                                 int lIndex = cumNumSeqs[i] + j;
3297                                                 double tauValue = singleTau[seqNumber[lIndex]];
3298                                                 int sequenceIndex = aaI[i][j];
3299                                                 short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
3300                                                 
3301                                                 count += tauValue;
3302                                                 
3303                                                 for(int s=0;s<HOMOPS;s++){
3304                                                         pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
3305                                                 }
3306                                         }
3307                                         
3308                                         maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
3309                                         maxPrValue = pr[maxPrIndex];
3310                                         
3311                                         if(count > MIN_COUNT){
3312                                                 double U = 0.0000;
3313                                                 double norm = 0.0000;
3314                                                 
3315                                                 for(int s=0;s<HOMOPS;s++){
3316                                                         norm += exp(-(pr[s] - maxPrValue));
3317                                                 }
3318                                                 
3319                                                 for(int s=1;s<=maxPrIndex;s++){
3320                                                         int value = 0;
3321                                                         double temp = 0.0000;
3322                                                         
3323                                                         U += exp(-(pr[s-1]-maxPrValue))/norm;
3324                                                         
3325                                                         if(U>0.00){
3326                                                                 temp = log10(U);
3327                                                         }
3328                                                         else{
3329                                                                 temp = -10.1;
3330                                                         }
3331                                                         temp = floor(-10 * temp);
3332                                                         value = (int)floor(temp);
3333                                                         if(value > 100){        value = 100;    }
3334                                                         
3335                                                         qualities[i][base] = (int)value;
3336                                                         base++;
3337                                                 }
3338                                         }
3339                                         
3340                                         index++;
3341                                 }
3342                         }
3343                         
3344                         
3345                         if(otuCounts[i] > 0){
3346                                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
3347                         
3348                                 int j=4;        //need to get past the first four bases
3349                                 while(qualities[i][j] != -1){
3350                     qualityFile << qualities[i][j] << ' ';
3351                     if (j > qualities[i].size()) { break; }
3352                     j++;
3353                                 }
3354                                 qualityFile << endl;
3355                         }
3356                 }
3357                 qualityFile.close();
3358                 outputNames.push_back(qualityFileName); outputTypes["qfile"].push_back(qualityFileName);
3359         
3360         }
3361         catch(exception& e) {
3362                 m->errorOut(e, "ShhherCommand", "writeQualities");
3363                 exit(1);        
3364         }               
3365 }
3366
3367 /**************************************************************************************************/
3368
3369 void ShhherCommand::writeSequences(string thisCompositeFASTAFileName, int numOTUs, int numFlowCells, string fastaFileName, vector<int> otuCounts, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& centroids){
3370         try {
3371                 
3372                 ofstream fastaFile;
3373                 m->openOutputFile(fastaFileName, fastaFile);
3374                 
3375                 vector<string> names(numOTUs, "");
3376                 
3377                 for(int i=0;i<numOTUs;i++){
3378                         
3379                         if (m->control_pressed) { break; }
3380                         
3381                         int index = centroids[i];
3382                         
3383                         if(otuCounts[i] > 0){
3384                                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
3385                                 
3386                                 string newSeq = "";
3387                                 
3388                                 for(int j=0;j<numFlowCells;j++){
3389                                         
3390                                         char base = flowOrder[j % flowOrder.length()];
3391                                         for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
3392                                                 newSeq += base;
3393                                         }
3394                                 }
3395                                 
3396                                 if (newSeq.length() >= 4) {  fastaFile << newSeq.substr(4) << endl;  }
3397                 else {  fastaFile << "NNNN" << endl;  }
3398                         }
3399                 }
3400                 fastaFile.close();
3401         
3402                 outputNames.push_back(fastaFileName); outputTypes["fasta"].push_back(fastaFileName);
3403         
3404                 if(thisCompositeFASTAFileName != ""){
3405                         m->appendFiles(fastaFileName, thisCompositeFASTAFileName);
3406                 }
3407         }
3408         catch(exception& e) {
3409                 m->errorOut(e, "ShhherCommand", "writeSequences");
3410                 exit(1);        
3411         }               
3412 }
3413
3414 /**************************************************************************************************/
3415
3416 void ShhherCommand::writeNames(string thisCompositeNamesFileName, int numOTUs, string nameFileName, vector<int> otuCounts, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU){
3417         try {
3418                 
3419                 ofstream nameFile;
3420                 m->openOutputFile(nameFileName, nameFile);
3421                 
3422                 for(int i=0;i<numOTUs;i++){
3423                         
3424                         if (m->control_pressed) { break; }
3425                         
3426                         if(otuCounts[i] > 0){
3427                                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
3428                                 
3429                                 for(int j=1;j<nSeqsPerOTU[i];j++){
3430                                         nameFile << ',' << seqNameVector[aaI[i][j]];
3431                                 }
3432                                 
3433                                 nameFile << endl;
3434                         }
3435                 }
3436                 nameFile.close();
3437                 outputNames.push_back(nameFileName); outputTypes["name"].push_back(nameFileName);
3438                 
3439                 
3440                 if(thisCompositeNamesFileName != ""){
3441                         m->appendFiles(nameFileName, thisCompositeNamesFileName);
3442                 }               
3443         }
3444         catch(exception& e) {
3445                 m->errorOut(e, "ShhherCommand", "writeNames");
3446                 exit(1);        
3447         }               
3448 }
3449
3450 /**************************************************************************************************/
3451
3452 void ShhherCommand::writeGroups(string groupFileName, string fileRoot, int numSeqs, vector<string>& seqNameVector){
3453         try {
3454         ofstream groupFile;
3455                 m->openOutputFile(groupFileName, groupFile);
3456                 
3457                 for(int i=0;i<numSeqs;i++){
3458                         if (m->control_pressed) { break; }
3459                         groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
3460                 }
3461                 groupFile.close();
3462                 outputNames.push_back(groupFileName); outputTypes["group"].push_back(groupFileName);
3463         
3464         }
3465         catch(exception& e) {
3466                 m->errorOut(e, "ShhherCommand", "writeGroups");
3467                 exit(1);        
3468         }               
3469 }
3470
3471 /**************************************************************************************************/
3472
3473 void ShhherCommand::writeClusters(string otuCountsFileName, int numOTUs, int numFlowCells, vector<int> otuCounts, vector<int>& centroids, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU, vector<int>& lengths, vector<short>& flowDataIntI){
3474         try {
3475                 ofstream otuCountsFile;
3476                 m->openOutputFile(otuCountsFileName, otuCountsFile);
3477                 
3478                 string bases = flowOrder;
3479                 
3480                 for(int i=0;i<numOTUs;i++){
3481                         
3482                         if (m->control_pressed) {
3483                                 break;
3484                         }
3485                         //output the translated version of the centroid sequence for the otu
3486                         if(otuCounts[i] > 0){
3487                                 int index = centroids[i];
3488                                 
3489                                 otuCountsFile << "ideal\t";
3490                                 for(int j=8;j<numFlowCells;j++){
3491                                         char base = bases[j % bases.length()];
3492                                         for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
3493                                                 otuCountsFile << base;
3494                                         }
3495                                 }
3496                                 otuCountsFile << endl;
3497                                 
3498                                 for(int j=0;j<nSeqsPerOTU[i];j++){
3499                                         int sequence = aaI[i][j];
3500                                         otuCountsFile << seqNameVector[sequence] << '\t';
3501                                         
3502                                         string newSeq = "";
3503                                         
3504                                         for(int k=0;k<lengths[sequence];k++){
3505                                                 char base = bases[k % bases.length()];
3506                                                 int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
3507                         
3508                                                 for(int s=0;s<freq;s++){
3509                                                         newSeq += base;
3510                                                         //otuCountsFile << base;
3511                                                 }
3512                                         }
3513                                         
3514                     if (newSeq.length() >= 4) {  otuCountsFile << newSeq.substr(4) << endl;  }
3515                     else {  otuCountsFile << "NNNN" << endl;  }
3516                                 }
3517                                 otuCountsFile << endl;
3518                         }
3519                 }
3520                 otuCountsFile.close();
3521                 outputNames.push_back(otuCountsFileName); outputTypes["counts"].push_back(otuCountsFileName);
3522         
3523         }
3524         catch(exception& e) {
3525                 m->errorOut(e, "ShhherCommand", "writeClusters");
3526                 exit(1);        
3527         }               
3528 }
3529
3530 /**************************************************************************************************/
3531
3532 void ShhherCommand::getSingleLookUp(){
3533         try{
3534                 //      these are the -log probabilities that a signal corresponds to a particular homopolymer length
3535                 singleLookUp.assign(HOMOPS * NUMBINS, 0);
3536                 
3537                 int index = 0;
3538                 ifstream lookUpFile;
3539                 m->openInputFile(lookupFileName, lookUpFile);
3540                 
3541                 for(int i=0;i<HOMOPS;i++){
3542                         
3543                         if (m->control_pressed) { break; }
3544                         
3545                         float logFracFreq;
3546                         lookUpFile >> logFracFreq;
3547                         
3548                         for(int j=0;j<NUMBINS;j++)      {
3549                                 lookUpFile >> singleLookUp[index];
3550                                 index++;
3551                         }
3552                 }       
3553                 lookUpFile.close();
3554         }
3555         catch(exception& e) {
3556                 m->errorOut(e, "ShhherCommand", "getSingleLookUp");
3557                 exit(1);
3558         }
3559 }
3560
3561 /**************************************************************************************************/
3562
3563 void ShhherCommand::getJointLookUp(){
3564         try{
3565                 
3566                 //      the most likely joint probability (-log) that two intenities have the same polymer length
3567                 jointLookUp.resize(NUMBINS * NUMBINS, 0);
3568                 
3569                 for(int i=0;i<NUMBINS;i++){
3570                         
3571                         if (m->control_pressed) { break; }
3572                         
3573                         for(int j=0;j<NUMBINS;j++){             
3574                                 
3575                                 double minSum = 100000000;
3576                                 
3577                                 for(int k=0;k<HOMOPS;k++){
3578                                         double sum = singleLookUp[k * NUMBINS + i] + singleLookUp[k * NUMBINS + j];
3579                                         
3580                                         if(sum < minSum)        {       minSum = sum;           }
3581                                 }       
3582                                 jointLookUp[i * NUMBINS + j] = minSum;
3583                         }
3584                 }
3585         }
3586         catch(exception& e) {
3587                 m->errorOut(e, "ShhherCommand", "getJointLookUp");
3588                 exit(1);
3589         }
3590 }
3591
3592 /**************************************************************************************************/
3593
3594 double ShhherCommand::getProbIntensity(int intIntensity){                          
3595         try{
3596
3597                 double minNegLogProb = 100000000; 
3598
3599                 
3600                 for(int i=0;i<HOMOPS;i++){//loop signal strength
3601                         
3602                         if (m->control_pressed) { break; }
3603                         
3604                         float negLogProb = singleLookUp[i * NUMBINS + intIntensity];
3605                         if(negLogProb < minNegLogProb)  {       minNegLogProb = negLogProb; }
3606                 }
3607                 
3608                 return minNegLogProb;
3609         }
3610         catch(exception& e) {
3611                 m->errorOut(e, "ShhherCommand", "getProbIntensity");
3612                 exit(1);
3613         }
3614 }
3615
3616
3617
3618