]> git.donarmstrong.com Git - mothur.git/blob - shhhercommand.cpp
changes while testing
[mothur.git] / shhhercommand.cpp
1 /*
2  *  shhher.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 12/27/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "shhhercommand.h"
11
12 //**********************************************************************************************************************
13 vector<string> ShhherCommand::setParameters(){  
14         try {
15                 CommandParameter pflow("flow", "InputTypes", "", "", "none", "fileflow", "none","fasta-name-group-counts-qfile",false,false,true); parameters.push_back(pflow);
16                 CommandParameter pfile("file", "InputTypes", "", "", "none", "fileflow", "none","fasta-name-group-counts-qfile",false,false,true); parameters.push_back(pfile);
17                 CommandParameter plookup("lookup", "InputTypes", "", "", "none", "none", "none","",false,false,true); parameters.push_back(plookup);
18                 CommandParameter pcutoff("cutoff", "Number", "", "0.01", "", "", "","",false,false); parameters.push_back(pcutoff);
19                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
20                 CommandParameter pmaxiter("maxiter", "Number", "", "1000", "", "", "","",false,false); parameters.push_back(pmaxiter);
21         CommandParameter plarge("large", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(plarge);
22                 CommandParameter psigma("sigma", "Number", "", "60", "", "", "","",false,false); parameters.push_back(psigma);
23                 CommandParameter pmindelta("mindelta", "Number", "", "0.000001", "", "", "","",false,false); parameters.push_back(pmindelta);
24         CommandParameter porder("order", "Multiple", "A-B-I", "A", "", "", "","",false,false, true); parameters.push_back(porder);              CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
25                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
26                 
27                 vector<string> myArray;
28                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
29                 return myArray;
30         }
31         catch(exception& e) {
32                 m->errorOut(e, "ShhherCommand", "setParameters");
33                 exit(1);
34         }
35 }
36 //**********************************************************************************************************************
37 string ShhherCommand::getHelpString(){  
38         try {
39                 string helpString = "";
40                 helpString += "The shhh.flows command reads a file containing flowgrams and creates a file of corrected sequences.\n";
41         helpString += "The shhh.flows command parameters are flow, file, lookup, cutoff, processors, large, maxiter, sigma, mindelta and order.\n";
42         helpString += "The flow parameter is used to input your flow file.\n";
43         helpString += "The file parameter is used to input the *flow.files file created by trim.flows.\n";
44         helpString += "The lookup parameter is used specify the lookup file you would like to use. http://www.mothur.org/wiki/Lookup_files.\n";
45         helpString += "The order parameter options are A, B or I.  Default=A. A = TACG and B = TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC and I = TACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGC.\n";
46                 return helpString;
47         }
48         catch(exception& e) {
49                 m->errorOut(e, "ShhherCommand", "getHelpString");
50                 exit(1);
51         }
52 }
53 //**********************************************************************************************************************
54 string ShhherCommand::getOutputPattern(string type) {
55     try {
56         string pattern = "";
57         
58         if (type == "fasta")            {   pattern = "[filename],shhh.fasta";   }
59         else if (type == "name")    {   pattern = "[filename],shhh.names";   }
60         else if (type == "group")        {   pattern = "[filename],shhh.groups";   }
61         else if (type == "counts")        {   pattern = "[filename],shhh.counts";   }
62         else if (type == "qfile")        {   pattern = "[filename],shhh.qual";   }
63         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
64         
65         return pattern;
66     }
67     catch(exception& e) {
68         m->errorOut(e, "ShhherCommand", "getOutputPattern");
69         exit(1);
70     }
71 }
72 //**********************************************************************************************************************
73
74 ShhherCommand::ShhherCommand(){ 
75         try {
76                 abort = true; calledHelp = true;
77                 setParameters();
78                 
79                 //initialize outputTypes
80                 vector<string> tempOutNames;
81                 outputTypes["fasta"] = tempOutNames;
82         outputTypes["name"] = tempOutNames;
83         outputTypes["group"] = tempOutNames;
84         outputTypes["counts"] = tempOutNames;
85         outputTypes["qfile"] = tempOutNames;
86
87         }
88         catch(exception& e) {
89                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
90                 exit(1);
91         }
92 }
93
94 //**********************************************************************************************************************
95
96 ShhherCommand::ShhherCommand(string option) {
97         try {
98         
99 #ifdef USE_MPI
100                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
101                 MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
102         
103                 if(pid == 0){
104 #endif
105                 abort = false; calledHelp = false;   
106                 
107                 //allow user to run help
108                 if(option == "help") { help(); abort = true; calledHelp = true; }
109                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
110                 
111                 else {
112                         vector<string> myArray = setParameters();
113                         
114                         OptionParser parser(option);
115                         map<string,string> parameters = parser.getParameters();
116                         
117                         ValidParameters validParameter;
118                         map<string,string>::iterator it;
119                         
120                         //check to make sure all parameters are valid for command
121                         for (it = parameters.begin(); it != parameters.end(); it++) { 
122                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
123                         }
124                         
125                         //initialize outputTypes
126             vector<string> tempOutNames;
127             outputTypes["fasta"] = tempOutNames;
128             outputTypes["name"] = tempOutNames;
129             outputTypes["group"] = tempOutNames;
130             outputTypes["counts"] = tempOutNames;
131             outputTypes["qfile"] = tempOutNames;
132
133                         
134                         //if the user changes the input directory command factory will send this info to us in the output parameter 
135                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
136                         if (inputDir == "not found"){   inputDir = "";          }
137                         else {
138                                 string path;
139                                 it = parameters.find("flow");
140                                 //user has given a template file
141                                 if(it != parameters.end()){ 
142                                         path = m->hasPath(it->second);
143                                         //if the user has not given a path then, add inputdir. else leave path alone.
144                                         if (path == "") {       parameters["flow"] = inputDir + it->second;             }
145                                 }
146                                 
147                                 it = parameters.find("lookup");
148                                 //user has given a template file
149                                 if(it != parameters.end()){ 
150                                         path = m->hasPath(it->second);
151                                         //if the user has not given a path then, add inputdir. else leave path alone.
152                                         if (path == "") {       parameters["lookup"] = inputDir + it->second;           }
153                                 }
154
155                                 it = parameters.find("file");
156                                 //user has given a template file
157                                 if(it != parameters.end()){ 
158                                         path = m->hasPath(it->second);
159                                         //if the user has not given a path then, add inputdir. else leave path alone.
160                                         if (path == "") {       parameters["file"] = inputDir + it->second;             }
161                                 }
162                         }
163             
164             //if the user changes the output directory command factory will send this info to us in the output parameter 
165                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = ""; }
166             
167                         //check for required parameters
168                         flowFileName = validParameter.validFile(parameters, "flow", true);
169                         flowFilesFileName = validParameter.validFile(parameters, "file", true);
170                         if (flowFileName == "not found" && flowFilesFileName == "not found") {
171                                 m->mothurOut("values for either flow or file must be provided for the shhh.flows command.");
172                                 m->mothurOutEndLine();
173                                 abort = true; 
174                         }
175                         else if (flowFileName == "not open" || flowFilesFileName == "not open") { abort = true; }
176                         
177                         if(flowFileName != "not found"){
178                                 compositeFASTAFileName = "";    
179                                 compositeNamesFileName = "";    
180                         }
181                         else{
182                                 ofstream temp;
183                 
184                 string thisoutputDir = outputDir;
185                 if (outputDir == "") {  thisoutputDir =  m->hasPath(flowFilesFileName); } //if user entered a file with a path then preserve it
186                 
187                                 //we want to rip off .files, and also .flow if its there
188                 string fileroot = m->getRootName(m->getSimpleName(flowFilesFileName));
189                 if (fileroot[fileroot.length()-1] == '.') {  fileroot = fileroot.substr(0, fileroot.length()-1); } //rip off dot
190                 string extension = m->getExtension(fileroot);
191                 if (extension == ".flow") { fileroot = m->getRootName(fileroot);  }
192                 else { fileroot += "."; } //add back if needed
193                 
194                                 compositeFASTAFileName = thisoutputDir + fileroot + "shhh.fasta";
195                                 m->openOutputFile(compositeFASTAFileName, temp);
196                                 temp.close();
197                                 
198                                 compositeNamesFileName = thisoutputDir + fileroot + "shhh.names";
199                                 m->openOutputFile(compositeNamesFileName, temp);
200                                 temp.close();
201                         }
202             
203             if(flowFilesFileName != "not found"){
204                 string fName;
205                 
206                 ifstream flowFilesFile;
207                 m->openInputFile(flowFilesFileName, flowFilesFile);
208                 while(flowFilesFile){
209                     fName = m->getline(flowFilesFile);
210                     
211                     //test if file is valid
212                     ifstream in;
213                     int ableToOpen = m->openInputFile(fName, in, "noerror");
214                     in.close(); 
215                     if (ableToOpen == 1) {
216                         if (inputDir != "") { //default path is set
217                             string tryPath = inputDir + fName;
218                             m->mothurOut("Unable to open " + fName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
219                             ifstream in2;
220                             ableToOpen = m->openInputFile(tryPath, in2, "noerror");
221                             in2.close();
222                             fName = tryPath;
223                         }
224                     }
225                     
226                     if (ableToOpen == 1) {
227                         if (m->getDefaultPath() != "") { //default path is set
228                             string tryPath = m->getDefaultPath() + m->getSimpleName(fName);
229                             m->mothurOut("Unable to open " + fName + ". Trying default " + tryPath); m->mothurOutEndLine();
230                             ifstream in2;
231                             ableToOpen = m->openInputFile(tryPath, in2, "noerror");
232                             in2.close();
233                             fName = tryPath;
234                         }
235                     }
236                     
237                     //if you can't open it its not in current working directory or inputDir, try mothur excutable location
238                     if (ableToOpen == 1) {
239                         string exepath = m->argv;
240                         string tempPath = exepath;
241                         for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
242                         exepath = exepath.substr(0, (tempPath.find_last_of('m')));
243                         
244                         string tryPath = m->getFullPathName(exepath) + m->getSimpleName(fName);
245                         m->mothurOut("Unable to open " + fName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
246                         ifstream in2;
247                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
248                         in2.close();
249                         fName = tryPath;
250                     }
251                     
252                     if (ableToOpen == 1) {  m->mothurOut("Unable to open " + fName + ". Disregarding. "); m->mothurOutEndLine();  }
253                     else { flowFileVector.push_back(fName); }
254                     m->gobble(flowFilesFile);
255                 }
256                 flowFilesFile.close();
257                 if (flowFileVector.size() == 0) {  m->mothurOut("[ERROR]: no valid files."); m->mothurOutEndLine(); abort = true; }
258             }
259             else{
260                 if (outputDir == "") { outputDir = m->hasPath(flowFileName); }
261                 flowFileVector.push_back(flowFileName);
262             }
263                 
264                         //check for optional parameter and set defaults
265                         // ...at some point should added some additional type checking...
266                         string temp;
267                         temp = validParameter.validFile(parameters, "lookup", true);
268                         if (temp == "not found")        {       
269                                 string path = m->argv;
270                 string tempPath = path;
271                 for (int i = 0; i < path.length(); i++) { tempPath[i] = tolower(path[i]); }
272                 path = path.substr(0, (tempPath.find_last_of('m')));
273                 
274 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
275                 path += "lookupFiles/";
276 #else
277                 path += "lookupFiles\\";
278 #endif
279                                 lookupFileName = m->getFullPathName(path) + "LookUp_Titanium.pat";
280                                 
281                                 int ableToOpen;
282                                 ifstream in;
283                                 ableToOpen = m->openInputFile(lookupFileName, in, "noerror");
284                                 in.close();     
285                                 
286                                 //if you can't open it, try input location
287                                 if (ableToOpen == 1) {
288                                         if (inputDir != "") { //default path is set
289                                                 string tryPath = inputDir + m->getSimpleName(lookupFileName);
290                                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
291                                                 ifstream in2;
292                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
293                                                 in2.close();
294                                                 lookupFileName = tryPath;
295                                         }
296                                 }
297                                 
298                                 //if you can't open it, try default location
299                                 if (ableToOpen == 1) {
300                                         if (m->getDefaultPath() != "") { //default path is set
301                                                 string tryPath = m->getDefaultPath() + m->getSimpleName(lookupFileName);
302                                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying default " + tryPath); m->mothurOutEndLine();
303                                                 ifstream in2;
304                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
305                                                 in2.close();
306                                                 lookupFileName = tryPath;
307                                         }
308                                 }
309                                 
310                                 //if you can't open it its not in current working directory or inputDir, try mothur excutable location
311                                 if (ableToOpen == 1) {
312                                         string exepath = m->argv;
313                                         string tempPath = exepath;
314                                         for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
315                                         exepath = exepath.substr(0, (tempPath.find_last_of('m')));
316                                         
317                                         string tryPath = m->getFullPathName(exepath) + m->getSimpleName(lookupFileName);
318                                         m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
319                                         ifstream in2;
320                                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
321                                         in2.close();
322                                         lookupFileName = tryPath;
323                                 }
324                                 
325                                 if (ableToOpen == 1) {  m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true;  }
326                         }
327                         else if(temp == "not open")     {       
328                                 
329                                 lookupFileName = validParameter.validFile(parameters, "lookup", false);
330                                 
331                                 //if you can't open it its not inputDir, try mothur excutable location
332                                 string exepath = m->argv;
333                                 string tempPath = exepath;
334                                 for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
335                                 exepath = exepath.substr(0, (tempPath.find_last_of('m')));
336                                         
337                                 string tryPath = m->getFullPathName(exepath) + m->getSimpleName(lookupFileName);
338                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
339                                 ifstream in2;
340                                 int ableToOpen = m->openInputFile(tryPath, in2, "noerror");
341                                 in2.close();
342                                 lookupFileName = tryPath;
343                                 
344                                 if (ableToOpen == 1) {  m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true;  }
345                         }else                                           {       lookupFileName = temp;  }
346                         
347                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
348                         m->setProcessors(temp);
349                         m->mothurConvert(temp, processors);
350
351                         temp = validParameter.validFile(parameters, "cutoff", false);   if (temp == "not found"){       temp = "0.01";          }
352                         m->mothurConvert(temp, cutoff); 
353                         
354                         temp = validParameter.validFile(parameters, "mindelta", false); if (temp == "not found"){       temp = "0.000001";      }
355                         m->mothurConvert(temp, minDelta); 
356
357                         temp = validParameter.validFile(parameters, "maxiter", false);  if (temp == "not found"){       temp = "1000";          }
358                         m->mothurConvert(temp, maxIters); 
359             
360             temp = validParameter.validFile(parameters, "large", false);        if (temp == "not found"){       temp = "0";             }
361                         m->mothurConvert(temp, largeSize); 
362             if (largeSize != 0) { large = true; }
363             else { large = false;  }
364             if (largeSize < 0) {  m->mothurOut("The value of the large cannot be negative.\n"); }
365             
366 #ifdef USE_MPI
367             if (large) { m->mothurOut("The large parameter is not available with the MPI-Enabled version.\n"); large=false; }                           
368 #endif
369             
370
371                         temp = validParameter.validFile(parameters, "sigma", false);if (temp == "not found")    {       temp = "60";            }
372                         m->mothurConvert(temp, sigma); 
373                         
374                         temp = validParameter.validFile(parameters, "order", false);  if (temp == "not found"){         temp = "A";     }
375             if (temp.length() > 1) {  m->mothurOut("[ERROR]: " + temp + " is not a valid option for order. order options are A, B, or I. A = TACG, B = TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC, and I = TACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGC.\n");  abort=true;
376             }
377             else {
378                 if (toupper(temp[0]) == 'A') {  flowOrder = "TACG";   }
379                 else if(toupper(temp[0]) == 'B'){
380                     flowOrder = "TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC";   }
381                 else if(toupper(temp[0]) == 'I'){
382                     flowOrder = "TACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGC";   }
383                 else {
384                     m->mothurOut("[ERROR]: " + temp + " is not a valid option for order. order options are A, B, or I. A = TACG, B = TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC, and I = TACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGC.\n");  abort=true;
385                 }
386             }
387
388                         
389                 }
390 #ifdef USE_MPI
391                 }                               
392 #endif
393         }
394         catch(exception& e) {
395                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
396                 exit(1);
397         }
398 }
399 //**********************************************************************************************************************
400 #ifdef USE_MPI
401 int ShhherCommand::execute(){
402         try {
403                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
404                 
405                 int tag = 1976;
406                 MPI_Status status; 
407                         
408                 if(pid == 0){
409
410                         for(int i=1;i<ncpus;i++){
411                                 MPI_Send(&abort, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
412                         }
413                         if(abort == 1){ return 0;       }
414
415                         processors = ncpus;
416                         
417                         m->mothurOut("\nGetting preliminary data...\n");
418                         getSingleLookUp();      if (m->control_pressed) { return 0; }
419                         getJointLookUp();       if (m->control_pressed) { return 0; }
420                         
421             vector<string> flowFileVector;
422                         if(flowFilesFileName != "not found"){
423                                 string fName;
424                 
425                                 ifstream flowFilesFile;
426                                 m->openInputFile(flowFilesFileName, flowFilesFile);
427                                 while(flowFilesFile){
428                                         fName = m->getline(flowFilesFile);
429                                         flowFileVector.push_back(fName);
430                                         m->gobble(flowFilesFile);
431                                 }
432                         }
433                         else{
434                                 flowFileVector.push_back(flowFileName);
435                         }
436             
437                         int numFiles = flowFileVector.size();
438
439                         for(int i=1;i<ncpus;i++){
440                                 MPI_Send(&numFiles, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
441                         }
442                         
443                         for(int i=0;i<numFiles;i++){
444                                 
445                                 if (m->control_pressed) { break; }
446                                 
447                                 double begClock = clock();
448                                 unsigned long long begTime = time(NULL);
449
450                                 flowFileName = flowFileVector[i];
451                                 
452                                 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
453                                 m->mothurOut("Reading flowgrams...\n");
454                                 getFlowData();
455                                 
456                                 if (m->control_pressed) { break; }
457
458                                 m->mothurOut("Identifying unique flowgrams...\n");
459                                 getUniques();
460                                 
461                                 if (m->control_pressed) { break; }
462
463                                 m->mothurOut("Calculating distances between flowgrams...\n");
464                                 char fileName[1024];
465                                 strcpy(fileName, flowFileName.c_str());
466
467                                 for(int i=1;i<ncpus;i++){
468                                         MPI_Send(&fileName[0], 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
469
470                                         MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
471                                         MPI_Send(&numUniques, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
472                                         MPI_Send(&numFlowCells, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
473                                         MPI_Send(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, i, tag, MPI_COMM_WORLD);
474                                         MPI_Send(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
475                                         MPI_Send(&mapUniqueToSeq[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
476                                         MPI_Send(&mapSeqToUnique[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
477                                         MPI_Send(&lengths[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
478                                         MPI_Send(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
479                                         MPI_Send(&cutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
480                                 }                               
481                                                         
482                                 string distFileName = flowDistMPI(0, int(sqrt(1.0/float(ncpus)) * numUniques));
483                                 
484                                 if (m->control_pressed) { break; }
485                                 
486                                 int done;
487                                 for(int i=1;i<ncpus;i++){
488                                         MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
489                                         
490                                         m->appendFiles((distFileName + ".temp." + toString(i)), distFileName);
491                                         m->mothurRemove((distFileName + ".temp." + toString(i)));
492                                 }
493
494                                 string namesFileName = createNamesFile();
495                                 
496                                 if (m->control_pressed) { break; }
497                                 
498                                 m->mothurOut("\nClustering flowgrams...\n");
499                                 string listFileName = cluster(distFileName, namesFileName);
500                                 
501                                 if (m->control_pressed) { break; }
502                                 
503                 
504                 
505                                 getOTUData(listFileName);
506
507                                 m->mothurRemove(distFileName);
508                                 m->mothurRemove(namesFileName);
509                                 m->mothurRemove(listFileName);
510                                 
511                                 if (m->control_pressed) { break; }
512                                 
513                                 initPyroCluster();
514
515                                 if (m->control_pressed) { break; }
516                                 
517             
518                                 for(int i=1;i<ncpus;i++){
519                                         MPI_Send(&numOTUs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
520                                         MPI_Send(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
521                                         MPI_Send(&uniqueFlowgrams[0], numFlowCells * numUniques, MPI_SHORT, i, tag, MPI_COMM_WORLD);
522                                         MPI_Send(&sigma, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
523                                 }
524                                 
525                                 if (m->control_pressed) { break; }
526                                 
527                                 double maxDelta = 0;
528                                 int iter = 0;
529                                 
530                                 int numOTUsOnCPU = numOTUs / ncpus;
531                                 int numSeqsOnCPU = numSeqs / ncpus;
532                                 m->mothurOut("\nDenoising flowgrams...\n");
533                                 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
534                                 
535                                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
536                                         
537                                         double cycClock = clock();
538                                         unsigned long long cycTime = time(NULL);
539                                         fill();
540                                         
541                                         if (m->control_pressed) { break; }
542
543                                         int total = singleTau.size();
544                                         for(int i=1;i<ncpus;i++){
545                                                 MPI_Send(&total, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
546                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
547                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
548                                                 
549                                                 MPI_Send(&singleTau[0], total, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
550                                                 MPI_Send(&seqNumber[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
551                                                 MPI_Send(&seqIndex[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
552                                                 MPI_Send(&nSeqsPerOTU[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
553                                                 MPI_Send(&cumNumSeqs[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
554                                         }
555                                         
556                                         calcCentroidsDriver(0, numOTUsOnCPU);
557                                         
558                                         for(int i=1;i<ncpus;i++){
559                                                 int otuStart = i * numOTUs / ncpus;
560                                                 int otuStop = (i + 1) * numOTUs / ncpus;
561                                                 
562                                                 vector<int> tempCentroids(numOTUs, 0);
563                                                 vector<short> tempChange(numOTUs, 0);
564                                                 
565                                                 MPI_Recv(&tempCentroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
566                                                 MPI_Recv(&tempChange[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD, &status);
567                                                 
568                                                 for(int j=otuStart;j<otuStop;j++){
569                                                         centroids[j] = tempCentroids[j];
570                                                         change[j] = tempChange[j];
571                                                 }
572                                         }
573                                                                         
574                                         maxDelta = getNewWeights(); if (m->control_pressed) { break; }
575                                         double nLL = getLikelihood(); if (m->control_pressed) { break; }
576                                         checkCentroids(); if (m->control_pressed) { break; }
577                                         
578                                         for(int i=1;i<ncpus;i++){
579                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
580                                                 MPI_Send(&weight[0], numOTUs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
581                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
582                                         }
583                                         
584                                         calcNewDistancesParent(0, numSeqsOnCPU);
585
586                                         total = singleTau.size();
587
588                                         for(int i=1;i<ncpus;i++){
589                                                 int childTotal;
590                                                 int seqStart = i * numSeqs / ncpus;
591                                                 int seqStop = (i + 1) * numSeqs / ncpus;
592                                                 
593                                                 MPI_Recv(&childTotal, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
594
595                                                 vector<int> childSeqIndex(childTotal, 0);
596                                                 vector<double> childSingleTau(childTotal, 0);
597                                                 vector<double> childDist(numSeqs * numOTUs, 0);
598                                                 vector<int> otuIndex(childTotal, 0);
599                                                 
600                                                 MPI_Recv(&childSeqIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
601                                                 MPI_Recv(&childSingleTau[0], childTotal, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
602                                                 MPI_Recv(&childDist[0], numOTUs * numSeqs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
603                                                 MPI_Recv(&otuIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
604
605                                                 int oldTotal = total;
606                                                 total += childTotal;
607                                                 singleTau.resize(total, 0);
608                                                 seqIndex.resize(total, 0);
609                                                 seqNumber.resize(total, 0);
610                                                 
611                                                 int childIndex = 0;
612                                                 
613                                                 for(int j=oldTotal;j<total;j++){
614                                                         int otuI = otuIndex[childIndex];
615                                                         int seqI = childSeqIndex[childIndex];
616                                                         
617                                                         singleTau[j] = childSingleTau[childIndex];
618                                                         
619                                                         aaP[otuI][nSeqsPerOTU[otuI]] = j;
620                                                         aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
621                                                         nSeqsPerOTU[otuI]++;
622                                                         childIndex++;
623                                                 }
624                                                 
625                                                 int index = seqStart * numOTUs;
626                                                 for(int j=seqStart;j<seqStop;j++){
627                                                         for(int k=0;k<numOTUs;k++){
628                                                                 dist[index] = childDist[index];
629                                                                 index++;
630                                                         }
631                                                 }                                       
632                                         }
633                                         
634                                         iter++;
635                                         
636                                         m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');                  
637
638                                         if((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
639                                                 int live = 1;
640                                                 for(int i=1;i<ncpus;i++){
641                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
642                                                 }
643                                         }
644                                         else{
645                                                 int live = 0;
646                                                 for(int i=1;i<ncpus;i++){
647                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD); //send kill command
648                                                 }
649                                         }
650                                         
651                                 }       
652                                 
653                                 if (m->control_pressed) { break; }
654                                 
655                                 m->mothurOut("\nFinalizing...\n");
656                                 fill();
657                                 
658                                 if (m->control_pressed) { break; }
659                                 
660                                 setOTUs();
661                                 
662                                 vector<int> otuCounts(numOTUs, 0);
663                                 for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
664                                 calcCentroidsDriver(0, numOTUs);
665                                 
666                                 if (m->control_pressed) { break; }
667                                 
668                                 writeQualities(otuCounts);      if (m->control_pressed) { break; }
669                                 writeSequences(otuCounts);      if (m->control_pressed) { break; }
670                                 writeNames(otuCounts);          if (m->control_pressed) { break; }
671                                 writeClusters(otuCounts);       if (m->control_pressed) { break; }
672                                 writeGroups();                          if (m->control_pressed) { break; }
673                                 
674                                                                  
675                                 m->mothurOut("Total time to process " + toString(flowFileName) + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');                 
676                         }
677                 }
678                 else{
679                         int abort = 1;
680
681                         MPI_Recv(&abort, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
682                         if(abort){      return 0;       }
683
684                         int numFiles;
685                         MPI_Recv(&numFiles, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
686
687                         for(int i=0;i<numFiles;i++){
688                                 
689                                 if (m->control_pressed) { break; }
690                                 
691                                 //Now into the pyrodist part
692                                 bool live = 1;
693
694                                 char fileName[1024];
695                                 MPI_Recv(&fileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
696                                 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
697                                 MPI_Recv(&numUniques, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
698                                 MPI_Recv(&numFlowCells, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
699                                 
700                                 flowDataIntI.resize(numSeqs * numFlowCells);
701                                 flowDataPrI.resize(numSeqs * numFlowCells);
702                                 mapUniqueToSeq.resize(numSeqs);
703                                 mapSeqToUnique.resize(numSeqs);
704                                 lengths.resize(numSeqs);
705                                 jointLookUp.resize(NUMBINS * NUMBINS);
706
707                                 MPI_Recv(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
708                                 MPI_Recv(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
709                                 MPI_Recv(&mapUniqueToSeq[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
710                                 MPI_Recv(&mapSeqToUnique[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
711                                 MPI_Recv(&lengths[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
712                                 MPI_Recv(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
713                                 MPI_Recv(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
714
715                                 flowFileName = string(fileName);
716                                 int flowDistStart = int(sqrt(float(pid)/float(ncpus)) * numUniques);
717                                 int flowDistEnd = int(sqrt(float(pid+1)/float(ncpus)) * numUniques);
718                                 
719                                 string distanceStringChild = flowDistMPI(flowDistStart, flowDistEnd);
720                                 
721                                 if (m->control_pressed) { break; }
722                                 
723                                 int done = 1;
724                                 MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
725
726                                 //Now into the pyronoise part
727                                 MPI_Recv(&numOTUs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
728                                 
729                                 singleLookUp.resize(HOMOPS * NUMBINS);
730                                 uniqueFlowgrams.resize(numUniques * numFlowCells);
731                                 weight.resize(numOTUs);
732                                 centroids.resize(numOTUs);
733                                 change.resize(numOTUs);
734                                 dist.assign(numOTUs * numSeqs, 0);
735                                 nSeqsPerOTU.resize(numOTUs);
736                                 cumNumSeqs.resize(numOTUs);
737
738                                 MPI_Recv(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
739                                 MPI_Recv(&uniqueFlowgrams[0], uniqueFlowgrams.size(), MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
740                                 MPI_Recv(&sigma, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
741                                 
742                                 int startOTU = pid * numOTUs / ncpus;
743                                 int endOTU = (pid + 1) * numOTUs / ncpus;
744
745                                 int startSeq = pid * numSeqs / ncpus;
746                                 int endSeq = (pid + 1) * numSeqs /ncpus;
747                                 
748                                 int total;
749
750                                 while(live){
751                                         
752                                         if (m->control_pressed) { break; }
753                                         
754                                         MPI_Recv(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
755                                         singleTau.assign(total, 0.0000);
756                                         seqNumber.assign(total, 0);
757                                         seqIndex.assign(total, 0);
758
759                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
760                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
761                                         MPI_Recv(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
762                                         MPI_Recv(&seqNumber[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
763                                         MPI_Recv(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
764                                         MPI_Recv(&nSeqsPerOTU[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
765                                         MPI_Recv(&cumNumSeqs[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
766
767                                         calcCentroidsDriver(startOTU, endOTU);
768
769                                         MPI_Send(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD);
770                                         MPI_Send(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD);
771
772                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
773                                         MPI_Recv(&weight[0], numOTUs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
774                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
775
776                                         vector<int> otuIndex(total, 0);
777                                         calcNewDistancesChildMPI(startSeq, endSeq, otuIndex);
778                                         total = otuIndex.size();
779                                         
780                                         MPI_Send(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
781                                         MPI_Send(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
782                                         MPI_Send(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
783                                         MPI_Send(&dist[0], numOTUs * numSeqs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
784                                         MPI_Send(&otuIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
785                                 
786                                         MPI_Recv(&live, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
787                                 }
788                         }
789                 }
790                 
791                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
792                 
793                 MPI_Barrier(MPI_COMM_WORLD);
794
795                 
796                 if(compositeFASTAFileName != ""){
797                         outputNames.push_back(compositeFASTAFileName); outputTypes["fasta"].push_back(compositeFASTAFileName);
798                         outputNames.push_back(compositeNamesFileName); outputTypes["name"].push_back(compositeNamesFileName); 
799                 }
800
801                 m->mothurOutEndLine();
802                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
803                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
804                 m->mothurOutEndLine();
805                 
806                 return 0;
807
808         }
809         catch(exception& e) {
810                 m->errorOut(e, "ShhherCommand", "execute");
811                 exit(1);
812         }
813 }
814 /**************************************************************************************************/
815 string ShhherCommand::createNamesFile(){
816         try{
817                 
818                 vector<string> duplicateNames(numUniques, "");
819                 for(int i=0;i<numSeqs;i++){
820                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
821                 }
822                 map<string, string> variables; 
823                 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(flowFileName));
824                 string nameFileName = getOutputFileName("name",variables);
825                 
826                 ofstream nameFile;
827                 m->openOutputFile(nameFileName, nameFile);
828                 
829                 for(int i=0;i<numUniques;i++){
830                         
831                         if (m->control_pressed) { break; }
832                         
833             //                  nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
834                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
835                 }
836                 
837                 nameFile.close();
838                 return  nameFileName;
839         }
840         catch(exception& e) {
841                 m->errorOut(e, "ShhherCommand", "createNamesFile");
842                 exit(1);
843         }
844 }
845 /**************************************************************************************************/
846
847 string ShhherCommand::flowDistMPI(int startSeq, int stopSeq){
848         try{            
849                 ostringstream outStream;
850                 outStream.setf(ios::fixed, ios::floatfield);
851                 outStream.setf(ios::dec, ios::basefield);
852                 outStream.setf(ios::showpoint);
853                 outStream.precision(6);
854                 
855                 int begTime = time(NULL);
856                 double begClock = clock();
857                 
858                 for(int i=startSeq;i<stopSeq;i++){
859                         
860                         if (m->control_pressed) { break; }
861                         
862                         for(int j=0;j<i;j++){
863                                 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
864                                 
865                                 if(flowDistance < 1e-6){
866                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
867                                 }
868                                 else if(flowDistance <= cutoff){
869                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
870                                 }
871                         }
872                         if(i % 100 == 0){
873                                 m->mothurOutJustToScreen(toString(i) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
874                         }
875                 }
876                 
877                 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
878                 if(pid != 0){   fDistFileName += ".temp." + toString(pid);      }
879                 
880                 if (m->control_pressed) { return fDistFileName; }
881                 
882                 m->mothurOutJustToScreen(toString(stopSeq) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
883
884                 ofstream distFile(fDistFileName.c_str());
885                 distFile << outStream.str();            
886                 distFile.close();
887                 
888                 return fDistFileName;
889         }
890         catch(exception& e) {
891                 m->errorOut(e, "ShhherCommand", "flowDistMPI");
892                 exit(1);
893         }
894 }
895 /**************************************************************************************************/
896  
897 void ShhherCommand::getOTUData(string listFileName){
898     try {
899         
900         ifstream listFile;
901         m->openInputFile(listFileName, listFile);
902         string label;
903         
904         listFile >> label >> numOTUs;
905         
906         otuData.assign(numSeqs, 0);
907         cumNumSeqs.assign(numOTUs, 0);
908         nSeqsPerOTU.assign(numOTUs, 0);
909         aaP.clear();aaP.resize(numOTUs);
910         
911         seqNumber.clear();
912         aaI.clear();
913         seqIndex.clear();
914         
915         string singleOTU = "";
916         
917         for(int i=0;i<numOTUs;i++){
918             
919             if (m->control_pressed) { break; }
920             
921             listFile >> singleOTU;
922             
923             istringstream otuString(singleOTU);
924             
925             while(otuString){
926                 
927                 string seqName = "";
928                 
929                 for(int j=0;j<singleOTU.length();j++){
930                     char letter = otuString.get();
931                     
932                     if(letter != ','){
933                         seqName += letter;
934                     }
935                     else{
936                         map<string,int>::iterator nmIt = nameMap.find(seqName);
937                         int index = nmIt->second;
938                         
939                         nameMap.erase(nmIt);
940                         
941                         otuData[index] = i;
942                         nSeqsPerOTU[i]++;
943                         aaP[i].push_back(index);
944                         seqName = "";
945                     }
946                 }
947                 
948                 map<string,int>::iterator nmIt = nameMap.find(seqName);
949                 
950                 int index = nmIt->second;
951                 nameMap.erase(nmIt);
952                 
953                 otuData[index] = i;
954                 nSeqsPerOTU[i]++;
955                 aaP[i].push_back(index);        
956                 
957                 otuString.get();
958             }
959             
960             sort(aaP[i].begin(), aaP[i].end());
961             for(int j=0;j<nSeqsPerOTU[i];j++){
962                 seqNumber.push_back(aaP[i][j]);
963             }
964             for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
965                 aaP[i].push_back(0);
966             }
967             
968             
969         }
970         
971         for(int i=1;i<numOTUs;i++){
972             cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
973         }
974         aaI = aaP;
975         seqIndex = seqNumber;
976         
977         listFile.close();       
978         
979     }
980     catch(exception& e) {
981         m->errorOut(e, "ShhherCommand", "getOTUData");
982         exit(1);        
983     }           
984 }
985
986 /**************************************************************************************************/
987
988 void ShhherCommand::initPyroCluster(){                          
989     try{
990         if (numOTUs < processors) { processors = 1; }
991         
992         if (m->debug) { m->mothurOut("[DEBUG]: numSeqs = " + toString(numSeqs) + " numOTUS = " + toString(numOTUs) + " about to alloc a dist vector with size = " + toString((numSeqs * numOTUs)) + ".\n"); }
993         
994         dist.assign(numSeqs * numOTUs, 0);
995         change.assign(numOTUs, 1);
996         centroids.assign(numOTUs, -1);
997         weight.assign(numOTUs, 0);
998         singleTau.assign(numSeqs, 1.0);
999         
1000         nSeqsBreaks.assign(processors+1, 0);
1001         nOTUsBreaks.assign(processors+1, 0);
1002         
1003         if (m->debug) { m->mothurOut("[DEBUG]: made it through the memory allocation.\n"); }
1004         
1005         nSeqsBreaks[0] = 0;
1006         for(int i=0;i<processors;i++){
1007             nSeqsBreaks[i+1] = nSeqsBreaks[i] + (int)((double) numSeqs / (double) processors);
1008             nOTUsBreaks[i+1] = nOTUsBreaks[i] + (int)((double) numOTUs / (double) processors);
1009         }
1010         nSeqsBreaks[processors] = numSeqs;
1011         nOTUsBreaks[processors] = numOTUs;
1012     }
1013     catch(exception& e) {
1014         m->errorOut(e, "ShhherCommand", "initPyroCluster");
1015         exit(1);        
1016     }
1017 }
1018
1019 /**************************************************************************************************/
1020
1021 void ShhherCommand::fill(){
1022     try {
1023         int index = 0;
1024         for(int i=0;i<numOTUs;i++){
1025             
1026             if (m->control_pressed) { break; }
1027             
1028             cumNumSeqs[i] = index;
1029             for(int j=0;j<nSeqsPerOTU[i];j++){
1030                 seqNumber[index] = aaP[i][j];
1031                 seqIndex[index] = aaI[i][j];
1032                 
1033                 index++;
1034             }
1035         }
1036     }
1037     catch(exception& e) {
1038         m->errorOut(e, "ShhherCommand", "fill");
1039         exit(1);        
1040     }           
1041 }
1042
1043 /**************************************************************************************************/
1044  
1045 void ShhherCommand::getFlowData(){
1046     try{
1047         ifstream flowFile;
1048         m->openInputFile(flowFileName, flowFile);
1049         
1050         string seqName;
1051         seqNameVector.clear();
1052         lengths.clear();
1053         flowDataIntI.clear();
1054         nameMap.clear();
1055         
1056         
1057         int currentNumFlowCells;
1058         
1059         float intensity;
1060         
1061         string numFlowTest;
1062         flowFile >> numFlowTest;
1063         
1064         if (!m->isContainingOnlyDigits(numFlowTest)) { m->mothurOut("[ERROR]: expected a number and got " + numFlowTest + ", quitting. Did you use the flow parameter instead of the file parameter?"); m->mothurOutEndLine(); exit(1); }
1065         else { convert(numFlowTest, numFlowCells); }
1066         
1067         int index = 0;//pcluster
1068         while(!flowFile.eof()){
1069             
1070             if (m->control_pressed) { break; }
1071             
1072             flowFile >> seqName >> currentNumFlowCells;
1073             lengths.push_back(currentNumFlowCells);
1074             
1075             seqNameVector.push_back(seqName);
1076             nameMap[seqName] = index++;//pcluster
1077             
1078             for(int i=0;i<numFlowCells;i++){
1079                 flowFile >> intensity;
1080                 if(intensity > 9.99)    {       intensity = 9.99;       }
1081                 int intI = int(100 * intensity + 0.0001);
1082                 flowDataIntI.push_back(intI);
1083             }
1084             m->gobble(flowFile);
1085         }
1086         flowFile.close();
1087         
1088         numSeqs = seqNameVector.size();         
1089         
1090         for(int i=0;i<numSeqs;i++){
1091             
1092             if (m->control_pressed) { break; }
1093             
1094             int iNumFlowCells = i * numFlowCells;
1095             for(int j=lengths[i];j<numFlowCells;j++){
1096                 flowDataIntI[iNumFlowCells + j] = 0;
1097             }
1098         }
1099         
1100     }
1101     catch(exception& e) {
1102         m->errorOut(e, "ShhherCommand", "getFlowData");
1103         exit(1);
1104     }
1105 }
1106 /**************************************************************************************************/
1107 void ShhherCommand::calcNewDistancesChildMPI(int startSeq, int stopSeq, vector<int>& otuIndex){
1108         
1109         try{
1110                 vector<double> newTau(numOTUs,0);
1111                 vector<double> norms(numSeqs, 0);
1112                 otuIndex.clear();
1113                 seqIndex.clear();
1114                 singleTau.clear();
1115                 
1116                 for(int i=startSeq;i<stopSeq;i++){
1117                         
1118                         if (m->control_pressed) { break; }
1119                         
1120                         double offset = 1e8;
1121                         int indexOffset = i * numOTUs;
1122                         
1123                         for(int j=0;j<numOTUs;j++){
1124                                 
1125                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1126                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1127                                 }
1128                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1129                                         offset = dist[indexOffset + j];
1130                                 }
1131                         }
1132                         
1133                         for(int j=0;j<numOTUs;j++){
1134                                 if(weight[j] > MIN_WEIGHT){
1135                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1136                                         norms[i] += newTau[j];
1137                                 }
1138                                 else{
1139                                         newTau[j] = 0.0;
1140                                 }
1141                         }
1142                         
1143                         for(int j=0;j<numOTUs;j++){
1144                 
1145                                 newTau[j] /= norms[i];
1146                                 
1147                                 if(newTau[j] > MIN_TAU){
1148                                         otuIndex.push_back(j);
1149                                         seqIndex.push_back(i);
1150                                         singleTau.push_back(newTau[j]);
1151                                 }
1152                         }
1153                         
1154                 }
1155         }
1156         catch(exception& e) {
1157                 m->errorOut(e, "ShhherCommand", "calcNewDistancesChildMPI");
1158                 exit(1);        
1159         }               
1160 }
1161
1162 /**************************************************************************************************/
1163
1164 void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){
1165     
1166     try{
1167         
1168         int total = 0;
1169         vector<double> newTau(numOTUs,0);
1170         vector<double> norms(numSeqs, 0);
1171         nSeqsPerOTU.assign(numOTUs, 0);
1172         
1173         for(int i=startSeq;i<stopSeq;i++){
1174             
1175             if (m->control_pressed) { break; }
1176             
1177             int indexOffset = i * numOTUs;
1178             
1179             double offset = 1e8;
1180             
1181             for(int j=0;j<numOTUs;j++){
1182                 
1183                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1184                     dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1185                 }
1186                 
1187                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1188                     offset = dist[indexOffset + j];
1189                 }
1190             }
1191             
1192             for(int j=0;j<numOTUs;j++){
1193                 if(weight[j] > MIN_WEIGHT){
1194                     newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1195                     norms[i] += newTau[j];
1196                 }
1197                 else{
1198                     newTau[j] = 0.0;
1199                 }
1200             }
1201             
1202             for(int j=0;j<numOTUs;j++){
1203                 newTau[j] /= norms[i];
1204             }
1205             
1206             for(int j=0;j<numOTUs;j++){
1207                 if(newTau[j] > MIN_TAU){
1208                     
1209                     int oldTotal = total;
1210                     
1211                     total++;
1212                     
1213                     singleTau.resize(total, 0);
1214                     seqNumber.resize(total, 0);
1215                     seqIndex.resize(total, 0);
1216                     
1217                     singleTau[oldTotal] = newTau[j];
1218                     
1219                     aaP[j][nSeqsPerOTU[j]] = oldTotal;
1220                     aaI[j][nSeqsPerOTU[j]] = i;
1221                     nSeqsPerOTU[j]++;
1222                 }
1223             }
1224             
1225         }
1226         
1227     }
1228     catch(exception& e) {
1229         m->errorOut(e, "ShhherCommand", "calcNewDistancesParent");
1230         exit(1);        
1231     }           
1232 }
1233
1234 /**************************************************************************************************/
1235
1236 void ShhherCommand::setOTUs(){
1237     
1238     try {
1239         vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
1240         
1241         for(int i=0;i<numOTUs;i++){
1242             
1243             if (m->control_pressed) { break; }
1244             
1245             for(int j=0;j<nSeqsPerOTU[i];j++){
1246                 int index = cumNumSeqs[i] + j;
1247                 double tauValue = singleTau[seqNumber[index]];
1248                 int sIndex = seqIndex[index];
1249                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
1250             }
1251         }
1252         
1253         for(int i=0;i<numSeqs;i++){
1254             double maxTau = -1.0000;
1255             int maxOTU = -1;
1256             for(int j=0;j<numOTUs;j++){
1257                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
1258                     maxTau = bigTauMatrix[i * numOTUs + j];
1259                     maxOTU = j;
1260                 }
1261             }
1262             
1263             otuData[i] = maxOTU;
1264         }
1265         
1266         nSeqsPerOTU.assign(numOTUs, 0);         
1267         
1268         for(int i=0;i<numSeqs;i++){
1269             int index = otuData[i];
1270             
1271             singleTau[i] = 1.0000;
1272             dist[i] = 0.0000;
1273             
1274             aaP[index][nSeqsPerOTU[index]] = i;
1275             aaI[index][nSeqsPerOTU[index]] = i;
1276             
1277             nSeqsPerOTU[index]++;
1278         }
1279         fill(); 
1280     }
1281     catch(exception& e) {
1282         m->errorOut(e, "ShhherCommand", "setOTUs");
1283         exit(1);        
1284     }           
1285 }
1286
1287 /**************************************************************************************************/
1288  
1289 void ShhherCommand::getUniques(){
1290     try{
1291         
1292         
1293         numUniques = 0;
1294         uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
1295         uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
1296         uniqueLengths.assign(numSeqs, 0);
1297         mapSeqToUnique.assign(numSeqs, -1);
1298         mapUniqueToSeq.assign(numSeqs, -1);
1299         
1300         vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
1301         
1302         for(int i=0;i<numSeqs;i++){
1303             
1304             if (m->control_pressed) { break; }
1305             
1306             int index = 0;
1307             
1308             vector<short> current(numFlowCells);
1309             for(int j=0;j<numFlowCells;j++){
1310                 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
1311             }
1312             
1313             for(int j=0;j<numUniques;j++){
1314                 int offset = j * numFlowCells;
1315                 bool toEnd = 1;
1316                 
1317                 int shorterLength;
1318                 if(lengths[i] < uniqueLengths[j])       {       shorterLength = lengths[i];                     }
1319                 else                                                            {       shorterLength = uniqueLengths[j];       }
1320                 
1321                 for(int k=0;k<shorterLength;k++){
1322                     if(current[k] != uniqueFlowgrams[offset + k]){
1323                         toEnd = 0;
1324                         break;
1325                     }
1326                 }
1327                 
1328                 if(toEnd){
1329                     mapSeqToUnique[i] = j;
1330                     uniqueCount[j]++;
1331                     index = j;
1332                     if(lengths[i] > uniqueLengths[j])   {       uniqueLengths[j] = lengths[i];  }
1333                     break;
1334                 }
1335                 index++;
1336             }
1337             
1338             if(index == numUniques){
1339                 uniqueLengths[numUniques] = lengths[i];
1340                 uniqueCount[numUniques] = 1;
1341                 mapSeqToUnique[i] = numUniques;//anMap
1342                 mapUniqueToSeq[numUniques] = i;//anF
1343                 
1344                 for(int k=0;k<numFlowCells;k++){
1345                     uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
1346                     uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
1347                 }
1348                 
1349                 numUniques++;
1350             }
1351         }
1352         uniqueFlowDataIntI.resize(numFlowCells * numUniques);
1353         uniqueLengths.resize(numUniques);       
1354         
1355         flowDataPrI.resize(numSeqs * numFlowCells, 0);
1356         for(int i=0;i<flowDataPrI.size();i++)   {       if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);          }
1357     }
1358     catch(exception& e) {
1359         m->errorOut(e, "ShhherCommand", "getUniques");
1360         exit(1);
1361     }
1362 }
1363
1364 /**************************************************************************************************/
1365
1366 float ShhherCommand::calcPairwiseDist(int seqA, int seqB){
1367     try{
1368         int minLength = lengths[mapSeqToUnique[seqA]];
1369         if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
1370         
1371         int ANumFlowCells = seqA * numFlowCells;
1372         int BNumFlowCells = seqB * numFlowCells;
1373         
1374         float dist = 0;
1375         
1376         for(int i=0;i<minLength;i++){
1377             
1378             if (m->control_pressed) { break; }
1379             
1380             int flowAIntI = flowDataIntI[ANumFlowCells + i];
1381             float flowAPrI = flowDataPrI[ANumFlowCells + i];
1382             
1383             int flowBIntI = flowDataIntI[BNumFlowCells + i];
1384             float flowBPrI = flowDataPrI[BNumFlowCells + i];
1385             dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
1386         }
1387         
1388         dist /= (float) minLength;
1389         return dist;
1390     }
1391     catch(exception& e) {
1392         m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
1393         exit(1);
1394     }
1395 }
1396
1397 //**********************************************************************************************************************/
1398
1399 string ShhherCommand::cluster(string distFileName, string namesFileName){
1400     try {
1401         
1402         ReadMatrix* read = new ReadColumnMatrix(distFileName);  
1403                 read->setCutoff(cutoff);
1404                 
1405                 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
1406                 clusterNameMap->readMap();
1407                 read->read(clusterNameMap);
1408         
1409                 ListVector* list = read->getListVector();
1410                 SparseDistanceMatrix* matrix = read->getDMatrix();
1411                 
1412                 delete read; 
1413                 delete clusterNameMap; 
1414         
1415         RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
1416         
1417         Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); 
1418         string tag = cluster->getTag();
1419         
1420         double clusterCutoff = cutoff;
1421         while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
1422             
1423             if (m->control_pressed) { break; }
1424             
1425             cluster->update(clusterCutoff);
1426         }
1427         
1428         list->setLabel(toString(cutoff));
1429         
1430         string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
1431         ofstream listFile;
1432         m->openOutputFile(listFileName, listFile);
1433         list->print(listFile);
1434         listFile.close();
1435         
1436         delete matrix;  delete cluster; delete rabund; delete list;
1437         
1438         return listFileName;
1439     }
1440     catch(exception& e) {
1441         m->errorOut(e, "ShhherCommand", "cluster");
1442         exit(1);        
1443     }           
1444 }
1445
1446 /**************************************************************************************************/
1447
1448 void ShhherCommand::calcCentroidsDriver(int start, int finish){                          
1449     
1450     //this function gets the most likely homopolymer length at a flow position for a group of sequences
1451     //within an otu
1452     
1453     try{
1454         
1455         for(int i=start;i<finish;i++){
1456             
1457             if (m->control_pressed) { break; }
1458             
1459             double count = 0;
1460             int position = 0;
1461             int minFlowGram = 100000000;
1462             double minFlowValue = 1e8;
1463             change[i] = 0; //FALSE
1464             
1465             for(int j=0;j<nSeqsPerOTU[i];j++){
1466                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
1467             }
1468             
1469             if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
1470                 vector<double> adF(nSeqsPerOTU[i]);
1471                 vector<int> anL(nSeqsPerOTU[i]);
1472                 
1473                 for(int j=0;j<nSeqsPerOTU[i];j++){
1474                     int index = cumNumSeqs[i] + j;
1475                     int nI = seqIndex[index];
1476                     int nIU = mapSeqToUnique[nI];
1477                     
1478                     int k;
1479                     for(k=0;k<position;k++){
1480                         if(nIU == anL[k]){
1481                             break;
1482                         }
1483                     }
1484                     if(k == position){
1485                         anL[position] = nIU;
1486                         adF[position] = 0.0000;
1487                         position++;
1488                     }                                           
1489                 }
1490                 
1491                 for(int j=0;j<nSeqsPerOTU[i];j++){
1492                     int index = cumNumSeqs[i] + j;
1493                     int nI = seqIndex[index];
1494                     
1495                     double tauValue = singleTau[seqNumber[index]];
1496                     
1497                     for(int k=0;k<position;k++){
1498                         double dist = getDistToCentroid(anL[k], nI, lengths[nI]);
1499                         adF[k] += dist * tauValue;
1500                     }
1501                 }
1502                 
1503                 for(int j=0;j<position;j++){
1504                     if(adF[j] < minFlowValue){
1505                         minFlowGram = j;
1506                         minFlowValue = adF[j];
1507                     }
1508                 }
1509                 
1510                 if(centroids[i] != anL[minFlowGram]){
1511                     change[i] = 1;
1512                     centroids[i] = anL[minFlowGram];
1513                 }
1514             }
1515             else if(centroids[i] != -1){
1516                 change[i] = 1;
1517                 centroids[i] = -1;                      
1518             }
1519         }
1520     }
1521     catch(exception& e) {
1522         m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1523         exit(1);        
1524     }           
1525 }
1526
1527 /**************************************************************************************************/
1528
1529 double ShhherCommand::getDistToCentroid(int cent, int flow, int length){
1530     try{
1531         
1532         int flowAValue = cent * numFlowCells;
1533         int flowBValue = flow * numFlowCells;
1534         
1535         double dist = 0;
1536         
1537         for(int i=0;i<length;i++){
1538             dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
1539             flowAValue++;
1540             flowBValue++;
1541         }
1542         
1543         return dist / (double)length;
1544     }
1545     catch(exception& e) {
1546         m->errorOut(e, "ShhherCommand", "getDistToCentroid");
1547         exit(1);        
1548     }           
1549 }
1550
1551 /**************************************************************************************************/
1552
1553 double ShhherCommand::getNewWeights(){
1554     try{
1555         
1556         double maxChange = 0;
1557         
1558         for(int i=0;i<numOTUs;i++){
1559             
1560             if (m->control_pressed) { break; }
1561             
1562             double difference = weight[i];
1563             weight[i] = 0;
1564             
1565             for(int j=0;j<nSeqsPerOTU[i];j++){
1566                 int index = cumNumSeqs[i] + j;
1567                 double tauValue = singleTau[seqNumber[index]];
1568                 weight[i] += tauValue;
1569             }
1570             
1571             difference = fabs(weight[i] - difference);
1572             if(difference > maxChange){ maxChange = difference; }
1573         }
1574         return maxChange;
1575     }
1576     catch(exception& e) {
1577         m->errorOut(e, "ShhherCommand", "getNewWeights");
1578         exit(1);        
1579     }           
1580 }
1581  
1582  /**************************************************************************************************/
1583  
1584 double ShhherCommand::getLikelihood(){
1585     
1586     try{
1587         
1588         vector<long double> P(numSeqs, 0);
1589         int effNumOTUs = 0;
1590         
1591         for(int i=0;i<numOTUs;i++){
1592             if(weight[i] > MIN_WEIGHT){
1593                 effNumOTUs++;
1594             }
1595         }
1596         
1597         string hold;
1598         for(int i=0;i<numOTUs;i++){
1599             
1600             if (m->control_pressed) { break; }
1601             
1602             for(int j=0;j<nSeqsPerOTU[i];j++){
1603                 int index = cumNumSeqs[i] + j;
1604                 int nI = seqIndex[index];
1605                 double singleDist = dist[seqNumber[index]];
1606                 
1607                 P[nI] += weight[i] * exp(-singleDist * sigma);
1608             }
1609         }
1610         double nLL = 0.00;
1611         for(int i=0;i<numSeqs;i++){
1612             if(P[i] == 0){      P[i] = DBL_EPSILON;     }
1613             
1614             nLL += -log(P[i]);
1615         }
1616         
1617         nLL = nLL -(double)numSeqs * log(sigma);
1618         
1619         return nLL; 
1620     }
1621     catch(exception& e) {
1622         m->errorOut(e, "ShhherCommand", "getNewWeights");
1623         exit(1);        
1624     }           
1625 }
1626
1627 /**************************************************************************************************/
1628
1629 void ShhherCommand::checkCentroids(){
1630     try{
1631         vector<int> unique(numOTUs, 1);
1632         
1633         for(int i=0;i<numOTUs;i++){
1634             if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
1635                 unique[i] = -1;
1636             }
1637         }
1638         
1639         for(int i=0;i<numOTUs;i++){
1640             
1641             if (m->control_pressed) { break; }
1642             
1643             if(unique[i] == 1){
1644                 for(int j=i+1;j<numOTUs;j++){
1645                     if(unique[j] == 1){
1646                         
1647                         if(centroids[j] == centroids[i]){
1648                             unique[j] = 0;
1649                             centroids[j] = -1;
1650                             
1651                             weight[i] += weight[j];
1652                             weight[j] = 0.0;
1653                         }
1654                     }
1655                 }
1656             }
1657         }
1658     }
1659     catch(exception& e) {
1660         m->errorOut(e, "ShhherCommand", "checkCentroids");
1661         exit(1);        
1662     }           
1663 }
1664  /**************************************************************************************************/
1665
1666
1667  
1668 void ShhherCommand::writeQualities(vector<int> otuCounts){
1669     
1670     try {
1671         string thisOutputDir = outputDir;
1672         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1673         map<string, string> variables; 
1674                 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(flowFileName));
1675         string qualityFileName = getOutputFileName("qfile",variables);
1676         
1677         ofstream qualityFile;
1678         m->openOutputFile(qualityFileName, qualityFile);
1679         
1680         qualityFile.setf(ios::fixed, ios::floatfield);
1681         qualityFile.setf(ios::showpoint);
1682         qualityFile << setprecision(6);
1683         
1684         vector<vector<int> > qualities(numOTUs);
1685         vector<double> pr(HOMOPS, 0);
1686         
1687         
1688         for(int i=0;i<numOTUs;i++){
1689             
1690             if (m->control_pressed) { break; }
1691             
1692             int index = 0;
1693             
1694             if(nSeqsPerOTU[i] > 0){
1695                 
1696                 while(index < numFlowCells){
1697                     double maxPrValue = 1e8;
1698                     short maxPrIndex = -1;
1699                     double count = 0.0000;
1700                     
1701                     pr.assign(HOMOPS, 0);
1702                     
1703                     for(int j=0;j<nSeqsPerOTU[i];j++){
1704                         int lIndex = cumNumSeqs[i] + j;
1705                         double tauValue = singleTau[seqNumber[lIndex]];
1706                         int sequenceIndex = aaI[i][j];
1707                         short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
1708                         
1709                         count += tauValue;
1710                         
1711                         for(int s=0;s<HOMOPS;s++){
1712                             pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
1713                         }
1714                     }
1715                     
1716                     maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
1717                     maxPrValue = pr[maxPrIndex];
1718                     
1719                     if(count > MIN_COUNT){
1720                         double U = 0.0000;
1721                         double norm = 0.0000;
1722                         
1723                         for(int s=0;s<HOMOPS;s++){
1724                             norm += exp(-(pr[s] - maxPrValue));
1725                         }
1726                         
1727                         for(int s=1;s<=maxPrIndex;s++){
1728                             int value = 0;
1729                             double temp = 0.0000;
1730                             
1731                             U += exp(-(pr[s-1]-maxPrValue))/norm;
1732                             
1733                             if(U>0.00){
1734                                 temp = log10(U);
1735                             }
1736                             else{
1737                                 temp = -10.1;
1738                             }
1739                             temp = floor(-10 * temp);
1740                             value = (int)floor(temp);
1741                             if(value > 100){    value = 100;    }
1742                             
1743                             qualities[i].push_back((int)value);
1744                         }
1745                     }
1746                     
1747                     index++;
1748                 }
1749             }
1750             
1751             
1752             if(otuCounts[i] > 0){
1753                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
1754                 for (int j = 4; j < qualities[i].size(); j++) { qualityFile << qualities[i][j] << ' '; }
1755                 qualityFile << endl;
1756             }
1757         }
1758         qualityFile.close();
1759         outputNames.push_back(qualityFileName); outputTypes["qfile"].push_back(qualityFileName);
1760         
1761     }
1762     catch(exception& e) {
1763         m->errorOut(e, "ShhherCommand", "writeQualities");
1764         exit(1);        
1765     }           
1766 }
1767
1768 /**************************************************************************************************/
1769
1770 void ShhherCommand::writeSequences(vector<int> otuCounts){
1771     try {
1772         string thisOutputDir = outputDir;
1773         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1774         map<string, string> variables; 
1775                 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
1776         string fastaFileName = getOutputFileName("fasta",variables);
1777         ofstream fastaFile;
1778         m->openOutputFile(fastaFileName, fastaFile);
1779         
1780         vector<string> names(numOTUs, "");
1781         
1782         for(int i=0;i<numOTUs;i++){
1783             
1784             if (m->control_pressed) { break; }
1785             
1786             int index = centroids[i];
1787             
1788             if(otuCounts[i] > 0){
1789                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
1790                 
1791                 string newSeq = "";
1792                 
1793                 for(int j=0;j<numFlowCells;j++){
1794                     
1795                     char base = flowOrder[j % flowOrder.length()];
1796                     for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
1797                         newSeq += base;
1798                     }
1799                 }
1800                 
1801                 fastaFile << newSeq.substr(4) << endl;
1802             }
1803         }
1804         fastaFile.close();
1805         
1806         outputNames.push_back(fastaFileName); outputTypes["fasta"].push_back(fastaFileName);
1807         
1808         if(compositeFASTAFileName != ""){
1809             m->appendFiles(fastaFileName, compositeFASTAFileName);
1810         }
1811     }
1812     catch(exception& e) {
1813         m->errorOut(e, "ShhherCommand", "writeSequences");
1814         exit(1);        
1815     }           
1816 }
1817
1818 /**************************************************************************************************/
1819
1820 void ShhherCommand::writeNames(vector<int> otuCounts){
1821     try {
1822         string thisOutputDir = outputDir;
1823         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1824         map<string, string> variables; 
1825                 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
1826         string nameFileName = getOutputFileName("name",variables);
1827         ofstream nameFile;
1828         m->openOutputFile(nameFileName, nameFile);
1829         
1830         for(int i=0;i<numOTUs;i++){
1831             
1832             if (m->control_pressed) { break; }
1833             
1834             if(otuCounts[i] > 0){
1835                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
1836                 
1837                 for(int j=1;j<nSeqsPerOTU[i];j++){
1838                     nameFile << ',' << seqNameVector[aaI[i][j]];
1839                 }
1840                 
1841                 nameFile << endl;
1842             }
1843         }
1844         nameFile.close();
1845         outputNames.push_back(nameFileName); outputTypes["name"].push_back(nameFileName);
1846         
1847         
1848         if(compositeNamesFileName != ""){
1849             m->appendFiles(nameFileName, compositeNamesFileName);
1850         }               
1851     }
1852     catch(exception& e) {
1853         m->errorOut(e, "ShhherCommand", "writeNames");
1854         exit(1);        
1855     }           
1856 }
1857
1858 /**************************************************************************************************/
1859
1860 void ShhherCommand::writeGroups(){
1861     try {
1862         string thisOutputDir = outputDir;
1863         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1864         string fileRoot = m->getRootName(m->getSimpleName(flowFileName));
1865         int pos = fileRoot.find_first_of('.');
1866         string fileGroup = fileRoot;
1867         if (pos != string::npos) {  fileGroup = fileRoot.substr(pos+1, (fileRoot.length()-1-(pos+1)));  }
1868         map<string, string> variables; 
1869                 variables["[filename]"] = thisOutputDir + fileRoot;
1870         string groupFileName = getOutputFileName("group",variables);
1871         ofstream groupFile;
1872         m->openOutputFile(groupFileName, groupFile);
1873         
1874         for(int i=0;i<numSeqs;i++){
1875             if (m->control_pressed) { break; }
1876             groupFile << seqNameVector[i] << '\t' << fileGroup << endl;
1877         }
1878         groupFile.close();
1879         outputNames.push_back(groupFileName); outputTypes["group"].push_back(groupFileName);
1880         
1881     }
1882     catch(exception& e) {
1883         m->errorOut(e, "ShhherCommand", "writeGroups");
1884         exit(1);        
1885     }           
1886 }
1887
1888 /**************************************************************************************************/
1889
1890 void ShhherCommand::writeClusters(vector<int> otuCounts){
1891     try {
1892         string thisOutputDir = outputDir;
1893         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1894         map<string, string> variables; 
1895                 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
1896         string otuCountsFileName = getOutputFileName("counts",variables);
1897         ofstream otuCountsFile;
1898         m->openOutputFile(otuCountsFileName, otuCountsFile);
1899         
1900         string bases = flowOrder;
1901         
1902         for(int i=0;i<numOTUs;i++){
1903             
1904             if (m->control_pressed) {
1905                 break;
1906             }
1907             //output the translated version of the centroid sequence for the otu
1908             if(otuCounts[i] > 0){
1909                 int index = centroids[i];
1910                 
1911                 otuCountsFile << "ideal\t";
1912                 for(int j=8;j<numFlowCells;j++){
1913                     char base = bases[j % bases.length()];
1914                     for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
1915                         otuCountsFile << base;
1916                     }
1917                 }
1918                 otuCountsFile << endl;
1919                 
1920                 for(int j=0;j<nSeqsPerOTU[i];j++){
1921                     int sequence = aaI[i][j];
1922                     otuCountsFile << seqNameVector[sequence] << '\t';
1923                     
1924                     string newSeq = "";
1925                     
1926                     for(int k=0;k<lengths[sequence];k++){
1927                         char base = bases[k % bases.length()];
1928                         int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
1929                         
1930                         for(int s=0;s<freq;s++){
1931                             newSeq += base;
1932                             //otuCountsFile << base;
1933                         }
1934                     }
1935                     otuCountsFile << newSeq.substr(4) << endl;
1936                 }
1937                 otuCountsFile << endl;
1938             }
1939         }
1940         otuCountsFile.close();
1941         outputNames.push_back(otuCountsFileName); outputTypes["counts"].push_back(otuCountsFileName);
1942         
1943     }
1944     catch(exception& e) {
1945         m->errorOut(e, "ShhherCommand", "writeClusters");
1946         exit(1);        
1947     }           
1948 }
1949
1950 #else
1951 //**********************************************************************************************************************
1952
1953 int ShhherCommand::execute(){
1954         try {
1955                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
1956                 
1957                 getSingleLookUp();      if (m->control_pressed) { return 0; }
1958                 getJointLookUp();       if (m->control_pressed) { return 0; }
1959                 
1960         int numFiles = flowFileVector.size();
1961                 
1962         if (numFiles < processors) { processors = numFiles; }
1963         
1964 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1965         if (processors == 1) { driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName); }
1966         else { createProcesses(flowFileVector); } //each processor processes one file
1967 #else
1968         driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName);
1969 #endif
1970         
1971                 if(compositeFASTAFileName != ""){
1972                         outputNames.push_back(compositeFASTAFileName); outputTypes["fasta"].push_back(compositeFASTAFileName);
1973                         outputNames.push_back(compositeNamesFileName); outputTypes["name"].push_back(compositeNamesFileName);
1974                 }
1975
1976                 m->mothurOutEndLine();
1977                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
1978                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
1979                 m->mothurOutEndLine();
1980                 
1981                 return 0;
1982         }
1983         catch(exception& e) {
1984                 m->errorOut(e, "ShhherCommand", "execute");
1985                 exit(1);
1986         }
1987 }
1988 #endif
1989 //********************************************************************************************************************
1990 //sorts biggest to smallest
1991 inline bool compareFileSizes(string left, string right){
1992     
1993     FILE * pFile;
1994     long leftsize = 0;
1995     
1996     //get num bytes in file
1997     string filename = left;
1998     pFile = fopen (filename.c_str(),"rb");
1999     string error = "Error opening " + filename;
2000     if (pFile==NULL) perror (error.c_str());
2001     else{
2002         fseek (pFile, 0, SEEK_END);
2003         leftsize=ftell (pFile);
2004         fclose (pFile);
2005     }
2006     
2007     FILE * pFile2;
2008     long rightsize = 0;
2009     
2010     //get num bytes in file
2011     filename = right;
2012     pFile2 = fopen (filename.c_str(),"rb");
2013     error = "Error opening " + filename;
2014     if (pFile2==NULL) perror (error.c_str());
2015     else{
2016         fseek (pFile2, 0, SEEK_END);
2017         rightsize=ftell (pFile2);
2018         fclose (pFile2);
2019     }
2020     
2021     return (leftsize > rightsize);      
2022
2023 /**************************************************************************************************/
2024
2025 int ShhherCommand::createProcesses(vector<string> filenames){
2026     try {
2027         vector<int> processIDS;
2028                 int process = 1;
2029                 int num = 0;
2030                 
2031                 //sanity check
2032                 if (filenames.size() < processors) { processors = filenames.size(); }
2033         
2034         //sort file names by size to divide load better
2035         sort(filenames.begin(), filenames.end(), compareFileSizes);
2036         
2037         vector < vector <string> > dividedFiles; //dividedFiles[1] = vector of filenames for process 1...
2038         dividedFiles.resize(processors);
2039         
2040         //for each file, figure out which process will complete it
2041         //want to divide the load intelligently so the big files are spread between processes
2042         for (int i = 0; i < filenames.size(); i++) { 
2043             int processToAssign = (i+1) % processors; 
2044             if (processToAssign == 0) { processToAssign = processors; }
2045             
2046             dividedFiles[(processToAssign-1)].push_back(filenames[i]);
2047         }
2048         
2049         //now lets reverse the order of ever other process, so we balance big files running with little ones
2050         for (int i = 0; i < processors; i++) {
2051             int remainder = ((i+1) % processors);
2052             if (remainder) {  reverse(dividedFiles[i].begin(), dividedFiles[i].end());  }
2053         }
2054         
2055                         
2056                 //divide the groups between the processors
2057                 /*vector<linePair> lines;
2058         vector<int> numFilesToComplete;
2059                 int numFilesPerProcessor = filenames.size() / processors;
2060                 for (int i = 0; i < processors; i++) {
2061                         int startIndex =  i * numFilesPerProcessor;
2062                         int endIndex = (i+1) * numFilesPerProcessor;
2063                         if(i == (processors - 1)){      endIndex = filenames.size();    }
2064                         lines.push_back(linePair(startIndex, endIndex));
2065             numFilesToComplete.push_back((endIndex-startIndex));
2066                 }*/
2067                 
2068         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)          
2069                 
2070                 //loop through and create all the processes you want
2071                 while (process != processors) {
2072                         int pid = fork();
2073                         
2074                         if (pid > 0) {
2075                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
2076                                 process++;
2077                         }else if (pid == 0){
2078                                 num = driver(dividedFiles[process], compositeFASTAFileName + toString(getpid()) + ".temp", compositeNamesFileName  + toString(getpid()) + ".temp");
2079                 
2080                 //pass numSeqs to parent
2081                                 ofstream out;
2082                                 string tempFile = compositeFASTAFileName + toString(getpid()) + ".num.temp";
2083                                 m->openOutputFile(tempFile, out);
2084                                 out << num << endl;
2085                                 out.close();
2086                 
2087                                 exit(0);
2088                         }else { 
2089                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
2090                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
2091                                 exit(0);
2092                         }
2093                 }
2094                 
2095                 //do my part
2096                 driver(dividedFiles[0], compositeFASTAFileName, compositeNamesFileName);
2097                 
2098                 //force parent to wait until all the processes are done
2099                 for (int i=0;i<processIDS.size();i++) { 
2100                         int temp = processIDS[i];
2101                         wait(&temp);
2102                 }
2103         
2104         #else
2105         
2106         //////////////////////////////////////////////////////////////////////////////////////////////////////
2107         
2108         /////////////////////// NOT WORKING, ACCESS VIOLATION ON READ OF FLOWGRAMS IN THREAD /////////////////
2109         
2110         //////////////////////////////////////////////////////////////////////////////////////////////////////
2111                 //Windows version shared memory, so be careful when passing variables through the shhhFlowsData struct. 
2112                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
2113                 //////////////////////////////////////////////////////////////////////////////////////////////////////
2114                 /*
2115                 vector<shhhFlowsData*> pDataArray; 
2116                 DWORD   dwThreadIdArray[processors-1];
2117                 HANDLE  hThreadArray[processors-1]; 
2118                 
2119                 //Create processor worker threads.
2120                 for( int i=0; i<processors-1; i++ ){
2121                         // Allocate memory for thread data.
2122                         string extension = "";
2123                         if (i != 0) { extension = toString(i) + ".temp"; }
2124                         
2125             shhhFlowsData* tempFlow = new shhhFlowsData(filenames, (compositeFASTAFileName + extension), (compositeNamesFileName + extension), outputDir, flowOrder, jointLookUp, singleLookUp, m, lines[i].start, lines[i].end, cutoff, sigma, minDelta, maxIters, i);
2126                         pDataArray.push_back(tempFlow);
2127                         processIDS.push_back(i);
2128             
2129                         hThreadArray[i] = CreateThread(NULL, 0, ShhhFlowsThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);   
2130                 }
2131                 
2132                 //using the main process as a worker saves time and memory
2133                 //do my part
2134                 driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[processors-1].start, lines[processors-1].end);
2135                 
2136                 //Wait until all threads have terminated.
2137                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
2138                 
2139                 //Close all thread handles and free memory allocations.
2140                 for(int i=0; i < pDataArray.size(); i++){
2141                         for(int j=0; j < pDataArray[i]->outputNames.size(); j++){ outputNames.push_back(pDataArray[i]->outputNames[j]); }
2142                         CloseHandle(hThreadArray[i]);
2143                         delete pDataArray[i];
2144                 }
2145                 */
2146         #endif
2147         
2148         for (int i=0;i<processIDS.size();i++) { 
2149             ifstream in;
2150                         string tempFile =  compositeFASTAFileName + toString(processIDS[i]) + ".num.temp";
2151                         m->openInputFile(tempFile, in);
2152                         if (!in.eof()) { 
2153                 int tempNum = 0; 
2154                 in >> tempNum; 
2155                 if (tempNum != dividedFiles[i+1].size()) {
2156                     m->mothurOut("[ERROR]: main process expected " + toString(processIDS[i]) + " to complete " + toString(dividedFiles[i+1].size()) + " files, and it only reported completing " + toString(tempNum) + ". This will cause file mismatches.  The flow files may be too large to process with multiple processors. \n");
2157                 }
2158             }
2159                         in.close(); m->mothurRemove(tempFile);
2160             
2161             if (compositeFASTAFileName != "") {
2162                 m->appendFiles((compositeFASTAFileName + toString(processIDS[i]) + ".temp"), compositeFASTAFileName);
2163                 m->appendFiles((compositeNamesFileName + toString(processIDS[i]) + ".temp"), compositeNamesFileName);
2164                 m->mothurRemove((compositeFASTAFileName + toString(processIDS[i]) + ".temp"));
2165                 m->mothurRemove((compositeNamesFileName + toString(processIDS[i]) + ".temp"));
2166             }
2167         }
2168         
2169         return 0;
2170         
2171     }
2172         catch(exception& e) {
2173                 m->errorOut(e, "ShhherCommand", "createProcesses");
2174                 exit(1);
2175         }
2176 }
2177 /**************************************************************************************************/
2178
2179 vector<string> ShhherCommand::parseFlowFiles(string filename){
2180     try {
2181         vector<string> files;
2182         int count = 0;
2183         
2184         ifstream in;
2185         m->openInputFile(filename, in);
2186         
2187         int thisNumFLows = 0;
2188         in >> thisNumFLows; m->gobble(in);
2189         
2190         while (!in.eof()) {
2191             if (m->control_pressed) { break; }
2192             
2193             ofstream out;
2194             string outputFileName = filename + toString(count) + ".temp";
2195             m->openOutputFile(outputFileName, out);
2196             out << thisNumFLows << endl;
2197             files.push_back(outputFileName);
2198             
2199             int numLinesWrote = 0;
2200             for (int i = 0; i < largeSize; i++) {
2201                 if (in.eof()) { break; }
2202                 string line = m->getline(in); m->gobble(in);
2203                 out << line << endl;
2204                 numLinesWrote++;
2205             }
2206             out.close();
2207             
2208             if (numLinesWrote == 0) {  m->mothurRemove(outputFileName); files.pop_back();  }
2209             count++;
2210         }
2211         in.close();
2212         
2213         if (m->control_pressed) { for (int i = 0; i < files.size(); i++) { m->mothurRemove(files[i]); }  files.clear(); }
2214         
2215         m->mothurOut("\nDivided " + filename + " into " + toString(files.size()) + " files.\n\n"); 
2216         
2217         return files;
2218     }
2219         catch(exception& e) {
2220                 m->errorOut(e, "ShhherCommand", "parseFlowFiles");
2221                 exit(1);
2222         }
2223 }
2224 /**************************************************************************************************/
2225
2226 int ShhherCommand::driver(vector<string> filenames, string thisCompositeFASTAFileName, string thisCompositeNamesFileName){
2227     try {
2228         
2229         int numCompleted = 0;
2230         
2231         for(int i=0;i<filenames.size();i++){
2232                         
2233                         if (m->control_pressed) { break; }
2234                         
2235             vector<string> theseFlowFileNames; theseFlowFileNames.push_back(filenames[i]);
2236             if (large) {  theseFlowFileNames = parseFlowFiles(filenames[i]);  }
2237             
2238             if (m->control_pressed) { break; }
2239             
2240             double begClock = clock();
2241             unsigned long long begTime;
2242             
2243             string fileNameForOutput = filenames[i];
2244             
2245             for (int g = 0; g < theseFlowFileNames.size(); g++) {
2246                 
2247                 string flowFileName = theseFlowFileNames[g];
2248                 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(filenames.size()) + ")\t<<<<<\n");
2249                 m->mothurOut("Reading flowgrams...\n");
2250                 
2251                 vector<string> seqNameVector;
2252                 vector<int> lengths;
2253                 vector<short> flowDataIntI;
2254                 vector<double> flowDataPrI;
2255                 map<string, int> nameMap;
2256                 vector<short> uniqueFlowgrams;
2257                 vector<int> uniqueCount;
2258                 vector<int> mapSeqToUnique;
2259                 vector<int> mapUniqueToSeq;
2260                 vector<int> uniqueLengths;
2261                 int numFlowCells;
2262                 
2263                 if (m->debug) { m->mothurOut("[DEBUG]: About to read flowgrams.\n"); }
2264                 int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells);
2265                 
2266                 if (m->control_pressed) { break; }
2267                 
2268                 m->mothurOut("Identifying unique flowgrams...\n");
2269                 int numUniques = getUniques(numSeqs, numFlowCells, uniqueFlowgrams, uniqueCount, uniqueLengths, mapSeqToUnique, mapUniqueToSeq, lengths, flowDataPrI, flowDataIntI);
2270                 
2271                 if (m->control_pressed) { break; }
2272                 
2273                 m->mothurOut("Calculating distances between flowgrams...\n");
2274                 string distFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
2275                 begTime = time(NULL);
2276                
2277                 
2278                 flowDistParentFork(numFlowCells, distFileName, numUniques, mapUniqueToSeq, mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
2279                 
2280                 m->mothurOutEndLine();
2281                 m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
2282                 
2283                 
2284                 string namesFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
2285                 createNamesFile(numSeqs, numUniques, namesFileName, seqNameVector, mapSeqToUnique, mapUniqueToSeq);
2286                 
2287                 if (m->control_pressed) { break; }
2288                 
2289                 m->mothurOut("\nClustering flowgrams...\n");
2290                 string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
2291                 cluster(listFileName, distFileName, namesFileName);
2292                 
2293                 if (m->control_pressed) { break; }
2294                 
2295                 vector<int> otuData;
2296                 vector<int> cumNumSeqs;
2297                 vector<int> nSeqsPerOTU;
2298                 vector<vector<int> > aaP;       //tMaster->aanP:        each row is a different otu / each col contains the sequence indices
2299                 vector<vector<int> > aaI;       //tMaster->aanI:        that are in each otu - can't differentiate between aaP and aaI 
2300                 vector<int> seqNumber;          //tMaster->anP:         the sequence id number sorted by OTU
2301                 vector<int> seqIndex;           //tMaster->anI;         the index that corresponds to seqNumber
2302                 
2303                 
2304                 int numOTUs = getOTUData(numSeqs, listFileName, otuData, cumNumSeqs, nSeqsPerOTU, aaP, aaI, seqNumber, seqIndex, nameMap);
2305                 
2306                 if (m->control_pressed) { break; }
2307                 
2308                 m->mothurRemove(distFileName);
2309                 m->mothurRemove(namesFileName);
2310                 m->mothurRemove(listFileName);
2311                 
2312                 vector<double> dist;            //adDist - distance of sequences to centroids
2313                 vector<short> change;           //did the centroid sequence change? 0 = no; 1 = yes
2314                 vector<int> centroids;          //the representative flowgram for each cluster m
2315                 vector<double> weight;
2316                 vector<double> singleTau;       //tMaster->adTau:       1-D Tau vector (1xnumSeqs)
2317                 vector<int> nSeqsBreaks;
2318                 vector<int> nOTUsBreaks;
2319                 
2320                 if (m->debug) { m->mothurOut("[DEBUG]: numSeqs = " + toString(numSeqs) + " numOTUS = " + toString(numOTUs) + " about to alloc a dist vector with size = " + toString((numSeqs * numOTUs)) + ".\n"); }
2321                 
2322                 dist.assign(numSeqs * numOTUs, 0);
2323                 change.assign(numOTUs, 1);
2324                 centroids.assign(numOTUs, -1);
2325                 weight.assign(numOTUs, 0);
2326                 singleTau.assign(numSeqs, 1.0);
2327                 
2328                 nSeqsBreaks.assign(2, 0);
2329                 nOTUsBreaks.assign(2, 0);
2330                 
2331                 nSeqsBreaks[0] = 0;
2332                 nSeqsBreaks[1] = numSeqs;
2333                 nOTUsBreaks[1] = numOTUs;
2334                 
2335                 if (m->debug) { m->mothurOut("[DEBUG]: done allocating memory, about to denoise.\n"); }
2336                 
2337                 if (m->control_pressed) { break; }
2338                 
2339                 double maxDelta = 0;
2340                 int iter = 0;
2341                 
2342                 begClock = clock();
2343                 begTime = time(NULL);
2344                 
2345                 m->mothurOut("\nDenoising flowgrams...\n");
2346                 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
2347                 
2348                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
2349                     
2350                     if (m->control_pressed) { break; }
2351                     
2352                     double cycClock = clock();
2353                     unsigned long long cycTime = time(NULL);
2354                     fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2355                     
2356                     if (m->control_pressed) { break; }
2357                     
2358                     calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
2359                     
2360                     if (m->control_pressed) { break; }
2361                     
2362                     maxDelta = getNewWeights(numOTUs, cumNumSeqs, nSeqsPerOTU, singleTau, seqNumber, weight);  
2363                     
2364                     if (m->control_pressed) { break; }
2365                     
2366                     double nLL = getLikelihood(numSeqs, numOTUs, nSeqsPerOTU, seqNumber, cumNumSeqs, seqIndex, dist, weight); 
2367                     
2368                     if (m->control_pressed) { break; }
2369                     
2370                     checkCentroids(numOTUs, centroids, weight);
2371                     
2372                     if (m->control_pressed) { break; }
2373                     
2374                     calcNewDistances(numSeqs, numOTUs, nSeqsPerOTU,  dist, weight, change, centroids, aaP, singleTau, aaI, seqNumber, seqIndex, uniqueFlowgrams, flowDataIntI, numFlowCells, lengths);
2375                     
2376                     if (m->control_pressed) { break; }
2377                     
2378                     iter++;
2379                     
2380                     m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
2381                     
2382                 }       
2383                 
2384                 if (m->control_pressed) { break; }
2385                 
2386                 m->mothurOut("\nFinalizing...\n");
2387                 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2388                 
2389                 if (m->debug) { m->mothurOut("[DEBUG]: done fill().\n"); }
2390
2391                 if (m->control_pressed) { break; }
2392                 
2393                 setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI);
2394                 
2395                 if (m->debug) { m->mothurOut("[DEBUG]: done setOTUs().\n"); }
2396                 
2397                 if (m->control_pressed) { break; }
2398                 
2399                 vector<int> otuCounts(numOTUs, 0);
2400                 for(int j=0;j<numSeqs;j++)      {       otuCounts[otuData[j]]++;        }
2401                 
2402                 calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
2403                 
2404                 if (m->debug) { m->mothurOut("[DEBUG]: done calcCentroidsDriver().\n"); }
2405                 
2406                 if (m->control_pressed) { break; }
2407                 
2408                 if ((large) && (g == 0)) {  flowFileName = filenames[i]; theseFlowFileNames[0] = filenames[i]; }
2409                 string thisOutputDir = outputDir;
2410                 if (outputDir == "") {  thisOutputDir = m->hasPath(flowFileName);  }
2411                 map<string, string> variables; 
2412                 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
2413                 string qualityFileName = getOutputFileName("qfile",variables);
2414                 string fastaFileName = getOutputFileName("fasta",variables);
2415                 string nameFileName = getOutputFileName("name",variables);
2416                 string otuCountsFileName = getOutputFileName("counts",variables);
2417                 string fileRoot = m->getRootName(m->getSimpleName(flowFileName));
2418                 int pos = fileRoot.find_first_of('.');
2419                 string fileGroup = fileRoot;
2420                 if (pos != string::npos) {  fileGroup = fileRoot.substr(pos+1, (fileRoot.length()-1-(pos+1)));  }
2421                 string groupFileName = getOutputFileName("group",variables);
2422
2423                 
2424                 writeQualities(numOTUs, numFlowCells, qualityFileName, otuCounts, nSeqsPerOTU, seqNumber, singleTau, flowDataIntI, uniqueFlowgrams, cumNumSeqs, mapUniqueToSeq, seqNameVector, centroids, aaI); if (m->control_pressed) { break; }
2425                 writeSequences(thisCompositeFASTAFileName, numOTUs, numFlowCells, fastaFileName, otuCounts, uniqueFlowgrams, seqNameVector, aaI, centroids);if (m->control_pressed) { break; }
2426                 writeNames(thisCompositeNamesFileName, numOTUs, nameFileName, otuCounts, seqNameVector, aaI, nSeqsPerOTU);                              if (m->control_pressed) { break; }
2427                 writeClusters(otuCountsFileName, numOTUs, numFlowCells,otuCounts, centroids, uniqueFlowgrams, seqNameVector, aaI, nSeqsPerOTU, lengths, flowDataIntI);                  if (m->control_pressed) { break; }
2428                 writeGroups(groupFileName, fileGroup, numSeqs, seqNameVector);                                          if (m->control_pressed) { break; }
2429                 
2430                 if (large) {
2431                     if (g > 0) {
2432                         variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0]));
2433                         m->appendFiles(qualityFileName, getOutputFileName("qfile",variables));
2434                         m->mothurRemove(qualityFileName);
2435                         m->appendFiles(fastaFileName, getOutputFileName("fasta",variables));
2436                         m->mothurRemove(fastaFileName);
2437                         m->appendFiles(nameFileName, getOutputFileName("name",variables));
2438                         m->mothurRemove(nameFileName);
2439                         m->appendFiles(otuCountsFileName, getOutputFileName("counts",variables));
2440                         m->mothurRemove(otuCountsFileName);
2441                         m->appendFiles(groupFileName, getOutputFileName("group",variables));
2442                         m->mothurRemove(groupFileName);
2443                     }
2444                     m->mothurRemove(theseFlowFileNames[g]);
2445                 }
2446                         }
2447             
2448             numCompleted++;
2449                         m->mothurOut("Total time to process " + fileNameForOutput + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
2450                 }
2451                 
2452         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
2453         
2454         return numCompleted;
2455         
2456     }catch(exception& e) {
2457             m->errorOut(e, "ShhherCommand", "driver");
2458             exit(1);
2459     }
2460 }
2461
2462 /**************************************************************************************************/
2463 int ShhherCommand::getFlowData(string filename, vector<string>& thisSeqNameVector, vector<int>& thisLengths, vector<short>& thisFlowDataIntI, map<string, int>& thisNameMap, int& numFlowCells){
2464         try{
2465        
2466                 ifstream flowFile;
2467        
2468                 m->openInputFile(filename, flowFile);
2469                 
2470                 string seqName;
2471                 int currentNumFlowCells;
2472                 float intensity;
2473         thisSeqNameVector.clear();
2474                 thisLengths.clear();
2475                 thisFlowDataIntI.clear();
2476                 thisNameMap.clear();
2477                 
2478                 string numFlowTest;
2479         flowFile >> numFlowTest;
2480         
2481         if (!m->isContainingOnlyDigits(numFlowTest)) { m->mothurOut("[ERROR]: expected a number and got " + numFlowTest + ", quitting. Did you use the flow parameter instead of the file parameter?"); m->mothurOutEndLine(); exit(1); }
2482         else { convert(numFlowTest, numFlowCells); }
2483         
2484         if (m->debug) { m->mothurOut("[DEBUG]: numFlowCells = " + toString(numFlowCells) + ".\n"); }
2485                 int index = 0;//pcluster
2486                 while(!flowFile.eof()){
2487                         
2488                         if (m->control_pressed) { break; }
2489                         
2490                         flowFile >> seqName >> currentNumFlowCells;
2491             
2492                         thisLengths.push_back(currentNumFlowCells);
2493            
2494                         thisSeqNameVector.push_back(seqName);
2495                         thisNameMap[seqName] = index++;//pcluster
2496             
2497             if (m->debug) { m->mothurOut("[DEBUG]: seqName = " + seqName + " length = " + toString(currentNumFlowCells) + " index = " + toString(index) + "\n"); }
2498             
2499                         for(int i=0;i<numFlowCells;i++){
2500                                 flowFile >> intensity;
2501                                 if(intensity > 9.99)    {       intensity = 9.99;       }
2502                                 int intI = int(100 * intensity + 0.0001);
2503                                 thisFlowDataIntI.push_back(intI);
2504                         }
2505                         m->gobble(flowFile);
2506                 }
2507                 flowFile.close();
2508                 
2509                 int numSeqs = thisSeqNameVector.size();         
2510                 
2511                 for(int i=0;i<numSeqs;i++){
2512                         
2513                         if (m->control_pressed) { break; }
2514                         
2515                         int iNumFlowCells = i * numFlowCells;
2516                         for(int j=thisLengths[i];j<numFlowCells;j++){
2517                                 thisFlowDataIntI[iNumFlowCells + j] = 0;
2518                         }
2519                 }
2520         
2521         return numSeqs;
2522                 
2523         }
2524         catch(exception& e) {
2525                 m->errorOut(e, "ShhherCommand", "getFlowData");
2526                 exit(1);
2527         }
2528 }
2529 /**************************************************************************************************/
2530
2531 int ShhherCommand::flowDistParentFork(int numFlowCells, string distFileName, int stopSeq, vector<int>& mapUniqueToSeq, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2532         try{            
2533         
2534                 ostringstream outStream;
2535                 outStream.setf(ios::fixed, ios::floatfield);
2536                 outStream.setf(ios::dec, ios::basefield);
2537                 outStream.setf(ios::showpoint);
2538                 outStream.precision(6);
2539                 
2540                 int begTime = time(NULL);
2541                 double begClock = clock();
2542         
2543                 for(int i=0;i<stopSeq;i++){
2544                         
2545                         if (m->control_pressed) { break; }
2546                         
2547                         for(int j=0;j<i;j++){
2548                                 float flowDistance = calcPairwiseDist(numFlowCells, mapUniqueToSeq[i], mapUniqueToSeq[j], mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
2549                 
2550                                 if(flowDistance < 1e-6){
2551                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
2552                                 }
2553                                 else if(flowDistance <= cutoff){
2554                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
2555                                 }
2556                         }
2557                         if(i % 100 == 0){
2558                                 m->mothurOutJustToScreen(toString(i) + "\t" + toString(time(NULL) - begTime));
2559                                 m->mothurOutJustToScreen("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC)+"\n");
2560                         }
2561                 }
2562                 
2563                 ofstream distFile(distFileName.c_str());
2564                 distFile << outStream.str();            
2565                 distFile.close();
2566                 
2567                 if (m->control_pressed) {}
2568                 else {
2569                         m->mothurOutJustToScreen(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
2570                         m->mothurOutJustToScreen("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC)+"\n");
2571                 }
2572         
2573         return 0;
2574         }
2575         catch(exception& e) {
2576                 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
2577                 exit(1);
2578         }
2579 }
2580 /**************************************************************************************************/
2581
2582 float ShhherCommand::calcPairwiseDist(int numFlowCells, int seqA, int seqB, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2583         try{
2584                 int minLength = lengths[mapSeqToUnique[seqA]];
2585                 if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
2586                 
2587                 int ANumFlowCells = seqA * numFlowCells;
2588                 int BNumFlowCells = seqB * numFlowCells;
2589                 
2590                 float dist = 0;
2591                 
2592                 for(int i=0;i<minLength;i++){
2593                         
2594                         if (m->control_pressed) { break; }
2595                         
2596                         int flowAIntI = flowDataIntI[ANumFlowCells + i];
2597                         float flowAPrI = flowDataPrI[ANumFlowCells + i];
2598                         
2599                         int flowBIntI = flowDataIntI[BNumFlowCells + i];
2600                         float flowBPrI = flowDataPrI[BNumFlowCells + i];
2601                         dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
2602                 }
2603                 
2604                 dist /= (float) minLength;
2605                 return dist;
2606         }
2607         catch(exception& e) {
2608                 m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
2609                 exit(1);
2610         }
2611 }
2612
2613 /**************************************************************************************************/
2614
2615 int ShhherCommand::getUniques(int numSeqs, int numFlowCells, vector<short>& uniqueFlowgrams, vector<int>& uniqueCount, vector<int>& uniqueLengths, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2616         try{
2617                 int numUniques = 0;
2618                 uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
2619                 uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
2620                 uniqueLengths.assign(numSeqs, 0);
2621                 mapSeqToUnique.assign(numSeqs, -1);
2622                 mapUniqueToSeq.assign(numSeqs, -1);
2623                 
2624                 vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
2625                 
2626                 for(int i=0;i<numSeqs;i++){
2627                         
2628                         if (m->control_pressed) { break; }
2629                         
2630                         int index = 0;
2631                         
2632                         vector<short> current(numFlowCells);
2633                         for(int j=0;j<numFlowCells;j++){
2634                                 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
2635                         }
2636             
2637                         for(int j=0;j<numUniques;j++){
2638                                 int offset = j * numFlowCells;
2639                                 bool toEnd = 1;
2640                                 
2641                                 int shorterLength;
2642                                 if(lengths[i] < uniqueLengths[j])       {       shorterLength = lengths[i];                     }
2643                                 else                                                            {       shorterLength = uniqueLengths[j];       }
2644                 
2645                                 for(int k=0;k<shorterLength;k++){
2646                                         if(current[k] != uniqueFlowgrams[offset + k]){
2647                                                 toEnd = 0;
2648                                                 break;
2649                                         }
2650                                 }
2651                                 
2652                                 if(toEnd){
2653                                         mapSeqToUnique[i] = j;
2654                                         uniqueCount[j]++;
2655                                         index = j;
2656                                         if(lengths[i] > uniqueLengths[j])       {       uniqueLengths[j] = lengths[i];  }
2657                                         break;
2658                                 }
2659                                 index++;
2660                         }
2661                         
2662                         if(index == numUniques){
2663                                 uniqueLengths[numUniques] = lengths[i];
2664                                 uniqueCount[numUniques] = 1;
2665                                 mapSeqToUnique[i] = numUniques;//anMap
2666                                 mapUniqueToSeq[numUniques] = i;//anF
2667                                 
2668                                 for(int k=0;k<numFlowCells;k++){
2669                                         uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
2670                                         uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
2671                                 }
2672                                 
2673                                 numUniques++;
2674                         }
2675                 }
2676                 uniqueFlowDataIntI.resize(numFlowCells * numUniques);
2677                 uniqueLengths.resize(numUniques);       
2678                 
2679                 flowDataPrI.resize(numSeqs * numFlowCells, 0);
2680                 for(int i=0;i<flowDataPrI.size();i++)   {       if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);          }
2681         
2682         return numUniques;
2683         }
2684         catch(exception& e) {
2685                 m->errorOut(e, "ShhherCommand", "getUniques");
2686                 exit(1);
2687         }
2688 }
2689 /**************************************************************************************************/
2690 int ShhherCommand::createNamesFile(int numSeqs, int numUniques, string filename, vector<string>& seqNameVector, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq){
2691         try{
2692                 
2693                 vector<string> duplicateNames(numUniques, "");
2694                 for(int i=0;i<numSeqs;i++){
2695                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
2696                 }
2697                 
2698                 ofstream nameFile;
2699                 m->openOutputFile(filename, nameFile);
2700                 
2701                 for(int i=0;i<numUniques;i++){
2702                         
2703                         if (m->control_pressed) { break; }
2704                         
2705             //                  nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2706                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2707                 }
2708                 
2709                 nameFile.close();
2710         
2711                 return 0;
2712         }
2713         catch(exception& e) {
2714                 m->errorOut(e, "ShhherCommand", "createNamesFile");
2715                 exit(1);
2716         }
2717 }
2718 //**********************************************************************************************************************
2719
2720 int ShhherCommand::cluster(string filename, string distFileName, string namesFileName){
2721         try {
2722                 
2723                 ReadMatrix* read = new ReadColumnMatrix(distFileName);  
2724                 read->setCutoff(cutoff);
2725                 
2726                 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
2727                 clusterNameMap->readMap();
2728                 read->read(clusterNameMap);
2729         
2730                 ListVector* list = read->getListVector();
2731                 SparseDistanceMatrix* matrix = read->getDMatrix();
2732                 
2733                 delete read; 
2734                 delete clusterNameMap; 
2735         
2736                 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
2737                 
2738         float adjust = -1.0;
2739                 Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest", adjust);
2740                 string tag = cluster->getTag();
2741                 
2742                 double clusterCutoff = cutoff;
2743                 while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
2744                         
2745                         if (m->control_pressed) { break; }
2746                         
2747                         cluster->update(clusterCutoff);
2748                 }
2749                 
2750                 list->setLabel(toString(cutoff));
2751                 
2752                 ofstream listFile;
2753                 m->openOutputFile(filename, listFile);
2754                 list->print(listFile);
2755                 listFile.close();
2756                 
2757                 delete matrix;  delete cluster; delete rabund; delete list;
2758         
2759                 return 0;
2760         }
2761         catch(exception& e) {
2762                 m->errorOut(e, "ShhherCommand", "cluster");
2763                 exit(1);        
2764         }               
2765 }
2766 /**************************************************************************************************/
2767
2768 int ShhherCommand::getOTUData(int numSeqs, string fileName,  vector<int>& otuData,
2769                                vector<int>& cumNumSeqs,
2770                                vector<int>& nSeqsPerOTU,
2771                                vector<vector<int> >& aaP,       //tMaster->aanP:        each row is a different otu / each col contains the sequence indices
2772                                vector<vector<int> >& aaI,       //tMaster->aanI:        that are in each otu - can't differentiate between aaP and aaI 
2773                                vector<int>& seqNumber,          //tMaster->anP:         the sequence id number sorted by OTU
2774                                vector<int>& seqIndex,
2775                                map<string, int>& nameMap){
2776         try {
2777         
2778                 ifstream listFile;
2779                 m->openInputFile(fileName, listFile);
2780                 string label;
2781         int numOTUs;
2782                 
2783                 listFile >> label >> numOTUs;
2784         
2785         if (m->debug) { m->mothurOut("[DEBUG]: Getting OTU Data...\n"); }
2786         
2787                 otuData.assign(numSeqs, 0);
2788                 cumNumSeqs.assign(numOTUs, 0);
2789                 nSeqsPerOTU.assign(numOTUs, 0);
2790                 aaP.clear();aaP.resize(numOTUs);
2791                 
2792                 seqNumber.clear();
2793                 aaI.clear();
2794                 seqIndex.clear();
2795                 
2796                 string singleOTU = "";
2797                 
2798                 for(int i=0;i<numOTUs;i++){
2799                         
2800                         if (m->control_pressed) { break; }
2801             if (m->debug) { m->mothurOut("[DEBUG]: processing OTU " + toString(i) + ".\n"); }
2802             
2803                         listFile >> singleOTU;
2804                         
2805                         istringstream otuString(singleOTU);
2806             
2807                         while(otuString){
2808                                 
2809                                 string seqName = "";
2810                                 
2811                                 for(int j=0;j<singleOTU.length();j++){
2812                                         char letter = otuString.get();
2813                                         
2814                                         if(letter != ','){
2815                                                 seqName += letter;
2816                                         }
2817                                         else{
2818                                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
2819                                                 int index = nmIt->second;
2820                                                 
2821                                                 nameMap.erase(nmIt);
2822                                                 
2823                                                 otuData[index] = i;
2824                                                 nSeqsPerOTU[i]++;
2825                                                 aaP[i].push_back(index);
2826                                                 seqName = "";
2827                                         }
2828                                 }
2829                                 
2830                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
2831                 
2832                                 int index = nmIt->second;
2833                                 nameMap.erase(nmIt);
2834                 
2835                                 otuData[index] = i;
2836                                 nSeqsPerOTU[i]++;
2837                                 aaP[i].push_back(index);        
2838                                 
2839                                 otuString.get();
2840                         }
2841                         
2842                         sort(aaP[i].begin(), aaP[i].end());
2843                         for(int j=0;j<nSeqsPerOTU[i];j++){
2844                                 seqNumber.push_back(aaP[i][j]);
2845                         }
2846                         for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
2847                                 aaP[i].push_back(0);
2848                         }
2849                         
2850                         
2851                 }
2852                 
2853                 for(int i=1;i<numOTUs;i++){
2854                         cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
2855                 }
2856                 aaI = aaP;
2857                 seqIndex = seqNumber;
2858                 
2859                 listFile.close();       
2860         
2861         return numOTUs;
2862                 
2863         }
2864         catch(exception& e) {
2865                 m->errorOut(e, "ShhherCommand", "getOTUData");
2866                 exit(1);        
2867         }               
2868 }
2869 /**************************************************************************************************/
2870
2871 int ShhherCommand::calcCentroidsDriver(int numOTUs, 
2872                                           vector<int>& cumNumSeqs,
2873                                           vector<int>& nSeqsPerOTU,
2874                                           vector<int>& seqIndex,
2875                                           vector<short>& change,                //did the centroid sequence change? 0 = no; 1 = yes
2876                                           vector<int>& centroids,               //the representative flowgram for each cluster m
2877                                           vector<double>& singleTau,    //tMaster->adTau:       1-D Tau vector (1xnumSeqs)
2878                                           vector<int>& mapSeqToUnique,
2879                                           vector<short>& uniqueFlowgrams,
2880                                           vector<short>& flowDataIntI,
2881                                           vector<int>& lengths,
2882                                           int numFlowCells,
2883                                           vector<int>& seqNumber){                          
2884         
2885         //this function gets the most likely homopolymer length at a flow position for a group of sequences
2886         //within an otu
2887         
2888         try{
2889                 
2890                 for(int i=0;i<numOTUs;i++){
2891                         
2892                         if (m->control_pressed) { break; }
2893                         
2894                         double count = 0;
2895                         int position = 0;
2896                         int minFlowGram = 100000000;
2897                         double minFlowValue = 1e8;
2898                         change[i] = 0; //FALSE
2899                         
2900                         for(int j=0;j<nSeqsPerOTU[i];j++){
2901                                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
2902                         }
2903             
2904                         if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
2905                                 vector<double> adF(nSeqsPerOTU[i]);
2906                                 vector<int> anL(nSeqsPerOTU[i]);
2907                                 
2908                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2909                                         int index = cumNumSeqs[i] + j;
2910                                         int nI = seqIndex[index];
2911                                         int nIU = mapSeqToUnique[nI];
2912                                         
2913                                         int k;
2914                                         for(k=0;k<position;k++){
2915                                                 if(nIU == anL[k]){
2916                                                         break;
2917                                                 }
2918                                         }
2919                                         if(k == position){
2920                                                 anL[position] = nIU;
2921                                                 adF[position] = 0.0000;
2922                                                 position++;
2923                                         }                                               
2924                                 }
2925                                 
2926                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2927                                         int index = cumNumSeqs[i] + j;
2928                                         int nI = seqIndex[index];
2929                                         
2930                                         double tauValue = singleTau[seqNumber[index]];
2931                                         
2932                                         for(int k=0;k<position;k++){
2933                                                 double dist = getDistToCentroid(anL[k], nI, lengths[nI], uniqueFlowgrams, flowDataIntI, numFlowCells);
2934                                                 adF[k] += dist * tauValue;
2935                                         }
2936                                 }
2937                                 
2938                                 for(int j=0;j<position;j++){
2939                                         if(adF[j] < minFlowValue){
2940                                                 minFlowGram = j;
2941                                                 minFlowValue = adF[j];
2942                                         }
2943                                 }
2944                                 
2945                                 if(centroids[i] != anL[minFlowGram]){
2946                                         change[i] = 1;
2947                                         centroids[i] = anL[minFlowGram];
2948                                 }
2949                         }
2950                         else if(centroids[i] != -1){
2951                                 change[i] = 1;
2952                                 centroids[i] = -1;                      
2953                         }
2954                 }
2955         
2956         return 0;
2957         }
2958         catch(exception& e) {
2959                 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
2960                 exit(1);        
2961         }               
2962 }
2963 /**************************************************************************************************/
2964
2965 double ShhherCommand::getDistToCentroid(int cent, int flow, int length, vector<short>& uniqueFlowgrams,
2966                                         vector<short>& flowDataIntI, int numFlowCells){
2967         try{
2968                 
2969                 int flowAValue = cent * numFlowCells;
2970                 int flowBValue = flow * numFlowCells;
2971                 
2972                 double dist = 0;
2973         
2974                 for(int i=0;i<length;i++){
2975                         dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
2976                         flowAValue++;
2977                         flowBValue++;
2978                 }
2979                 
2980                 return dist / (double)length;
2981         }
2982         catch(exception& e) {
2983                 m->errorOut(e, "ShhherCommand", "getDistToCentroid");
2984                 exit(1);        
2985         }               
2986 }
2987 /**************************************************************************************************/
2988
2989 double ShhherCommand::getNewWeights(int numOTUs, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<double>& singleTau, vector<int>& seqNumber, vector<double>& weight){
2990         try{
2991                 
2992                 double maxChange = 0;
2993                 
2994                 for(int i=0;i<numOTUs;i++){
2995                         
2996                         if (m->control_pressed) { break; }
2997                         
2998                         double difference = weight[i];
2999                         weight[i] = 0;
3000                         
3001                         for(int j=0;j<nSeqsPerOTU[i];j++){
3002                                 int index = cumNumSeqs[i] + j;
3003                                 double tauValue = singleTau[seqNumber[index]];
3004                                 weight[i] += tauValue;
3005                         }
3006                         
3007                         difference = fabs(weight[i] - difference);
3008                         if(difference > maxChange){     maxChange = difference; }
3009                 }
3010                 return maxChange;
3011         }
3012         catch(exception& e) {
3013                 m->errorOut(e, "ShhherCommand", "getNewWeights");
3014                 exit(1);        
3015         }               
3016 }
3017
3018 /**************************************************************************************************/
3019
3020 double ShhherCommand::getLikelihood(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<int>& seqNumber, vector<int>& cumNumSeqs, vector<int>& seqIndex, vector<double>& dist, vector<double>& weight){
3021         
3022         try{
3023                 
3024                 vector<long double> P(numSeqs, 0);
3025                 int effNumOTUs = 0;
3026                 
3027                 for(int i=0;i<numOTUs;i++){
3028                         if(weight[i] > MIN_WEIGHT){
3029                                 effNumOTUs++;
3030                         }
3031                 }
3032                 
3033                 string hold;
3034                 for(int i=0;i<numOTUs;i++){
3035                         
3036                         if (m->control_pressed) { break; }
3037                         
3038                         for(int j=0;j<nSeqsPerOTU[i];j++){
3039                                 int index = cumNumSeqs[i] + j;
3040                                 int nI = seqIndex[index];
3041                                 double singleDist = dist[seqNumber[index]];
3042                                 
3043                                 P[nI] += weight[i] * exp(-singleDist * sigma);
3044                         }
3045                 }
3046                 double nLL = 0.00;
3047                 for(int i=0;i<numSeqs;i++){
3048                         if(P[i] == 0){  P[i] = DBL_EPSILON;     }
3049             
3050                         nLL += -log(P[i]);
3051                 }
3052                 
3053                 nLL = nLL -(double)numSeqs * log(sigma);
3054         
3055                 return nLL; 
3056         }
3057         catch(exception& e) {
3058                 m->errorOut(e, "ShhherCommand", "getNewWeights");
3059                 exit(1);        
3060         }               
3061 }
3062
3063 /**************************************************************************************************/
3064
3065 int ShhherCommand::checkCentroids(int numOTUs, vector<int>& centroids, vector<double>& weight){
3066         try{
3067                 vector<int> unique(numOTUs, 1);
3068                 
3069                 for(int i=0;i<numOTUs;i++){
3070                         if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
3071                                 unique[i] = -1;
3072                         }
3073                 }
3074                 
3075                 for(int i=0;i<numOTUs;i++){
3076                         
3077                         if (m->control_pressed) { break; }
3078                         
3079                         if(unique[i] == 1){
3080                                 for(int j=i+1;j<numOTUs;j++){
3081                                         if(unique[j] == 1){
3082                                                 
3083                                                 if(centroids[j] == centroids[i]){
3084                                                         unique[j] = 0;
3085                                                         centroids[j] = -1;
3086                                                         
3087                                                         weight[i] += weight[j];
3088                                                         weight[j] = 0.0;
3089                                                 }
3090                                         }
3091                                 }
3092                         }
3093                 }
3094         
3095         return 0;
3096         }
3097         catch(exception& e) {
3098                 m->errorOut(e, "ShhherCommand", "checkCentroids");
3099                 exit(1);        
3100         }               
3101 }
3102 /**************************************************************************************************/
3103
3104 void ShhherCommand::calcNewDistances(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<double>& dist, 
3105                                      vector<double>& weight, vector<short>& change, vector<int>& centroids,
3106                                      vector<vector<int> >& aaP, vector<double>& singleTau, vector<vector<int> >& aaI,   
3107                                      vector<int>& seqNumber, vector<int>& seqIndex,
3108                                      vector<short>& uniqueFlowgrams,
3109                                      vector<short>& flowDataIntI, int numFlowCells, vector<int>& lengths){
3110         
3111         try{
3112                 
3113                 int total = 0;
3114                 vector<double> newTau(numOTUs,0);
3115                 vector<double> norms(numSeqs, 0);
3116                 nSeqsPerOTU.assign(numOTUs, 0);
3117         
3118                 for(int i=0;i<numSeqs;i++){
3119                         
3120                         if (m->control_pressed) { break; }
3121                         
3122                         int indexOffset = i * numOTUs;
3123             
3124                         double offset = 1e8;
3125                         
3126                         for(int j=0;j<numOTUs;j++){
3127                 
3128                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
3129                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i], uniqueFlowgrams, flowDataIntI, numFlowCells);
3130                                 }
3131                 
3132                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
3133                                         offset = dist[indexOffset + j];
3134                                 }
3135                         }
3136             
3137                         for(int j=0;j<numOTUs;j++){
3138                                 if(weight[j] > MIN_WEIGHT){
3139                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
3140                                         norms[i] += newTau[j];
3141                                 }
3142                                 else{
3143                                         newTau[j] = 0.0;
3144                                 }
3145                         }
3146             
3147                         for(int j=0;j<numOTUs;j++){
3148                                 newTau[j] /= norms[i];
3149                         }
3150             
3151                         for(int j=0;j<numOTUs;j++){
3152                                 if(newTau[j] > MIN_TAU){
3153                                         
3154                                         int oldTotal = total;
3155                                         
3156                                         total++;
3157                                         
3158                                         singleTau.resize(total, 0);
3159                                         seqNumber.resize(total, 0);
3160                                         seqIndex.resize(total, 0);
3161                                         
3162                                         singleTau[oldTotal] = newTau[j];
3163                                         
3164                                         aaP[j][nSeqsPerOTU[j]] = oldTotal;
3165                                         aaI[j][nSeqsPerOTU[j]] = i;
3166                                         nSeqsPerOTU[j]++;
3167                                 }
3168                         }
3169             
3170                 }
3171         
3172         }
3173         catch(exception& e) {
3174                 m->errorOut(e, "ShhherCommand", "calcNewDistances");
3175                 exit(1);        
3176         }               
3177 }
3178 /**************************************************************************************************/
3179
3180 int ShhherCommand::fill(int numOTUs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
3181         try {
3182                 int index = 0;
3183                 for(int i=0;i<numOTUs;i++){
3184                         
3185                         if (m->control_pressed) { return 0; }
3186                         
3187                         cumNumSeqs[i] = index;
3188                         for(int j=0;j<nSeqsPerOTU[i];j++){
3189                                 seqNumber[index] = aaP[i][j];
3190                                 seqIndex[index] = aaI[i][j];
3191                                 
3192                                 index++;
3193                         }
3194                 }
3195         
3196         return 0; 
3197         }
3198         catch(exception& e) {
3199                 m->errorOut(e, "ShhherCommand", "fill");
3200                 exit(1);        
3201         }               
3202 }
3203 /**************************************************************************************************/
3204
3205 void ShhherCommand::setOTUs(int numOTUs, int numSeqs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU,
3206                             vector<int>& otuData, vector<double>& singleTau, vector<double>& dist, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
3207         
3208         try {
3209                 vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
3210                 
3211                 for(int i=0;i<numOTUs;i++){
3212                         
3213                         if (m->control_pressed) { break; }
3214                         
3215                         for(int j=0;j<nSeqsPerOTU[i];j++){
3216                                 int index = cumNumSeqs[i] + j;
3217                                 double tauValue = singleTau[seqNumber[index]];
3218                                 int sIndex = seqIndex[index];
3219                                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
3220                         }
3221                 }
3222                 
3223                 for(int i=0;i<numSeqs;i++){
3224                         double maxTau = -1.0000;
3225                         int maxOTU = -1;
3226                         for(int j=0;j<numOTUs;j++){
3227                                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
3228                                         maxTau = bigTauMatrix[i * numOTUs + j];
3229                                         maxOTU = j;
3230                                 }
3231                         }
3232                         
3233                         otuData[i] = maxOTU;
3234                 }
3235                 
3236                 nSeqsPerOTU.assign(numOTUs, 0);         
3237                 
3238                 for(int i=0;i<numSeqs;i++){
3239                         int index = otuData[i];
3240                         
3241                         singleTau[i] = 1.0000;
3242                         dist[i] = 0.0000;
3243                         
3244                         aaP[index][nSeqsPerOTU[index]] = i;
3245                         aaI[index][nSeqsPerOTU[index]] = i;
3246                         
3247                         nSeqsPerOTU[index]++;
3248                 }
3249         
3250                 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);  
3251         }
3252         catch(exception& e) {
3253                 m->errorOut(e, "ShhherCommand", "setOTUs");
3254                 exit(1);        
3255         }               
3256 }
3257 /**************************************************************************************************/
3258
3259 void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string qualityFileName, vector<int> otuCounts, vector<int>& nSeqsPerOTU, vector<int>& seqNumber,
3260                                    vector<double>& singleTau, vector<short>& flowDataIntI, vector<short>& uniqueFlowgrams, vector<int>& cumNumSeqs,
3261                                    vector<int>& mapUniqueToSeq, vector<string>& seqNameVector, vector<int>& centroids, vector<vector<int> >& aaI){
3262         
3263         try {
3264         
3265                 ofstream qualityFile;
3266                 m->openOutputFile(qualityFileName, qualityFile);
3267         
3268                 qualityFile.setf(ios::fixed, ios::floatfield);
3269                 qualityFile.setf(ios::showpoint);
3270                 qualityFile << setprecision(6);
3271                 
3272                 vector<vector<int> > qualities(numOTUs);
3273                 vector<double> pr(HOMOPS, 0);
3274                 
3275                 
3276                 for(int i=0;i<numOTUs;i++){
3277                         
3278                         if (m->control_pressed) { break; }
3279                         
3280                         int index = 0;
3281             
3282                         if(nSeqsPerOTU[i] > 0){
3283                                 
3284                                 while(index < numFlowCells){
3285                     
3286                                         double maxPrValue = 1e8;
3287                                         short maxPrIndex = -1;
3288                                         double count = 0.0000;
3289                                         
3290                                         pr.assign(HOMOPS, 0);
3291                                         
3292                                         for(int j=0;j<nSeqsPerOTU[i];j++){
3293                                                 int lIndex = cumNumSeqs[i] + j;
3294                                                 double tauValue = singleTau[seqNumber[lIndex]];
3295                                                 int sequenceIndex = aaI[i][j];
3296                                                 short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
3297                                                 
3298                                                 count += tauValue;
3299                                                 
3300                                                 for(int s=0;s<HOMOPS;s++){
3301                                                         pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
3302                                                 }
3303                                         }
3304                     
3305                                         maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
3306                                         maxPrValue = pr[maxPrIndex];
3307                                         
3308                                         if(count > MIN_COUNT){
3309                                                 double U = 0.0000;
3310                                                 double norm = 0.0000;
3311                                                 
3312                                                 for(int s=0;s<HOMOPS;s++){
3313                                                         norm += exp(-(pr[s] - maxPrValue));
3314                                                 }
3315                         
3316                                                 for(int s=1;s<=maxPrIndex;s++){
3317                                                         int value = 0;
3318                                                         double temp = 0.0000;
3319                                                         
3320                                                         U += exp(-(pr[s-1]-maxPrValue))/norm;
3321                                                         
3322                                                         if(U>0.00){
3323                                                                 temp = log10(U);
3324                                                         }
3325                                                         else{
3326                                                                 temp = -10.1;
3327                                                         }
3328                                                         temp = floor(-10 * temp);
3329                                                         value = (int)floor(temp);
3330                                                         if(value > 100){        value = 100;    }
3331                                                         
3332                             qualities[i].push_back((int)value);
3333                                                 }
3334                                         }//end if
3335                                         
3336                                         index++;
3337                     
3338                                 }//end while
3339                 
3340                         }//end if
3341                         
3342             
3343                         if(otuCounts[i] > 0){
3344                                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
3345                 //need to get past the first four bases
3346                 for (int j = 4; j < qualities[i].size(); j++) { qualityFile << qualities[i][j] << ' '; }
3347                                 qualityFile << endl;
3348                         }
3349                 }//end for
3350                 qualityFile.close();
3351                 outputNames.push_back(qualityFileName); outputTypes["qfile"].push_back(qualityFileName);
3352         
3353         }
3354         catch(exception& e) {
3355                 m->errorOut(e, "ShhherCommand", "writeQualities");
3356                 exit(1);        
3357         }               
3358 }
3359
3360 /**************************************************************************************************/
3361
3362 void ShhherCommand::writeSequences(string thisCompositeFASTAFileName, int numOTUs, int numFlowCells, string fastaFileName, vector<int> otuCounts, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& centroids){
3363         try {
3364                 
3365                 ofstream fastaFile;
3366                 m->openOutputFile(fastaFileName, fastaFile);
3367                 
3368                 vector<string> names(numOTUs, "");
3369                 
3370                 for(int i=0;i<numOTUs;i++){
3371                         
3372                         if (m->control_pressed) { break; }
3373                         
3374                         int index = centroids[i];
3375                         
3376                         if(otuCounts[i] > 0){
3377                                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
3378                                 
3379                                 string newSeq = "";
3380                                 
3381                                 for(int j=0;j<numFlowCells;j++){
3382                                         
3383                                         char base = flowOrder[j % flowOrder.length()];
3384                                         for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
3385                                                 newSeq += base;
3386                                         }
3387                                 }
3388                                 
3389                                 if (newSeq.length() >= 4) {  fastaFile << newSeq.substr(4) << endl;  }
3390                 else {  fastaFile << "NNNN" << endl;  }
3391                         }
3392                 }
3393                 fastaFile.close();
3394         
3395                 outputNames.push_back(fastaFileName); outputTypes["fasta"].push_back(fastaFileName);
3396         
3397                 if(thisCompositeFASTAFileName != ""){
3398                         m->appendFiles(fastaFileName, thisCompositeFASTAFileName);
3399                 }
3400         }
3401         catch(exception& e) {
3402                 m->errorOut(e, "ShhherCommand", "writeSequences");
3403                 exit(1);        
3404         }               
3405 }
3406
3407 /**************************************************************************************************/
3408
3409 void ShhherCommand::writeNames(string thisCompositeNamesFileName, int numOTUs, string nameFileName, vector<int> otuCounts, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU){
3410         try {
3411                 
3412                 ofstream nameFile;
3413                 m->openOutputFile(nameFileName, nameFile);
3414                 
3415                 for(int i=0;i<numOTUs;i++){
3416                         
3417                         if (m->control_pressed) { break; }
3418                         
3419                         if(otuCounts[i] > 0){
3420                                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
3421                                 
3422                                 for(int j=1;j<nSeqsPerOTU[i];j++){
3423                                         nameFile << ',' << seqNameVector[aaI[i][j]];
3424                                 }
3425                                 
3426                                 nameFile << endl;
3427                         }
3428                 }
3429                 nameFile.close();
3430                 outputNames.push_back(nameFileName); outputTypes["name"].push_back(nameFileName);
3431                 
3432                 
3433                 if(thisCompositeNamesFileName != ""){
3434                         m->appendFiles(nameFileName, thisCompositeNamesFileName);
3435                 }               
3436         }
3437         catch(exception& e) {
3438                 m->errorOut(e, "ShhherCommand", "writeNames");
3439                 exit(1);        
3440         }               
3441 }
3442
3443 /**************************************************************************************************/
3444
3445 void ShhherCommand::writeGroups(string groupFileName, string fileRoot, int numSeqs, vector<string>& seqNameVector){
3446         try {
3447         ofstream groupFile;
3448                 m->openOutputFile(groupFileName, groupFile);
3449                 
3450                 for(int i=0;i<numSeqs;i++){
3451                         if (m->control_pressed) { break; }
3452                         groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
3453                 }
3454                 groupFile.close();
3455                 outputNames.push_back(groupFileName); outputTypes["group"].push_back(groupFileName);
3456         
3457         }
3458         catch(exception& e) {
3459                 m->errorOut(e, "ShhherCommand", "writeGroups");
3460                 exit(1);        
3461         }               
3462 }
3463
3464 /**************************************************************************************************/
3465
3466 void ShhherCommand::writeClusters(string otuCountsFileName, int numOTUs, int numFlowCells, vector<int> otuCounts, vector<int>& centroids, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU, vector<int>& lengths, vector<short>& flowDataIntI){
3467         try {
3468                 ofstream otuCountsFile;
3469                 m->openOutputFile(otuCountsFileName, otuCountsFile);
3470                 
3471                 string bases = flowOrder;
3472                 
3473                 for(int i=0;i<numOTUs;i++){
3474                         
3475                         if (m->control_pressed) {
3476                                 break;
3477                         }
3478                         //output the translated version of the centroid sequence for the otu
3479                         if(otuCounts[i] > 0){
3480                                 int index = centroids[i];
3481                                 
3482                                 otuCountsFile << "ideal\t";
3483                                 for(int j=8;j<numFlowCells;j++){
3484                                         char base = bases[j % bases.length()];
3485                                         for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
3486                                                 otuCountsFile << base;
3487                                         }
3488                                 }
3489                                 otuCountsFile << endl;
3490                                 
3491                                 for(int j=0;j<nSeqsPerOTU[i];j++){
3492                                         int sequence = aaI[i][j];
3493                                         otuCountsFile << seqNameVector[sequence] << '\t';
3494                                         
3495                                         string newSeq = "";
3496                                         
3497                                         for(int k=0;k<lengths[sequence];k++){
3498                                                 char base = bases[k % bases.length()];
3499                                                 int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
3500                         
3501                                                 for(int s=0;s<freq;s++){
3502                                                         newSeq += base;
3503                                                         //otuCountsFile << base;
3504                                                 }
3505                                         }
3506                                         
3507                     if (newSeq.length() >= 4) {  otuCountsFile << newSeq.substr(4) << endl;  }
3508                     else {  otuCountsFile << "NNNN" << endl;  }
3509                                 }
3510                                 otuCountsFile << endl;
3511                         }
3512                 }
3513                 otuCountsFile.close();
3514                 outputNames.push_back(otuCountsFileName); outputTypes["counts"].push_back(otuCountsFileName);
3515         
3516         }
3517         catch(exception& e) {
3518                 m->errorOut(e, "ShhherCommand", "writeClusters");
3519                 exit(1);        
3520         }               
3521 }
3522
3523 /**************************************************************************************************/
3524
3525 void ShhherCommand::getSingleLookUp(){
3526         try{
3527                 //      these are the -log probabilities that a signal corresponds to a particular homopolymer length
3528                 singleLookUp.assign(HOMOPS * NUMBINS, 0);
3529                 
3530                 int index = 0;
3531                 ifstream lookUpFile;
3532                 m->openInputFile(lookupFileName, lookUpFile);
3533                 
3534                 for(int i=0;i<HOMOPS;i++){
3535                         
3536                         if (m->control_pressed) { break; }
3537                         
3538                         float logFracFreq;
3539                         lookUpFile >> logFracFreq;
3540                         
3541                         for(int j=0;j<NUMBINS;j++)      {
3542                                 lookUpFile >> singleLookUp[index];
3543                                 index++;
3544                         }
3545                 }       
3546                 lookUpFile.close();
3547         }
3548         catch(exception& e) {
3549                 m->errorOut(e, "ShhherCommand", "getSingleLookUp");
3550                 exit(1);
3551         }
3552 }
3553
3554 /**************************************************************************************************/
3555
3556 void ShhherCommand::getJointLookUp(){
3557         try{
3558                 
3559                 //      the most likely joint probability (-log) that two intenities have the same polymer length
3560                 jointLookUp.resize(NUMBINS * NUMBINS, 0);
3561                 
3562                 for(int i=0;i<NUMBINS;i++){
3563                         
3564                         if (m->control_pressed) { break; }
3565                         
3566                         for(int j=0;j<NUMBINS;j++){             
3567                                 
3568                                 double minSum = 100000000;
3569                                 
3570                                 for(int k=0;k<HOMOPS;k++){
3571                                         double sum = singleLookUp[k * NUMBINS + i] + singleLookUp[k * NUMBINS + j];
3572                                         
3573                                         if(sum < minSum)        {       minSum = sum;           }
3574                                 }       
3575                                 jointLookUp[i * NUMBINS + j] = minSum;
3576                         }
3577                 }
3578         }
3579         catch(exception& e) {
3580                 m->errorOut(e, "ShhherCommand", "getJointLookUp");
3581                 exit(1);
3582         }
3583 }
3584
3585 /**************************************************************************************************/
3586
3587 double ShhherCommand::getProbIntensity(int intIntensity){                          
3588         try{
3589
3590                 double minNegLogProb = 100000000; 
3591
3592                 
3593                 for(int i=0;i<HOMOPS;i++){//loop signal strength
3594                         
3595                         if (m->control_pressed) { break; }
3596                         
3597                         float negLogProb = singleLookUp[i * NUMBINS + intIntensity];
3598                         if(negLogProb < minNegLogProb)  {       minNegLogProb = negLogProb; }
3599                 }
3600                 
3601                 return minNegLogProb;
3602         }
3603         catch(exception& e) {
3604                 m->errorOut(e, "ShhherCommand", "getProbIntensity");
3605                 exit(1);
3606         }
3607 }
3608
3609
3610
3611