]> git.donarmstrong.com Git - mothur.git/blob - shhhercommand.cpp
working on get.communitytype command
[mothur.git] / shhhercommand.cpp
1 /*
2  *  shhher.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 12/27/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "shhhercommand.h"
11
12 //**********************************************************************************************************************
13 vector<string> ShhherCommand::setParameters(){  
14         try {
15                 CommandParameter pflow("flow", "InputTypes", "", "", "none", "fileflow", "none","fasta-name-group-counts-qfile",false,false,true); parameters.push_back(pflow);
16                 CommandParameter pfile("file", "InputTypes", "", "", "none", "fileflow", "none","fasta-name-group-counts-qfile",false,false,true); parameters.push_back(pfile);
17                 CommandParameter plookup("lookup", "InputTypes", "", "", "none", "none", "none","",false,false,true); parameters.push_back(plookup);
18                 CommandParameter pcutoff("cutoff", "Number", "", "0.01", "", "", "","",false,false); parameters.push_back(pcutoff);
19                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
20                 CommandParameter pmaxiter("maxiter", "Number", "", "1000", "", "", "","",false,false); parameters.push_back(pmaxiter);
21         CommandParameter plarge("large", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(plarge);
22                 CommandParameter psigma("sigma", "Number", "", "60", "", "", "","",false,false); parameters.push_back(psigma);
23                 CommandParameter pmindelta("mindelta", "Number", "", "0.000001", "", "", "","",false,false); parameters.push_back(pmindelta);
24         CommandParameter porder("order", "Multiple", "A-B-I", "A", "", "", "","",false,false, true); parameters.push_back(porder);              CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
25                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
26                 
27                 vector<string> myArray;
28                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
29                 return myArray;
30         }
31         catch(exception& e) {
32                 m->errorOut(e, "ShhherCommand", "setParameters");
33                 exit(1);
34         }
35 }
36 //**********************************************************************************************************************
37 string ShhherCommand::getHelpString(){  
38         try {
39                 string helpString = "";
40                 helpString += "The shhh.flows command reads a file containing flowgrams and creates a file of corrected sequences.\n";
41         helpString += "The shhh.flows command parameters are flow, file, lookup, cutoff, processors, large, maxiter, sigma, mindelta and order.\n";
42         helpString += "The flow parameter is used to input your flow file.\n";
43         helpString += "The file parameter is used to input the *flow.files file created by trim.flows.\n";
44         helpString += "The lookup parameter is used specify the lookup file you would like to use. http://www.mothur.org/wiki/Lookup_files.\n";
45         helpString += "The order parameter options are A, B or I.  Default=A. A = TACG and B = TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC and I = TACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGC.\n";
46                 return helpString;
47         }
48         catch(exception& e) {
49                 m->errorOut(e, "ShhherCommand", "getHelpString");
50                 exit(1);
51         }
52 }
53 //**********************************************************************************************************************
54 string ShhherCommand::getOutputPattern(string type) {
55     try {
56         string pattern = "";
57         
58         if (type == "fasta")            {   pattern = "[filename],shhh.fasta";   }
59         else if (type == "name")    {   pattern = "[filename],shhh.names";   }
60         else if (type == "group")        {   pattern = "[filename],shhh.groups";   }
61         else if (type == "counts")        {   pattern = "[filename],shhh.counts";   }
62         else if (type == "qfile")        {   pattern = "[filename],shhh.qual";   }
63         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
64         
65         return pattern;
66     }
67     catch(exception& e) {
68         m->errorOut(e, "ShhherCommand", "getOutputPattern");
69         exit(1);
70     }
71 }
72 //**********************************************************************************************************************
73
74 ShhherCommand::ShhherCommand(){ 
75         try {
76                 abort = true; calledHelp = true;
77                 setParameters();
78                 
79                 //initialize outputTypes
80                 vector<string> tempOutNames;
81                 outputTypes["fasta"] = tempOutNames;
82         outputTypes["name"] = tempOutNames;
83         outputTypes["group"] = tempOutNames;
84         outputTypes["counts"] = tempOutNames;
85         outputTypes["qfile"] = tempOutNames;
86
87         }
88         catch(exception& e) {
89                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
90                 exit(1);
91         }
92 }
93
94 //**********************************************************************************************************************
95
96 ShhherCommand::ShhherCommand(string option) {
97         try {
98         
99 #ifdef USE_MPI
100                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
101                 MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
102         
103                 if(pid == 0){
104 #endif
105                 abort = false; calledHelp = false;   
106                 
107                 //allow user to run help
108                 if(option == "help") { help(); abort = true; calledHelp = true; }
109                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
110                 
111                 else {
112                         vector<string> myArray = setParameters();
113                         
114                         OptionParser parser(option);
115                         map<string,string> parameters = parser.getParameters();
116                         
117                         ValidParameters validParameter;
118                         map<string,string>::iterator it;
119                         
120                         //check to make sure all parameters are valid for command
121                         for (it = parameters.begin(); it != parameters.end(); it++) { 
122                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
123                         }
124                         
125                         //initialize outputTypes
126             vector<string> tempOutNames;
127             outputTypes["fasta"] = tempOutNames;
128             outputTypes["name"] = tempOutNames;
129             outputTypes["group"] = tempOutNames;
130             outputTypes["counts"] = tempOutNames;
131             outputTypes["qfile"] = tempOutNames;
132
133                         
134                         //if the user changes the input directory command factory will send this info to us in the output parameter 
135                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
136                         if (inputDir == "not found"){   inputDir = "";          }
137                         else {
138                                 string path;
139                                 it = parameters.find("flow");
140                                 //user has given a template file
141                                 if(it != parameters.end()){ 
142                                         path = m->hasPath(it->second);
143                                         //if the user has not given a path then, add inputdir. else leave path alone.
144                                         if (path == "") {       parameters["flow"] = inputDir + it->second;             }
145                                 }
146                                 
147                                 it = parameters.find("lookup");
148                                 //user has given a template file
149                                 if(it != parameters.end()){ 
150                                         path = m->hasPath(it->second);
151                                         //if the user has not given a path then, add inputdir. else leave path alone.
152                                         if (path == "") {       parameters["lookup"] = inputDir + it->second;           }
153                                 }
154
155                                 it = parameters.find("file");
156                                 //user has given a template file
157                                 if(it != parameters.end()){ 
158                                         path = m->hasPath(it->second);
159                                         //if the user has not given a path then, add inputdir. else leave path alone.
160                                         if (path == "") {       parameters["file"] = inputDir + it->second;             }
161                                 }
162                         }
163             
164             //if the user changes the output directory command factory will send this info to us in the output parameter 
165                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = ""; }
166             
167                         //check for required parameters
168                         flowFileName = validParameter.validFile(parameters, "flow", true);
169                         flowFilesFileName = validParameter.validFile(parameters, "file", true);
170                         if (flowFileName == "not found" && flowFilesFileName == "not found") {
171                                 m->mothurOut("values for either flow or file must be provided for the shhh.flows command.");
172                                 m->mothurOutEndLine();
173                                 abort = true; 
174                         }
175                         else if (flowFileName == "not open" || flowFilesFileName == "not open") { abort = true; }
176                         
177                         if(flowFileName != "not found"){
178                                 compositeFASTAFileName = "";    
179                                 compositeNamesFileName = "";    
180                         }
181                         else{
182                                 ofstream temp;
183                 
184                 string thisoutputDir = outputDir;
185                 if (outputDir == "") {  thisoutputDir =  m->hasPath(flowFilesFileName); } //if user entered a file with a path then preserve it
186                 
187                                 //we want to rip off .files, and also .flow if its there
188                 string fileroot = m->getRootName(m->getSimpleName(flowFilesFileName));
189                 if (fileroot[fileroot.length()-1] == '.') {  fileroot = fileroot.substr(0, fileroot.length()-1); } //rip off dot
190                 string extension = m->getExtension(fileroot);
191                 if (extension == ".flow") { fileroot = m->getRootName(fileroot);  }
192                 else { fileroot += "."; } //add back if needed
193                 
194                                 compositeFASTAFileName = thisoutputDir + fileroot + "shhh.fasta";
195                                 m->openOutputFile(compositeFASTAFileName, temp);
196                                 temp.close();
197                                 
198                                 compositeNamesFileName = thisoutputDir + fileroot + "shhh.names";
199                                 m->openOutputFile(compositeNamesFileName, temp);
200                                 temp.close();
201                         }
202             
203             if(flowFilesFileName != "not found"){
204                 string fName;
205                 
206                 ifstream flowFilesFile;
207                 m->openInputFile(flowFilesFileName, flowFilesFile);
208                 while(flowFilesFile){
209                     fName = m->getline(flowFilesFile);
210                     
211                     //test if file is valid
212                     ifstream in;
213                     int ableToOpen = m->openInputFile(fName, in, "noerror");
214                     in.close(); 
215                     if (ableToOpen == 1) {
216                         if (inputDir != "") { //default path is set
217                             string tryPath = inputDir + fName;
218                             m->mothurOut("Unable to open " + fName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
219                             ifstream in2;
220                             ableToOpen = m->openInputFile(tryPath, in2, "noerror");
221                             in2.close();
222                             fName = tryPath;
223                         }
224                     }
225                     
226                     if (ableToOpen == 1) {
227                         if (m->getDefaultPath() != "") { //default path is set
228                             string tryPath = m->getDefaultPath() + m->getSimpleName(fName);
229                             m->mothurOut("Unable to open " + fName + ". Trying default " + tryPath); m->mothurOutEndLine();
230                             ifstream in2;
231                             ableToOpen = m->openInputFile(tryPath, in2, "noerror");
232                             in2.close();
233                             fName = tryPath;
234                         }
235                     }
236                     
237                     //if you can't open it its not in current working directory or inputDir, try mothur excutable location
238                     if (ableToOpen == 1) {
239                         string exepath = m->argv;
240                         string tempPath = exepath;
241                         for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
242                         exepath = exepath.substr(0, (tempPath.find_last_of('m')));
243                         
244                         string tryPath = m->getFullPathName(exepath) + m->getSimpleName(fName);
245                         m->mothurOut("Unable to open " + fName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
246                         ifstream in2;
247                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
248                         in2.close();
249                         fName = tryPath;
250                     }
251                     
252                     if (ableToOpen == 1) {  m->mothurOut("Unable to open " + fName + ". Disregarding. "); m->mothurOutEndLine();  }
253                     else { flowFileVector.push_back(fName); }
254                     m->gobble(flowFilesFile);
255                 }
256                 flowFilesFile.close();
257                 if (flowFileVector.size() == 0) {  m->mothurOut("[ERROR]: no valid files."); m->mothurOutEndLine(); abort = true; }
258             }
259             else{
260                 if (outputDir == "") { outputDir = m->hasPath(flowFileName); }
261                 flowFileVector.push_back(flowFileName);
262             }
263                 
264                         //check for optional parameter and set defaults
265                         // ...at some point should added some additional type checking...
266                         string temp;
267                         temp = validParameter.validFile(parameters, "lookup", true);
268                         if (temp == "not found")        {       
269                                 string path = m->argv;
270                 string tempPath = path;
271                 for (int i = 0; i < path.length(); i++) { tempPath[i] = tolower(path[i]); }
272                 path = path.substr(0, (tempPath.find_last_of('m')));
273                 
274 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
275                 path += "lookupFiles/";
276 #else
277                 path += "lookupFiles\\";
278 #endif
279                                 lookupFileName = m->getFullPathName(path) + "LookUp_Titanium.pat";
280                                 
281                                 int ableToOpen;
282                                 ifstream in;
283                                 ableToOpen = m->openInputFile(lookupFileName, in, "noerror");
284                                 in.close();     
285                                 
286                                 //if you can't open it, try input location
287                                 if (ableToOpen == 1) {
288                                         if (inputDir != "") { //default path is set
289                                                 string tryPath = inputDir + m->getSimpleName(lookupFileName);
290                                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
291                                                 ifstream in2;
292                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
293                                                 in2.close();
294                                                 lookupFileName = tryPath;
295                                         }
296                                 }
297                                 
298                                 //if you can't open it, try default location
299                                 if (ableToOpen == 1) {
300                                         if (m->getDefaultPath() != "") { //default path is set
301                                                 string tryPath = m->getDefaultPath() + m->getSimpleName(lookupFileName);
302                                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying default " + tryPath); m->mothurOutEndLine();
303                                                 ifstream in2;
304                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
305                                                 in2.close();
306                                                 lookupFileName = tryPath;
307                                         }
308                                 }
309                                 
310                                 //if you can't open it its not in current working directory or inputDir, try mothur excutable location
311                                 if (ableToOpen == 1) {
312                                         string exepath = m->argv;
313                                         string tempPath = exepath;
314                                         for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
315                                         exepath = exepath.substr(0, (tempPath.find_last_of('m')));
316                                         
317                                         string tryPath = m->getFullPathName(exepath) + m->getSimpleName(lookupFileName);
318                                         m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
319                                         ifstream in2;
320                                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
321                                         in2.close();
322                                         lookupFileName = tryPath;
323                                 }
324                                 
325                                 if (ableToOpen == 1) {  m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true;  }
326                         }
327                         else if(temp == "not open")     {       
328                                 
329                                 lookupFileName = validParameter.validFile(parameters, "lookup", false);
330                                 
331                                 //if you can't open it its not inputDir, try mothur excutable location
332                                 string exepath = m->argv;
333                                 string tempPath = exepath;
334                                 for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
335                                 exepath = exepath.substr(0, (tempPath.find_last_of('m')));
336                                         
337                                 string tryPath = m->getFullPathName(exepath) + m->getSimpleName(lookupFileName);
338                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
339                                 ifstream in2;
340                                 int ableToOpen = m->openInputFile(tryPath, in2, "noerror");
341                                 in2.close();
342                                 lookupFileName = tryPath;
343                                 
344                                 if (ableToOpen == 1) {  m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true;  }
345                         }else                                           {       lookupFileName = temp;  }
346                         
347                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
348                         m->setProcessors(temp);
349                         m->mothurConvert(temp, processors);
350
351                         temp = validParameter.validFile(parameters, "cutoff", false);   if (temp == "not found"){       temp = "0.01";          }
352                         m->mothurConvert(temp, cutoff); 
353                         
354                         temp = validParameter.validFile(parameters, "mindelta", false); if (temp == "not found"){       temp = "0.000001";      }
355                         m->mothurConvert(temp, minDelta); 
356
357                         temp = validParameter.validFile(parameters, "maxiter", false);  if (temp == "not found"){       temp = "1000";          }
358                         m->mothurConvert(temp, maxIters); 
359             
360             temp = validParameter.validFile(parameters, "large", false);        if (temp == "not found"){       temp = "0";             }
361                         m->mothurConvert(temp, largeSize); 
362             if (largeSize != 0) { large = true; }
363             else { large = false;  }
364             if (largeSize < 0) {  m->mothurOut("The value of the large cannot be negative.\n"); }
365             
366 #ifdef USE_MPI
367             if (large) { m->mothurOut("The large parameter is not available with the MPI-Enabled version.\n"); large=false; }                           
368 #endif
369             
370
371                         temp = validParameter.validFile(parameters, "sigma", false);if (temp == "not found")    {       temp = "60";            }
372                         m->mothurConvert(temp, sigma); 
373                         
374                         temp = validParameter.validFile(parameters, "order", false);  if (temp == "not found"){         temp = "A";     }
375             if (temp.length() > 1) {  m->mothurOut("[ERROR]: " + temp + " is not a valid option for order. order options are A, B, or I. A = TACG, B = TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC, and I = TACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGC.\n");  abort=true;
376             }
377             else {
378                 if (toupper(temp[0]) == 'A') {  flowOrder = "TACG";   }
379                 else if(toupper(temp[0]) == 'B'){
380                     flowOrder = "TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC";   }
381                 else if(toupper(temp[0]) == 'I'){
382                     flowOrder = "TACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGC";   }
383                 else {
384                     m->mothurOut("[ERROR]: " + temp + " is not a valid option for order. order options are A, B, or I. A = TACG, B = TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC, and I = TACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGC.\n");  abort=true;
385                 }
386             }
387
388                         
389                 }
390 #ifdef USE_MPI
391                 }                               
392 #endif
393         }
394         catch(exception& e) {
395                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
396                 exit(1);
397         }
398 }
399 //**********************************************************************************************************************
400 #ifdef USE_MPI
401 int ShhherCommand::execute(){
402         try {
403                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
404                 
405                 int tag = 1976;
406                 MPI_Status status; 
407                         
408                 if(pid == 0){
409
410                         for(int i=1;i<ncpus;i++){
411                                 MPI_Send(&abort, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
412                         }
413                         if(abort == 1){ return 0;       }
414
415                         processors = ncpus;
416                         
417                         m->mothurOut("\nGetting preliminary data...\n");
418                         getSingleLookUp();      if (m->control_pressed) { return 0; }
419                         getJointLookUp();       if (m->control_pressed) { return 0; }
420                         
421             vector<string> flowFileVector;
422                         if(flowFilesFileName != "not found"){
423                                 string fName;
424                 
425                                 ifstream flowFilesFile;
426                                 m->openInputFile(flowFilesFileName, flowFilesFile);
427                                 while(flowFilesFile){
428                                         fName = m->getline(flowFilesFile);
429                                         flowFileVector.push_back(fName);
430                                         m->gobble(flowFilesFile);
431                                 }
432                         }
433                         else{
434                                 flowFileVector.push_back(flowFileName);
435                         }
436             
437                         int numFiles = flowFileVector.size();
438
439                         for(int i=1;i<ncpus;i++){
440                                 MPI_Send(&numFiles, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
441                         }
442                         
443                         for(int i=0;i<numFiles;i++){
444                                 
445                                 if (m->control_pressed) { break; }
446                                 
447                                 double begClock = clock();
448                                 unsigned long long begTime = time(NULL);
449
450                                 flowFileName = flowFileVector[i];
451                                 
452                                 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
453                                 m->mothurOut("Reading flowgrams...\n");
454                                 getFlowData();
455                                 
456                                 if (m->control_pressed) { break; }
457
458                                 m->mothurOut("Identifying unique flowgrams...\n");
459                                 getUniques();
460                                 
461                                 if (m->control_pressed) { break; }
462
463                                 m->mothurOut("Calculating distances between flowgrams...\n");
464                                 char fileName[1024];
465                                 strcpy(fileName, flowFileName.c_str());
466
467                                 for(int i=1;i<ncpus;i++){
468                                         MPI_Send(&fileName[0], 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
469
470                                         MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
471                                         MPI_Send(&numUniques, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
472                                         MPI_Send(&numFlowCells, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
473                                         MPI_Send(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, i, tag, MPI_COMM_WORLD);
474                                         MPI_Send(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
475                                         MPI_Send(&mapUniqueToSeq[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
476                                         MPI_Send(&mapSeqToUnique[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
477                                         MPI_Send(&lengths[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
478                                         MPI_Send(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
479                                         MPI_Send(&cutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
480                                 }                               
481                                                         
482                                 string distFileName = flowDistMPI(0, int(sqrt(1.0/float(ncpus)) * numUniques));
483                                 
484                                 if (m->control_pressed) { break; }
485                                 
486                                 int done;
487                                 for(int i=1;i<ncpus;i++){
488                                         MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
489                                         
490                                         m->appendFiles((distFileName + ".temp." + toString(i)), distFileName);
491                                         m->mothurRemove((distFileName + ".temp." + toString(i)));
492                                 }
493
494                                 string namesFileName = createNamesFile();
495                                 
496                                 if (m->control_pressed) { break; }
497                                 
498                                 m->mothurOut("\nClustering flowgrams...\n");
499                                 string listFileName = cluster(distFileName, namesFileName);
500                                 
501                                 if (m->control_pressed) { break; }
502                                 
503                 
504                 
505                                 getOTUData(listFileName);
506
507                                 m->mothurRemove(distFileName);
508                                 m->mothurRemove(namesFileName);
509                                 m->mothurRemove(listFileName);
510                                 
511                                 if (m->control_pressed) { break; }
512                                 
513                                 initPyroCluster();
514
515                                 if (m->control_pressed) { break; }
516                                 
517             
518                                 for(int i=1;i<ncpus;i++){
519                                         MPI_Send(&numOTUs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
520                                         MPI_Send(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
521                                         MPI_Send(&uniqueFlowgrams[0], numFlowCells * numUniques, MPI_SHORT, i, tag, MPI_COMM_WORLD);
522                                         MPI_Send(&sigma, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
523                                 }
524                                 
525                                 if (m->control_pressed) { break; }
526                                 
527                                 double maxDelta = 0;
528                                 int iter = 0;
529                                 
530                                 int numOTUsOnCPU = numOTUs / ncpus;
531                                 int numSeqsOnCPU = numSeqs / ncpus;
532                                 m->mothurOut("\nDenoising flowgrams...\n");
533                                 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
534                                 
535                                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
536                                         
537                                         double cycClock = clock();
538                                         unsigned long long cycTime = time(NULL);
539                                         fill();
540                                         
541                                         if (m->control_pressed) { break; }
542
543                                         int total = singleTau.size();
544                                         for(int i=1;i<ncpus;i++){
545                                                 MPI_Send(&total, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
546                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
547                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
548                                                 
549                                                 MPI_Send(&singleTau[0], total, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
550                                                 MPI_Send(&seqNumber[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
551                                                 MPI_Send(&seqIndex[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
552                                                 MPI_Send(&nSeqsPerOTU[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
553                                                 MPI_Send(&cumNumSeqs[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
554                                         }
555                                         
556                                         calcCentroidsDriver(0, numOTUsOnCPU);
557                                         
558                                         for(int i=1;i<ncpus;i++){
559                                                 int otuStart = i * numOTUs / ncpus;
560                                                 int otuStop = (i + 1) * numOTUs / ncpus;
561                                                 
562                                                 vector<int> tempCentroids(numOTUs, 0);
563                                                 vector<short> tempChange(numOTUs, 0);
564                                                 
565                                                 MPI_Recv(&tempCentroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
566                                                 MPI_Recv(&tempChange[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD, &status);
567                                                 
568                                                 for(int j=otuStart;j<otuStop;j++){
569                                                         centroids[j] = tempCentroids[j];
570                                                         change[j] = tempChange[j];
571                                                 }
572                                         }
573                                                                         
574                                         maxDelta = getNewWeights(); if (m->control_pressed) { break; }
575                                         double nLL = getLikelihood(); if (m->control_pressed) { break; }
576                                         checkCentroids(); if (m->control_pressed) { break; }
577                                         
578                                         for(int i=1;i<ncpus;i++){
579                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
580                                                 MPI_Send(&weight[0], numOTUs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
581                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
582                                         }
583                                         
584                                         calcNewDistancesParent(0, numSeqsOnCPU);
585
586                                         total = singleTau.size();
587
588                                         for(int i=1;i<ncpus;i++){
589                                                 int childTotal;
590                                                 int seqStart = i * numSeqs / ncpus;
591                                                 int seqStop = (i + 1) * numSeqs / ncpus;
592                                                 
593                                                 MPI_Recv(&childTotal, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
594
595                                                 vector<int> childSeqIndex(childTotal, 0);
596                                                 vector<double> childSingleTau(childTotal, 0);
597                                                 vector<double> childDist(numSeqs * numOTUs, 0);
598                                                 vector<int> otuIndex(childTotal, 0);
599                                                 
600                                                 MPI_Recv(&childSeqIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
601                                                 MPI_Recv(&childSingleTau[0], childTotal, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
602                                                 MPI_Recv(&childDist[0], numOTUs * numSeqs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
603                                                 MPI_Recv(&otuIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
604
605                                                 int oldTotal = total;
606                                                 total += childTotal;
607                                                 singleTau.resize(total, 0);
608                                                 seqIndex.resize(total, 0);
609                                                 seqNumber.resize(total, 0);
610                                                 
611                                                 int childIndex = 0;
612                                                 
613                                                 for(int j=oldTotal;j<total;j++){
614                                                         int otuI = otuIndex[childIndex];
615                                                         int seqI = childSeqIndex[childIndex];
616                                                         
617                                                         singleTau[j] = childSingleTau[childIndex];
618                                                         
619                                                         aaP[otuI][nSeqsPerOTU[otuI]] = j;
620                                                         aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
621                                                         nSeqsPerOTU[otuI]++;
622                                                         childIndex++;
623                                                 }
624                                                 
625                                                 int index = seqStart * numOTUs;
626                                                 for(int j=seqStart;j<seqStop;j++){
627                                                         for(int k=0;k<numOTUs;k++){
628                                                                 dist[index] = childDist[index];
629                                                                 index++;
630                                                         }
631                                                 }                                       
632                                         }
633                                         
634                                         iter++;
635                                         
636                                         m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');                  
637
638                                         if((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
639                                                 int live = 1;
640                                                 for(int i=1;i<ncpus;i++){
641                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
642                                                 }
643                                         }
644                                         else{
645                                                 int live = 0;
646                                                 for(int i=1;i<ncpus;i++){
647                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD); //send kill command
648                                                 }
649                                         }
650                                         
651                                 }       
652                                 
653                                 if (m->control_pressed) { break; }
654                                 
655                                 m->mothurOut("\nFinalizing...\n");
656                                 fill();
657                                 
658                                 if (m->control_pressed) { break; }
659                                 
660                                 setOTUs();
661                                 
662                                 vector<int> otuCounts(numOTUs, 0);
663                                 for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
664                                 calcCentroidsDriver(0, numOTUs);
665                                 
666                                 if (m->control_pressed) { break; }
667                                 
668                                 writeQualities(otuCounts);      if (m->control_pressed) { break; }
669                                 writeSequences(otuCounts);      if (m->control_pressed) { break; }
670                                 writeNames(otuCounts);          if (m->control_pressed) { break; }
671                                 writeClusters(otuCounts);       if (m->control_pressed) { break; }
672                                 writeGroups();                          if (m->control_pressed) { break; }
673                                 
674                                                                  
675                                 m->mothurOut("Total time to process " + toString(flowFileName) + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');                 
676                         }
677                 }
678                 else{
679                         int abort = 1;
680
681                         MPI_Recv(&abort, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
682                         if(abort){      return 0;       }
683
684                         int numFiles;
685                         MPI_Recv(&numFiles, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
686
687                         for(int i=0;i<numFiles;i++){
688                                 
689                                 if (m->control_pressed) { break; }
690                                 
691                                 //Now into the pyrodist part
692                                 bool live = 1;
693
694                                 char fileName[1024];
695                                 MPI_Recv(&fileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
696                                 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
697                                 MPI_Recv(&numUniques, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
698                                 MPI_Recv(&numFlowCells, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
699                                 
700                                 flowDataIntI.resize(numSeqs * numFlowCells);
701                                 flowDataPrI.resize(numSeqs * numFlowCells);
702                                 mapUniqueToSeq.resize(numSeqs);
703                                 mapSeqToUnique.resize(numSeqs);
704                                 lengths.resize(numSeqs);
705                                 jointLookUp.resize(NUMBINS * NUMBINS);
706
707                                 MPI_Recv(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
708                                 MPI_Recv(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
709                                 MPI_Recv(&mapUniqueToSeq[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
710                                 MPI_Recv(&mapSeqToUnique[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
711                                 MPI_Recv(&lengths[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
712                                 MPI_Recv(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
713                                 MPI_Recv(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
714
715                                 flowFileName = string(fileName);
716                                 int flowDistStart = int(sqrt(float(pid)/float(ncpus)) * numUniques);
717                                 int flowDistEnd = int(sqrt(float(pid+1)/float(ncpus)) * numUniques);
718                                 
719                                 string distanceStringChild = flowDistMPI(flowDistStart, flowDistEnd);
720                                 
721                                 if (m->control_pressed) { break; }
722                                 
723                                 int done = 1;
724                                 MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
725
726                                 //Now into the pyronoise part
727                                 MPI_Recv(&numOTUs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
728                                 
729                                 singleLookUp.resize(HOMOPS * NUMBINS);
730                                 uniqueFlowgrams.resize(numUniques * numFlowCells);
731                                 weight.resize(numOTUs);
732                                 centroids.resize(numOTUs);
733                                 change.resize(numOTUs);
734                                 dist.assign(numOTUs * numSeqs, 0);
735                                 nSeqsPerOTU.resize(numOTUs);
736                                 cumNumSeqs.resize(numOTUs);
737
738                                 MPI_Recv(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
739                                 MPI_Recv(&uniqueFlowgrams[0], uniqueFlowgrams.size(), MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
740                                 MPI_Recv(&sigma, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
741                                 
742                                 int startOTU = pid * numOTUs / ncpus;
743                                 int endOTU = (pid + 1) * numOTUs / ncpus;
744
745                                 int startSeq = pid * numSeqs / ncpus;
746                                 int endSeq = (pid + 1) * numSeqs /ncpus;
747                                 
748                                 int total;
749
750                                 while(live){
751                                         
752                                         if (m->control_pressed) { break; }
753                                         
754                                         MPI_Recv(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
755                                         singleTau.assign(total, 0.0000);
756                                         seqNumber.assign(total, 0);
757                                         seqIndex.assign(total, 0);
758
759                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
760                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
761                                         MPI_Recv(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
762                                         MPI_Recv(&seqNumber[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
763                                         MPI_Recv(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
764                                         MPI_Recv(&nSeqsPerOTU[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
765                                         MPI_Recv(&cumNumSeqs[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
766
767                                         calcCentroidsDriver(startOTU, endOTU);
768
769                                         MPI_Send(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD);
770                                         MPI_Send(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD);
771
772                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
773                                         MPI_Recv(&weight[0], numOTUs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
774                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
775
776                                         vector<int> otuIndex(total, 0);
777                                         calcNewDistancesChildMPI(startSeq, endSeq, otuIndex);
778                                         total = otuIndex.size();
779                                         
780                                         MPI_Send(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
781                                         MPI_Send(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
782                                         MPI_Send(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
783                                         MPI_Send(&dist[0], numOTUs * numSeqs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
784                                         MPI_Send(&otuIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
785                                 
786                                         MPI_Recv(&live, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
787                                 }
788                         }
789                 }
790                 
791                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
792                 
793                 MPI_Barrier(MPI_COMM_WORLD);
794
795                 
796                 if(compositeFASTAFileName != ""){
797                         outputNames.push_back(compositeFASTAFileName); outputTypes["fasta"].push_back(compositeFASTAFileName);
798                         outputNames.push_back(compositeNamesFileName); outputTypes["name"].push_back(compositeNamesFileName); 
799                 }
800
801                 m->mothurOutEndLine();
802                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
803                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
804                 m->mothurOutEndLine();
805                 
806                 return 0;
807
808         }
809         catch(exception& e) {
810                 m->errorOut(e, "ShhherCommand", "execute");
811                 exit(1);
812         }
813 }
814 /**************************************************************************************************/
815 string ShhherCommand::createNamesFile(){
816         try{
817                 
818                 vector<string> duplicateNames(numUniques, "");
819                 for(int i=0;i<numSeqs;i++){
820                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
821                 }
822                 map<string, string> variables; 
823                 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(flowFileName));
824                 string nameFileName = getOutputFileName("name",variables);
825                 
826                 ofstream nameFile;
827                 m->openOutputFile(nameFileName, nameFile);
828                 
829                 for(int i=0;i<numUniques;i++){
830                         
831                         if (m->control_pressed) { break; }
832                         
833             //                  nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
834                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
835                 }
836                 
837                 nameFile.close();
838                 return  nameFileName;
839         }
840         catch(exception& e) {
841                 m->errorOut(e, "ShhherCommand", "createNamesFile");
842                 exit(1);
843         }
844 }
845 /**************************************************************************************************/
846
847 string ShhherCommand::flowDistMPI(int startSeq, int stopSeq){
848         try{            
849                 ostringstream outStream;
850                 outStream.setf(ios::fixed, ios::floatfield);
851                 outStream.setf(ios::dec, ios::basefield);
852                 outStream.setf(ios::showpoint);
853                 outStream.precision(6);
854                 
855                 int begTime = time(NULL);
856                 double begClock = clock();
857                 
858                 for(int i=startSeq;i<stopSeq;i++){
859                         
860                         if (m->control_pressed) { break; }
861                         
862                         for(int j=0;j<i;j++){
863                                 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
864                                 
865                                 if(flowDistance < 1e-6){
866                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
867                                 }
868                                 else if(flowDistance <= cutoff){
869                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
870                                 }
871                         }
872                         if(i % 100 == 0){
873                                 m->mothurOutJustToScreen(toString(i) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
874                         }
875                 }
876                 
877                 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
878                 if(pid != 0){   fDistFileName += ".temp." + toString(pid);      }
879                 
880                 if (m->control_pressed) { return fDistFileName; }
881                 
882                 m->mothurOutJustToScreen(toString(stopSeq) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
883
884                 ofstream distFile(fDistFileName.c_str());
885                 distFile << outStream.str();            
886                 distFile.close();
887                 
888                 return fDistFileName;
889         }
890         catch(exception& e) {
891                 m->errorOut(e, "ShhherCommand", "flowDistMPI");
892                 exit(1);
893         }
894 }
895 /**************************************************************************************************/
896  
897 void ShhherCommand::getOTUData(string listFileName){
898     try {
899         
900         ifstream listFile;
901         m->openInputFile(listFileName, listFile);
902         string label;
903         
904         listFile >> label >> numOTUs;
905         
906         otuData.assign(numSeqs, 0);
907         cumNumSeqs.assign(numOTUs, 0);
908         nSeqsPerOTU.assign(numOTUs, 0);
909         aaP.clear();aaP.resize(numOTUs);
910         
911         seqNumber.clear();
912         aaI.clear();
913         seqIndex.clear();
914         
915         string singleOTU = "";
916         
917         for(int i=0;i<numOTUs;i++){
918             
919             if (m->control_pressed) { break; }
920             
921             listFile >> singleOTU;
922             
923             istringstream otuString(singleOTU);
924             
925             while(otuString){
926                 
927                 string seqName = "";
928                 
929                 for(int j=0;j<singleOTU.length();j++){
930                     char letter = otuString.get();
931                     
932                     if(letter != ','){
933                         seqName += letter;
934                     }
935                     else{
936                         map<string,int>::iterator nmIt = nameMap.find(seqName);
937                         int index = nmIt->second;
938                         
939                         nameMap.erase(nmIt);
940                         
941                         otuData[index] = i;
942                         nSeqsPerOTU[i]++;
943                         aaP[i].push_back(index);
944                         seqName = "";
945                     }
946                 }
947                 
948                 map<string,int>::iterator nmIt = nameMap.find(seqName);
949                 
950                 int index = nmIt->second;
951                 nameMap.erase(nmIt);
952                 
953                 otuData[index] = i;
954                 nSeqsPerOTU[i]++;
955                 aaP[i].push_back(index);        
956                 
957                 otuString.get();
958             }
959             
960             sort(aaP[i].begin(), aaP[i].end());
961             for(int j=0;j<nSeqsPerOTU[i];j++){
962                 seqNumber.push_back(aaP[i][j]);
963             }
964             for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
965                 aaP[i].push_back(0);
966             }
967             
968             
969         }
970         
971         for(int i=1;i<numOTUs;i++){
972             cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
973         }
974         aaI = aaP;
975         seqIndex = seqNumber;
976         
977         listFile.close();       
978         
979     }
980     catch(exception& e) {
981         m->errorOut(e, "ShhherCommand", "getOTUData");
982         exit(1);        
983     }           
984 }
985
986 /**************************************************************************************************/
987
988 void ShhherCommand::initPyroCluster(){                          
989     try{
990         if (numOTUs < processors) { processors = 1; }
991         
992         if (m->debug) { m->mothurOut("[DEBUG]: numSeqs = " + toString(numSeqs) + " numOTUS = " + toString(numOTUs) + " about to alloc a dist vector with size = " + toString((numSeqs * numOTUs)) + ".\n"); }
993         
994         dist.assign(numSeqs * numOTUs, 0);
995         change.assign(numOTUs, 1);
996         centroids.assign(numOTUs, -1);
997         weight.assign(numOTUs, 0);
998         singleTau.assign(numSeqs, 1.0);
999         
1000         nSeqsBreaks.assign(processors+1, 0);
1001         nOTUsBreaks.assign(processors+1, 0);
1002         
1003         if (m->debug) { m->mothurOut("[DEBUG]: made it through the memory allocation.\n"); }
1004         
1005         nSeqsBreaks[0] = 0;
1006         for(int i=0;i<processors;i++){
1007             nSeqsBreaks[i+1] = nSeqsBreaks[i] + (int)((double) numSeqs / (double) processors);
1008             nOTUsBreaks[i+1] = nOTUsBreaks[i] + (int)((double) numOTUs / (double) processors);
1009         }
1010         nSeqsBreaks[processors] = numSeqs;
1011         nOTUsBreaks[processors] = numOTUs;
1012     }
1013     catch(exception& e) {
1014         m->errorOut(e, "ShhherCommand", "initPyroCluster");
1015         exit(1);        
1016     }
1017 }
1018
1019 /**************************************************************************************************/
1020
1021 void ShhherCommand::fill(){
1022     try {
1023         int index = 0;
1024         for(int i=0;i<numOTUs;i++){
1025             
1026             if (m->control_pressed) { break; }
1027             
1028             cumNumSeqs[i] = index;
1029             for(int j=0;j<nSeqsPerOTU[i];j++){
1030                 seqNumber[index] = aaP[i][j];
1031                 seqIndex[index] = aaI[i][j];
1032                 
1033                 index++;
1034             }
1035         }
1036     }
1037     catch(exception& e) {
1038         m->errorOut(e, "ShhherCommand", "fill");
1039         exit(1);        
1040     }           
1041 }
1042
1043 /**************************************************************************************************/
1044  
1045 void ShhherCommand::getFlowData(){
1046     try{
1047         ifstream flowFile;
1048         m->openInputFile(flowFileName, flowFile);
1049         
1050         string seqName;
1051         seqNameVector.clear();
1052         lengths.clear();
1053         flowDataIntI.clear();
1054         nameMap.clear();
1055         
1056         
1057         int currentNumFlowCells;
1058         
1059         float intensity;
1060         
1061         string numFlowTest;
1062         flowFile >> numFlowTest;
1063         
1064         if (!m->isContainingOnlyDigits(numFlowTest)) { m->mothurOut("[ERROR]: expected a number and got " + numFlowTest + ", quitting. Did you use the flow parameter instead of the file parameter?"); m->mothurOutEndLine(); exit(1); }
1065         else { convert(numFlowTest, numFlowCells); }
1066         
1067         int index = 0;//pcluster
1068         while(!flowFile.eof()){
1069             
1070             if (m->control_pressed) { break; }
1071             
1072             flowFile >> seqName >> currentNumFlowCells;
1073             lengths.push_back(currentNumFlowCells);
1074             
1075             seqNameVector.push_back(seqName);
1076             nameMap[seqName] = index++;//pcluster
1077             
1078             for(int i=0;i<numFlowCells;i++){
1079                 flowFile >> intensity;
1080                 if(intensity > 9.99)    {       intensity = 9.99;       }
1081                 int intI = int(100 * intensity + 0.0001);
1082                 flowDataIntI.push_back(intI);
1083             }
1084             m->gobble(flowFile);
1085         }
1086         flowFile.close();
1087         
1088         numSeqs = seqNameVector.size();         
1089         
1090         for(int i=0;i<numSeqs;i++){
1091             
1092             if (m->control_pressed) { break; }
1093             
1094             int iNumFlowCells = i * numFlowCells;
1095             for(int j=lengths[i];j<numFlowCells;j++){
1096                 flowDataIntI[iNumFlowCells + j] = 0;
1097             }
1098         }
1099         
1100     }
1101     catch(exception& e) {
1102         m->errorOut(e, "ShhherCommand", "getFlowData");
1103         exit(1);
1104     }
1105 }
1106 /**************************************************************************************************/
1107 void ShhherCommand::calcNewDistancesChildMPI(int startSeq, int stopSeq, vector<int>& otuIndex){
1108         
1109         try{
1110                 vector<double> newTau(numOTUs,0);
1111                 vector<double> norms(numSeqs, 0);
1112                 otuIndex.clear();
1113                 seqIndex.clear();
1114                 singleTau.clear();
1115                 
1116                 for(int i=startSeq;i<stopSeq;i++){
1117                         
1118                         if (m->control_pressed) { break; }
1119                         
1120                         double offset = 1e8;
1121                         int indexOffset = i * numOTUs;
1122                         
1123                         for(int j=0;j<numOTUs;j++){
1124                                 
1125                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1126                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1127                                 }
1128                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1129                                         offset = dist[indexOffset + j];
1130                                 }
1131                         }
1132                         
1133                         for(int j=0;j<numOTUs;j++){
1134                                 if(weight[j] > MIN_WEIGHT){
1135                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1136                                         norms[i] += newTau[j];
1137                                 }
1138                                 else{
1139                                         newTau[j] = 0.0;
1140                                 }
1141                         }
1142                         
1143                         for(int j=0;j<numOTUs;j++){
1144                 
1145                                 newTau[j] /= norms[i];
1146                                 
1147                                 if(newTau[j] > MIN_TAU){
1148                                         otuIndex.push_back(j);
1149                                         seqIndex.push_back(i);
1150                                         singleTau.push_back(newTau[j]);
1151                                 }
1152                         }
1153                         
1154                 }
1155         }
1156         catch(exception& e) {
1157                 m->errorOut(e, "ShhherCommand", "calcNewDistancesChildMPI");
1158                 exit(1);        
1159         }               
1160 }
1161
1162 /**************************************************************************************************/
1163
1164 void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){
1165     
1166     try{
1167         
1168         int total = 0;
1169         vector<double> newTau(numOTUs,0);
1170         vector<double> norms(numSeqs, 0);
1171         nSeqsPerOTU.assign(numOTUs, 0);
1172         
1173         for(int i=startSeq;i<stopSeq;i++){
1174             
1175             if (m->control_pressed) { break; }
1176             
1177             int indexOffset = i * numOTUs;
1178             
1179             double offset = 1e8;
1180             
1181             for(int j=0;j<numOTUs;j++){
1182                 
1183                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1184                     dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1185                 }
1186                 
1187                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1188                     offset = dist[indexOffset + j];
1189                 }
1190             }
1191             
1192             for(int j=0;j<numOTUs;j++){
1193                 if(weight[j] > MIN_WEIGHT){
1194                     newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1195                     norms[i] += newTau[j];
1196                 }
1197                 else{
1198                     newTau[j] = 0.0;
1199                 }
1200             }
1201             
1202             for(int j=0;j<numOTUs;j++){
1203                 newTau[j] /= norms[i];
1204             }
1205             
1206             for(int j=0;j<numOTUs;j++){
1207                 if(newTau[j] > MIN_TAU){
1208                     
1209                     int oldTotal = total;
1210                     
1211                     total++;
1212                     
1213                     singleTau.resize(total, 0);
1214                     seqNumber.resize(total, 0);
1215                     seqIndex.resize(total, 0);
1216                     
1217                     singleTau[oldTotal] = newTau[j];
1218                     
1219                     aaP[j][nSeqsPerOTU[j]] = oldTotal;
1220                     aaI[j][nSeqsPerOTU[j]] = i;
1221                     nSeqsPerOTU[j]++;
1222                 }
1223             }
1224             
1225         }
1226         
1227     }
1228     catch(exception& e) {
1229         m->errorOut(e, "ShhherCommand", "calcNewDistancesParent");
1230         exit(1);        
1231     }           
1232 }
1233
1234 /**************************************************************************************************/
1235
1236 void ShhherCommand::setOTUs(){
1237     
1238     try {
1239         vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
1240         
1241         for(int i=0;i<numOTUs;i++){
1242             
1243             if (m->control_pressed) { break; }
1244             
1245             for(int j=0;j<nSeqsPerOTU[i];j++){
1246                 int index = cumNumSeqs[i] + j;
1247                 double tauValue = singleTau[seqNumber[index]];
1248                 int sIndex = seqIndex[index];
1249                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
1250             }
1251         }
1252         
1253         for(int i=0;i<numSeqs;i++){
1254             double maxTau = -1.0000;
1255             int maxOTU = -1;
1256             for(int j=0;j<numOTUs;j++){
1257                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
1258                     maxTau = bigTauMatrix[i * numOTUs + j];
1259                     maxOTU = j;
1260                 }
1261             }
1262             
1263             otuData[i] = maxOTU;
1264         }
1265         
1266         nSeqsPerOTU.assign(numOTUs, 0);         
1267         
1268         for(int i=0;i<numSeqs;i++){
1269             int index = otuData[i];
1270             
1271             singleTau[i] = 1.0000;
1272             dist[i] = 0.0000;
1273             
1274             aaP[index][nSeqsPerOTU[index]] = i;
1275             aaI[index][nSeqsPerOTU[index]] = i;
1276             
1277             nSeqsPerOTU[index]++;
1278         }
1279         fill(); 
1280     }
1281     catch(exception& e) {
1282         m->errorOut(e, "ShhherCommand", "setOTUs");
1283         exit(1);        
1284     }           
1285 }
1286
1287 /**************************************************************************************************/
1288  
1289 void ShhherCommand::getUniques(){
1290     try{
1291         
1292         
1293         numUniques = 0;
1294         uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
1295         uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
1296         uniqueLengths.assign(numSeqs, 0);
1297         mapSeqToUnique.assign(numSeqs, -1);
1298         mapUniqueToSeq.assign(numSeqs, -1);
1299         
1300         vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
1301         
1302         for(int i=0;i<numSeqs;i++){
1303             
1304             if (m->control_pressed) { break; }
1305             
1306             int index = 0;
1307             
1308             vector<short> current(numFlowCells);
1309             for(int j=0;j<numFlowCells;j++){
1310                 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
1311             }
1312             
1313             for(int j=0;j<numUniques;j++){
1314                 int offset = j * numFlowCells;
1315                 bool toEnd = 1;
1316                 
1317                 int shorterLength;
1318                 if(lengths[i] < uniqueLengths[j])       {       shorterLength = lengths[i];                     }
1319                 else                                                            {       shorterLength = uniqueLengths[j];       }
1320                 
1321                 for(int k=0;k<shorterLength;k++){
1322                     if(current[k] != uniqueFlowgrams[offset + k]){
1323                         toEnd = 0;
1324                         break;
1325                     }
1326                 }
1327                 
1328                 if(toEnd){
1329                     mapSeqToUnique[i] = j;
1330                     uniqueCount[j]++;
1331                     index = j;
1332                     if(lengths[i] > uniqueLengths[j])   {       uniqueLengths[j] = lengths[i];  }
1333                     break;
1334                 }
1335                 index++;
1336             }
1337             
1338             if(index == numUniques){
1339                 uniqueLengths[numUniques] = lengths[i];
1340                 uniqueCount[numUniques] = 1;
1341                 mapSeqToUnique[i] = numUniques;//anMap
1342                 mapUniqueToSeq[numUniques] = i;//anF
1343                 
1344                 for(int k=0;k<numFlowCells;k++){
1345                     uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
1346                     uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
1347                 }
1348                 
1349                 numUniques++;
1350             }
1351         }
1352         uniqueFlowDataIntI.resize(numFlowCells * numUniques);
1353         uniqueLengths.resize(numUniques);       
1354         
1355         flowDataPrI.resize(numSeqs * numFlowCells, 0);
1356         for(int i=0;i<flowDataPrI.size();i++)   {       if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);          }
1357     }
1358     catch(exception& e) {
1359         m->errorOut(e, "ShhherCommand", "getUniques");
1360         exit(1);
1361     }
1362 }
1363
1364 /**************************************************************************************************/
1365
1366 float ShhherCommand::calcPairwiseDist(int seqA, int seqB){
1367     try{
1368         int minLength = lengths[mapSeqToUnique[seqA]];
1369         if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
1370         
1371         int ANumFlowCells = seqA * numFlowCells;
1372         int BNumFlowCells = seqB * numFlowCells;
1373         
1374         float dist = 0;
1375         
1376         for(int i=0;i<minLength;i++){
1377             
1378             if (m->control_pressed) { break; }
1379             
1380             int flowAIntI = flowDataIntI[ANumFlowCells + i];
1381             float flowAPrI = flowDataPrI[ANumFlowCells + i];
1382             
1383             int flowBIntI = flowDataIntI[BNumFlowCells + i];
1384             float flowBPrI = flowDataPrI[BNumFlowCells + i];
1385             dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
1386         }
1387         
1388         dist /= (float) minLength;
1389         return dist;
1390     }
1391     catch(exception& e) {
1392         m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
1393         exit(1);
1394     }
1395 }
1396
1397 //**********************************************************************************************************************/
1398
1399 string ShhherCommand::cluster(string distFileName, string namesFileName){
1400     try {
1401         
1402         ReadMatrix* read = new ReadColumnMatrix(distFileName);  
1403                 read->setCutoff(cutoff);
1404                 
1405                 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
1406                 clusterNameMap->readMap();
1407                 read->read(clusterNameMap);
1408         
1409                 ListVector* list = read->getListVector();
1410                 SparseDistanceMatrix* matrix = read->getDMatrix();
1411                 
1412                 delete read; 
1413                 delete clusterNameMap; 
1414         
1415         RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
1416         
1417         float adjust = -1.0;
1418         Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest", adjust);
1419         string tag = cluster->getTag();
1420         
1421         double clusterCutoff = cutoff;
1422         while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
1423             
1424             if (m->control_pressed) { break; }
1425             
1426             cluster->update(clusterCutoff);
1427         }
1428         
1429         list->setLabel(toString(cutoff));
1430         
1431         string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
1432         ofstream listFile;
1433         m->openOutputFile(listFileName, listFile);
1434         list->print(listFile);
1435         listFile.close();
1436         
1437         delete matrix;  delete cluster; delete rabund; delete list;
1438         
1439         return listFileName;
1440     }
1441     catch(exception& e) {
1442         m->errorOut(e, "ShhherCommand", "cluster");
1443         exit(1);        
1444     }           
1445 }
1446
1447 /**************************************************************************************************/
1448
1449 void ShhherCommand::calcCentroidsDriver(int start, int finish){                          
1450     
1451     //this function gets the most likely homopolymer length at a flow position for a group of sequences
1452     //within an otu
1453     
1454     try{
1455         
1456         for(int i=start;i<finish;i++){
1457             
1458             if (m->control_pressed) { break; }
1459             
1460             double count = 0;
1461             int position = 0;
1462             int minFlowGram = 100000000;
1463             double minFlowValue = 1e8;
1464             change[i] = 0; //FALSE
1465             
1466             for(int j=0;j<nSeqsPerOTU[i];j++){
1467                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
1468             }
1469             
1470             if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
1471                 vector<double> adF(nSeqsPerOTU[i]);
1472                 vector<int> anL(nSeqsPerOTU[i]);
1473                 
1474                 for(int j=0;j<nSeqsPerOTU[i];j++){
1475                     int index = cumNumSeqs[i] + j;
1476                     int nI = seqIndex[index];
1477                     int nIU = mapSeqToUnique[nI];
1478                     
1479                     int k;
1480                     for(k=0;k<position;k++){
1481                         if(nIU == anL[k]){
1482                             break;
1483                         }
1484                     }
1485                     if(k == position){
1486                         anL[position] = nIU;
1487                         adF[position] = 0.0000;
1488                         position++;
1489                     }                                           
1490                 }
1491                 
1492                 for(int j=0;j<nSeqsPerOTU[i];j++){
1493                     int index = cumNumSeqs[i] + j;
1494                     int nI = seqIndex[index];
1495                     
1496                     double tauValue = singleTau[seqNumber[index]];
1497                     
1498                     for(int k=0;k<position;k++){
1499                         double dist = getDistToCentroid(anL[k], nI, lengths[nI]);
1500                         adF[k] += dist * tauValue;
1501                     }
1502                 }
1503                 
1504                 for(int j=0;j<position;j++){
1505                     if(adF[j] < minFlowValue){
1506                         minFlowGram = j;
1507                         minFlowValue = adF[j];
1508                     }
1509                 }
1510                 
1511                 if(centroids[i] != anL[minFlowGram]){
1512                     change[i] = 1;
1513                     centroids[i] = anL[minFlowGram];
1514                 }
1515             }
1516             else if(centroids[i] != -1){
1517                 change[i] = 1;
1518                 centroids[i] = -1;                      
1519             }
1520         }
1521     }
1522     catch(exception& e) {
1523         m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1524         exit(1);        
1525     }           
1526 }
1527
1528 /**************************************************************************************************/
1529
1530 double ShhherCommand::getDistToCentroid(int cent, int flow, int length){
1531     try{
1532         
1533         int flowAValue = cent * numFlowCells;
1534         int flowBValue = flow * numFlowCells;
1535         
1536         double dist = 0;
1537         
1538         for(int i=0;i<length;i++){
1539             dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
1540             flowAValue++;
1541             flowBValue++;
1542         }
1543         
1544         return dist / (double)length;
1545     }
1546     catch(exception& e) {
1547         m->errorOut(e, "ShhherCommand", "getDistToCentroid");
1548         exit(1);        
1549     }           
1550 }
1551
1552 /**************************************************************************************************/
1553
1554 double ShhherCommand::getNewWeights(){
1555     try{
1556         
1557         double maxChange = 0;
1558         
1559         for(int i=0;i<numOTUs;i++){
1560             
1561             if (m->control_pressed) { break; }
1562             
1563             double difference = weight[i];
1564             weight[i] = 0;
1565             
1566             for(int j=0;j<nSeqsPerOTU[i];j++){
1567                 int index = cumNumSeqs[i] + j;
1568                 double tauValue = singleTau[seqNumber[index]];
1569                 weight[i] += tauValue;
1570             }
1571             
1572             difference = fabs(weight[i] - difference);
1573             if(difference > maxChange){ maxChange = difference; }
1574         }
1575         return maxChange;
1576     }
1577     catch(exception& e) {
1578         m->errorOut(e, "ShhherCommand", "getNewWeights");
1579         exit(1);        
1580     }           
1581 }
1582  
1583  /**************************************************************************************************/
1584  
1585 double ShhherCommand::getLikelihood(){
1586     
1587     try{
1588         
1589         vector<long double> P(numSeqs, 0);
1590         int effNumOTUs = 0;
1591         
1592         for(int i=0;i<numOTUs;i++){
1593             if(weight[i] > MIN_WEIGHT){
1594                 effNumOTUs++;
1595             }
1596         }
1597         
1598         string hold;
1599         for(int i=0;i<numOTUs;i++){
1600             
1601             if (m->control_pressed) { break; }
1602             
1603             for(int j=0;j<nSeqsPerOTU[i];j++){
1604                 int index = cumNumSeqs[i] + j;
1605                 int nI = seqIndex[index];
1606                 double singleDist = dist[seqNumber[index]];
1607                 
1608                 P[nI] += weight[i] * exp(-singleDist * sigma);
1609             }
1610         }
1611         double nLL = 0.00;
1612         for(int i=0;i<numSeqs;i++){
1613             if(P[i] == 0){      P[i] = DBL_EPSILON;     }
1614             
1615             nLL += -log(P[i]);
1616         }
1617         
1618         nLL = nLL -(double)numSeqs * log(sigma);
1619         
1620         return nLL; 
1621     }
1622     catch(exception& e) {
1623         m->errorOut(e, "ShhherCommand", "getNewWeights");
1624         exit(1);        
1625     }           
1626 }
1627
1628 /**************************************************************************************************/
1629
1630 void ShhherCommand::checkCentroids(){
1631     try{
1632         vector<int> unique(numOTUs, 1);
1633         
1634         for(int i=0;i<numOTUs;i++){
1635             if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
1636                 unique[i] = -1;
1637             }
1638         }
1639         
1640         for(int i=0;i<numOTUs;i++){
1641             
1642             if (m->control_pressed) { break; }
1643             
1644             if(unique[i] == 1){
1645                 for(int j=i+1;j<numOTUs;j++){
1646                     if(unique[j] == 1){
1647                         
1648                         if(centroids[j] == centroids[i]){
1649                             unique[j] = 0;
1650                             centroids[j] = -1;
1651                             
1652                             weight[i] += weight[j];
1653                             weight[j] = 0.0;
1654                         }
1655                     }
1656                 }
1657             }
1658         }
1659     }
1660     catch(exception& e) {
1661         m->errorOut(e, "ShhherCommand", "checkCentroids");
1662         exit(1);        
1663     }           
1664 }
1665  /**************************************************************************************************/
1666
1667
1668  
1669 void ShhherCommand::writeQualities(vector<int> otuCounts){
1670     
1671     try {
1672         string thisOutputDir = outputDir;
1673         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1674         map<string, string> variables; 
1675                 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(flowFileName));
1676         string qualityFileName = getOutputFileName("qfile",variables);
1677         
1678         ofstream qualityFile;
1679         m->openOutputFile(qualityFileName, qualityFile);
1680         
1681         qualityFile.setf(ios::fixed, ios::floatfield);
1682         qualityFile.setf(ios::showpoint);
1683         qualityFile << setprecision(6);
1684         
1685         vector<vector<int> > qualities(numOTUs);
1686         vector<double> pr(HOMOPS, 0);
1687         
1688         
1689         for(int i=0;i<numOTUs;i++){
1690             
1691             if (m->control_pressed) { break; }
1692             
1693             int index = 0;
1694             
1695             if(nSeqsPerOTU[i] > 0){
1696                 
1697                 while(index < numFlowCells){
1698                     double maxPrValue = 1e8;
1699                     short maxPrIndex = -1;
1700                     double count = 0.0000;
1701                     
1702                     pr.assign(HOMOPS, 0);
1703                     
1704                     for(int j=0;j<nSeqsPerOTU[i];j++){
1705                         int lIndex = cumNumSeqs[i] + j;
1706                         double tauValue = singleTau[seqNumber[lIndex]];
1707                         int sequenceIndex = aaI[i][j];
1708                         short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
1709                         
1710                         count += tauValue;
1711                         
1712                         for(int s=0;s<HOMOPS;s++){
1713                             pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
1714                         }
1715                     }
1716                     
1717                     maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
1718                     maxPrValue = pr[maxPrIndex];
1719                     
1720                     if(count > MIN_COUNT){
1721                         double U = 0.0000;
1722                         double norm = 0.0000;
1723                         
1724                         for(int s=0;s<HOMOPS;s++){
1725                             norm += exp(-(pr[s] - maxPrValue));
1726                         }
1727                         
1728                         for(int s=1;s<=maxPrIndex;s++){
1729                             int value = 0;
1730                             double temp = 0.0000;
1731                             
1732                             U += exp(-(pr[s-1]-maxPrValue))/norm;
1733                             
1734                             if(U>0.00){
1735                                 temp = log10(U);
1736                             }
1737                             else{
1738                                 temp = -10.1;
1739                             }
1740                             temp = floor(-10 * temp);
1741                             value = (int)floor(temp);
1742                             if(value > 100){    value = 100;    }
1743                             
1744                             qualities[i].push_back((int)value);
1745                         }
1746                     }
1747                     
1748                     index++;
1749                 }
1750             }
1751             
1752             
1753             if(otuCounts[i] > 0){
1754                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
1755                 for (int j = 4; j < qualities[i].size(); j++) { qualityFile << qualities[i][j] << ' '; }
1756                 qualityFile << endl;
1757             }
1758         }
1759         qualityFile.close();
1760         outputNames.push_back(qualityFileName); outputTypes["qfile"].push_back(qualityFileName);
1761         
1762     }
1763     catch(exception& e) {
1764         m->errorOut(e, "ShhherCommand", "writeQualities");
1765         exit(1);        
1766     }           
1767 }
1768
1769 /**************************************************************************************************/
1770
1771 void ShhherCommand::writeSequences(vector<int> otuCounts){
1772     try {
1773         string thisOutputDir = outputDir;
1774         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1775         map<string, string> variables; 
1776                 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
1777         string fastaFileName = getOutputFileName("fasta",variables);
1778         ofstream fastaFile;
1779         m->openOutputFile(fastaFileName, fastaFile);
1780         
1781         vector<string> names(numOTUs, "");
1782         
1783         for(int i=0;i<numOTUs;i++){
1784             
1785             if (m->control_pressed) { break; }
1786             
1787             int index = centroids[i];
1788             
1789             if(otuCounts[i] > 0){
1790                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
1791                 
1792                 string newSeq = "";
1793                 
1794                 for(int j=0;j<numFlowCells;j++){
1795                     
1796                     char base = flowOrder[j % flowOrder.length()];
1797                     for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
1798                         newSeq += base;
1799                     }
1800                 }
1801                 
1802                 fastaFile << newSeq.substr(4) << endl;
1803             }
1804         }
1805         fastaFile.close();
1806         
1807         outputNames.push_back(fastaFileName); outputTypes["fasta"].push_back(fastaFileName);
1808         
1809         if(compositeFASTAFileName != ""){
1810             m->appendFiles(fastaFileName, compositeFASTAFileName);
1811         }
1812     }
1813     catch(exception& e) {
1814         m->errorOut(e, "ShhherCommand", "writeSequences");
1815         exit(1);        
1816     }           
1817 }
1818
1819 /**************************************************************************************************/
1820
1821 void ShhherCommand::writeNames(vector<int> otuCounts){
1822     try {
1823         string thisOutputDir = outputDir;
1824         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1825         map<string, string> variables; 
1826                 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
1827         string nameFileName = getOutputFileName("name",variables);
1828         ofstream nameFile;
1829         m->openOutputFile(nameFileName, nameFile);
1830         
1831         for(int i=0;i<numOTUs;i++){
1832             
1833             if (m->control_pressed) { break; }
1834             
1835             if(otuCounts[i] > 0){
1836                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
1837                 
1838                 for(int j=1;j<nSeqsPerOTU[i];j++){
1839                     nameFile << ',' << seqNameVector[aaI[i][j]];
1840                 }
1841                 
1842                 nameFile << endl;
1843             }
1844         }
1845         nameFile.close();
1846         outputNames.push_back(nameFileName); outputTypes["name"].push_back(nameFileName);
1847         
1848         
1849         if(compositeNamesFileName != ""){
1850             m->appendFiles(nameFileName, compositeNamesFileName);
1851         }               
1852     }
1853     catch(exception& e) {
1854         m->errorOut(e, "ShhherCommand", "writeNames");
1855         exit(1);        
1856     }           
1857 }
1858
1859 /**************************************************************************************************/
1860
1861 void ShhherCommand::writeGroups(){
1862     try {
1863         string thisOutputDir = outputDir;
1864         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1865         string fileRoot = m->getRootName(m->getSimpleName(flowFileName));
1866         int pos = fileRoot.find_first_of('.');
1867         string fileGroup = fileRoot;
1868         if (pos != string::npos) {  fileGroup = fileRoot.substr(pos+1, (fileRoot.length()-1-(pos+1)));  }
1869         map<string, string> variables; 
1870                 variables["[filename]"] = thisOutputDir + fileRoot;
1871         string groupFileName = getOutputFileName("group",variables);
1872         ofstream groupFile;
1873         m->openOutputFile(groupFileName, groupFile);
1874         
1875         for(int i=0;i<numSeqs;i++){
1876             if (m->control_pressed) { break; }
1877             groupFile << seqNameVector[i] << '\t' << fileGroup << endl;
1878         }
1879         groupFile.close();
1880         outputNames.push_back(groupFileName); outputTypes["group"].push_back(groupFileName);
1881         
1882     }
1883     catch(exception& e) {
1884         m->errorOut(e, "ShhherCommand", "writeGroups");
1885         exit(1);        
1886     }           
1887 }
1888
1889 /**************************************************************************************************/
1890
1891 void ShhherCommand::writeClusters(vector<int> otuCounts){
1892     try {
1893         string thisOutputDir = outputDir;
1894         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1895         map<string, string> variables; 
1896                 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
1897         string otuCountsFileName = getOutputFileName("counts",variables);
1898         ofstream otuCountsFile;
1899         m->openOutputFile(otuCountsFileName, otuCountsFile);
1900         
1901         string bases = flowOrder;
1902         
1903         for(int i=0;i<numOTUs;i++){
1904             
1905             if (m->control_pressed) {
1906                 break;
1907             }
1908             //output the translated version of the centroid sequence for the otu
1909             if(otuCounts[i] > 0){
1910                 int index = centroids[i];
1911                 
1912                 otuCountsFile << "ideal\t";
1913                 for(int j=8;j<numFlowCells;j++){
1914                     char base = bases[j % bases.length()];
1915                     for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
1916                         otuCountsFile << base;
1917                     }
1918                 }
1919                 otuCountsFile << endl;
1920                 
1921                 for(int j=0;j<nSeqsPerOTU[i];j++){
1922                     int sequence = aaI[i][j];
1923                     otuCountsFile << seqNameVector[sequence] << '\t';
1924                     
1925                     string newSeq = "";
1926                     
1927                     for(int k=0;k<lengths[sequence];k++){
1928                         char base = bases[k % bases.length()];
1929                         int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
1930                         
1931                         for(int s=0;s<freq;s++){
1932                             newSeq += base;
1933                             //otuCountsFile << base;
1934                         }
1935                     }
1936                     otuCountsFile << newSeq.substr(4) << endl;
1937                 }
1938                 otuCountsFile << endl;
1939             }
1940         }
1941         otuCountsFile.close();
1942         outputNames.push_back(otuCountsFileName); outputTypes["counts"].push_back(otuCountsFileName);
1943         
1944     }
1945     catch(exception& e) {
1946         m->errorOut(e, "ShhherCommand", "writeClusters");
1947         exit(1);        
1948     }           
1949 }
1950
1951 #else
1952 //**********************************************************************************************************************
1953
1954 int ShhherCommand::execute(){
1955         try {
1956                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
1957                 
1958                 getSingleLookUp();      if (m->control_pressed) { return 0; }
1959                 getJointLookUp();       if (m->control_pressed) { return 0; }
1960                 
1961         int numFiles = flowFileVector.size();
1962                 
1963         if (numFiles < processors) { processors = numFiles; }
1964         
1965 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1966         if (processors == 1) { driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName); }
1967         else { createProcesses(flowFileVector); } //each processor processes one file
1968 #else
1969         driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName);
1970 #endif
1971         
1972                 if(compositeFASTAFileName != ""){
1973                         outputNames.push_back(compositeFASTAFileName); outputTypes["fasta"].push_back(compositeFASTAFileName);
1974                         outputNames.push_back(compositeNamesFileName); outputTypes["name"].push_back(compositeNamesFileName);
1975                 }
1976
1977                 m->mothurOutEndLine();
1978                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
1979                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
1980                 m->mothurOutEndLine();
1981                 
1982                 return 0;
1983         }
1984         catch(exception& e) {
1985                 m->errorOut(e, "ShhherCommand", "execute");
1986                 exit(1);
1987         }
1988 }
1989 #endif
1990 //********************************************************************************************************************
1991 //sorts biggest to smallest
1992 inline bool compareFileSizes(string left, string right){
1993     
1994     FILE * pFile;
1995     long leftsize = 0;
1996     
1997     //get num bytes in file
1998     string filename = left;
1999     pFile = fopen (filename.c_str(),"rb");
2000     string error = "Error opening " + filename;
2001     if (pFile==NULL) perror (error.c_str());
2002     else{
2003         fseek (pFile, 0, SEEK_END);
2004         leftsize=ftell (pFile);
2005         fclose (pFile);
2006     }
2007     
2008     FILE * pFile2;
2009     long rightsize = 0;
2010     
2011     //get num bytes in file
2012     filename = right;
2013     pFile2 = fopen (filename.c_str(),"rb");
2014     error = "Error opening " + filename;
2015     if (pFile2==NULL) perror (error.c_str());
2016     else{
2017         fseek (pFile2, 0, SEEK_END);
2018         rightsize=ftell (pFile2);
2019         fclose (pFile2);
2020     }
2021     
2022     return (leftsize > rightsize);      
2023
2024 /**************************************************************************************************/
2025
2026 int ShhherCommand::createProcesses(vector<string> filenames){
2027     try {
2028         vector<int> processIDS;
2029                 int process = 1;
2030                 int num = 0;
2031                 
2032                 //sanity check
2033                 if (filenames.size() < processors) { processors = filenames.size(); }
2034         
2035         //sort file names by size to divide load better
2036         sort(filenames.begin(), filenames.end(), compareFileSizes);
2037         
2038         vector < vector <string> > dividedFiles; //dividedFiles[1] = vector of filenames for process 1...
2039         dividedFiles.resize(processors);
2040         
2041         //for each file, figure out which process will complete it
2042         //want to divide the load intelligently so the big files are spread between processes
2043         for (int i = 0; i < filenames.size(); i++) { 
2044             int processToAssign = (i+1) % processors; 
2045             if (processToAssign == 0) { processToAssign = processors; }
2046             
2047             dividedFiles[(processToAssign-1)].push_back(filenames[i]);
2048         }
2049         
2050         //now lets reverse the order of ever other process, so we balance big files running with little ones
2051         for (int i = 0; i < processors; i++) {
2052             int remainder = ((i+1) % processors);
2053             if (remainder) {  reverse(dividedFiles[i].begin(), dividedFiles[i].end());  }
2054         }
2055         
2056                         
2057                 //divide the groups between the processors
2058                 /*vector<linePair> lines;
2059         vector<int> numFilesToComplete;
2060                 int numFilesPerProcessor = filenames.size() / processors;
2061                 for (int i = 0; i < processors; i++) {
2062                         int startIndex =  i * numFilesPerProcessor;
2063                         int endIndex = (i+1) * numFilesPerProcessor;
2064                         if(i == (processors - 1)){      endIndex = filenames.size();    }
2065                         lines.push_back(linePair(startIndex, endIndex));
2066             numFilesToComplete.push_back((endIndex-startIndex));
2067                 }*/
2068                 
2069         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)          
2070                 
2071                 //loop through and create all the processes you want
2072                 while (process != processors) {
2073                         int pid = fork();
2074                         
2075                         if (pid > 0) {
2076                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
2077                                 process++;
2078                         }else if (pid == 0){
2079                                 num = driver(dividedFiles[process], compositeFASTAFileName + toString(getpid()) + ".temp", compositeNamesFileName  + toString(getpid()) + ".temp");
2080                 
2081                 //pass numSeqs to parent
2082                                 ofstream out;
2083                                 string tempFile = compositeFASTAFileName + toString(getpid()) + ".num.temp";
2084                                 m->openOutputFile(tempFile, out);
2085                                 out << num << endl;
2086                                 out.close();
2087                 
2088                                 exit(0);
2089                         }else { 
2090                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
2091                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
2092                                 exit(0);
2093                         }
2094                 }
2095                 
2096                 //do my part
2097                 driver(dividedFiles[0], compositeFASTAFileName, compositeNamesFileName);
2098                 
2099                 //force parent to wait until all the processes are done
2100                 for (int i=0;i<processIDS.size();i++) { 
2101                         int temp = processIDS[i];
2102                         wait(&temp);
2103                 }
2104         
2105         #else
2106         
2107         //////////////////////////////////////////////////////////////////////////////////////////////////////
2108         
2109         /////////////////////// NOT WORKING, ACCESS VIOLATION ON READ OF FLOWGRAMS IN THREAD /////////////////
2110         
2111         //////////////////////////////////////////////////////////////////////////////////////////////////////
2112                 //Windows version shared memory, so be careful when passing variables through the shhhFlowsData struct. 
2113                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
2114                 //////////////////////////////////////////////////////////////////////////////////////////////////////
2115                 /*
2116                 vector<shhhFlowsData*> pDataArray; 
2117                 DWORD   dwThreadIdArray[processors-1];
2118                 HANDLE  hThreadArray[processors-1]; 
2119                 
2120                 //Create processor worker threads.
2121                 for( int i=0; i<processors-1; i++ ){
2122                         // Allocate memory for thread data.
2123                         string extension = "";
2124                         if (i != 0) { extension = toString(i) + ".temp"; }
2125                         
2126             shhhFlowsData* tempFlow = new shhhFlowsData(filenames, (compositeFASTAFileName + extension), (compositeNamesFileName + extension), outputDir, flowOrder, jointLookUp, singleLookUp, m, lines[i].start, lines[i].end, cutoff, sigma, minDelta, maxIters, i);
2127                         pDataArray.push_back(tempFlow);
2128                         processIDS.push_back(i);
2129             
2130                         hThreadArray[i] = CreateThread(NULL, 0, ShhhFlowsThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);   
2131                 }
2132                 
2133                 //using the main process as a worker saves time and memory
2134                 //do my part
2135                 driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[processors-1].start, lines[processors-1].end);
2136                 
2137                 //Wait until all threads have terminated.
2138                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
2139                 
2140                 //Close all thread handles and free memory allocations.
2141                 for(int i=0; i < pDataArray.size(); i++){
2142                         for(int j=0; j < pDataArray[i]->outputNames.size(); j++){ outputNames.push_back(pDataArray[i]->outputNames[j]); }
2143                         CloseHandle(hThreadArray[i]);
2144                         delete pDataArray[i];
2145                 }
2146                 */
2147         #endif
2148         
2149         for (int i=0;i<processIDS.size();i++) { 
2150             ifstream in;
2151                         string tempFile =  compositeFASTAFileName + toString(processIDS[i]) + ".num.temp";
2152                         m->openInputFile(tempFile, in);
2153                         if (!in.eof()) { 
2154                 int tempNum = 0; 
2155                 in >> tempNum; 
2156                 if (tempNum != dividedFiles[i+1].size()) {
2157                     m->mothurOut("[ERROR]: main process expected " + toString(processIDS[i]) + " to complete " + toString(dividedFiles[i+1].size()) + " files, and it only reported completing " + toString(tempNum) + ". This will cause file mismatches.  The flow files may be too large to process with multiple processors. \n");
2158                 }
2159             }
2160                         in.close(); m->mothurRemove(tempFile);
2161             
2162             if (compositeFASTAFileName != "") {
2163                 m->appendFiles((compositeFASTAFileName + toString(processIDS[i]) + ".temp"), compositeFASTAFileName);
2164                 m->appendFiles((compositeNamesFileName + toString(processIDS[i]) + ".temp"), compositeNamesFileName);
2165                 m->mothurRemove((compositeFASTAFileName + toString(processIDS[i]) + ".temp"));
2166                 m->mothurRemove((compositeNamesFileName + toString(processIDS[i]) + ".temp"));
2167             }
2168         }
2169         
2170         return 0;
2171         
2172     }
2173         catch(exception& e) {
2174                 m->errorOut(e, "ShhherCommand", "createProcesses");
2175                 exit(1);
2176         }
2177 }
2178 /**************************************************************************************************/
2179
2180 vector<string> ShhherCommand::parseFlowFiles(string filename){
2181     try {
2182         vector<string> files;
2183         int count = 0;
2184         
2185         ifstream in;
2186         m->openInputFile(filename, in);
2187         
2188         int thisNumFLows = 0;
2189         in >> thisNumFLows; m->gobble(in);
2190         
2191         while (!in.eof()) {
2192             if (m->control_pressed) { break; }
2193             
2194             ofstream out;
2195             string outputFileName = filename + toString(count) + ".temp";
2196             m->openOutputFile(outputFileName, out);
2197             out << thisNumFLows << endl;
2198             files.push_back(outputFileName);
2199             
2200             int numLinesWrote = 0;
2201             for (int i = 0; i < largeSize; i++) {
2202                 if (in.eof()) { break; }
2203                 string line = m->getline(in); m->gobble(in);
2204                 out << line << endl;
2205                 numLinesWrote++;
2206             }
2207             out.close();
2208             
2209             if (numLinesWrote == 0) {  m->mothurRemove(outputFileName); files.pop_back();  }
2210             count++;
2211         }
2212         in.close();
2213         
2214         if (m->control_pressed) { for (int i = 0; i < files.size(); i++) { m->mothurRemove(files[i]); }  files.clear(); }
2215         
2216         m->mothurOut("\nDivided " + filename + " into " + toString(files.size()) + " files.\n\n"); 
2217         
2218         return files;
2219     }
2220         catch(exception& e) {
2221                 m->errorOut(e, "ShhherCommand", "parseFlowFiles");
2222                 exit(1);
2223         }
2224 }
2225 /**************************************************************************************************/
2226
2227 int ShhherCommand::driver(vector<string> filenames, string thisCompositeFASTAFileName, string thisCompositeNamesFileName){
2228     try {
2229         
2230         int numCompleted = 0;
2231         
2232         for(int i=0;i<filenames.size();i++){
2233                         
2234                         if (m->control_pressed) { break; }
2235                         
2236             vector<string> theseFlowFileNames; theseFlowFileNames.push_back(filenames[i]);
2237             if (large) {  theseFlowFileNames = parseFlowFiles(filenames[i]);  }
2238             
2239             if (m->control_pressed) { break; }
2240             
2241             double begClock = clock();
2242             unsigned long long begTime;
2243             
2244             string fileNameForOutput = filenames[i];
2245             
2246             for (int g = 0; g < theseFlowFileNames.size(); g++) {
2247                 
2248                 string flowFileName = theseFlowFileNames[g];
2249                 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(filenames.size()) + ")\t<<<<<\n");
2250                 m->mothurOut("Reading flowgrams...\n");
2251                 
2252                 vector<string> seqNameVector;
2253                 vector<int> lengths;
2254                 vector<short> flowDataIntI;
2255                 vector<double> flowDataPrI;
2256                 map<string, int> nameMap;
2257                 vector<short> uniqueFlowgrams;
2258                 vector<int> uniqueCount;
2259                 vector<int> mapSeqToUnique;
2260                 vector<int> mapUniqueToSeq;
2261                 vector<int> uniqueLengths;
2262                 int numFlowCells;
2263                 
2264                 if (m->debug) { m->mothurOut("[DEBUG]: About to read flowgrams.\n"); }
2265                 int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells);
2266                 
2267                 if (m->control_pressed) { break; }
2268                 
2269                 m->mothurOut("Identifying unique flowgrams...\n");
2270                 int numUniques = getUniques(numSeqs, numFlowCells, uniqueFlowgrams, uniqueCount, uniqueLengths, mapSeqToUnique, mapUniqueToSeq, lengths, flowDataPrI, flowDataIntI);
2271                 
2272                 if (m->control_pressed) { break; }
2273                 
2274                 m->mothurOut("Calculating distances between flowgrams...\n");
2275                 string distFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
2276                 begTime = time(NULL);
2277                
2278                 
2279                 flowDistParentFork(numFlowCells, distFileName, numUniques, mapUniqueToSeq, mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
2280                 
2281                 m->mothurOutEndLine();
2282                 m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
2283                 
2284                 
2285                 string namesFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
2286                 createNamesFile(numSeqs, numUniques, namesFileName, seqNameVector, mapSeqToUnique, mapUniqueToSeq);
2287                 
2288                 if (m->control_pressed) { break; }
2289                 
2290                 m->mothurOut("\nClustering flowgrams...\n");
2291                 string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
2292                 cluster(listFileName, distFileName, namesFileName);
2293                 
2294                 if (m->control_pressed) { break; }
2295                 
2296                 vector<int> otuData;
2297                 vector<int> cumNumSeqs;
2298                 vector<int> nSeqsPerOTU;
2299                 vector<vector<int> > aaP;       //tMaster->aanP:        each row is a different otu / each col contains the sequence indices
2300                 vector<vector<int> > aaI;       //tMaster->aanI:        that are in each otu - can't differentiate between aaP and aaI 
2301                 vector<int> seqNumber;          //tMaster->anP:         the sequence id number sorted by OTU
2302                 vector<int> seqIndex;           //tMaster->anI;         the index that corresponds to seqNumber
2303                 
2304                 
2305                 int numOTUs = getOTUData(numSeqs, listFileName, otuData, cumNumSeqs, nSeqsPerOTU, aaP, aaI, seqNumber, seqIndex, nameMap);
2306                 
2307                 if (m->control_pressed) { break; }
2308                 
2309                 m->mothurRemove(distFileName);
2310                 m->mothurRemove(namesFileName);
2311                 m->mothurRemove(listFileName);
2312                 
2313                 vector<double> dist;            //adDist - distance of sequences to centroids
2314                 vector<short> change;           //did the centroid sequence change? 0 = no; 1 = yes
2315                 vector<int> centroids;          //the representative flowgram for each cluster m
2316                 vector<double> weight;
2317                 vector<double> singleTau;       //tMaster->adTau:       1-D Tau vector (1xnumSeqs)
2318                 vector<int> nSeqsBreaks;
2319                 vector<int> nOTUsBreaks;
2320                 
2321                 if (m->debug) { m->mothurOut("[DEBUG]: numSeqs = " + toString(numSeqs) + " numOTUS = " + toString(numOTUs) + " about to alloc a dist vector with size = " + toString((numSeqs * numOTUs)) + ".\n"); }
2322                 
2323                 dist.assign(numSeqs * numOTUs, 0);
2324                 change.assign(numOTUs, 1);
2325                 centroids.assign(numOTUs, -1);
2326                 weight.assign(numOTUs, 0);
2327                 singleTau.assign(numSeqs, 1.0);
2328                 
2329                 nSeqsBreaks.assign(2, 0);
2330                 nOTUsBreaks.assign(2, 0);
2331                 
2332                 nSeqsBreaks[0] = 0;
2333                 nSeqsBreaks[1] = numSeqs;
2334                 nOTUsBreaks[1] = numOTUs;
2335                 
2336                 if (m->debug) { m->mothurOut("[DEBUG]: done allocating memory, about to denoise.\n"); }
2337                 
2338                 if (m->control_pressed) { break; }
2339                 
2340                 double maxDelta = 0;
2341                 int iter = 0;
2342                 
2343                 begClock = clock();
2344                 begTime = time(NULL);
2345                 
2346                 m->mothurOut("\nDenoising flowgrams...\n");
2347                 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
2348                 
2349                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
2350                     
2351                     if (m->control_pressed) { break; }
2352                     
2353                     double cycClock = clock();
2354                     unsigned long long cycTime = time(NULL);
2355                     fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2356                     
2357                     if (m->control_pressed) { break; }
2358                     
2359                     calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
2360                     
2361                     if (m->control_pressed) { break; }
2362                     
2363                     maxDelta = getNewWeights(numOTUs, cumNumSeqs, nSeqsPerOTU, singleTau, seqNumber, weight);  
2364                     
2365                     if (m->control_pressed) { break; }
2366                     
2367                     double nLL = getLikelihood(numSeqs, numOTUs, nSeqsPerOTU, seqNumber, cumNumSeqs, seqIndex, dist, weight); 
2368                     
2369                     if (m->control_pressed) { break; }
2370                     
2371                     checkCentroids(numOTUs, centroids, weight);
2372                     
2373                     if (m->control_pressed) { break; }
2374                     
2375                     calcNewDistances(numSeqs, numOTUs, nSeqsPerOTU,  dist, weight, change, centroids, aaP, singleTau, aaI, seqNumber, seqIndex, uniqueFlowgrams, flowDataIntI, numFlowCells, lengths);
2376                     
2377                     if (m->control_pressed) { break; }
2378                     
2379                     iter++;
2380                     
2381                     m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
2382                     
2383                 }       
2384                 
2385                 if (m->control_pressed) { break; }
2386                 
2387                 m->mothurOut("\nFinalizing...\n");
2388                 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2389                 
2390                 if (m->debug) { m->mothurOut("[DEBUG]: done fill().\n"); }
2391
2392                 if (m->control_pressed) { break; }
2393                 
2394                 setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI);
2395                 
2396                 if (m->debug) { m->mothurOut("[DEBUG]: done setOTUs().\n"); }
2397                 
2398                 if (m->control_pressed) { break; }
2399                 
2400                 vector<int> otuCounts(numOTUs, 0);
2401                 for(int j=0;j<numSeqs;j++)      {       otuCounts[otuData[j]]++;        }
2402                 
2403                 calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
2404                 
2405                 if (m->debug) { m->mothurOut("[DEBUG]: done calcCentroidsDriver().\n"); }
2406                 
2407                 if (m->control_pressed) { break; }
2408                 
2409                 if ((large) && (g == 0)) {  flowFileName = filenames[i]; theseFlowFileNames[0] = filenames[i]; }
2410                 string thisOutputDir = outputDir;
2411                 if (outputDir == "") {  thisOutputDir = m->hasPath(flowFileName);  }
2412                 map<string, string> variables; 
2413                 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
2414                 string qualityFileName = getOutputFileName("qfile",variables);
2415                 string fastaFileName = getOutputFileName("fasta",variables);
2416                 string nameFileName = getOutputFileName("name",variables);
2417                 string otuCountsFileName = getOutputFileName("counts",variables);
2418                 string fileRoot = m->getRootName(m->getSimpleName(flowFileName));
2419                 int pos = fileRoot.find_first_of('.');
2420                 string fileGroup = fileRoot;
2421                 if (pos != string::npos) {  fileGroup = fileRoot.substr(pos+1, (fileRoot.length()-1-(pos+1)));  }
2422                 string groupFileName = getOutputFileName("group",variables);
2423
2424                 
2425                 writeQualities(numOTUs, numFlowCells, qualityFileName, otuCounts, nSeqsPerOTU, seqNumber, singleTau, flowDataIntI, uniqueFlowgrams, cumNumSeqs, mapUniqueToSeq, seqNameVector, centroids, aaI); if (m->control_pressed) { break; }
2426                 writeSequences(thisCompositeFASTAFileName, numOTUs, numFlowCells, fastaFileName, otuCounts, uniqueFlowgrams, seqNameVector, aaI, centroids);if (m->control_pressed) { break; }
2427                 writeNames(thisCompositeNamesFileName, numOTUs, nameFileName, otuCounts, seqNameVector, aaI, nSeqsPerOTU);                              if (m->control_pressed) { break; }
2428                 writeClusters(otuCountsFileName, numOTUs, numFlowCells,otuCounts, centroids, uniqueFlowgrams, seqNameVector, aaI, nSeqsPerOTU, lengths, flowDataIntI);                  if (m->control_pressed) { break; }
2429                 writeGroups(groupFileName, fileGroup, numSeqs, seqNameVector);                                          if (m->control_pressed) { break; }
2430                 
2431                 if (large) {
2432                     if (g > 0) {
2433                         variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0]));
2434                         m->appendFiles(qualityFileName, getOutputFileName("qfile",variables));
2435                         m->mothurRemove(qualityFileName);
2436                         m->appendFiles(fastaFileName, getOutputFileName("fasta",variables));
2437                         m->mothurRemove(fastaFileName);
2438                         m->appendFiles(nameFileName, getOutputFileName("name",variables));
2439                         m->mothurRemove(nameFileName);
2440                         m->appendFiles(otuCountsFileName, getOutputFileName("counts",variables));
2441                         m->mothurRemove(otuCountsFileName);
2442                         m->appendFiles(groupFileName, getOutputFileName("group",variables));
2443                         m->mothurRemove(groupFileName);
2444                     }
2445                     m->mothurRemove(theseFlowFileNames[g]);
2446                 }
2447                         }
2448             
2449             numCompleted++;
2450                         m->mothurOut("Total time to process " + fileNameForOutput + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
2451                 }
2452                 
2453         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
2454         
2455         return numCompleted;
2456         
2457     }catch(exception& e) {
2458             m->errorOut(e, "ShhherCommand", "driver");
2459             exit(1);
2460     }
2461 }
2462
2463 /**************************************************************************************************/
2464 int ShhherCommand::getFlowData(string filename, vector<string>& thisSeqNameVector, vector<int>& thisLengths, vector<short>& thisFlowDataIntI, map<string, int>& thisNameMap, int& numFlowCells){
2465         try{
2466        
2467                 ifstream flowFile;
2468        
2469                 m->openInputFile(filename, flowFile);
2470                 
2471                 string seqName;
2472                 int currentNumFlowCells;
2473                 float intensity;
2474         thisSeqNameVector.clear();
2475                 thisLengths.clear();
2476                 thisFlowDataIntI.clear();
2477                 thisNameMap.clear();
2478                 
2479                 string numFlowTest;
2480         flowFile >> numFlowTest;
2481         
2482         if (!m->isContainingOnlyDigits(numFlowTest)) { m->mothurOut("[ERROR]: expected a number and got " + numFlowTest + ", quitting. Did you use the flow parameter instead of the file parameter?"); m->mothurOutEndLine(); exit(1); }
2483         else { convert(numFlowTest, numFlowCells); }
2484         
2485         if (m->debug) { m->mothurOut("[DEBUG]: numFlowCells = " + toString(numFlowCells) + ".\n"); }
2486                 int index = 0;//pcluster
2487                 while(!flowFile.eof()){
2488                         
2489                         if (m->control_pressed) { break; }
2490                         
2491                         flowFile >> seqName >> currentNumFlowCells;
2492             
2493                         thisLengths.push_back(currentNumFlowCells);
2494            
2495                         thisSeqNameVector.push_back(seqName);
2496                         thisNameMap[seqName] = index++;//pcluster
2497             
2498             if (m->debug) { m->mothurOut("[DEBUG]: seqName = " + seqName + " length = " + toString(currentNumFlowCells) + " index = " + toString(index) + "\n"); }
2499             
2500                         for(int i=0;i<numFlowCells;i++){
2501                                 flowFile >> intensity;
2502                                 if(intensity > 9.99)    {       intensity = 9.99;       }
2503                                 int intI = int(100 * intensity + 0.0001);
2504                                 thisFlowDataIntI.push_back(intI);
2505                         }
2506                         m->gobble(flowFile);
2507                 }
2508                 flowFile.close();
2509                 
2510                 int numSeqs = thisSeqNameVector.size();         
2511                 
2512                 for(int i=0;i<numSeqs;i++){
2513                         
2514                         if (m->control_pressed) { break; }
2515                         
2516                         int iNumFlowCells = i * numFlowCells;
2517                         for(int j=thisLengths[i];j<numFlowCells;j++){
2518                                 thisFlowDataIntI[iNumFlowCells + j] = 0;
2519                         }
2520                 }
2521         
2522         return numSeqs;
2523                 
2524         }
2525         catch(exception& e) {
2526                 m->errorOut(e, "ShhherCommand", "getFlowData");
2527                 exit(1);
2528         }
2529 }
2530 /**************************************************************************************************/
2531
2532 int ShhherCommand::flowDistParentFork(int numFlowCells, string distFileName, int stopSeq, vector<int>& mapUniqueToSeq, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2533         try{            
2534         
2535                 ostringstream outStream;
2536                 outStream.setf(ios::fixed, ios::floatfield);
2537                 outStream.setf(ios::dec, ios::basefield);
2538                 outStream.setf(ios::showpoint);
2539                 outStream.precision(6);
2540                 
2541                 int begTime = time(NULL);
2542                 double begClock = clock();
2543         
2544                 for(int i=0;i<stopSeq;i++){
2545                         
2546                         if (m->control_pressed) { break; }
2547                         
2548                         for(int j=0;j<i;j++){
2549                                 float flowDistance = calcPairwiseDist(numFlowCells, mapUniqueToSeq[i], mapUniqueToSeq[j], mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
2550                 
2551                                 if(flowDistance < 1e-6){
2552                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
2553                                 }
2554                                 else if(flowDistance <= cutoff){
2555                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
2556                                 }
2557                         }
2558                         if(i % 100 == 0){
2559                                 m->mothurOutJustToScreen(toString(i) + "\t" + toString(time(NULL) - begTime));
2560                                 m->mothurOutJustToScreen("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC)+"\n");
2561                         }
2562                 }
2563                 
2564                 ofstream distFile(distFileName.c_str());
2565                 distFile << outStream.str();            
2566                 distFile.close();
2567                 
2568                 if (m->control_pressed) {}
2569                 else {
2570                         m->mothurOutJustToScreen(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
2571                         m->mothurOutJustToScreen("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC)+"\n");
2572                 }
2573         
2574         return 0;
2575         }
2576         catch(exception& e) {
2577                 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
2578                 exit(1);
2579         }
2580 }
2581 /**************************************************************************************************/
2582
2583 float ShhherCommand::calcPairwiseDist(int numFlowCells, int seqA, int seqB, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2584         try{
2585                 int minLength = lengths[mapSeqToUnique[seqA]];
2586                 if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
2587                 
2588                 int ANumFlowCells = seqA * numFlowCells;
2589                 int BNumFlowCells = seqB * numFlowCells;
2590                 
2591                 float dist = 0;
2592                 
2593                 for(int i=0;i<minLength;i++){
2594                         
2595                         if (m->control_pressed) { break; }
2596                         
2597                         int flowAIntI = flowDataIntI[ANumFlowCells + i];
2598                         float flowAPrI = flowDataPrI[ANumFlowCells + i];
2599                         
2600                         int flowBIntI = flowDataIntI[BNumFlowCells + i];
2601                         float flowBPrI = flowDataPrI[BNumFlowCells + i];
2602                         dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
2603                 }
2604                 
2605                 dist /= (float) minLength;
2606                 return dist;
2607         }
2608         catch(exception& e) {
2609                 m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
2610                 exit(1);
2611         }
2612 }
2613
2614 /**************************************************************************************************/
2615
2616 int ShhherCommand::getUniques(int numSeqs, int numFlowCells, vector<short>& uniqueFlowgrams, vector<int>& uniqueCount, vector<int>& uniqueLengths, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2617         try{
2618                 int numUniques = 0;
2619                 uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
2620                 uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
2621                 uniqueLengths.assign(numSeqs, 0);
2622                 mapSeqToUnique.assign(numSeqs, -1);
2623                 mapUniqueToSeq.assign(numSeqs, -1);
2624                 
2625                 vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
2626                 
2627                 for(int i=0;i<numSeqs;i++){
2628                         
2629                         if (m->control_pressed) { break; }
2630                         
2631                         int index = 0;
2632                         
2633                         vector<short> current(numFlowCells);
2634                         for(int j=0;j<numFlowCells;j++){
2635                                 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
2636                         }
2637             
2638                         for(int j=0;j<numUniques;j++){
2639                                 int offset = j * numFlowCells;
2640                                 bool toEnd = 1;
2641                                 
2642                                 int shorterLength;
2643                                 if(lengths[i] < uniqueLengths[j])       {       shorterLength = lengths[i];                     }
2644                                 else                                                            {       shorterLength = uniqueLengths[j];       }
2645                 
2646                                 for(int k=0;k<shorterLength;k++){
2647                                         if(current[k] != uniqueFlowgrams[offset + k]){
2648                                                 toEnd = 0;
2649                                                 break;
2650                                         }
2651                                 }
2652                                 
2653                                 if(toEnd){
2654                                         mapSeqToUnique[i] = j;
2655                                         uniqueCount[j]++;
2656                                         index = j;
2657                                         if(lengths[i] > uniqueLengths[j])       {       uniqueLengths[j] = lengths[i];  }
2658                                         break;
2659                                 }
2660                                 index++;
2661                         }
2662                         
2663                         if(index == numUniques){
2664                                 uniqueLengths[numUniques] = lengths[i];
2665                                 uniqueCount[numUniques] = 1;
2666                                 mapSeqToUnique[i] = numUniques;//anMap
2667                                 mapUniqueToSeq[numUniques] = i;//anF
2668                                 
2669                                 for(int k=0;k<numFlowCells;k++){
2670                                         uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
2671                                         uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
2672                                 }
2673                                 
2674                                 numUniques++;
2675                         }
2676                 }
2677                 uniqueFlowDataIntI.resize(numFlowCells * numUniques);
2678                 uniqueLengths.resize(numUniques);       
2679                 
2680                 flowDataPrI.resize(numSeqs * numFlowCells, 0);
2681                 for(int i=0;i<flowDataPrI.size();i++)   {       if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);          }
2682         
2683         return numUniques;
2684         }
2685         catch(exception& e) {
2686                 m->errorOut(e, "ShhherCommand", "getUniques");
2687                 exit(1);
2688         }
2689 }
2690 /**************************************************************************************************/
2691 int ShhherCommand::createNamesFile(int numSeqs, int numUniques, string filename, vector<string>& seqNameVector, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq){
2692         try{
2693                 
2694                 vector<string> duplicateNames(numUniques, "");
2695                 for(int i=0;i<numSeqs;i++){
2696                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
2697                 }
2698                 
2699                 ofstream nameFile;
2700                 m->openOutputFile(filename, nameFile);
2701                 
2702                 for(int i=0;i<numUniques;i++){
2703                         
2704                         if (m->control_pressed) { break; }
2705                         
2706             //                  nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2707                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2708                 }
2709                 
2710                 nameFile.close();
2711         
2712                 return 0;
2713         }
2714         catch(exception& e) {
2715                 m->errorOut(e, "ShhherCommand", "createNamesFile");
2716                 exit(1);
2717         }
2718 }
2719 //**********************************************************************************************************************
2720
2721 int ShhherCommand::cluster(string filename, string distFileName, string namesFileName){
2722         try {
2723                 
2724                 ReadMatrix* read = new ReadColumnMatrix(distFileName);  
2725                 read->setCutoff(cutoff);
2726                 
2727                 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
2728                 clusterNameMap->readMap();
2729                 read->read(clusterNameMap);
2730         
2731                 ListVector* list = read->getListVector();
2732                 SparseDistanceMatrix* matrix = read->getDMatrix();
2733                 
2734                 delete read; 
2735                 delete clusterNameMap; 
2736         
2737                 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
2738                 
2739         float adjust = -1.0;
2740                 Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest", adjust);
2741                 string tag = cluster->getTag();
2742                 
2743                 double clusterCutoff = cutoff;
2744                 while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
2745                         
2746                         if (m->control_pressed) { break; }
2747                         
2748                         cluster->update(clusterCutoff);
2749                 }
2750                 
2751                 list->setLabel(toString(cutoff));
2752                 
2753                 ofstream listFile;
2754                 m->openOutputFile(filename, listFile);
2755                 list->print(listFile);
2756                 listFile.close();
2757                 
2758                 delete matrix;  delete cluster; delete rabund; delete list;
2759         
2760                 return 0;
2761         }
2762         catch(exception& e) {
2763                 m->errorOut(e, "ShhherCommand", "cluster");
2764                 exit(1);        
2765         }               
2766 }
2767 /**************************************************************************************************/
2768
2769 int ShhherCommand::getOTUData(int numSeqs, string fileName,  vector<int>& otuData,
2770                                vector<int>& cumNumSeqs,
2771                                vector<int>& nSeqsPerOTU,
2772                                vector<vector<int> >& aaP,       //tMaster->aanP:        each row is a different otu / each col contains the sequence indices
2773                                vector<vector<int> >& aaI,       //tMaster->aanI:        that are in each otu - can't differentiate between aaP and aaI 
2774                                vector<int>& seqNumber,          //tMaster->anP:         the sequence id number sorted by OTU
2775                                vector<int>& seqIndex,
2776                                map<string, int>& nameMap){
2777         try {
2778         
2779                 ifstream listFile;
2780                 m->openInputFile(fileName, listFile);
2781                 string label;
2782         int numOTUs;
2783                 
2784                 listFile >> label >> numOTUs;
2785         
2786         if (m->debug) { m->mothurOut("[DEBUG]: Getting OTU Data...\n"); }
2787         
2788                 otuData.assign(numSeqs, 0);
2789                 cumNumSeqs.assign(numOTUs, 0);
2790                 nSeqsPerOTU.assign(numOTUs, 0);
2791                 aaP.clear();aaP.resize(numOTUs);
2792                 
2793                 seqNumber.clear();
2794                 aaI.clear();
2795                 seqIndex.clear();
2796                 
2797                 string singleOTU = "";
2798                 
2799                 for(int i=0;i<numOTUs;i++){
2800                         
2801                         if (m->control_pressed) { break; }
2802             if (m->debug) { m->mothurOut("[DEBUG]: processing OTU " + toString(i) + ".\n"); }
2803             
2804                         listFile >> singleOTU;
2805                         
2806                         istringstream otuString(singleOTU);
2807             
2808                         while(otuString){
2809                                 
2810                                 string seqName = "";
2811                                 
2812                                 for(int j=0;j<singleOTU.length();j++){
2813                                         char letter = otuString.get();
2814                                         
2815                                         if(letter != ','){
2816                                                 seqName += letter;
2817                                         }
2818                                         else{
2819                                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
2820                                                 int index = nmIt->second;
2821                                                 
2822                                                 nameMap.erase(nmIt);
2823                                                 
2824                                                 otuData[index] = i;
2825                                                 nSeqsPerOTU[i]++;
2826                                                 aaP[i].push_back(index);
2827                                                 seqName = "";
2828                                         }
2829                                 }
2830                                 
2831                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
2832                 
2833                                 int index = nmIt->second;
2834                                 nameMap.erase(nmIt);
2835                 
2836                                 otuData[index] = i;
2837                                 nSeqsPerOTU[i]++;
2838                                 aaP[i].push_back(index);        
2839                                 
2840                                 otuString.get();
2841                         }
2842                         
2843                         sort(aaP[i].begin(), aaP[i].end());
2844                         for(int j=0;j<nSeqsPerOTU[i];j++){
2845                                 seqNumber.push_back(aaP[i][j]);
2846                         }
2847                         for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
2848                                 aaP[i].push_back(0);
2849                         }
2850                         
2851                         
2852                 }
2853                 
2854                 for(int i=1;i<numOTUs;i++){
2855                         cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
2856                 }
2857                 aaI = aaP;
2858                 seqIndex = seqNumber;
2859                 
2860                 listFile.close();       
2861         
2862         return numOTUs;
2863                 
2864         }
2865         catch(exception& e) {
2866                 m->errorOut(e, "ShhherCommand", "getOTUData");
2867                 exit(1);        
2868         }               
2869 }
2870 /**************************************************************************************************/
2871
2872 int ShhherCommand::calcCentroidsDriver(int numOTUs, 
2873                                           vector<int>& cumNumSeqs,
2874                                           vector<int>& nSeqsPerOTU,
2875                                           vector<int>& seqIndex,
2876                                           vector<short>& change,                //did the centroid sequence change? 0 = no; 1 = yes
2877                                           vector<int>& centroids,               //the representative flowgram for each cluster m
2878                                           vector<double>& singleTau,    //tMaster->adTau:       1-D Tau vector (1xnumSeqs)
2879                                           vector<int>& mapSeqToUnique,
2880                                           vector<short>& uniqueFlowgrams,
2881                                           vector<short>& flowDataIntI,
2882                                           vector<int>& lengths,
2883                                           int numFlowCells,
2884                                           vector<int>& seqNumber){                          
2885         
2886         //this function gets the most likely homopolymer length at a flow position for a group of sequences
2887         //within an otu
2888         
2889         try{
2890                 
2891                 for(int i=0;i<numOTUs;i++){
2892                         
2893                         if (m->control_pressed) { break; }
2894                         
2895                         double count = 0;
2896                         int position = 0;
2897                         int minFlowGram = 100000000;
2898                         double minFlowValue = 1e8;
2899                         change[i] = 0; //FALSE
2900                         
2901                         for(int j=0;j<nSeqsPerOTU[i];j++){
2902                                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
2903                         }
2904             
2905                         if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
2906                                 vector<double> adF(nSeqsPerOTU[i]);
2907                                 vector<int> anL(nSeqsPerOTU[i]);
2908                                 
2909                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2910                                         int index = cumNumSeqs[i] + j;
2911                                         int nI = seqIndex[index];
2912                                         int nIU = mapSeqToUnique[nI];
2913                                         
2914                                         int k;
2915                                         for(k=0;k<position;k++){
2916                                                 if(nIU == anL[k]){
2917                                                         break;
2918                                                 }
2919                                         }
2920                                         if(k == position){
2921                                                 anL[position] = nIU;
2922                                                 adF[position] = 0.0000;
2923                                                 position++;
2924                                         }                                               
2925                                 }
2926                                 
2927                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2928                                         int index = cumNumSeqs[i] + j;
2929                                         int nI = seqIndex[index];
2930                                         
2931                                         double tauValue = singleTau[seqNumber[index]];
2932                                         
2933                                         for(int k=0;k<position;k++){
2934                                                 double dist = getDistToCentroid(anL[k], nI, lengths[nI], uniqueFlowgrams, flowDataIntI, numFlowCells);
2935                                                 adF[k] += dist * tauValue;
2936                                         }
2937                                 }
2938                                 
2939                                 for(int j=0;j<position;j++){
2940                                         if(adF[j] < minFlowValue){
2941                                                 minFlowGram = j;
2942                                                 minFlowValue = adF[j];
2943                                         }
2944                                 }
2945                                 
2946                                 if(centroids[i] != anL[minFlowGram]){
2947                                         change[i] = 1;
2948                                         centroids[i] = anL[minFlowGram];
2949                                 }
2950                         }
2951                         else if(centroids[i] != -1){
2952                                 change[i] = 1;
2953                                 centroids[i] = -1;                      
2954                         }
2955                 }
2956         
2957         return 0;
2958         }
2959         catch(exception& e) {
2960                 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
2961                 exit(1);        
2962         }               
2963 }
2964 /**************************************************************************************************/
2965
2966 double ShhherCommand::getDistToCentroid(int cent, int flow, int length, vector<short>& uniqueFlowgrams,
2967                                         vector<short>& flowDataIntI, int numFlowCells){
2968         try{
2969                 
2970                 int flowAValue = cent * numFlowCells;
2971                 int flowBValue = flow * numFlowCells;
2972                 
2973                 double dist = 0;
2974         
2975                 for(int i=0;i<length;i++){
2976                         dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
2977                         flowAValue++;
2978                         flowBValue++;
2979                 }
2980                 
2981                 return dist / (double)length;
2982         }
2983         catch(exception& e) {
2984                 m->errorOut(e, "ShhherCommand", "getDistToCentroid");
2985                 exit(1);        
2986         }               
2987 }
2988 /**************************************************************************************************/
2989
2990 double ShhherCommand::getNewWeights(int numOTUs, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<double>& singleTau, vector<int>& seqNumber, vector<double>& weight){
2991         try{
2992                 
2993                 double maxChange = 0;
2994                 
2995                 for(int i=0;i<numOTUs;i++){
2996                         
2997                         if (m->control_pressed) { break; }
2998                         
2999                         double difference = weight[i];
3000                         weight[i] = 0;
3001                         
3002                         for(int j=0;j<nSeqsPerOTU[i];j++){
3003                                 int index = cumNumSeqs[i] + j;
3004                                 double tauValue = singleTau[seqNumber[index]];
3005                                 weight[i] += tauValue;
3006                         }
3007                         
3008                         difference = fabs(weight[i] - difference);
3009                         if(difference > maxChange){     maxChange = difference; }
3010                 }
3011                 return maxChange;
3012         }
3013         catch(exception& e) {
3014                 m->errorOut(e, "ShhherCommand", "getNewWeights");
3015                 exit(1);        
3016         }               
3017 }
3018
3019 /**************************************************************************************************/
3020
3021 double ShhherCommand::getLikelihood(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<int>& seqNumber, vector<int>& cumNumSeqs, vector<int>& seqIndex, vector<double>& dist, vector<double>& weight){
3022         
3023         try{
3024                 
3025                 vector<long double> P(numSeqs, 0);
3026                 int effNumOTUs = 0;
3027                 
3028                 for(int i=0;i<numOTUs;i++){
3029                         if(weight[i] > MIN_WEIGHT){
3030                                 effNumOTUs++;
3031                         }
3032                 }
3033                 
3034                 string hold;
3035                 for(int i=0;i<numOTUs;i++){
3036                         
3037                         if (m->control_pressed) { break; }
3038                         
3039                         for(int j=0;j<nSeqsPerOTU[i];j++){
3040                                 int index = cumNumSeqs[i] + j;
3041                                 int nI = seqIndex[index];
3042                                 double singleDist = dist[seqNumber[index]];
3043                                 
3044                                 P[nI] += weight[i] * exp(-singleDist * sigma);
3045                         }
3046                 }
3047                 double nLL = 0.00;
3048                 for(int i=0;i<numSeqs;i++){
3049                         if(P[i] == 0){  P[i] = DBL_EPSILON;     }
3050             
3051                         nLL += -log(P[i]);
3052                 }
3053                 
3054                 nLL = nLL -(double)numSeqs * log(sigma);
3055         
3056                 return nLL; 
3057         }
3058         catch(exception& e) {
3059                 m->errorOut(e, "ShhherCommand", "getNewWeights");
3060                 exit(1);        
3061         }               
3062 }
3063
3064 /**************************************************************************************************/
3065
3066 int ShhherCommand::checkCentroids(int numOTUs, vector<int>& centroids, vector<double>& weight){
3067         try{
3068                 vector<int> unique(numOTUs, 1);
3069                 
3070                 for(int i=0;i<numOTUs;i++){
3071                         if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
3072                                 unique[i] = -1;
3073                         }
3074                 }
3075                 
3076                 for(int i=0;i<numOTUs;i++){
3077                         
3078                         if (m->control_pressed) { break; }
3079                         
3080                         if(unique[i] == 1){
3081                                 for(int j=i+1;j<numOTUs;j++){
3082                                         if(unique[j] == 1){
3083                                                 
3084                                                 if(centroids[j] == centroids[i]){
3085                                                         unique[j] = 0;
3086                                                         centroids[j] = -1;
3087                                                         
3088                                                         weight[i] += weight[j];
3089                                                         weight[j] = 0.0;
3090                                                 }
3091                                         }
3092                                 }
3093                         }
3094                 }
3095         
3096         return 0;
3097         }
3098         catch(exception& e) {
3099                 m->errorOut(e, "ShhherCommand", "checkCentroids");
3100                 exit(1);        
3101         }               
3102 }
3103 /**************************************************************************************************/
3104
3105 void ShhherCommand::calcNewDistances(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<double>& dist, 
3106                                      vector<double>& weight, vector<short>& change, vector<int>& centroids,
3107                                      vector<vector<int> >& aaP, vector<double>& singleTau, vector<vector<int> >& aaI,   
3108                                      vector<int>& seqNumber, vector<int>& seqIndex,
3109                                      vector<short>& uniqueFlowgrams,
3110                                      vector<short>& flowDataIntI, int numFlowCells, vector<int>& lengths){
3111         
3112         try{
3113                 
3114                 int total = 0;
3115                 vector<double> newTau(numOTUs,0);
3116                 vector<double> norms(numSeqs, 0);
3117                 nSeqsPerOTU.assign(numOTUs, 0);
3118         
3119                 for(int i=0;i<numSeqs;i++){
3120                         
3121                         if (m->control_pressed) { break; }
3122                         
3123                         int indexOffset = i * numOTUs;
3124             
3125                         double offset = 1e8;
3126                         
3127                         for(int j=0;j<numOTUs;j++){
3128                 
3129                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
3130                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i], uniqueFlowgrams, flowDataIntI, numFlowCells);
3131                                 }
3132                 
3133                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
3134                                         offset = dist[indexOffset + j];
3135                                 }
3136                         }
3137             
3138                         for(int j=0;j<numOTUs;j++){
3139                                 if(weight[j] > MIN_WEIGHT){
3140                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
3141                                         norms[i] += newTau[j];
3142                                 }
3143                                 else{
3144                                         newTau[j] = 0.0;
3145                                 }
3146                         }
3147             
3148                         for(int j=0;j<numOTUs;j++){
3149                                 newTau[j] /= norms[i];
3150                         }
3151             
3152                         for(int j=0;j<numOTUs;j++){
3153                                 if(newTau[j] > MIN_TAU){
3154                                         
3155                                         int oldTotal = total;
3156                                         
3157                                         total++;
3158                                         
3159                                         singleTau.resize(total, 0);
3160                                         seqNumber.resize(total, 0);
3161                                         seqIndex.resize(total, 0);
3162                                         
3163                                         singleTau[oldTotal] = newTau[j];
3164                                         
3165                                         aaP[j][nSeqsPerOTU[j]] = oldTotal;
3166                                         aaI[j][nSeqsPerOTU[j]] = i;
3167                                         nSeqsPerOTU[j]++;
3168                                 }
3169                         }
3170             
3171                 }
3172         
3173         }
3174         catch(exception& e) {
3175                 m->errorOut(e, "ShhherCommand", "calcNewDistances");
3176                 exit(1);        
3177         }               
3178 }
3179 /**************************************************************************************************/
3180
3181 int ShhherCommand::fill(int numOTUs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
3182         try {
3183                 int index = 0;
3184                 for(int i=0;i<numOTUs;i++){
3185                         
3186                         if (m->control_pressed) { return 0; }
3187                         
3188                         cumNumSeqs[i] = index;
3189                         for(int j=0;j<nSeqsPerOTU[i];j++){
3190                                 seqNumber[index] = aaP[i][j];
3191                                 seqIndex[index] = aaI[i][j];
3192                                 
3193                                 index++;
3194                         }
3195                 }
3196         
3197         return 0; 
3198         }
3199         catch(exception& e) {
3200                 m->errorOut(e, "ShhherCommand", "fill");
3201                 exit(1);        
3202         }               
3203 }
3204 /**************************************************************************************************/
3205
3206 void ShhherCommand::setOTUs(int numOTUs, int numSeqs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU,
3207                             vector<int>& otuData, vector<double>& singleTau, vector<double>& dist, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
3208         
3209         try {
3210                 vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
3211                 
3212                 for(int i=0;i<numOTUs;i++){
3213                         
3214                         if (m->control_pressed) { break; }
3215                         
3216                         for(int j=0;j<nSeqsPerOTU[i];j++){
3217                                 int index = cumNumSeqs[i] + j;
3218                                 double tauValue = singleTau[seqNumber[index]];
3219                                 int sIndex = seqIndex[index];
3220                                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
3221                         }
3222                 }
3223                 
3224                 for(int i=0;i<numSeqs;i++){
3225                         double maxTau = -1.0000;
3226                         int maxOTU = -1;
3227                         for(int j=0;j<numOTUs;j++){
3228                                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
3229                                         maxTau = bigTauMatrix[i * numOTUs + j];
3230                                         maxOTU = j;
3231                                 }
3232                         }
3233                         
3234                         otuData[i] = maxOTU;
3235                 }
3236                 
3237                 nSeqsPerOTU.assign(numOTUs, 0);         
3238                 
3239                 for(int i=0;i<numSeqs;i++){
3240                         int index = otuData[i];
3241                         
3242                         singleTau[i] = 1.0000;
3243                         dist[i] = 0.0000;
3244                         
3245                         aaP[index][nSeqsPerOTU[index]] = i;
3246                         aaI[index][nSeqsPerOTU[index]] = i;
3247                         
3248                         nSeqsPerOTU[index]++;
3249                 }
3250         
3251                 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);  
3252         }
3253         catch(exception& e) {
3254                 m->errorOut(e, "ShhherCommand", "setOTUs");
3255                 exit(1);        
3256         }               
3257 }
3258 /**************************************************************************************************/
3259
3260 void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string qualityFileName, vector<int> otuCounts, vector<int>& nSeqsPerOTU, vector<int>& seqNumber,
3261                                    vector<double>& singleTau, vector<short>& flowDataIntI, vector<short>& uniqueFlowgrams, vector<int>& cumNumSeqs,
3262                                    vector<int>& mapUniqueToSeq, vector<string>& seqNameVector, vector<int>& centroids, vector<vector<int> >& aaI){
3263         
3264         try {
3265         
3266                 ofstream qualityFile;
3267                 m->openOutputFile(qualityFileName, qualityFile);
3268         
3269                 qualityFile.setf(ios::fixed, ios::floatfield);
3270                 qualityFile.setf(ios::showpoint);
3271                 qualityFile << setprecision(6);
3272                 
3273                 vector<vector<int> > qualities(numOTUs);
3274                 vector<double> pr(HOMOPS, 0);
3275                 
3276                 
3277                 for(int i=0;i<numOTUs;i++){
3278                         
3279                         if (m->control_pressed) { break; }
3280                         
3281                         int index = 0;
3282             
3283                         if(nSeqsPerOTU[i] > 0){
3284                                 
3285                                 while(index < numFlowCells){
3286                     
3287                                         double maxPrValue = 1e8;
3288                                         short maxPrIndex = -1;
3289                                         double count = 0.0000;
3290                                         
3291                                         pr.assign(HOMOPS, 0);
3292                                         
3293                                         for(int j=0;j<nSeqsPerOTU[i];j++){
3294                                                 int lIndex = cumNumSeqs[i] + j;
3295                                                 double tauValue = singleTau[seqNumber[lIndex]];
3296                                                 int sequenceIndex = aaI[i][j];
3297                                                 short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
3298                                                 
3299                                                 count += tauValue;
3300                                                 
3301                                                 for(int s=0;s<HOMOPS;s++){
3302                                                         pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
3303                                                 }
3304                                         }
3305                     
3306                                         maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
3307                                         maxPrValue = pr[maxPrIndex];
3308                                         
3309                                         if(count > MIN_COUNT){
3310                                                 double U = 0.0000;
3311                                                 double norm = 0.0000;
3312                                                 
3313                                                 for(int s=0;s<HOMOPS;s++){
3314                                                         norm += exp(-(pr[s] - maxPrValue));
3315                                                 }
3316                         
3317                                                 for(int s=1;s<=maxPrIndex;s++){
3318                                                         int value = 0;
3319                                                         double temp = 0.0000;
3320                                                         
3321                                                         U += exp(-(pr[s-1]-maxPrValue))/norm;
3322                                                         
3323                                                         if(U>0.00){
3324                                                                 temp = log10(U);
3325                                                         }
3326                                                         else{
3327                                                                 temp = -10.1;
3328                                                         }
3329                                                         temp = floor(-10 * temp);
3330                                                         value = (int)floor(temp);
3331                                                         if(value > 100){        value = 100;    }
3332                                                         
3333                             qualities[i].push_back((int)value);
3334                                                 }
3335                                         }//end if
3336                                         
3337                                         index++;
3338                     
3339                                 }//end while
3340                 
3341                         }//end if
3342                         
3343             
3344                         if(otuCounts[i] > 0){
3345                                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
3346                 //need to get past the first four bases
3347                 for (int j = 4; j < qualities[i].size(); j++) { qualityFile << qualities[i][j] << ' '; }
3348                                 qualityFile << endl;
3349                         }
3350                 }//end for
3351                 qualityFile.close();
3352                 outputNames.push_back(qualityFileName); outputTypes["qfile"].push_back(qualityFileName);
3353         
3354         }
3355         catch(exception& e) {
3356                 m->errorOut(e, "ShhherCommand", "writeQualities");
3357                 exit(1);        
3358         }               
3359 }
3360
3361 /**************************************************************************************************/
3362
3363 void ShhherCommand::writeSequences(string thisCompositeFASTAFileName, int numOTUs, int numFlowCells, string fastaFileName, vector<int> otuCounts, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& centroids){
3364         try {
3365                 
3366                 ofstream fastaFile;
3367                 m->openOutputFile(fastaFileName, fastaFile);
3368                 
3369                 vector<string> names(numOTUs, "");
3370                 
3371                 for(int i=0;i<numOTUs;i++){
3372                         
3373                         if (m->control_pressed) { break; }
3374                         
3375                         int index = centroids[i];
3376                         
3377                         if(otuCounts[i] > 0){
3378                                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
3379                                 
3380                                 string newSeq = "";
3381                                 
3382                                 for(int j=0;j<numFlowCells;j++){
3383                                         
3384                                         char base = flowOrder[j % flowOrder.length()];
3385                                         for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
3386                                                 newSeq += base;
3387                                         }
3388                                 }
3389                                 
3390                                 if (newSeq.length() >= 4) {  fastaFile << newSeq.substr(4) << endl;  }
3391                 else {  fastaFile << "NNNN" << endl;  }
3392                         }
3393                 }
3394                 fastaFile.close();
3395         
3396                 outputNames.push_back(fastaFileName); outputTypes["fasta"].push_back(fastaFileName);
3397         
3398                 if(thisCompositeFASTAFileName != ""){
3399                         m->appendFiles(fastaFileName, thisCompositeFASTAFileName);
3400                 }
3401         }
3402         catch(exception& e) {
3403                 m->errorOut(e, "ShhherCommand", "writeSequences");
3404                 exit(1);        
3405         }               
3406 }
3407
3408 /**************************************************************************************************/
3409
3410 void ShhherCommand::writeNames(string thisCompositeNamesFileName, int numOTUs, string nameFileName, vector<int> otuCounts, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU){
3411         try {
3412                 
3413                 ofstream nameFile;
3414                 m->openOutputFile(nameFileName, nameFile);
3415                 
3416                 for(int i=0;i<numOTUs;i++){
3417                         
3418                         if (m->control_pressed) { break; }
3419                         
3420                         if(otuCounts[i] > 0){
3421                                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
3422                                 
3423                                 for(int j=1;j<nSeqsPerOTU[i];j++){
3424                                         nameFile << ',' << seqNameVector[aaI[i][j]];
3425                                 }
3426                                 
3427                                 nameFile << endl;
3428                         }
3429                 }
3430                 nameFile.close();
3431                 outputNames.push_back(nameFileName); outputTypes["name"].push_back(nameFileName);
3432                 
3433                 
3434                 if(thisCompositeNamesFileName != ""){
3435                         m->appendFiles(nameFileName, thisCompositeNamesFileName);
3436                 }               
3437         }
3438         catch(exception& e) {
3439                 m->errorOut(e, "ShhherCommand", "writeNames");
3440                 exit(1);        
3441         }               
3442 }
3443
3444 /**************************************************************************************************/
3445
3446 void ShhherCommand::writeGroups(string groupFileName, string fileRoot, int numSeqs, vector<string>& seqNameVector){
3447         try {
3448         ofstream groupFile;
3449                 m->openOutputFile(groupFileName, groupFile);
3450                 
3451                 for(int i=0;i<numSeqs;i++){
3452                         if (m->control_pressed) { break; }
3453                         groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
3454                 }
3455                 groupFile.close();
3456                 outputNames.push_back(groupFileName); outputTypes["group"].push_back(groupFileName);
3457         
3458         }
3459         catch(exception& e) {
3460                 m->errorOut(e, "ShhherCommand", "writeGroups");
3461                 exit(1);        
3462         }               
3463 }
3464
3465 /**************************************************************************************************/
3466
3467 void ShhherCommand::writeClusters(string otuCountsFileName, int numOTUs, int numFlowCells, vector<int> otuCounts, vector<int>& centroids, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU, vector<int>& lengths, vector<short>& flowDataIntI){
3468         try {
3469                 ofstream otuCountsFile;
3470                 m->openOutputFile(otuCountsFileName, otuCountsFile);
3471                 
3472                 string bases = flowOrder;
3473                 
3474                 for(int i=0;i<numOTUs;i++){
3475                         
3476                         if (m->control_pressed) {
3477                                 break;
3478                         }
3479                         //output the translated version of the centroid sequence for the otu
3480                         if(otuCounts[i] > 0){
3481                                 int index = centroids[i];
3482                                 
3483                                 otuCountsFile << "ideal\t";
3484                                 for(int j=8;j<numFlowCells;j++){
3485                                         char base = bases[j % bases.length()];
3486                                         for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
3487                                                 otuCountsFile << base;
3488                                         }
3489                                 }
3490                                 otuCountsFile << endl;
3491                                 
3492                                 for(int j=0;j<nSeqsPerOTU[i];j++){
3493                                         int sequence = aaI[i][j];
3494                                         otuCountsFile << seqNameVector[sequence] << '\t';
3495                                         
3496                                         string newSeq = "";
3497                                         
3498                                         for(int k=0;k<lengths[sequence];k++){
3499                                                 char base = bases[k % bases.length()];
3500                                                 int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
3501                         
3502                                                 for(int s=0;s<freq;s++){
3503                                                         newSeq += base;
3504                                                         //otuCountsFile << base;
3505                                                 }
3506                                         }
3507                                         
3508                     if (newSeq.length() >= 4) {  otuCountsFile << newSeq.substr(4) << endl;  }
3509                     else {  otuCountsFile << "NNNN" << endl;  }
3510                                 }
3511                                 otuCountsFile << endl;
3512                         }
3513                 }
3514                 otuCountsFile.close();
3515                 outputNames.push_back(otuCountsFileName); outputTypes["counts"].push_back(otuCountsFileName);
3516         
3517         }
3518         catch(exception& e) {
3519                 m->errorOut(e, "ShhherCommand", "writeClusters");
3520                 exit(1);        
3521         }               
3522 }
3523
3524 /**************************************************************************************************/
3525
3526 void ShhherCommand::getSingleLookUp(){
3527         try{
3528                 //      these are the -log probabilities that a signal corresponds to a particular homopolymer length
3529                 singleLookUp.assign(HOMOPS * NUMBINS, 0);
3530                 
3531                 int index = 0;
3532                 ifstream lookUpFile;
3533                 m->openInputFile(lookupFileName, lookUpFile);
3534                 
3535                 for(int i=0;i<HOMOPS;i++){
3536                         
3537                         if (m->control_pressed) { break; }
3538                         
3539                         float logFracFreq;
3540                         lookUpFile >> logFracFreq;
3541                         
3542                         for(int j=0;j<NUMBINS;j++)      {
3543                                 lookUpFile >> singleLookUp[index];
3544                                 index++;
3545                         }
3546                 }       
3547                 lookUpFile.close();
3548         }
3549         catch(exception& e) {
3550                 m->errorOut(e, "ShhherCommand", "getSingleLookUp");
3551                 exit(1);
3552         }
3553 }
3554
3555 /**************************************************************************************************/
3556
3557 void ShhherCommand::getJointLookUp(){
3558         try{
3559                 
3560                 //      the most likely joint probability (-log) that two intenities have the same polymer length
3561                 jointLookUp.resize(NUMBINS * NUMBINS, 0);
3562                 
3563                 for(int i=0;i<NUMBINS;i++){
3564                         
3565                         if (m->control_pressed) { break; }
3566                         
3567                         for(int j=0;j<NUMBINS;j++){             
3568                                 
3569                                 double minSum = 100000000;
3570                                 
3571                                 for(int k=0;k<HOMOPS;k++){
3572                                         double sum = singleLookUp[k * NUMBINS + i] + singleLookUp[k * NUMBINS + j];
3573                                         
3574                                         if(sum < minSum)        {       minSum = sum;           }
3575                                 }       
3576                                 jointLookUp[i * NUMBINS + j] = minSum;
3577                         }
3578                 }
3579         }
3580         catch(exception& e) {
3581                 m->errorOut(e, "ShhherCommand", "getJointLookUp");
3582                 exit(1);
3583         }
3584 }
3585
3586 /**************************************************************************************************/
3587
3588 double ShhherCommand::getProbIntensity(int intIntensity){                          
3589         try{
3590
3591                 double minNegLogProb = 100000000; 
3592
3593                 
3594                 for(int i=0;i<HOMOPS;i++){//loop signal strength
3595                         
3596                         if (m->control_pressed) { break; }
3597                         
3598                         float negLogProb = singleLookUp[i * NUMBINS + intIntensity];
3599                         if(negLogProb < minNegLogProb)  {       minNegLogProb = negLogProb; }
3600                 }
3601                 
3602                 return minNegLogProb;
3603         }
3604         catch(exception& e) {
3605                 m->errorOut(e, "ShhherCommand", "getProbIntensity");
3606                 exit(1);
3607         }
3608 }
3609
3610
3611
3612