]> git.donarmstrong.com Git - mothur.git/blob - shhhercommand.cpp
sffinfo bug with flow grams right index when clipQualRight=0
[mothur.git] / shhhercommand.cpp
1 /*
2  *  shhher.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 12/27/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "shhhercommand.h"
11
12 //**********************************************************************************************************************
13 vector<string> ShhherCommand::setParameters(){  
14         try {
15                 CommandParameter pflow("flow", "InputTypes", "", "", "none", "fileflow", "none","fasta-name-group-counts-qfile",false,false,true); parameters.push_back(pflow);
16                 CommandParameter pfile("file", "InputTypes", "", "", "none", "fileflow", "none","fasta-name-group-counts-qfile",false,false,true); parameters.push_back(pfile);
17                 CommandParameter plookup("lookup", "InputTypes", "", "", "none", "none", "none","",false,false,true); parameters.push_back(plookup);
18                 CommandParameter pcutoff("cutoff", "Number", "", "0.01", "", "", "","",false,false); parameters.push_back(pcutoff);
19                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
20                 CommandParameter pmaxiter("maxiter", "Number", "", "1000", "", "", "","",false,false); parameters.push_back(pmaxiter);
21         CommandParameter plarge("large", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(plarge);
22                 CommandParameter psigma("sigma", "Number", "", "60", "", "", "","",false,false); parameters.push_back(psigma);
23                 CommandParameter pmindelta("mindelta", "Number", "", "0.000001", "", "", "","",false,false); parameters.push_back(pmindelta);
24         CommandParameter porder("order", "Multiple", "A-B", "A", "", "", "","",false,false, true); parameters.push_back(porder);                CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
25                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
26                 
27                 vector<string> myArray;
28                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
29                 return myArray;
30         }
31         catch(exception& e) {
32                 m->errorOut(e, "ShhherCommand", "setParameters");
33                 exit(1);
34         }
35 }
36 //**********************************************************************************************************************
37 string ShhherCommand::getHelpString(){  
38         try {
39                 string helpString = "";
40                 helpString += "The shhh.flows command reads a file containing flowgrams and creates a file of corrected sequences.\n";
41         helpString += "The shhh.flows command parameters are flow, file, lookup, cutoff, processors, large, maxiter, sigma, mindelta and order.\n";
42         helpString += "The flow parameter is used to input your flow file.\n";
43         helpString += "The file parameter is used to input the *flow.files file created by trim.flows.\n";
44         helpString += "The lookup parameter is used specify the lookup file you would like to use. http://www.mothur.org/wiki/Lookup_files.\n";
45         helpString += "The order parameter options are A or B.  A = TACG and B = TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC.\n";
46                 return helpString;
47         }
48         catch(exception& e) {
49                 m->errorOut(e, "ShhherCommand", "getHelpString");
50                 exit(1);
51         }
52 }
53 //**********************************************************************************************************************
54 string ShhherCommand::getOutputPattern(string type) {
55     try {
56         string pattern = "";
57         
58         if (type == "fasta")            {   pattern = "[filename],shhh.fasta";   }
59         else if (type == "name")    {   pattern = "[filename],shhh.names";   }
60         else if (type == "group")        {   pattern = "[filename],shhh.groups";   }
61         else if (type == "counts")        {   pattern = "[filename],shhh.counts";   }
62         else if (type == "qfile")        {   pattern = "[filename],shhh.qual";   }
63         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
64         
65         return pattern;
66     }
67     catch(exception& e) {
68         m->errorOut(e, "ShhherCommand", "getOutputPattern");
69         exit(1);
70     }
71 }
72 //**********************************************************************************************************************
73
74 ShhherCommand::ShhherCommand(){ 
75         try {
76                 abort = true; calledHelp = true;
77                 setParameters();
78                 
79                 //initialize outputTypes
80                 vector<string> tempOutNames;
81                 outputTypes["fasta"] = tempOutNames;
82         outputTypes["name"] = tempOutNames;
83         outputTypes["group"] = tempOutNames;
84         outputTypes["counts"] = tempOutNames;
85         outputTypes["qfile"] = tempOutNames;
86
87         }
88         catch(exception& e) {
89                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
90                 exit(1);
91         }
92 }
93
94 //**********************************************************************************************************************
95
96 ShhherCommand::ShhherCommand(string option) {
97         try {
98         
99 #ifdef USE_MPI
100                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
101                 MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
102         
103                 if(pid == 0){
104 #endif
105                 abort = false; calledHelp = false;   
106                 
107                 //allow user to run help
108                 if(option == "help") { help(); abort = true; calledHelp = true; }
109                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
110                 
111                 else {
112                         vector<string> myArray = setParameters();
113                         
114                         OptionParser parser(option);
115                         map<string,string> parameters = parser.getParameters();
116                         
117                         ValidParameters validParameter;
118                         map<string,string>::iterator it;
119                         
120                         //check to make sure all parameters are valid for command
121                         for (it = parameters.begin(); it != parameters.end(); it++) { 
122                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
123                         }
124                         
125                         //initialize outputTypes
126             vector<string> tempOutNames;
127             outputTypes["fasta"] = tempOutNames;
128             outputTypes["name"] = tempOutNames;
129             outputTypes["group"] = tempOutNames;
130             outputTypes["counts"] = tempOutNames;
131             outputTypes["qfile"] = tempOutNames;
132
133                         
134                         //if the user changes the input directory command factory will send this info to us in the output parameter 
135                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
136                         if (inputDir == "not found"){   inputDir = "";          }
137                         else {
138                                 string path;
139                                 it = parameters.find("flow");
140                                 //user has given a template file
141                                 if(it != parameters.end()){ 
142                                         path = m->hasPath(it->second);
143                                         //if the user has not given a path then, add inputdir. else leave path alone.
144                                         if (path == "") {       parameters["flow"] = inputDir + it->second;             }
145                                 }
146                                 
147                                 it = parameters.find("lookup");
148                                 //user has given a template file
149                                 if(it != parameters.end()){ 
150                                         path = m->hasPath(it->second);
151                                         //if the user has not given a path then, add inputdir. else leave path alone.
152                                         if (path == "") {       parameters["lookup"] = inputDir + it->second;           }
153                                 }
154
155                                 it = parameters.find("file");
156                                 //user has given a template file
157                                 if(it != parameters.end()){ 
158                                         path = m->hasPath(it->second);
159                                         //if the user has not given a path then, add inputdir. else leave path alone.
160                                         if (path == "") {       parameters["file"] = inputDir + it->second;             }
161                                 }
162                         }
163             
164             //if the user changes the output directory command factory will send this info to us in the output parameter 
165                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = ""; }
166             
167                         //check for required parameters
168                         flowFileName = validParameter.validFile(parameters, "flow", true);
169                         flowFilesFileName = validParameter.validFile(parameters, "file", true);
170                         if (flowFileName == "not found" && flowFilesFileName == "not found") {
171                                 m->mothurOut("values for either flow or file must be provided for the shhh.flows command.");
172                                 m->mothurOutEndLine();
173                                 abort = true; 
174                         }
175                         else if (flowFileName == "not open" || flowFilesFileName == "not open") { abort = true; }
176                         
177                         if(flowFileName != "not found"){
178                                 compositeFASTAFileName = "";    
179                                 compositeNamesFileName = "";    
180                         }
181                         else{
182                                 ofstream temp;
183                 
184                 string thisoutputDir = outputDir;
185                 if (outputDir == "") {  thisoutputDir =  m->hasPath(flowFilesFileName); } //if user entered a file with a path then preserve it
186                 
187                                 //we want to rip off .files, and also .flow if its there
188                 string fileroot = m->getRootName(m->getSimpleName(flowFilesFileName));
189                 if (fileroot[fileroot.length()-1] == '.') {  fileroot = fileroot.substr(0, fileroot.length()-1); } //rip off dot
190                 string extension = m->getExtension(fileroot);
191                 if (extension == ".flow") { fileroot = m->getRootName(fileroot);  }
192                 else { fileroot += "."; } //add back if needed
193                 
194                                 compositeFASTAFileName = thisoutputDir + fileroot + "shhh.fasta";
195                                 m->openOutputFile(compositeFASTAFileName, temp);
196                                 temp.close();
197                                 
198                                 compositeNamesFileName = thisoutputDir + fileroot + "shhh.names";
199                                 m->openOutputFile(compositeNamesFileName, temp);
200                                 temp.close();
201                         }
202             
203             if(flowFilesFileName != "not found"){
204                 string fName;
205                 
206                 ifstream flowFilesFile;
207                 m->openInputFile(flowFilesFileName, flowFilesFile);
208                 while(flowFilesFile){
209                     fName = m->getline(flowFilesFile);
210                     
211                     //test if file is valid
212                     ifstream in;
213                     int ableToOpen = m->openInputFile(fName, in, "noerror");
214                     in.close(); 
215                     if (ableToOpen == 1) {
216                         if (inputDir != "") { //default path is set
217                             string tryPath = inputDir + fName;
218                             m->mothurOut("Unable to open " + fName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
219                             ifstream in2;
220                             ableToOpen = m->openInputFile(tryPath, in2, "noerror");
221                             in2.close();
222                             fName = tryPath;
223                         }
224                     }
225                     
226                     if (ableToOpen == 1) {
227                         if (m->getDefaultPath() != "") { //default path is set
228                             string tryPath = m->getDefaultPath() + m->getSimpleName(fName);
229                             m->mothurOut("Unable to open " + fName + ". Trying default " + tryPath); m->mothurOutEndLine();
230                             ifstream in2;
231                             ableToOpen = m->openInputFile(tryPath, in2, "noerror");
232                             in2.close();
233                             fName = tryPath;
234                         }
235                     }
236                     
237                     //if you can't open it its not in current working directory or inputDir, try mothur excutable location
238                     if (ableToOpen == 1) {
239                         string exepath = m->argv;
240                         string tempPath = exepath;
241                         for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
242                         exepath = exepath.substr(0, (tempPath.find_last_of('m')));
243                         
244                         string tryPath = m->getFullPathName(exepath) + m->getSimpleName(fName);
245                         m->mothurOut("Unable to open " + fName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
246                         ifstream in2;
247                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
248                         in2.close();
249                         fName = tryPath;
250                     }
251                     
252                     if (ableToOpen == 1) {  m->mothurOut("Unable to open " + fName + ". Disregarding. "); m->mothurOutEndLine();  }
253                     else { flowFileVector.push_back(fName); }
254                     m->gobble(flowFilesFile);
255                 }
256                 flowFilesFile.close();
257                 if (flowFileVector.size() == 0) {  m->mothurOut("[ERROR]: no valid files."); m->mothurOutEndLine(); abort = true; }
258             }
259             else{
260                 if (outputDir == "") { outputDir = m->hasPath(flowFileName); }
261                 flowFileVector.push_back(flowFileName);
262             }
263                 
264                         //check for optional parameter and set defaults
265                         // ...at some point should added some additional type checking...
266                         string temp;
267                         temp = validParameter.validFile(parameters, "lookup", true);
268                         if (temp == "not found")        {       
269                                 lookupFileName = "LookUp_Titanium.pat"; 
270                                 
271                                 int ableToOpen;
272                                 ifstream in;
273                                 ableToOpen = m->openInputFile(lookupFileName, in, "noerror");
274                                 in.close();     
275                                 
276                                 //if you can't open it, try input location
277                                 if (ableToOpen == 1) {
278                                         if (inputDir != "") { //default path is set
279                                                 string tryPath = inputDir + lookupFileName;
280                                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
281                                                 ifstream in2;
282                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
283                                                 in2.close();
284                                                 lookupFileName = tryPath;
285                                         }
286                                 }
287                                 
288                                 //if you can't open it, try default location
289                                 if (ableToOpen == 1) {
290                                         if (m->getDefaultPath() != "") { //default path is set
291                                                 string tryPath = m->getDefaultPath() + m->getSimpleName(lookupFileName);
292                                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying default " + tryPath); m->mothurOutEndLine();
293                                                 ifstream in2;
294                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
295                                                 in2.close();
296                                                 lookupFileName = tryPath;
297                                         }
298                                 }
299                                 
300                                 //if you can't open it its not in current working directory or inputDir, try mothur excutable location
301                                 if (ableToOpen == 1) {
302                                         string exepath = m->argv;
303                                         string tempPath = exepath;
304                                         for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
305                                         exepath = exepath.substr(0, (tempPath.find_last_of('m')));
306                                         
307                                         string tryPath = m->getFullPathName(exepath) + m->getSimpleName(lookupFileName);
308                                         m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
309                                         ifstream in2;
310                                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
311                                         in2.close();
312                                         lookupFileName = tryPath;
313                                 }
314                                 
315                                 if (ableToOpen == 1) {  m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true;  }
316                         }
317                         else if(temp == "not open")     {       
318                                 
319                                 lookupFileName = validParameter.validFile(parameters, "lookup", false);
320                                 
321                                 //if you can't open it its not inputDir, try mothur excutable location
322                                 string exepath = m->argv;
323                                 string tempPath = exepath;
324                                 for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
325                                 exepath = exepath.substr(0, (tempPath.find_last_of('m')));
326                                         
327                                 string tryPath = m->getFullPathName(exepath) + lookupFileName;
328                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
329                                 ifstream in2;
330                                 int ableToOpen = m->openInputFile(tryPath, in2, "noerror");
331                                 in2.close();
332                                 lookupFileName = tryPath;
333                                 
334                                 if (ableToOpen == 1) {  m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true;  }
335                         }else                                           {       lookupFileName = temp;  }
336                         
337                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
338                         m->setProcessors(temp);
339                         m->mothurConvert(temp, processors);
340
341                         temp = validParameter.validFile(parameters, "cutoff", false);   if (temp == "not found"){       temp = "0.01";          }
342                         m->mothurConvert(temp, cutoff); 
343                         
344                         temp = validParameter.validFile(parameters, "mindelta", false); if (temp == "not found"){       temp = "0.000001";      }
345                         m->mothurConvert(temp, minDelta); 
346
347                         temp = validParameter.validFile(parameters, "maxiter", false);  if (temp == "not found"){       temp = "1000";          }
348                         m->mothurConvert(temp, maxIters); 
349             
350             temp = validParameter.validFile(parameters, "large", false);        if (temp == "not found"){       temp = "0";             }
351                         m->mothurConvert(temp, largeSize); 
352             if (largeSize != 0) { large = true; }
353             else { large = false;  }
354             if (largeSize < 0) {  m->mothurOut("The value of the large cannot be negative.\n"); }
355             
356 #ifdef USE_MPI
357             if (large) { m->mothurOut("The large parameter is not available with the MPI-Enabled version.\n"); large=false; }                           
358 #endif
359             
360
361                         temp = validParameter.validFile(parameters, "sigma", false);if (temp == "not found")    {       temp = "60";            }
362                         m->mothurConvert(temp, sigma); 
363                         
364                         temp = validParameter.validFile(parameters, "order", false);  if (temp == "not found"){         temp = "A";     }
365             if (temp.length() > 1) {  m->mothurOut("[ERROR]: " + temp + " is not a valid option for order. order options are A or B. A = TACG and B = TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC.\n");  abort=true;
366             }
367             else {
368                 if (toupper(temp[0]) == 'A') {  flowOrder = "TACG";   }
369                 else if(toupper(temp[0]) == 'B'){ 
370                     flowOrder = "TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC";   }
371                 else {
372                     m->mothurOut("[ERROR]: " + temp + " is not a valid option for order. order options are A or B. A = TACG and B = TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC.\n");  abort=true;
373                 }
374             }
375
376                         
377                 }
378 #ifdef USE_MPI
379                 }                               
380 #endif
381         }
382         catch(exception& e) {
383                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
384                 exit(1);
385         }
386 }
387 //**********************************************************************************************************************
388 #ifdef USE_MPI
389 int ShhherCommand::execute(){
390         try {
391                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
392                 
393                 int tag = 1976;
394                 MPI_Status status; 
395                         
396                 if(pid == 0){
397
398                         for(int i=1;i<ncpus;i++){
399                                 MPI_Send(&abort, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
400                         }
401                         if(abort == 1){ return 0;       }
402
403                         processors = ncpus;
404                         
405                         m->mothurOut("\nGetting preliminary data...\n");
406                         getSingleLookUp();      if (m->control_pressed) { return 0; }
407                         getJointLookUp();       if (m->control_pressed) { return 0; }
408                         
409             vector<string> flowFileVector;
410                         if(flowFilesFileName != "not found"){
411                                 string fName;
412                 
413                                 ifstream flowFilesFile;
414                                 m->openInputFile(flowFilesFileName, flowFilesFile);
415                                 while(flowFilesFile){
416                                         fName = m->getline(flowFilesFile);
417                                         flowFileVector.push_back(fName);
418                                         m->gobble(flowFilesFile);
419                                 }
420                         }
421                         else{
422                                 flowFileVector.push_back(flowFileName);
423                         }
424             
425                         int numFiles = flowFileVector.size();
426
427                         for(int i=1;i<ncpus;i++){
428                                 MPI_Send(&numFiles, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
429                         }
430                         
431                         for(int i=0;i<numFiles;i++){
432                                 
433                                 if (m->control_pressed) { break; }
434                                 
435                                 double begClock = clock();
436                                 unsigned long long begTime = time(NULL);
437
438                                 flowFileName = flowFileVector[i];
439                                 
440                                 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
441                                 m->mothurOut("Reading flowgrams...\n");
442                                 getFlowData();
443                                 
444                                 if (m->control_pressed) { break; }
445
446                                 m->mothurOut("Identifying unique flowgrams...\n");
447                                 getUniques();
448                                 
449                                 if (m->control_pressed) { break; }
450
451                                 m->mothurOut("Calculating distances between flowgrams...\n");
452                                 char fileName[1024];
453                                 strcpy(fileName, flowFileName.c_str());
454
455                                 for(int i=1;i<ncpus;i++){
456                                         MPI_Send(&fileName[0], 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
457
458                                         MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
459                                         MPI_Send(&numUniques, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
460                                         MPI_Send(&numFlowCells, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
461                                         MPI_Send(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, i, tag, MPI_COMM_WORLD);
462                                         MPI_Send(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
463                                         MPI_Send(&mapUniqueToSeq[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
464                                         MPI_Send(&mapSeqToUnique[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
465                                         MPI_Send(&lengths[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
466                                         MPI_Send(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
467                                         MPI_Send(&cutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
468                                 }                               
469                                                         
470                                 string distFileName = flowDistMPI(0, int(sqrt(1.0/float(ncpus)) * numUniques));
471                                 
472                                 if (m->control_pressed) { break; }
473                                 
474                                 int done;
475                                 for(int i=1;i<ncpus;i++){
476                                         MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
477                                         
478                                         m->appendFiles((distFileName + ".temp." + toString(i)), distFileName);
479                                         m->mothurRemove((distFileName + ".temp." + toString(i)));
480                                 }
481
482                                 string namesFileName = createNamesFile();
483                                 
484                                 if (m->control_pressed) { break; }
485                                 
486                                 m->mothurOut("\nClustering flowgrams...\n");
487                                 string listFileName = cluster(distFileName, namesFileName);
488                                 
489                                 if (m->control_pressed) { break; }
490                                 
491                 
492                 
493                                 getOTUData(listFileName);
494
495                                 m->mothurRemove(distFileName);
496                                 m->mothurRemove(namesFileName);
497                                 m->mothurRemove(listFileName);
498                                 
499                                 if (m->control_pressed) { break; }
500                                 
501                                 initPyroCluster();
502
503                                 if (m->control_pressed) { break; }
504                                 
505             
506                                 for(int i=1;i<ncpus;i++){
507                                         MPI_Send(&numOTUs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
508                                         MPI_Send(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
509                                         MPI_Send(&uniqueFlowgrams[0], numFlowCells * numUniques, MPI_SHORT, i, tag, MPI_COMM_WORLD);
510                                         MPI_Send(&sigma, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
511                                 }
512                                 
513                                 if (m->control_pressed) { break; }
514                                 
515                                 double maxDelta = 0;
516                                 int iter = 0;
517                                 
518                                 int numOTUsOnCPU = numOTUs / ncpus;
519                                 int numSeqsOnCPU = numSeqs / ncpus;
520                                 m->mothurOut("\nDenoising flowgrams...\n");
521                                 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
522                                 
523                                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
524                                         
525                                         double cycClock = clock();
526                                         unsigned long long cycTime = time(NULL);
527                                         fill();
528                                         
529                                         if (m->control_pressed) { break; }
530
531                                         int total = singleTau.size();
532                                         for(int i=1;i<ncpus;i++){
533                                                 MPI_Send(&total, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
534                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
535                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
536                                                 
537                                                 MPI_Send(&singleTau[0], total, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
538                                                 MPI_Send(&seqNumber[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
539                                                 MPI_Send(&seqIndex[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
540                                                 MPI_Send(&nSeqsPerOTU[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
541                                                 MPI_Send(&cumNumSeqs[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
542                                         }
543                                         
544                                         calcCentroidsDriver(0, numOTUsOnCPU);
545                                         
546                                         for(int i=1;i<ncpus;i++){
547                                                 int otuStart = i * numOTUs / ncpus;
548                                                 int otuStop = (i + 1) * numOTUs / ncpus;
549                                                 
550                                                 vector<int> tempCentroids(numOTUs, 0);
551                                                 vector<short> tempChange(numOTUs, 0);
552                                                 
553                                                 MPI_Recv(&tempCentroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
554                                                 MPI_Recv(&tempChange[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD, &status);
555                                                 
556                                                 for(int j=otuStart;j<otuStop;j++){
557                                                         centroids[j] = tempCentroids[j];
558                                                         change[j] = tempChange[j];
559                                                 }
560                                         }
561                                                                         
562                                         maxDelta = getNewWeights(); if (m->control_pressed) { break; }
563                                         double nLL = getLikelihood(); if (m->control_pressed) { break; }
564                                         checkCentroids(); if (m->control_pressed) { break; }
565                                         
566                                         for(int i=1;i<ncpus;i++){
567                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
568                                                 MPI_Send(&weight[0], numOTUs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
569                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
570                                         }
571                                         
572                                         calcNewDistancesParent(0, numSeqsOnCPU);
573
574                                         total = singleTau.size();
575
576                                         for(int i=1;i<ncpus;i++){
577                                                 int childTotal;
578                                                 int seqStart = i * numSeqs / ncpus;
579                                                 int seqStop = (i + 1) * numSeqs / ncpus;
580                                                 
581                                                 MPI_Recv(&childTotal, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
582
583                                                 vector<int> childSeqIndex(childTotal, 0);
584                                                 vector<double> childSingleTau(childTotal, 0);
585                                                 vector<double> childDist(numSeqs * numOTUs, 0);
586                                                 vector<int> otuIndex(childTotal, 0);
587                                                 
588                                                 MPI_Recv(&childSeqIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
589                                                 MPI_Recv(&childSingleTau[0], childTotal, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
590                                                 MPI_Recv(&childDist[0], numOTUs * numSeqs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
591                                                 MPI_Recv(&otuIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
592
593                                                 int oldTotal = total;
594                                                 total += childTotal;
595                                                 singleTau.resize(total, 0);
596                                                 seqIndex.resize(total, 0);
597                                                 seqNumber.resize(total, 0);
598                                                 
599                                                 int childIndex = 0;
600                                                 
601                                                 for(int j=oldTotal;j<total;j++){
602                                                         int otuI = otuIndex[childIndex];
603                                                         int seqI = childSeqIndex[childIndex];
604                                                         
605                                                         singleTau[j] = childSingleTau[childIndex];
606                                                         
607                                                         aaP[otuI][nSeqsPerOTU[otuI]] = j;
608                                                         aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
609                                                         nSeqsPerOTU[otuI]++;
610                                                         childIndex++;
611                                                 }
612                                                 
613                                                 int index = seqStart * numOTUs;
614                                                 for(int j=seqStart;j<seqStop;j++){
615                                                         for(int k=0;k<numOTUs;k++){
616                                                                 dist[index] = childDist[index];
617                                                                 index++;
618                                                         }
619                                                 }                                       
620                                         }
621                                         
622                                         iter++;
623                                         
624                                         m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');                  
625
626                                         if((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
627                                                 int live = 1;
628                                                 for(int i=1;i<ncpus;i++){
629                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
630                                                 }
631                                         }
632                                         else{
633                                                 int live = 0;
634                                                 for(int i=1;i<ncpus;i++){
635                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD); //send kill command
636                                                 }
637                                         }
638                                         
639                                 }       
640                                 
641                                 if (m->control_pressed) { break; }
642                                 
643                                 m->mothurOut("\nFinalizing...\n");
644                                 fill();
645                                 
646                                 if (m->control_pressed) { break; }
647                                 
648                                 setOTUs();
649                                 
650                                 vector<int> otuCounts(numOTUs, 0);
651                                 for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
652                                 calcCentroidsDriver(0, numOTUs);
653                                 
654                                 if (m->control_pressed) { break; }
655                                 
656                                 writeQualities(otuCounts);      if (m->control_pressed) { break; }
657                                 writeSequences(otuCounts);      if (m->control_pressed) { break; }
658                                 writeNames(otuCounts);          if (m->control_pressed) { break; }
659                                 writeClusters(otuCounts);       if (m->control_pressed) { break; }
660                                 writeGroups();                          if (m->control_pressed) { break; }
661                                 
662                                                                  
663                                 m->mothurOut("Total time to process " + toString(flowFileName) + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');                 
664                         }
665                 }
666                 else{
667                         int abort = 1;
668
669                         MPI_Recv(&abort, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
670                         if(abort){      return 0;       }
671
672                         int numFiles;
673                         MPI_Recv(&numFiles, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
674
675                         for(int i=0;i<numFiles;i++){
676                                 
677                                 if (m->control_pressed) { break; }
678                                 
679                                 //Now into the pyrodist part
680                                 bool live = 1;
681
682                                 char fileName[1024];
683                                 MPI_Recv(&fileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
684                                 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
685                                 MPI_Recv(&numUniques, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
686                                 MPI_Recv(&numFlowCells, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
687                                 
688                                 flowDataIntI.resize(numSeqs * numFlowCells);
689                                 flowDataPrI.resize(numSeqs * numFlowCells);
690                                 mapUniqueToSeq.resize(numSeqs);
691                                 mapSeqToUnique.resize(numSeqs);
692                                 lengths.resize(numSeqs);
693                                 jointLookUp.resize(NUMBINS * NUMBINS);
694
695                                 MPI_Recv(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
696                                 MPI_Recv(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
697                                 MPI_Recv(&mapUniqueToSeq[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
698                                 MPI_Recv(&mapSeqToUnique[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
699                                 MPI_Recv(&lengths[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
700                                 MPI_Recv(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
701                                 MPI_Recv(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
702
703                                 flowFileName = string(fileName);
704                                 int flowDistStart = int(sqrt(float(pid)/float(ncpus)) * numUniques);
705                                 int flowDistEnd = int(sqrt(float(pid+1)/float(ncpus)) * numUniques);
706                                 
707                                 string distanceStringChild = flowDistMPI(flowDistStart, flowDistEnd);
708                                 
709                                 if (m->control_pressed) { break; }
710                                 
711                                 int done = 1;
712                                 MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
713
714                                 //Now into the pyronoise part
715                                 MPI_Recv(&numOTUs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
716                                 
717                                 singleLookUp.resize(HOMOPS * NUMBINS);
718                                 uniqueFlowgrams.resize(numUniques * numFlowCells);
719                                 weight.resize(numOTUs);
720                                 centroids.resize(numOTUs);
721                                 change.resize(numOTUs);
722                                 dist.assign(numOTUs * numSeqs, 0);
723                                 nSeqsPerOTU.resize(numOTUs);
724                                 cumNumSeqs.resize(numOTUs);
725
726                                 MPI_Recv(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
727                                 MPI_Recv(&uniqueFlowgrams[0], uniqueFlowgrams.size(), MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
728                                 MPI_Recv(&sigma, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
729                                 
730                                 int startOTU = pid * numOTUs / ncpus;
731                                 int endOTU = (pid + 1) * numOTUs / ncpus;
732
733                                 int startSeq = pid * numSeqs / ncpus;
734                                 int endSeq = (pid + 1) * numSeqs /ncpus;
735                                 
736                                 int total;
737
738                                 while(live){
739                                         
740                                         if (m->control_pressed) { break; }
741                                         
742                                         MPI_Recv(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
743                                         singleTau.assign(total, 0.0000);
744                                         seqNumber.assign(total, 0);
745                                         seqIndex.assign(total, 0);
746
747                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
748                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
749                                         MPI_Recv(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
750                                         MPI_Recv(&seqNumber[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
751                                         MPI_Recv(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
752                                         MPI_Recv(&nSeqsPerOTU[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
753                                         MPI_Recv(&cumNumSeqs[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
754
755                                         calcCentroidsDriver(startOTU, endOTU);
756
757                                         MPI_Send(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD);
758                                         MPI_Send(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD);
759
760                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
761                                         MPI_Recv(&weight[0], numOTUs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
762                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
763
764                                         vector<int> otuIndex(total, 0);
765                                         calcNewDistancesChildMPI(startSeq, endSeq, otuIndex);
766                                         total = otuIndex.size();
767                                         
768                                         MPI_Send(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
769                                         MPI_Send(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
770                                         MPI_Send(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
771                                         MPI_Send(&dist[0], numOTUs * numSeqs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
772                                         MPI_Send(&otuIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
773                                 
774                                         MPI_Recv(&live, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
775                                 }
776                         }
777                 }
778                 
779                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
780                 
781                 MPI_Barrier(MPI_COMM_WORLD);
782
783                 
784                 if(compositeFASTAFileName != ""){
785                         outputNames.push_back(compositeFASTAFileName); outputTypes["fasta"].push_back(compositeFASTAFileName);
786                         outputNames.push_back(compositeNamesFileName); outputTypes["name"].push_back(compositeNamesFileName); 
787                 }
788
789                 m->mothurOutEndLine();
790                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
791                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
792                 m->mothurOutEndLine();
793                 
794                 return 0;
795
796         }
797         catch(exception& e) {
798                 m->errorOut(e, "ShhherCommand", "execute");
799                 exit(1);
800         }
801 }
802 /**************************************************************************************************/
803 string ShhherCommand::createNamesFile(){
804         try{
805                 
806                 vector<string> duplicateNames(numUniques, "");
807                 for(int i=0;i<numSeqs;i++){
808                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
809                 }
810                 map<string, string> variables; 
811                 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(flowFileName));
812                 string nameFileName = getOutputFileName("name",variables);
813                 
814                 ofstream nameFile;
815                 m->openOutputFile(nameFileName, nameFile);
816                 
817                 for(int i=0;i<numUniques;i++){
818                         
819                         if (m->control_pressed) { break; }
820                         
821             //                  nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
822                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
823                 }
824                 
825                 nameFile.close();
826                 return  nameFileName;
827         }
828         catch(exception& e) {
829                 m->errorOut(e, "ShhherCommand", "createNamesFile");
830                 exit(1);
831         }
832 }
833 /**************************************************************************************************/
834
835 string ShhherCommand::flowDistMPI(int startSeq, int stopSeq){
836         try{            
837                 ostringstream outStream;
838                 outStream.setf(ios::fixed, ios::floatfield);
839                 outStream.setf(ios::dec, ios::basefield);
840                 outStream.setf(ios::showpoint);
841                 outStream.precision(6);
842                 
843                 int begTime = time(NULL);
844                 double begClock = clock();
845                 
846                 for(int i=startSeq;i<stopSeq;i++){
847                         
848                         if (m->control_pressed) { break; }
849                         
850                         for(int j=0;j<i;j++){
851                                 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
852                                 
853                                 if(flowDistance < 1e-6){
854                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
855                                 }
856                                 else if(flowDistance <= cutoff){
857                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
858                                 }
859                         }
860                         if(i % 100 == 0){
861                                 m->mothurOut(toString(i) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
862                         }
863                 }
864                 
865                 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
866                 if(pid != 0){   fDistFileName += ".temp." + toString(pid);      }
867                 
868                 if (m->control_pressed) { return fDistFileName; }
869                 
870                 m->mothurOut(toString(stopSeq) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
871
872                 ofstream distFile(fDistFileName.c_str());
873                 distFile << outStream.str();            
874                 distFile.close();
875                 
876                 return fDistFileName;
877         }
878         catch(exception& e) {
879                 m->errorOut(e, "ShhherCommand", "flowDistMPI");
880                 exit(1);
881         }
882 }
883 /**************************************************************************************************/
884  
885 void ShhherCommand::getOTUData(string listFileName){
886     try {
887         
888         ifstream listFile;
889         m->openInputFile(listFileName, listFile);
890         string label;
891         
892         listFile >> label >> numOTUs;
893         
894         otuData.assign(numSeqs, 0);
895         cumNumSeqs.assign(numOTUs, 0);
896         nSeqsPerOTU.assign(numOTUs, 0);
897         aaP.clear();aaP.resize(numOTUs);
898         
899         seqNumber.clear();
900         aaI.clear();
901         seqIndex.clear();
902         
903         string singleOTU = "";
904         
905         for(int i=0;i<numOTUs;i++){
906             
907             if (m->control_pressed) { break; }
908             
909             listFile >> singleOTU;
910             
911             istringstream otuString(singleOTU);
912             
913             while(otuString){
914                 
915                 string seqName = "";
916                 
917                 for(int j=0;j<singleOTU.length();j++){
918                     char letter = otuString.get();
919                     
920                     if(letter != ','){
921                         seqName += letter;
922                     }
923                     else{
924                         map<string,int>::iterator nmIt = nameMap.find(seqName);
925                         int index = nmIt->second;
926                         
927                         nameMap.erase(nmIt);
928                         
929                         otuData[index] = i;
930                         nSeqsPerOTU[i]++;
931                         aaP[i].push_back(index);
932                         seqName = "";
933                     }
934                 }
935                 
936                 map<string,int>::iterator nmIt = nameMap.find(seqName);
937                 
938                 int index = nmIt->second;
939                 nameMap.erase(nmIt);
940                 
941                 otuData[index] = i;
942                 nSeqsPerOTU[i]++;
943                 aaP[i].push_back(index);        
944                 
945                 otuString.get();
946             }
947             
948             sort(aaP[i].begin(), aaP[i].end());
949             for(int j=0;j<nSeqsPerOTU[i];j++){
950                 seqNumber.push_back(aaP[i][j]);
951             }
952             for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
953                 aaP[i].push_back(0);
954             }
955             
956             
957         }
958         
959         for(int i=1;i<numOTUs;i++){
960             cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
961         }
962         aaI = aaP;
963         seqIndex = seqNumber;
964         
965         listFile.close();       
966         
967     }
968     catch(exception& e) {
969         m->errorOut(e, "ShhherCommand", "getOTUData");
970         exit(1);        
971     }           
972 }
973
974 /**************************************************************************************************/
975
976 void ShhherCommand::initPyroCluster(){                          
977     try{
978         if (numOTUs < processors) { processors = 1; }
979         
980         if (m->debug) { m->mothurOut("[DEBUG]: numSeqs = " + toString(numSeqs) + " numOTUS = " + toString(numOTUs) + " about to alloc a dist vector with size = " + toString((numSeqs * numOTUs)) + ".\n"); }
981         
982         dist.assign(numSeqs * numOTUs, 0);
983         change.assign(numOTUs, 1);
984         centroids.assign(numOTUs, -1);
985         weight.assign(numOTUs, 0);
986         singleTau.assign(numSeqs, 1.0);
987         
988         nSeqsBreaks.assign(processors+1, 0);
989         nOTUsBreaks.assign(processors+1, 0);
990         
991         if (m->debug) { m->mothurOut("[DEBUG]: made it through the memory allocation.\n"); }
992         
993         nSeqsBreaks[0] = 0;
994         for(int i=0;i<processors;i++){
995             nSeqsBreaks[i+1] = nSeqsBreaks[i] + (int)((double) numSeqs / (double) processors);
996             nOTUsBreaks[i+1] = nOTUsBreaks[i] + (int)((double) numOTUs / (double) processors);
997         }
998         nSeqsBreaks[processors] = numSeqs;
999         nOTUsBreaks[processors] = numOTUs;
1000     }
1001     catch(exception& e) {
1002         m->errorOut(e, "ShhherCommand", "initPyroCluster");
1003         exit(1);        
1004     }
1005 }
1006
1007 /**************************************************************************************************/
1008
1009 void ShhherCommand::fill(){
1010     try {
1011         int index = 0;
1012         for(int i=0;i<numOTUs;i++){
1013             
1014             if (m->control_pressed) { break; }
1015             
1016             cumNumSeqs[i] = index;
1017             for(int j=0;j<nSeqsPerOTU[i];j++){
1018                 seqNumber[index] = aaP[i][j];
1019                 seqIndex[index] = aaI[i][j];
1020                 
1021                 index++;
1022             }
1023         }
1024     }
1025     catch(exception& e) {
1026         m->errorOut(e, "ShhherCommand", "fill");
1027         exit(1);        
1028     }           
1029 }
1030
1031 /**************************************************************************************************/
1032  
1033 void ShhherCommand::getFlowData(){
1034     try{
1035         ifstream flowFile;
1036         m->openInputFile(flowFileName, flowFile);
1037         
1038         string seqName;
1039         seqNameVector.clear();
1040         lengths.clear();
1041         flowDataIntI.clear();
1042         nameMap.clear();
1043         
1044         
1045         int currentNumFlowCells;
1046         
1047         float intensity;
1048         
1049         string numFlowTest;
1050         flowFile >> numFlowTest;
1051         
1052         if (!m->isContainingOnlyDigits(numFlowTest)) { m->mothurOut("[ERROR]: expected a number and got " + numFlowTest + ", quitting. Did you use the flow parameter instead of the file parameter?"); m->mothurOutEndLine(); exit(1); }
1053         else { convert(numFlowTest, numFlowCells); }
1054         
1055         int index = 0;//pcluster
1056         while(!flowFile.eof()){
1057             
1058             if (m->control_pressed) { break; }
1059             
1060             flowFile >> seqName >> currentNumFlowCells;
1061             lengths.push_back(currentNumFlowCells);
1062             
1063             seqNameVector.push_back(seqName);
1064             nameMap[seqName] = index++;//pcluster
1065             
1066             for(int i=0;i<numFlowCells;i++){
1067                 flowFile >> intensity;
1068                 if(intensity > 9.99)    {       intensity = 9.99;       }
1069                 int intI = int(100 * intensity + 0.0001);
1070                 flowDataIntI.push_back(intI);
1071             }
1072             m->gobble(flowFile);
1073         }
1074         flowFile.close();
1075         
1076         numSeqs = seqNameVector.size();         
1077         
1078         for(int i=0;i<numSeqs;i++){
1079             
1080             if (m->control_pressed) { break; }
1081             
1082             int iNumFlowCells = i * numFlowCells;
1083             for(int j=lengths[i];j<numFlowCells;j++){
1084                 flowDataIntI[iNumFlowCells + j] = 0;
1085             }
1086         }
1087         
1088     }
1089     catch(exception& e) {
1090         m->errorOut(e, "ShhherCommand", "getFlowData");
1091         exit(1);
1092     }
1093 }
1094 /**************************************************************************************************/
1095 void ShhherCommand::calcNewDistancesChildMPI(int startSeq, int stopSeq, vector<int>& otuIndex){
1096         
1097         try{
1098                 vector<double> newTau(numOTUs,0);
1099                 vector<double> norms(numSeqs, 0);
1100                 otuIndex.clear();
1101                 seqIndex.clear();
1102                 singleTau.clear();
1103                 
1104                 for(int i=startSeq;i<stopSeq;i++){
1105                         
1106                         if (m->control_pressed) { break; }
1107                         
1108                         double offset = 1e8;
1109                         int indexOffset = i * numOTUs;
1110                         
1111                         for(int j=0;j<numOTUs;j++){
1112                                 
1113                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1114                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1115                                 }
1116                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1117                                         offset = dist[indexOffset + j];
1118                                 }
1119                         }
1120                         
1121                         for(int j=0;j<numOTUs;j++){
1122                                 if(weight[j] > MIN_WEIGHT){
1123                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1124                                         norms[i] += newTau[j];
1125                                 }
1126                                 else{
1127                                         newTau[j] = 0.0;
1128                                 }
1129                         }
1130                         
1131                         for(int j=0;j<numOTUs;j++){
1132                 
1133                                 newTau[j] /= norms[i];
1134                                 
1135                                 if(newTau[j] > MIN_TAU){
1136                                         otuIndex.push_back(j);
1137                                         seqIndex.push_back(i);
1138                                         singleTau.push_back(newTau[j]);
1139                                 }
1140                         }
1141                         
1142                 }
1143         }
1144         catch(exception& e) {
1145                 m->errorOut(e, "ShhherCommand", "calcNewDistancesChildMPI");
1146                 exit(1);        
1147         }               
1148 }
1149
1150 /**************************************************************************************************/
1151
1152 void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){
1153     
1154     try{
1155         
1156         int total = 0;
1157         vector<double> newTau(numOTUs,0);
1158         vector<double> norms(numSeqs, 0);
1159         nSeqsPerOTU.assign(numOTUs, 0);
1160         
1161         for(int i=startSeq;i<stopSeq;i++){
1162             
1163             if (m->control_pressed) { break; }
1164             
1165             int indexOffset = i * numOTUs;
1166             
1167             double offset = 1e8;
1168             
1169             for(int j=0;j<numOTUs;j++){
1170                 
1171                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1172                     dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1173                 }
1174                 
1175                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1176                     offset = dist[indexOffset + j];
1177                 }
1178             }
1179             
1180             for(int j=0;j<numOTUs;j++){
1181                 if(weight[j] > MIN_WEIGHT){
1182                     newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1183                     norms[i] += newTau[j];
1184                 }
1185                 else{
1186                     newTau[j] = 0.0;
1187                 }
1188             }
1189             
1190             for(int j=0;j<numOTUs;j++){
1191                 newTau[j] /= norms[i];
1192             }
1193             
1194             for(int j=0;j<numOTUs;j++){
1195                 if(newTau[j] > MIN_TAU){
1196                     
1197                     int oldTotal = total;
1198                     
1199                     total++;
1200                     
1201                     singleTau.resize(total, 0);
1202                     seqNumber.resize(total, 0);
1203                     seqIndex.resize(total, 0);
1204                     
1205                     singleTau[oldTotal] = newTau[j];
1206                     
1207                     aaP[j][nSeqsPerOTU[j]] = oldTotal;
1208                     aaI[j][nSeqsPerOTU[j]] = i;
1209                     nSeqsPerOTU[j]++;
1210                 }
1211             }
1212             
1213         }
1214         
1215     }
1216     catch(exception& e) {
1217         m->errorOut(e, "ShhherCommand", "calcNewDistancesParent");
1218         exit(1);        
1219     }           
1220 }
1221
1222 /**************************************************************************************************/
1223
1224 void ShhherCommand::setOTUs(){
1225     
1226     try {
1227         vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
1228         
1229         for(int i=0;i<numOTUs;i++){
1230             
1231             if (m->control_pressed) { break; }
1232             
1233             for(int j=0;j<nSeqsPerOTU[i];j++){
1234                 int index = cumNumSeqs[i] + j;
1235                 double tauValue = singleTau[seqNumber[index]];
1236                 int sIndex = seqIndex[index];
1237                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
1238             }
1239         }
1240         
1241         for(int i=0;i<numSeqs;i++){
1242             double maxTau = -1.0000;
1243             int maxOTU = -1;
1244             for(int j=0;j<numOTUs;j++){
1245                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
1246                     maxTau = bigTauMatrix[i * numOTUs + j];
1247                     maxOTU = j;
1248                 }
1249             }
1250             
1251             otuData[i] = maxOTU;
1252         }
1253         
1254         nSeqsPerOTU.assign(numOTUs, 0);         
1255         
1256         for(int i=0;i<numSeqs;i++){
1257             int index = otuData[i];
1258             
1259             singleTau[i] = 1.0000;
1260             dist[i] = 0.0000;
1261             
1262             aaP[index][nSeqsPerOTU[index]] = i;
1263             aaI[index][nSeqsPerOTU[index]] = i;
1264             
1265             nSeqsPerOTU[index]++;
1266         }
1267         fill(); 
1268     }
1269     catch(exception& e) {
1270         m->errorOut(e, "ShhherCommand", "setOTUs");
1271         exit(1);        
1272     }           
1273 }
1274
1275 /**************************************************************************************************/
1276  
1277 void ShhherCommand::getUniques(){
1278     try{
1279         
1280         
1281         numUniques = 0;
1282         uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
1283         uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
1284         uniqueLengths.assign(numSeqs, 0);
1285         mapSeqToUnique.assign(numSeqs, -1);
1286         mapUniqueToSeq.assign(numSeqs, -1);
1287         
1288         vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
1289         
1290         for(int i=0;i<numSeqs;i++){
1291             
1292             if (m->control_pressed) { break; }
1293             
1294             int index = 0;
1295             
1296             vector<short> current(numFlowCells);
1297             for(int j=0;j<numFlowCells;j++){
1298                 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
1299             }
1300             
1301             for(int j=0;j<numUniques;j++){
1302                 int offset = j * numFlowCells;
1303                 bool toEnd = 1;
1304                 
1305                 int shorterLength;
1306                 if(lengths[i] < uniqueLengths[j])       {       shorterLength = lengths[i];                     }
1307                 else                                                            {       shorterLength = uniqueLengths[j];       }
1308                 
1309                 for(int k=0;k<shorterLength;k++){
1310                     if(current[k] != uniqueFlowgrams[offset + k]){
1311                         toEnd = 0;
1312                         break;
1313                     }
1314                 }
1315                 
1316                 if(toEnd){
1317                     mapSeqToUnique[i] = j;
1318                     uniqueCount[j]++;
1319                     index = j;
1320                     if(lengths[i] > uniqueLengths[j])   {       uniqueLengths[j] = lengths[i];  }
1321                     break;
1322                 }
1323                 index++;
1324             }
1325             
1326             if(index == numUniques){
1327                 uniqueLengths[numUniques] = lengths[i];
1328                 uniqueCount[numUniques] = 1;
1329                 mapSeqToUnique[i] = numUniques;//anMap
1330                 mapUniqueToSeq[numUniques] = i;//anF
1331                 
1332                 for(int k=0;k<numFlowCells;k++){
1333                     uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
1334                     uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
1335                 }
1336                 
1337                 numUniques++;
1338             }
1339         }
1340         uniqueFlowDataIntI.resize(numFlowCells * numUniques);
1341         uniqueLengths.resize(numUniques);       
1342         
1343         flowDataPrI.resize(numSeqs * numFlowCells, 0);
1344         for(int i=0;i<flowDataPrI.size();i++)   {       if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);          }
1345     }
1346     catch(exception& e) {
1347         m->errorOut(e, "ShhherCommand", "getUniques");
1348         exit(1);
1349     }
1350 }
1351
1352 /**************************************************************************************************/
1353
1354 float ShhherCommand::calcPairwiseDist(int seqA, int seqB){
1355     try{
1356         int minLength = lengths[mapSeqToUnique[seqA]];
1357         if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
1358         
1359         int ANumFlowCells = seqA * numFlowCells;
1360         int BNumFlowCells = seqB * numFlowCells;
1361         
1362         float dist = 0;
1363         
1364         for(int i=0;i<minLength;i++){
1365             
1366             if (m->control_pressed) { break; }
1367             
1368             int flowAIntI = flowDataIntI[ANumFlowCells + i];
1369             float flowAPrI = flowDataPrI[ANumFlowCells + i];
1370             
1371             int flowBIntI = flowDataIntI[BNumFlowCells + i];
1372             float flowBPrI = flowDataPrI[BNumFlowCells + i];
1373             dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
1374         }
1375         
1376         dist /= (float) minLength;
1377         return dist;
1378     }
1379     catch(exception& e) {
1380         m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
1381         exit(1);
1382     }
1383 }
1384
1385 //**********************************************************************************************************************/
1386
1387 string ShhherCommand::cluster(string distFileName, string namesFileName){
1388     try {
1389         
1390         ReadMatrix* read = new ReadColumnMatrix(distFileName);  
1391                 read->setCutoff(cutoff);
1392                 
1393                 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
1394                 clusterNameMap->readMap();
1395                 read->read(clusterNameMap);
1396         
1397                 ListVector* list = read->getListVector();
1398                 SparseDistanceMatrix* matrix = read->getDMatrix();
1399                 
1400                 delete read; 
1401                 delete clusterNameMap; 
1402         
1403         RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
1404         
1405         Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); 
1406         string tag = cluster->getTag();
1407         
1408         double clusterCutoff = cutoff;
1409         while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
1410             
1411             if (m->control_pressed) { break; }
1412             
1413             cluster->update(clusterCutoff);
1414         }
1415         
1416         list->setLabel(toString(cutoff));
1417         
1418         string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
1419         ofstream listFile;
1420         m->openOutputFile(listFileName, listFile);
1421         list->print(listFile);
1422         listFile.close();
1423         
1424         delete matrix;  delete cluster; delete rabund; delete list;
1425         
1426         return listFileName;
1427     }
1428     catch(exception& e) {
1429         m->errorOut(e, "ShhherCommand", "cluster");
1430         exit(1);        
1431     }           
1432 }
1433
1434 /**************************************************************************************************/
1435
1436 void ShhherCommand::calcCentroidsDriver(int start, int finish){                          
1437     
1438     //this function gets the most likely homopolymer length at a flow position for a group of sequences
1439     //within an otu
1440     
1441     try{
1442         
1443         for(int i=start;i<finish;i++){
1444             
1445             if (m->control_pressed) { break; }
1446             
1447             double count = 0;
1448             int position = 0;
1449             int minFlowGram = 100000000;
1450             double minFlowValue = 1e8;
1451             change[i] = 0; //FALSE
1452             
1453             for(int j=0;j<nSeqsPerOTU[i];j++){
1454                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
1455             }
1456             
1457             if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
1458                 vector<double> adF(nSeqsPerOTU[i]);
1459                 vector<int> anL(nSeqsPerOTU[i]);
1460                 
1461                 for(int j=0;j<nSeqsPerOTU[i];j++){
1462                     int index = cumNumSeqs[i] + j;
1463                     int nI = seqIndex[index];
1464                     int nIU = mapSeqToUnique[nI];
1465                     
1466                     int k;
1467                     for(k=0;k<position;k++){
1468                         if(nIU == anL[k]){
1469                             break;
1470                         }
1471                     }
1472                     if(k == position){
1473                         anL[position] = nIU;
1474                         adF[position] = 0.0000;
1475                         position++;
1476                     }                                           
1477                 }
1478                 
1479                 for(int j=0;j<nSeqsPerOTU[i];j++){
1480                     int index = cumNumSeqs[i] + j;
1481                     int nI = seqIndex[index];
1482                     
1483                     double tauValue = singleTau[seqNumber[index]];
1484                     
1485                     for(int k=0;k<position;k++){
1486                         double dist = getDistToCentroid(anL[k], nI, lengths[nI]);
1487                         adF[k] += dist * tauValue;
1488                     }
1489                 }
1490                 
1491                 for(int j=0;j<position;j++){
1492                     if(adF[j] < minFlowValue){
1493                         minFlowGram = j;
1494                         minFlowValue = adF[j];
1495                     }
1496                 }
1497                 
1498                 if(centroids[i] != anL[minFlowGram]){
1499                     change[i] = 1;
1500                     centroids[i] = anL[minFlowGram];
1501                 }
1502             }
1503             else if(centroids[i] != -1){
1504                 change[i] = 1;
1505                 centroids[i] = -1;                      
1506             }
1507         }
1508     }
1509     catch(exception& e) {
1510         m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1511         exit(1);        
1512     }           
1513 }
1514
1515 /**************************************************************************************************/
1516
1517 double ShhherCommand::getDistToCentroid(int cent, int flow, int length){
1518     try{
1519         
1520         int flowAValue = cent * numFlowCells;
1521         int flowBValue = flow * numFlowCells;
1522         
1523         double dist = 0;
1524         
1525         for(int i=0;i<length;i++){
1526             dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
1527             flowAValue++;
1528             flowBValue++;
1529         }
1530         
1531         return dist / (double)length;
1532     }
1533     catch(exception& e) {
1534         m->errorOut(e, "ShhherCommand", "getDistToCentroid");
1535         exit(1);        
1536     }           
1537 }
1538
1539 /**************************************************************************************************/
1540
1541 double ShhherCommand::getNewWeights(){
1542     try{
1543         
1544         double maxChange = 0;
1545         
1546         for(int i=0;i<numOTUs;i++){
1547             
1548             if (m->control_pressed) { break; }
1549             
1550             double difference = weight[i];
1551             weight[i] = 0;
1552             
1553             for(int j=0;j<nSeqsPerOTU[i];j++){
1554                 int index = cumNumSeqs[i] + j;
1555                 double tauValue = singleTau[seqNumber[index]];
1556                 weight[i] += tauValue;
1557             }
1558             
1559             difference = fabs(weight[i] - difference);
1560             if(difference > maxChange){ maxChange = difference; }
1561         }
1562         return maxChange;
1563     }
1564     catch(exception& e) {
1565         m->errorOut(e, "ShhherCommand", "getNewWeights");
1566         exit(1);        
1567     }           
1568 }
1569  
1570  /**************************************************************************************************/
1571  
1572 double ShhherCommand::getLikelihood(){
1573     
1574     try{
1575         
1576         vector<long double> P(numSeqs, 0);
1577         int effNumOTUs = 0;
1578         
1579         for(int i=0;i<numOTUs;i++){
1580             if(weight[i] > MIN_WEIGHT){
1581                 effNumOTUs++;
1582             }
1583         }
1584         
1585         string hold;
1586         for(int i=0;i<numOTUs;i++){
1587             
1588             if (m->control_pressed) { break; }
1589             
1590             for(int j=0;j<nSeqsPerOTU[i];j++){
1591                 int index = cumNumSeqs[i] + j;
1592                 int nI = seqIndex[index];
1593                 double singleDist = dist[seqNumber[index]];
1594                 
1595                 P[nI] += weight[i] * exp(-singleDist * sigma);
1596             }
1597         }
1598         double nLL = 0.00;
1599         for(int i=0;i<numSeqs;i++){
1600             if(P[i] == 0){      P[i] = DBL_EPSILON;     }
1601             
1602             nLL += -log(P[i]);
1603         }
1604         
1605         nLL = nLL -(double)numSeqs * log(sigma);
1606         
1607         return nLL; 
1608     }
1609     catch(exception& e) {
1610         m->errorOut(e, "ShhherCommand", "getNewWeights");
1611         exit(1);        
1612     }           
1613 }
1614
1615 /**************************************************************************************************/
1616
1617 void ShhherCommand::checkCentroids(){
1618     try{
1619         vector<int> unique(numOTUs, 1);
1620         
1621         for(int i=0;i<numOTUs;i++){
1622             if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
1623                 unique[i] = -1;
1624             }
1625         }
1626         
1627         for(int i=0;i<numOTUs;i++){
1628             
1629             if (m->control_pressed) { break; }
1630             
1631             if(unique[i] == 1){
1632                 for(int j=i+1;j<numOTUs;j++){
1633                     if(unique[j] == 1){
1634                         
1635                         if(centroids[j] == centroids[i]){
1636                             unique[j] = 0;
1637                             centroids[j] = -1;
1638                             
1639                             weight[i] += weight[j];
1640                             weight[j] = 0.0;
1641                         }
1642                     }
1643                 }
1644             }
1645         }
1646     }
1647     catch(exception& e) {
1648         m->errorOut(e, "ShhherCommand", "checkCentroids");
1649         exit(1);        
1650     }           
1651 }
1652  /**************************************************************************************************/
1653
1654
1655  
1656 void ShhherCommand::writeQualities(vector<int> otuCounts){
1657     
1658     try {
1659         string thisOutputDir = outputDir;
1660         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1661         map<string, string> variables; 
1662                 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(flowFileName));
1663         string qualityFileName = getOutputFileName("qfile",variables);
1664         
1665         ofstream qualityFile;
1666         m->openOutputFile(qualityFileName, qualityFile);
1667         
1668         qualityFile.setf(ios::fixed, ios::floatfield);
1669         qualityFile.setf(ios::showpoint);
1670         qualityFile << setprecision(6);
1671         
1672         vector<vector<int> > qualities(numOTUs);
1673         vector<double> pr(HOMOPS, 0);
1674         
1675         
1676         for(int i=0;i<numOTUs;i++){
1677             
1678             if (m->control_pressed) { break; }
1679             
1680             int index = 0;
1681             int base = 0;
1682             
1683             if(nSeqsPerOTU[i] > 0){
1684                 qualities[i].assign(1024, -1);
1685                 
1686                 while(index < numFlowCells){
1687                     double maxPrValue = 1e8;
1688                     short maxPrIndex = -1;
1689                     double count = 0.0000;
1690                     
1691                     pr.assign(HOMOPS, 0);
1692                     
1693                     for(int j=0;j<nSeqsPerOTU[i];j++){
1694                         int lIndex = cumNumSeqs[i] + j;
1695                         double tauValue = singleTau[seqNumber[lIndex]];
1696                         int sequenceIndex = aaI[i][j];
1697                         short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
1698                         
1699                         count += tauValue;
1700                         
1701                         for(int s=0;s<HOMOPS;s++){
1702                             pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
1703                         }
1704                     }
1705                     
1706                     maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
1707                     maxPrValue = pr[maxPrIndex];
1708                     
1709                     if(count > MIN_COUNT){
1710                         double U = 0.0000;
1711                         double norm = 0.0000;
1712                         
1713                         for(int s=0;s<HOMOPS;s++){
1714                             norm += exp(-(pr[s] - maxPrValue));
1715                         }
1716                         
1717                         for(int s=1;s<=maxPrIndex;s++){
1718                             int value = 0;
1719                             double temp = 0.0000;
1720                             
1721                             U += exp(-(pr[s-1]-maxPrValue))/norm;
1722                             
1723                             if(U>0.00){
1724                                 temp = log10(U);
1725                             }
1726                             else{
1727                                 temp = -10.1;
1728                             }
1729                             temp = floor(-10 * temp);
1730                             value = (int)floor(temp);
1731                             if(value > 100){    value = 100;    }
1732                             
1733                             qualities[i][base] = (int)value;
1734                             base++;
1735                         }
1736                     }
1737                     
1738                     index++;
1739                 }
1740             }
1741             
1742             
1743             if(otuCounts[i] > 0){
1744                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
1745                 
1746                 int j=4;        //need to get past the first four bases
1747                 while(qualities[i][j] != -1){
1748                     qualityFile << qualities[i][j] << ' ';
1749                     j++;
1750                 }
1751                 qualityFile << endl;
1752             }
1753         }
1754         qualityFile.close();
1755         outputNames.push_back(qualityFileName); outputTypes["qfile"].push_back(qualityFileName);
1756         
1757     }
1758     catch(exception& e) {
1759         m->errorOut(e, "ShhherCommand", "writeQualities");
1760         exit(1);        
1761     }           
1762 }
1763
1764 /**************************************************************************************************/
1765
1766 void ShhherCommand::writeSequences(vector<int> otuCounts){
1767     try {
1768         string thisOutputDir = outputDir;
1769         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1770         map<string, string> variables; 
1771                 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
1772         string fastaFileName = getOutputFileName("fasta",variables);
1773         ofstream fastaFile;
1774         m->openOutputFile(fastaFileName, fastaFile);
1775         
1776         vector<string> names(numOTUs, "");
1777         
1778         for(int i=0;i<numOTUs;i++){
1779             
1780             if (m->control_pressed) { break; }
1781             
1782             int index = centroids[i];
1783             
1784             if(otuCounts[i] > 0){
1785                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
1786                 
1787                 string newSeq = "";
1788                 
1789                 for(int j=0;j<numFlowCells;j++){
1790                     
1791                     char base = flowOrder[j % flowOrder.length()];
1792                     for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
1793                         newSeq += base;
1794                     }
1795                 }
1796                 
1797                 fastaFile << newSeq.substr(4) << endl;
1798             }
1799         }
1800         fastaFile.close();
1801         
1802         outputNames.push_back(fastaFileName); outputTypes["fasta"].push_back(fastaFileName);
1803         
1804         if(compositeFASTAFileName != ""){
1805             m->appendFiles(fastaFileName, compositeFASTAFileName);
1806         }
1807     }
1808     catch(exception& e) {
1809         m->errorOut(e, "ShhherCommand", "writeSequences");
1810         exit(1);        
1811     }           
1812 }
1813
1814 /**************************************************************************************************/
1815
1816 void ShhherCommand::writeNames(vector<int> otuCounts){
1817     try {
1818         string thisOutputDir = outputDir;
1819         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1820         map<string, string> variables; 
1821                 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
1822         string nameFileName = getOutputFileName("name",variables);
1823         ofstream nameFile;
1824         m->openOutputFile(nameFileName, nameFile);
1825         
1826         for(int i=0;i<numOTUs;i++){
1827             
1828             if (m->control_pressed) { break; }
1829             
1830             if(otuCounts[i] > 0){
1831                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
1832                 
1833                 for(int j=1;j<nSeqsPerOTU[i];j++){
1834                     nameFile << ',' << seqNameVector[aaI[i][j]];
1835                 }
1836                 
1837                 nameFile << endl;
1838             }
1839         }
1840         nameFile.close();
1841         outputNames.push_back(nameFileName); outputTypes["name"].push_back(nameFileName);
1842         
1843         
1844         if(compositeNamesFileName != ""){
1845             m->appendFiles(nameFileName, compositeNamesFileName);
1846         }               
1847     }
1848     catch(exception& e) {
1849         m->errorOut(e, "ShhherCommand", "writeNames");
1850         exit(1);        
1851     }           
1852 }
1853
1854 /**************************************************************************************************/
1855
1856 void ShhherCommand::writeGroups(){
1857     try {
1858         string thisOutputDir = outputDir;
1859         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1860         string fileRoot = m->getRootName(m->getSimpleName(flowFileName));
1861         int pos = fileRoot.find_first_of('.');
1862         string fileGroup = fileRoot;
1863         if (pos != string::npos) {  fileGroup = fileRoot.substr(pos+1, (fileRoot.length()-1-(pos+1)));  }
1864         map<string, string> variables; 
1865                 variables["[filename]"] = thisOutputDir + fileRoot;
1866         string groupFileName = getOutputFileName("group",variables);
1867         ofstream groupFile;
1868         m->openOutputFile(groupFileName, groupFile);
1869         
1870         for(int i=0;i<numSeqs;i++){
1871             if (m->control_pressed) { break; }
1872             groupFile << seqNameVector[i] << '\t' << fileGroup << endl;
1873         }
1874         groupFile.close();
1875         outputNames.push_back(groupFileName); outputTypes["group"].push_back(groupFileName);
1876         
1877     }
1878     catch(exception& e) {
1879         m->errorOut(e, "ShhherCommand", "writeGroups");
1880         exit(1);        
1881     }           
1882 }
1883
1884 /**************************************************************************************************/
1885
1886 void ShhherCommand::writeClusters(vector<int> otuCounts){
1887     try {
1888         string thisOutputDir = outputDir;
1889         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1890         map<string, string> variables; 
1891                 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
1892         string otuCountsFileName = getOutputFileName("counts",variables);
1893         ofstream otuCountsFile;
1894         m->openOutputFile(otuCountsFileName, otuCountsFile);
1895         
1896         string bases = flowOrder;
1897         
1898         for(int i=0;i<numOTUs;i++){
1899             
1900             if (m->control_pressed) {
1901                 break;
1902             }
1903             //output the translated version of the centroid sequence for the otu
1904             if(otuCounts[i] > 0){
1905                 int index = centroids[i];
1906                 
1907                 otuCountsFile << "ideal\t";
1908                 for(int j=8;j<numFlowCells;j++){
1909                     char base = bases[j % bases.length()];
1910                     for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
1911                         otuCountsFile << base;
1912                     }
1913                 }
1914                 otuCountsFile << endl;
1915                 
1916                 for(int j=0;j<nSeqsPerOTU[i];j++){
1917                     int sequence = aaI[i][j];
1918                     otuCountsFile << seqNameVector[sequence] << '\t';
1919                     
1920                     string newSeq = "";
1921                     
1922                     for(int k=0;k<lengths[sequence];k++){
1923                         char base = bases[k % bases.length()];
1924                         int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
1925                         
1926                         for(int s=0;s<freq;s++){
1927                             newSeq += base;
1928                             //otuCountsFile << base;
1929                         }
1930                     }
1931                     otuCountsFile << newSeq.substr(4) << endl;
1932                 }
1933                 otuCountsFile << endl;
1934             }
1935         }
1936         otuCountsFile.close();
1937         outputNames.push_back(otuCountsFileName); outputTypes["counts"].push_back(otuCountsFileName);
1938         
1939     }
1940     catch(exception& e) {
1941         m->errorOut(e, "ShhherCommand", "writeClusters");
1942         exit(1);        
1943     }           
1944 }
1945
1946 #else
1947 //**********************************************************************************************************************
1948
1949 int ShhherCommand::execute(){
1950         try {
1951                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
1952                 
1953                 getSingleLookUp();      if (m->control_pressed) { return 0; }
1954                 getJointLookUp();       if (m->control_pressed) { return 0; }
1955                 
1956         int numFiles = flowFileVector.size();
1957                 
1958         if (numFiles < processors) { processors = numFiles; }
1959         
1960 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1961         if (processors == 1) { driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName); }
1962         else { createProcesses(flowFileVector); } //each processor processes one file
1963 #else
1964         driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName);
1965 #endif
1966         
1967                 if(compositeFASTAFileName != ""){
1968                         outputNames.push_back(compositeFASTAFileName); outputTypes["fasta"].push_back(compositeFASTAFileName);
1969                         outputNames.push_back(compositeNamesFileName); outputTypes["name"].push_back(compositeNamesFileName);
1970                 }
1971
1972                 m->mothurOutEndLine();
1973                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
1974                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
1975                 m->mothurOutEndLine();
1976                 
1977                 return 0;
1978         }
1979         catch(exception& e) {
1980                 m->errorOut(e, "ShhherCommand", "execute");
1981                 exit(1);
1982         }
1983 }
1984 #endif
1985 //********************************************************************************************************************
1986 //sorts biggest to smallest
1987 inline bool compareFileSizes(string left, string right){
1988     
1989     FILE * pFile;
1990     long leftsize = 0;
1991     
1992     //get num bytes in file
1993     string filename = left;
1994     pFile = fopen (filename.c_str(),"rb");
1995     string error = "Error opening " + filename;
1996     if (pFile==NULL) perror (error.c_str());
1997     else{
1998         fseek (pFile, 0, SEEK_END);
1999         leftsize=ftell (pFile);
2000         fclose (pFile);
2001     }
2002     
2003     FILE * pFile2;
2004     long rightsize = 0;
2005     
2006     //get num bytes in file
2007     filename = right;
2008     pFile2 = fopen (filename.c_str(),"rb");
2009     error = "Error opening " + filename;
2010     if (pFile2==NULL) perror (error.c_str());
2011     else{
2012         fseek (pFile2, 0, SEEK_END);
2013         rightsize=ftell (pFile2);
2014         fclose (pFile2);
2015     }
2016     
2017     return (leftsize > rightsize);      
2018
2019 /**************************************************************************************************/
2020
2021 int ShhherCommand::createProcesses(vector<string> filenames){
2022     try {
2023         vector<int> processIDS;
2024                 int process = 1;
2025                 int num = 0;
2026                 
2027                 //sanity check
2028                 if (filenames.size() < processors) { processors = filenames.size(); }
2029         
2030         //sort file names by size to divide load better
2031         sort(filenames.begin(), filenames.end(), compareFileSizes);
2032         
2033         vector < vector <string> > dividedFiles; //dividedFiles[1] = vector of filenames for process 1...
2034         dividedFiles.resize(processors);
2035         
2036         //for each file, figure out which process will complete it
2037         //want to divide the load intelligently so the big files are spread between processes
2038         for (int i = 0; i < filenames.size(); i++) { 
2039             int processToAssign = (i+1) % processors; 
2040             if (processToAssign == 0) { processToAssign = processors; }
2041             
2042             dividedFiles[(processToAssign-1)].push_back(filenames[i]);
2043         }
2044         
2045         //now lets reverse the order of ever other process, so we balance big files running with little ones
2046         for (int i = 0; i < processors; i++) {
2047             int remainder = ((i+1) % processors);
2048             if (remainder) {  reverse(dividedFiles[i].begin(), dividedFiles[i].end());  }
2049         }
2050         
2051                         
2052                 //divide the groups between the processors
2053                 /*vector<linePair> lines;
2054         vector<int> numFilesToComplete;
2055                 int numFilesPerProcessor = filenames.size() / processors;
2056                 for (int i = 0; i < processors; i++) {
2057                         int startIndex =  i * numFilesPerProcessor;
2058                         int endIndex = (i+1) * numFilesPerProcessor;
2059                         if(i == (processors - 1)){      endIndex = filenames.size();    }
2060                         lines.push_back(linePair(startIndex, endIndex));
2061             numFilesToComplete.push_back((endIndex-startIndex));
2062                 }*/
2063                 
2064         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)          
2065                 
2066                 //loop through and create all the processes you want
2067                 while (process != processors) {
2068                         int pid = fork();
2069                         
2070                         if (pid > 0) {
2071                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
2072                                 process++;
2073                         }else if (pid == 0){
2074                                 num = driver(dividedFiles[process], compositeFASTAFileName + toString(getpid()) + ".temp", compositeNamesFileName  + toString(getpid()) + ".temp");
2075                 
2076                 //pass numSeqs to parent
2077                                 ofstream out;
2078                                 string tempFile = compositeFASTAFileName + toString(getpid()) + ".num.temp";
2079                                 m->openOutputFile(tempFile, out);
2080                                 out << num << endl;
2081                                 out.close();
2082                 
2083                                 exit(0);
2084                         }else { 
2085                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
2086                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
2087                                 exit(0);
2088                         }
2089                 }
2090                 
2091                 //do my part
2092                 driver(dividedFiles[0], compositeFASTAFileName, compositeNamesFileName);
2093                 
2094                 //force parent to wait until all the processes are done
2095                 for (int i=0;i<processIDS.size();i++) { 
2096                         int temp = processIDS[i];
2097                         wait(&temp);
2098                 }
2099         
2100         #else
2101         
2102         //////////////////////////////////////////////////////////////////////////////////////////////////////
2103         
2104         /////////////////////// NOT WORKING, ACCESS VIOLATION ON READ OF FLOWGRAMS IN THREAD /////////////////
2105         
2106         //////////////////////////////////////////////////////////////////////////////////////////////////////
2107                 //Windows version shared memory, so be careful when passing variables through the shhhFlowsData struct. 
2108                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
2109                 //////////////////////////////////////////////////////////////////////////////////////////////////////
2110                 /*
2111                 vector<shhhFlowsData*> pDataArray; 
2112                 DWORD   dwThreadIdArray[processors-1];
2113                 HANDLE  hThreadArray[processors-1]; 
2114                 
2115                 //Create processor worker threads.
2116                 for( int i=0; i<processors-1; i++ ){
2117                         // Allocate memory for thread data.
2118                         string extension = "";
2119                         if (i != 0) { extension = toString(i) + ".temp"; }
2120                         
2121             shhhFlowsData* tempFlow = new shhhFlowsData(filenames, (compositeFASTAFileName + extension), (compositeNamesFileName + extension), outputDir, flowOrder, jointLookUp, singleLookUp, m, lines[i].start, lines[i].end, cutoff, sigma, minDelta, maxIters, i);
2122                         pDataArray.push_back(tempFlow);
2123                         processIDS.push_back(i);
2124             
2125                         hThreadArray[i] = CreateThread(NULL, 0, ShhhFlowsThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);   
2126                 }
2127                 
2128                 //using the main process as a worker saves time and memory
2129                 //do my part
2130                 driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[processors-1].start, lines[processors-1].end);
2131                 
2132                 //Wait until all threads have terminated.
2133                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
2134                 
2135                 //Close all thread handles and free memory allocations.
2136                 for(int i=0; i < pDataArray.size(); i++){
2137                         for(int j=0; j < pDataArray[i]->outputNames.size(); j++){ outputNames.push_back(pDataArray[i]->outputNames[j]); }
2138                         CloseHandle(hThreadArray[i]);
2139                         delete pDataArray[i];
2140                 }
2141                 */
2142         #endif
2143         
2144         for (int i=0;i<processIDS.size();i++) { 
2145             ifstream in;
2146                         string tempFile =  compositeFASTAFileName + toString(processIDS[i]) + ".num.temp";
2147                         m->openInputFile(tempFile, in);
2148                         if (!in.eof()) { 
2149                 int tempNum = 0; 
2150                 in >> tempNum; 
2151                 if (tempNum != dividedFiles[i+1].size()) {
2152                     m->mothurOut("[ERROR]: main process expected " + toString(processIDS[i]) + " to complete " + toString(dividedFiles[i+1].size()) + " files, and it only reported completing " + toString(tempNum) + ". This will cause file mismatches.  The flow files may be too large to process with multiple processors. \n");
2153                 }
2154             }
2155                         in.close(); m->mothurRemove(tempFile);
2156             
2157             if (compositeFASTAFileName != "") {
2158                 m->appendFiles((compositeFASTAFileName + toString(processIDS[i]) + ".temp"), compositeFASTAFileName);
2159                 m->appendFiles((compositeNamesFileName + toString(processIDS[i]) + ".temp"), compositeNamesFileName);
2160                 m->mothurRemove((compositeFASTAFileName + toString(processIDS[i]) + ".temp"));
2161                 m->mothurRemove((compositeNamesFileName + toString(processIDS[i]) + ".temp"));
2162             }
2163         }
2164         
2165         return 0;
2166         
2167     }
2168         catch(exception& e) {
2169                 m->errorOut(e, "ShhherCommand", "createProcesses");
2170                 exit(1);
2171         }
2172 }
2173 /**************************************************************************************************/
2174
2175 vector<string> ShhherCommand::parseFlowFiles(string filename){
2176     try {
2177         vector<string> files;
2178         int count = 0;
2179         
2180         ifstream in;
2181         m->openInputFile(filename, in);
2182         
2183         int thisNumFLows = 0;
2184         in >> thisNumFLows; m->gobble(in);
2185         
2186         while (!in.eof()) {
2187             if (m->control_pressed) { break; }
2188             
2189             ofstream out;
2190             string outputFileName = filename + toString(count) + ".temp";
2191             m->openOutputFile(outputFileName, out);
2192             out << thisNumFLows << endl;
2193             files.push_back(outputFileName);
2194             
2195             int numLinesWrote = 0;
2196             for (int i = 0; i < largeSize; i++) {
2197                 if (in.eof()) { break; }
2198                 string line = m->getline(in); m->gobble(in);
2199                 out << line << endl;
2200                 numLinesWrote++;
2201             }
2202             out.close();
2203             
2204             if (numLinesWrote == 0) {  m->mothurRemove(outputFileName); files.pop_back();  }
2205             count++;
2206         }
2207         in.close();
2208         
2209         if (m->control_pressed) { for (int i = 0; i < files.size(); i++) { m->mothurRemove(files[i]); }  files.clear(); }
2210         
2211         m->mothurOut("\nDivided " + filename + " into " + toString(files.size()) + " files.\n\n"); 
2212         
2213         return files;
2214     }
2215         catch(exception& e) {
2216                 m->errorOut(e, "ShhherCommand", "parseFlowFiles");
2217                 exit(1);
2218         }
2219 }
2220 /**************************************************************************************************/
2221
2222 int ShhherCommand::driver(vector<string> filenames, string thisCompositeFASTAFileName, string thisCompositeNamesFileName){
2223     try {
2224         
2225         int numCompleted = 0;
2226         
2227         for(int i=0;i<filenames.size();i++){
2228                         
2229                         if (m->control_pressed) { break; }
2230                         
2231             vector<string> theseFlowFileNames; theseFlowFileNames.push_back(filenames[i]);
2232             if (large) {  theseFlowFileNames = parseFlowFiles(filenames[i]);  }
2233             
2234             if (m->control_pressed) { break; }
2235             
2236             double begClock = clock();
2237             unsigned long long begTime;
2238             
2239             string fileNameForOutput = filenames[i];
2240             
2241             for (int g = 0; g < theseFlowFileNames.size(); g++) {
2242                 
2243                 string flowFileName = theseFlowFileNames[g];
2244                 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(filenames.size()) + ")\t<<<<<\n");
2245                 m->mothurOut("Reading flowgrams...\n");
2246                 
2247                 vector<string> seqNameVector;
2248                 vector<int> lengths;
2249                 vector<short> flowDataIntI;
2250                 vector<double> flowDataPrI;
2251                 map<string, int> nameMap;
2252                 vector<short> uniqueFlowgrams;
2253                 vector<int> uniqueCount;
2254                 vector<int> mapSeqToUnique;
2255                 vector<int> mapUniqueToSeq;
2256                 vector<int> uniqueLengths;
2257                 int numFlowCells;
2258                 
2259                 if (m->debug) { m->mothurOut("[DEBUG]: About to read flowgrams.\n"); }
2260                 int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells);
2261                 
2262                 if (m->control_pressed) { break; }
2263                 
2264                 m->mothurOut("Identifying unique flowgrams...\n");
2265                 int numUniques = getUniques(numSeqs, numFlowCells, uniqueFlowgrams, uniqueCount, uniqueLengths, mapSeqToUnique, mapUniqueToSeq, lengths, flowDataPrI, flowDataIntI);
2266                 
2267                 if (m->control_pressed) { break; }
2268                 
2269                 m->mothurOut("Calculating distances between flowgrams...\n");
2270                 string distFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
2271                 begTime = time(NULL);
2272                
2273                 
2274                 flowDistParentFork(numFlowCells, distFileName, numUniques, mapUniqueToSeq, mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
2275                 
2276                 m->mothurOutEndLine();
2277                 m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
2278                 
2279                 
2280                 string namesFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
2281                 createNamesFile(numSeqs, numUniques, namesFileName, seqNameVector, mapSeqToUnique, mapUniqueToSeq);
2282                 
2283                 if (m->control_pressed) { break; }
2284                 
2285                 m->mothurOut("\nClustering flowgrams...\n");
2286                 string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
2287                 cluster(listFileName, distFileName, namesFileName);
2288                 
2289                 if (m->control_pressed) { break; }
2290                 
2291                 vector<int> otuData;
2292                 vector<int> cumNumSeqs;
2293                 vector<int> nSeqsPerOTU;
2294                 vector<vector<int> > aaP;       //tMaster->aanP:        each row is a different otu / each col contains the sequence indices
2295                 vector<vector<int> > aaI;       //tMaster->aanI:        that are in each otu - can't differentiate between aaP and aaI 
2296                 vector<int> seqNumber;          //tMaster->anP:         the sequence id number sorted by OTU
2297                 vector<int> seqIndex;           //tMaster->anI;         the index that corresponds to seqNumber
2298                 
2299                 
2300                 int numOTUs = getOTUData(numSeqs, listFileName, otuData, cumNumSeqs, nSeqsPerOTU, aaP, aaI, seqNumber, seqIndex, nameMap);
2301                 
2302                 if (m->control_pressed) { break; }
2303                 
2304                 m->mothurRemove(distFileName);
2305                 m->mothurRemove(namesFileName);
2306                 m->mothurRemove(listFileName);
2307                 
2308                 vector<double> dist;            //adDist - distance of sequences to centroids
2309                 vector<short> change;           //did the centroid sequence change? 0 = no; 1 = yes
2310                 vector<int> centroids;          //the representative flowgram for each cluster m
2311                 vector<double> weight;
2312                 vector<double> singleTau;       //tMaster->adTau:       1-D Tau vector (1xnumSeqs)
2313                 vector<int> nSeqsBreaks;
2314                 vector<int> nOTUsBreaks;
2315                 
2316                 if (m->debug) { m->mothurOut("[DEBUG]: numSeqs = " + toString(numSeqs) + " numOTUS = " + toString(numOTUs) + " about to alloc a dist vector with size = " + toString((numSeqs * numOTUs)) + ".\n"); }
2317                 
2318                 dist.assign(numSeqs * numOTUs, 0);
2319                 change.assign(numOTUs, 1);
2320                 centroids.assign(numOTUs, -1);
2321                 weight.assign(numOTUs, 0);
2322                 singleTau.assign(numSeqs, 1.0);
2323                 
2324                 nSeqsBreaks.assign(2, 0);
2325                 nOTUsBreaks.assign(2, 0);
2326                 
2327                 nSeqsBreaks[0] = 0;
2328                 nSeqsBreaks[1] = numSeqs;
2329                 nOTUsBreaks[1] = numOTUs;
2330                 
2331                 if (m->debug) { m->mothurOut("[DEBUG]: done allocating memory, about to denoise.\n"); }
2332                 
2333                 if (m->control_pressed) { break; }
2334                 
2335                 double maxDelta = 0;
2336                 int iter = 0;
2337                 
2338                 begClock = clock();
2339                 begTime = time(NULL);
2340                 
2341                 m->mothurOut("\nDenoising flowgrams...\n");
2342                 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
2343                 
2344                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
2345                     
2346                     if (m->control_pressed) { break; }
2347                     
2348                     double cycClock = clock();
2349                     unsigned long long cycTime = time(NULL);
2350                     fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2351                     
2352                     if (m->control_pressed) { break; }
2353                     
2354                     calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
2355                     
2356                     if (m->control_pressed) { break; }
2357                     
2358                     maxDelta = getNewWeights(numOTUs, cumNumSeqs, nSeqsPerOTU, singleTau, seqNumber, weight);  
2359                     
2360                     if (m->control_pressed) { break; }
2361                     
2362                     double nLL = getLikelihood(numSeqs, numOTUs, nSeqsPerOTU, seqNumber, cumNumSeqs, seqIndex, dist, weight); 
2363                     
2364                     if (m->control_pressed) { break; }
2365                     
2366                     checkCentroids(numOTUs, centroids, weight);
2367                     
2368                     if (m->control_pressed) { break; }
2369                     
2370                     calcNewDistances(numSeqs, numOTUs, nSeqsPerOTU,  dist, weight, change, centroids, aaP, singleTau, aaI, seqNumber, seqIndex, uniqueFlowgrams, flowDataIntI, numFlowCells, lengths);
2371                     
2372                     if (m->control_pressed) { break; }
2373                     
2374                     iter++;
2375                     
2376                     m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
2377                     
2378                 }       
2379                 
2380                 if (m->control_pressed) { break; }
2381                 
2382                 m->mothurOut("\nFinalizing...\n");
2383                 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2384                 
2385                 if (m->control_pressed) { break; }
2386                 
2387                 setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI);
2388                 
2389                 if (m->control_pressed) { break; }
2390                 
2391                 vector<int> otuCounts(numOTUs, 0);
2392                 for(int j=0;j<numSeqs;j++)      {       otuCounts[otuData[j]]++;        }
2393                 
2394                 calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber); 
2395                 
2396                 if (m->control_pressed) { break; }
2397                 
2398                 if ((large) && (g == 0)) {  flowFileName = filenames[i]; theseFlowFileNames[0] = filenames[i]; }
2399                 string thisOutputDir = outputDir;
2400                 if (outputDir == "") {  thisOutputDir = m->hasPath(flowFileName);  }
2401                 map<string, string> variables; 
2402                 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
2403                 string qualityFileName = getOutputFileName("qfile",variables);
2404                 string fastaFileName = getOutputFileName("fasta",variables);
2405                 string nameFileName = getOutputFileName("name",variables);
2406                 string otuCountsFileName = getOutputFileName("counts",variables);
2407                 string fileRoot = m->getRootName(m->getSimpleName(flowFileName));
2408                 int pos = fileRoot.find_first_of('.');
2409                 string fileGroup = fileRoot;
2410                 if (pos != string::npos) {  fileGroup = fileRoot.substr(pos+1, (fileRoot.length()-1-(pos+1)));  }
2411                 string groupFileName = getOutputFileName("group",variables);
2412
2413                 
2414                 writeQualities(numOTUs, numFlowCells, qualityFileName, otuCounts, nSeqsPerOTU, seqNumber, singleTau, flowDataIntI, uniqueFlowgrams, cumNumSeqs, mapUniqueToSeq, seqNameVector, centroids, aaI); if (m->control_pressed) { break; }
2415                 writeSequences(thisCompositeFASTAFileName, numOTUs, numFlowCells, fastaFileName, otuCounts, uniqueFlowgrams, seqNameVector, aaI, centroids);if (m->control_pressed) { break; }
2416                 writeNames(thisCompositeNamesFileName, numOTUs, nameFileName, otuCounts, seqNameVector, aaI, nSeqsPerOTU);                              if (m->control_pressed) { break; }
2417                 writeClusters(otuCountsFileName, numOTUs, numFlowCells,otuCounts, centroids, uniqueFlowgrams, seqNameVector, aaI, nSeqsPerOTU, lengths, flowDataIntI);                  if (m->control_pressed) { break; }
2418                 writeGroups(groupFileName, fileGroup, numSeqs, seqNameVector);                                          if (m->control_pressed) { break; }
2419                 
2420                 if (large) {
2421                     if (g > 0) {
2422                         variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0]));
2423                         m->appendFiles(qualityFileName, getOutputFileName("qfile",variables));
2424                         m->mothurRemove(qualityFileName);
2425                         m->appendFiles(fastaFileName, getOutputFileName("fasta",variables));
2426                         m->mothurRemove(fastaFileName);
2427                         m->appendFiles(nameFileName, getOutputFileName("name",variables));
2428                         m->mothurRemove(nameFileName);
2429                         m->appendFiles(otuCountsFileName, getOutputFileName("counts",variables));
2430                         m->mothurRemove(otuCountsFileName);
2431                         m->appendFiles(groupFileName, getOutputFileName("group",variables));
2432                         m->mothurRemove(groupFileName);
2433                     }
2434                     m->mothurRemove(theseFlowFileNames[g]);
2435                 }
2436                         }
2437             
2438             numCompleted++;
2439                         m->mothurOut("Total time to process " + fileNameForOutput + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
2440                 }
2441                 
2442         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
2443         
2444         return numCompleted;
2445         
2446     }catch(exception& e) {
2447             m->errorOut(e, "ShhherCommand", "driver");
2448             exit(1);
2449     }
2450 }
2451
2452 /**************************************************************************************************/
2453 int ShhherCommand::getFlowData(string filename, vector<string>& thisSeqNameVector, vector<int>& thisLengths, vector<short>& thisFlowDataIntI, map<string, int>& thisNameMap, int& numFlowCells){
2454         try{
2455        
2456                 ifstream flowFile;
2457        
2458                 m->openInputFile(filename, flowFile);
2459                 
2460                 string seqName;
2461                 int currentNumFlowCells;
2462                 float intensity;
2463         thisSeqNameVector.clear();
2464                 thisLengths.clear();
2465                 thisFlowDataIntI.clear();
2466                 thisNameMap.clear();
2467                 
2468                 string numFlowTest;
2469         flowFile >> numFlowTest;
2470         
2471         if (!m->isContainingOnlyDigits(numFlowTest)) { m->mothurOut("[ERROR]: expected a number and got " + numFlowTest + ", quitting. Did you use the flow parameter instead of the file parameter?"); m->mothurOutEndLine(); exit(1); }
2472         else { convert(numFlowTest, numFlowCells); }
2473         
2474         if (m->debug) { m->mothurOut("[DEBUG]: numFlowCells = " + toString(numFlowCells) + ".\n"); }
2475                 int index = 0;//pcluster
2476                 while(!flowFile.eof()){
2477                         
2478                         if (m->control_pressed) { break; }
2479                         
2480                         flowFile >> seqName >> currentNumFlowCells;
2481             
2482                         thisLengths.push_back(currentNumFlowCells);
2483            
2484                         thisSeqNameVector.push_back(seqName);
2485                         thisNameMap[seqName] = index++;//pcluster
2486             
2487             if (m->debug) { m->mothurOut("[DEBUG]: seqName = " + seqName + " length = " + toString(currentNumFlowCells) + " index = " + toString(index) + "\n"); }
2488             
2489                         for(int i=0;i<numFlowCells;i++){
2490                                 flowFile >> intensity;
2491                                 if(intensity > 9.99)    {       intensity = 9.99;       }
2492                                 int intI = int(100 * intensity + 0.0001);
2493                                 thisFlowDataIntI.push_back(intI);
2494                         }
2495                         m->gobble(flowFile);
2496                 }
2497                 flowFile.close();
2498                 
2499                 int numSeqs = thisSeqNameVector.size();         
2500                 
2501                 for(int i=0;i<numSeqs;i++){
2502                         
2503                         if (m->control_pressed) { break; }
2504                         
2505                         int iNumFlowCells = i * numFlowCells;
2506                         for(int j=thisLengths[i];j<numFlowCells;j++){
2507                                 thisFlowDataIntI[iNumFlowCells + j] = 0;
2508                         }
2509                 }
2510         
2511         return numSeqs;
2512                 
2513         }
2514         catch(exception& e) {
2515                 m->errorOut(e, "ShhherCommand", "getFlowData");
2516                 exit(1);
2517         }
2518 }
2519 /**************************************************************************************************/
2520
2521 int ShhherCommand::flowDistParentFork(int numFlowCells, string distFileName, int stopSeq, vector<int>& mapUniqueToSeq, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2522         try{            
2523         
2524                 ostringstream outStream;
2525                 outStream.setf(ios::fixed, ios::floatfield);
2526                 outStream.setf(ios::dec, ios::basefield);
2527                 outStream.setf(ios::showpoint);
2528                 outStream.precision(6);
2529                 
2530                 int begTime = time(NULL);
2531                 double begClock = clock();
2532         
2533                 for(int i=0;i<stopSeq;i++){
2534                         
2535                         if (m->control_pressed) { break; }
2536                         
2537                         for(int j=0;j<i;j++){
2538                                 float flowDistance = calcPairwiseDist(numFlowCells, mapUniqueToSeq[i], mapUniqueToSeq[j], mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
2539                 
2540                                 if(flowDistance < 1e-6){
2541                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
2542                                 }
2543                                 else if(flowDistance <= cutoff){
2544                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
2545                                 }
2546                         }
2547                         if(i % 100 == 0){
2548                                 m->mothurOut(toString(i) + "\t" + toString(time(NULL) - begTime));
2549                                 m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
2550                                 m->mothurOutEndLine();
2551                         }
2552                 }
2553                 
2554                 ofstream distFile(distFileName.c_str());
2555                 distFile << outStream.str();            
2556                 distFile.close();
2557                 
2558                 if (m->control_pressed) {}
2559                 else {
2560                         m->mothurOut(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
2561                         m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
2562                         m->mothurOutEndLine();
2563                 }
2564         
2565         return 0;
2566         }
2567         catch(exception& e) {
2568                 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
2569                 exit(1);
2570         }
2571 }
2572 /**************************************************************************************************/
2573
2574 float ShhherCommand::calcPairwiseDist(int numFlowCells, int seqA, int seqB, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2575         try{
2576                 int minLength = lengths[mapSeqToUnique[seqA]];
2577                 if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
2578                 
2579                 int ANumFlowCells = seqA * numFlowCells;
2580                 int BNumFlowCells = seqB * numFlowCells;
2581                 
2582                 float dist = 0;
2583                 
2584                 for(int i=0;i<minLength;i++){
2585                         
2586                         if (m->control_pressed) { break; }
2587                         
2588                         int flowAIntI = flowDataIntI[ANumFlowCells + i];
2589                         float flowAPrI = flowDataPrI[ANumFlowCells + i];
2590                         
2591                         int flowBIntI = flowDataIntI[BNumFlowCells + i];
2592                         float flowBPrI = flowDataPrI[BNumFlowCells + i];
2593                         dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
2594                 }
2595                 
2596                 dist /= (float) minLength;
2597                 return dist;
2598         }
2599         catch(exception& e) {
2600                 m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
2601                 exit(1);
2602         }
2603 }
2604
2605 /**************************************************************************************************/
2606
2607 int ShhherCommand::getUniques(int numSeqs, int numFlowCells, vector<short>& uniqueFlowgrams, vector<int>& uniqueCount, vector<int>& uniqueLengths, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2608         try{
2609                 int numUniques = 0;
2610                 uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
2611                 uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
2612                 uniqueLengths.assign(numSeqs, 0);
2613                 mapSeqToUnique.assign(numSeqs, -1);
2614                 mapUniqueToSeq.assign(numSeqs, -1);
2615                 
2616                 vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
2617                 
2618                 for(int i=0;i<numSeqs;i++){
2619                         
2620                         if (m->control_pressed) { break; }
2621                         
2622                         int index = 0;
2623                         
2624                         vector<short> current(numFlowCells);
2625                         for(int j=0;j<numFlowCells;j++){
2626                                 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
2627                         }
2628             
2629                         for(int j=0;j<numUniques;j++){
2630                                 int offset = j * numFlowCells;
2631                                 bool toEnd = 1;
2632                                 
2633                                 int shorterLength;
2634                                 if(lengths[i] < uniqueLengths[j])       {       shorterLength = lengths[i];                     }
2635                                 else                                                            {       shorterLength = uniqueLengths[j];       }
2636                 
2637                                 for(int k=0;k<shorterLength;k++){
2638                                         if(current[k] != uniqueFlowgrams[offset + k]){
2639                                                 toEnd = 0;
2640                                                 break;
2641                                         }
2642                                 }
2643                                 
2644                                 if(toEnd){
2645                                         mapSeqToUnique[i] = j;
2646                                         uniqueCount[j]++;
2647                                         index = j;
2648                                         if(lengths[i] > uniqueLengths[j])       {       uniqueLengths[j] = lengths[i];  }
2649                                         break;
2650                                 }
2651                                 index++;
2652                         }
2653                         
2654                         if(index == numUniques){
2655                                 uniqueLengths[numUniques] = lengths[i];
2656                                 uniqueCount[numUniques] = 1;
2657                                 mapSeqToUnique[i] = numUniques;//anMap
2658                                 mapUniqueToSeq[numUniques] = i;//anF
2659                                 
2660                                 for(int k=0;k<numFlowCells;k++){
2661                                         uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
2662                                         uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
2663                                 }
2664                                 
2665                                 numUniques++;
2666                         }
2667                 }
2668                 uniqueFlowDataIntI.resize(numFlowCells * numUniques);
2669                 uniqueLengths.resize(numUniques);       
2670                 
2671                 flowDataPrI.resize(numSeqs * numFlowCells, 0);
2672                 for(int i=0;i<flowDataPrI.size();i++)   {       if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);          }
2673         
2674         return numUniques;
2675         }
2676         catch(exception& e) {
2677                 m->errorOut(e, "ShhherCommand", "getUniques");
2678                 exit(1);
2679         }
2680 }
2681 /**************************************************************************************************/
2682 int ShhherCommand::createNamesFile(int numSeqs, int numUniques, string filename, vector<string>& seqNameVector, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq){
2683         try{
2684                 
2685                 vector<string> duplicateNames(numUniques, "");
2686                 for(int i=0;i<numSeqs;i++){
2687                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
2688                 }
2689                 
2690                 ofstream nameFile;
2691                 m->openOutputFile(filename, nameFile);
2692                 
2693                 for(int i=0;i<numUniques;i++){
2694                         
2695                         if (m->control_pressed) { break; }
2696                         
2697             //                  nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2698                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2699                 }
2700                 
2701                 nameFile.close();
2702         
2703                 return 0;
2704         }
2705         catch(exception& e) {
2706                 m->errorOut(e, "ShhherCommand", "createNamesFile");
2707                 exit(1);
2708         }
2709 }
2710 //**********************************************************************************************************************
2711
2712 int ShhherCommand::cluster(string filename, string distFileName, string namesFileName){
2713         try {
2714                 
2715                 ReadMatrix* read = new ReadColumnMatrix(distFileName);  
2716                 read->setCutoff(cutoff);
2717                 
2718                 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
2719                 clusterNameMap->readMap();
2720                 read->read(clusterNameMap);
2721         
2722                 ListVector* list = read->getListVector();
2723                 SparseDistanceMatrix* matrix = read->getDMatrix();
2724                 
2725                 delete read; 
2726                 delete clusterNameMap; 
2727         
2728                 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
2729                 
2730                 Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); 
2731                 string tag = cluster->getTag();
2732                 
2733                 double clusterCutoff = cutoff;
2734                 while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
2735                         
2736                         if (m->control_pressed) { break; }
2737                         
2738                         cluster->update(clusterCutoff);
2739                 }
2740                 
2741                 list->setLabel(toString(cutoff));
2742                 
2743                 ofstream listFile;
2744                 m->openOutputFile(filename, listFile);
2745                 list->print(listFile);
2746                 listFile.close();
2747                 
2748                 delete matrix;  delete cluster; delete rabund; delete list;
2749         
2750                 return 0;
2751         }
2752         catch(exception& e) {
2753                 m->errorOut(e, "ShhherCommand", "cluster");
2754                 exit(1);        
2755         }               
2756 }
2757 /**************************************************************************************************/
2758
2759 int ShhherCommand::getOTUData(int numSeqs, string fileName,  vector<int>& otuData,
2760                                vector<int>& cumNumSeqs,
2761                                vector<int>& nSeqsPerOTU,
2762                                vector<vector<int> >& aaP,       //tMaster->aanP:        each row is a different otu / each col contains the sequence indices
2763                                vector<vector<int> >& aaI,       //tMaster->aanI:        that are in each otu - can't differentiate between aaP and aaI 
2764                                vector<int>& seqNumber,          //tMaster->anP:         the sequence id number sorted by OTU
2765                                vector<int>& seqIndex,
2766                                map<string, int>& nameMap){
2767         try {
2768         
2769                 ifstream listFile;
2770                 m->openInputFile(fileName, listFile);
2771                 string label;
2772         int numOTUs;
2773                 
2774                 listFile >> label >> numOTUs;
2775         
2776         if (m->debug) { m->mothurOut("[DEBUG]: Getting OTU Data...\n"); }
2777         
2778                 otuData.assign(numSeqs, 0);
2779                 cumNumSeqs.assign(numOTUs, 0);
2780                 nSeqsPerOTU.assign(numOTUs, 0);
2781                 aaP.clear();aaP.resize(numOTUs);
2782                 
2783                 seqNumber.clear();
2784                 aaI.clear();
2785                 seqIndex.clear();
2786                 
2787                 string singleOTU = "";
2788                 
2789                 for(int i=0;i<numOTUs;i++){
2790                         
2791                         if (m->control_pressed) { break; }
2792             if (m->debug) { m->mothurOut("[DEBUG]: processing OTU " + toString(i) + ".\n"); }
2793             
2794                         listFile >> singleOTU;
2795                         
2796                         istringstream otuString(singleOTU);
2797             
2798                         while(otuString){
2799                                 
2800                                 string seqName = "";
2801                                 
2802                                 for(int j=0;j<singleOTU.length();j++){
2803                                         char letter = otuString.get();
2804                                         
2805                                         if(letter != ','){
2806                                                 seqName += letter;
2807                                         }
2808                                         else{
2809                                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
2810                                                 int index = nmIt->second;
2811                                                 
2812                                                 nameMap.erase(nmIt);
2813                                                 
2814                                                 otuData[index] = i;
2815                                                 nSeqsPerOTU[i]++;
2816                                                 aaP[i].push_back(index);
2817                                                 seqName = "";
2818                                         }
2819                                 }
2820                                 
2821                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
2822                 
2823                                 int index = nmIt->second;
2824                                 nameMap.erase(nmIt);
2825                 
2826                                 otuData[index] = i;
2827                                 nSeqsPerOTU[i]++;
2828                                 aaP[i].push_back(index);        
2829                                 
2830                                 otuString.get();
2831                         }
2832                         
2833                         sort(aaP[i].begin(), aaP[i].end());
2834                         for(int j=0;j<nSeqsPerOTU[i];j++){
2835                                 seqNumber.push_back(aaP[i][j]);
2836                         }
2837                         for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
2838                                 aaP[i].push_back(0);
2839                         }
2840                         
2841                         
2842                 }
2843                 
2844                 for(int i=1;i<numOTUs;i++){
2845                         cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
2846                 }
2847                 aaI = aaP;
2848                 seqIndex = seqNumber;
2849                 
2850                 listFile.close();       
2851         
2852         return numOTUs;
2853                 
2854         }
2855         catch(exception& e) {
2856                 m->errorOut(e, "ShhherCommand", "getOTUData");
2857                 exit(1);        
2858         }               
2859 }
2860 /**************************************************************************************************/
2861
2862 int ShhherCommand::calcCentroidsDriver(int numOTUs, 
2863                                           vector<int>& cumNumSeqs,
2864                                           vector<int>& nSeqsPerOTU,
2865                                           vector<int>& seqIndex,
2866                                           vector<short>& change,                //did the centroid sequence change? 0 = no; 1 = yes
2867                                           vector<int>& centroids,               //the representative flowgram for each cluster m
2868                                           vector<double>& singleTau,    //tMaster->adTau:       1-D Tau vector (1xnumSeqs)
2869                                           vector<int>& mapSeqToUnique,
2870                                           vector<short>& uniqueFlowgrams,
2871                                           vector<short>& flowDataIntI,
2872                                           vector<int>& lengths,
2873                                           int numFlowCells,
2874                                           vector<int>& seqNumber){                          
2875         
2876         //this function gets the most likely homopolymer length at a flow position for a group of sequences
2877         //within an otu
2878         
2879         try{
2880                 
2881                 for(int i=0;i<numOTUs;i++){
2882                         
2883                         if (m->control_pressed) { break; }
2884                         
2885                         double count = 0;
2886                         int position = 0;
2887                         int minFlowGram = 100000000;
2888                         double minFlowValue = 1e8;
2889                         change[i] = 0; //FALSE
2890                         
2891                         for(int j=0;j<nSeqsPerOTU[i];j++){
2892                                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
2893                         }
2894             
2895                         if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
2896                                 vector<double> adF(nSeqsPerOTU[i]);
2897                                 vector<int> anL(nSeqsPerOTU[i]);
2898                                 
2899                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2900                                         int index = cumNumSeqs[i] + j;
2901                                         int nI = seqIndex[index];
2902                                         int nIU = mapSeqToUnique[nI];
2903                                         
2904                                         int k;
2905                                         for(k=0;k<position;k++){
2906                                                 if(nIU == anL[k]){
2907                                                         break;
2908                                                 }
2909                                         }
2910                                         if(k == position){
2911                                                 anL[position] = nIU;
2912                                                 adF[position] = 0.0000;
2913                                                 position++;
2914                                         }                                               
2915                                 }
2916                                 
2917                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2918                                         int index = cumNumSeqs[i] + j;
2919                                         int nI = seqIndex[index];
2920                                         
2921                                         double tauValue = singleTau[seqNumber[index]];
2922                                         
2923                                         for(int k=0;k<position;k++){
2924                                                 double dist = getDistToCentroid(anL[k], nI, lengths[nI], uniqueFlowgrams, flowDataIntI, numFlowCells);
2925                                                 adF[k] += dist * tauValue;
2926                                         }
2927                                 }
2928                                 
2929                                 for(int j=0;j<position;j++){
2930                                         if(adF[j] < minFlowValue){
2931                                                 minFlowGram = j;
2932                                                 minFlowValue = adF[j];
2933                                         }
2934                                 }
2935                                 
2936                                 if(centroids[i] != anL[minFlowGram]){
2937                                         change[i] = 1;
2938                                         centroids[i] = anL[minFlowGram];
2939                                 }
2940                         }
2941                         else if(centroids[i] != -1){
2942                                 change[i] = 1;
2943                                 centroids[i] = -1;                      
2944                         }
2945                 }
2946         
2947         return 0;
2948         }
2949         catch(exception& e) {
2950                 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
2951                 exit(1);        
2952         }               
2953 }
2954 /**************************************************************************************************/
2955
2956 double ShhherCommand::getDistToCentroid(int cent, int flow, int length, vector<short>& uniqueFlowgrams,
2957                                         vector<short>& flowDataIntI, int numFlowCells){
2958         try{
2959                 
2960                 int flowAValue = cent * numFlowCells;
2961                 int flowBValue = flow * numFlowCells;
2962                 
2963                 double dist = 0;
2964         
2965                 for(int i=0;i<length;i++){
2966                         dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
2967                         flowAValue++;
2968                         flowBValue++;
2969                 }
2970                 
2971                 return dist / (double)length;
2972         }
2973         catch(exception& e) {
2974                 m->errorOut(e, "ShhherCommand", "getDistToCentroid");
2975                 exit(1);        
2976         }               
2977 }
2978 /**************************************************************************************************/
2979
2980 double ShhherCommand::getNewWeights(int numOTUs, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<double>& singleTau, vector<int>& seqNumber, vector<double>& weight){
2981         try{
2982                 
2983                 double maxChange = 0;
2984                 
2985                 for(int i=0;i<numOTUs;i++){
2986                         
2987                         if (m->control_pressed) { break; }
2988                         
2989                         double difference = weight[i];
2990                         weight[i] = 0;
2991                         
2992                         for(int j=0;j<nSeqsPerOTU[i];j++){
2993                                 int index = cumNumSeqs[i] + j;
2994                                 double tauValue = singleTau[seqNumber[index]];
2995                                 weight[i] += tauValue;
2996                         }
2997                         
2998                         difference = fabs(weight[i] - difference);
2999                         if(difference > maxChange){     maxChange = difference; }
3000                 }
3001                 return maxChange;
3002         }
3003         catch(exception& e) {
3004                 m->errorOut(e, "ShhherCommand", "getNewWeights");
3005                 exit(1);        
3006         }               
3007 }
3008
3009 /**************************************************************************************************/
3010
3011 double ShhherCommand::getLikelihood(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<int>& seqNumber, vector<int>& cumNumSeqs, vector<int>& seqIndex, vector<double>& dist, vector<double>& weight){
3012         
3013         try{
3014                 
3015                 vector<long double> P(numSeqs, 0);
3016                 int effNumOTUs = 0;
3017                 
3018                 for(int i=0;i<numOTUs;i++){
3019                         if(weight[i] > MIN_WEIGHT){
3020                                 effNumOTUs++;
3021                         }
3022                 }
3023                 
3024                 string hold;
3025                 for(int i=0;i<numOTUs;i++){
3026                         
3027                         if (m->control_pressed) { break; }
3028                         
3029                         for(int j=0;j<nSeqsPerOTU[i];j++){
3030                                 int index = cumNumSeqs[i] + j;
3031                                 int nI = seqIndex[index];
3032                                 double singleDist = dist[seqNumber[index]];
3033                                 
3034                                 P[nI] += weight[i] * exp(-singleDist * sigma);
3035                         }
3036                 }
3037                 double nLL = 0.00;
3038                 for(int i=0;i<numSeqs;i++){
3039                         if(P[i] == 0){  P[i] = DBL_EPSILON;     }
3040             
3041                         nLL += -log(P[i]);
3042                 }
3043                 
3044                 nLL = nLL -(double)numSeqs * log(sigma);
3045         
3046                 return nLL; 
3047         }
3048         catch(exception& e) {
3049                 m->errorOut(e, "ShhherCommand", "getNewWeights");
3050                 exit(1);        
3051         }               
3052 }
3053
3054 /**************************************************************************************************/
3055
3056 int ShhherCommand::checkCentroids(int numOTUs, vector<int>& centroids, vector<double>& weight){
3057         try{
3058                 vector<int> unique(numOTUs, 1);
3059                 
3060                 for(int i=0;i<numOTUs;i++){
3061                         if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
3062                                 unique[i] = -1;
3063                         }
3064                 }
3065                 
3066                 for(int i=0;i<numOTUs;i++){
3067                         
3068                         if (m->control_pressed) { break; }
3069                         
3070                         if(unique[i] == 1){
3071                                 for(int j=i+1;j<numOTUs;j++){
3072                                         if(unique[j] == 1){
3073                                                 
3074                                                 if(centroids[j] == centroids[i]){
3075                                                         unique[j] = 0;
3076                                                         centroids[j] = -1;
3077                                                         
3078                                                         weight[i] += weight[j];
3079                                                         weight[j] = 0.0;
3080                                                 }
3081                                         }
3082                                 }
3083                         }
3084                 }
3085         
3086         return 0;
3087         }
3088         catch(exception& e) {
3089                 m->errorOut(e, "ShhherCommand", "checkCentroids");
3090                 exit(1);        
3091         }               
3092 }
3093 /**************************************************************************************************/
3094
3095 void ShhherCommand::calcNewDistances(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<double>& dist, 
3096                                      vector<double>& weight, vector<short>& change, vector<int>& centroids,
3097                                      vector<vector<int> >& aaP, vector<double>& singleTau, vector<vector<int> >& aaI,   
3098                                      vector<int>& seqNumber, vector<int>& seqIndex,
3099                                      vector<short>& uniqueFlowgrams,
3100                                      vector<short>& flowDataIntI, int numFlowCells, vector<int>& lengths){
3101         
3102         try{
3103                 
3104                 int total = 0;
3105                 vector<double> newTau(numOTUs,0);
3106                 vector<double> norms(numSeqs, 0);
3107                 nSeqsPerOTU.assign(numOTUs, 0);
3108         
3109                 for(int i=0;i<numSeqs;i++){
3110                         
3111                         if (m->control_pressed) { break; }
3112                         
3113                         int indexOffset = i * numOTUs;
3114             
3115                         double offset = 1e8;
3116                         
3117                         for(int j=0;j<numOTUs;j++){
3118                 
3119                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
3120                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i], uniqueFlowgrams, flowDataIntI, numFlowCells);
3121                                 }
3122                 
3123                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
3124                                         offset = dist[indexOffset + j];
3125                                 }
3126                         }
3127             
3128                         for(int j=0;j<numOTUs;j++){
3129                                 if(weight[j] > MIN_WEIGHT){
3130                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
3131                                         norms[i] += newTau[j];
3132                                 }
3133                                 else{
3134                                         newTau[j] = 0.0;
3135                                 }
3136                         }
3137             
3138                         for(int j=0;j<numOTUs;j++){
3139                                 newTau[j] /= norms[i];
3140                         }
3141             
3142                         for(int j=0;j<numOTUs;j++){
3143                                 if(newTau[j] > MIN_TAU){
3144                                         
3145                                         int oldTotal = total;
3146                                         
3147                                         total++;
3148                                         
3149                                         singleTau.resize(total, 0);
3150                                         seqNumber.resize(total, 0);
3151                                         seqIndex.resize(total, 0);
3152                                         
3153                                         singleTau[oldTotal] = newTau[j];
3154                                         
3155                                         aaP[j][nSeqsPerOTU[j]] = oldTotal;
3156                                         aaI[j][nSeqsPerOTU[j]] = i;
3157                                         nSeqsPerOTU[j]++;
3158                                 }
3159                         }
3160             
3161                 }
3162         
3163         }
3164         catch(exception& e) {
3165                 m->errorOut(e, "ShhherCommand", "calcNewDistances");
3166                 exit(1);        
3167         }               
3168 }
3169 /**************************************************************************************************/
3170
3171 int ShhherCommand::fill(int numOTUs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
3172         try {
3173                 int index = 0;
3174                 for(int i=0;i<numOTUs;i++){
3175                         
3176                         if (m->control_pressed) { return 0; }
3177                         
3178                         cumNumSeqs[i] = index;
3179                         for(int j=0;j<nSeqsPerOTU[i];j++){
3180                                 seqNumber[index] = aaP[i][j];
3181                                 seqIndex[index] = aaI[i][j];
3182                                 
3183                                 index++;
3184                         }
3185                 }
3186         
3187         return 0; 
3188         }
3189         catch(exception& e) {
3190                 m->errorOut(e, "ShhherCommand", "fill");
3191                 exit(1);        
3192         }               
3193 }
3194 /**************************************************************************************************/
3195
3196 void ShhherCommand::setOTUs(int numOTUs, int numSeqs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU,
3197                             vector<int>& otuData, vector<double>& singleTau, vector<double>& dist, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
3198         
3199         try {
3200                 vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
3201                 
3202                 for(int i=0;i<numOTUs;i++){
3203                         
3204                         if (m->control_pressed) { break; }
3205                         
3206                         for(int j=0;j<nSeqsPerOTU[i];j++){
3207                                 int index = cumNumSeqs[i] + j;
3208                                 double tauValue = singleTau[seqNumber[index]];
3209                                 int sIndex = seqIndex[index];
3210                                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
3211                         }
3212                 }
3213                 
3214                 for(int i=0;i<numSeqs;i++){
3215                         double maxTau = -1.0000;
3216                         int maxOTU = -1;
3217                         for(int j=0;j<numOTUs;j++){
3218                                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
3219                                         maxTau = bigTauMatrix[i * numOTUs + j];
3220                                         maxOTU = j;
3221                                 }
3222                         }
3223                         
3224                         otuData[i] = maxOTU;
3225                 }
3226                 
3227                 nSeqsPerOTU.assign(numOTUs, 0);         
3228                 
3229                 for(int i=0;i<numSeqs;i++){
3230                         int index = otuData[i];
3231                         
3232                         singleTau[i] = 1.0000;
3233                         dist[i] = 0.0000;
3234                         
3235                         aaP[index][nSeqsPerOTU[index]] = i;
3236                         aaI[index][nSeqsPerOTU[index]] = i;
3237                         
3238                         nSeqsPerOTU[index]++;
3239                 }
3240         
3241                 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);  
3242         }
3243         catch(exception& e) {
3244                 m->errorOut(e, "ShhherCommand", "setOTUs");
3245                 exit(1);        
3246         }               
3247 }
3248 /**************************************************************************************************/
3249
3250 void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string qualityFileName, vector<int> otuCounts, vector<int>& nSeqsPerOTU, vector<int>& seqNumber,
3251                                    vector<double>& singleTau, vector<short>& flowDataIntI, vector<short>& uniqueFlowgrams, vector<int>& cumNumSeqs,
3252                                    vector<int>& mapUniqueToSeq, vector<string>& seqNameVector, vector<int>& centroids, vector<vector<int> >& aaI){
3253         
3254         try {
3255         
3256                 ofstream qualityFile;
3257                 m->openOutputFile(qualityFileName, qualityFile);
3258         
3259                 qualityFile.setf(ios::fixed, ios::floatfield);
3260                 qualityFile.setf(ios::showpoint);
3261                 qualityFile << setprecision(6);
3262                 
3263                 vector<vector<int> > qualities(numOTUs);
3264                 vector<double> pr(HOMOPS, 0);
3265                 
3266                 
3267                 for(int i=0;i<numOTUs;i++){
3268                         
3269                         if (m->control_pressed) { break; }
3270                         
3271                         int index = 0;
3272                         int base = 0;
3273                         
3274                         if(nSeqsPerOTU[i] > 0){
3275                                 qualities[i].assign(1024, -1);
3276                                 
3277                                 while(index < numFlowCells){
3278                                         double maxPrValue = 1e8;
3279                                         short maxPrIndex = -1;
3280                                         double count = 0.0000;
3281                                         
3282                                         pr.assign(HOMOPS, 0);
3283                                         
3284                                         for(int j=0;j<nSeqsPerOTU[i];j++){
3285                                                 int lIndex = cumNumSeqs[i] + j;
3286                                                 double tauValue = singleTau[seqNumber[lIndex]];
3287                                                 int sequenceIndex = aaI[i][j];
3288                                                 short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
3289                                                 
3290                                                 count += tauValue;
3291                                                 
3292                                                 for(int s=0;s<HOMOPS;s++){
3293                                                         pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
3294                                                 }
3295                                         }
3296                                         
3297                                         maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
3298                                         maxPrValue = pr[maxPrIndex];
3299                                         
3300                                         if(count > MIN_COUNT){
3301                                                 double U = 0.0000;
3302                                                 double norm = 0.0000;
3303                                                 
3304                                                 for(int s=0;s<HOMOPS;s++){
3305                                                         norm += exp(-(pr[s] - maxPrValue));
3306                                                 }
3307                                                 
3308                                                 for(int s=1;s<=maxPrIndex;s++){
3309                                                         int value = 0;
3310                                                         double temp = 0.0000;
3311                                                         
3312                                                         U += exp(-(pr[s-1]-maxPrValue))/norm;
3313                                                         
3314                                                         if(U>0.00){
3315                                                                 temp = log10(U);
3316                                                         }
3317                                                         else{
3318                                                                 temp = -10.1;
3319                                                         }
3320                                                         temp = floor(-10 * temp);
3321                                                         value = (int)floor(temp);
3322                                                         if(value > 100){        value = 100;    }
3323                                                         
3324                                                         qualities[i][base] = (int)value;
3325                                                         base++;
3326                                                 }
3327                                         }
3328                                         
3329                                         index++;
3330                                 }
3331                         }
3332                         
3333                         
3334                         if(otuCounts[i] > 0){
3335                                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
3336                         
3337                                 int j=4;        //need to get past the first four bases
3338                                 while(qualities[i][j] != -1){
3339                     qualityFile << qualities[i][j] << ' ';
3340                     if (j > qualities[i].size()) { break; }
3341                     j++;
3342                                 }
3343                                 qualityFile << endl;
3344                         }
3345                 }
3346                 qualityFile.close();
3347                 outputNames.push_back(qualityFileName); outputTypes["qfile"].push_back(qualityFileName);
3348         
3349         }
3350         catch(exception& e) {
3351                 m->errorOut(e, "ShhherCommand", "writeQualities");
3352                 exit(1);        
3353         }               
3354 }
3355
3356 /**************************************************************************************************/
3357
3358 void ShhherCommand::writeSequences(string thisCompositeFASTAFileName, int numOTUs, int numFlowCells, string fastaFileName, vector<int> otuCounts, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& centroids){
3359         try {
3360                 
3361                 ofstream fastaFile;
3362                 m->openOutputFile(fastaFileName, fastaFile);
3363                 
3364                 vector<string> names(numOTUs, "");
3365                 
3366                 for(int i=0;i<numOTUs;i++){
3367                         
3368                         if (m->control_pressed) { break; }
3369                         
3370                         int index = centroids[i];
3371                         
3372                         if(otuCounts[i] > 0){
3373                                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
3374                                 
3375                                 string newSeq = "";
3376                                 
3377                                 for(int j=0;j<numFlowCells;j++){
3378                                         
3379                                         char base = flowOrder[j % flowOrder.length()];
3380                                         for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
3381                                                 newSeq += base;
3382                                         }
3383                                 }
3384                                 
3385                                 if (newSeq.length() >= 4) {  fastaFile << newSeq.substr(4) << endl;  }
3386                 else {  fastaFile << "NNNN" << endl;  }
3387                         }
3388                 }
3389                 fastaFile.close();
3390         
3391                 outputNames.push_back(fastaFileName); outputTypes["fasta"].push_back(fastaFileName);
3392         
3393                 if(thisCompositeFASTAFileName != ""){
3394                         m->appendFiles(fastaFileName, thisCompositeFASTAFileName);
3395                 }
3396         }
3397         catch(exception& e) {
3398                 m->errorOut(e, "ShhherCommand", "writeSequences");
3399                 exit(1);        
3400         }               
3401 }
3402
3403 /**************************************************************************************************/
3404
3405 void ShhherCommand::writeNames(string thisCompositeNamesFileName, int numOTUs, string nameFileName, vector<int> otuCounts, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU){
3406         try {
3407                 
3408                 ofstream nameFile;
3409                 m->openOutputFile(nameFileName, nameFile);
3410                 
3411                 for(int i=0;i<numOTUs;i++){
3412                         
3413                         if (m->control_pressed) { break; }
3414                         
3415                         if(otuCounts[i] > 0){
3416                                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
3417                                 
3418                                 for(int j=1;j<nSeqsPerOTU[i];j++){
3419                                         nameFile << ',' << seqNameVector[aaI[i][j]];
3420                                 }
3421                                 
3422                                 nameFile << endl;
3423                         }
3424                 }
3425                 nameFile.close();
3426                 outputNames.push_back(nameFileName); outputTypes["name"].push_back(nameFileName);
3427                 
3428                 
3429                 if(thisCompositeNamesFileName != ""){
3430                         m->appendFiles(nameFileName, thisCompositeNamesFileName);
3431                 }               
3432         }
3433         catch(exception& e) {
3434                 m->errorOut(e, "ShhherCommand", "writeNames");
3435                 exit(1);        
3436         }               
3437 }
3438
3439 /**************************************************************************************************/
3440
3441 void ShhherCommand::writeGroups(string groupFileName, string fileRoot, int numSeqs, vector<string>& seqNameVector){
3442         try {
3443         ofstream groupFile;
3444                 m->openOutputFile(groupFileName, groupFile);
3445                 
3446                 for(int i=0;i<numSeqs;i++){
3447                         if (m->control_pressed) { break; }
3448                         groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
3449                 }
3450                 groupFile.close();
3451                 outputNames.push_back(groupFileName); outputTypes["group"].push_back(groupFileName);
3452         
3453         }
3454         catch(exception& e) {
3455                 m->errorOut(e, "ShhherCommand", "writeGroups");
3456                 exit(1);        
3457         }               
3458 }
3459
3460 /**************************************************************************************************/
3461
3462 void ShhherCommand::writeClusters(string otuCountsFileName, int numOTUs, int numFlowCells, vector<int> otuCounts, vector<int>& centroids, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU, vector<int>& lengths, vector<short>& flowDataIntI){
3463         try {
3464                 ofstream otuCountsFile;
3465                 m->openOutputFile(otuCountsFileName, otuCountsFile);
3466                 
3467                 string bases = flowOrder;
3468                 
3469                 for(int i=0;i<numOTUs;i++){
3470                         
3471                         if (m->control_pressed) {
3472                                 break;
3473                         }
3474                         //output the translated version of the centroid sequence for the otu
3475                         if(otuCounts[i] > 0){
3476                                 int index = centroids[i];
3477                                 
3478                                 otuCountsFile << "ideal\t";
3479                                 for(int j=8;j<numFlowCells;j++){
3480                                         char base = bases[j % bases.length()];
3481                                         for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
3482                                                 otuCountsFile << base;
3483                                         }
3484                                 }
3485                                 otuCountsFile << endl;
3486                                 
3487                                 for(int j=0;j<nSeqsPerOTU[i];j++){
3488                                         int sequence = aaI[i][j];
3489                                         otuCountsFile << seqNameVector[sequence] << '\t';
3490                                         
3491                                         string newSeq = "";
3492                                         
3493                                         for(int k=0;k<lengths[sequence];k++){
3494                                                 char base = bases[k % bases.length()];
3495                                                 int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
3496                         
3497                                                 for(int s=0;s<freq;s++){
3498                                                         newSeq += base;
3499                                                         //otuCountsFile << base;
3500                                                 }
3501                                         }
3502                                         
3503                     if (newSeq.length() >= 4) {  otuCountsFile << newSeq.substr(4) << endl;  }
3504                     else {  otuCountsFile << "NNNN" << endl;  }
3505                                 }
3506                                 otuCountsFile << endl;
3507                         }
3508                 }
3509                 otuCountsFile.close();
3510                 outputNames.push_back(otuCountsFileName); outputTypes["counts"].push_back(otuCountsFileName);
3511         
3512         }
3513         catch(exception& e) {
3514                 m->errorOut(e, "ShhherCommand", "writeClusters");
3515                 exit(1);        
3516         }               
3517 }
3518
3519 /**************************************************************************************************/
3520
3521 void ShhherCommand::getSingleLookUp(){
3522         try{
3523                 //      these are the -log probabilities that a signal corresponds to a particular homopolymer length
3524                 singleLookUp.assign(HOMOPS * NUMBINS, 0);
3525                 
3526                 int index = 0;
3527                 ifstream lookUpFile;
3528                 m->openInputFile(lookupFileName, lookUpFile);
3529                 
3530                 for(int i=0;i<HOMOPS;i++){
3531                         
3532                         if (m->control_pressed) { break; }
3533                         
3534                         float logFracFreq;
3535                         lookUpFile >> logFracFreq;
3536                         
3537                         for(int j=0;j<NUMBINS;j++)      {
3538                                 lookUpFile >> singleLookUp[index];
3539                                 index++;
3540                         }
3541                 }       
3542                 lookUpFile.close();
3543         }
3544         catch(exception& e) {
3545                 m->errorOut(e, "ShhherCommand", "getSingleLookUp");
3546                 exit(1);
3547         }
3548 }
3549
3550 /**************************************************************************************************/
3551
3552 void ShhherCommand::getJointLookUp(){
3553         try{
3554                 
3555                 //      the most likely joint probability (-log) that two intenities have the same polymer length
3556                 jointLookUp.resize(NUMBINS * NUMBINS, 0);
3557                 
3558                 for(int i=0;i<NUMBINS;i++){
3559                         
3560                         if (m->control_pressed) { break; }
3561                         
3562                         for(int j=0;j<NUMBINS;j++){             
3563                                 
3564                                 double minSum = 100000000;
3565                                 
3566                                 for(int k=0;k<HOMOPS;k++){
3567                                         double sum = singleLookUp[k * NUMBINS + i] + singleLookUp[k * NUMBINS + j];
3568                                         
3569                                         if(sum < minSum)        {       minSum = sum;           }
3570                                 }       
3571                                 jointLookUp[i * NUMBINS + j] = minSum;
3572                         }
3573                 }
3574         }
3575         catch(exception& e) {
3576                 m->errorOut(e, "ShhherCommand", "getJointLookUp");
3577                 exit(1);
3578         }
3579 }
3580
3581 /**************************************************************************************************/
3582
3583 double ShhherCommand::getProbIntensity(int intIntensity){                          
3584         try{
3585
3586                 double minNegLogProb = 100000000; 
3587
3588                 
3589                 for(int i=0;i<HOMOPS;i++){//loop signal strength
3590                         
3591                         if (m->control_pressed) { break; }
3592                         
3593                         float negLogProb = singleLookUp[i * NUMBINS + intIntensity];
3594                         if(negLogProb < minNegLogProb)  {       minNegLogProb = negLogProb; }
3595                 }
3596                 
3597                 return minNegLogProb;
3598         }
3599         catch(exception& e) {
3600                 m->errorOut(e, "ShhherCommand", "getProbIntensity");
3601                 exit(1);
3602         }
3603 }
3604
3605
3606
3607