]> git.donarmstrong.com Git - mothur.git/blob - shhhercommand.cpp
added modify names parameter to set.dir
[mothur.git] / shhhercommand.cpp
1 /*
2  *  shhher.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 12/27/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "shhhercommand.h"
11
12 //**********************************************************************************************************************
13 vector<string> ShhherCommand::setParameters(){  
14         try {
15                 CommandParameter pflow("flow", "InputTypes", "", "", "none", "fileflow", "none","fasta-name-group-counts-qfile",false,false,true); parameters.push_back(pflow);
16                 CommandParameter pfile("file", "InputTypes", "", "", "none", "fileflow", "none","fasta-name-group-counts-qfile",false,false,true); parameters.push_back(pfile);
17                 CommandParameter plookup("lookup", "InputTypes", "", "", "none", "none", "none","",false,false,true); parameters.push_back(plookup);
18                 CommandParameter pcutoff("cutoff", "Number", "", "0.01", "", "", "","",false,false); parameters.push_back(pcutoff);
19                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
20                 CommandParameter pmaxiter("maxiter", "Number", "", "1000", "", "", "","",false,false); parameters.push_back(pmaxiter);
21         CommandParameter plarge("large", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(plarge);
22                 CommandParameter psigma("sigma", "Number", "", "60", "", "", "","",false,false); parameters.push_back(psigma);
23                 CommandParameter pmindelta("mindelta", "Number", "", "0.000001", "", "", "","",false,false); parameters.push_back(pmindelta);
24         CommandParameter porder("order", "Multiple", "A-B-I", "A", "", "", "","",false,false, true); parameters.push_back(porder);              CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
25                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
26                 
27                 vector<string> myArray;
28                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
29                 return myArray;
30         }
31         catch(exception& e) {
32                 m->errorOut(e, "ShhherCommand", "setParameters");
33                 exit(1);
34         }
35 }
36 //**********************************************************************************************************************
37 string ShhherCommand::getHelpString(){  
38         try {
39                 string helpString = "";
40                 helpString += "The shhh.flows command reads a file containing flowgrams and creates a file of corrected sequences.\n";
41         helpString += "The shhh.flows command parameters are flow, file, lookup, cutoff, processors, large, maxiter, sigma, mindelta and order.\n";
42         helpString += "The flow parameter is used to input your flow file.\n";
43         helpString += "The file parameter is used to input the *flow.files file created by trim.flows.\n";
44         helpString += "The lookup parameter is used specify the lookup file you would like to use. http://www.mothur.org/wiki/Lookup_files.\n";
45         helpString += "The order parameter options are A, B or I.  Default=A. A = TACG and B = TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC and I = TACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGC.\n";
46                 return helpString;
47         }
48         catch(exception& e) {
49                 m->errorOut(e, "ShhherCommand", "getHelpString");
50                 exit(1);
51         }
52 }
53 //**********************************************************************************************************************
54 string ShhherCommand::getOutputPattern(string type) {
55     try {
56         string pattern = "";
57         
58         if (type == "fasta")            {   pattern = "[filename],shhh.fasta";   }
59         else if (type == "name")    {   pattern = "[filename],shhh.names";   }
60         else if (type == "group")        {   pattern = "[filename],shhh.groups";   }
61         else if (type == "counts")        {   pattern = "[filename],shhh.counts";   }
62         else if (type == "qfile")        {   pattern = "[filename],shhh.qual";   }
63         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
64         
65         return pattern;
66     }
67     catch(exception& e) {
68         m->errorOut(e, "ShhherCommand", "getOutputPattern");
69         exit(1);
70     }
71 }
72 //**********************************************************************************************************************
73
74 ShhherCommand::ShhherCommand(){ 
75         try {
76                 abort = true; calledHelp = true;
77                 setParameters();
78                 
79                 //initialize outputTypes
80                 vector<string> tempOutNames;
81                 outputTypes["fasta"] = tempOutNames;
82         outputTypes["name"] = tempOutNames;
83         outputTypes["group"] = tempOutNames;
84         outputTypes["counts"] = tempOutNames;
85         outputTypes["qfile"] = tempOutNames;
86
87         }
88         catch(exception& e) {
89                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
90                 exit(1);
91         }
92 }
93
94 //**********************************************************************************************************************
95
96 ShhherCommand::ShhherCommand(string option) {
97         try {
98         
99 #ifdef USE_MPI
100                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
101                 MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
102         
103                 if(pid == 0){
104 #endif
105                 abort = false; calledHelp = false;   
106                 
107                 //allow user to run help
108                 if(option == "help") { help(); abort = true; calledHelp = true; }
109                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
110                 
111                 else {
112                         vector<string> myArray = setParameters();
113                         
114                         OptionParser parser(option);
115                         map<string,string> parameters = parser.getParameters();
116                         
117                         ValidParameters validParameter;
118                         map<string,string>::iterator it;
119                         
120                         //check to make sure all parameters are valid for command
121                         for (it = parameters.begin(); it != parameters.end(); it++) { 
122                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
123                         }
124                         
125                         //initialize outputTypes
126             vector<string> tempOutNames;
127             outputTypes["fasta"] = tempOutNames;
128             outputTypes["name"] = tempOutNames;
129             outputTypes["group"] = tempOutNames;
130             outputTypes["counts"] = tempOutNames;
131             outputTypes["qfile"] = tempOutNames;
132
133                         
134                         //if the user changes the input directory command factory will send this info to us in the output parameter 
135                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
136                         if (inputDir == "not found"){   inputDir = "";          }
137                         else {
138                                 string path;
139                                 it = parameters.find("flow");
140                                 //user has given a template file
141                                 if(it != parameters.end()){ 
142                                         path = m->hasPath(it->second);
143                                         //if the user has not given a path then, add inputdir. else leave path alone.
144                                         if (path == "") {       parameters["flow"] = inputDir + it->second;             }
145                                 }
146                                 
147                                 it = parameters.find("lookup");
148                                 //user has given a template file
149                                 if(it != parameters.end()){ 
150                                         path = m->hasPath(it->second);
151                                         //if the user has not given a path then, add inputdir. else leave path alone.
152                                         if (path == "") {       parameters["lookup"] = inputDir + it->second;           }
153                                 }
154
155                                 it = parameters.find("file");
156                                 //user has given a template file
157                                 if(it != parameters.end()){ 
158                                         path = m->hasPath(it->second);
159                                         //if the user has not given a path then, add inputdir. else leave path alone.
160                                         if (path == "") {       parameters["file"] = inputDir + it->second;             }
161                                 }
162                         }
163             
164             //if the user changes the output directory command factory will send this info to us in the output parameter 
165                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = ""; }
166             
167                         //check for required parameters
168                         flowFileName = validParameter.validFile(parameters, "flow", true);
169                         flowFilesFileName = validParameter.validFile(parameters, "file", true);
170                         if (flowFileName == "not found" && flowFilesFileName == "not found") {
171                                 m->mothurOut("values for either flow or file must be provided for the shhh.flows command.");
172                                 m->mothurOutEndLine();
173                                 abort = true; 
174                         }
175                         else if (flowFileName == "not open" || flowFilesFileName == "not open") { abort = true; }
176                         
177                         if(flowFileName != "not found"){
178                                 compositeFASTAFileName = "";    
179                                 compositeNamesFileName = "";    
180                         }
181                         else{
182                                 ofstream temp;
183                 
184                 string thisoutputDir = outputDir;
185                 if (outputDir == "") {  thisoutputDir =  m->hasPath(flowFilesFileName); } //if user entered a file with a path then preserve it
186                 
187                                 //we want to rip off .files, and also .flow if its there
188                 string fileroot = m->getRootName(m->getSimpleName(flowFilesFileName));
189                 if (fileroot[fileroot.length()-1] == '.') {  fileroot = fileroot.substr(0, fileroot.length()-1); } //rip off dot
190                 string extension = m->getExtension(fileroot);
191                 if (extension == ".flow") { fileroot = m->getRootName(fileroot);  }
192                 else { fileroot += "."; } //add back if needed
193                 
194                                 compositeFASTAFileName = thisoutputDir + fileroot + "shhh.fasta";
195                                 m->openOutputFile(compositeFASTAFileName, temp);
196                                 temp.close();
197                                 
198                                 compositeNamesFileName = thisoutputDir + fileroot + "shhh.names";
199                                 m->openOutputFile(compositeNamesFileName, temp);
200                                 temp.close();
201                         }
202             
203             if(flowFilesFileName != "not found"){
204                 string fName;
205                 
206                 ifstream flowFilesFile;
207                 m->openInputFile(flowFilesFileName, flowFilesFile);
208                 while(flowFilesFile){
209                     fName = m->getline(flowFilesFile);
210                     
211                     //test if file is valid
212                     ifstream in;
213                     int ableToOpen = m->openInputFile(fName, in, "noerror");
214                     in.close(); 
215                     if (ableToOpen == 1) {
216                         if (inputDir != "") { //default path is set
217                             string tryPath = inputDir + fName;
218                             m->mothurOut("Unable to open " + fName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
219                             ifstream in2;
220                             ableToOpen = m->openInputFile(tryPath, in2, "noerror");
221                             in2.close();
222                             fName = tryPath;
223                         }
224                     }
225                     
226                     if (ableToOpen == 1) {
227                         if (m->getDefaultPath() != "") { //default path is set
228                             string tryPath = m->getDefaultPath() + m->getSimpleName(fName);
229                             m->mothurOut("Unable to open " + fName + ". Trying default " + tryPath); m->mothurOutEndLine();
230                             ifstream in2;
231                             ableToOpen = m->openInputFile(tryPath, in2, "noerror");
232                             in2.close();
233                             fName = tryPath;
234                         }
235                     }
236                     
237                     //if you can't open it its not in current working directory or inputDir, try mothur excutable location
238                     if (ableToOpen == 1) {
239                         string exepath = m->argv;
240                         string tempPath = exepath;
241                         for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
242                         exepath = exepath.substr(0, (tempPath.find_last_of('m')));
243                         
244                         string tryPath = m->getFullPathName(exepath) + m->getSimpleName(fName);
245                         m->mothurOut("Unable to open " + fName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
246                         ifstream in2;
247                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
248                         in2.close();
249                         fName = tryPath;
250                     }
251                     
252                     if (ableToOpen == 1) {  m->mothurOut("Unable to open " + fName + ". Disregarding. "); m->mothurOutEndLine();  }
253                     else { flowFileVector.push_back(fName); }
254                     m->gobble(flowFilesFile);
255                 }
256                 flowFilesFile.close();
257                 if (flowFileVector.size() == 0) {  m->mothurOut("[ERROR]: no valid files."); m->mothurOutEndLine(); abort = true; }
258             }
259             else{
260                 if (outputDir == "") { outputDir = m->hasPath(flowFileName); }
261                 flowFileVector.push_back(flowFileName);
262             }
263                 
264                         //check for optional parameter and set defaults
265                         // ...at some point should added some additional type checking...
266                         string temp;
267                         temp = validParameter.validFile(parameters, "lookup", true);
268                         if (temp == "not found")        {       
269                                 lookupFileName = "LookUp_Titanium.pat"; 
270                                 
271                                 int ableToOpen;
272                                 ifstream in;
273                                 ableToOpen = m->openInputFile(lookupFileName, in, "noerror");
274                                 in.close();     
275                                 
276                                 //if you can't open it, try input location
277                                 if (ableToOpen == 1) {
278                                         if (inputDir != "") { //default path is set
279                                                 string tryPath = inputDir + lookupFileName;
280                                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
281                                                 ifstream in2;
282                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
283                                                 in2.close();
284                                                 lookupFileName = tryPath;
285                                         }
286                                 }
287                                 
288                                 //if you can't open it, try default location
289                                 if (ableToOpen == 1) {
290                                         if (m->getDefaultPath() != "") { //default path is set
291                                                 string tryPath = m->getDefaultPath() + m->getSimpleName(lookupFileName);
292                                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying default " + tryPath); m->mothurOutEndLine();
293                                                 ifstream in2;
294                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
295                                                 in2.close();
296                                                 lookupFileName = tryPath;
297                                         }
298                                 }
299                                 
300                                 //if you can't open it its not in current working directory or inputDir, try mothur excutable location
301                                 if (ableToOpen == 1) {
302                                         string exepath = m->argv;
303                                         string tempPath = exepath;
304                                         for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
305                                         exepath = exepath.substr(0, (tempPath.find_last_of('m')));
306                                         
307                                         string tryPath = m->getFullPathName(exepath) + m->getSimpleName(lookupFileName);
308                                         m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
309                                         ifstream in2;
310                                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
311                                         in2.close();
312                                         lookupFileName = tryPath;
313                                 }
314                                 
315                                 if (ableToOpen == 1) {  m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true;  }
316                         }
317                         else if(temp == "not open")     {       
318                                 
319                                 lookupFileName = validParameter.validFile(parameters, "lookup", false);
320                                 
321                                 //if you can't open it its not inputDir, try mothur excutable location
322                                 string exepath = m->argv;
323                                 string tempPath = exepath;
324                                 for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
325                                 exepath = exepath.substr(0, (tempPath.find_last_of('m')));
326                                         
327                                 string tryPath = m->getFullPathName(exepath) + lookupFileName;
328                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
329                                 ifstream in2;
330                                 int ableToOpen = m->openInputFile(tryPath, in2, "noerror");
331                                 in2.close();
332                                 lookupFileName = tryPath;
333                                 
334                                 if (ableToOpen == 1) {  m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true;  }
335                         }else                                           {       lookupFileName = temp;  }
336                         
337                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
338                         m->setProcessors(temp);
339                         m->mothurConvert(temp, processors);
340
341                         temp = validParameter.validFile(parameters, "cutoff", false);   if (temp == "not found"){       temp = "0.01";          }
342                         m->mothurConvert(temp, cutoff); 
343                         
344                         temp = validParameter.validFile(parameters, "mindelta", false); if (temp == "not found"){       temp = "0.000001";      }
345                         m->mothurConvert(temp, minDelta); 
346
347                         temp = validParameter.validFile(parameters, "maxiter", false);  if (temp == "not found"){       temp = "1000";          }
348                         m->mothurConvert(temp, maxIters); 
349             
350             temp = validParameter.validFile(parameters, "large", false);        if (temp == "not found"){       temp = "0";             }
351                         m->mothurConvert(temp, largeSize); 
352             if (largeSize != 0) { large = true; }
353             else { large = false;  }
354             if (largeSize < 0) {  m->mothurOut("The value of the large cannot be negative.\n"); }
355             
356 #ifdef USE_MPI
357             if (large) { m->mothurOut("The large parameter is not available with the MPI-Enabled version.\n"); large=false; }                           
358 #endif
359             
360
361                         temp = validParameter.validFile(parameters, "sigma", false);if (temp == "not found")    {       temp = "60";            }
362                         m->mothurConvert(temp, sigma); 
363                         
364                         temp = validParameter.validFile(parameters, "order", false);  if (temp == "not found"){         temp = "A";     }
365             if (temp.length() > 1) {  m->mothurOut("[ERROR]: " + temp + " is not a valid option for order. order options are A, B, or I. A = TACG, B = TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC, and I = TACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGC.\n");  abort=true;
366             }
367             else {
368                 if (toupper(temp[0]) == 'A') {  flowOrder = "TACG";   }
369                 else if(toupper(temp[0]) == 'B'){
370                     flowOrder = "TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC";   }
371                 else if(toupper(temp[0]) == 'I'){
372                     flowOrder = "TACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGC";   }
373                 else {
374                     m->mothurOut("[ERROR]: " + temp + " is not a valid option for order. order options are A, B, or I. A = TACG, B = TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC, and I = TACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGC.\n");  abort=true;
375                 }
376             }
377
378                         
379                 }
380 #ifdef USE_MPI
381                 }                               
382 #endif
383         }
384         catch(exception& e) {
385                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
386                 exit(1);
387         }
388 }
389 //**********************************************************************************************************************
390 #ifdef USE_MPI
391 int ShhherCommand::execute(){
392         try {
393                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
394                 
395                 int tag = 1976;
396                 MPI_Status status; 
397                         
398                 if(pid == 0){
399
400                         for(int i=1;i<ncpus;i++){
401                                 MPI_Send(&abort, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
402                         }
403                         if(abort == 1){ return 0;       }
404
405                         processors = ncpus;
406                         
407                         m->mothurOut("\nGetting preliminary data...\n");
408                         getSingleLookUp();      if (m->control_pressed) { return 0; }
409                         getJointLookUp();       if (m->control_pressed) { return 0; }
410                         
411             vector<string> flowFileVector;
412                         if(flowFilesFileName != "not found"){
413                                 string fName;
414                 
415                                 ifstream flowFilesFile;
416                                 m->openInputFile(flowFilesFileName, flowFilesFile);
417                                 while(flowFilesFile){
418                                         fName = m->getline(flowFilesFile);
419                                         flowFileVector.push_back(fName);
420                                         m->gobble(flowFilesFile);
421                                 }
422                         }
423                         else{
424                                 flowFileVector.push_back(flowFileName);
425                         }
426             
427                         int numFiles = flowFileVector.size();
428
429                         for(int i=1;i<ncpus;i++){
430                                 MPI_Send(&numFiles, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
431                         }
432                         
433                         for(int i=0;i<numFiles;i++){
434                                 
435                                 if (m->control_pressed) { break; }
436                                 
437                                 double begClock = clock();
438                                 unsigned long long begTime = time(NULL);
439
440                                 flowFileName = flowFileVector[i];
441                                 
442                                 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
443                                 m->mothurOut("Reading flowgrams...\n");
444                                 getFlowData();
445                                 
446                                 if (m->control_pressed) { break; }
447
448                                 m->mothurOut("Identifying unique flowgrams...\n");
449                                 getUniques();
450                                 
451                                 if (m->control_pressed) { break; }
452
453                                 m->mothurOut("Calculating distances between flowgrams...\n");
454                                 char fileName[1024];
455                                 strcpy(fileName, flowFileName.c_str());
456
457                                 for(int i=1;i<ncpus;i++){
458                                         MPI_Send(&fileName[0], 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
459
460                                         MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
461                                         MPI_Send(&numUniques, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
462                                         MPI_Send(&numFlowCells, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
463                                         MPI_Send(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, i, tag, MPI_COMM_WORLD);
464                                         MPI_Send(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
465                                         MPI_Send(&mapUniqueToSeq[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
466                                         MPI_Send(&mapSeqToUnique[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
467                                         MPI_Send(&lengths[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
468                                         MPI_Send(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
469                                         MPI_Send(&cutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
470                                 }                               
471                                                         
472                                 string distFileName = flowDistMPI(0, int(sqrt(1.0/float(ncpus)) * numUniques));
473                                 
474                                 if (m->control_pressed) { break; }
475                                 
476                                 int done;
477                                 for(int i=1;i<ncpus;i++){
478                                         MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
479                                         
480                                         m->appendFiles((distFileName + ".temp." + toString(i)), distFileName);
481                                         m->mothurRemove((distFileName + ".temp." + toString(i)));
482                                 }
483
484                                 string namesFileName = createNamesFile();
485                                 
486                                 if (m->control_pressed) { break; }
487                                 
488                                 m->mothurOut("\nClustering flowgrams...\n");
489                                 string listFileName = cluster(distFileName, namesFileName);
490                                 
491                                 if (m->control_pressed) { break; }
492                                 
493                 
494                 
495                                 getOTUData(listFileName);
496
497                                 m->mothurRemove(distFileName);
498                                 m->mothurRemove(namesFileName);
499                                 m->mothurRemove(listFileName);
500                                 
501                                 if (m->control_pressed) { break; }
502                                 
503                                 initPyroCluster();
504
505                                 if (m->control_pressed) { break; }
506                                 
507             
508                                 for(int i=1;i<ncpus;i++){
509                                         MPI_Send(&numOTUs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
510                                         MPI_Send(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
511                                         MPI_Send(&uniqueFlowgrams[0], numFlowCells * numUniques, MPI_SHORT, i, tag, MPI_COMM_WORLD);
512                                         MPI_Send(&sigma, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
513                                 }
514                                 
515                                 if (m->control_pressed) { break; }
516                                 
517                                 double maxDelta = 0;
518                                 int iter = 0;
519                                 
520                                 int numOTUsOnCPU = numOTUs / ncpus;
521                                 int numSeqsOnCPU = numSeqs / ncpus;
522                                 m->mothurOut("\nDenoising flowgrams...\n");
523                                 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
524                                 
525                                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
526                                         
527                                         double cycClock = clock();
528                                         unsigned long long cycTime = time(NULL);
529                                         fill();
530                                         
531                                         if (m->control_pressed) { break; }
532
533                                         int total = singleTau.size();
534                                         for(int i=1;i<ncpus;i++){
535                                                 MPI_Send(&total, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
536                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
537                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
538                                                 
539                                                 MPI_Send(&singleTau[0], total, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
540                                                 MPI_Send(&seqNumber[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
541                                                 MPI_Send(&seqIndex[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
542                                                 MPI_Send(&nSeqsPerOTU[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
543                                                 MPI_Send(&cumNumSeqs[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
544                                         }
545                                         
546                                         calcCentroidsDriver(0, numOTUsOnCPU);
547                                         
548                                         for(int i=1;i<ncpus;i++){
549                                                 int otuStart = i * numOTUs / ncpus;
550                                                 int otuStop = (i + 1) * numOTUs / ncpus;
551                                                 
552                                                 vector<int> tempCentroids(numOTUs, 0);
553                                                 vector<short> tempChange(numOTUs, 0);
554                                                 
555                                                 MPI_Recv(&tempCentroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
556                                                 MPI_Recv(&tempChange[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD, &status);
557                                                 
558                                                 for(int j=otuStart;j<otuStop;j++){
559                                                         centroids[j] = tempCentroids[j];
560                                                         change[j] = tempChange[j];
561                                                 }
562                                         }
563                                                                         
564                                         maxDelta = getNewWeights(); if (m->control_pressed) { break; }
565                                         double nLL = getLikelihood(); if (m->control_pressed) { break; }
566                                         checkCentroids(); if (m->control_pressed) { break; }
567                                         
568                                         for(int i=1;i<ncpus;i++){
569                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
570                                                 MPI_Send(&weight[0], numOTUs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
571                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
572                                         }
573                                         
574                                         calcNewDistancesParent(0, numSeqsOnCPU);
575
576                                         total = singleTau.size();
577
578                                         for(int i=1;i<ncpus;i++){
579                                                 int childTotal;
580                                                 int seqStart = i * numSeqs / ncpus;
581                                                 int seqStop = (i + 1) * numSeqs / ncpus;
582                                                 
583                                                 MPI_Recv(&childTotal, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
584
585                                                 vector<int> childSeqIndex(childTotal, 0);
586                                                 vector<double> childSingleTau(childTotal, 0);
587                                                 vector<double> childDist(numSeqs * numOTUs, 0);
588                                                 vector<int> otuIndex(childTotal, 0);
589                                                 
590                                                 MPI_Recv(&childSeqIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
591                                                 MPI_Recv(&childSingleTau[0], childTotal, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
592                                                 MPI_Recv(&childDist[0], numOTUs * numSeqs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
593                                                 MPI_Recv(&otuIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
594
595                                                 int oldTotal = total;
596                                                 total += childTotal;
597                                                 singleTau.resize(total, 0);
598                                                 seqIndex.resize(total, 0);
599                                                 seqNumber.resize(total, 0);
600                                                 
601                                                 int childIndex = 0;
602                                                 
603                                                 for(int j=oldTotal;j<total;j++){
604                                                         int otuI = otuIndex[childIndex];
605                                                         int seqI = childSeqIndex[childIndex];
606                                                         
607                                                         singleTau[j] = childSingleTau[childIndex];
608                                                         
609                                                         aaP[otuI][nSeqsPerOTU[otuI]] = j;
610                                                         aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
611                                                         nSeqsPerOTU[otuI]++;
612                                                         childIndex++;
613                                                 }
614                                                 
615                                                 int index = seqStart * numOTUs;
616                                                 for(int j=seqStart;j<seqStop;j++){
617                                                         for(int k=0;k<numOTUs;k++){
618                                                                 dist[index] = childDist[index];
619                                                                 index++;
620                                                         }
621                                                 }                                       
622                                         }
623                                         
624                                         iter++;
625                                         
626                                         m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');                  
627
628                                         if((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
629                                                 int live = 1;
630                                                 for(int i=1;i<ncpus;i++){
631                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
632                                                 }
633                                         }
634                                         else{
635                                                 int live = 0;
636                                                 for(int i=1;i<ncpus;i++){
637                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD); //send kill command
638                                                 }
639                                         }
640                                         
641                                 }       
642                                 
643                                 if (m->control_pressed) { break; }
644                                 
645                                 m->mothurOut("\nFinalizing...\n");
646                                 fill();
647                                 
648                                 if (m->control_pressed) { break; }
649                                 
650                                 setOTUs();
651                                 
652                                 vector<int> otuCounts(numOTUs, 0);
653                                 for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
654                                 calcCentroidsDriver(0, numOTUs);
655                                 
656                                 if (m->control_pressed) { break; }
657                                 
658                                 writeQualities(otuCounts);      if (m->control_pressed) { break; }
659                                 writeSequences(otuCounts);      if (m->control_pressed) { break; }
660                                 writeNames(otuCounts);          if (m->control_pressed) { break; }
661                                 writeClusters(otuCounts);       if (m->control_pressed) { break; }
662                                 writeGroups();                          if (m->control_pressed) { break; }
663                                 
664                                                                  
665                                 m->mothurOut("Total time to process " + toString(flowFileName) + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');                 
666                         }
667                 }
668                 else{
669                         int abort = 1;
670
671                         MPI_Recv(&abort, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
672                         if(abort){      return 0;       }
673
674                         int numFiles;
675                         MPI_Recv(&numFiles, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
676
677                         for(int i=0;i<numFiles;i++){
678                                 
679                                 if (m->control_pressed) { break; }
680                                 
681                                 //Now into the pyrodist part
682                                 bool live = 1;
683
684                                 char fileName[1024];
685                                 MPI_Recv(&fileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
686                                 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
687                                 MPI_Recv(&numUniques, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
688                                 MPI_Recv(&numFlowCells, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
689                                 
690                                 flowDataIntI.resize(numSeqs * numFlowCells);
691                                 flowDataPrI.resize(numSeqs * numFlowCells);
692                                 mapUniqueToSeq.resize(numSeqs);
693                                 mapSeqToUnique.resize(numSeqs);
694                                 lengths.resize(numSeqs);
695                                 jointLookUp.resize(NUMBINS * NUMBINS);
696
697                                 MPI_Recv(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
698                                 MPI_Recv(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
699                                 MPI_Recv(&mapUniqueToSeq[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
700                                 MPI_Recv(&mapSeqToUnique[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
701                                 MPI_Recv(&lengths[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
702                                 MPI_Recv(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
703                                 MPI_Recv(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
704
705                                 flowFileName = string(fileName);
706                                 int flowDistStart = int(sqrt(float(pid)/float(ncpus)) * numUniques);
707                                 int flowDistEnd = int(sqrt(float(pid+1)/float(ncpus)) * numUniques);
708                                 
709                                 string distanceStringChild = flowDistMPI(flowDistStart, flowDistEnd);
710                                 
711                                 if (m->control_pressed) { break; }
712                                 
713                                 int done = 1;
714                                 MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
715
716                                 //Now into the pyronoise part
717                                 MPI_Recv(&numOTUs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
718                                 
719                                 singleLookUp.resize(HOMOPS * NUMBINS);
720                                 uniqueFlowgrams.resize(numUniques * numFlowCells);
721                                 weight.resize(numOTUs);
722                                 centroids.resize(numOTUs);
723                                 change.resize(numOTUs);
724                                 dist.assign(numOTUs * numSeqs, 0);
725                                 nSeqsPerOTU.resize(numOTUs);
726                                 cumNumSeqs.resize(numOTUs);
727
728                                 MPI_Recv(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
729                                 MPI_Recv(&uniqueFlowgrams[0], uniqueFlowgrams.size(), MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
730                                 MPI_Recv(&sigma, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
731                                 
732                                 int startOTU = pid * numOTUs / ncpus;
733                                 int endOTU = (pid + 1) * numOTUs / ncpus;
734
735                                 int startSeq = pid * numSeqs / ncpus;
736                                 int endSeq = (pid + 1) * numSeqs /ncpus;
737                                 
738                                 int total;
739
740                                 while(live){
741                                         
742                                         if (m->control_pressed) { break; }
743                                         
744                                         MPI_Recv(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
745                                         singleTau.assign(total, 0.0000);
746                                         seqNumber.assign(total, 0);
747                                         seqIndex.assign(total, 0);
748
749                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
750                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
751                                         MPI_Recv(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
752                                         MPI_Recv(&seqNumber[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
753                                         MPI_Recv(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
754                                         MPI_Recv(&nSeqsPerOTU[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
755                                         MPI_Recv(&cumNumSeqs[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
756
757                                         calcCentroidsDriver(startOTU, endOTU);
758
759                                         MPI_Send(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD);
760                                         MPI_Send(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD);
761
762                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
763                                         MPI_Recv(&weight[0], numOTUs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
764                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
765
766                                         vector<int> otuIndex(total, 0);
767                                         calcNewDistancesChildMPI(startSeq, endSeq, otuIndex);
768                                         total = otuIndex.size();
769                                         
770                                         MPI_Send(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
771                                         MPI_Send(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
772                                         MPI_Send(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
773                                         MPI_Send(&dist[0], numOTUs * numSeqs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
774                                         MPI_Send(&otuIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
775                                 
776                                         MPI_Recv(&live, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
777                                 }
778                         }
779                 }
780                 
781                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
782                 
783                 MPI_Barrier(MPI_COMM_WORLD);
784
785                 
786                 if(compositeFASTAFileName != ""){
787                         outputNames.push_back(compositeFASTAFileName); outputTypes["fasta"].push_back(compositeFASTAFileName);
788                         outputNames.push_back(compositeNamesFileName); outputTypes["name"].push_back(compositeNamesFileName); 
789                 }
790
791                 m->mothurOutEndLine();
792                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
793                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
794                 m->mothurOutEndLine();
795                 
796                 return 0;
797
798         }
799         catch(exception& e) {
800                 m->errorOut(e, "ShhherCommand", "execute");
801                 exit(1);
802         }
803 }
804 /**************************************************************************************************/
805 string ShhherCommand::createNamesFile(){
806         try{
807                 
808                 vector<string> duplicateNames(numUniques, "");
809                 for(int i=0;i<numSeqs;i++){
810                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
811                 }
812                 map<string, string> variables; 
813                 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(flowFileName));
814                 string nameFileName = getOutputFileName("name",variables);
815                 
816                 ofstream nameFile;
817                 m->openOutputFile(nameFileName, nameFile);
818                 
819                 for(int i=0;i<numUniques;i++){
820                         
821                         if (m->control_pressed) { break; }
822                         
823             //                  nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
824                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
825                 }
826                 
827                 nameFile.close();
828                 return  nameFileName;
829         }
830         catch(exception& e) {
831                 m->errorOut(e, "ShhherCommand", "createNamesFile");
832                 exit(1);
833         }
834 }
835 /**************************************************************************************************/
836
837 string ShhherCommand::flowDistMPI(int startSeq, int stopSeq){
838         try{            
839                 ostringstream outStream;
840                 outStream.setf(ios::fixed, ios::floatfield);
841                 outStream.setf(ios::dec, ios::basefield);
842                 outStream.setf(ios::showpoint);
843                 outStream.precision(6);
844                 
845                 int begTime = time(NULL);
846                 double begClock = clock();
847                 
848                 for(int i=startSeq;i<stopSeq;i++){
849                         
850                         if (m->control_pressed) { break; }
851                         
852                         for(int j=0;j<i;j++){
853                                 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
854                                 
855                                 if(flowDistance < 1e-6){
856                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
857                                 }
858                                 else if(flowDistance <= cutoff){
859                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
860                                 }
861                         }
862                         if(i % 100 == 0){
863                                 m->mothurOutJustToScreen(toString(i) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
864                         }
865                 }
866                 
867                 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
868                 if(pid != 0){   fDistFileName += ".temp." + toString(pid);      }
869                 
870                 if (m->control_pressed) { return fDistFileName; }
871                 
872                 m->mothurOutJustToScreen(toString(stopSeq) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
873
874                 ofstream distFile(fDistFileName.c_str());
875                 distFile << outStream.str();            
876                 distFile.close();
877                 
878                 return fDistFileName;
879         }
880         catch(exception& e) {
881                 m->errorOut(e, "ShhherCommand", "flowDistMPI");
882                 exit(1);
883         }
884 }
885 /**************************************************************************************************/
886  
887 void ShhherCommand::getOTUData(string listFileName){
888     try {
889         
890         ifstream listFile;
891         m->openInputFile(listFileName, listFile);
892         string label;
893         
894         listFile >> label >> numOTUs;
895         
896         otuData.assign(numSeqs, 0);
897         cumNumSeqs.assign(numOTUs, 0);
898         nSeqsPerOTU.assign(numOTUs, 0);
899         aaP.clear();aaP.resize(numOTUs);
900         
901         seqNumber.clear();
902         aaI.clear();
903         seqIndex.clear();
904         
905         string singleOTU = "";
906         
907         for(int i=0;i<numOTUs;i++){
908             
909             if (m->control_pressed) { break; }
910             
911             listFile >> singleOTU;
912             
913             istringstream otuString(singleOTU);
914             
915             while(otuString){
916                 
917                 string seqName = "";
918                 
919                 for(int j=0;j<singleOTU.length();j++){
920                     char letter = otuString.get();
921                     
922                     if(letter != ','){
923                         seqName += letter;
924                     }
925                     else{
926                         map<string,int>::iterator nmIt = nameMap.find(seqName);
927                         int index = nmIt->second;
928                         
929                         nameMap.erase(nmIt);
930                         
931                         otuData[index] = i;
932                         nSeqsPerOTU[i]++;
933                         aaP[i].push_back(index);
934                         seqName = "";
935                     }
936                 }
937                 
938                 map<string,int>::iterator nmIt = nameMap.find(seqName);
939                 
940                 int index = nmIt->second;
941                 nameMap.erase(nmIt);
942                 
943                 otuData[index] = i;
944                 nSeqsPerOTU[i]++;
945                 aaP[i].push_back(index);        
946                 
947                 otuString.get();
948             }
949             
950             sort(aaP[i].begin(), aaP[i].end());
951             for(int j=0;j<nSeqsPerOTU[i];j++){
952                 seqNumber.push_back(aaP[i][j]);
953             }
954             for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
955                 aaP[i].push_back(0);
956             }
957             
958             
959         }
960         
961         for(int i=1;i<numOTUs;i++){
962             cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
963         }
964         aaI = aaP;
965         seqIndex = seqNumber;
966         
967         listFile.close();       
968         
969     }
970     catch(exception& e) {
971         m->errorOut(e, "ShhherCommand", "getOTUData");
972         exit(1);        
973     }           
974 }
975
976 /**************************************************************************************************/
977
978 void ShhherCommand::initPyroCluster(){                          
979     try{
980         if (numOTUs < processors) { processors = 1; }
981         
982         if (m->debug) { m->mothurOut("[DEBUG]: numSeqs = " + toString(numSeqs) + " numOTUS = " + toString(numOTUs) + " about to alloc a dist vector with size = " + toString((numSeqs * numOTUs)) + ".\n"); }
983         
984         dist.assign(numSeqs * numOTUs, 0);
985         change.assign(numOTUs, 1);
986         centroids.assign(numOTUs, -1);
987         weight.assign(numOTUs, 0);
988         singleTau.assign(numSeqs, 1.0);
989         
990         nSeqsBreaks.assign(processors+1, 0);
991         nOTUsBreaks.assign(processors+1, 0);
992         
993         if (m->debug) { m->mothurOut("[DEBUG]: made it through the memory allocation.\n"); }
994         
995         nSeqsBreaks[0] = 0;
996         for(int i=0;i<processors;i++){
997             nSeqsBreaks[i+1] = nSeqsBreaks[i] + (int)((double) numSeqs / (double) processors);
998             nOTUsBreaks[i+1] = nOTUsBreaks[i] + (int)((double) numOTUs / (double) processors);
999         }
1000         nSeqsBreaks[processors] = numSeqs;
1001         nOTUsBreaks[processors] = numOTUs;
1002     }
1003     catch(exception& e) {
1004         m->errorOut(e, "ShhherCommand", "initPyroCluster");
1005         exit(1);        
1006     }
1007 }
1008
1009 /**************************************************************************************************/
1010
1011 void ShhherCommand::fill(){
1012     try {
1013         int index = 0;
1014         for(int i=0;i<numOTUs;i++){
1015             
1016             if (m->control_pressed) { break; }
1017             
1018             cumNumSeqs[i] = index;
1019             for(int j=0;j<nSeqsPerOTU[i];j++){
1020                 seqNumber[index] = aaP[i][j];
1021                 seqIndex[index] = aaI[i][j];
1022                 
1023                 index++;
1024             }
1025         }
1026     }
1027     catch(exception& e) {
1028         m->errorOut(e, "ShhherCommand", "fill");
1029         exit(1);        
1030     }           
1031 }
1032
1033 /**************************************************************************************************/
1034  
1035 void ShhherCommand::getFlowData(){
1036     try{
1037         ifstream flowFile;
1038         m->openInputFile(flowFileName, flowFile);
1039         
1040         string seqName;
1041         seqNameVector.clear();
1042         lengths.clear();
1043         flowDataIntI.clear();
1044         nameMap.clear();
1045         
1046         
1047         int currentNumFlowCells;
1048         
1049         float intensity;
1050         
1051         string numFlowTest;
1052         flowFile >> numFlowTest;
1053         
1054         if (!m->isContainingOnlyDigits(numFlowTest)) { m->mothurOut("[ERROR]: expected a number and got " + numFlowTest + ", quitting. Did you use the flow parameter instead of the file parameter?"); m->mothurOutEndLine(); exit(1); }
1055         else { convert(numFlowTest, numFlowCells); }
1056         
1057         int index = 0;//pcluster
1058         while(!flowFile.eof()){
1059             
1060             if (m->control_pressed) { break; }
1061             
1062             flowFile >> seqName >> currentNumFlowCells;
1063             lengths.push_back(currentNumFlowCells);
1064             
1065             seqNameVector.push_back(seqName);
1066             nameMap[seqName] = index++;//pcluster
1067             
1068             for(int i=0;i<numFlowCells;i++){
1069                 flowFile >> intensity;
1070                 if(intensity > 9.99)    {       intensity = 9.99;       }
1071                 int intI = int(100 * intensity + 0.0001);
1072                 flowDataIntI.push_back(intI);
1073             }
1074             m->gobble(flowFile);
1075         }
1076         flowFile.close();
1077         
1078         numSeqs = seqNameVector.size();         
1079         
1080         for(int i=0;i<numSeqs;i++){
1081             
1082             if (m->control_pressed) { break; }
1083             
1084             int iNumFlowCells = i * numFlowCells;
1085             for(int j=lengths[i];j<numFlowCells;j++){
1086                 flowDataIntI[iNumFlowCells + j] = 0;
1087             }
1088         }
1089         
1090     }
1091     catch(exception& e) {
1092         m->errorOut(e, "ShhherCommand", "getFlowData");
1093         exit(1);
1094     }
1095 }
1096 /**************************************************************************************************/
1097 void ShhherCommand::calcNewDistancesChildMPI(int startSeq, int stopSeq, vector<int>& otuIndex){
1098         
1099         try{
1100                 vector<double> newTau(numOTUs,0);
1101                 vector<double> norms(numSeqs, 0);
1102                 otuIndex.clear();
1103                 seqIndex.clear();
1104                 singleTau.clear();
1105                 
1106                 for(int i=startSeq;i<stopSeq;i++){
1107                         
1108                         if (m->control_pressed) { break; }
1109                         
1110                         double offset = 1e8;
1111                         int indexOffset = i * numOTUs;
1112                         
1113                         for(int j=0;j<numOTUs;j++){
1114                                 
1115                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1116                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1117                                 }
1118                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1119                                         offset = dist[indexOffset + j];
1120                                 }
1121                         }
1122                         
1123                         for(int j=0;j<numOTUs;j++){
1124                                 if(weight[j] > MIN_WEIGHT){
1125                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1126                                         norms[i] += newTau[j];
1127                                 }
1128                                 else{
1129                                         newTau[j] = 0.0;
1130                                 }
1131                         }
1132                         
1133                         for(int j=0;j<numOTUs;j++){
1134                 
1135                                 newTau[j] /= norms[i];
1136                                 
1137                                 if(newTau[j] > MIN_TAU){
1138                                         otuIndex.push_back(j);
1139                                         seqIndex.push_back(i);
1140                                         singleTau.push_back(newTau[j]);
1141                                 }
1142                         }
1143                         
1144                 }
1145         }
1146         catch(exception& e) {
1147                 m->errorOut(e, "ShhherCommand", "calcNewDistancesChildMPI");
1148                 exit(1);        
1149         }               
1150 }
1151
1152 /**************************************************************************************************/
1153
1154 void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){
1155     
1156     try{
1157         
1158         int total = 0;
1159         vector<double> newTau(numOTUs,0);
1160         vector<double> norms(numSeqs, 0);
1161         nSeqsPerOTU.assign(numOTUs, 0);
1162         
1163         for(int i=startSeq;i<stopSeq;i++){
1164             
1165             if (m->control_pressed) { break; }
1166             
1167             int indexOffset = i * numOTUs;
1168             
1169             double offset = 1e8;
1170             
1171             for(int j=0;j<numOTUs;j++){
1172                 
1173                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1174                     dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1175                 }
1176                 
1177                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1178                     offset = dist[indexOffset + j];
1179                 }
1180             }
1181             
1182             for(int j=0;j<numOTUs;j++){
1183                 if(weight[j] > MIN_WEIGHT){
1184                     newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1185                     norms[i] += newTau[j];
1186                 }
1187                 else{
1188                     newTau[j] = 0.0;
1189                 }
1190             }
1191             
1192             for(int j=0;j<numOTUs;j++){
1193                 newTau[j] /= norms[i];
1194             }
1195             
1196             for(int j=0;j<numOTUs;j++){
1197                 if(newTau[j] > MIN_TAU){
1198                     
1199                     int oldTotal = total;
1200                     
1201                     total++;
1202                     
1203                     singleTau.resize(total, 0);
1204                     seqNumber.resize(total, 0);
1205                     seqIndex.resize(total, 0);
1206                     
1207                     singleTau[oldTotal] = newTau[j];
1208                     
1209                     aaP[j][nSeqsPerOTU[j]] = oldTotal;
1210                     aaI[j][nSeqsPerOTU[j]] = i;
1211                     nSeqsPerOTU[j]++;
1212                 }
1213             }
1214             
1215         }
1216         
1217     }
1218     catch(exception& e) {
1219         m->errorOut(e, "ShhherCommand", "calcNewDistancesParent");
1220         exit(1);        
1221     }           
1222 }
1223
1224 /**************************************************************************************************/
1225
1226 void ShhherCommand::setOTUs(){
1227     
1228     try {
1229         vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
1230         
1231         for(int i=0;i<numOTUs;i++){
1232             
1233             if (m->control_pressed) { break; }
1234             
1235             for(int j=0;j<nSeqsPerOTU[i];j++){
1236                 int index = cumNumSeqs[i] + j;
1237                 double tauValue = singleTau[seqNumber[index]];
1238                 int sIndex = seqIndex[index];
1239                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
1240             }
1241         }
1242         
1243         for(int i=0;i<numSeqs;i++){
1244             double maxTau = -1.0000;
1245             int maxOTU = -1;
1246             for(int j=0;j<numOTUs;j++){
1247                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
1248                     maxTau = bigTauMatrix[i * numOTUs + j];
1249                     maxOTU = j;
1250                 }
1251             }
1252             
1253             otuData[i] = maxOTU;
1254         }
1255         
1256         nSeqsPerOTU.assign(numOTUs, 0);         
1257         
1258         for(int i=0;i<numSeqs;i++){
1259             int index = otuData[i];
1260             
1261             singleTau[i] = 1.0000;
1262             dist[i] = 0.0000;
1263             
1264             aaP[index][nSeqsPerOTU[index]] = i;
1265             aaI[index][nSeqsPerOTU[index]] = i;
1266             
1267             nSeqsPerOTU[index]++;
1268         }
1269         fill(); 
1270     }
1271     catch(exception& e) {
1272         m->errorOut(e, "ShhherCommand", "setOTUs");
1273         exit(1);        
1274     }           
1275 }
1276
1277 /**************************************************************************************************/
1278  
1279 void ShhherCommand::getUniques(){
1280     try{
1281         
1282         
1283         numUniques = 0;
1284         uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
1285         uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
1286         uniqueLengths.assign(numSeqs, 0);
1287         mapSeqToUnique.assign(numSeqs, -1);
1288         mapUniqueToSeq.assign(numSeqs, -1);
1289         
1290         vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
1291         
1292         for(int i=0;i<numSeqs;i++){
1293             
1294             if (m->control_pressed) { break; }
1295             
1296             int index = 0;
1297             
1298             vector<short> current(numFlowCells);
1299             for(int j=0;j<numFlowCells;j++){
1300                 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
1301             }
1302             
1303             for(int j=0;j<numUniques;j++){
1304                 int offset = j * numFlowCells;
1305                 bool toEnd = 1;
1306                 
1307                 int shorterLength;
1308                 if(lengths[i] < uniqueLengths[j])       {       shorterLength = lengths[i];                     }
1309                 else                                                            {       shorterLength = uniqueLengths[j];       }
1310                 
1311                 for(int k=0;k<shorterLength;k++){
1312                     if(current[k] != uniqueFlowgrams[offset + k]){
1313                         toEnd = 0;
1314                         break;
1315                     }
1316                 }
1317                 
1318                 if(toEnd){
1319                     mapSeqToUnique[i] = j;
1320                     uniqueCount[j]++;
1321                     index = j;
1322                     if(lengths[i] > uniqueLengths[j])   {       uniqueLengths[j] = lengths[i];  }
1323                     break;
1324                 }
1325                 index++;
1326             }
1327             
1328             if(index == numUniques){
1329                 uniqueLengths[numUniques] = lengths[i];
1330                 uniqueCount[numUniques] = 1;
1331                 mapSeqToUnique[i] = numUniques;//anMap
1332                 mapUniqueToSeq[numUniques] = i;//anF
1333                 
1334                 for(int k=0;k<numFlowCells;k++){
1335                     uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
1336                     uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
1337                 }
1338                 
1339                 numUniques++;
1340             }
1341         }
1342         uniqueFlowDataIntI.resize(numFlowCells * numUniques);
1343         uniqueLengths.resize(numUniques);       
1344         
1345         flowDataPrI.resize(numSeqs * numFlowCells, 0);
1346         for(int i=0;i<flowDataPrI.size();i++)   {       if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);          }
1347     }
1348     catch(exception& e) {
1349         m->errorOut(e, "ShhherCommand", "getUniques");
1350         exit(1);
1351     }
1352 }
1353
1354 /**************************************************************************************************/
1355
1356 float ShhherCommand::calcPairwiseDist(int seqA, int seqB){
1357     try{
1358         int minLength = lengths[mapSeqToUnique[seqA]];
1359         if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
1360         
1361         int ANumFlowCells = seqA * numFlowCells;
1362         int BNumFlowCells = seqB * numFlowCells;
1363         
1364         float dist = 0;
1365         
1366         for(int i=0;i<minLength;i++){
1367             
1368             if (m->control_pressed) { break; }
1369             
1370             int flowAIntI = flowDataIntI[ANumFlowCells + i];
1371             float flowAPrI = flowDataPrI[ANumFlowCells + i];
1372             
1373             int flowBIntI = flowDataIntI[BNumFlowCells + i];
1374             float flowBPrI = flowDataPrI[BNumFlowCells + i];
1375             dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
1376         }
1377         
1378         dist /= (float) minLength;
1379         return dist;
1380     }
1381     catch(exception& e) {
1382         m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
1383         exit(1);
1384     }
1385 }
1386
1387 //**********************************************************************************************************************/
1388
1389 string ShhherCommand::cluster(string distFileName, string namesFileName){
1390     try {
1391         
1392         ReadMatrix* read = new ReadColumnMatrix(distFileName);  
1393                 read->setCutoff(cutoff);
1394                 
1395                 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
1396                 clusterNameMap->readMap();
1397                 read->read(clusterNameMap);
1398         
1399                 ListVector* list = read->getListVector();
1400                 SparseDistanceMatrix* matrix = read->getDMatrix();
1401                 
1402                 delete read; 
1403                 delete clusterNameMap; 
1404         
1405         RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
1406         
1407         Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); 
1408         string tag = cluster->getTag();
1409         
1410         double clusterCutoff = cutoff;
1411         while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
1412             
1413             if (m->control_pressed) { break; }
1414             
1415             cluster->update(clusterCutoff);
1416         }
1417         
1418         list->setLabel(toString(cutoff));
1419         
1420         string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
1421         ofstream listFile;
1422         m->openOutputFile(listFileName, listFile);
1423         list->print(listFile);
1424         listFile.close();
1425         
1426         delete matrix;  delete cluster; delete rabund; delete list;
1427         
1428         return listFileName;
1429     }
1430     catch(exception& e) {
1431         m->errorOut(e, "ShhherCommand", "cluster");
1432         exit(1);        
1433     }           
1434 }
1435
1436 /**************************************************************************************************/
1437
1438 void ShhherCommand::calcCentroidsDriver(int start, int finish){                          
1439     
1440     //this function gets the most likely homopolymer length at a flow position for a group of sequences
1441     //within an otu
1442     
1443     try{
1444         
1445         for(int i=start;i<finish;i++){
1446             
1447             if (m->control_pressed) { break; }
1448             
1449             double count = 0;
1450             int position = 0;
1451             int minFlowGram = 100000000;
1452             double minFlowValue = 1e8;
1453             change[i] = 0; //FALSE
1454             
1455             for(int j=0;j<nSeqsPerOTU[i];j++){
1456                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
1457             }
1458             
1459             if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
1460                 vector<double> adF(nSeqsPerOTU[i]);
1461                 vector<int> anL(nSeqsPerOTU[i]);
1462                 
1463                 for(int j=0;j<nSeqsPerOTU[i];j++){
1464                     int index = cumNumSeqs[i] + j;
1465                     int nI = seqIndex[index];
1466                     int nIU = mapSeqToUnique[nI];
1467                     
1468                     int k;
1469                     for(k=0;k<position;k++){
1470                         if(nIU == anL[k]){
1471                             break;
1472                         }
1473                     }
1474                     if(k == position){
1475                         anL[position] = nIU;
1476                         adF[position] = 0.0000;
1477                         position++;
1478                     }                                           
1479                 }
1480                 
1481                 for(int j=0;j<nSeqsPerOTU[i];j++){
1482                     int index = cumNumSeqs[i] + j;
1483                     int nI = seqIndex[index];
1484                     
1485                     double tauValue = singleTau[seqNumber[index]];
1486                     
1487                     for(int k=0;k<position;k++){
1488                         double dist = getDistToCentroid(anL[k], nI, lengths[nI]);
1489                         adF[k] += dist * tauValue;
1490                     }
1491                 }
1492                 
1493                 for(int j=0;j<position;j++){
1494                     if(adF[j] < minFlowValue){
1495                         minFlowGram = j;
1496                         minFlowValue = adF[j];
1497                     }
1498                 }
1499                 
1500                 if(centroids[i] != anL[minFlowGram]){
1501                     change[i] = 1;
1502                     centroids[i] = anL[minFlowGram];
1503                 }
1504             }
1505             else if(centroids[i] != -1){
1506                 change[i] = 1;
1507                 centroids[i] = -1;                      
1508             }
1509         }
1510     }
1511     catch(exception& e) {
1512         m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1513         exit(1);        
1514     }           
1515 }
1516
1517 /**************************************************************************************************/
1518
1519 double ShhherCommand::getDistToCentroid(int cent, int flow, int length){
1520     try{
1521         
1522         int flowAValue = cent * numFlowCells;
1523         int flowBValue = flow * numFlowCells;
1524         
1525         double dist = 0;
1526         
1527         for(int i=0;i<length;i++){
1528             dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
1529             flowAValue++;
1530             flowBValue++;
1531         }
1532         
1533         return dist / (double)length;
1534     }
1535     catch(exception& e) {
1536         m->errorOut(e, "ShhherCommand", "getDistToCentroid");
1537         exit(1);        
1538     }           
1539 }
1540
1541 /**************************************************************************************************/
1542
1543 double ShhherCommand::getNewWeights(){
1544     try{
1545         
1546         double maxChange = 0;
1547         
1548         for(int i=0;i<numOTUs;i++){
1549             
1550             if (m->control_pressed) { break; }
1551             
1552             double difference = weight[i];
1553             weight[i] = 0;
1554             
1555             for(int j=0;j<nSeqsPerOTU[i];j++){
1556                 int index = cumNumSeqs[i] + j;
1557                 double tauValue = singleTau[seqNumber[index]];
1558                 weight[i] += tauValue;
1559             }
1560             
1561             difference = fabs(weight[i] - difference);
1562             if(difference > maxChange){ maxChange = difference; }
1563         }
1564         return maxChange;
1565     }
1566     catch(exception& e) {
1567         m->errorOut(e, "ShhherCommand", "getNewWeights");
1568         exit(1);        
1569     }           
1570 }
1571  
1572  /**************************************************************************************************/
1573  
1574 double ShhherCommand::getLikelihood(){
1575     
1576     try{
1577         
1578         vector<long double> P(numSeqs, 0);
1579         int effNumOTUs = 0;
1580         
1581         for(int i=0;i<numOTUs;i++){
1582             if(weight[i] > MIN_WEIGHT){
1583                 effNumOTUs++;
1584             }
1585         }
1586         
1587         string hold;
1588         for(int i=0;i<numOTUs;i++){
1589             
1590             if (m->control_pressed) { break; }
1591             
1592             for(int j=0;j<nSeqsPerOTU[i];j++){
1593                 int index = cumNumSeqs[i] + j;
1594                 int nI = seqIndex[index];
1595                 double singleDist = dist[seqNumber[index]];
1596                 
1597                 P[nI] += weight[i] * exp(-singleDist * sigma);
1598             }
1599         }
1600         double nLL = 0.00;
1601         for(int i=0;i<numSeqs;i++){
1602             if(P[i] == 0){      P[i] = DBL_EPSILON;     }
1603             
1604             nLL += -log(P[i]);
1605         }
1606         
1607         nLL = nLL -(double)numSeqs * log(sigma);
1608         
1609         return nLL; 
1610     }
1611     catch(exception& e) {
1612         m->errorOut(e, "ShhherCommand", "getNewWeights");
1613         exit(1);        
1614     }           
1615 }
1616
1617 /**************************************************************************************************/
1618
1619 void ShhherCommand::checkCentroids(){
1620     try{
1621         vector<int> unique(numOTUs, 1);
1622         
1623         for(int i=0;i<numOTUs;i++){
1624             if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
1625                 unique[i] = -1;
1626             }
1627         }
1628         
1629         for(int i=0;i<numOTUs;i++){
1630             
1631             if (m->control_pressed) { break; }
1632             
1633             if(unique[i] == 1){
1634                 for(int j=i+1;j<numOTUs;j++){
1635                     if(unique[j] == 1){
1636                         
1637                         if(centroids[j] == centroids[i]){
1638                             unique[j] = 0;
1639                             centroids[j] = -1;
1640                             
1641                             weight[i] += weight[j];
1642                             weight[j] = 0.0;
1643                         }
1644                     }
1645                 }
1646             }
1647         }
1648     }
1649     catch(exception& e) {
1650         m->errorOut(e, "ShhherCommand", "checkCentroids");
1651         exit(1);        
1652     }           
1653 }
1654  /**************************************************************************************************/
1655
1656
1657  
1658 void ShhherCommand::writeQualities(vector<int> otuCounts){
1659     
1660     try {
1661         string thisOutputDir = outputDir;
1662         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1663         map<string, string> variables; 
1664                 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(flowFileName));
1665         string qualityFileName = getOutputFileName("qfile",variables);
1666         
1667         ofstream qualityFile;
1668         m->openOutputFile(qualityFileName, qualityFile);
1669         
1670         qualityFile.setf(ios::fixed, ios::floatfield);
1671         qualityFile.setf(ios::showpoint);
1672         qualityFile << setprecision(6);
1673         
1674         vector<vector<int> > qualities(numOTUs);
1675         vector<double> pr(HOMOPS, 0);
1676         
1677         
1678         for(int i=0;i<numOTUs;i++){
1679             
1680             if (m->control_pressed) { break; }
1681             
1682             int index = 0;
1683             int base = 0;
1684             
1685             if(nSeqsPerOTU[i] > 0){
1686                 qualities[i].assign(1024, -1);
1687                 
1688                 while(index < numFlowCells){
1689                     double maxPrValue = 1e8;
1690                     short maxPrIndex = -1;
1691                     double count = 0.0000;
1692                     
1693                     pr.assign(HOMOPS, 0);
1694                     
1695                     for(int j=0;j<nSeqsPerOTU[i];j++){
1696                         int lIndex = cumNumSeqs[i] + j;
1697                         double tauValue = singleTau[seqNumber[lIndex]];
1698                         int sequenceIndex = aaI[i][j];
1699                         short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
1700                         
1701                         count += tauValue;
1702                         
1703                         for(int s=0;s<HOMOPS;s++){
1704                             pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
1705                         }
1706                     }
1707                     
1708                     maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
1709                     maxPrValue = pr[maxPrIndex];
1710                     
1711                     if(count > MIN_COUNT){
1712                         double U = 0.0000;
1713                         double norm = 0.0000;
1714                         
1715                         for(int s=0;s<HOMOPS;s++){
1716                             norm += exp(-(pr[s] - maxPrValue));
1717                         }
1718                         
1719                         for(int s=1;s<=maxPrIndex;s++){
1720                             int value = 0;
1721                             double temp = 0.0000;
1722                             
1723                             U += exp(-(pr[s-1]-maxPrValue))/norm;
1724                             
1725                             if(U>0.00){
1726                                 temp = log10(U);
1727                             }
1728                             else{
1729                                 temp = -10.1;
1730                             }
1731                             temp = floor(-10 * temp);
1732                             value = (int)floor(temp);
1733                             if(value > 100){    value = 100;    }
1734                             
1735                             qualities[i][base] = (int)value;
1736                             base++;
1737                         }
1738                     }
1739                     
1740                     index++;
1741                 }
1742             }
1743             
1744             
1745             if(otuCounts[i] > 0){
1746                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
1747                 
1748                 int j=4;        //need to get past the first four bases
1749                 while(qualities[i][j] != -1){
1750                     qualityFile << qualities[i][j] << ' ';
1751                     j++;
1752                 }
1753                 qualityFile << endl;
1754             }
1755         }
1756         qualityFile.close();
1757         outputNames.push_back(qualityFileName); outputTypes["qfile"].push_back(qualityFileName);
1758         
1759     }
1760     catch(exception& e) {
1761         m->errorOut(e, "ShhherCommand", "writeQualities");
1762         exit(1);        
1763     }           
1764 }
1765
1766 /**************************************************************************************************/
1767
1768 void ShhherCommand::writeSequences(vector<int> otuCounts){
1769     try {
1770         string thisOutputDir = outputDir;
1771         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1772         map<string, string> variables; 
1773                 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
1774         string fastaFileName = getOutputFileName("fasta",variables);
1775         ofstream fastaFile;
1776         m->openOutputFile(fastaFileName, fastaFile);
1777         
1778         vector<string> names(numOTUs, "");
1779         
1780         for(int i=0;i<numOTUs;i++){
1781             
1782             if (m->control_pressed) { break; }
1783             
1784             int index = centroids[i];
1785             
1786             if(otuCounts[i] > 0){
1787                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
1788                 
1789                 string newSeq = "";
1790                 
1791                 for(int j=0;j<numFlowCells;j++){
1792                     
1793                     char base = flowOrder[j % flowOrder.length()];
1794                     for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
1795                         newSeq += base;
1796                     }
1797                 }
1798                 
1799                 fastaFile << newSeq.substr(4) << endl;
1800             }
1801         }
1802         fastaFile.close();
1803         
1804         outputNames.push_back(fastaFileName); outputTypes["fasta"].push_back(fastaFileName);
1805         
1806         if(compositeFASTAFileName != ""){
1807             m->appendFiles(fastaFileName, compositeFASTAFileName);
1808         }
1809     }
1810     catch(exception& e) {
1811         m->errorOut(e, "ShhherCommand", "writeSequences");
1812         exit(1);        
1813     }           
1814 }
1815
1816 /**************************************************************************************************/
1817
1818 void ShhherCommand::writeNames(vector<int> otuCounts){
1819     try {
1820         string thisOutputDir = outputDir;
1821         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1822         map<string, string> variables; 
1823                 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
1824         string nameFileName = getOutputFileName("name",variables);
1825         ofstream nameFile;
1826         m->openOutputFile(nameFileName, nameFile);
1827         
1828         for(int i=0;i<numOTUs;i++){
1829             
1830             if (m->control_pressed) { break; }
1831             
1832             if(otuCounts[i] > 0){
1833                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
1834                 
1835                 for(int j=1;j<nSeqsPerOTU[i];j++){
1836                     nameFile << ',' << seqNameVector[aaI[i][j]];
1837                 }
1838                 
1839                 nameFile << endl;
1840             }
1841         }
1842         nameFile.close();
1843         outputNames.push_back(nameFileName); outputTypes["name"].push_back(nameFileName);
1844         
1845         
1846         if(compositeNamesFileName != ""){
1847             m->appendFiles(nameFileName, compositeNamesFileName);
1848         }               
1849     }
1850     catch(exception& e) {
1851         m->errorOut(e, "ShhherCommand", "writeNames");
1852         exit(1);        
1853     }           
1854 }
1855
1856 /**************************************************************************************************/
1857
1858 void ShhherCommand::writeGroups(){
1859     try {
1860         string thisOutputDir = outputDir;
1861         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1862         string fileRoot = m->getRootName(m->getSimpleName(flowFileName));
1863         int pos = fileRoot.find_first_of('.');
1864         string fileGroup = fileRoot;
1865         if (pos != string::npos) {  fileGroup = fileRoot.substr(pos+1, (fileRoot.length()-1-(pos+1)));  }
1866         map<string, string> variables; 
1867                 variables["[filename]"] = thisOutputDir + fileRoot;
1868         string groupFileName = getOutputFileName("group",variables);
1869         ofstream groupFile;
1870         m->openOutputFile(groupFileName, groupFile);
1871         
1872         for(int i=0;i<numSeqs;i++){
1873             if (m->control_pressed) { break; }
1874             groupFile << seqNameVector[i] << '\t' << fileGroup << endl;
1875         }
1876         groupFile.close();
1877         outputNames.push_back(groupFileName); outputTypes["group"].push_back(groupFileName);
1878         
1879     }
1880     catch(exception& e) {
1881         m->errorOut(e, "ShhherCommand", "writeGroups");
1882         exit(1);        
1883     }           
1884 }
1885
1886 /**************************************************************************************************/
1887
1888 void ShhherCommand::writeClusters(vector<int> otuCounts){
1889     try {
1890         string thisOutputDir = outputDir;
1891         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1892         map<string, string> variables; 
1893                 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
1894         string otuCountsFileName = getOutputFileName("counts",variables);
1895         ofstream otuCountsFile;
1896         m->openOutputFile(otuCountsFileName, otuCountsFile);
1897         
1898         string bases = flowOrder;
1899         
1900         for(int i=0;i<numOTUs;i++){
1901             
1902             if (m->control_pressed) {
1903                 break;
1904             }
1905             //output the translated version of the centroid sequence for the otu
1906             if(otuCounts[i] > 0){
1907                 int index = centroids[i];
1908                 
1909                 otuCountsFile << "ideal\t";
1910                 for(int j=8;j<numFlowCells;j++){
1911                     char base = bases[j % bases.length()];
1912                     for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
1913                         otuCountsFile << base;
1914                     }
1915                 }
1916                 otuCountsFile << endl;
1917                 
1918                 for(int j=0;j<nSeqsPerOTU[i];j++){
1919                     int sequence = aaI[i][j];
1920                     otuCountsFile << seqNameVector[sequence] << '\t';
1921                     
1922                     string newSeq = "";
1923                     
1924                     for(int k=0;k<lengths[sequence];k++){
1925                         char base = bases[k % bases.length()];
1926                         int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
1927                         
1928                         for(int s=0;s<freq;s++){
1929                             newSeq += base;
1930                             //otuCountsFile << base;
1931                         }
1932                     }
1933                     otuCountsFile << newSeq.substr(4) << endl;
1934                 }
1935                 otuCountsFile << endl;
1936             }
1937         }
1938         otuCountsFile.close();
1939         outputNames.push_back(otuCountsFileName); outputTypes["counts"].push_back(otuCountsFileName);
1940         
1941     }
1942     catch(exception& e) {
1943         m->errorOut(e, "ShhherCommand", "writeClusters");
1944         exit(1);        
1945     }           
1946 }
1947
1948 #else
1949 //**********************************************************************************************************************
1950
1951 int ShhherCommand::execute(){
1952         try {
1953                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
1954                 
1955                 getSingleLookUp();      if (m->control_pressed) { return 0; }
1956                 getJointLookUp();       if (m->control_pressed) { return 0; }
1957                 
1958         int numFiles = flowFileVector.size();
1959                 
1960         if (numFiles < processors) { processors = numFiles; }
1961         
1962 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1963         if (processors == 1) { driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName); }
1964         else { createProcesses(flowFileVector); } //each processor processes one file
1965 #else
1966         driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName);
1967 #endif
1968         
1969                 if(compositeFASTAFileName != ""){
1970                         outputNames.push_back(compositeFASTAFileName); outputTypes["fasta"].push_back(compositeFASTAFileName);
1971                         outputNames.push_back(compositeNamesFileName); outputTypes["name"].push_back(compositeNamesFileName);
1972                 }
1973
1974                 m->mothurOutEndLine();
1975                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
1976                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
1977                 m->mothurOutEndLine();
1978                 
1979                 return 0;
1980         }
1981         catch(exception& e) {
1982                 m->errorOut(e, "ShhherCommand", "execute");
1983                 exit(1);
1984         }
1985 }
1986 #endif
1987 //********************************************************************************************************************
1988 //sorts biggest to smallest
1989 inline bool compareFileSizes(string left, string right){
1990     
1991     FILE * pFile;
1992     long leftsize = 0;
1993     
1994     //get num bytes in file
1995     string filename = left;
1996     pFile = fopen (filename.c_str(),"rb");
1997     string error = "Error opening " + filename;
1998     if (pFile==NULL) perror (error.c_str());
1999     else{
2000         fseek (pFile, 0, SEEK_END);
2001         leftsize=ftell (pFile);
2002         fclose (pFile);
2003     }
2004     
2005     FILE * pFile2;
2006     long rightsize = 0;
2007     
2008     //get num bytes in file
2009     filename = right;
2010     pFile2 = fopen (filename.c_str(),"rb");
2011     error = "Error opening " + filename;
2012     if (pFile2==NULL) perror (error.c_str());
2013     else{
2014         fseek (pFile2, 0, SEEK_END);
2015         rightsize=ftell (pFile2);
2016         fclose (pFile2);
2017     }
2018     
2019     return (leftsize > rightsize);      
2020
2021 /**************************************************************************************************/
2022
2023 int ShhherCommand::createProcesses(vector<string> filenames){
2024     try {
2025         vector<int> processIDS;
2026                 int process = 1;
2027                 int num = 0;
2028                 
2029                 //sanity check
2030                 if (filenames.size() < processors) { processors = filenames.size(); }
2031         
2032         //sort file names by size to divide load better
2033         sort(filenames.begin(), filenames.end(), compareFileSizes);
2034         
2035         vector < vector <string> > dividedFiles; //dividedFiles[1] = vector of filenames for process 1...
2036         dividedFiles.resize(processors);
2037         
2038         //for each file, figure out which process will complete it
2039         //want to divide the load intelligently so the big files are spread between processes
2040         for (int i = 0; i < filenames.size(); i++) { 
2041             int processToAssign = (i+1) % processors; 
2042             if (processToAssign == 0) { processToAssign = processors; }
2043             
2044             dividedFiles[(processToAssign-1)].push_back(filenames[i]);
2045         }
2046         
2047         //now lets reverse the order of ever other process, so we balance big files running with little ones
2048         for (int i = 0; i < processors; i++) {
2049             int remainder = ((i+1) % processors);
2050             if (remainder) {  reverse(dividedFiles[i].begin(), dividedFiles[i].end());  }
2051         }
2052         
2053                         
2054                 //divide the groups between the processors
2055                 /*vector<linePair> lines;
2056         vector<int> numFilesToComplete;
2057                 int numFilesPerProcessor = filenames.size() / processors;
2058                 for (int i = 0; i < processors; i++) {
2059                         int startIndex =  i * numFilesPerProcessor;
2060                         int endIndex = (i+1) * numFilesPerProcessor;
2061                         if(i == (processors - 1)){      endIndex = filenames.size();    }
2062                         lines.push_back(linePair(startIndex, endIndex));
2063             numFilesToComplete.push_back((endIndex-startIndex));
2064                 }*/
2065                 
2066         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)          
2067                 
2068                 //loop through and create all the processes you want
2069                 while (process != processors) {
2070                         int pid = fork();
2071                         
2072                         if (pid > 0) {
2073                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
2074                                 process++;
2075                         }else if (pid == 0){
2076                                 num = driver(dividedFiles[process], compositeFASTAFileName + toString(getpid()) + ".temp", compositeNamesFileName  + toString(getpid()) + ".temp");
2077                 
2078                 //pass numSeqs to parent
2079                                 ofstream out;
2080                                 string tempFile = compositeFASTAFileName + toString(getpid()) + ".num.temp";
2081                                 m->openOutputFile(tempFile, out);
2082                                 out << num << endl;
2083                                 out.close();
2084                 
2085                                 exit(0);
2086                         }else { 
2087                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
2088                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
2089                                 exit(0);
2090                         }
2091                 }
2092                 
2093                 //do my part
2094                 driver(dividedFiles[0], compositeFASTAFileName, compositeNamesFileName);
2095                 
2096                 //force parent to wait until all the processes are done
2097                 for (int i=0;i<processIDS.size();i++) { 
2098                         int temp = processIDS[i];
2099                         wait(&temp);
2100                 }
2101         
2102         #else
2103         
2104         //////////////////////////////////////////////////////////////////////////////////////////////////////
2105         
2106         /////////////////////// NOT WORKING, ACCESS VIOLATION ON READ OF FLOWGRAMS IN THREAD /////////////////
2107         
2108         //////////////////////////////////////////////////////////////////////////////////////////////////////
2109                 //Windows version shared memory, so be careful when passing variables through the shhhFlowsData struct. 
2110                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
2111                 //////////////////////////////////////////////////////////////////////////////////////////////////////
2112                 /*
2113                 vector<shhhFlowsData*> pDataArray; 
2114                 DWORD   dwThreadIdArray[processors-1];
2115                 HANDLE  hThreadArray[processors-1]; 
2116                 
2117                 //Create processor worker threads.
2118                 for( int i=0; i<processors-1; i++ ){
2119                         // Allocate memory for thread data.
2120                         string extension = "";
2121                         if (i != 0) { extension = toString(i) + ".temp"; }
2122                         
2123             shhhFlowsData* tempFlow = new shhhFlowsData(filenames, (compositeFASTAFileName + extension), (compositeNamesFileName + extension), outputDir, flowOrder, jointLookUp, singleLookUp, m, lines[i].start, lines[i].end, cutoff, sigma, minDelta, maxIters, i);
2124                         pDataArray.push_back(tempFlow);
2125                         processIDS.push_back(i);
2126             
2127                         hThreadArray[i] = CreateThread(NULL, 0, ShhhFlowsThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);   
2128                 }
2129                 
2130                 //using the main process as a worker saves time and memory
2131                 //do my part
2132                 driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[processors-1].start, lines[processors-1].end);
2133                 
2134                 //Wait until all threads have terminated.
2135                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
2136                 
2137                 //Close all thread handles and free memory allocations.
2138                 for(int i=0; i < pDataArray.size(); i++){
2139                         for(int j=0; j < pDataArray[i]->outputNames.size(); j++){ outputNames.push_back(pDataArray[i]->outputNames[j]); }
2140                         CloseHandle(hThreadArray[i]);
2141                         delete pDataArray[i];
2142                 }
2143                 */
2144         #endif
2145         
2146         for (int i=0;i<processIDS.size();i++) { 
2147             ifstream in;
2148                         string tempFile =  compositeFASTAFileName + toString(processIDS[i]) + ".num.temp";
2149                         m->openInputFile(tempFile, in);
2150                         if (!in.eof()) { 
2151                 int tempNum = 0; 
2152                 in >> tempNum; 
2153                 if (tempNum != dividedFiles[i+1].size()) {
2154                     m->mothurOut("[ERROR]: main process expected " + toString(processIDS[i]) + " to complete " + toString(dividedFiles[i+1].size()) + " files, and it only reported completing " + toString(tempNum) + ". This will cause file mismatches.  The flow files may be too large to process with multiple processors. \n");
2155                 }
2156             }
2157                         in.close(); m->mothurRemove(tempFile);
2158             
2159             if (compositeFASTAFileName != "") {
2160                 m->appendFiles((compositeFASTAFileName + toString(processIDS[i]) + ".temp"), compositeFASTAFileName);
2161                 m->appendFiles((compositeNamesFileName + toString(processIDS[i]) + ".temp"), compositeNamesFileName);
2162                 m->mothurRemove((compositeFASTAFileName + toString(processIDS[i]) + ".temp"));
2163                 m->mothurRemove((compositeNamesFileName + toString(processIDS[i]) + ".temp"));
2164             }
2165         }
2166         
2167         return 0;
2168         
2169     }
2170         catch(exception& e) {
2171                 m->errorOut(e, "ShhherCommand", "createProcesses");
2172                 exit(1);
2173         }
2174 }
2175 /**************************************************************************************************/
2176
2177 vector<string> ShhherCommand::parseFlowFiles(string filename){
2178     try {
2179         vector<string> files;
2180         int count = 0;
2181         
2182         ifstream in;
2183         m->openInputFile(filename, in);
2184         
2185         int thisNumFLows = 0;
2186         in >> thisNumFLows; m->gobble(in);
2187         
2188         while (!in.eof()) {
2189             if (m->control_pressed) { break; }
2190             
2191             ofstream out;
2192             string outputFileName = filename + toString(count) + ".temp";
2193             m->openOutputFile(outputFileName, out);
2194             out << thisNumFLows << endl;
2195             files.push_back(outputFileName);
2196             
2197             int numLinesWrote = 0;
2198             for (int i = 0; i < largeSize; i++) {
2199                 if (in.eof()) { break; }
2200                 string line = m->getline(in); m->gobble(in);
2201                 out << line << endl;
2202                 numLinesWrote++;
2203             }
2204             out.close();
2205             
2206             if (numLinesWrote == 0) {  m->mothurRemove(outputFileName); files.pop_back();  }
2207             count++;
2208         }
2209         in.close();
2210         
2211         if (m->control_pressed) { for (int i = 0; i < files.size(); i++) { m->mothurRemove(files[i]); }  files.clear(); }
2212         
2213         m->mothurOut("\nDivided " + filename + " into " + toString(files.size()) + " files.\n\n"); 
2214         
2215         return files;
2216     }
2217         catch(exception& e) {
2218                 m->errorOut(e, "ShhherCommand", "parseFlowFiles");
2219                 exit(1);
2220         }
2221 }
2222 /**************************************************************************************************/
2223
2224 int ShhherCommand::driver(vector<string> filenames, string thisCompositeFASTAFileName, string thisCompositeNamesFileName){
2225     try {
2226         
2227         int numCompleted = 0;
2228         
2229         for(int i=0;i<filenames.size();i++){
2230                         
2231                         if (m->control_pressed) { break; }
2232                         
2233             vector<string> theseFlowFileNames; theseFlowFileNames.push_back(filenames[i]);
2234             if (large) {  theseFlowFileNames = parseFlowFiles(filenames[i]);  }
2235             
2236             if (m->control_pressed) { break; }
2237             
2238             double begClock = clock();
2239             unsigned long long begTime;
2240             
2241             string fileNameForOutput = filenames[i];
2242             
2243             for (int g = 0; g < theseFlowFileNames.size(); g++) {
2244                 
2245                 string flowFileName = theseFlowFileNames[g];
2246                 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(filenames.size()) + ")\t<<<<<\n");
2247                 m->mothurOut("Reading flowgrams...\n");
2248                 
2249                 vector<string> seqNameVector;
2250                 vector<int> lengths;
2251                 vector<short> flowDataIntI;
2252                 vector<double> flowDataPrI;
2253                 map<string, int> nameMap;
2254                 vector<short> uniqueFlowgrams;
2255                 vector<int> uniqueCount;
2256                 vector<int> mapSeqToUnique;
2257                 vector<int> mapUniqueToSeq;
2258                 vector<int> uniqueLengths;
2259                 int numFlowCells;
2260                 
2261                 if (m->debug) { m->mothurOut("[DEBUG]: About to read flowgrams.\n"); }
2262                 int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells);
2263                 
2264                 if (m->control_pressed) { break; }
2265                 
2266                 m->mothurOut("Identifying unique flowgrams...\n");
2267                 int numUniques = getUniques(numSeqs, numFlowCells, uniqueFlowgrams, uniqueCount, uniqueLengths, mapSeqToUnique, mapUniqueToSeq, lengths, flowDataPrI, flowDataIntI);
2268                 
2269                 if (m->control_pressed) { break; }
2270                 
2271                 m->mothurOut("Calculating distances between flowgrams...\n");
2272                 string distFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
2273                 begTime = time(NULL);
2274                
2275                 
2276                 flowDistParentFork(numFlowCells, distFileName, numUniques, mapUniqueToSeq, mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
2277                 
2278                 m->mothurOutEndLine();
2279                 m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
2280                 
2281                 
2282                 string namesFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
2283                 createNamesFile(numSeqs, numUniques, namesFileName, seqNameVector, mapSeqToUnique, mapUniqueToSeq);
2284                 
2285                 if (m->control_pressed) { break; }
2286                 
2287                 m->mothurOut("\nClustering flowgrams...\n");
2288                 string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
2289                 cluster(listFileName, distFileName, namesFileName);
2290                 
2291                 if (m->control_pressed) { break; }
2292                 
2293                 vector<int> otuData;
2294                 vector<int> cumNumSeqs;
2295                 vector<int> nSeqsPerOTU;
2296                 vector<vector<int> > aaP;       //tMaster->aanP:        each row is a different otu / each col contains the sequence indices
2297                 vector<vector<int> > aaI;       //tMaster->aanI:        that are in each otu - can't differentiate between aaP and aaI 
2298                 vector<int> seqNumber;          //tMaster->anP:         the sequence id number sorted by OTU
2299                 vector<int> seqIndex;           //tMaster->anI;         the index that corresponds to seqNumber
2300                 
2301                 
2302                 int numOTUs = getOTUData(numSeqs, listFileName, otuData, cumNumSeqs, nSeqsPerOTU, aaP, aaI, seqNumber, seqIndex, nameMap);
2303                 
2304                 if (m->control_pressed) { break; }
2305                 
2306                 m->mothurRemove(distFileName);
2307                 m->mothurRemove(namesFileName);
2308                 m->mothurRemove(listFileName);
2309                 
2310                 vector<double> dist;            //adDist - distance of sequences to centroids
2311                 vector<short> change;           //did the centroid sequence change? 0 = no; 1 = yes
2312                 vector<int> centroids;          //the representative flowgram for each cluster m
2313                 vector<double> weight;
2314                 vector<double> singleTau;       //tMaster->adTau:       1-D Tau vector (1xnumSeqs)
2315                 vector<int> nSeqsBreaks;
2316                 vector<int> nOTUsBreaks;
2317                 
2318                 if (m->debug) { m->mothurOut("[DEBUG]: numSeqs = " + toString(numSeqs) + " numOTUS = " + toString(numOTUs) + " about to alloc a dist vector with size = " + toString((numSeqs * numOTUs)) + ".\n"); }
2319                 
2320                 dist.assign(numSeqs * numOTUs, 0);
2321                 change.assign(numOTUs, 1);
2322                 centroids.assign(numOTUs, -1);
2323                 weight.assign(numOTUs, 0);
2324                 singleTau.assign(numSeqs, 1.0);
2325                 
2326                 nSeqsBreaks.assign(2, 0);
2327                 nOTUsBreaks.assign(2, 0);
2328                 
2329                 nSeqsBreaks[0] = 0;
2330                 nSeqsBreaks[1] = numSeqs;
2331                 nOTUsBreaks[1] = numOTUs;
2332                 
2333                 if (m->debug) { m->mothurOut("[DEBUG]: done allocating memory, about to denoise.\n"); }
2334                 
2335                 if (m->control_pressed) { break; }
2336                 
2337                 double maxDelta = 0;
2338                 int iter = 0;
2339                 
2340                 begClock = clock();
2341                 begTime = time(NULL);
2342                 
2343                 m->mothurOut("\nDenoising flowgrams...\n");
2344                 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
2345                 
2346                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
2347                     
2348                     if (m->control_pressed) { break; }
2349                     
2350                     double cycClock = clock();
2351                     unsigned long long cycTime = time(NULL);
2352                     fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2353                     
2354                     if (m->control_pressed) { break; }
2355                     
2356                     calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
2357                     
2358                     if (m->control_pressed) { break; }
2359                     
2360                     maxDelta = getNewWeights(numOTUs, cumNumSeqs, nSeqsPerOTU, singleTau, seqNumber, weight);  
2361                     
2362                     if (m->control_pressed) { break; }
2363                     
2364                     double nLL = getLikelihood(numSeqs, numOTUs, nSeqsPerOTU, seqNumber, cumNumSeqs, seqIndex, dist, weight); 
2365                     
2366                     if (m->control_pressed) { break; }
2367                     
2368                     checkCentroids(numOTUs, centroids, weight);
2369                     
2370                     if (m->control_pressed) { break; }
2371                     
2372                     calcNewDistances(numSeqs, numOTUs, nSeqsPerOTU,  dist, weight, change, centroids, aaP, singleTau, aaI, seqNumber, seqIndex, uniqueFlowgrams, flowDataIntI, numFlowCells, lengths);
2373                     
2374                     if (m->control_pressed) { break; }
2375                     
2376                     iter++;
2377                     
2378                     m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
2379                     
2380                 }       
2381                 
2382                 if (m->control_pressed) { break; }
2383                 
2384                 m->mothurOut("\nFinalizing...\n");
2385                 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2386                 
2387                 if (m->control_pressed) { break; }
2388                 
2389                 setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI);
2390                 
2391                 if (m->control_pressed) { break; }
2392                 
2393                 vector<int> otuCounts(numOTUs, 0);
2394                 for(int j=0;j<numSeqs;j++)      {       otuCounts[otuData[j]]++;        }
2395                 
2396                 calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber); 
2397                 
2398                 if (m->control_pressed) { break; }
2399                 
2400                 if ((large) && (g == 0)) {  flowFileName = filenames[i]; theseFlowFileNames[0] = filenames[i]; }
2401                 string thisOutputDir = outputDir;
2402                 if (outputDir == "") {  thisOutputDir = m->hasPath(flowFileName);  }
2403                 map<string, string> variables; 
2404                 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
2405                 string qualityFileName = getOutputFileName("qfile",variables);
2406                 string fastaFileName = getOutputFileName("fasta",variables);
2407                 string nameFileName = getOutputFileName("name",variables);
2408                 string otuCountsFileName = getOutputFileName("counts",variables);
2409                 string fileRoot = m->getRootName(m->getSimpleName(flowFileName));
2410                 int pos = fileRoot.find_first_of('.');
2411                 string fileGroup = fileRoot;
2412                 if (pos != string::npos) {  fileGroup = fileRoot.substr(pos+1, (fileRoot.length()-1-(pos+1)));  }
2413                 string groupFileName = getOutputFileName("group",variables);
2414
2415                 
2416                 writeQualities(numOTUs, numFlowCells, qualityFileName, otuCounts, nSeqsPerOTU, seqNumber, singleTau, flowDataIntI, uniqueFlowgrams, cumNumSeqs, mapUniqueToSeq, seqNameVector, centroids, aaI); if (m->control_pressed) { break; }
2417                 writeSequences(thisCompositeFASTAFileName, numOTUs, numFlowCells, fastaFileName, otuCounts, uniqueFlowgrams, seqNameVector, aaI, centroids);if (m->control_pressed) { break; }
2418                 writeNames(thisCompositeNamesFileName, numOTUs, nameFileName, otuCounts, seqNameVector, aaI, nSeqsPerOTU);                              if (m->control_pressed) { break; }
2419                 writeClusters(otuCountsFileName, numOTUs, numFlowCells,otuCounts, centroids, uniqueFlowgrams, seqNameVector, aaI, nSeqsPerOTU, lengths, flowDataIntI);                  if (m->control_pressed) { break; }
2420                 writeGroups(groupFileName, fileGroup, numSeqs, seqNameVector);                                          if (m->control_pressed) { break; }
2421                 
2422                 if (large) {
2423                     if (g > 0) {
2424                         variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0]));
2425                         m->appendFiles(qualityFileName, getOutputFileName("qfile",variables));
2426                         m->mothurRemove(qualityFileName);
2427                         m->appendFiles(fastaFileName, getOutputFileName("fasta",variables));
2428                         m->mothurRemove(fastaFileName);
2429                         m->appendFiles(nameFileName, getOutputFileName("name",variables));
2430                         m->mothurRemove(nameFileName);
2431                         m->appendFiles(otuCountsFileName, getOutputFileName("counts",variables));
2432                         m->mothurRemove(otuCountsFileName);
2433                         m->appendFiles(groupFileName, getOutputFileName("group",variables));
2434                         m->mothurRemove(groupFileName);
2435                     }
2436                     m->mothurRemove(theseFlowFileNames[g]);
2437                 }
2438                         }
2439             
2440             numCompleted++;
2441                         m->mothurOut("Total time to process " + fileNameForOutput + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
2442                 }
2443                 
2444         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
2445         
2446         return numCompleted;
2447         
2448     }catch(exception& e) {
2449             m->errorOut(e, "ShhherCommand", "driver");
2450             exit(1);
2451     }
2452 }
2453
2454 /**************************************************************************************************/
2455 int ShhherCommand::getFlowData(string filename, vector<string>& thisSeqNameVector, vector<int>& thisLengths, vector<short>& thisFlowDataIntI, map<string, int>& thisNameMap, int& numFlowCells){
2456         try{
2457        
2458                 ifstream flowFile;
2459        
2460                 m->openInputFile(filename, flowFile);
2461                 
2462                 string seqName;
2463                 int currentNumFlowCells;
2464                 float intensity;
2465         thisSeqNameVector.clear();
2466                 thisLengths.clear();
2467                 thisFlowDataIntI.clear();
2468                 thisNameMap.clear();
2469                 
2470                 string numFlowTest;
2471         flowFile >> numFlowTest;
2472         
2473         if (!m->isContainingOnlyDigits(numFlowTest)) { m->mothurOut("[ERROR]: expected a number and got " + numFlowTest + ", quitting. Did you use the flow parameter instead of the file parameter?"); m->mothurOutEndLine(); exit(1); }
2474         else { convert(numFlowTest, numFlowCells); }
2475         
2476         if (m->debug) { m->mothurOut("[DEBUG]: numFlowCells = " + toString(numFlowCells) + ".\n"); }
2477                 int index = 0;//pcluster
2478                 while(!flowFile.eof()){
2479                         
2480                         if (m->control_pressed) { break; }
2481                         
2482                         flowFile >> seqName >> currentNumFlowCells;
2483             
2484                         thisLengths.push_back(currentNumFlowCells);
2485            
2486                         thisSeqNameVector.push_back(seqName);
2487                         thisNameMap[seqName] = index++;//pcluster
2488             
2489             if (m->debug) { m->mothurOut("[DEBUG]: seqName = " + seqName + " length = " + toString(currentNumFlowCells) + " index = " + toString(index) + "\n"); }
2490             
2491                         for(int i=0;i<numFlowCells;i++){
2492                                 flowFile >> intensity;
2493                                 if(intensity > 9.99)    {       intensity = 9.99;       }
2494                                 int intI = int(100 * intensity + 0.0001);
2495                                 thisFlowDataIntI.push_back(intI);
2496                         }
2497                         m->gobble(flowFile);
2498                 }
2499                 flowFile.close();
2500                 
2501                 int numSeqs = thisSeqNameVector.size();         
2502                 
2503                 for(int i=0;i<numSeqs;i++){
2504                         
2505                         if (m->control_pressed) { break; }
2506                         
2507                         int iNumFlowCells = i * numFlowCells;
2508                         for(int j=thisLengths[i];j<numFlowCells;j++){
2509                                 thisFlowDataIntI[iNumFlowCells + j] = 0;
2510                         }
2511                 }
2512         
2513         return numSeqs;
2514                 
2515         }
2516         catch(exception& e) {
2517                 m->errorOut(e, "ShhherCommand", "getFlowData");
2518                 exit(1);
2519         }
2520 }
2521 /**************************************************************************************************/
2522
2523 int ShhherCommand::flowDistParentFork(int numFlowCells, string distFileName, int stopSeq, vector<int>& mapUniqueToSeq, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2524         try{            
2525         
2526                 ostringstream outStream;
2527                 outStream.setf(ios::fixed, ios::floatfield);
2528                 outStream.setf(ios::dec, ios::basefield);
2529                 outStream.setf(ios::showpoint);
2530                 outStream.precision(6);
2531                 
2532                 int begTime = time(NULL);
2533                 double begClock = clock();
2534         
2535                 for(int i=0;i<stopSeq;i++){
2536                         
2537                         if (m->control_pressed) { break; }
2538                         
2539                         for(int j=0;j<i;j++){
2540                                 float flowDistance = calcPairwiseDist(numFlowCells, mapUniqueToSeq[i], mapUniqueToSeq[j], mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
2541                 
2542                                 if(flowDistance < 1e-6){
2543                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
2544                                 }
2545                                 else if(flowDistance <= cutoff){
2546                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
2547                                 }
2548                         }
2549                         if(i % 100 == 0){
2550                                 m->mothurOutJustToScreen(toString(i) + "\t" + toString(time(NULL) - begTime));
2551                                 m->mothurOutJustToScreen("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC)+"\n");
2552                         }
2553                 }
2554                 
2555                 ofstream distFile(distFileName.c_str());
2556                 distFile << outStream.str();            
2557                 distFile.close();
2558                 
2559                 if (m->control_pressed) {}
2560                 else {
2561                         m->mothurOutJustToScreen(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
2562                         m->mothurOutJustToScreen("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC)+"\n");
2563                 }
2564         
2565         return 0;
2566         }
2567         catch(exception& e) {
2568                 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
2569                 exit(1);
2570         }
2571 }
2572 /**************************************************************************************************/
2573
2574 float ShhherCommand::calcPairwiseDist(int numFlowCells, int seqA, int seqB, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2575         try{
2576                 int minLength = lengths[mapSeqToUnique[seqA]];
2577                 if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
2578                 
2579                 int ANumFlowCells = seqA * numFlowCells;
2580                 int BNumFlowCells = seqB * numFlowCells;
2581                 
2582                 float dist = 0;
2583                 
2584                 for(int i=0;i<minLength;i++){
2585                         
2586                         if (m->control_pressed) { break; }
2587                         
2588                         int flowAIntI = flowDataIntI[ANumFlowCells + i];
2589                         float flowAPrI = flowDataPrI[ANumFlowCells + i];
2590                         
2591                         int flowBIntI = flowDataIntI[BNumFlowCells + i];
2592                         float flowBPrI = flowDataPrI[BNumFlowCells + i];
2593                         dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
2594                 }
2595                 
2596                 dist /= (float) minLength;
2597                 return dist;
2598         }
2599         catch(exception& e) {
2600                 m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
2601                 exit(1);
2602         }
2603 }
2604
2605 /**************************************************************************************************/
2606
2607 int ShhherCommand::getUniques(int numSeqs, int numFlowCells, vector<short>& uniqueFlowgrams, vector<int>& uniqueCount, vector<int>& uniqueLengths, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2608         try{
2609                 int numUniques = 0;
2610                 uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
2611                 uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
2612                 uniqueLengths.assign(numSeqs, 0);
2613                 mapSeqToUnique.assign(numSeqs, -1);
2614                 mapUniqueToSeq.assign(numSeqs, -1);
2615                 
2616                 vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
2617                 
2618                 for(int i=0;i<numSeqs;i++){
2619                         
2620                         if (m->control_pressed) { break; }
2621                         
2622                         int index = 0;
2623                         
2624                         vector<short> current(numFlowCells);
2625                         for(int j=0;j<numFlowCells;j++){
2626                                 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
2627                         }
2628             
2629                         for(int j=0;j<numUniques;j++){
2630                                 int offset = j * numFlowCells;
2631                                 bool toEnd = 1;
2632                                 
2633                                 int shorterLength;
2634                                 if(lengths[i] < uniqueLengths[j])       {       shorterLength = lengths[i];                     }
2635                                 else                                                            {       shorterLength = uniqueLengths[j];       }
2636                 
2637                                 for(int k=0;k<shorterLength;k++){
2638                                         if(current[k] != uniqueFlowgrams[offset + k]){
2639                                                 toEnd = 0;
2640                                                 break;
2641                                         }
2642                                 }
2643                                 
2644                                 if(toEnd){
2645                                         mapSeqToUnique[i] = j;
2646                                         uniqueCount[j]++;
2647                                         index = j;
2648                                         if(lengths[i] > uniqueLengths[j])       {       uniqueLengths[j] = lengths[i];  }
2649                                         break;
2650                                 }
2651                                 index++;
2652                         }
2653                         
2654                         if(index == numUniques){
2655                                 uniqueLengths[numUniques] = lengths[i];
2656                                 uniqueCount[numUniques] = 1;
2657                                 mapSeqToUnique[i] = numUniques;//anMap
2658                                 mapUniqueToSeq[numUniques] = i;//anF
2659                                 
2660                                 for(int k=0;k<numFlowCells;k++){
2661                                         uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
2662                                         uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
2663                                 }
2664                                 
2665                                 numUniques++;
2666                         }
2667                 }
2668                 uniqueFlowDataIntI.resize(numFlowCells * numUniques);
2669                 uniqueLengths.resize(numUniques);       
2670                 
2671                 flowDataPrI.resize(numSeqs * numFlowCells, 0);
2672                 for(int i=0;i<flowDataPrI.size();i++)   {       if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);          }
2673         
2674         return numUniques;
2675         }
2676         catch(exception& e) {
2677                 m->errorOut(e, "ShhherCommand", "getUniques");
2678                 exit(1);
2679         }
2680 }
2681 /**************************************************************************************************/
2682 int ShhherCommand::createNamesFile(int numSeqs, int numUniques, string filename, vector<string>& seqNameVector, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq){
2683         try{
2684                 
2685                 vector<string> duplicateNames(numUniques, "");
2686                 for(int i=0;i<numSeqs;i++){
2687                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
2688                 }
2689                 
2690                 ofstream nameFile;
2691                 m->openOutputFile(filename, nameFile);
2692                 
2693                 for(int i=0;i<numUniques;i++){
2694                         
2695                         if (m->control_pressed) { break; }
2696                         
2697             //                  nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2698                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2699                 }
2700                 
2701                 nameFile.close();
2702         
2703                 return 0;
2704         }
2705         catch(exception& e) {
2706                 m->errorOut(e, "ShhherCommand", "createNamesFile");
2707                 exit(1);
2708         }
2709 }
2710 //**********************************************************************************************************************
2711
2712 int ShhherCommand::cluster(string filename, string distFileName, string namesFileName){
2713         try {
2714                 
2715                 ReadMatrix* read = new ReadColumnMatrix(distFileName);  
2716                 read->setCutoff(cutoff);
2717                 
2718                 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
2719                 clusterNameMap->readMap();
2720                 read->read(clusterNameMap);
2721         
2722                 ListVector* list = read->getListVector();
2723                 SparseDistanceMatrix* matrix = read->getDMatrix();
2724                 
2725                 delete read; 
2726                 delete clusterNameMap; 
2727         
2728                 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
2729                 
2730                 Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); 
2731                 string tag = cluster->getTag();
2732                 
2733                 double clusterCutoff = cutoff;
2734                 while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
2735                         
2736                         if (m->control_pressed) { break; }
2737                         
2738                         cluster->update(clusterCutoff);
2739                 }
2740                 
2741                 list->setLabel(toString(cutoff));
2742                 
2743                 ofstream listFile;
2744                 m->openOutputFile(filename, listFile);
2745                 list->print(listFile);
2746                 listFile.close();
2747                 
2748                 delete matrix;  delete cluster; delete rabund; delete list;
2749         
2750                 return 0;
2751         }
2752         catch(exception& e) {
2753                 m->errorOut(e, "ShhherCommand", "cluster");
2754                 exit(1);        
2755         }               
2756 }
2757 /**************************************************************************************************/
2758
2759 int ShhherCommand::getOTUData(int numSeqs, string fileName,  vector<int>& otuData,
2760                                vector<int>& cumNumSeqs,
2761                                vector<int>& nSeqsPerOTU,
2762                                vector<vector<int> >& aaP,       //tMaster->aanP:        each row is a different otu / each col contains the sequence indices
2763                                vector<vector<int> >& aaI,       //tMaster->aanI:        that are in each otu - can't differentiate between aaP and aaI 
2764                                vector<int>& seqNumber,          //tMaster->anP:         the sequence id number sorted by OTU
2765                                vector<int>& seqIndex,
2766                                map<string, int>& nameMap){
2767         try {
2768         
2769                 ifstream listFile;
2770                 m->openInputFile(fileName, listFile);
2771                 string label;
2772         int numOTUs;
2773                 
2774                 listFile >> label >> numOTUs;
2775         
2776         if (m->debug) { m->mothurOut("[DEBUG]: Getting OTU Data...\n"); }
2777         
2778                 otuData.assign(numSeqs, 0);
2779                 cumNumSeqs.assign(numOTUs, 0);
2780                 nSeqsPerOTU.assign(numOTUs, 0);
2781                 aaP.clear();aaP.resize(numOTUs);
2782                 
2783                 seqNumber.clear();
2784                 aaI.clear();
2785                 seqIndex.clear();
2786                 
2787                 string singleOTU = "";
2788                 
2789                 for(int i=0;i<numOTUs;i++){
2790                         
2791                         if (m->control_pressed) { break; }
2792             if (m->debug) { m->mothurOut("[DEBUG]: processing OTU " + toString(i) + ".\n"); }
2793             
2794                         listFile >> singleOTU;
2795                         
2796                         istringstream otuString(singleOTU);
2797             
2798                         while(otuString){
2799                                 
2800                                 string seqName = "";
2801                                 
2802                                 for(int j=0;j<singleOTU.length();j++){
2803                                         char letter = otuString.get();
2804                                         
2805                                         if(letter != ','){
2806                                                 seqName += letter;
2807                                         }
2808                                         else{
2809                                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
2810                                                 int index = nmIt->second;
2811                                                 
2812                                                 nameMap.erase(nmIt);
2813                                                 
2814                                                 otuData[index] = i;
2815                                                 nSeqsPerOTU[i]++;
2816                                                 aaP[i].push_back(index);
2817                                                 seqName = "";
2818                                         }
2819                                 }
2820                                 
2821                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
2822                 
2823                                 int index = nmIt->second;
2824                                 nameMap.erase(nmIt);
2825                 
2826                                 otuData[index] = i;
2827                                 nSeqsPerOTU[i]++;
2828                                 aaP[i].push_back(index);        
2829                                 
2830                                 otuString.get();
2831                         }
2832                         
2833                         sort(aaP[i].begin(), aaP[i].end());
2834                         for(int j=0;j<nSeqsPerOTU[i];j++){
2835                                 seqNumber.push_back(aaP[i][j]);
2836                         }
2837                         for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
2838                                 aaP[i].push_back(0);
2839                         }
2840                         
2841                         
2842                 }
2843                 
2844                 for(int i=1;i<numOTUs;i++){
2845                         cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
2846                 }
2847                 aaI = aaP;
2848                 seqIndex = seqNumber;
2849                 
2850                 listFile.close();       
2851         
2852         return numOTUs;
2853                 
2854         }
2855         catch(exception& e) {
2856                 m->errorOut(e, "ShhherCommand", "getOTUData");
2857                 exit(1);        
2858         }               
2859 }
2860 /**************************************************************************************************/
2861
2862 int ShhherCommand::calcCentroidsDriver(int numOTUs, 
2863                                           vector<int>& cumNumSeqs,
2864                                           vector<int>& nSeqsPerOTU,
2865                                           vector<int>& seqIndex,
2866                                           vector<short>& change,                //did the centroid sequence change? 0 = no; 1 = yes
2867                                           vector<int>& centroids,               //the representative flowgram for each cluster m
2868                                           vector<double>& singleTau,    //tMaster->adTau:       1-D Tau vector (1xnumSeqs)
2869                                           vector<int>& mapSeqToUnique,
2870                                           vector<short>& uniqueFlowgrams,
2871                                           vector<short>& flowDataIntI,
2872                                           vector<int>& lengths,
2873                                           int numFlowCells,
2874                                           vector<int>& seqNumber){                          
2875         
2876         //this function gets the most likely homopolymer length at a flow position for a group of sequences
2877         //within an otu
2878         
2879         try{
2880                 
2881                 for(int i=0;i<numOTUs;i++){
2882                         
2883                         if (m->control_pressed) { break; }
2884                         
2885                         double count = 0;
2886                         int position = 0;
2887                         int minFlowGram = 100000000;
2888                         double minFlowValue = 1e8;
2889                         change[i] = 0; //FALSE
2890                         
2891                         for(int j=0;j<nSeqsPerOTU[i];j++){
2892                                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
2893                         }
2894             
2895                         if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
2896                                 vector<double> adF(nSeqsPerOTU[i]);
2897                                 vector<int> anL(nSeqsPerOTU[i]);
2898                                 
2899                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2900                                         int index = cumNumSeqs[i] + j;
2901                                         int nI = seqIndex[index];
2902                                         int nIU = mapSeqToUnique[nI];
2903                                         
2904                                         int k;
2905                                         for(k=0;k<position;k++){
2906                                                 if(nIU == anL[k]){
2907                                                         break;
2908                                                 }
2909                                         }
2910                                         if(k == position){
2911                                                 anL[position] = nIU;
2912                                                 adF[position] = 0.0000;
2913                                                 position++;
2914                                         }                                               
2915                                 }
2916                                 
2917                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2918                                         int index = cumNumSeqs[i] + j;
2919                                         int nI = seqIndex[index];
2920                                         
2921                                         double tauValue = singleTau[seqNumber[index]];
2922                                         
2923                                         for(int k=0;k<position;k++){
2924                                                 double dist = getDistToCentroid(anL[k], nI, lengths[nI], uniqueFlowgrams, flowDataIntI, numFlowCells);
2925                                                 adF[k] += dist * tauValue;
2926                                         }
2927                                 }
2928                                 
2929                                 for(int j=0;j<position;j++){
2930                                         if(adF[j] < minFlowValue){
2931                                                 minFlowGram = j;
2932                                                 minFlowValue = adF[j];
2933                                         }
2934                                 }
2935                                 
2936                                 if(centroids[i] != anL[minFlowGram]){
2937                                         change[i] = 1;
2938                                         centroids[i] = anL[minFlowGram];
2939                                 }
2940                         }
2941                         else if(centroids[i] != -1){
2942                                 change[i] = 1;
2943                                 centroids[i] = -1;                      
2944                         }
2945                 }
2946         
2947         return 0;
2948         }
2949         catch(exception& e) {
2950                 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
2951                 exit(1);        
2952         }               
2953 }
2954 /**************************************************************************************************/
2955
2956 double ShhherCommand::getDistToCentroid(int cent, int flow, int length, vector<short>& uniqueFlowgrams,
2957                                         vector<short>& flowDataIntI, int numFlowCells){
2958         try{
2959                 
2960                 int flowAValue = cent * numFlowCells;
2961                 int flowBValue = flow * numFlowCells;
2962                 
2963                 double dist = 0;
2964         
2965                 for(int i=0;i<length;i++){
2966                         dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
2967                         flowAValue++;
2968                         flowBValue++;
2969                 }
2970                 
2971                 return dist / (double)length;
2972         }
2973         catch(exception& e) {
2974                 m->errorOut(e, "ShhherCommand", "getDistToCentroid");
2975                 exit(1);        
2976         }               
2977 }
2978 /**************************************************************************************************/
2979
2980 double ShhherCommand::getNewWeights(int numOTUs, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<double>& singleTau, vector<int>& seqNumber, vector<double>& weight){
2981         try{
2982                 
2983                 double maxChange = 0;
2984                 
2985                 for(int i=0;i<numOTUs;i++){
2986                         
2987                         if (m->control_pressed) { break; }
2988                         
2989                         double difference = weight[i];
2990                         weight[i] = 0;
2991                         
2992                         for(int j=0;j<nSeqsPerOTU[i];j++){
2993                                 int index = cumNumSeqs[i] + j;
2994                                 double tauValue = singleTau[seqNumber[index]];
2995                                 weight[i] += tauValue;
2996                         }
2997                         
2998                         difference = fabs(weight[i] - difference);
2999                         if(difference > maxChange){     maxChange = difference; }
3000                 }
3001                 return maxChange;
3002         }
3003         catch(exception& e) {
3004                 m->errorOut(e, "ShhherCommand", "getNewWeights");
3005                 exit(1);        
3006         }               
3007 }
3008
3009 /**************************************************************************************************/
3010
3011 double ShhherCommand::getLikelihood(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<int>& seqNumber, vector<int>& cumNumSeqs, vector<int>& seqIndex, vector<double>& dist, vector<double>& weight){
3012         
3013         try{
3014                 
3015                 vector<long double> P(numSeqs, 0);
3016                 int effNumOTUs = 0;
3017                 
3018                 for(int i=0;i<numOTUs;i++){
3019                         if(weight[i] > MIN_WEIGHT){
3020                                 effNumOTUs++;
3021                         }
3022                 }
3023                 
3024                 string hold;
3025                 for(int i=0;i<numOTUs;i++){
3026                         
3027                         if (m->control_pressed) { break; }
3028                         
3029                         for(int j=0;j<nSeqsPerOTU[i];j++){
3030                                 int index = cumNumSeqs[i] + j;
3031                                 int nI = seqIndex[index];
3032                                 double singleDist = dist[seqNumber[index]];
3033                                 
3034                                 P[nI] += weight[i] * exp(-singleDist * sigma);
3035                         }
3036                 }
3037                 double nLL = 0.00;
3038                 for(int i=0;i<numSeqs;i++){
3039                         if(P[i] == 0){  P[i] = DBL_EPSILON;     }
3040             
3041                         nLL += -log(P[i]);
3042                 }
3043                 
3044                 nLL = nLL -(double)numSeqs * log(sigma);
3045         
3046                 return nLL; 
3047         }
3048         catch(exception& e) {
3049                 m->errorOut(e, "ShhherCommand", "getNewWeights");
3050                 exit(1);        
3051         }               
3052 }
3053
3054 /**************************************************************************************************/
3055
3056 int ShhherCommand::checkCentroids(int numOTUs, vector<int>& centroids, vector<double>& weight){
3057         try{
3058                 vector<int> unique(numOTUs, 1);
3059                 
3060                 for(int i=0;i<numOTUs;i++){
3061                         if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
3062                                 unique[i] = -1;
3063                         }
3064                 }
3065                 
3066                 for(int i=0;i<numOTUs;i++){
3067                         
3068                         if (m->control_pressed) { break; }
3069                         
3070                         if(unique[i] == 1){
3071                                 for(int j=i+1;j<numOTUs;j++){
3072                                         if(unique[j] == 1){
3073                                                 
3074                                                 if(centroids[j] == centroids[i]){
3075                                                         unique[j] = 0;
3076                                                         centroids[j] = -1;
3077                                                         
3078                                                         weight[i] += weight[j];
3079                                                         weight[j] = 0.0;
3080                                                 }
3081                                         }
3082                                 }
3083                         }
3084                 }
3085         
3086         return 0;
3087         }
3088         catch(exception& e) {
3089                 m->errorOut(e, "ShhherCommand", "checkCentroids");
3090                 exit(1);        
3091         }               
3092 }
3093 /**************************************************************************************************/
3094
3095 void ShhherCommand::calcNewDistances(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<double>& dist, 
3096                                      vector<double>& weight, vector<short>& change, vector<int>& centroids,
3097                                      vector<vector<int> >& aaP, vector<double>& singleTau, vector<vector<int> >& aaI,   
3098                                      vector<int>& seqNumber, vector<int>& seqIndex,
3099                                      vector<short>& uniqueFlowgrams,
3100                                      vector<short>& flowDataIntI, int numFlowCells, vector<int>& lengths){
3101         
3102         try{
3103                 
3104                 int total = 0;
3105                 vector<double> newTau(numOTUs,0);
3106                 vector<double> norms(numSeqs, 0);
3107                 nSeqsPerOTU.assign(numOTUs, 0);
3108         
3109                 for(int i=0;i<numSeqs;i++){
3110                         
3111                         if (m->control_pressed) { break; }
3112                         
3113                         int indexOffset = i * numOTUs;
3114             
3115                         double offset = 1e8;
3116                         
3117                         for(int j=0;j<numOTUs;j++){
3118                 
3119                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
3120                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i], uniqueFlowgrams, flowDataIntI, numFlowCells);
3121                                 }
3122                 
3123                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
3124                                         offset = dist[indexOffset + j];
3125                                 }
3126                         }
3127             
3128                         for(int j=0;j<numOTUs;j++){
3129                                 if(weight[j] > MIN_WEIGHT){
3130                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
3131                                         norms[i] += newTau[j];
3132                                 }
3133                                 else{
3134                                         newTau[j] = 0.0;
3135                                 }
3136                         }
3137             
3138                         for(int j=0;j<numOTUs;j++){
3139                                 newTau[j] /= norms[i];
3140                         }
3141             
3142                         for(int j=0;j<numOTUs;j++){
3143                                 if(newTau[j] > MIN_TAU){
3144                                         
3145                                         int oldTotal = total;
3146                                         
3147                                         total++;
3148                                         
3149                                         singleTau.resize(total, 0);
3150                                         seqNumber.resize(total, 0);
3151                                         seqIndex.resize(total, 0);
3152                                         
3153                                         singleTau[oldTotal] = newTau[j];
3154                                         
3155                                         aaP[j][nSeqsPerOTU[j]] = oldTotal;
3156                                         aaI[j][nSeqsPerOTU[j]] = i;
3157                                         nSeqsPerOTU[j]++;
3158                                 }
3159                         }
3160             
3161                 }
3162         
3163         }
3164         catch(exception& e) {
3165                 m->errorOut(e, "ShhherCommand", "calcNewDistances");
3166                 exit(1);        
3167         }               
3168 }
3169 /**************************************************************************************************/
3170
3171 int ShhherCommand::fill(int numOTUs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
3172         try {
3173                 int index = 0;
3174                 for(int i=0;i<numOTUs;i++){
3175                         
3176                         if (m->control_pressed) { return 0; }
3177                         
3178                         cumNumSeqs[i] = index;
3179                         for(int j=0;j<nSeqsPerOTU[i];j++){
3180                                 seqNumber[index] = aaP[i][j];
3181                                 seqIndex[index] = aaI[i][j];
3182                                 
3183                                 index++;
3184                         }
3185                 }
3186         
3187         return 0; 
3188         }
3189         catch(exception& e) {
3190                 m->errorOut(e, "ShhherCommand", "fill");
3191                 exit(1);        
3192         }               
3193 }
3194 /**************************************************************************************************/
3195
3196 void ShhherCommand::setOTUs(int numOTUs, int numSeqs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU,
3197                             vector<int>& otuData, vector<double>& singleTau, vector<double>& dist, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
3198         
3199         try {
3200                 vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
3201                 
3202                 for(int i=0;i<numOTUs;i++){
3203                         
3204                         if (m->control_pressed) { break; }
3205                         
3206                         for(int j=0;j<nSeqsPerOTU[i];j++){
3207                                 int index = cumNumSeqs[i] + j;
3208                                 double tauValue = singleTau[seqNumber[index]];
3209                                 int sIndex = seqIndex[index];
3210                                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
3211                         }
3212                 }
3213                 
3214                 for(int i=0;i<numSeqs;i++){
3215                         double maxTau = -1.0000;
3216                         int maxOTU = -1;
3217                         for(int j=0;j<numOTUs;j++){
3218                                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
3219                                         maxTau = bigTauMatrix[i * numOTUs + j];
3220                                         maxOTU = j;
3221                                 }
3222                         }
3223                         
3224                         otuData[i] = maxOTU;
3225                 }
3226                 
3227                 nSeqsPerOTU.assign(numOTUs, 0);         
3228                 
3229                 for(int i=0;i<numSeqs;i++){
3230                         int index = otuData[i];
3231                         
3232                         singleTau[i] = 1.0000;
3233                         dist[i] = 0.0000;
3234                         
3235                         aaP[index][nSeqsPerOTU[index]] = i;
3236                         aaI[index][nSeqsPerOTU[index]] = i;
3237                         
3238                         nSeqsPerOTU[index]++;
3239                 }
3240         
3241                 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);  
3242         }
3243         catch(exception& e) {
3244                 m->errorOut(e, "ShhherCommand", "setOTUs");
3245                 exit(1);        
3246         }               
3247 }
3248 /**************************************************************************************************/
3249
3250 void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string qualityFileName, vector<int> otuCounts, vector<int>& nSeqsPerOTU, vector<int>& seqNumber,
3251                                    vector<double>& singleTau, vector<short>& flowDataIntI, vector<short>& uniqueFlowgrams, vector<int>& cumNumSeqs,
3252                                    vector<int>& mapUniqueToSeq, vector<string>& seqNameVector, vector<int>& centroids, vector<vector<int> >& aaI){
3253         
3254         try {
3255         
3256                 ofstream qualityFile;
3257                 m->openOutputFile(qualityFileName, qualityFile);
3258         
3259                 qualityFile.setf(ios::fixed, ios::floatfield);
3260                 qualityFile.setf(ios::showpoint);
3261                 qualityFile << setprecision(6);
3262                 
3263                 vector<vector<int> > qualities(numOTUs);
3264                 vector<double> pr(HOMOPS, 0);
3265                 
3266                 
3267                 for(int i=0;i<numOTUs;i++){
3268                         
3269                         if (m->control_pressed) { break; }
3270                         
3271                         int index = 0;
3272                         int base = 0;
3273                         
3274                         if(nSeqsPerOTU[i] > 0){
3275                                 qualities[i].assign(1024, -1);
3276                                 
3277                                 while(index < numFlowCells){
3278                                         double maxPrValue = 1e8;
3279                                         short maxPrIndex = -1;
3280                                         double count = 0.0000;
3281                                         
3282                                         pr.assign(HOMOPS, 0);
3283                                         
3284                                         for(int j=0;j<nSeqsPerOTU[i];j++){
3285                                                 int lIndex = cumNumSeqs[i] + j;
3286                                                 double tauValue = singleTau[seqNumber[lIndex]];
3287                                                 int sequenceIndex = aaI[i][j];
3288                                                 short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
3289                                                 
3290                                                 count += tauValue;
3291                                                 
3292                                                 for(int s=0;s<HOMOPS;s++){
3293                                                         pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
3294                                                 }
3295                                         }
3296                                         
3297                                         maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
3298                                         maxPrValue = pr[maxPrIndex];
3299                                         
3300                                         if(count > MIN_COUNT){
3301                                                 double U = 0.0000;
3302                                                 double norm = 0.0000;
3303                                                 
3304                                                 for(int s=0;s<HOMOPS;s++){
3305                                                         norm += exp(-(pr[s] - maxPrValue));
3306                                                 }
3307                                                 
3308                                                 for(int s=1;s<=maxPrIndex;s++){
3309                                                         int value = 0;
3310                                                         double temp = 0.0000;
3311                                                         
3312                                                         U += exp(-(pr[s-1]-maxPrValue))/norm;
3313                                                         
3314                                                         if(U>0.00){
3315                                                                 temp = log10(U);
3316                                                         }
3317                                                         else{
3318                                                                 temp = -10.1;
3319                                                         }
3320                                                         temp = floor(-10 * temp);
3321                                                         value = (int)floor(temp);
3322                                                         if(value > 100){        value = 100;    }
3323                                                         
3324                                                         qualities[i][base] = (int)value;
3325                                                         base++;
3326                                                 }
3327                                         }
3328                                         
3329                                         index++;
3330                                 }
3331                         }
3332                         
3333                         
3334                         if(otuCounts[i] > 0){
3335                                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
3336                         
3337                                 int j=4;        //need to get past the first four bases
3338                                 while(qualities[i][j] != -1){
3339                     qualityFile << qualities[i][j] << ' ';
3340                     if (j > qualities[i].size()) { break; }
3341                     j++;
3342                                 }
3343                                 qualityFile << endl;
3344                         }
3345                 }
3346                 qualityFile.close();
3347                 outputNames.push_back(qualityFileName); outputTypes["qfile"].push_back(qualityFileName);
3348         
3349         }
3350         catch(exception& e) {
3351                 m->errorOut(e, "ShhherCommand", "writeQualities");
3352                 exit(1);        
3353         }               
3354 }
3355
3356 /**************************************************************************************************/
3357
3358 void ShhherCommand::writeSequences(string thisCompositeFASTAFileName, int numOTUs, int numFlowCells, string fastaFileName, vector<int> otuCounts, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& centroids){
3359         try {
3360                 
3361                 ofstream fastaFile;
3362                 m->openOutputFile(fastaFileName, fastaFile);
3363                 
3364                 vector<string> names(numOTUs, "");
3365                 
3366                 for(int i=0;i<numOTUs;i++){
3367                         
3368                         if (m->control_pressed) { break; }
3369                         
3370                         int index = centroids[i];
3371                         
3372                         if(otuCounts[i] > 0){
3373                                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
3374                                 
3375                                 string newSeq = "";
3376                                 
3377                                 for(int j=0;j<numFlowCells;j++){
3378                                         
3379                                         char base = flowOrder[j % flowOrder.length()];
3380                                         for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
3381                                                 newSeq += base;
3382                                         }
3383                                 }
3384                                 
3385                                 if (newSeq.length() >= 4) {  fastaFile << newSeq.substr(4) << endl;  }
3386                 else {  fastaFile << "NNNN" << endl;  }
3387                         }
3388                 }
3389                 fastaFile.close();
3390         
3391                 outputNames.push_back(fastaFileName); outputTypes["fasta"].push_back(fastaFileName);
3392         
3393                 if(thisCompositeFASTAFileName != ""){
3394                         m->appendFiles(fastaFileName, thisCompositeFASTAFileName);
3395                 }
3396         }
3397         catch(exception& e) {
3398                 m->errorOut(e, "ShhherCommand", "writeSequences");
3399                 exit(1);        
3400         }               
3401 }
3402
3403 /**************************************************************************************************/
3404
3405 void ShhherCommand::writeNames(string thisCompositeNamesFileName, int numOTUs, string nameFileName, vector<int> otuCounts, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU){
3406         try {
3407                 
3408                 ofstream nameFile;
3409                 m->openOutputFile(nameFileName, nameFile);
3410                 
3411                 for(int i=0;i<numOTUs;i++){
3412                         
3413                         if (m->control_pressed) { break; }
3414                         
3415                         if(otuCounts[i] > 0){
3416                                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
3417                                 
3418                                 for(int j=1;j<nSeqsPerOTU[i];j++){
3419                                         nameFile << ',' << seqNameVector[aaI[i][j]];
3420                                 }
3421                                 
3422                                 nameFile << endl;
3423                         }
3424                 }
3425                 nameFile.close();
3426                 outputNames.push_back(nameFileName); outputTypes["name"].push_back(nameFileName);
3427                 
3428                 
3429                 if(thisCompositeNamesFileName != ""){
3430                         m->appendFiles(nameFileName, thisCompositeNamesFileName);
3431                 }               
3432         }
3433         catch(exception& e) {
3434                 m->errorOut(e, "ShhherCommand", "writeNames");
3435                 exit(1);        
3436         }               
3437 }
3438
3439 /**************************************************************************************************/
3440
3441 void ShhherCommand::writeGroups(string groupFileName, string fileRoot, int numSeqs, vector<string>& seqNameVector){
3442         try {
3443         ofstream groupFile;
3444                 m->openOutputFile(groupFileName, groupFile);
3445                 
3446                 for(int i=0;i<numSeqs;i++){
3447                         if (m->control_pressed) { break; }
3448                         groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
3449                 }
3450                 groupFile.close();
3451                 outputNames.push_back(groupFileName); outputTypes["group"].push_back(groupFileName);
3452         
3453         }
3454         catch(exception& e) {
3455                 m->errorOut(e, "ShhherCommand", "writeGroups");
3456                 exit(1);        
3457         }               
3458 }
3459
3460 /**************************************************************************************************/
3461
3462 void ShhherCommand::writeClusters(string otuCountsFileName, int numOTUs, int numFlowCells, vector<int> otuCounts, vector<int>& centroids, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU, vector<int>& lengths, vector<short>& flowDataIntI){
3463         try {
3464                 ofstream otuCountsFile;
3465                 m->openOutputFile(otuCountsFileName, otuCountsFile);
3466                 
3467                 string bases = flowOrder;
3468                 
3469                 for(int i=0;i<numOTUs;i++){
3470                         
3471                         if (m->control_pressed) {
3472                                 break;
3473                         }
3474                         //output the translated version of the centroid sequence for the otu
3475                         if(otuCounts[i] > 0){
3476                                 int index = centroids[i];
3477                                 
3478                                 otuCountsFile << "ideal\t";
3479                                 for(int j=8;j<numFlowCells;j++){
3480                                         char base = bases[j % bases.length()];
3481                                         for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
3482                                                 otuCountsFile << base;
3483                                         }
3484                                 }
3485                                 otuCountsFile << endl;
3486                                 
3487                                 for(int j=0;j<nSeqsPerOTU[i];j++){
3488                                         int sequence = aaI[i][j];
3489                                         otuCountsFile << seqNameVector[sequence] << '\t';
3490                                         
3491                                         string newSeq = "";
3492                                         
3493                                         for(int k=0;k<lengths[sequence];k++){
3494                                                 char base = bases[k % bases.length()];
3495                                                 int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
3496                         
3497                                                 for(int s=0;s<freq;s++){
3498                                                         newSeq += base;
3499                                                         //otuCountsFile << base;
3500                                                 }
3501                                         }
3502                                         
3503                     if (newSeq.length() >= 4) {  otuCountsFile << newSeq.substr(4) << endl;  }
3504                     else {  otuCountsFile << "NNNN" << endl;  }
3505                                 }
3506                                 otuCountsFile << endl;
3507                         }
3508                 }
3509                 otuCountsFile.close();
3510                 outputNames.push_back(otuCountsFileName); outputTypes["counts"].push_back(otuCountsFileName);
3511         
3512         }
3513         catch(exception& e) {
3514                 m->errorOut(e, "ShhherCommand", "writeClusters");
3515                 exit(1);        
3516         }               
3517 }
3518
3519 /**************************************************************************************************/
3520
3521 void ShhherCommand::getSingleLookUp(){
3522         try{
3523                 //      these are the -log probabilities that a signal corresponds to a particular homopolymer length
3524                 singleLookUp.assign(HOMOPS * NUMBINS, 0);
3525                 
3526                 int index = 0;
3527                 ifstream lookUpFile;
3528                 m->openInputFile(lookupFileName, lookUpFile);
3529                 
3530                 for(int i=0;i<HOMOPS;i++){
3531                         
3532                         if (m->control_pressed) { break; }
3533                         
3534                         float logFracFreq;
3535                         lookUpFile >> logFracFreq;
3536                         
3537                         for(int j=0;j<NUMBINS;j++)      {
3538                                 lookUpFile >> singleLookUp[index];
3539                                 index++;
3540                         }
3541                 }       
3542                 lookUpFile.close();
3543         }
3544         catch(exception& e) {
3545                 m->errorOut(e, "ShhherCommand", "getSingleLookUp");
3546                 exit(1);
3547         }
3548 }
3549
3550 /**************************************************************************************************/
3551
3552 void ShhherCommand::getJointLookUp(){
3553         try{
3554                 
3555                 //      the most likely joint probability (-log) that two intenities have the same polymer length
3556                 jointLookUp.resize(NUMBINS * NUMBINS, 0);
3557                 
3558                 for(int i=0;i<NUMBINS;i++){
3559                         
3560                         if (m->control_pressed) { break; }
3561                         
3562                         for(int j=0;j<NUMBINS;j++){             
3563                                 
3564                                 double minSum = 100000000;
3565                                 
3566                                 for(int k=0;k<HOMOPS;k++){
3567                                         double sum = singleLookUp[k * NUMBINS + i] + singleLookUp[k * NUMBINS + j];
3568                                         
3569                                         if(sum < minSum)        {       minSum = sum;           }
3570                                 }       
3571                                 jointLookUp[i * NUMBINS + j] = minSum;
3572                         }
3573                 }
3574         }
3575         catch(exception& e) {
3576                 m->errorOut(e, "ShhherCommand", "getJointLookUp");
3577                 exit(1);
3578         }
3579 }
3580
3581 /**************************************************************************************************/
3582
3583 double ShhherCommand::getProbIntensity(int intIntensity){                          
3584         try{
3585
3586                 double minNegLogProb = 100000000; 
3587
3588                 
3589                 for(int i=0;i<HOMOPS;i++){//loop signal strength
3590                         
3591                         if (m->control_pressed) { break; }
3592                         
3593                         float negLogProb = singleLookUp[i * NUMBINS + intIntensity];
3594                         if(negLogProb < minNegLogProb)  {       minNegLogProb = negLogProb; }
3595                 }
3596                 
3597                 return minNegLogProb;
3598         }
3599         catch(exception& e) {
3600                 m->errorOut(e, "ShhherCommand", "getProbIntensity");
3601                 exit(1);
3602         }
3603 }
3604
3605
3606
3607