]> git.donarmstrong.com Git - mothur.git/blob - shhhercommand.cpp
changed name of get.metacommunity to get.communitytype. fixed bug in metastats comman...
[mothur.git] / shhhercommand.cpp
1 /*
2  *  shhher.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 12/27/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "shhhercommand.h"
11
12 //**********************************************************************************************************************
13 vector<string> ShhherCommand::setParameters(){  
14         try {
15                 CommandParameter pflow("flow", "InputTypes", "", "", "none", "fileflow", "none","fasta-name-group-counts-qfile",false,false,true); parameters.push_back(pflow);
16                 CommandParameter pfile("file", "InputTypes", "", "", "none", "fileflow", "none","fasta-name-group-counts-qfile",false,false,true); parameters.push_back(pfile);
17                 CommandParameter plookup("lookup", "InputTypes", "", "", "none", "none", "none","",false,false,true); parameters.push_back(plookup);
18                 CommandParameter pcutoff("cutoff", "Number", "", "0.01", "", "", "","",false,false); parameters.push_back(pcutoff);
19                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
20                 CommandParameter pmaxiter("maxiter", "Number", "", "1000", "", "", "","",false,false); parameters.push_back(pmaxiter);
21         CommandParameter plarge("large", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(plarge);
22                 CommandParameter psigma("sigma", "Number", "", "60", "", "", "","",false,false); parameters.push_back(psigma);
23                 CommandParameter pmindelta("mindelta", "Number", "", "0.000001", "", "", "","",false,false); parameters.push_back(pmindelta);
24         CommandParameter porder("order", "Multiple", "A-B-I", "A", "", "", "","",false,false, true); parameters.push_back(porder);              CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
25                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
26                 
27                 vector<string> myArray;
28                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
29                 return myArray;
30         }
31         catch(exception& e) {
32                 m->errorOut(e, "ShhherCommand", "setParameters");
33                 exit(1);
34         }
35 }
36 //**********************************************************************************************************************
37 string ShhherCommand::getHelpString(){  
38         try {
39                 string helpString = "";
40                 helpString += "The shhh.flows command reads a file containing flowgrams and creates a file of corrected sequences.\n";
41         helpString += "The shhh.flows command parameters are flow, file, lookup, cutoff, processors, large, maxiter, sigma, mindelta and order.\n";
42         helpString += "The flow parameter is used to input your flow file.\n";
43         helpString += "The file parameter is used to input the *flow.files file created by trim.flows.\n";
44         helpString += "The lookup parameter is used specify the lookup file you would like to use. http://www.mothur.org/wiki/Lookup_files.\n";
45         helpString += "The order parameter options are A, B or I.  Default=A. A = TACG and B = TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC and I = TACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGC.\n";
46                 return helpString;
47         }
48         catch(exception& e) {
49                 m->errorOut(e, "ShhherCommand", "getHelpString");
50                 exit(1);
51         }
52 }
53 //**********************************************************************************************************************
54 string ShhherCommand::getOutputPattern(string type) {
55     try {
56         string pattern = "";
57         
58         if (type == "fasta")            {   pattern = "[filename],shhh.fasta";   }
59         else if (type == "name")    {   pattern = "[filename],shhh.names";   }
60         else if (type == "group")        {   pattern = "[filename],shhh.groups";   }
61         else if (type == "counts")        {   pattern = "[filename],shhh.counts";   }
62         else if (type == "qfile")        {   pattern = "[filename],shhh.qual";   }
63         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
64         
65         return pattern;
66     }
67     catch(exception& e) {
68         m->errorOut(e, "ShhherCommand", "getOutputPattern");
69         exit(1);
70     }
71 }
72 //**********************************************************************************************************************
73
74 ShhherCommand::ShhherCommand(){ 
75         try {
76                 abort = true; calledHelp = true;
77                 setParameters();
78                 
79                 //initialize outputTypes
80                 vector<string> tempOutNames;
81                 outputTypes["fasta"] = tempOutNames;
82         outputTypes["name"] = tempOutNames;
83         outputTypes["group"] = tempOutNames;
84         outputTypes["counts"] = tempOutNames;
85         outputTypes["qfile"] = tempOutNames;
86
87         }
88         catch(exception& e) {
89                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
90                 exit(1);
91         }
92 }
93
94 //**********************************************************************************************************************
95
96 ShhherCommand::ShhherCommand(string option) {
97         try {
98         
99 #ifdef USE_MPI
100                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
101                 MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
102         
103                 if(pid == 0){
104 #endif
105                 abort = false; calledHelp = false;   
106                 
107                 //allow user to run help
108                 if(option == "help") { help(); abort = true; calledHelp = true; }
109                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
110                 
111                 else {
112                         vector<string> myArray = setParameters();
113                         
114                         OptionParser parser(option);
115                         map<string,string> parameters = parser.getParameters();
116                         
117                         ValidParameters validParameter;
118                         map<string,string>::iterator it;
119                         
120                         //check to make sure all parameters are valid for command
121                         for (it = parameters.begin(); it != parameters.end(); it++) { 
122                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
123                         }
124                         
125                         //initialize outputTypes
126             vector<string> tempOutNames;
127             outputTypes["fasta"] = tempOutNames;
128             outputTypes["name"] = tempOutNames;
129             outputTypes["group"] = tempOutNames;
130             outputTypes["counts"] = tempOutNames;
131             outputTypes["qfile"] = tempOutNames;
132
133                         
134                         //if the user changes the input directory command factory will send this info to us in the output parameter 
135                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
136                         if (inputDir == "not found"){   inputDir = "";          }
137                         else {
138                                 string path;
139                                 it = parameters.find("flow");
140                                 //user has given a template file
141                                 if(it != parameters.end()){ 
142                                         path = m->hasPath(it->second);
143                                         //if the user has not given a path then, add inputdir. else leave path alone.
144                                         if (path == "") {       parameters["flow"] = inputDir + it->second;             }
145                                 }
146                                 
147                                 it = parameters.find("lookup");
148                                 //user has given a template file
149                                 if(it != parameters.end()){ 
150                                         path = m->hasPath(it->second);
151                                         //if the user has not given a path then, add inputdir. else leave path alone.
152                                         if (path == "") {       parameters["lookup"] = inputDir + it->second;           }
153                                 }
154
155                                 it = parameters.find("file");
156                                 //user has given a template file
157                                 if(it != parameters.end()){ 
158                                         path = m->hasPath(it->second);
159                                         //if the user has not given a path then, add inputdir. else leave path alone.
160                                         if (path == "") {       parameters["file"] = inputDir + it->second;             }
161                                 }
162                         }
163             
164             //if the user changes the output directory command factory will send this info to us in the output parameter 
165                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = ""; }
166             
167                         //check for required parameters
168                         flowFileName = validParameter.validFile(parameters, "flow", true);
169                         flowFilesFileName = validParameter.validFile(parameters, "file", true);
170                         if (flowFileName == "not found" && flowFilesFileName == "not found") {
171                                 m->mothurOut("values for either flow or file must be provided for the shhh.flows command.");
172                                 m->mothurOutEndLine();
173                                 abort = true; 
174                         }
175                         else if (flowFileName == "not open" || flowFilesFileName == "not open") { abort = true; }
176                         
177                         if(flowFileName != "not found"){
178                                 compositeFASTAFileName = "";    
179                                 compositeNamesFileName = "";    
180                         }
181                         else{
182                                 ofstream temp;
183                 
184                 string thisoutputDir = outputDir;
185                 if (outputDir == "") {  thisoutputDir =  m->hasPath(flowFilesFileName); } //if user entered a file with a path then preserve it
186                 
187                                 //we want to rip off .files, and also .flow if its there
188                 string fileroot = m->getRootName(m->getSimpleName(flowFilesFileName));
189                 if (fileroot[fileroot.length()-1] == '.') {  fileroot = fileroot.substr(0, fileroot.length()-1); } //rip off dot
190                 string extension = m->getExtension(fileroot);
191                 if (extension == ".flow") { fileroot = m->getRootName(fileroot);  }
192                 else { fileroot += "."; } //add back if needed
193                 
194                                 compositeFASTAFileName = thisoutputDir + fileroot + "shhh.fasta";
195                                 m->openOutputFile(compositeFASTAFileName, temp);
196                                 temp.close();
197                                 
198                                 compositeNamesFileName = thisoutputDir + fileroot + "shhh.names";
199                                 m->openOutputFile(compositeNamesFileName, temp);
200                                 temp.close();
201                         }
202             
203             if(flowFilesFileName != "not found"){
204                 string fName;
205                 
206                 ifstream flowFilesFile;
207                 m->openInputFile(flowFilesFileName, flowFilesFile);
208                 while(flowFilesFile){
209                     fName = m->getline(flowFilesFile);
210                     
211                     //test if file is valid
212                     ifstream in;
213                     int ableToOpen = m->openInputFile(fName, in, "noerror");
214                     in.close(); 
215                     if (ableToOpen == 1) {
216                         if (inputDir != "") { //default path is set
217                             string tryPath = inputDir + fName;
218                             m->mothurOut("Unable to open " + fName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
219                             ifstream in2;
220                             ableToOpen = m->openInputFile(tryPath, in2, "noerror");
221                             in2.close();
222                             fName = tryPath;
223                         }
224                     }
225                     
226                     if (ableToOpen == 1) {
227                         if (m->getDefaultPath() != "") { //default path is set
228                             string tryPath = m->getDefaultPath() + m->getSimpleName(fName);
229                             m->mothurOut("Unable to open " + fName + ". Trying default " + tryPath); m->mothurOutEndLine();
230                             ifstream in2;
231                             ableToOpen = m->openInputFile(tryPath, in2, "noerror");
232                             in2.close();
233                             fName = tryPath;
234                         }
235                     }
236                     
237                     //if you can't open it its not in current working directory or inputDir, try mothur excutable location
238                     if (ableToOpen == 1) {
239                         string exepath = m->argv;
240                         string tempPath = exepath;
241                         for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
242                         exepath = exepath.substr(0, (tempPath.find_last_of('m')));
243                         
244                         string tryPath = m->getFullPathName(exepath) + m->getSimpleName(fName);
245                         m->mothurOut("Unable to open " + fName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
246                         ifstream in2;
247                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
248                         in2.close();
249                         fName = tryPath;
250                     }
251                     
252                     if (ableToOpen == 1) {  m->mothurOut("Unable to open " + fName + ". Disregarding. "); m->mothurOutEndLine();  }
253                     else { flowFileVector.push_back(fName); }
254                     m->gobble(flowFilesFile);
255                 }
256                 flowFilesFile.close();
257                 if (flowFileVector.size() == 0) {  m->mothurOut("[ERROR]: no valid files."); m->mothurOutEndLine(); abort = true; }
258             }
259             else{
260                 if (outputDir == "") { outputDir = m->hasPath(flowFileName); }
261                 flowFileVector.push_back(flowFileName);
262             }
263                 
264                         //check for optional parameter and set defaults
265                         // ...at some point should added some additional type checking...
266                         string temp;
267                         temp = validParameter.validFile(parameters, "lookup", true);
268                         if (temp == "not found")        {       
269                                 string path = m->argv;
270                 string tempPath = path;
271                 for (int i = 0; i < path.length(); i++) { tempPath[i] = tolower(path[i]); }
272                 path = path.substr(0, (tempPath.find_last_of('m')));
273                 
274 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
275                 path += "lookupFiles/";
276 #else
277                 path += "lookupFiles\\";
278 #endif
279                                 lookupFileName = m->getFullPathName(path) + "LookUp_Titanium.pat";
280                                 
281                                 int ableToOpen;
282                                 ifstream in;
283                                 ableToOpen = m->openInputFile(lookupFileName, in, "noerror");
284                                 in.close();     
285                                 
286                                 //if you can't open it, try input location
287                                 if (ableToOpen == 1) {
288                                         if (inputDir != "") { //default path is set
289                                                 string tryPath = inputDir + m->getSimpleName(lookupFileName);
290                                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
291                                                 ifstream in2;
292                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
293                                                 in2.close();
294                                                 lookupFileName = tryPath;
295                                         }
296                                 }
297                                 
298                                 //if you can't open it, try default location
299                                 if (ableToOpen == 1) {
300                                         if (m->getDefaultPath() != "") { //default path is set
301                                                 string tryPath = m->getDefaultPath() + m->getSimpleName(lookupFileName);
302                                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying default " + tryPath); m->mothurOutEndLine();
303                                                 ifstream in2;
304                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
305                                                 in2.close();
306                                                 lookupFileName = tryPath;
307                                         }
308                                 }
309                                 
310                                 //if you can't open it its not in current working directory or inputDir, try mothur excutable location
311                                 if (ableToOpen == 1) {
312                                         string exepath = m->argv;
313                                         string tempPath = exepath;
314                                         for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
315                                         exepath = exepath.substr(0, (tempPath.find_last_of('m')));
316                                         
317                                         string tryPath = m->getFullPathName(exepath) + m->getSimpleName(lookupFileName);
318                                         m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
319                                         ifstream in2;
320                                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
321                                         in2.close();
322                                         lookupFileName = tryPath;
323                                 }
324                                 
325                                 if (ableToOpen == 1) {  m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true;  }
326                         }
327                         else if(temp == "not open")     {       
328                                 
329                                 lookupFileName = validParameter.validFile(parameters, "lookup", false);
330                                 
331                                 //if you can't open it its not inputDir, try mothur excutable location
332                                 string exepath = m->argv;
333                                 string tempPath = exepath;
334                                 for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
335                                 exepath = exepath.substr(0, (tempPath.find_last_of('m')));
336                                         
337                                 string tryPath = m->getFullPathName(exepath) + m->getSimpleName(lookupFileName);
338                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
339                                 ifstream in2;
340                                 int ableToOpen = m->openInputFile(tryPath, in2, "noerror");
341                                 in2.close();
342                                 lookupFileName = tryPath;
343                                 
344                                 if (ableToOpen == 1) {  m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true;  }
345                         }else                                           {       lookupFileName = temp;  }
346                         
347                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
348                         m->setProcessors(temp);
349                         m->mothurConvert(temp, processors);
350
351                         temp = validParameter.validFile(parameters, "cutoff", false);   if (temp == "not found"){       temp = "0.01";          }
352                         m->mothurConvert(temp, cutoff); 
353                         
354                         temp = validParameter.validFile(parameters, "mindelta", false); if (temp == "not found"){       temp = "0.000001";      }
355                         m->mothurConvert(temp, minDelta); 
356
357                         temp = validParameter.validFile(parameters, "maxiter", false);  if (temp == "not found"){       temp = "1000";          }
358                         m->mothurConvert(temp, maxIters); 
359             
360             temp = validParameter.validFile(parameters, "large", false);        if (temp == "not found"){       temp = "0";             }
361                         m->mothurConvert(temp, largeSize); 
362             if (largeSize != 0) { large = true; }
363             else { large = false;  }
364             if (largeSize < 0) {  m->mothurOut("The value of the large cannot be negative.\n"); }
365             
366 #ifdef USE_MPI
367             if (large) { m->mothurOut("The large parameter is not available with the MPI-Enabled version.\n"); large=false; }                           
368 #endif
369             
370
371                         temp = validParameter.validFile(parameters, "sigma", false);if (temp == "not found")    {       temp = "60";            }
372                         m->mothurConvert(temp, sigma); 
373                         
374                         temp = validParameter.validFile(parameters, "order", false);  if (temp == "not found"){         temp = "A";     }
375             if (temp.length() > 1) {  m->mothurOut("[ERROR]: " + temp + " is not a valid option for order. order options are A, B, or I. A = TACG, B = TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC, and I = TACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGC.\n");  abort=true;
376             }
377             else {
378                 if (toupper(temp[0]) == 'A') {  flowOrder = "TACG";   }
379                 else if(toupper(temp[0]) == 'B'){
380                     flowOrder = "TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC";   }
381                 else if(toupper(temp[0]) == 'I'){
382                     flowOrder = "TACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGC";   }
383                 else {
384                     m->mothurOut("[ERROR]: " + temp + " is not a valid option for order. order options are A, B, or I. A = TACG, B = TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC, and I = TACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGC.\n");  abort=true;
385                 }
386             }
387
388                         
389                 }
390 #ifdef USE_MPI
391                 }                               
392 #endif
393         }
394         catch(exception& e) {
395                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
396                 exit(1);
397         }
398 }
399 //**********************************************************************************************************************
400 #ifdef USE_MPI
401 int ShhherCommand::execute(){
402         try {
403                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
404                 
405                 int tag = 1976;
406                 MPI_Status status; 
407                         
408                 if(pid == 0){
409
410                         for(int i=1;i<ncpus;i++){
411                                 MPI_Send(&abort, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
412                         }
413                         if(abort == 1){ return 0;       }
414
415                         processors = ncpus;
416                         
417                         m->mothurOut("\nGetting preliminary data...\n");
418                         getSingleLookUp();      if (m->control_pressed) { return 0; }
419                         getJointLookUp();       if (m->control_pressed) { return 0; }
420                         
421             vector<string> flowFileVector;
422                         if(flowFilesFileName != "not found"){
423                                 string fName;
424                 
425                                 ifstream flowFilesFile;
426                                 m->openInputFile(flowFilesFileName, flowFilesFile);
427                                 while(flowFilesFile){
428                                         fName = m->getline(flowFilesFile);
429                                         flowFileVector.push_back(fName);
430                                         m->gobble(flowFilesFile);
431                                 }
432                         }
433                         else{
434                                 flowFileVector.push_back(flowFileName);
435                         }
436             
437                         int numFiles = flowFileVector.size();
438
439                         for(int i=1;i<ncpus;i++){
440                                 MPI_Send(&numFiles, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
441                         }
442                         
443                         for(int i=0;i<numFiles;i++){
444                                 
445                                 if (m->control_pressed) { break; }
446                                 
447                                 double begClock = clock();
448                                 unsigned long long begTime = time(NULL);
449
450                                 flowFileName = flowFileVector[i];
451                                 
452                                 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
453                                 m->mothurOut("Reading flowgrams...\n");
454                                 getFlowData();
455                                 
456                                 if (m->control_pressed) { break; }
457
458                                 m->mothurOut("Identifying unique flowgrams...\n");
459                                 getUniques();
460                                 
461                                 if (m->control_pressed) { break; }
462
463                                 m->mothurOut("Calculating distances between flowgrams...\n");
464                                 char fileName[1024];
465                                 strcpy(fileName, flowFileName.c_str());
466
467                                 for(int i=1;i<ncpus;i++){
468                                         MPI_Send(&fileName[0], 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
469
470                                         MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
471                                         MPI_Send(&numUniques, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
472                                         MPI_Send(&numFlowCells, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
473                                         MPI_Send(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, i, tag, MPI_COMM_WORLD);
474                                         MPI_Send(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
475                                         MPI_Send(&mapUniqueToSeq[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
476                                         MPI_Send(&mapSeqToUnique[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
477                                         MPI_Send(&lengths[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
478                                         MPI_Send(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
479                                         MPI_Send(&cutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
480                                 }                               
481                                                         
482                                 string distFileName = flowDistMPI(0, int(sqrt(1.0/float(ncpus)) * numUniques));
483                                 
484                                 if (m->control_pressed) { break; }
485                                 
486                                 int done;
487                                 for(int i=1;i<ncpus;i++){
488                                         MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
489                                         
490                                         m->appendFiles((distFileName + ".temp." + toString(i)), distFileName);
491                                         m->mothurRemove((distFileName + ".temp." + toString(i)));
492                                 }
493
494                                 string namesFileName = createNamesFile();
495                                 
496                                 if (m->control_pressed) { break; }
497                                 
498                                 m->mothurOut("\nClustering flowgrams...\n");
499                                 string listFileName = cluster(distFileName, namesFileName);
500                                 
501                                 if (m->control_pressed) { break; }
502                                 
503                 
504                 
505                                 getOTUData(listFileName);
506
507                                 m->mothurRemove(distFileName);
508                                 m->mothurRemove(namesFileName);
509                                 m->mothurRemove(listFileName);
510                                 
511                                 if (m->control_pressed) { break; }
512                                 
513                                 initPyroCluster();
514
515                                 if (m->control_pressed) { break; }
516                                 
517             
518                                 for(int i=1;i<ncpus;i++){
519                                         MPI_Send(&numOTUs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
520                                         MPI_Send(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
521                                         MPI_Send(&uniqueFlowgrams[0], numFlowCells * numUniques, MPI_SHORT, i, tag, MPI_COMM_WORLD);
522                                         MPI_Send(&sigma, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
523                                 }
524                                 
525                                 if (m->control_pressed) { break; }
526                                 
527                                 double maxDelta = 0;
528                                 int iter = 0;
529                                 
530                                 int numOTUsOnCPU = numOTUs / ncpus;
531                                 int numSeqsOnCPU = numSeqs / ncpus;
532                                 m->mothurOut("\nDenoising flowgrams...\n");
533                                 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
534                                 
535                                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
536                                         
537                                         double cycClock = clock();
538                                         unsigned long long cycTime = time(NULL);
539                                         fill();
540                                         
541                                         if (m->control_pressed) { break; }
542
543                                         int total = singleTau.size();
544                                         for(int i=1;i<ncpus;i++){
545                                                 MPI_Send(&total, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
546                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
547                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
548                                                 
549                                                 MPI_Send(&singleTau[0], total, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
550                                                 MPI_Send(&seqNumber[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
551                                                 MPI_Send(&seqIndex[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
552                                                 MPI_Send(&nSeqsPerOTU[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
553                                                 MPI_Send(&cumNumSeqs[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
554                                         }
555                                         
556                                         calcCentroidsDriver(0, numOTUsOnCPU);
557                                         
558                                         for(int i=1;i<ncpus;i++){
559                                                 int otuStart = i * numOTUs / ncpus;
560                                                 int otuStop = (i + 1) * numOTUs / ncpus;
561                                                 
562                                                 vector<int> tempCentroids(numOTUs, 0);
563                                                 vector<short> tempChange(numOTUs, 0);
564                                                 
565                                                 MPI_Recv(&tempCentroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
566                                                 MPI_Recv(&tempChange[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD, &status);
567                                                 
568                                                 for(int j=otuStart;j<otuStop;j++){
569                                                         centroids[j] = tempCentroids[j];
570                                                         change[j] = tempChange[j];
571                                                 }
572                                         }
573                                                                         
574                                         maxDelta = getNewWeights(); if (m->control_pressed) { break; }
575                                         double nLL = getLikelihood(); if (m->control_pressed) { break; }
576                                         checkCentroids(); if (m->control_pressed) { break; }
577                                         
578                                         for(int i=1;i<ncpus;i++){
579                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
580                                                 MPI_Send(&weight[0], numOTUs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
581                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
582                                         }
583                                         
584                                         calcNewDistancesParent(0, numSeqsOnCPU);
585
586                                         total = singleTau.size();
587
588                                         for(int i=1;i<ncpus;i++){
589                                                 int childTotal;
590                                                 int seqStart = i * numSeqs / ncpus;
591                                                 int seqStop = (i + 1) * numSeqs / ncpus;
592                                                 
593                                                 MPI_Recv(&childTotal, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
594
595                                                 vector<int> childSeqIndex(childTotal, 0);
596                                                 vector<double> childSingleTau(childTotal, 0);
597                                                 vector<double> childDist(numSeqs * numOTUs, 0);
598                                                 vector<int> otuIndex(childTotal, 0);
599                                                 
600                                                 MPI_Recv(&childSeqIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
601                                                 MPI_Recv(&childSingleTau[0], childTotal, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
602                                                 MPI_Recv(&childDist[0], numOTUs * numSeqs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
603                                                 MPI_Recv(&otuIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
604
605                                                 int oldTotal = total;
606                                                 total += childTotal;
607                                                 singleTau.resize(total, 0);
608                                                 seqIndex.resize(total, 0);
609                                                 seqNumber.resize(total, 0);
610                                                 
611                                                 int childIndex = 0;
612                                                 
613                                                 for(int j=oldTotal;j<total;j++){
614                                                         int otuI = otuIndex[childIndex];
615                                                         int seqI = childSeqIndex[childIndex];
616                                                         
617                                                         singleTau[j] = childSingleTau[childIndex];
618                                                         
619                                                         aaP[otuI][nSeqsPerOTU[otuI]] = j;
620                                                         aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
621                                                         nSeqsPerOTU[otuI]++;
622                                                         childIndex++;
623                                                 }
624                                                 
625                                                 int index = seqStart * numOTUs;
626                                                 for(int j=seqStart;j<seqStop;j++){
627                                                         for(int k=0;k<numOTUs;k++){
628                                                                 dist[index] = childDist[index];
629                                                                 index++;
630                                                         }
631                                                 }                                       
632                                         }
633                                         
634                                         iter++;
635                                         
636                                         m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');                  
637
638                                         if((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
639                                                 int live = 1;
640                                                 for(int i=1;i<ncpus;i++){
641                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
642                                                 }
643                                         }
644                                         else{
645                                                 int live = 0;
646                                                 for(int i=1;i<ncpus;i++){
647                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD); //send kill command
648                                                 }
649                                         }
650                                         
651                                 }       
652                                 
653                                 if (m->control_pressed) { break; }
654                                 
655                                 m->mothurOut("\nFinalizing...\n");
656                                 fill();
657                                 
658                                 if (m->control_pressed) { break; }
659                                 
660                                 setOTUs();
661                                 
662                                 vector<int> otuCounts(numOTUs, 0);
663                                 for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
664                                 calcCentroidsDriver(0, numOTUs);
665                                 
666                                 if (m->control_pressed) { break; }
667                                 
668                                 writeQualities(otuCounts);      if (m->control_pressed) { break; }
669                                 writeSequences(otuCounts);      if (m->control_pressed) { break; }
670                                 writeNames(otuCounts);          if (m->control_pressed) { break; }
671                                 writeClusters(otuCounts);       if (m->control_pressed) { break; }
672                                 writeGroups();                          if (m->control_pressed) { break; }
673                                 
674                                                                  
675                                 m->mothurOut("Total time to process " + toString(flowFileName) + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');                 
676                         }
677                 }
678                 else{
679                         int abort = 1;
680
681                         MPI_Recv(&abort, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
682                         if(abort){      return 0;       }
683
684                         int numFiles;
685                         MPI_Recv(&numFiles, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
686
687                         for(int i=0;i<numFiles;i++){
688                                 
689                                 if (m->control_pressed) { break; }
690                                 
691                                 //Now into the pyrodist part
692                                 bool live = 1;
693
694                                 char fileName[1024];
695                                 MPI_Recv(&fileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
696                                 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
697                                 MPI_Recv(&numUniques, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
698                                 MPI_Recv(&numFlowCells, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
699                                 
700                                 flowDataIntI.resize(numSeqs * numFlowCells);
701                                 flowDataPrI.resize(numSeqs * numFlowCells);
702                                 mapUniqueToSeq.resize(numSeqs);
703                                 mapSeqToUnique.resize(numSeqs);
704                                 lengths.resize(numSeqs);
705                                 jointLookUp.resize(NUMBINS * NUMBINS);
706
707                                 MPI_Recv(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
708                                 MPI_Recv(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
709                                 MPI_Recv(&mapUniqueToSeq[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
710                                 MPI_Recv(&mapSeqToUnique[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
711                                 MPI_Recv(&lengths[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
712                                 MPI_Recv(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
713                                 MPI_Recv(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
714
715                                 flowFileName = string(fileName);
716                                 int flowDistStart = int(sqrt(float(pid)/float(ncpus)) * numUniques);
717                                 int flowDistEnd = int(sqrt(float(pid+1)/float(ncpus)) * numUniques);
718                                 
719                                 string distanceStringChild = flowDistMPI(flowDistStart, flowDistEnd);
720                                 
721                                 if (m->control_pressed) { break; }
722                                 
723                                 int done = 1;
724                                 MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
725
726                                 //Now into the pyronoise part
727                                 MPI_Recv(&numOTUs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
728                                 
729                                 singleLookUp.resize(HOMOPS * NUMBINS);
730                                 uniqueFlowgrams.resize(numUniques * numFlowCells);
731                                 weight.resize(numOTUs);
732                                 centroids.resize(numOTUs);
733                                 change.resize(numOTUs);
734                                 dist.assign(numOTUs * numSeqs, 0);
735                                 nSeqsPerOTU.resize(numOTUs);
736                                 cumNumSeqs.resize(numOTUs);
737
738                                 MPI_Recv(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
739                                 MPI_Recv(&uniqueFlowgrams[0], uniqueFlowgrams.size(), MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
740                                 MPI_Recv(&sigma, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
741                                 
742                                 int startOTU = pid * numOTUs / ncpus;
743                                 int endOTU = (pid + 1) * numOTUs / ncpus;
744
745                                 int startSeq = pid * numSeqs / ncpus;
746                                 int endSeq = (pid + 1) * numSeqs /ncpus;
747                                 
748                                 int total;
749
750                                 while(live){
751                                         
752                                         if (m->control_pressed) { break; }
753                                         
754                                         MPI_Recv(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
755                                         singleTau.assign(total, 0.0000);
756                                         seqNumber.assign(total, 0);
757                                         seqIndex.assign(total, 0);
758
759                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
760                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
761                                         MPI_Recv(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
762                                         MPI_Recv(&seqNumber[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
763                                         MPI_Recv(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
764                                         MPI_Recv(&nSeqsPerOTU[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
765                                         MPI_Recv(&cumNumSeqs[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
766
767                                         calcCentroidsDriver(startOTU, endOTU);
768
769                                         MPI_Send(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD);
770                                         MPI_Send(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD);
771
772                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
773                                         MPI_Recv(&weight[0], numOTUs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
774                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
775
776                                         vector<int> otuIndex(total, 0);
777                                         calcNewDistancesChildMPI(startSeq, endSeq, otuIndex);
778                                         total = otuIndex.size();
779                                         
780                                         MPI_Send(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
781                                         MPI_Send(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
782                                         MPI_Send(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
783                                         MPI_Send(&dist[0], numOTUs * numSeqs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
784                                         MPI_Send(&otuIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
785                                 
786                                         MPI_Recv(&live, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
787                                 }
788                         }
789                 }
790                 
791                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
792                 
793                 MPI_Barrier(MPI_COMM_WORLD);
794
795                 
796                 if(compositeFASTAFileName != ""){
797                         outputNames.push_back(compositeFASTAFileName); outputTypes["fasta"].push_back(compositeFASTAFileName);
798                         outputNames.push_back(compositeNamesFileName); outputTypes["name"].push_back(compositeNamesFileName); 
799                 }
800
801                 m->mothurOutEndLine();
802                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
803                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
804                 m->mothurOutEndLine();
805                 
806                 return 0;
807
808         }
809         catch(exception& e) {
810                 m->errorOut(e, "ShhherCommand", "execute");
811                 exit(1);
812         }
813 }
814 /**************************************************************************************************/
815 string ShhherCommand::createNamesFile(){
816         try{
817                 
818                 vector<string> duplicateNames(numUniques, "");
819                 for(int i=0;i<numSeqs;i++){
820                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
821                 }
822                 map<string, string> variables; 
823                 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(flowFileName));
824                 string nameFileName = getOutputFileName("name",variables);
825                 
826                 ofstream nameFile;
827                 m->openOutputFile(nameFileName, nameFile);
828                 
829                 for(int i=0;i<numUniques;i++){
830                         
831                         if (m->control_pressed) { break; }
832                         
833             //                  nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
834                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
835                 }
836                 
837                 nameFile.close();
838                 return  nameFileName;
839         }
840         catch(exception& e) {
841                 m->errorOut(e, "ShhherCommand", "createNamesFile");
842                 exit(1);
843         }
844 }
845 /**************************************************************************************************/
846
847 string ShhherCommand::flowDistMPI(int startSeq, int stopSeq){
848         try{            
849                 ostringstream outStream;
850                 outStream.setf(ios::fixed, ios::floatfield);
851                 outStream.setf(ios::dec, ios::basefield);
852                 outStream.setf(ios::showpoint);
853                 outStream.precision(6);
854                 
855                 int begTime = time(NULL);
856                 double begClock = clock();
857                 
858                 for(int i=startSeq;i<stopSeq;i++){
859                         
860                         if (m->control_pressed) { break; }
861                         
862                         for(int j=0;j<i;j++){
863                                 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
864                                 
865                                 if(flowDistance < 1e-6){
866                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
867                                 }
868                                 else if(flowDistance <= cutoff){
869                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
870                                 }
871                         }
872                         if(i % 100 == 0){
873                                 m->mothurOutJustToScreen(toString(i) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
874                         }
875                 }
876                 
877                 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
878                 if(pid != 0){   fDistFileName += ".temp." + toString(pid);      }
879                 
880                 if (m->control_pressed) { return fDistFileName; }
881                 
882                 m->mothurOutJustToScreen(toString(stopSeq) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
883
884                 ofstream distFile(fDistFileName.c_str());
885                 distFile << outStream.str();            
886                 distFile.close();
887                 
888                 return fDistFileName;
889         }
890         catch(exception& e) {
891                 m->errorOut(e, "ShhherCommand", "flowDistMPI");
892                 exit(1);
893         }
894 }
895 /**************************************************************************************************/
896  
897 void ShhherCommand::getOTUData(string listFileName){
898     try {
899         
900         ifstream listFile;
901         m->openInputFile(listFileName, listFile);
902         string label;
903         
904         listFile >> label >> numOTUs;
905         
906         otuData.assign(numSeqs, 0);
907         cumNumSeqs.assign(numOTUs, 0);
908         nSeqsPerOTU.assign(numOTUs, 0);
909         aaP.clear();aaP.resize(numOTUs);
910         
911         seqNumber.clear();
912         aaI.clear();
913         seqIndex.clear();
914         
915         string singleOTU = "";
916         
917         for(int i=0;i<numOTUs;i++){
918             
919             if (m->control_pressed) { break; }
920             
921             listFile >> singleOTU;
922             
923             istringstream otuString(singleOTU);
924             
925             while(otuString){
926                 
927                 string seqName = "";
928                 
929                 for(int j=0;j<singleOTU.length();j++){
930                     char letter = otuString.get();
931                     
932                     if(letter != ','){
933                         seqName += letter;
934                     }
935                     else{
936                         map<string,int>::iterator nmIt = nameMap.find(seqName);
937                         int index = nmIt->second;
938                         
939                         nameMap.erase(nmIt);
940                         
941                         otuData[index] = i;
942                         nSeqsPerOTU[i]++;
943                         aaP[i].push_back(index);
944                         seqName = "";
945                     }
946                 }
947                 
948                 map<string,int>::iterator nmIt = nameMap.find(seqName);
949                 
950                 int index = nmIt->second;
951                 nameMap.erase(nmIt);
952                 
953                 otuData[index] = i;
954                 nSeqsPerOTU[i]++;
955                 aaP[i].push_back(index);        
956                 
957                 otuString.get();
958             }
959             
960             sort(aaP[i].begin(), aaP[i].end());
961             for(int j=0;j<nSeqsPerOTU[i];j++){
962                 seqNumber.push_back(aaP[i][j]);
963             }
964             for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
965                 aaP[i].push_back(0);
966             }
967             
968             
969         }
970         
971         for(int i=1;i<numOTUs;i++){
972             cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
973         }
974         aaI = aaP;
975         seqIndex = seqNumber;
976         
977         listFile.close();       
978         
979     }
980     catch(exception& e) {
981         m->errorOut(e, "ShhherCommand", "getOTUData");
982         exit(1);        
983     }           
984 }
985
986 /**************************************************************************************************/
987
988 void ShhherCommand::initPyroCluster(){                          
989     try{
990         if (numOTUs < processors) { processors = 1; }
991         
992         if (m->debug) { m->mothurOut("[DEBUG]: numSeqs = " + toString(numSeqs) + " numOTUS = " + toString(numOTUs) + " about to alloc a dist vector with size = " + toString((numSeqs * numOTUs)) + ".\n"); }
993         
994         dist.assign(numSeqs * numOTUs, 0);
995         change.assign(numOTUs, 1);
996         centroids.assign(numOTUs, -1);
997         weight.assign(numOTUs, 0);
998         singleTau.assign(numSeqs, 1.0);
999         
1000         nSeqsBreaks.assign(processors+1, 0);
1001         nOTUsBreaks.assign(processors+1, 0);
1002         
1003         if (m->debug) { m->mothurOut("[DEBUG]: made it through the memory allocation.\n"); }
1004         
1005         nSeqsBreaks[0] = 0;
1006         for(int i=0;i<processors;i++){
1007             nSeqsBreaks[i+1] = nSeqsBreaks[i] + (int)((double) numSeqs / (double) processors);
1008             nOTUsBreaks[i+1] = nOTUsBreaks[i] + (int)((double) numOTUs / (double) processors);
1009         }
1010         nSeqsBreaks[processors] = numSeqs;
1011         nOTUsBreaks[processors] = numOTUs;
1012     }
1013     catch(exception& e) {
1014         m->errorOut(e, "ShhherCommand", "initPyroCluster");
1015         exit(1);        
1016     }
1017 }
1018
1019 /**************************************************************************************************/
1020
1021 void ShhherCommand::fill(){
1022     try {
1023         int index = 0;
1024         for(int i=0;i<numOTUs;i++){
1025             
1026             if (m->control_pressed) { break; }
1027             
1028             cumNumSeqs[i] = index;
1029             for(int j=0;j<nSeqsPerOTU[i];j++){
1030                 seqNumber[index] = aaP[i][j];
1031                 seqIndex[index] = aaI[i][j];
1032                 
1033                 index++;
1034             }
1035         }
1036     }
1037     catch(exception& e) {
1038         m->errorOut(e, "ShhherCommand", "fill");
1039         exit(1);        
1040     }           
1041 }
1042
1043 /**************************************************************************************************/
1044  
1045 void ShhherCommand::getFlowData(){
1046     try{
1047         ifstream flowFile;
1048         m->openInputFile(flowFileName, flowFile);
1049         
1050         string seqName;
1051         seqNameVector.clear();
1052         lengths.clear();
1053         flowDataIntI.clear();
1054         nameMap.clear();
1055         
1056         
1057         int currentNumFlowCells;
1058         
1059         float intensity;
1060         
1061         string numFlowTest;
1062         flowFile >> numFlowTest;
1063         
1064         if (!m->isContainingOnlyDigits(numFlowTest)) { m->mothurOut("[ERROR]: expected a number and got " + numFlowTest + ", quitting. Did you use the flow parameter instead of the file parameter?"); m->mothurOutEndLine(); exit(1); }
1065         else { convert(numFlowTest, numFlowCells); }
1066         
1067         int index = 0;//pcluster
1068         while(!flowFile.eof()){
1069             
1070             if (m->control_pressed) { break; }
1071             
1072             flowFile >> seqName >> currentNumFlowCells;
1073             lengths.push_back(currentNumFlowCells);
1074             
1075             seqNameVector.push_back(seqName);
1076             nameMap[seqName] = index++;//pcluster
1077             
1078             for(int i=0;i<numFlowCells;i++){
1079                 flowFile >> intensity;
1080                 if(intensity > 9.99)    {       intensity = 9.99;       }
1081                 int intI = int(100 * intensity + 0.0001);
1082                 flowDataIntI.push_back(intI);
1083             }
1084             m->gobble(flowFile);
1085         }
1086         flowFile.close();
1087         
1088         numSeqs = seqNameVector.size();         
1089         
1090         for(int i=0;i<numSeqs;i++){
1091             
1092             if (m->control_pressed) { break; }
1093             
1094             int iNumFlowCells = i * numFlowCells;
1095             for(int j=lengths[i];j<numFlowCells;j++){
1096                 flowDataIntI[iNumFlowCells + j] = 0;
1097             }
1098         }
1099         
1100     }
1101     catch(exception& e) {
1102         m->errorOut(e, "ShhherCommand", "getFlowData");
1103         exit(1);
1104     }
1105 }
1106 /**************************************************************************************************/
1107 void ShhherCommand::calcNewDistancesChildMPI(int startSeq, int stopSeq, vector<int>& otuIndex){
1108         
1109         try{
1110                 vector<double> newTau(numOTUs,0);
1111                 vector<double> norms(numSeqs, 0);
1112                 otuIndex.clear();
1113                 seqIndex.clear();
1114                 singleTau.clear();
1115                 
1116                 for(int i=startSeq;i<stopSeq;i++){
1117                         
1118                         if (m->control_pressed) { break; }
1119                         
1120                         double offset = 1e8;
1121                         int indexOffset = i * numOTUs;
1122                         
1123                         for(int j=0;j<numOTUs;j++){
1124                                 
1125                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1126                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1127                                 }
1128                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1129                                         offset = dist[indexOffset + j];
1130                                 }
1131                         }
1132                         
1133                         for(int j=0;j<numOTUs;j++){
1134                                 if(weight[j] > MIN_WEIGHT){
1135                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1136                                         norms[i] += newTau[j];
1137                                 }
1138                                 else{
1139                                         newTau[j] = 0.0;
1140                                 }
1141                         }
1142                         
1143                         for(int j=0;j<numOTUs;j++){
1144                 
1145                                 newTau[j] /= norms[i];
1146                                 
1147                                 if(newTau[j] > MIN_TAU){
1148                                         otuIndex.push_back(j);
1149                                         seqIndex.push_back(i);
1150                                         singleTau.push_back(newTau[j]);
1151                                 }
1152                         }
1153                         
1154                 }
1155         }
1156         catch(exception& e) {
1157                 m->errorOut(e, "ShhherCommand", "calcNewDistancesChildMPI");
1158                 exit(1);        
1159         }               
1160 }
1161
1162 /**************************************************************************************************/
1163
1164 void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){
1165     
1166     try{
1167         
1168         int total = 0;
1169         vector<double> newTau(numOTUs,0);
1170         vector<double> norms(numSeqs, 0);
1171         nSeqsPerOTU.assign(numOTUs, 0);
1172         
1173         for(int i=startSeq;i<stopSeq;i++){
1174             
1175             if (m->control_pressed) { break; }
1176             
1177             int indexOffset = i * numOTUs;
1178             
1179             double offset = 1e8;
1180             
1181             for(int j=0;j<numOTUs;j++){
1182                 
1183                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1184                     dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1185                 }
1186                 
1187                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1188                     offset = dist[indexOffset + j];
1189                 }
1190             }
1191             
1192             for(int j=0;j<numOTUs;j++){
1193                 if(weight[j] > MIN_WEIGHT){
1194                     newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1195                     norms[i] += newTau[j];
1196                 }
1197                 else{
1198                     newTau[j] = 0.0;
1199                 }
1200             }
1201             
1202             for(int j=0;j<numOTUs;j++){
1203                 newTau[j] /= norms[i];
1204             }
1205             
1206             for(int j=0;j<numOTUs;j++){
1207                 if(newTau[j] > MIN_TAU){
1208                     
1209                     int oldTotal = total;
1210                     
1211                     total++;
1212                     
1213                     singleTau.resize(total, 0);
1214                     seqNumber.resize(total, 0);
1215                     seqIndex.resize(total, 0);
1216                     
1217                     singleTau[oldTotal] = newTau[j];
1218                     
1219                     aaP[j][nSeqsPerOTU[j]] = oldTotal;
1220                     aaI[j][nSeqsPerOTU[j]] = i;
1221                     nSeqsPerOTU[j]++;
1222                 }
1223             }
1224             
1225         }
1226         
1227     }
1228     catch(exception& e) {
1229         m->errorOut(e, "ShhherCommand", "calcNewDistancesParent");
1230         exit(1);        
1231     }           
1232 }
1233
1234 /**************************************************************************************************/
1235
1236 void ShhherCommand::setOTUs(){
1237     
1238     try {
1239         vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
1240         
1241         for(int i=0;i<numOTUs;i++){
1242             
1243             if (m->control_pressed) { break; }
1244             
1245             for(int j=0;j<nSeqsPerOTU[i];j++){
1246                 int index = cumNumSeqs[i] + j;
1247                 double tauValue = singleTau[seqNumber[index]];
1248                 int sIndex = seqIndex[index];
1249                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
1250             }
1251         }
1252         
1253         for(int i=0;i<numSeqs;i++){
1254             double maxTau = -1.0000;
1255             int maxOTU = -1;
1256             for(int j=0;j<numOTUs;j++){
1257                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
1258                     maxTau = bigTauMatrix[i * numOTUs + j];
1259                     maxOTU = j;
1260                 }
1261             }
1262             
1263             otuData[i] = maxOTU;
1264         }
1265         
1266         nSeqsPerOTU.assign(numOTUs, 0);         
1267         
1268         for(int i=0;i<numSeqs;i++){
1269             int index = otuData[i];
1270             
1271             singleTau[i] = 1.0000;
1272             dist[i] = 0.0000;
1273             
1274             aaP[index][nSeqsPerOTU[index]] = i;
1275             aaI[index][nSeqsPerOTU[index]] = i;
1276             
1277             nSeqsPerOTU[index]++;
1278         }
1279         fill(); 
1280     }
1281     catch(exception& e) {
1282         m->errorOut(e, "ShhherCommand", "setOTUs");
1283         exit(1);        
1284     }           
1285 }
1286
1287 /**************************************************************************************************/
1288  
1289 void ShhherCommand::getUniques(){
1290     try{
1291         
1292         
1293         numUniques = 0;
1294         uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
1295         uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
1296         uniqueLengths.assign(numSeqs, 0);
1297         mapSeqToUnique.assign(numSeqs, -1);
1298         mapUniqueToSeq.assign(numSeqs, -1);
1299         
1300         vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
1301         
1302         for(int i=0;i<numSeqs;i++){
1303             
1304             if (m->control_pressed) { break; }
1305             
1306             int index = 0;
1307             
1308             vector<short> current(numFlowCells);
1309             for(int j=0;j<numFlowCells;j++){
1310                 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
1311             }
1312             
1313             for(int j=0;j<numUniques;j++){
1314                 int offset = j * numFlowCells;
1315                 bool toEnd = 1;
1316                 
1317                 int shorterLength;
1318                 if(lengths[i] < uniqueLengths[j])       {       shorterLength = lengths[i];                     }
1319                 else                                                            {       shorterLength = uniqueLengths[j];       }
1320                 
1321                 for(int k=0;k<shorterLength;k++){
1322                     if(current[k] != uniqueFlowgrams[offset + k]){
1323                         toEnd = 0;
1324                         break;
1325                     }
1326                 }
1327                 
1328                 if(toEnd){
1329                     mapSeqToUnique[i] = j;
1330                     uniqueCount[j]++;
1331                     index = j;
1332                     if(lengths[i] > uniqueLengths[j])   {       uniqueLengths[j] = lengths[i];  }
1333                     break;
1334                 }
1335                 index++;
1336             }
1337             
1338             if(index == numUniques){
1339                 uniqueLengths[numUniques] = lengths[i];
1340                 uniqueCount[numUniques] = 1;
1341                 mapSeqToUnique[i] = numUniques;//anMap
1342                 mapUniqueToSeq[numUniques] = i;//anF
1343                 
1344                 for(int k=0;k<numFlowCells;k++){
1345                     uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
1346                     uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
1347                 }
1348                 
1349                 numUniques++;
1350             }
1351         }
1352         uniqueFlowDataIntI.resize(numFlowCells * numUniques);
1353         uniqueLengths.resize(numUniques);       
1354         
1355         flowDataPrI.resize(numSeqs * numFlowCells, 0);
1356         for(int i=0;i<flowDataPrI.size();i++)   {       if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);          }
1357     }
1358     catch(exception& e) {
1359         m->errorOut(e, "ShhherCommand", "getUniques");
1360         exit(1);
1361     }
1362 }
1363
1364 /**************************************************************************************************/
1365
1366 float ShhherCommand::calcPairwiseDist(int seqA, int seqB){
1367     try{
1368         int minLength = lengths[mapSeqToUnique[seqA]];
1369         if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
1370         
1371         int ANumFlowCells = seqA * numFlowCells;
1372         int BNumFlowCells = seqB * numFlowCells;
1373         
1374         float dist = 0;
1375         
1376         for(int i=0;i<minLength;i++){
1377             
1378             if (m->control_pressed) { break; }
1379             
1380             int flowAIntI = flowDataIntI[ANumFlowCells + i];
1381             float flowAPrI = flowDataPrI[ANumFlowCells + i];
1382             
1383             int flowBIntI = flowDataIntI[BNumFlowCells + i];
1384             float flowBPrI = flowDataPrI[BNumFlowCells + i];
1385             dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
1386         }
1387         
1388         dist /= (float) minLength;
1389         return dist;
1390     }
1391     catch(exception& e) {
1392         m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
1393         exit(1);
1394     }
1395 }
1396
1397 //**********************************************************************************************************************/
1398
1399 string ShhherCommand::cluster(string distFileName, string namesFileName){
1400     try {
1401         
1402         ReadMatrix* read = new ReadColumnMatrix(distFileName);  
1403                 read->setCutoff(cutoff);
1404                 
1405                 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
1406                 clusterNameMap->readMap();
1407                 read->read(clusterNameMap);
1408         
1409                 ListVector* list = read->getListVector();
1410                 SparseDistanceMatrix* matrix = read->getDMatrix();
1411                 
1412                 delete read; 
1413                 delete clusterNameMap; 
1414         
1415         RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
1416         
1417         Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); 
1418         string tag = cluster->getTag();
1419         
1420         double clusterCutoff = cutoff;
1421         while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
1422             
1423             if (m->control_pressed) { break; }
1424             
1425             cluster->update(clusterCutoff);
1426         }
1427         
1428         list->setLabel(toString(cutoff));
1429         
1430         string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
1431         ofstream listFile;
1432         m->openOutputFile(listFileName, listFile);
1433         list->print(listFile);
1434         listFile.close();
1435         
1436         delete matrix;  delete cluster; delete rabund; delete list;
1437         
1438         return listFileName;
1439     }
1440     catch(exception& e) {
1441         m->errorOut(e, "ShhherCommand", "cluster");
1442         exit(1);        
1443     }           
1444 }
1445
1446 /**************************************************************************************************/
1447
1448 void ShhherCommand::calcCentroidsDriver(int start, int finish){                          
1449     
1450     //this function gets the most likely homopolymer length at a flow position for a group of sequences
1451     //within an otu
1452     
1453     try{
1454         
1455         for(int i=start;i<finish;i++){
1456             
1457             if (m->control_pressed) { break; }
1458             
1459             double count = 0;
1460             int position = 0;
1461             int minFlowGram = 100000000;
1462             double minFlowValue = 1e8;
1463             change[i] = 0; //FALSE
1464             
1465             for(int j=0;j<nSeqsPerOTU[i];j++){
1466                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
1467             }
1468             
1469             if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
1470                 vector<double> adF(nSeqsPerOTU[i]);
1471                 vector<int> anL(nSeqsPerOTU[i]);
1472                 
1473                 for(int j=0;j<nSeqsPerOTU[i];j++){
1474                     int index = cumNumSeqs[i] + j;
1475                     int nI = seqIndex[index];
1476                     int nIU = mapSeqToUnique[nI];
1477                     
1478                     int k;
1479                     for(k=0;k<position;k++){
1480                         if(nIU == anL[k]){
1481                             break;
1482                         }
1483                     }
1484                     if(k == position){
1485                         anL[position] = nIU;
1486                         adF[position] = 0.0000;
1487                         position++;
1488                     }                                           
1489                 }
1490                 
1491                 for(int j=0;j<nSeqsPerOTU[i];j++){
1492                     int index = cumNumSeqs[i] + j;
1493                     int nI = seqIndex[index];
1494                     
1495                     double tauValue = singleTau[seqNumber[index]];
1496                     
1497                     for(int k=0;k<position;k++){
1498                         double dist = getDistToCentroid(anL[k], nI, lengths[nI]);
1499                         adF[k] += dist * tauValue;
1500                     }
1501                 }
1502                 
1503                 for(int j=0;j<position;j++){
1504                     if(adF[j] < minFlowValue){
1505                         minFlowGram = j;
1506                         minFlowValue = adF[j];
1507                     }
1508                 }
1509                 
1510                 if(centroids[i] != anL[minFlowGram]){
1511                     change[i] = 1;
1512                     centroids[i] = anL[minFlowGram];
1513                 }
1514             }
1515             else if(centroids[i] != -1){
1516                 change[i] = 1;
1517                 centroids[i] = -1;                      
1518             }
1519         }
1520     }
1521     catch(exception& e) {
1522         m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1523         exit(1);        
1524     }           
1525 }
1526
1527 /**************************************************************************************************/
1528
1529 double ShhherCommand::getDistToCentroid(int cent, int flow, int length){
1530     try{
1531         
1532         int flowAValue = cent * numFlowCells;
1533         int flowBValue = flow * numFlowCells;
1534         
1535         double dist = 0;
1536         
1537         for(int i=0;i<length;i++){
1538             dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
1539             flowAValue++;
1540             flowBValue++;
1541         }
1542         
1543         return dist / (double)length;
1544     }
1545     catch(exception& e) {
1546         m->errorOut(e, "ShhherCommand", "getDistToCentroid");
1547         exit(1);        
1548     }           
1549 }
1550
1551 /**************************************************************************************************/
1552
1553 double ShhherCommand::getNewWeights(){
1554     try{
1555         
1556         double maxChange = 0;
1557         
1558         for(int i=0;i<numOTUs;i++){
1559             
1560             if (m->control_pressed) { break; }
1561             
1562             double difference = weight[i];
1563             weight[i] = 0;
1564             
1565             for(int j=0;j<nSeqsPerOTU[i];j++){
1566                 int index = cumNumSeqs[i] + j;
1567                 double tauValue = singleTau[seqNumber[index]];
1568                 weight[i] += tauValue;
1569             }
1570             
1571             difference = fabs(weight[i] - difference);
1572             if(difference > maxChange){ maxChange = difference; }
1573         }
1574         return maxChange;
1575     }
1576     catch(exception& e) {
1577         m->errorOut(e, "ShhherCommand", "getNewWeights");
1578         exit(1);        
1579     }           
1580 }
1581  
1582  /**************************************************************************************************/
1583  
1584 double ShhherCommand::getLikelihood(){
1585     
1586     try{
1587         
1588         vector<long double> P(numSeqs, 0);
1589         int effNumOTUs = 0;
1590         
1591         for(int i=0;i<numOTUs;i++){
1592             if(weight[i] > MIN_WEIGHT){
1593                 effNumOTUs++;
1594             }
1595         }
1596         
1597         string hold;
1598         for(int i=0;i<numOTUs;i++){
1599             
1600             if (m->control_pressed) { break; }
1601             
1602             for(int j=0;j<nSeqsPerOTU[i];j++){
1603                 int index = cumNumSeqs[i] + j;
1604                 int nI = seqIndex[index];
1605                 double singleDist = dist[seqNumber[index]];
1606                 
1607                 P[nI] += weight[i] * exp(-singleDist * sigma);
1608             }
1609         }
1610         double nLL = 0.00;
1611         for(int i=0;i<numSeqs;i++){
1612             if(P[i] == 0){      P[i] = DBL_EPSILON;     }
1613             
1614             nLL += -log(P[i]);
1615         }
1616         
1617         nLL = nLL -(double)numSeqs * log(sigma);
1618         
1619         return nLL; 
1620     }
1621     catch(exception& e) {
1622         m->errorOut(e, "ShhherCommand", "getNewWeights");
1623         exit(1);        
1624     }           
1625 }
1626
1627 /**************************************************************************************************/
1628
1629 void ShhherCommand::checkCentroids(){
1630     try{
1631         vector<int> unique(numOTUs, 1);
1632         
1633         for(int i=0;i<numOTUs;i++){
1634             if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
1635                 unique[i] = -1;
1636             }
1637         }
1638         
1639         for(int i=0;i<numOTUs;i++){
1640             
1641             if (m->control_pressed) { break; }
1642             
1643             if(unique[i] == 1){
1644                 for(int j=i+1;j<numOTUs;j++){
1645                     if(unique[j] == 1){
1646                         
1647                         if(centroids[j] == centroids[i]){
1648                             unique[j] = 0;
1649                             centroids[j] = -1;
1650                             
1651                             weight[i] += weight[j];
1652                             weight[j] = 0.0;
1653                         }
1654                     }
1655                 }
1656             }
1657         }
1658     }
1659     catch(exception& e) {
1660         m->errorOut(e, "ShhherCommand", "checkCentroids");
1661         exit(1);        
1662     }           
1663 }
1664  /**************************************************************************************************/
1665
1666
1667  
1668 void ShhherCommand::writeQualities(vector<int> otuCounts){
1669     
1670     try {
1671         string thisOutputDir = outputDir;
1672         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1673         map<string, string> variables; 
1674                 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(flowFileName));
1675         string qualityFileName = getOutputFileName("qfile",variables);
1676         
1677         ofstream qualityFile;
1678         m->openOutputFile(qualityFileName, qualityFile);
1679         
1680         qualityFile.setf(ios::fixed, ios::floatfield);
1681         qualityFile.setf(ios::showpoint);
1682         qualityFile << setprecision(6);
1683         
1684         vector<vector<int> > qualities(numOTUs);
1685         vector<double> pr(HOMOPS, 0);
1686         
1687         
1688         for(int i=0;i<numOTUs;i++){
1689             
1690             if (m->control_pressed) { break; }
1691             
1692             int index = 0;
1693             int base = 0;
1694             
1695             if(nSeqsPerOTU[i] > 0){
1696                 qualities[i].assign(1024, -1);
1697                 
1698                 while(index < numFlowCells){
1699                     double maxPrValue = 1e8;
1700                     short maxPrIndex = -1;
1701                     double count = 0.0000;
1702                     
1703                     pr.assign(HOMOPS, 0);
1704                     
1705                     for(int j=0;j<nSeqsPerOTU[i];j++){
1706                         int lIndex = cumNumSeqs[i] + j;
1707                         double tauValue = singleTau[seqNumber[lIndex]];
1708                         int sequenceIndex = aaI[i][j];
1709                         short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
1710                         
1711                         count += tauValue;
1712                         
1713                         for(int s=0;s<HOMOPS;s++){
1714                             pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
1715                         }
1716                     }
1717                     
1718                     maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
1719                     maxPrValue = pr[maxPrIndex];
1720                     
1721                     if(count > MIN_COUNT){
1722                         double U = 0.0000;
1723                         double norm = 0.0000;
1724                         
1725                         for(int s=0;s<HOMOPS;s++){
1726                             norm += exp(-(pr[s] - maxPrValue));
1727                         }
1728                         
1729                         for(int s=1;s<=maxPrIndex;s++){
1730                             int value = 0;
1731                             double temp = 0.0000;
1732                             
1733                             U += exp(-(pr[s-1]-maxPrValue))/norm;
1734                             
1735                             if(U>0.00){
1736                                 temp = log10(U);
1737                             }
1738                             else{
1739                                 temp = -10.1;
1740                             }
1741                             temp = floor(-10 * temp);
1742                             value = (int)floor(temp);
1743                             if(value > 100){    value = 100;    }
1744                             
1745                             qualities[i][base] = (int)value;
1746                             base++;
1747                         }
1748                     }
1749                     
1750                     index++;
1751                 }
1752             }
1753             
1754             
1755             if(otuCounts[i] > 0){
1756                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
1757                 
1758                 int j=4;        //need to get past the first four bases
1759                 while(qualities[i][j] != -1){
1760                     qualityFile << qualities[i][j] << ' ';
1761                     j++;
1762                 }
1763                 qualityFile << endl;
1764             }
1765         }
1766         qualityFile.close();
1767         outputNames.push_back(qualityFileName); outputTypes["qfile"].push_back(qualityFileName);
1768         
1769     }
1770     catch(exception& e) {
1771         m->errorOut(e, "ShhherCommand", "writeQualities");
1772         exit(1);        
1773     }           
1774 }
1775
1776 /**************************************************************************************************/
1777
1778 void ShhherCommand::writeSequences(vector<int> otuCounts){
1779     try {
1780         string thisOutputDir = outputDir;
1781         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1782         map<string, string> variables; 
1783                 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
1784         string fastaFileName = getOutputFileName("fasta",variables);
1785         ofstream fastaFile;
1786         m->openOutputFile(fastaFileName, fastaFile);
1787         
1788         vector<string> names(numOTUs, "");
1789         
1790         for(int i=0;i<numOTUs;i++){
1791             
1792             if (m->control_pressed) { break; }
1793             
1794             int index = centroids[i];
1795             
1796             if(otuCounts[i] > 0){
1797                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
1798                 
1799                 string newSeq = "";
1800                 
1801                 for(int j=0;j<numFlowCells;j++){
1802                     
1803                     char base = flowOrder[j % flowOrder.length()];
1804                     for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
1805                         newSeq += base;
1806                     }
1807                 }
1808                 
1809                 fastaFile << newSeq.substr(4) << endl;
1810             }
1811         }
1812         fastaFile.close();
1813         
1814         outputNames.push_back(fastaFileName); outputTypes["fasta"].push_back(fastaFileName);
1815         
1816         if(compositeFASTAFileName != ""){
1817             m->appendFiles(fastaFileName, compositeFASTAFileName);
1818         }
1819     }
1820     catch(exception& e) {
1821         m->errorOut(e, "ShhherCommand", "writeSequences");
1822         exit(1);        
1823     }           
1824 }
1825
1826 /**************************************************************************************************/
1827
1828 void ShhherCommand::writeNames(vector<int> otuCounts){
1829     try {
1830         string thisOutputDir = outputDir;
1831         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1832         map<string, string> variables; 
1833                 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
1834         string nameFileName = getOutputFileName("name",variables);
1835         ofstream nameFile;
1836         m->openOutputFile(nameFileName, nameFile);
1837         
1838         for(int i=0;i<numOTUs;i++){
1839             
1840             if (m->control_pressed) { break; }
1841             
1842             if(otuCounts[i] > 0){
1843                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
1844                 
1845                 for(int j=1;j<nSeqsPerOTU[i];j++){
1846                     nameFile << ',' << seqNameVector[aaI[i][j]];
1847                 }
1848                 
1849                 nameFile << endl;
1850             }
1851         }
1852         nameFile.close();
1853         outputNames.push_back(nameFileName); outputTypes["name"].push_back(nameFileName);
1854         
1855         
1856         if(compositeNamesFileName != ""){
1857             m->appendFiles(nameFileName, compositeNamesFileName);
1858         }               
1859     }
1860     catch(exception& e) {
1861         m->errorOut(e, "ShhherCommand", "writeNames");
1862         exit(1);        
1863     }           
1864 }
1865
1866 /**************************************************************************************************/
1867
1868 void ShhherCommand::writeGroups(){
1869     try {
1870         string thisOutputDir = outputDir;
1871         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1872         string fileRoot = m->getRootName(m->getSimpleName(flowFileName));
1873         int pos = fileRoot.find_first_of('.');
1874         string fileGroup = fileRoot;
1875         if (pos != string::npos) {  fileGroup = fileRoot.substr(pos+1, (fileRoot.length()-1-(pos+1)));  }
1876         map<string, string> variables; 
1877                 variables["[filename]"] = thisOutputDir + fileRoot;
1878         string groupFileName = getOutputFileName("group",variables);
1879         ofstream groupFile;
1880         m->openOutputFile(groupFileName, groupFile);
1881         
1882         for(int i=0;i<numSeqs;i++){
1883             if (m->control_pressed) { break; }
1884             groupFile << seqNameVector[i] << '\t' << fileGroup << endl;
1885         }
1886         groupFile.close();
1887         outputNames.push_back(groupFileName); outputTypes["group"].push_back(groupFileName);
1888         
1889     }
1890     catch(exception& e) {
1891         m->errorOut(e, "ShhherCommand", "writeGroups");
1892         exit(1);        
1893     }           
1894 }
1895
1896 /**************************************************************************************************/
1897
1898 void ShhherCommand::writeClusters(vector<int> otuCounts){
1899     try {
1900         string thisOutputDir = outputDir;
1901         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1902         map<string, string> variables; 
1903                 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
1904         string otuCountsFileName = getOutputFileName("counts",variables);
1905         ofstream otuCountsFile;
1906         m->openOutputFile(otuCountsFileName, otuCountsFile);
1907         
1908         string bases = flowOrder;
1909         
1910         for(int i=0;i<numOTUs;i++){
1911             
1912             if (m->control_pressed) {
1913                 break;
1914             }
1915             //output the translated version of the centroid sequence for the otu
1916             if(otuCounts[i] > 0){
1917                 int index = centroids[i];
1918                 
1919                 otuCountsFile << "ideal\t";
1920                 for(int j=8;j<numFlowCells;j++){
1921                     char base = bases[j % bases.length()];
1922                     for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
1923                         otuCountsFile << base;
1924                     }
1925                 }
1926                 otuCountsFile << endl;
1927                 
1928                 for(int j=0;j<nSeqsPerOTU[i];j++){
1929                     int sequence = aaI[i][j];
1930                     otuCountsFile << seqNameVector[sequence] << '\t';
1931                     
1932                     string newSeq = "";
1933                     
1934                     for(int k=0;k<lengths[sequence];k++){
1935                         char base = bases[k % bases.length()];
1936                         int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
1937                         
1938                         for(int s=0;s<freq;s++){
1939                             newSeq += base;
1940                             //otuCountsFile << base;
1941                         }
1942                     }
1943                     otuCountsFile << newSeq.substr(4) << endl;
1944                 }
1945                 otuCountsFile << endl;
1946             }
1947         }
1948         otuCountsFile.close();
1949         outputNames.push_back(otuCountsFileName); outputTypes["counts"].push_back(otuCountsFileName);
1950         
1951     }
1952     catch(exception& e) {
1953         m->errorOut(e, "ShhherCommand", "writeClusters");
1954         exit(1);        
1955     }           
1956 }
1957
1958 #else
1959 //**********************************************************************************************************************
1960
1961 int ShhherCommand::execute(){
1962         try {
1963                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
1964                 
1965                 getSingleLookUp();      if (m->control_pressed) { return 0; }
1966                 getJointLookUp();       if (m->control_pressed) { return 0; }
1967                 
1968         int numFiles = flowFileVector.size();
1969                 
1970         if (numFiles < processors) { processors = numFiles; }
1971         
1972 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1973         if (processors == 1) { driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName); }
1974         else { createProcesses(flowFileVector); } //each processor processes one file
1975 #else
1976         driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName);
1977 #endif
1978         
1979                 if(compositeFASTAFileName != ""){
1980                         outputNames.push_back(compositeFASTAFileName); outputTypes["fasta"].push_back(compositeFASTAFileName);
1981                         outputNames.push_back(compositeNamesFileName); outputTypes["name"].push_back(compositeNamesFileName);
1982                 }
1983
1984                 m->mothurOutEndLine();
1985                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
1986                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
1987                 m->mothurOutEndLine();
1988                 
1989                 return 0;
1990         }
1991         catch(exception& e) {
1992                 m->errorOut(e, "ShhherCommand", "execute");
1993                 exit(1);
1994         }
1995 }
1996 #endif
1997 //********************************************************************************************************************
1998 //sorts biggest to smallest
1999 inline bool compareFileSizes(string left, string right){
2000     
2001     FILE * pFile;
2002     long leftsize = 0;
2003     
2004     //get num bytes in file
2005     string filename = left;
2006     pFile = fopen (filename.c_str(),"rb");
2007     string error = "Error opening " + filename;
2008     if (pFile==NULL) perror (error.c_str());
2009     else{
2010         fseek (pFile, 0, SEEK_END);
2011         leftsize=ftell (pFile);
2012         fclose (pFile);
2013     }
2014     
2015     FILE * pFile2;
2016     long rightsize = 0;
2017     
2018     //get num bytes in file
2019     filename = right;
2020     pFile2 = fopen (filename.c_str(),"rb");
2021     error = "Error opening " + filename;
2022     if (pFile2==NULL) perror (error.c_str());
2023     else{
2024         fseek (pFile2, 0, SEEK_END);
2025         rightsize=ftell (pFile2);
2026         fclose (pFile2);
2027     }
2028     
2029     return (leftsize > rightsize);      
2030
2031 /**************************************************************************************************/
2032
2033 int ShhherCommand::createProcesses(vector<string> filenames){
2034     try {
2035         vector<int> processIDS;
2036                 int process = 1;
2037                 int num = 0;
2038                 
2039                 //sanity check
2040                 if (filenames.size() < processors) { processors = filenames.size(); }
2041         
2042         //sort file names by size to divide load better
2043         sort(filenames.begin(), filenames.end(), compareFileSizes);
2044         
2045         vector < vector <string> > dividedFiles; //dividedFiles[1] = vector of filenames for process 1...
2046         dividedFiles.resize(processors);
2047         
2048         //for each file, figure out which process will complete it
2049         //want to divide the load intelligently so the big files are spread between processes
2050         for (int i = 0; i < filenames.size(); i++) { 
2051             int processToAssign = (i+1) % processors; 
2052             if (processToAssign == 0) { processToAssign = processors; }
2053             
2054             dividedFiles[(processToAssign-1)].push_back(filenames[i]);
2055         }
2056         
2057         //now lets reverse the order of ever other process, so we balance big files running with little ones
2058         for (int i = 0; i < processors; i++) {
2059             int remainder = ((i+1) % processors);
2060             if (remainder) {  reverse(dividedFiles[i].begin(), dividedFiles[i].end());  }
2061         }
2062         
2063                         
2064                 //divide the groups between the processors
2065                 /*vector<linePair> lines;
2066         vector<int> numFilesToComplete;
2067                 int numFilesPerProcessor = filenames.size() / processors;
2068                 for (int i = 0; i < processors; i++) {
2069                         int startIndex =  i * numFilesPerProcessor;
2070                         int endIndex = (i+1) * numFilesPerProcessor;
2071                         if(i == (processors - 1)){      endIndex = filenames.size();    }
2072                         lines.push_back(linePair(startIndex, endIndex));
2073             numFilesToComplete.push_back((endIndex-startIndex));
2074                 }*/
2075                 
2076         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)          
2077                 
2078                 //loop through and create all the processes you want
2079                 while (process != processors) {
2080                         int pid = fork();
2081                         
2082                         if (pid > 0) {
2083                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
2084                                 process++;
2085                         }else if (pid == 0){
2086                                 num = driver(dividedFiles[process], compositeFASTAFileName + toString(getpid()) + ".temp", compositeNamesFileName  + toString(getpid()) + ".temp");
2087                 
2088                 //pass numSeqs to parent
2089                                 ofstream out;
2090                                 string tempFile = compositeFASTAFileName + toString(getpid()) + ".num.temp";
2091                                 m->openOutputFile(tempFile, out);
2092                                 out << num << endl;
2093                                 out.close();
2094                 
2095                                 exit(0);
2096                         }else { 
2097                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
2098                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
2099                                 exit(0);
2100                         }
2101                 }
2102                 
2103                 //do my part
2104                 driver(dividedFiles[0], compositeFASTAFileName, compositeNamesFileName);
2105                 
2106                 //force parent to wait until all the processes are done
2107                 for (int i=0;i<processIDS.size();i++) { 
2108                         int temp = processIDS[i];
2109                         wait(&temp);
2110                 }
2111         
2112         #else
2113         
2114         //////////////////////////////////////////////////////////////////////////////////////////////////////
2115         
2116         /////////////////////// NOT WORKING, ACCESS VIOLATION ON READ OF FLOWGRAMS IN THREAD /////////////////
2117         
2118         //////////////////////////////////////////////////////////////////////////////////////////////////////
2119                 //Windows version shared memory, so be careful when passing variables through the shhhFlowsData struct. 
2120                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
2121                 //////////////////////////////////////////////////////////////////////////////////////////////////////
2122                 /*
2123                 vector<shhhFlowsData*> pDataArray; 
2124                 DWORD   dwThreadIdArray[processors-1];
2125                 HANDLE  hThreadArray[processors-1]; 
2126                 
2127                 //Create processor worker threads.
2128                 for( int i=0; i<processors-1; i++ ){
2129                         // Allocate memory for thread data.
2130                         string extension = "";
2131                         if (i != 0) { extension = toString(i) + ".temp"; }
2132                         
2133             shhhFlowsData* tempFlow = new shhhFlowsData(filenames, (compositeFASTAFileName + extension), (compositeNamesFileName + extension), outputDir, flowOrder, jointLookUp, singleLookUp, m, lines[i].start, lines[i].end, cutoff, sigma, minDelta, maxIters, i);
2134                         pDataArray.push_back(tempFlow);
2135                         processIDS.push_back(i);
2136             
2137                         hThreadArray[i] = CreateThread(NULL, 0, ShhhFlowsThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);   
2138                 }
2139                 
2140                 //using the main process as a worker saves time and memory
2141                 //do my part
2142                 driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[processors-1].start, lines[processors-1].end);
2143                 
2144                 //Wait until all threads have terminated.
2145                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
2146                 
2147                 //Close all thread handles and free memory allocations.
2148                 for(int i=0; i < pDataArray.size(); i++){
2149                         for(int j=0; j < pDataArray[i]->outputNames.size(); j++){ outputNames.push_back(pDataArray[i]->outputNames[j]); }
2150                         CloseHandle(hThreadArray[i]);
2151                         delete pDataArray[i];
2152                 }
2153                 */
2154         #endif
2155         
2156         for (int i=0;i<processIDS.size();i++) { 
2157             ifstream in;
2158                         string tempFile =  compositeFASTAFileName + toString(processIDS[i]) + ".num.temp";
2159                         m->openInputFile(tempFile, in);
2160                         if (!in.eof()) { 
2161                 int tempNum = 0; 
2162                 in >> tempNum; 
2163                 if (tempNum != dividedFiles[i+1].size()) {
2164                     m->mothurOut("[ERROR]: main process expected " + toString(processIDS[i]) + " to complete " + toString(dividedFiles[i+1].size()) + " files, and it only reported completing " + toString(tempNum) + ". This will cause file mismatches.  The flow files may be too large to process with multiple processors. \n");
2165                 }
2166             }
2167                         in.close(); m->mothurRemove(tempFile);
2168             
2169             if (compositeFASTAFileName != "") {
2170                 m->appendFiles((compositeFASTAFileName + toString(processIDS[i]) + ".temp"), compositeFASTAFileName);
2171                 m->appendFiles((compositeNamesFileName + toString(processIDS[i]) + ".temp"), compositeNamesFileName);
2172                 m->mothurRemove((compositeFASTAFileName + toString(processIDS[i]) + ".temp"));
2173                 m->mothurRemove((compositeNamesFileName + toString(processIDS[i]) + ".temp"));
2174             }
2175         }
2176         
2177         return 0;
2178         
2179     }
2180         catch(exception& e) {
2181                 m->errorOut(e, "ShhherCommand", "createProcesses");
2182                 exit(1);
2183         }
2184 }
2185 /**************************************************************************************************/
2186
2187 vector<string> ShhherCommand::parseFlowFiles(string filename){
2188     try {
2189         vector<string> files;
2190         int count = 0;
2191         
2192         ifstream in;
2193         m->openInputFile(filename, in);
2194         
2195         int thisNumFLows = 0;
2196         in >> thisNumFLows; m->gobble(in);
2197         
2198         while (!in.eof()) {
2199             if (m->control_pressed) { break; }
2200             
2201             ofstream out;
2202             string outputFileName = filename + toString(count) + ".temp";
2203             m->openOutputFile(outputFileName, out);
2204             out << thisNumFLows << endl;
2205             files.push_back(outputFileName);
2206             
2207             int numLinesWrote = 0;
2208             for (int i = 0; i < largeSize; i++) {
2209                 if (in.eof()) { break; }
2210                 string line = m->getline(in); m->gobble(in);
2211                 out << line << endl;
2212                 numLinesWrote++;
2213             }
2214             out.close();
2215             
2216             if (numLinesWrote == 0) {  m->mothurRemove(outputFileName); files.pop_back();  }
2217             count++;
2218         }
2219         in.close();
2220         
2221         if (m->control_pressed) { for (int i = 0; i < files.size(); i++) { m->mothurRemove(files[i]); }  files.clear(); }
2222         
2223         m->mothurOut("\nDivided " + filename + " into " + toString(files.size()) + " files.\n\n"); 
2224         
2225         return files;
2226     }
2227         catch(exception& e) {
2228                 m->errorOut(e, "ShhherCommand", "parseFlowFiles");
2229                 exit(1);
2230         }
2231 }
2232 /**************************************************************************************************/
2233
2234 int ShhherCommand::driver(vector<string> filenames, string thisCompositeFASTAFileName, string thisCompositeNamesFileName){
2235     try {
2236         
2237         int numCompleted = 0;
2238         
2239         for(int i=0;i<filenames.size();i++){
2240                         
2241                         if (m->control_pressed) { break; }
2242                         
2243             vector<string> theseFlowFileNames; theseFlowFileNames.push_back(filenames[i]);
2244             if (large) {  theseFlowFileNames = parseFlowFiles(filenames[i]);  }
2245             
2246             if (m->control_pressed) { break; }
2247             
2248             double begClock = clock();
2249             unsigned long long begTime;
2250             
2251             string fileNameForOutput = filenames[i];
2252             
2253             for (int g = 0; g < theseFlowFileNames.size(); g++) {
2254                 
2255                 string flowFileName = theseFlowFileNames[g];
2256                 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(filenames.size()) + ")\t<<<<<\n");
2257                 m->mothurOut("Reading flowgrams...\n");
2258                 
2259                 vector<string> seqNameVector;
2260                 vector<int> lengths;
2261                 vector<short> flowDataIntI;
2262                 vector<double> flowDataPrI;
2263                 map<string, int> nameMap;
2264                 vector<short> uniqueFlowgrams;
2265                 vector<int> uniqueCount;
2266                 vector<int> mapSeqToUnique;
2267                 vector<int> mapUniqueToSeq;
2268                 vector<int> uniqueLengths;
2269                 int numFlowCells;
2270                 
2271                 if (m->debug) { m->mothurOut("[DEBUG]: About to read flowgrams.\n"); }
2272                 int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells);
2273                 
2274                 if (m->control_pressed) { break; }
2275                 
2276                 m->mothurOut("Identifying unique flowgrams...\n");
2277                 int numUniques = getUniques(numSeqs, numFlowCells, uniqueFlowgrams, uniqueCount, uniqueLengths, mapSeqToUnique, mapUniqueToSeq, lengths, flowDataPrI, flowDataIntI);
2278                 
2279                 if (m->control_pressed) { break; }
2280                 
2281                 m->mothurOut("Calculating distances between flowgrams...\n");
2282                 string distFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
2283                 begTime = time(NULL);
2284                
2285                 
2286                 flowDistParentFork(numFlowCells, distFileName, numUniques, mapUniqueToSeq, mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
2287                 
2288                 m->mothurOutEndLine();
2289                 m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
2290                 
2291                 
2292                 string namesFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
2293                 createNamesFile(numSeqs, numUniques, namesFileName, seqNameVector, mapSeqToUnique, mapUniqueToSeq);
2294                 
2295                 if (m->control_pressed) { break; }
2296                 
2297                 m->mothurOut("\nClustering flowgrams...\n");
2298                 string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
2299                 cluster(listFileName, distFileName, namesFileName);
2300                 
2301                 if (m->control_pressed) { break; }
2302                 
2303                 vector<int> otuData;
2304                 vector<int> cumNumSeqs;
2305                 vector<int> nSeqsPerOTU;
2306                 vector<vector<int> > aaP;       //tMaster->aanP:        each row is a different otu / each col contains the sequence indices
2307                 vector<vector<int> > aaI;       //tMaster->aanI:        that are in each otu - can't differentiate between aaP and aaI 
2308                 vector<int> seqNumber;          //tMaster->anP:         the sequence id number sorted by OTU
2309                 vector<int> seqIndex;           //tMaster->anI;         the index that corresponds to seqNumber
2310                 
2311                 
2312                 int numOTUs = getOTUData(numSeqs, listFileName, otuData, cumNumSeqs, nSeqsPerOTU, aaP, aaI, seqNumber, seqIndex, nameMap);
2313                 
2314                 if (m->control_pressed) { break; }
2315                 
2316                 m->mothurRemove(distFileName);
2317                 m->mothurRemove(namesFileName);
2318                 m->mothurRemove(listFileName);
2319                 
2320                 vector<double> dist;            //adDist - distance of sequences to centroids
2321                 vector<short> change;           //did the centroid sequence change? 0 = no; 1 = yes
2322                 vector<int> centroids;          //the representative flowgram for each cluster m
2323                 vector<double> weight;
2324                 vector<double> singleTau;       //tMaster->adTau:       1-D Tau vector (1xnumSeqs)
2325                 vector<int> nSeqsBreaks;
2326                 vector<int> nOTUsBreaks;
2327                 
2328                 if (m->debug) { m->mothurOut("[DEBUG]: numSeqs = " + toString(numSeqs) + " numOTUS = " + toString(numOTUs) + " about to alloc a dist vector with size = " + toString((numSeqs * numOTUs)) + ".\n"); }
2329                 
2330                 dist.assign(numSeqs * numOTUs, 0);
2331                 change.assign(numOTUs, 1);
2332                 centroids.assign(numOTUs, -1);
2333                 weight.assign(numOTUs, 0);
2334                 singleTau.assign(numSeqs, 1.0);
2335                 
2336                 nSeqsBreaks.assign(2, 0);
2337                 nOTUsBreaks.assign(2, 0);
2338                 
2339                 nSeqsBreaks[0] = 0;
2340                 nSeqsBreaks[1] = numSeqs;
2341                 nOTUsBreaks[1] = numOTUs;
2342                 
2343                 if (m->debug) { m->mothurOut("[DEBUG]: done allocating memory, about to denoise.\n"); }
2344                 
2345                 if (m->control_pressed) { break; }
2346                 
2347                 double maxDelta = 0;
2348                 int iter = 0;
2349                 
2350                 begClock = clock();
2351                 begTime = time(NULL);
2352                 
2353                 m->mothurOut("\nDenoising flowgrams...\n");
2354                 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
2355                 
2356                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
2357                     
2358                     if (m->control_pressed) { break; }
2359                     
2360                     double cycClock = clock();
2361                     unsigned long long cycTime = time(NULL);
2362                     fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2363                     
2364                     if (m->control_pressed) { break; }
2365                     
2366                     calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
2367                     
2368                     if (m->control_pressed) { break; }
2369                     
2370                     maxDelta = getNewWeights(numOTUs, cumNumSeqs, nSeqsPerOTU, singleTau, seqNumber, weight);  
2371                     
2372                     if (m->control_pressed) { break; }
2373                     
2374                     double nLL = getLikelihood(numSeqs, numOTUs, nSeqsPerOTU, seqNumber, cumNumSeqs, seqIndex, dist, weight); 
2375                     
2376                     if (m->control_pressed) { break; }
2377                     
2378                     checkCentroids(numOTUs, centroids, weight);
2379                     
2380                     if (m->control_pressed) { break; }
2381                     
2382                     calcNewDistances(numSeqs, numOTUs, nSeqsPerOTU,  dist, weight, change, centroids, aaP, singleTau, aaI, seqNumber, seqIndex, uniqueFlowgrams, flowDataIntI, numFlowCells, lengths);
2383                     
2384                     if (m->control_pressed) { break; }
2385                     
2386                     iter++;
2387                     
2388                     m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
2389                     
2390                 }       
2391                 
2392                 if (m->control_pressed) { break; }
2393                 
2394                 m->mothurOut("\nFinalizing...\n");
2395                 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2396                 
2397                 if (m->control_pressed) { break; }
2398                 
2399                 setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI);
2400                 
2401                 if (m->control_pressed) { break; }
2402                 
2403                 vector<int> otuCounts(numOTUs, 0);
2404                 for(int j=0;j<numSeqs;j++)      {       otuCounts[otuData[j]]++;        }
2405                 
2406                 calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber); 
2407                 
2408                 if (m->control_pressed) { break; }
2409                 
2410                 if ((large) && (g == 0)) {  flowFileName = filenames[i]; theseFlowFileNames[0] = filenames[i]; }
2411                 string thisOutputDir = outputDir;
2412                 if (outputDir == "") {  thisOutputDir = m->hasPath(flowFileName);  }
2413                 map<string, string> variables; 
2414                 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
2415                 string qualityFileName = getOutputFileName("qfile",variables);
2416                 string fastaFileName = getOutputFileName("fasta",variables);
2417                 string nameFileName = getOutputFileName("name",variables);
2418                 string otuCountsFileName = getOutputFileName("counts",variables);
2419                 string fileRoot = m->getRootName(m->getSimpleName(flowFileName));
2420                 int pos = fileRoot.find_first_of('.');
2421                 string fileGroup = fileRoot;
2422                 if (pos != string::npos) {  fileGroup = fileRoot.substr(pos+1, (fileRoot.length()-1-(pos+1)));  }
2423                 string groupFileName = getOutputFileName("group",variables);
2424
2425                 
2426                 writeQualities(numOTUs, numFlowCells, qualityFileName, otuCounts, nSeqsPerOTU, seqNumber, singleTau, flowDataIntI, uniqueFlowgrams, cumNumSeqs, mapUniqueToSeq, seqNameVector, centroids, aaI); if (m->control_pressed) { break; }
2427                 writeSequences(thisCompositeFASTAFileName, numOTUs, numFlowCells, fastaFileName, otuCounts, uniqueFlowgrams, seqNameVector, aaI, centroids);if (m->control_pressed) { break; }
2428                 writeNames(thisCompositeNamesFileName, numOTUs, nameFileName, otuCounts, seqNameVector, aaI, nSeqsPerOTU);                              if (m->control_pressed) { break; }
2429                 writeClusters(otuCountsFileName, numOTUs, numFlowCells,otuCounts, centroids, uniqueFlowgrams, seqNameVector, aaI, nSeqsPerOTU, lengths, flowDataIntI);                  if (m->control_pressed) { break; }
2430                 writeGroups(groupFileName, fileGroup, numSeqs, seqNameVector);                                          if (m->control_pressed) { break; }
2431                 
2432                 if (large) {
2433                     if (g > 0) {
2434                         variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0]));
2435                         m->appendFiles(qualityFileName, getOutputFileName("qfile",variables));
2436                         m->mothurRemove(qualityFileName);
2437                         m->appendFiles(fastaFileName, getOutputFileName("fasta",variables));
2438                         m->mothurRemove(fastaFileName);
2439                         m->appendFiles(nameFileName, getOutputFileName("name",variables));
2440                         m->mothurRemove(nameFileName);
2441                         m->appendFiles(otuCountsFileName, getOutputFileName("counts",variables));
2442                         m->mothurRemove(otuCountsFileName);
2443                         m->appendFiles(groupFileName, getOutputFileName("group",variables));
2444                         m->mothurRemove(groupFileName);
2445                     }
2446                     m->mothurRemove(theseFlowFileNames[g]);
2447                 }
2448                         }
2449             
2450             numCompleted++;
2451                         m->mothurOut("Total time to process " + fileNameForOutput + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
2452                 }
2453                 
2454         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
2455         
2456         return numCompleted;
2457         
2458     }catch(exception& e) {
2459             m->errorOut(e, "ShhherCommand", "driver");
2460             exit(1);
2461     }
2462 }
2463
2464 /**************************************************************************************************/
2465 int ShhherCommand::getFlowData(string filename, vector<string>& thisSeqNameVector, vector<int>& thisLengths, vector<short>& thisFlowDataIntI, map<string, int>& thisNameMap, int& numFlowCells){
2466         try{
2467        
2468                 ifstream flowFile;
2469        
2470                 m->openInputFile(filename, flowFile);
2471                 
2472                 string seqName;
2473                 int currentNumFlowCells;
2474                 float intensity;
2475         thisSeqNameVector.clear();
2476                 thisLengths.clear();
2477                 thisFlowDataIntI.clear();
2478                 thisNameMap.clear();
2479                 
2480                 string numFlowTest;
2481         flowFile >> numFlowTest;
2482         
2483         if (!m->isContainingOnlyDigits(numFlowTest)) { m->mothurOut("[ERROR]: expected a number and got " + numFlowTest + ", quitting. Did you use the flow parameter instead of the file parameter?"); m->mothurOutEndLine(); exit(1); }
2484         else { convert(numFlowTest, numFlowCells); }
2485         
2486         if (m->debug) { m->mothurOut("[DEBUG]: numFlowCells = " + toString(numFlowCells) + ".\n"); }
2487                 int index = 0;//pcluster
2488                 while(!flowFile.eof()){
2489                         
2490                         if (m->control_pressed) { break; }
2491                         
2492                         flowFile >> seqName >> currentNumFlowCells;
2493             
2494                         thisLengths.push_back(currentNumFlowCells);
2495            
2496                         thisSeqNameVector.push_back(seqName);
2497                         thisNameMap[seqName] = index++;//pcluster
2498             
2499             if (m->debug) { m->mothurOut("[DEBUG]: seqName = " + seqName + " length = " + toString(currentNumFlowCells) + " index = " + toString(index) + "\n"); }
2500             
2501                         for(int i=0;i<numFlowCells;i++){
2502                                 flowFile >> intensity;
2503                                 if(intensity > 9.99)    {       intensity = 9.99;       }
2504                                 int intI = int(100 * intensity + 0.0001);
2505                                 thisFlowDataIntI.push_back(intI);
2506                         }
2507                         m->gobble(flowFile);
2508                 }
2509                 flowFile.close();
2510                 
2511                 int numSeqs = thisSeqNameVector.size();         
2512                 
2513                 for(int i=0;i<numSeqs;i++){
2514                         
2515                         if (m->control_pressed) { break; }
2516                         
2517                         int iNumFlowCells = i * numFlowCells;
2518                         for(int j=thisLengths[i];j<numFlowCells;j++){
2519                                 thisFlowDataIntI[iNumFlowCells + j] = 0;
2520                         }
2521                 }
2522         
2523         return numSeqs;
2524                 
2525         }
2526         catch(exception& e) {
2527                 m->errorOut(e, "ShhherCommand", "getFlowData");
2528                 exit(1);
2529         }
2530 }
2531 /**************************************************************************************************/
2532
2533 int ShhherCommand::flowDistParentFork(int numFlowCells, string distFileName, int stopSeq, vector<int>& mapUniqueToSeq, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2534         try{            
2535         
2536                 ostringstream outStream;
2537                 outStream.setf(ios::fixed, ios::floatfield);
2538                 outStream.setf(ios::dec, ios::basefield);
2539                 outStream.setf(ios::showpoint);
2540                 outStream.precision(6);
2541                 
2542                 int begTime = time(NULL);
2543                 double begClock = clock();
2544         
2545                 for(int i=0;i<stopSeq;i++){
2546                         
2547                         if (m->control_pressed) { break; }
2548                         
2549                         for(int j=0;j<i;j++){
2550                                 float flowDistance = calcPairwiseDist(numFlowCells, mapUniqueToSeq[i], mapUniqueToSeq[j], mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
2551                 
2552                                 if(flowDistance < 1e-6){
2553                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
2554                                 }
2555                                 else if(flowDistance <= cutoff){
2556                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
2557                                 }
2558                         }
2559                         if(i % 100 == 0){
2560                                 m->mothurOutJustToScreen(toString(i) + "\t" + toString(time(NULL) - begTime));
2561                                 m->mothurOutJustToScreen("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC)+"\n");
2562                         }
2563                 }
2564                 
2565                 ofstream distFile(distFileName.c_str());
2566                 distFile << outStream.str();            
2567                 distFile.close();
2568                 
2569                 if (m->control_pressed) {}
2570                 else {
2571                         m->mothurOutJustToScreen(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
2572                         m->mothurOutJustToScreen("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC)+"\n");
2573                 }
2574         
2575         return 0;
2576         }
2577         catch(exception& e) {
2578                 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
2579                 exit(1);
2580         }
2581 }
2582 /**************************************************************************************************/
2583
2584 float ShhherCommand::calcPairwiseDist(int numFlowCells, int seqA, int seqB, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2585         try{
2586                 int minLength = lengths[mapSeqToUnique[seqA]];
2587                 if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
2588                 
2589                 int ANumFlowCells = seqA * numFlowCells;
2590                 int BNumFlowCells = seqB * numFlowCells;
2591                 
2592                 float dist = 0;
2593                 
2594                 for(int i=0;i<minLength;i++){
2595                         
2596                         if (m->control_pressed) { break; }
2597                         
2598                         int flowAIntI = flowDataIntI[ANumFlowCells + i];
2599                         float flowAPrI = flowDataPrI[ANumFlowCells + i];
2600                         
2601                         int flowBIntI = flowDataIntI[BNumFlowCells + i];
2602                         float flowBPrI = flowDataPrI[BNumFlowCells + i];
2603                         dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
2604                 }
2605                 
2606                 dist /= (float) minLength;
2607                 return dist;
2608         }
2609         catch(exception& e) {
2610                 m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
2611                 exit(1);
2612         }
2613 }
2614
2615 /**************************************************************************************************/
2616
2617 int ShhherCommand::getUniques(int numSeqs, int numFlowCells, vector<short>& uniqueFlowgrams, vector<int>& uniqueCount, vector<int>& uniqueLengths, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2618         try{
2619                 int numUniques = 0;
2620                 uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
2621                 uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
2622                 uniqueLengths.assign(numSeqs, 0);
2623                 mapSeqToUnique.assign(numSeqs, -1);
2624                 mapUniqueToSeq.assign(numSeqs, -1);
2625                 
2626                 vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
2627                 
2628                 for(int i=0;i<numSeqs;i++){
2629                         
2630                         if (m->control_pressed) { break; }
2631                         
2632                         int index = 0;
2633                         
2634                         vector<short> current(numFlowCells);
2635                         for(int j=0;j<numFlowCells;j++){
2636                                 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
2637                         }
2638             
2639                         for(int j=0;j<numUniques;j++){
2640                                 int offset = j * numFlowCells;
2641                                 bool toEnd = 1;
2642                                 
2643                                 int shorterLength;
2644                                 if(lengths[i] < uniqueLengths[j])       {       shorterLength = lengths[i];                     }
2645                                 else                                                            {       shorterLength = uniqueLengths[j];       }
2646                 
2647                                 for(int k=0;k<shorterLength;k++){
2648                                         if(current[k] != uniqueFlowgrams[offset + k]){
2649                                                 toEnd = 0;
2650                                                 break;
2651                                         }
2652                                 }
2653                                 
2654                                 if(toEnd){
2655                                         mapSeqToUnique[i] = j;
2656                                         uniqueCount[j]++;
2657                                         index = j;
2658                                         if(lengths[i] > uniqueLengths[j])       {       uniqueLengths[j] = lengths[i];  }
2659                                         break;
2660                                 }
2661                                 index++;
2662                         }
2663                         
2664                         if(index == numUniques){
2665                                 uniqueLengths[numUniques] = lengths[i];
2666                                 uniqueCount[numUniques] = 1;
2667                                 mapSeqToUnique[i] = numUniques;//anMap
2668                                 mapUniqueToSeq[numUniques] = i;//anF
2669                                 
2670                                 for(int k=0;k<numFlowCells;k++){
2671                                         uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
2672                                         uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
2673                                 }
2674                                 
2675                                 numUniques++;
2676                         }
2677                 }
2678                 uniqueFlowDataIntI.resize(numFlowCells * numUniques);
2679                 uniqueLengths.resize(numUniques);       
2680                 
2681                 flowDataPrI.resize(numSeqs * numFlowCells, 0);
2682                 for(int i=0;i<flowDataPrI.size();i++)   {       if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);          }
2683         
2684         return numUniques;
2685         }
2686         catch(exception& e) {
2687                 m->errorOut(e, "ShhherCommand", "getUniques");
2688                 exit(1);
2689         }
2690 }
2691 /**************************************************************************************************/
2692 int ShhherCommand::createNamesFile(int numSeqs, int numUniques, string filename, vector<string>& seqNameVector, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq){
2693         try{
2694                 
2695                 vector<string> duplicateNames(numUniques, "");
2696                 for(int i=0;i<numSeqs;i++){
2697                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
2698                 }
2699                 
2700                 ofstream nameFile;
2701                 m->openOutputFile(filename, nameFile);
2702                 
2703                 for(int i=0;i<numUniques;i++){
2704                         
2705                         if (m->control_pressed) { break; }
2706                         
2707             //                  nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2708                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2709                 }
2710                 
2711                 nameFile.close();
2712         
2713                 return 0;
2714         }
2715         catch(exception& e) {
2716                 m->errorOut(e, "ShhherCommand", "createNamesFile");
2717                 exit(1);
2718         }
2719 }
2720 //**********************************************************************************************************************
2721
2722 int ShhherCommand::cluster(string filename, string distFileName, string namesFileName){
2723         try {
2724                 
2725                 ReadMatrix* read = new ReadColumnMatrix(distFileName);  
2726                 read->setCutoff(cutoff);
2727                 
2728                 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
2729                 clusterNameMap->readMap();
2730                 read->read(clusterNameMap);
2731         
2732                 ListVector* list = read->getListVector();
2733                 SparseDistanceMatrix* matrix = read->getDMatrix();
2734                 
2735                 delete read; 
2736                 delete clusterNameMap; 
2737         
2738                 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
2739                 
2740                 Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); 
2741                 string tag = cluster->getTag();
2742                 
2743                 double clusterCutoff = cutoff;
2744                 while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
2745                         
2746                         if (m->control_pressed) { break; }
2747                         
2748                         cluster->update(clusterCutoff);
2749                 }
2750                 
2751                 list->setLabel(toString(cutoff));
2752                 
2753                 ofstream listFile;
2754                 m->openOutputFile(filename, listFile);
2755                 list->print(listFile);
2756                 listFile.close();
2757                 
2758                 delete matrix;  delete cluster; delete rabund; delete list;
2759         
2760                 return 0;
2761         }
2762         catch(exception& e) {
2763                 m->errorOut(e, "ShhherCommand", "cluster");
2764                 exit(1);        
2765         }               
2766 }
2767 /**************************************************************************************************/
2768
2769 int ShhherCommand::getOTUData(int numSeqs, string fileName,  vector<int>& otuData,
2770                                vector<int>& cumNumSeqs,
2771                                vector<int>& nSeqsPerOTU,
2772                                vector<vector<int> >& aaP,       //tMaster->aanP:        each row is a different otu / each col contains the sequence indices
2773                                vector<vector<int> >& aaI,       //tMaster->aanI:        that are in each otu - can't differentiate between aaP and aaI 
2774                                vector<int>& seqNumber,          //tMaster->anP:         the sequence id number sorted by OTU
2775                                vector<int>& seqIndex,
2776                                map<string, int>& nameMap){
2777         try {
2778         
2779                 ifstream listFile;
2780                 m->openInputFile(fileName, listFile);
2781                 string label;
2782         int numOTUs;
2783                 
2784                 listFile >> label >> numOTUs;
2785         
2786         if (m->debug) { m->mothurOut("[DEBUG]: Getting OTU Data...\n"); }
2787         
2788                 otuData.assign(numSeqs, 0);
2789                 cumNumSeqs.assign(numOTUs, 0);
2790                 nSeqsPerOTU.assign(numOTUs, 0);
2791                 aaP.clear();aaP.resize(numOTUs);
2792                 
2793                 seqNumber.clear();
2794                 aaI.clear();
2795                 seqIndex.clear();
2796                 
2797                 string singleOTU = "";
2798                 
2799                 for(int i=0;i<numOTUs;i++){
2800                         
2801                         if (m->control_pressed) { break; }
2802             if (m->debug) { m->mothurOut("[DEBUG]: processing OTU " + toString(i) + ".\n"); }
2803             
2804                         listFile >> singleOTU;
2805                         
2806                         istringstream otuString(singleOTU);
2807             
2808                         while(otuString){
2809                                 
2810                                 string seqName = "";
2811                                 
2812                                 for(int j=0;j<singleOTU.length();j++){
2813                                         char letter = otuString.get();
2814                                         
2815                                         if(letter != ','){
2816                                                 seqName += letter;
2817                                         }
2818                                         else{
2819                                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
2820                                                 int index = nmIt->second;
2821                                                 
2822                                                 nameMap.erase(nmIt);
2823                                                 
2824                                                 otuData[index] = i;
2825                                                 nSeqsPerOTU[i]++;
2826                                                 aaP[i].push_back(index);
2827                                                 seqName = "";
2828                                         }
2829                                 }
2830                                 
2831                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
2832                 
2833                                 int index = nmIt->second;
2834                                 nameMap.erase(nmIt);
2835                 
2836                                 otuData[index] = i;
2837                                 nSeqsPerOTU[i]++;
2838                                 aaP[i].push_back(index);        
2839                                 
2840                                 otuString.get();
2841                         }
2842                         
2843                         sort(aaP[i].begin(), aaP[i].end());
2844                         for(int j=0;j<nSeqsPerOTU[i];j++){
2845                                 seqNumber.push_back(aaP[i][j]);
2846                         }
2847                         for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
2848                                 aaP[i].push_back(0);
2849                         }
2850                         
2851                         
2852                 }
2853                 
2854                 for(int i=1;i<numOTUs;i++){
2855                         cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
2856                 }
2857                 aaI = aaP;
2858                 seqIndex = seqNumber;
2859                 
2860                 listFile.close();       
2861         
2862         return numOTUs;
2863                 
2864         }
2865         catch(exception& e) {
2866                 m->errorOut(e, "ShhherCommand", "getOTUData");
2867                 exit(1);        
2868         }               
2869 }
2870 /**************************************************************************************************/
2871
2872 int ShhherCommand::calcCentroidsDriver(int numOTUs, 
2873                                           vector<int>& cumNumSeqs,
2874                                           vector<int>& nSeqsPerOTU,
2875                                           vector<int>& seqIndex,
2876                                           vector<short>& change,                //did the centroid sequence change? 0 = no; 1 = yes
2877                                           vector<int>& centroids,               //the representative flowgram for each cluster m
2878                                           vector<double>& singleTau,    //tMaster->adTau:       1-D Tau vector (1xnumSeqs)
2879                                           vector<int>& mapSeqToUnique,
2880                                           vector<short>& uniqueFlowgrams,
2881                                           vector<short>& flowDataIntI,
2882                                           vector<int>& lengths,
2883                                           int numFlowCells,
2884                                           vector<int>& seqNumber){                          
2885         
2886         //this function gets the most likely homopolymer length at a flow position for a group of sequences
2887         //within an otu
2888         
2889         try{
2890                 
2891                 for(int i=0;i<numOTUs;i++){
2892                         
2893                         if (m->control_pressed) { break; }
2894                         
2895                         double count = 0;
2896                         int position = 0;
2897                         int minFlowGram = 100000000;
2898                         double minFlowValue = 1e8;
2899                         change[i] = 0; //FALSE
2900                         
2901                         for(int j=0;j<nSeqsPerOTU[i];j++){
2902                                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
2903                         }
2904             
2905                         if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
2906                                 vector<double> adF(nSeqsPerOTU[i]);
2907                                 vector<int> anL(nSeqsPerOTU[i]);
2908                                 
2909                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2910                                         int index = cumNumSeqs[i] + j;
2911                                         int nI = seqIndex[index];
2912                                         int nIU = mapSeqToUnique[nI];
2913                                         
2914                                         int k;
2915                                         for(k=0;k<position;k++){
2916                                                 if(nIU == anL[k]){
2917                                                         break;
2918                                                 }
2919                                         }
2920                                         if(k == position){
2921                                                 anL[position] = nIU;
2922                                                 adF[position] = 0.0000;
2923                                                 position++;
2924                                         }                                               
2925                                 }
2926                                 
2927                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2928                                         int index = cumNumSeqs[i] + j;
2929                                         int nI = seqIndex[index];
2930                                         
2931                                         double tauValue = singleTau[seqNumber[index]];
2932                                         
2933                                         for(int k=0;k<position;k++){
2934                                                 double dist = getDistToCentroid(anL[k], nI, lengths[nI], uniqueFlowgrams, flowDataIntI, numFlowCells);
2935                                                 adF[k] += dist * tauValue;
2936                                         }
2937                                 }
2938                                 
2939                                 for(int j=0;j<position;j++){
2940                                         if(adF[j] < minFlowValue){
2941                                                 minFlowGram = j;
2942                                                 minFlowValue = adF[j];
2943                                         }
2944                                 }
2945                                 
2946                                 if(centroids[i] != anL[minFlowGram]){
2947                                         change[i] = 1;
2948                                         centroids[i] = anL[minFlowGram];
2949                                 }
2950                         }
2951                         else if(centroids[i] != -1){
2952                                 change[i] = 1;
2953                                 centroids[i] = -1;                      
2954                         }
2955                 }
2956         
2957         return 0;
2958         }
2959         catch(exception& e) {
2960                 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
2961                 exit(1);        
2962         }               
2963 }
2964 /**************************************************************************************************/
2965
2966 double ShhherCommand::getDistToCentroid(int cent, int flow, int length, vector<short>& uniqueFlowgrams,
2967                                         vector<short>& flowDataIntI, int numFlowCells){
2968         try{
2969                 
2970                 int flowAValue = cent * numFlowCells;
2971                 int flowBValue = flow * numFlowCells;
2972                 
2973                 double dist = 0;
2974         
2975                 for(int i=0;i<length;i++){
2976                         dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
2977                         flowAValue++;
2978                         flowBValue++;
2979                 }
2980                 
2981                 return dist / (double)length;
2982         }
2983         catch(exception& e) {
2984                 m->errorOut(e, "ShhherCommand", "getDistToCentroid");
2985                 exit(1);        
2986         }               
2987 }
2988 /**************************************************************************************************/
2989
2990 double ShhherCommand::getNewWeights(int numOTUs, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<double>& singleTau, vector<int>& seqNumber, vector<double>& weight){
2991         try{
2992                 
2993                 double maxChange = 0;
2994                 
2995                 for(int i=0;i<numOTUs;i++){
2996                         
2997                         if (m->control_pressed) { break; }
2998                         
2999                         double difference = weight[i];
3000                         weight[i] = 0;
3001                         
3002                         for(int j=0;j<nSeqsPerOTU[i];j++){
3003                                 int index = cumNumSeqs[i] + j;
3004                                 double tauValue = singleTau[seqNumber[index]];
3005                                 weight[i] += tauValue;
3006                         }
3007                         
3008                         difference = fabs(weight[i] - difference);
3009                         if(difference > maxChange){     maxChange = difference; }
3010                 }
3011                 return maxChange;
3012         }
3013         catch(exception& e) {
3014                 m->errorOut(e, "ShhherCommand", "getNewWeights");
3015                 exit(1);        
3016         }               
3017 }
3018
3019 /**************************************************************************************************/
3020
3021 double ShhherCommand::getLikelihood(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<int>& seqNumber, vector<int>& cumNumSeqs, vector<int>& seqIndex, vector<double>& dist, vector<double>& weight){
3022         
3023         try{
3024                 
3025                 vector<long double> P(numSeqs, 0);
3026                 int effNumOTUs = 0;
3027                 
3028                 for(int i=0;i<numOTUs;i++){
3029                         if(weight[i] > MIN_WEIGHT){
3030                                 effNumOTUs++;
3031                         }
3032                 }
3033                 
3034                 string hold;
3035                 for(int i=0;i<numOTUs;i++){
3036                         
3037                         if (m->control_pressed) { break; }
3038                         
3039                         for(int j=0;j<nSeqsPerOTU[i];j++){
3040                                 int index = cumNumSeqs[i] + j;
3041                                 int nI = seqIndex[index];
3042                                 double singleDist = dist[seqNumber[index]];
3043                                 
3044                                 P[nI] += weight[i] * exp(-singleDist * sigma);
3045                         }
3046                 }
3047                 double nLL = 0.00;
3048                 for(int i=0;i<numSeqs;i++){
3049                         if(P[i] == 0){  P[i] = DBL_EPSILON;     }
3050             
3051                         nLL += -log(P[i]);
3052                 }
3053                 
3054                 nLL = nLL -(double)numSeqs * log(sigma);
3055         
3056                 return nLL; 
3057         }
3058         catch(exception& e) {
3059                 m->errorOut(e, "ShhherCommand", "getNewWeights");
3060                 exit(1);        
3061         }               
3062 }
3063
3064 /**************************************************************************************************/
3065
3066 int ShhherCommand::checkCentroids(int numOTUs, vector<int>& centroids, vector<double>& weight){
3067         try{
3068                 vector<int> unique(numOTUs, 1);
3069                 
3070                 for(int i=0;i<numOTUs;i++){
3071                         if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
3072                                 unique[i] = -1;
3073                         }
3074                 }
3075                 
3076                 for(int i=0;i<numOTUs;i++){
3077                         
3078                         if (m->control_pressed) { break; }
3079                         
3080                         if(unique[i] == 1){
3081                                 for(int j=i+1;j<numOTUs;j++){
3082                                         if(unique[j] == 1){
3083                                                 
3084                                                 if(centroids[j] == centroids[i]){
3085                                                         unique[j] = 0;
3086                                                         centroids[j] = -1;
3087                                                         
3088                                                         weight[i] += weight[j];
3089                                                         weight[j] = 0.0;
3090                                                 }
3091                                         }
3092                                 }
3093                         }
3094                 }
3095         
3096         return 0;
3097         }
3098         catch(exception& e) {
3099                 m->errorOut(e, "ShhherCommand", "checkCentroids");
3100                 exit(1);        
3101         }               
3102 }
3103 /**************************************************************************************************/
3104
3105 void ShhherCommand::calcNewDistances(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<double>& dist, 
3106                                      vector<double>& weight, vector<short>& change, vector<int>& centroids,
3107                                      vector<vector<int> >& aaP, vector<double>& singleTau, vector<vector<int> >& aaI,   
3108                                      vector<int>& seqNumber, vector<int>& seqIndex,
3109                                      vector<short>& uniqueFlowgrams,
3110                                      vector<short>& flowDataIntI, int numFlowCells, vector<int>& lengths){
3111         
3112         try{
3113                 
3114                 int total = 0;
3115                 vector<double> newTau(numOTUs,0);
3116                 vector<double> norms(numSeqs, 0);
3117                 nSeqsPerOTU.assign(numOTUs, 0);
3118         
3119                 for(int i=0;i<numSeqs;i++){
3120                         
3121                         if (m->control_pressed) { break; }
3122                         
3123                         int indexOffset = i * numOTUs;
3124             
3125                         double offset = 1e8;
3126                         
3127                         for(int j=0;j<numOTUs;j++){
3128                 
3129                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
3130                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i], uniqueFlowgrams, flowDataIntI, numFlowCells);
3131                                 }
3132                 
3133                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
3134                                         offset = dist[indexOffset + j];
3135                                 }
3136                         }
3137             
3138                         for(int j=0;j<numOTUs;j++){
3139                                 if(weight[j] > MIN_WEIGHT){
3140                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
3141                                         norms[i] += newTau[j];
3142                                 }
3143                                 else{
3144                                         newTau[j] = 0.0;
3145                                 }
3146                         }
3147             
3148                         for(int j=0;j<numOTUs;j++){
3149                                 newTau[j] /= norms[i];
3150                         }
3151             
3152                         for(int j=0;j<numOTUs;j++){
3153                                 if(newTau[j] > MIN_TAU){
3154                                         
3155                                         int oldTotal = total;
3156                                         
3157                                         total++;
3158                                         
3159                                         singleTau.resize(total, 0);
3160                                         seqNumber.resize(total, 0);
3161                                         seqIndex.resize(total, 0);
3162                                         
3163                                         singleTau[oldTotal] = newTau[j];
3164                                         
3165                                         aaP[j][nSeqsPerOTU[j]] = oldTotal;
3166                                         aaI[j][nSeqsPerOTU[j]] = i;
3167                                         nSeqsPerOTU[j]++;
3168                                 }
3169                         }
3170             
3171                 }
3172         
3173         }
3174         catch(exception& e) {
3175                 m->errorOut(e, "ShhherCommand", "calcNewDistances");
3176                 exit(1);        
3177         }               
3178 }
3179 /**************************************************************************************************/
3180
3181 int ShhherCommand::fill(int numOTUs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
3182         try {
3183                 int index = 0;
3184                 for(int i=0;i<numOTUs;i++){
3185                         
3186                         if (m->control_pressed) { return 0; }
3187                         
3188                         cumNumSeqs[i] = index;
3189                         for(int j=0;j<nSeqsPerOTU[i];j++){
3190                                 seqNumber[index] = aaP[i][j];
3191                                 seqIndex[index] = aaI[i][j];
3192                                 
3193                                 index++;
3194                         }
3195                 }
3196         
3197         return 0; 
3198         }
3199         catch(exception& e) {
3200                 m->errorOut(e, "ShhherCommand", "fill");
3201                 exit(1);        
3202         }               
3203 }
3204 /**************************************************************************************************/
3205
3206 void ShhherCommand::setOTUs(int numOTUs, int numSeqs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU,
3207                             vector<int>& otuData, vector<double>& singleTau, vector<double>& dist, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
3208         
3209         try {
3210                 vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
3211                 
3212                 for(int i=0;i<numOTUs;i++){
3213                         
3214                         if (m->control_pressed) { break; }
3215                         
3216                         for(int j=0;j<nSeqsPerOTU[i];j++){
3217                                 int index = cumNumSeqs[i] + j;
3218                                 double tauValue = singleTau[seqNumber[index]];
3219                                 int sIndex = seqIndex[index];
3220                                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
3221                         }
3222                 }
3223                 
3224                 for(int i=0;i<numSeqs;i++){
3225                         double maxTau = -1.0000;
3226                         int maxOTU = -1;
3227                         for(int j=0;j<numOTUs;j++){
3228                                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
3229                                         maxTau = bigTauMatrix[i * numOTUs + j];
3230                                         maxOTU = j;
3231                                 }
3232                         }
3233                         
3234                         otuData[i] = maxOTU;
3235                 }
3236                 
3237                 nSeqsPerOTU.assign(numOTUs, 0);         
3238                 
3239                 for(int i=0;i<numSeqs;i++){
3240                         int index = otuData[i];
3241                         
3242                         singleTau[i] = 1.0000;
3243                         dist[i] = 0.0000;
3244                         
3245                         aaP[index][nSeqsPerOTU[index]] = i;
3246                         aaI[index][nSeqsPerOTU[index]] = i;
3247                         
3248                         nSeqsPerOTU[index]++;
3249                 }
3250         
3251                 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);  
3252         }
3253         catch(exception& e) {
3254                 m->errorOut(e, "ShhherCommand", "setOTUs");
3255                 exit(1);        
3256         }               
3257 }
3258 /**************************************************************************************************/
3259
3260 void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string qualityFileName, vector<int> otuCounts, vector<int>& nSeqsPerOTU, vector<int>& seqNumber,
3261                                    vector<double>& singleTau, vector<short>& flowDataIntI, vector<short>& uniqueFlowgrams, vector<int>& cumNumSeqs,
3262                                    vector<int>& mapUniqueToSeq, vector<string>& seqNameVector, vector<int>& centroids, vector<vector<int> >& aaI){
3263         
3264         try {
3265         
3266                 ofstream qualityFile;
3267                 m->openOutputFile(qualityFileName, qualityFile);
3268         
3269                 qualityFile.setf(ios::fixed, ios::floatfield);
3270                 qualityFile.setf(ios::showpoint);
3271                 qualityFile << setprecision(6);
3272                 
3273                 vector<vector<int> > qualities(numOTUs);
3274                 vector<double> pr(HOMOPS, 0);
3275                 
3276                 
3277                 for(int i=0;i<numOTUs;i++){
3278                         
3279                         if (m->control_pressed) { break; }
3280                         
3281                         int index = 0;
3282                         int base = 0;
3283                         
3284                         if(nSeqsPerOTU[i] > 0){
3285                                 qualities[i].assign(1024, -1);
3286                                 
3287                                 while(index < numFlowCells){
3288                                         double maxPrValue = 1e8;
3289                                         short maxPrIndex = -1;
3290                                         double count = 0.0000;
3291                                         
3292                                         pr.assign(HOMOPS, 0);
3293                                         
3294                                         for(int j=0;j<nSeqsPerOTU[i];j++){
3295                                                 int lIndex = cumNumSeqs[i] + j;
3296                                                 double tauValue = singleTau[seqNumber[lIndex]];
3297                                                 int sequenceIndex = aaI[i][j];
3298                                                 short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
3299                                                 
3300                                                 count += tauValue;
3301                                                 
3302                                                 for(int s=0;s<HOMOPS;s++){
3303                                                         pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
3304                                                 }
3305                                         }
3306                                         
3307                                         maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
3308                                         maxPrValue = pr[maxPrIndex];
3309                                         
3310                                         if(count > MIN_COUNT){
3311                                                 double U = 0.0000;
3312                                                 double norm = 0.0000;
3313                                                 
3314                                                 for(int s=0;s<HOMOPS;s++){
3315                                                         norm += exp(-(pr[s] - maxPrValue));
3316                                                 }
3317                                                 
3318                                                 for(int s=1;s<=maxPrIndex;s++){
3319                                                         int value = 0;
3320                                                         double temp = 0.0000;
3321                                                         
3322                                                         U += exp(-(pr[s-1]-maxPrValue))/norm;
3323                                                         
3324                                                         if(U>0.00){
3325                                                                 temp = log10(U);
3326                                                         }
3327                                                         else{
3328                                                                 temp = -10.1;
3329                                                         }
3330                                                         temp = floor(-10 * temp);
3331                                                         value = (int)floor(temp);
3332                                                         if(value > 100){        value = 100;    }
3333                                                         
3334                                                         qualities[i][base] = (int)value;
3335                                                         base++;
3336                                                 }
3337                                         }
3338                                         
3339                                         index++;
3340                                 }
3341                         }
3342                         
3343                         
3344                         if(otuCounts[i] > 0){
3345                                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
3346                         
3347                                 int j=4;        //need to get past the first four bases
3348                                 while(qualities[i][j] != -1){
3349                     qualityFile << qualities[i][j] << ' ';
3350                     if (j > qualities[i].size()) { break; }
3351                     j++;
3352                                 }
3353                                 qualityFile << endl;
3354                         }
3355                 }
3356                 qualityFile.close();
3357                 outputNames.push_back(qualityFileName); outputTypes["qfile"].push_back(qualityFileName);
3358         
3359         }
3360         catch(exception& e) {
3361                 m->errorOut(e, "ShhherCommand", "writeQualities");
3362                 exit(1);        
3363         }               
3364 }
3365
3366 /**************************************************************************************************/
3367
3368 void ShhherCommand::writeSequences(string thisCompositeFASTAFileName, int numOTUs, int numFlowCells, string fastaFileName, vector<int> otuCounts, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& centroids){
3369         try {
3370                 
3371                 ofstream fastaFile;
3372                 m->openOutputFile(fastaFileName, fastaFile);
3373                 
3374                 vector<string> names(numOTUs, "");
3375                 
3376                 for(int i=0;i<numOTUs;i++){
3377                         
3378                         if (m->control_pressed) { break; }
3379                         
3380                         int index = centroids[i];
3381                         
3382                         if(otuCounts[i] > 0){
3383                                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
3384                                 
3385                                 string newSeq = "";
3386                                 
3387                                 for(int j=0;j<numFlowCells;j++){
3388                                         
3389                                         char base = flowOrder[j % flowOrder.length()];
3390                                         for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
3391                                                 newSeq += base;
3392                                         }
3393                                 }
3394                                 
3395                                 if (newSeq.length() >= 4) {  fastaFile << newSeq.substr(4) << endl;  }
3396                 else {  fastaFile << "NNNN" << endl;  }
3397                         }
3398                 }
3399                 fastaFile.close();
3400         
3401                 outputNames.push_back(fastaFileName); outputTypes["fasta"].push_back(fastaFileName);
3402         
3403                 if(thisCompositeFASTAFileName != ""){
3404                         m->appendFiles(fastaFileName, thisCompositeFASTAFileName);
3405                 }
3406         }
3407         catch(exception& e) {
3408                 m->errorOut(e, "ShhherCommand", "writeSequences");
3409                 exit(1);        
3410         }               
3411 }
3412
3413 /**************************************************************************************************/
3414
3415 void ShhherCommand::writeNames(string thisCompositeNamesFileName, int numOTUs, string nameFileName, vector<int> otuCounts, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU){
3416         try {
3417                 
3418                 ofstream nameFile;
3419                 m->openOutputFile(nameFileName, nameFile);
3420                 
3421                 for(int i=0;i<numOTUs;i++){
3422                         
3423                         if (m->control_pressed) { break; }
3424                         
3425                         if(otuCounts[i] > 0){
3426                                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
3427                                 
3428                                 for(int j=1;j<nSeqsPerOTU[i];j++){
3429                                         nameFile << ',' << seqNameVector[aaI[i][j]];
3430                                 }
3431                                 
3432                                 nameFile << endl;
3433                         }
3434                 }
3435                 nameFile.close();
3436                 outputNames.push_back(nameFileName); outputTypes["name"].push_back(nameFileName);
3437                 
3438                 
3439                 if(thisCompositeNamesFileName != ""){
3440                         m->appendFiles(nameFileName, thisCompositeNamesFileName);
3441                 }               
3442         }
3443         catch(exception& e) {
3444                 m->errorOut(e, "ShhherCommand", "writeNames");
3445                 exit(1);        
3446         }               
3447 }
3448
3449 /**************************************************************************************************/
3450
3451 void ShhherCommand::writeGroups(string groupFileName, string fileRoot, int numSeqs, vector<string>& seqNameVector){
3452         try {
3453         ofstream groupFile;
3454                 m->openOutputFile(groupFileName, groupFile);
3455                 
3456                 for(int i=0;i<numSeqs;i++){
3457                         if (m->control_pressed) { break; }
3458                         groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
3459                 }
3460                 groupFile.close();
3461                 outputNames.push_back(groupFileName); outputTypes["group"].push_back(groupFileName);
3462         
3463         }
3464         catch(exception& e) {
3465                 m->errorOut(e, "ShhherCommand", "writeGroups");
3466                 exit(1);        
3467         }               
3468 }
3469
3470 /**************************************************************************************************/
3471
3472 void ShhherCommand::writeClusters(string otuCountsFileName, int numOTUs, int numFlowCells, vector<int> otuCounts, vector<int>& centroids, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU, vector<int>& lengths, vector<short>& flowDataIntI){
3473         try {
3474                 ofstream otuCountsFile;
3475                 m->openOutputFile(otuCountsFileName, otuCountsFile);
3476                 
3477                 string bases = flowOrder;
3478                 
3479                 for(int i=0;i<numOTUs;i++){
3480                         
3481                         if (m->control_pressed) {
3482                                 break;
3483                         }
3484                         //output the translated version of the centroid sequence for the otu
3485                         if(otuCounts[i] > 0){
3486                                 int index = centroids[i];
3487                                 
3488                                 otuCountsFile << "ideal\t";
3489                                 for(int j=8;j<numFlowCells;j++){
3490                                         char base = bases[j % bases.length()];
3491                                         for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
3492                                                 otuCountsFile << base;
3493                                         }
3494                                 }
3495                                 otuCountsFile << endl;
3496                                 
3497                                 for(int j=0;j<nSeqsPerOTU[i];j++){
3498                                         int sequence = aaI[i][j];
3499                                         otuCountsFile << seqNameVector[sequence] << '\t';
3500                                         
3501                                         string newSeq = "";
3502                                         
3503                                         for(int k=0;k<lengths[sequence];k++){
3504                                                 char base = bases[k % bases.length()];
3505                                                 int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
3506                         
3507                                                 for(int s=0;s<freq;s++){
3508                                                         newSeq += base;
3509                                                         //otuCountsFile << base;
3510                                                 }
3511                                         }
3512                                         
3513                     if (newSeq.length() >= 4) {  otuCountsFile << newSeq.substr(4) << endl;  }
3514                     else {  otuCountsFile << "NNNN" << endl;  }
3515                                 }
3516                                 otuCountsFile << endl;
3517                         }
3518                 }
3519                 otuCountsFile.close();
3520                 outputNames.push_back(otuCountsFileName); outputTypes["counts"].push_back(otuCountsFileName);
3521         
3522         }
3523         catch(exception& e) {
3524                 m->errorOut(e, "ShhherCommand", "writeClusters");
3525                 exit(1);        
3526         }               
3527 }
3528
3529 /**************************************************************************************************/
3530
3531 void ShhherCommand::getSingleLookUp(){
3532         try{
3533                 //      these are the -log probabilities that a signal corresponds to a particular homopolymer length
3534                 singleLookUp.assign(HOMOPS * NUMBINS, 0);
3535                 
3536                 int index = 0;
3537                 ifstream lookUpFile;
3538                 m->openInputFile(lookupFileName, lookUpFile);
3539                 
3540                 for(int i=0;i<HOMOPS;i++){
3541                         
3542                         if (m->control_pressed) { break; }
3543                         
3544                         float logFracFreq;
3545                         lookUpFile >> logFracFreq;
3546                         
3547                         for(int j=0;j<NUMBINS;j++)      {
3548                                 lookUpFile >> singleLookUp[index];
3549                                 index++;
3550                         }
3551                 }       
3552                 lookUpFile.close();
3553         }
3554         catch(exception& e) {
3555                 m->errorOut(e, "ShhherCommand", "getSingleLookUp");
3556                 exit(1);
3557         }
3558 }
3559
3560 /**************************************************************************************************/
3561
3562 void ShhherCommand::getJointLookUp(){
3563         try{
3564                 
3565                 //      the most likely joint probability (-log) that two intenities have the same polymer length
3566                 jointLookUp.resize(NUMBINS * NUMBINS, 0);
3567                 
3568                 for(int i=0;i<NUMBINS;i++){
3569                         
3570                         if (m->control_pressed) { break; }
3571                         
3572                         for(int j=0;j<NUMBINS;j++){             
3573                                 
3574                                 double minSum = 100000000;
3575                                 
3576                                 for(int k=0;k<HOMOPS;k++){
3577                                         double sum = singleLookUp[k * NUMBINS + i] + singleLookUp[k * NUMBINS + j];
3578                                         
3579                                         if(sum < minSum)        {       minSum = sum;           }
3580                                 }       
3581                                 jointLookUp[i * NUMBINS + j] = minSum;
3582                         }
3583                 }
3584         }
3585         catch(exception& e) {
3586                 m->errorOut(e, "ShhherCommand", "getJointLookUp");
3587                 exit(1);
3588         }
3589 }
3590
3591 /**************************************************************************************************/
3592
3593 double ShhherCommand::getProbIntensity(int intIntensity){                          
3594         try{
3595
3596                 double minNegLogProb = 100000000; 
3597
3598                 
3599                 for(int i=0;i<HOMOPS;i++){//loop signal strength
3600                         
3601                         if (m->control_pressed) { break; }
3602                         
3603                         float negLogProb = singleLookUp[i * NUMBINS + intIntensity];
3604                         if(negLogProb < minNegLogProb)  {       minNegLogProb = negLogProb; }
3605                 }
3606                 
3607                 return minNegLogProb;
3608         }
3609         catch(exception& e) {
3610                 m->errorOut(e, "ShhherCommand", "getProbIntensity");
3611                 exit(1);
3612         }
3613 }
3614
3615
3616
3617