]> git.donarmstrong.com Git - mothur.git/blob - rarefactsharedcommand.cpp
added files for regularized random forest
[mothur.git] / rarefactsharedcommand.cpp
1 /*
2  *  rarefactsharedcommand.cpp
3  *  Dotur
4  *
5  *  Created by Sarah Westcott on 1/6/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "rarefactsharedcommand.h"
11 #include "sharedsobs.h"
12 #include "sharednseqs.h"
13 #include "sharedutilities.h"
14 #include "subsample.h"
15
16 //**********************************************************************************************************************
17 vector<string> RareFactSharedCommand::setParameters(){  
18         try {
19                 CommandParameter pshared("shared", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pshared);
20         CommandParameter pdesign("design", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pdesign);
21                 CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
22                 CommandParameter pfreq("freq", "Number", "", "100", "", "", "",false,false); parameters.push_back(pfreq);
23                 CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters);
24                 CommandParameter pcalc("calc", "Multiple", "sharednseqs-sharedobserved", "sharedobserved", "", "", "",true,false); parameters.push_back(pcalc);
25         CommandParameter psubsampleiters("subsampleiters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(psubsampleiters);
26         CommandParameter psubsample("subsample", "String", "", "", "", "", "",false,false); parameters.push_back(psubsample);
27                 CommandParameter pjumble("jumble", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pjumble);
28                 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
29         CommandParameter psets("sets", "String", "", "", "", "", "",false,false); parameters.push_back(psets);
30                 CommandParameter pgroupmode("groupmode", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pgroupmode);
31         CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
32                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
33                 
34                 vector<string> myArray;
35                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
36                 return myArray;
37         }
38         catch(exception& e) {
39                 m->errorOut(e, "RareFactSharedCommand", "setParameters");
40                 exit(1);
41         }
42 }
43 //**********************************************************************************************************************
44 string RareFactSharedCommand::getHelpString(){  
45         try {
46                 string helpString = "";
47                 ValidCalculators validCalculator;
48                 helpString += "The rarefaction.shared command parameters are shared, design, label, iters, groups, sets, jumble, groupmode and calc.  shared is required if there is no current sharedfile. \n";
49         helpString += "The design parameter allows you to assign your groups to sets. If provided mothur will run rarefaction.shared on a per set basis. \n";
50         helpString += "The sets parameter allows you to specify which of the sets in your designfile you would like to analyze. The set names are separated by dashes. THe default is all sets in the designfile.\n";
51                 helpString += "The rarefaction command should be in the following format: \n";
52                 helpString += "rarefaction.shared(label=yourLabel, iters=yourIters, calc=yourEstimators, jumble=yourJumble, groups=yourGroups).\n";
53                 helpString += "The freq parameter is used indicate when to output your data, by default it is set to 100. But you can set it to a percentage of the number of sequence. For example freq=0.10, means 10%. \n";
54                 helpString += "Example rarefaction.shared(label=unique-0.01-0.03,  iters=10000, groups=B-C, jumble=T, calc=sharedobserved).\n";
55                 helpString += "The default values for iters is 1000, freq is 100, and calc is sharedobserved which calculates the shared rarefaction curve for the observed richness.\n";
56         helpString += "The subsampleiters parameter allows you to choose the number of times you would like to run the subsample.\n";
57         helpString += "The subsample parameter allows you to enter the size pergroup of the sample or you can set subsample=T and mothur will use the size of your smallest group.\n";
58                 helpString += "The default value for groups is all the groups in your groupfile, and jumble is true.\n";
59                 helpString += validCalculator.printCalc("sharedrarefaction");
60                 helpString += "The label parameter is used to analyze specific labels in your input.\n";
61                 helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed.  You must enter at least 2 valid groups.\n";
62                 helpString += "Note: No spaces between parameter labels (i.e. freq), '=' and parameters (i.e.yourFreq).\n";
63                 return helpString;
64         }
65         catch(exception& e) {
66                 m->errorOut(e, "RareFactSharedCommand", "getHelpString");
67                 exit(1);
68         }
69 }
70 //**********************************************************************************************************************
71 string RareFactSharedCommand::getOutputFileNameTag(string type, string inputName=""){   
72         try {
73         string outputFileName = "";
74                 map<string, vector<string> >::iterator it;
75         
76         //is this a type this command creates
77         it = outputTypes.find(type);
78         if (it == outputTypes.end()) {  m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
79         else {
80             if (type == "sharedrarefaction") {  outputFileName =  "shared.rarefaction"; }
81             else if (type == "sharedr_nseqs") {  outputFileName =  "shared.r_nseqs"; }
82             else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
83         }
84         return outputFileName;
85         }
86         catch(exception& e) {
87                 m->errorOut(e, "RareFactSharedCommand", "getOutputFileNameTag");
88                 exit(1);
89         }
90 }
91 //**********************************************************************************************************************
92 RareFactSharedCommand::RareFactSharedCommand(){ 
93         try {
94                 abort = true; calledHelp = true; 
95                 setParameters();
96                 vector<string> tempOutNames;
97                 outputTypes["sharedrarefaction"] = tempOutNames;
98                 outputTypes["sharedr_nseqs"] = tempOutNames;
99         }
100         catch(exception& e) {
101                 m->errorOut(e, "RareFactSharedCommand", "RareFactSharedCommand");
102                 exit(1);
103         }
104 }
105 //**********************************************************************************************************************
106
107 RareFactSharedCommand::RareFactSharedCommand(string option)  {
108         try {
109                 abort = false; calledHelp = false;   
110                 allLines = 1;
111                                 
112                 //allow user to run help
113                 if(option == "help") {  help(); abort = true; calledHelp = true; }
114                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
115                 
116                 else {
117                         vector<string> myArray = setParameters();
118                         
119                         OptionParser parser(option);
120                         map<string,string> parameters = parser.getParameters();
121                         map<string,string>::iterator it;
122                         
123                         ValidParameters validParameter;
124                         
125                         //check to make sure all parameters are valid for command
126                         for (it = parameters.begin(); it != parameters.end(); it++) { 
127                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
128                         }
129                         
130                         //initialize outputTypes
131                         vector<string> tempOutNames;
132                         outputTypes["sharedrarefaction"] = tempOutNames;
133                         outputTypes["sharedr_nseqs"] = tempOutNames;
134                         
135                         //if the user changes the input directory command factory will send this info to us in the output parameter 
136                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
137                         if (inputDir == "not found"){   inputDir = "";          }
138                         else {
139                                 string path;
140                                 it = parameters.find("shared");
141                                 //user has given a template file
142                                 if(it != parameters.end()){ 
143                                         path = m->hasPath(it->second);
144                                         //if the user has not given a path then, add inputdir. else leave path alone.
145                                         if (path == "") {       parameters["shared"] = inputDir + it->second;           }
146                                 }
147                 
148                 it = parameters.find("design");
149                                 //user has given a template file
150                                 if(it != parameters.end()){ 
151                                         path = m->hasPath(it->second);
152                                         //if the user has not given a path then, add inputdir. else leave path alone.
153                                         if (path == "") {       parameters["design"] = inputDir + it->second;           }
154                                 }
155
156                         }
157                         
158                         //get shared file
159                         sharedfile = validParameter.validFile(parameters, "shared", true);
160                         if (sharedfile == "not open") { sharedfile = ""; abort = true; }        
161                         else if (sharedfile == "not found") { 
162                                 //if there is a current shared file, use it
163                                 sharedfile = m->getSharedFile(); 
164                                 if (sharedfile != "") { m->mothurOut("Using " + sharedfile + " as input file for the shared parameter."); m->mothurOutEndLine(); }
165                                 else {  m->mothurOut("You have no current sharedfile and the shared parameter is required."); m->mothurOutEndLine(); abort = true; }
166                         }else { m->setSharedFile(sharedfile); }
167             
168             designfile = validParameter.validFile(parameters, "design", true);
169                         if (designfile == "not open") { abort = true; designfile = ""; }
170                         else if (designfile == "not found") {   designfile = "";        }
171                         else { m->setDesignFile(designfile); }
172                         
173                         
174                         //if the user changes the output directory command factory will send this info to us in the output parameter 
175                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = m->hasPath(sharedfile);             }
176                         
177                         
178                         //check for optional parameter and set defaults
179                         // ...at some point should added some additional type checking...
180                         label = validParameter.validFile(parameters, "label", false);                   
181                         if (label == "not found") { label = ""; }
182                         else { 
183                                 if(label != "all") {  m->splitAtDash(label, labels);  allLines = 0;  }
184                                 else { allLines = 1;  }
185                         }
186                         
187                                 
188                         calc = validParameter.validFile(parameters, "calc", false);                     
189                         if (calc == "not found") { calc = "sharedobserved";  }
190                         else { 
191                                  if (calc == "default")  {  calc = "sharedobserved";  }
192                         }
193                         m->splitAtDash(calc, Estimators);
194                         if (m->inUsersGroups("citation", Estimators)) { 
195                                 ValidCalculators validCalc; validCalc.printCitations(Estimators); 
196                                 //remove citation from list of calcs
197                                 for (int i = 0; i < Estimators.size(); i++) { if (Estimators[i] == "citation") {  Estimators.erase(Estimators.begin()+i); break; } }
198                         }
199                         
200                         groups = validParameter.validFile(parameters, "groups", false);                 
201                         if (groups == "not found") { groups = ""; }
202                         else { 
203                                 m->splitAtDash(groups, Groups);
204                         }
205                         m->setGroups(Groups);
206             
207             string sets = validParameter.validFile(parameters, "sets", false);                  
208                         if (sets == "not found") { sets = ""; }
209                         else { 
210                                 m->splitAtDash(sets, Sets);
211                         }
212                         
213                         string temp;
214                         temp = validParameter.validFile(parameters, "freq", false);                     if (temp == "not found") { temp = "100"; }
215                         m->mothurConvert(temp, freq); 
216                         
217                         temp = validParameter.validFile(parameters, "iters", false);                    if (temp == "not found") { temp = "1000"; }
218                         m->mothurConvert(temp, nIters); 
219                         
220                         temp = validParameter.validFile(parameters, "jumble", false);                   if (temp == "not found") { temp = "T"; }
221                         if (m->isTrue(temp)) { jumble = true; }
222                         else { jumble = false; }
223                         m->jumble = jumble;
224             
225             temp = validParameter.validFile(parameters, "groupmode", false);            if (temp == "not found") { temp = "T"; }
226                         groupMode = m->isTrue(temp);
227             
228             temp = validParameter.validFile(parameters, "subsampleiters", false);                       if (temp == "not found") { temp = "1000"; }
229                         m->mothurConvert(temp, iters); 
230             
231             temp = validParameter.validFile(parameters, "subsample", false);            if (temp == "not found") { temp = "F"; }
232                         if (m->isNumeric1(temp)) { m->mothurConvert(temp, subsampleSize); subsample = true; }
233             else {  
234                 if (m->isTrue(temp)) { subsample = true; subsampleSize = -1; }  //we will set it to smallest group later 
235                 else { subsample = false; }
236             }
237             
238             if (subsample == false) { iters = 1; }
239                                 
240                 }
241
242         }
243         catch(exception& e) {
244                 m->errorOut(e, "RareFactSharedCommand", "RareFactSharedCommand");
245                 exit(1);
246         }
247 }
248 //**********************************************************************************************************************
249
250 int RareFactSharedCommand::execute(){
251         try {
252         
253                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
254         
255         GroupMap designMap;
256         if (designfile == "") { //fake out designMap to run with process
257             process(designMap, "");
258         }else {
259             designMap.readDesignMap(designfile);
260             
261             //fill Sets - checks for "all" and for any typo groups
262                         SharedUtil util;
263                         vector<string> nameSets = designMap.getNamesOfGroups();
264                         util.setGroups(Sets, nameSets);
265             
266             for (int i = 0; i < Sets.size(); i++) {
267                 process(designMap, Sets[i]);
268             }
269             
270             if (groupMode) { outputNames = createGroupFile(outputNames); }
271         }
272                     
273                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]); } return 0; }
274                 
275                 m->mothurOutEndLine();
276                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
277                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
278                 m->mothurOutEndLine();
279
280                 return 0;
281         }
282         catch(exception& e) {
283                 m->errorOut(e, "RareFactSharedCommand", "execute");
284                 exit(1);
285         }
286 }
287 //**********************************************************************************************************************
288
289 int RareFactSharedCommand::process(GroupMap& designMap, string thisSet){
290         try {
291         Rarefact* rCurve;
292         vector<Display*> rDisplays;
293         
294         InputData input(sharedfile, "sharedfile");
295                 lookup = input.getSharedRAbundVectors();
296         if (lookup.size() < 2) { 
297                         m->mothurOut("I cannot run the command without at least 2 valid groups."); 
298             for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
299             return 0;
300         }
301         
302         string fileNameRoot = outputDir + m->getRootName(m->getSimpleName(sharedfile));
303         
304         vector<string> newGroups = m->getGroups();
305         if (thisSet != "") {  //make groups only filled with groups from this set so that's all inputdata will read
306             vector<string> thisSets; thisSets.push_back(thisSet);
307             newGroups = designMap.getNamesSeqs(thisSets);
308             fileNameRoot += thisSet + ".";
309         }
310         
311         vector<SharedRAbundVector*> subset;
312         if (thisSet == "") {  subset.clear(); subset = lookup;  }
313         else {//fill subset with this sets groups
314             subset.clear();
315             for (int i = 0; i < lookup.size(); i++) {
316                 if (m->inUsersGroups(lookup[i]->getGroup(), newGroups)) {
317                     subset.push_back(lookup[i]);
318                 }
319             }
320         }
321
322         /******************************************************/
323         if (subsample) { 
324             if (subsampleSize == -1) { //user has not set size, set size = smallest samples size
325                 subsampleSize = subset[0]->getNumSeqs();
326                 for (int i = 1; i < subset.size(); i++) {
327                     int thisSize = subset[i]->getNumSeqs();
328                     
329                     if (thisSize < subsampleSize) {     subsampleSize = thisSize;       }
330                 }
331             }else {
332                 newGroups.clear();
333                 vector<SharedRAbundVector*> temp;
334                 for (int i = 0; i < subset.size(); i++) {
335                     if (subset[i]->getNumSeqs() < subsampleSize) { 
336                         m->mothurOut(subset[i]->getGroup() + " contains " + toString(subset[i]->getNumSeqs()) + ". Eliminating."); m->mothurOutEndLine();
337                         delete subset[i];
338                     }else { 
339                         newGroups.push_back(subset[i]->getGroup()); 
340                         temp.push_back(subset[i]);
341                     }
342                 } 
343                 subset = temp;
344             }
345             
346             if (subset.size() < 2) { m->mothurOut("You have not provided enough valid groups.  I cannot run the command."); m->mothurOutEndLine(); m->control_pressed = true; return 0; }
347         }
348         /******************************************************/
349                 
350                 ValidCalculators validCalculator;
351                 for (int i=0; i<Estimators.size(); i++) {
352                         if (validCalculator.isValidCalculator("sharedrarefaction", Estimators[i]) == true) { 
353                                 if (Estimators[i] == "sharedobserved") { 
354                                         rDisplays.push_back(new RareDisplay(new SharedSobs(), new SharedThreeColumnFile(fileNameRoot+getOutputFileNameTag("sharedrarefaction"), "")));
355                                         outputNames.push_back(fileNameRoot+getOutputFileNameTag("sharedrarefaction")); outputTypes["sharedrarefaction"].push_back(fileNameRoot+getOutputFileNameTag("sharedrarefaction"));
356                                 }else if (Estimators[i] == "sharednseqs") { 
357                                         rDisplays.push_back(new RareDisplay(new SharedNSeqs(), new SharedThreeColumnFile(fileNameRoot+getOutputFileNameTag("sharedr_nseqs"), "")));
358                                         outputNames.push_back(fileNameRoot+getOutputFileNameTag("sharedr_nseqs")); outputTypes["sharedr_nseqs"].push_back(fileNameRoot+getOutputFileNameTag("sharedr_nseqs"));
359                                 }
360                         }
361             file2Group[outputNames.size()-1] = thisSet;
362                 }
363                 
364                 //if the users entered no valid calculators don't execute command
365                 if (rDisplays.size() == 0) { for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }  return 0; }
366                 
367                 if (m->control_pressed) { 
368                         for(int i=0;i<rDisplays.size();i++){    delete rDisplays[i];    }
369                         for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]);        }
370                         for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  } 
371                         return 0;
372                 }
373         
374         
375                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
376         string lastLabel = subset[0]->getLabel();
377                 set<string> processedLabels;
378                 set<string> userLabels = labels;
379         
380                 //as long as you are not at the end of the file or done wih the lines you want
381                 while((subset[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
382                         if (m->control_pressed) { 
383                                 for(int i=0;i<rDisplays.size();i++){    delete rDisplays[i];    }
384                                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]);        }
385                                 for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  } 
386                                 return 0;
387                         }
388                         
389                         if(allLines == 1 || labels.count(subset[0]->getLabel()) == 1){
390                                 m->mothurOut(subset[0]->getLabel() + '\t' + thisSet); m->mothurOutEndLine();
391                                 rCurve = new Rarefact(subset, rDisplays);
392                                 rCurve->getSharedCurve(freq, nIters);
393                                 delete rCurve;
394                 
395                 if (subsample) { subsampleLookup(subset, fileNameRoot);  }
396                     
397                                 processedLabels.insert(subset[0]->getLabel());
398                                 userLabels.erase(subset[0]->getLabel());
399                         }
400                         
401                         if ((m->anyLabelsToProcess(subset[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
402                 string saveLabel = subset[0]->getLabel();
403                 
404                 for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  } 
405                 lookup = input.getSharedRAbundVectors(lastLabel);
406                 
407                 if (thisSet == "") {  subset.clear(); subset = lookup;  }
408                 else {//fill subset with this sets groups
409                     subset.clear();
410                     for (int i = 0; i < lookup.size(); i++) {
411                         if (m->inUsersGroups(lookup[i]->getGroup(), newGroups)) {
412                             subset.push_back(lookup[i]);
413                         }
414                     }
415                 }
416
417                 m->mothurOut(subset[0]->getLabel() + '\t' + thisSet); m->mothurOutEndLine();
418                 rCurve = new Rarefact(subset, rDisplays);
419                 rCurve->getSharedCurve(freq, nIters);
420                 delete rCurve;
421                 
422                 if (subsample) { subsampleLookup(subset, fileNameRoot);  }
423                 
424                 processedLabels.insert(subset[0]->getLabel());
425                 userLabels.erase(subset[0]->getLabel());
426                 
427                 //restore real lastlabel to save below
428                 subset[0]->setLabel(saveLabel);
429                         }
430             
431                         
432                         lastLabel = subset[0]->getLabel();
433                         
434                         //get next line to process
435                         for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  } 
436                         lookup = input.getSharedRAbundVectors();
437             
438             if (lookup[0] != NULL) {
439                 if (thisSet == "") {  subset.clear(); subset = lookup;  }
440                 else {//fill subset with this sets groups
441                     subset.clear();
442                     for (int i = 0; i < lookup.size(); i++) {
443                         if (m->inUsersGroups(lookup[i]->getGroup(), newGroups)) {
444                             subset.push_back(lookup[i]);
445                         }
446                     }
447                 }
448             }else {  subset.clear(); subset.push_back(NULL); }
449
450                 }
451                 
452                 if (m->control_pressed) { 
453                         for(int i=0;i<rDisplays.size();i++){    delete rDisplays[i];    }
454                         for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]);        }
455                         return 0;
456                 }
457                 
458                 //output error messages about any remaining user labels
459                 set<string>::iterator it;
460                 bool needToRun = false;
461                 for (it = userLabels.begin(); it != userLabels.end(); it++) {  
462                         m->mothurOut("Your file does not include the label " + *it); 
463                         if (processedLabels.count(lastLabel) != 1) {
464                                 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
465                                 needToRun = true;
466                         }else {
467                                 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
468                         }
469                 }
470                 
471                 if (m->control_pressed) { 
472                         for(int i=0;i<rDisplays.size();i++){    delete rDisplays[i];    }
473                         for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]);        }
474                         return 0;
475                 }
476                 
477                 //run last label if you need to
478                 if (needToRun == true)  {
479                         for (int i = 0; i < lookup.size(); i++) {  if (lookup[i] != NULL) {     delete lookup[i]; }  } 
480                         lookup = input.getSharedRAbundVectors(lastLabel);
481             
482             if (thisSet == "") {  subset.clear(); subset = lookup;  }
483             else {//fill subset with this sets groups
484                 subset.clear();
485                 for (int i = 0; i < lookup.size(); i++) {
486                     if (m->inUsersGroups(lookup[i]->getGroup(), newGroups)) {
487                         subset.push_back(lookup[i]);
488                     }
489                 }
490             }
491             
492                         m->mothurOut(subset[0]->getLabel() + '\t' + thisSet); m->mothurOutEndLine();
493                         rCurve = new Rarefact(subset, rDisplays);
494                         rCurve->getSharedCurve(freq, nIters);
495                         delete rCurve;
496             
497             if (subsample) { subsampleLookup(subset, fileNameRoot);  }
498                 
499                         for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  } 
500                 }
501                 
502                 for(int i=0;i<rDisplays.size();i++){    delete rDisplays[i];    }       
503
504         
505         return 0;
506     }
507         catch(exception& e) {
508                 m->errorOut(e, "RareFactSharedCommand", "process");
509                 exit(1);
510         }
511 }
512 //**********************************************************************************************************************
513 int RareFactSharedCommand::subsampleLookup(vector<SharedRAbundVector*>& thisLookup, string fileNameRoot) {
514         try {
515         
516         map<string, vector<string> > filenames;
517         for (int thisIter = 0; thisIter < iters; thisIter++) {
518             
519             vector<SharedRAbundVector*> thisItersLookup = thisLookup;
520             
521              //we want the summary results for the whole dataset, then the subsampling
522             SubSample sample;
523             vector<string> tempLabels; //dont need since we arent printing the sampled sharedRabunds
524                 
525             //make copy of lookup so we don't get access violations
526             vector<SharedRAbundVector*> newLookup;
527             for (int k = 0; k < thisItersLookup.size(); k++) {
528                 SharedRAbundVector* temp = new SharedRAbundVector();
529                 temp->setLabel(thisItersLookup[k]->getLabel());
530                 temp->setGroup(thisItersLookup[k]->getGroup());
531                 newLookup.push_back(temp);
532             }
533                 
534             //for each bin
535             for (int k = 0; k < thisItersLookup[0]->getNumBins(); k++) {
536                 if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) {  delete newLookup[j];  } return 0; }
537                 for (int j = 0; j < thisItersLookup.size(); j++) { newLookup[j]->push_back(thisItersLookup[j]->getAbundance(k), thisItersLookup[j]->getGroup()); }
538                 }
539                 
540             tempLabels = sample.getSample(newLookup, subsampleSize);
541             thisItersLookup = newLookup;
542             
543             
544             Rarefact* rCurve;
545             vector<Display*> rDisplays;
546             
547             string thisfileNameRoot = fileNameRoot + toString(thisIter);
548             
549             ValidCalculators validCalculator;
550             for (int i=0; i<Estimators.size(); i++) {
551                 if (validCalculator.isValidCalculator("sharedrarefaction", Estimators[i]) == true) { 
552                     if (Estimators[i] == "sharedobserved") { 
553                         rDisplays.push_back(new RareDisplay(new SharedSobs(), new SharedThreeColumnFile(thisfileNameRoot+getOutputFileNameTag("sharedrarefaction"), "")));
554                         filenames["sharedrarefaction"].push_back(thisfileNameRoot+getOutputFileNameTag("sharedrarefaction"));
555                     }else if (Estimators[i] == "sharednseqs") { 
556                         rDisplays.push_back(new RareDisplay(new SharedNSeqs(), new SharedThreeColumnFile(thisfileNameRoot+getOutputFileNameTag("sharedr_nseqs"), "")));
557                         filenames["sharedr_nseqs"].push_back(thisfileNameRoot+getOutputFileNameTag("sharedr_nseqs"));
558                     }
559                 }
560             }
561             
562             rCurve = new Rarefact(thisItersLookup, rDisplays);
563                         rCurve->getSharedCurve(freq, nIters);
564                         delete rCurve;
565             
566             //clean up memory
567             for (int i = 0; i < thisItersLookup.size(); i++) { delete thisItersLookup[i]; }
568             thisItersLookup.clear();
569             for(int i=0;i<rDisplays.size();i++){        delete rDisplays[i];    }
570         }
571         
572         //create std and ave outputs
573         vector< vector< vector< double > > > results; //iter -> numSampled -> data
574         for (map<string, vector<string> >::iterator it = filenames.begin(); it != filenames.end(); it++) {
575             vector<string> thisTypesFiles = it->second;
576             vector<string> columnHeaders;
577             for (int i = 0; i < thisTypesFiles.size(); i++) {
578                 ifstream in;
579                 m->openInputFile(thisTypesFiles[i], in);
580                 
581                 string headers = m->getline(in); m->gobble(in);
582                 columnHeaders = m->splitWhiteSpace(headers);
583                 int numCols = columnHeaders.size();
584                 
585                 vector<vector<double> > thisFilesLines;
586                 while (!in.eof()) {
587                     if (m->control_pressed) { break; }
588                     vector<double> data; data.resize(numCols, 0);
589                     //read numSampled line
590                     for (int j = 0; j < numCols; j++) { in >> data[j]; m->gobble(in); }
591                     thisFilesLines.push_back(data);
592                 }
593                 in.close();
594                 results.push_back(thisFilesLines);
595                 m->mothurRemove(thisTypesFiles[i]);
596             }
597             
598             if (!m->control_pressed) {
599                 //process results
600                 string outputFile = fileNameRoot + "ave-std." + thisLookup[0]->getLabel() + "." + getOutputFileNameTag(it->first);
601                 ofstream out;
602                 m->openOutputFile(outputFile, out);
603                 outputNames.push_back(outputFile); outputTypes[it->first].push_back(outputFile);
604                 
605                 out << columnHeaders[0] << '\t' << "method\t";
606                 for (int i = 1; i < columnHeaders.size(); i++) { out << columnHeaders[i] << '\t'; }
607                 out << endl;
608             
609                 vector< vector<double> > aveResults; aveResults.resize(results[0].size());
610                 for (int i = 0; i < aveResults.size(); i++) { aveResults[i].resize(results[0][i].size(), 0.0); }
611                 
612                 for (int thisIter = 0; thisIter < iters; thisIter++) { //sum all groups dists for each calculator
613                     for (int i = 0; i < aveResults.size(); i++) {  //initialize sums to zero.
614                         aveResults[i][0] = results[thisIter][i][0];
615                         for (int j = 1; j < aveResults[i].size(); j++) {
616                             aveResults[i][j] += results[thisIter][i][j];
617                         }
618                     }
619                 }
620                 
621                 for (int i = 0; i < aveResults.size(); i++) {  //finds average.
622                     for (int j = 1; j < aveResults[i].size(); j++) {
623                         aveResults[i][j] /= (float) iters;
624                     }
625                 }
626                 
627                 //standard deviation
628                 vector< vector<double> > stdResults; stdResults.resize(results[0].size());
629                 for (int i = 0; i < stdResults.size(); i++) { stdResults[i].resize(results[0][i].size(), 0.0); }
630                 
631                 for (int thisIter = 0; thisIter < iters; thisIter++) { //compute the difference of each dist from the mean, and square the result of each
632                     for (int i = 0; i < stdResults.size(); i++) {  
633                         stdResults[i][0] = aveResults[i][0];
634                         for (int j = 1; j < stdResults[i].size(); j++) {
635                             stdResults[i][j] += ((results[thisIter][i][j] - aveResults[i][j]) * (results[thisIter][i][j] - aveResults[i][j]));
636                         }
637                     }
638                 }
639                 
640                 for (int i = 0; i < stdResults.size(); i++) {  //finds average.
641                     out << aveResults[i][0] << '\t' << "ave\t";
642                     for (int j = 1; j < aveResults[i].size(); j++) { out << aveResults[i][j] << '\t'; }
643                     out << endl;
644                     out << stdResults[i][0] << '\t' << "std\t";
645                     for (int j = 1; j < stdResults[i].size(); j++) {
646                         stdResults[i][j] /= (float) iters;
647                         stdResults[i][j] = sqrt(stdResults[i][j]);
648                         out << stdResults[i][j] << '\t';
649                     }
650                     out << endl;
651                 }
652                 out.close();
653             }
654         }
655         
656         
657         return 0;
658     }
659         catch(exception& e) {
660                 m->errorOut(e, "RareFactSharedCommand", "subsample");
661                 exit(1);
662         }
663 }
664 //**********************************************************************************************************************
665 vector<string> RareFactSharedCommand::createGroupFile(vector<string>& outputNames) {
666         try {
667                 
668                 vector<string> newFileNames;
669                 
670                 //find different types of files
671                 map<string, map<string, string> > typesFiles;
672         map<string, vector< vector<string> > > fileLabels; //combofile name to labels. each label is a vector because it may be unique lci hci.
673         vector<string> groupNames;
674                 for (int i = 0; i < outputNames.size(); i++) {
675             
676                         string extension = m->getExtension(outputNames[i]);
677             string combineFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "groups" + extension;
678                         m->mothurRemove(combineFileName); //remove old file
679             
680                         ifstream in;
681                         m->openInputFile(outputNames[i], in);
682                         
683                         string labels = m->getline(in);
684             
685                         istringstream iss (labels,istringstream::in);
686             string newLabel = ""; vector<string> theseLabels;
687             while(!iss.eof()) {  iss >> newLabel; m->gobble(iss); theseLabels.push_back(newLabel); }
688             vector< vector<string> > allLabels;
689             vector<string> thisSet; thisSet.push_back(theseLabels[0]); allLabels.push_back(thisSet); thisSet.clear(); //makes "numSampled" its own grouping
690             for (int j = 1; j < theseLabels.size()-1; j++) {
691                 if (theseLabels[j+1] == "lci") {
692                     thisSet.push_back(theseLabels[j]); 
693                     thisSet.push_back(theseLabels[j+1]); 
694                     thisSet.push_back(theseLabels[j+2]);
695                     j++; j++;
696                 }else{ //no lci or hci for this calc.
697                     thisSet.push_back(theseLabels[j]); 
698                 }
699                 allLabels.push_back(thisSet); 
700                 thisSet.clear();
701             }
702             fileLabels[combineFileName] = allLabels;
703             
704             map<string, map<string, string> >::iterator itfind = typesFiles.find(extension);
705             if (itfind != typesFiles.end()) {
706                 (itfind->second)[outputNames[i]] = file2Group[i];
707             }else {
708                 map<string, string> temp;  
709                 temp[outputNames[i]] = file2Group[i];
710                 typesFiles[extension] = temp;
711             }
712             if (!(m->inUsersGroups(file2Group[i], groupNames))) {  groupNames.push_back(file2Group[i]); }
713                 }
714                 
715                 //for each type create a combo file
716                 
717                 for (map<string, map<string, string> >::iterator it = typesFiles.begin(); it != typesFiles.end(); it++) {
718                         
719                         ofstream out;
720                         string combineFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "groups" + it->first;
721                         m->openOutputFileAppend(combineFileName, out);
722                         newFileNames.push_back(combineFileName);
723                         map<string, string> thisTypesFiles = it->second; //it->second maps filename to group
724             set<int> numSampledSet;
725             
726                         //open each type summary file
727                         map<string, map<int, vector< vector<string> > > > files; //maps file name to lines in file
728                         int maxLines = 0;
729                         for (map<string, string>::iterator itFileNameGroup = thisTypesFiles.begin(); itFileNameGroup != thisTypesFiles.end(); itFileNameGroup++) {
730                 
731                 string thisfilename = itFileNameGroup->first;
732                 string group = itFileNameGroup->second;
733                 
734                                 ifstream temp;
735                                 m->openInputFile(thisfilename, temp);
736                                 
737                                 //read through first line - labels
738                                 m->getline(temp);       m->gobble(temp);
739                                 
740                                 map<int, vector< vector<string> > > thisFilesLines;
741                                 while (!temp.eof()){
742                     int numSampled = 0;
743                     temp >> numSampled; m->gobble(temp);
744                     
745                     vector< vector<string> > theseReads;
746                     vector<string> thisSet; thisSet.push_back(toString(numSampled)); theseReads.push_back(thisSet); thisSet.clear();
747                     for (int k = 1; k < fileLabels[combineFileName].size(); k++) { //output thing like 0.03-A lci-A hci-A
748                         vector<string> reads;
749                         string next = "";
750                         for (int l = 0; l < fileLabels[combineFileName][k].size(); l++) { //output modified labels
751                             temp >> next; m->gobble(temp);
752                             reads.push_back(next);
753                         }
754                         theseReads.push_back(reads);
755                     }
756                     thisFilesLines[numSampled] = theseReads;
757                     m->gobble(temp);
758                     
759                     numSampledSet.insert(numSampled);
760                                 }
761                                 
762                                 files[group] = thisFilesLines;
763                                 
764                                 //save longest file for below
765                                 if (maxLines < thisFilesLines.size()) { maxLines = thisFilesLines.size(); }
766                                 
767                                 temp.close();
768                                 m->mothurRemove(thisfilename);
769                         }
770                         
771             //output new labels line
772             out << fileLabels[combineFileName][0][0] << '\t';
773             for (int k = 1; k < fileLabels[combineFileName].size(); k++) { //output thing like 0.03-A lci-A hci-A
774                 for (int n = 0; n < groupNames.size(); n++) { // for each group
775                     for (int l = 0; l < fileLabels[combineFileName][k].size(); l++) { //output modified labels
776                         out << fileLabels[combineFileName][k][l] << '-' << groupNames[n] << '\t';
777                     }
778                 }
779             }
780                         out << endl;
781             
782                         //for each label
783                         for (set<int>::iterator itNumSampled = numSampledSet.begin(); itNumSampled != numSampledSet.end(); itNumSampled++) {
784                                 
785                 out << (*itNumSampled) << '\t';
786                 
787                 if (m->control_pressed) { break; }
788                 
789                 for (int k = 1; k < fileLabels[combineFileName].size(); k++) { //each chunk
790                                     //grab data for each group
791                     for (map<string, map<int, vector< vector<string> > > >::iterator itFileNameGroup = files.begin(); itFileNameGroup != files.end(); itFileNameGroup++) {
792                         
793                         string group = itFileNameGroup->first;
794                         
795                         map<int, vector< vector<string> > >::iterator itLine = files[group].find(*itNumSampled);
796                         if (itLine != files[group].end()) { 
797                             for (int l = 0; l < (itLine->second)[k].size(); l++) { 
798                                 out << (itLine->second)[k][l] << '\t';
799                                 
800                             }                             
801                         }else { 
802                             for (int l = 0; l < fileLabels[combineFileName][k].size(); l++) { 
803                                 out << "NA" << '\t';
804                             } 
805                         }
806                     }
807                 }
808                 out << endl;
809                         }       
810                         out.close();
811                 }
812                 
813                 //return combine file name
814                 return newFileNames;
815                 
816         }
817         catch(exception& e) {
818                 m->errorOut(e, "RareFactSharedCommand", "createGroupFile");
819                 exit(1);
820         }
821 }
822 //**********************************************************************************************************************