]> git.donarmstrong.com Git - mothur.git/blob - unifracunweightedcommand.cpp
fixed bug in phylo.diversity rooting. added filename patterns and create filename...
[mothur.git] / unifracunweightedcommand.cpp
1 /*
2  *  unifracunweightedcommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 2/9/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "unifracunweightedcommand.h"
11 #include "treereader.h"
12 #include "subsample.h"
13 #include "consensus.h"
14
15 //**********************************************************************************************************************
16 vector<string> UnifracUnweightedCommand::setParameters(){       
17         try {
18                 CommandParameter ptree("tree", "InputTypes", "", "", "none", "none", "none","unweighted-uwsummary",false,true,true); parameters.push_back(ptree);
19         CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none","",false,false,true); parameters.push_back(pname);
20         CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none","",false,false,true); parameters.push_back(pcount);
21                 CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","",false,false,true); parameters.push_back(pgroup);
22                 CommandParameter pgroups("groups", "String", "", "", "", "", "","",false,false); parameters.push_back(pgroups);
23                 CommandParameter piters("iters", "Number", "", "1000", "", "", "","",false,false); parameters.push_back(piters);
24                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
25                 CommandParameter prandom("random", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(prandom);
26                 CommandParameter pdistance("distance", "Multiple", "column-lt-square-phylip", "column", "", "", "","phylip-column",false,false); parameters.push_back(pdistance);
27         CommandParameter psubsample("subsample", "String", "", "", "", "", "","",false,false); parameters.push_back(psubsample);
28         CommandParameter pconsensus("consensus", "Boolean", "", "F", "", "", "","tree",false,false); parameters.push_back(pconsensus);
29         CommandParameter proot("root", "Boolean", "F", "", "", "", "","",false,false); parameters.push_back(proot);
30                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
31                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
32                 
33                 vector<string> myArray;
34                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
35                 return myArray;
36         }
37         catch(exception& e) {
38                 m->errorOut(e, "UnifracUnweightedCommand", "setParameters");
39                 exit(1);
40         }
41 }
42 //**********************************************************************************************************************
43 string UnifracUnweightedCommand::getHelpString(){       
44         try {
45                 string helpString = "";
46                 helpString += "The unifrac.unweighted command parameters are tree, group, name, count, groups, iters, distance, processors, root and random.  tree parameter is required unless you have valid current tree file.\n";
47                 helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed.  You must enter at least 1 valid group.\n";
48                 helpString += "The group names are separated by dashes.  The iters parameter allows you to specify how many random trees you would like compared to your tree.\n";
49                 helpString += "The distance parameter allows you to create a distance file from the results. The default is false. You may set distance to lt, square or column.\n";
50                 helpString += "The random parameter allows you to shut off the comparison to random trees. The default is false, meaning compare don't your trees with randomly generated trees.\n";
51                 helpString += "The root parameter allows you to include the entire root in your calculations. The default is false, meaning stop at the root for this comparision instead of the root of the entire tree.\n";
52                 helpString += "The processors parameter allows you to specify the number of processors to use. The default is 1.\n";
53                 helpString += "The unifrac.unweighted command should be in the following format: unifrac.unweighted(groups=yourGroups, iters=yourIters).\n";
54         helpString += "The subsample parameter allows you to enter the size pergroup of the sample or you can set subsample=T and mothur will use the size of your smallest group. The subsample parameter may only be used with a group file.\n";
55         helpString += "The consensus parameter allows you to indicate you would like trees built from distance matrices created with the results of the subsampling, as well as a consensus tree built from these trees. Default=F.\n";
56                 helpString += "Example unifrac.unweighted(groups=A-B-C, iters=500).\n";
57                 helpString += "The default value for groups is all the groups in your groupfile, and iters is 1000.\n";
58                 helpString += "The unifrac.unweighted command output two files: .unweighted and .uwsummary their descriptions are in the manual.\n";
59                 helpString += "Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n";
60                 return helpString;
61         }
62         catch(exception& e) {
63                 m->errorOut(e, "UnifracUnweightedCommand", "getHelpString");
64                 exit(1);
65         }
66 }
67 //**********************************************************************************************************************
68 string UnifracUnweightedCommand::getOutputPattern(string type) {
69     try {
70         string pattern = "";
71         if (type == "unweighted")            {  pattern = "[filename],unweighted-[filename],[tag],unweighted";   }
72         else if (type == "uwsummary")        {  pattern = "[filename],uwsummary";   }
73         else if (type == "phylip")           {  pattern = "[filename],[tag],[tag2],dist";   }
74         else if (type == "column")           {  pattern = "[filename],[tag],[tag2],dist";   }
75         else if (type == "tree")             {  pattern = "[filename],[tag],[tag2],tre";   }
76         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
77         
78         return pattern;
79     }
80     catch(exception& e) {
81         m->errorOut(e, "UnifracUnweightedCommand", "getOutputPattern");
82         exit(1);
83     }
84 }
85 //**********************************************************************************************************************
86 UnifracUnweightedCommand::UnifracUnweightedCommand(){   
87         try {
88                 abort = true; calledHelp = true; 
89                 setParameters();
90                 vector<string> tempOutNames;
91                 outputTypes["unweighted"] = tempOutNames;
92                 outputTypes["uwsummary"] = tempOutNames;
93                 outputTypes["phylip"] = tempOutNames;
94                 outputTypes["column"] = tempOutNames;
95         outputTypes["tree"] = tempOutNames;
96         }
97         catch(exception& e) {
98                 m->errorOut(e, "UnifracUnweightedCommand", "UnifracUnweightedCommand");
99                 exit(1);
100         }
101 }
102 /***********************************************************/
103 UnifracUnweightedCommand::UnifracUnweightedCommand(string option)  {
104         try {
105                 abort = false; calledHelp = false;   
106                 
107                         
108                 //allow user to run help
109                 if(option == "help") { help(); abort = true; calledHelp = true; }
110                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
111                 
112                 else {
113                         vector<string> myArray = setParameters();
114                         
115                         OptionParser parser(option);
116                         map<string,string> parameters = parser.getParameters();
117                         map<string,string>::iterator it;
118                         
119                         ValidParameters validParameter;
120                 
121                         //check to make sure all parameters are valid for command
122                         for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
123                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
124                         }
125                         
126                         //initialize outputTypes
127                         vector<string> tempOutNames;
128                         outputTypes["unweighted"] = tempOutNames;
129                         outputTypes["uwsummary"] = tempOutNames;
130                         outputTypes["phylip"] = tempOutNames;
131                         outputTypes["column"] = tempOutNames;
132             outputTypes["tree"] = tempOutNames;
133                         
134                         //if the user changes the input directory command factory will send this info to us in the output parameter 
135                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
136                         if (inputDir == "not found"){   inputDir = "";          }
137                         else {
138                                 string path;
139                                 it = parameters.find("tree");
140                                 //user has given a template file
141                                 if(it != parameters.end()){ 
142                                         path = m->hasPath(it->second);
143                                         //if the user has not given a path then, add inputdir. else leave path alone.
144                                         if (path == "") {       parameters["tree"] = inputDir + it->second;             }
145                                 }
146                                 
147                                 it = parameters.find("group");
148                                 //user has given a template file
149                                 if(it != parameters.end()){ 
150                                         path = m->hasPath(it->second);
151                                         //if the user has not given a path then, add inputdir. else leave path alone.
152                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
153                                 }
154                                 
155                                 it = parameters.find("name");
156                                 //user has given a template file
157                                 if(it != parameters.end()){ 
158                                         path = m->hasPath(it->second);
159                                         //if the user has not given a path then, add inputdir. else leave path alone.
160                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
161                                 }
162                 
163                 it = parameters.find("count");
164                                 //user has given a template file
165                                 if(it != parameters.end()){ 
166                                         path = m->hasPath(it->second);
167                                         //if the user has not given a path then, add inputdir. else leave path alone.
168                                         if (path == "") {       parameters["count"] = inputDir + it->second;            }
169                                 }
170                         }
171                         
172             //check for required parameters
173                         treefile = validParameter.validFile(parameters, "tree", true);
174                         if (treefile == "not open") { abort = true; }
175                         else if (treefile == "not found") {                             //if there is a current design file, use it
176                                 treefile = m->getTreeFile(); 
177                                 if (treefile != "") { m->mothurOut("Using " + treefile + " as input file for the tree parameter."); m->mothurOutEndLine(); }
178                                 else {  m->mothurOut("You have no current tree file and the tree parameter is required."); m->mothurOutEndLine(); abort = true; }                                                               
179                         }else { m->setTreeFile(treefile); }     
180                         
181                         //check for required parameters
182                         groupfile = validParameter.validFile(parameters, "group", true);
183                         if (groupfile == "not open") { abort = true; }
184                         else if (groupfile == "not found") { groupfile = ""; }
185                         else { m->setGroupFile(groupfile); }
186                         
187                         namefile = validParameter.validFile(parameters, "name", true);
188                         if (namefile == "not open") { namefile = ""; abort = true; }
189                         else if (namefile == "not found") { namefile = ""; }
190                         else { m->setNameFile(namefile); }
191             
192             countfile = validParameter.validFile(parameters, "count", true);
193                         if (countfile == "not open") { countfile = ""; abort = true; }
194                         else if (countfile == "not found") { countfile = "";  } 
195                         else { m->setCountTableFile(countfile); }
196             
197             if ((namefile != "") && (countfile != "")) {
198                 m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
199             }
200                         
201             if ((groupfile != "") && (countfile != "")) {
202                 m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
203             }
204                         
205                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = m->hasPath(treefile);       }
206                         
207                         //check for optional parameter and set defaults
208                         // ...at some point should added some additional type checking...
209                         groups = validParameter.validFile(parameters, "groups", false);                 
210                         if (groups == "not found") { groups = ""; }
211                         else { 
212                                 m->splitAtDash(groups, Groups);
213                                 m->setGroups(Groups);
214                         }
215                                 
216                         itersString = validParameter.validFile(parameters, "iters", false);                             if (itersString == "not found") { itersString = "1000"; }
217                         m->mothurConvert(itersString, iters); 
218                         
219                         string temp = validParameter.validFile(parameters, "distance", false);                  
220                         if (temp == "not found") { phylip = false; outputForm = ""; }
221                         else{
222                 if (temp=="phylip") { temp = "lt"; }
223                                 if ((temp == "lt") || (temp == "column") || (temp == "square")) {  phylip = true;  outputForm = temp; }
224                                 else { m->mothurOut("Options for distance are: lt, square, or column. Using lt."); m->mothurOutEndLine(); phylip = true; outputForm = "lt"; }
225                         }
226                         
227                         temp = validParameter.validFile(parameters, "random", false);                                   if (temp == "not found") { temp = "f"; }
228                         random = m->isTrue(temp);
229                         
230                         temp = validParameter.validFile(parameters, "root", false);                                     if (temp == "not found") { temp = "F"; }
231                         includeRoot = m->isTrue(temp);
232                         
233                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
234                         m->setProcessors(temp);
235                         m->mothurConvert(temp, processors); 
236                         
237             temp = validParameter.validFile(parameters, "subsample", false);            if (temp == "not found") { temp = "F"; }
238                         if (m->isNumeric1(temp)) { m->mothurConvert(temp, subsampleSize); subsample = true; }
239             else {  
240                 if (m->isTrue(temp)) { subsample = true; subsampleSize = -1; }  //we will set it to smallest group later 
241                 else { subsample = false; }
242             }
243                         
244             if (!subsample) { subsampleIters = 0;   }
245             else { subsampleIters = iters;          }
246             
247             temp = validParameter.validFile(parameters, "consensus", false);                                    if (temp == "not found") { temp = "F"; }
248                         consensus = m->isTrue(temp);
249             
250                         if (subsample && random) {  m->mothurOut("[ERROR]: random must be false, if subsample=t.\n"); abort=true;  } 
251             if (countfile == "") { if (subsample && (groupfile == "")) {  m->mothurOut("[ERROR]: if subsample=t, a group file must be provided.\n"); abort=true;  } }
252             else {  
253                 CountTable testCt; 
254                 if ((!testCt.testGroups(countfile)) && (subsample)) {
255                     m->mothurOut("[ERROR]: if subsample=t, a count file with group info must be provided.\n"); abort=true;  
256                 }
257             }
258             if (subsample && (!phylip)) { phylip=true; outputForm = "lt"; }
259             if (consensus && (!subsample)) { m->mothurOut("[ERROR]: you cannot use consensus without subsample.\n"); abort=true; }
260
261                         if (!random) {  iters = 0;  } //turn off random calcs
262                         
263                         //if user selects distance = true and no groups it won't calc the pairwise
264                         if ((phylip) && (Groups.size() == 0)) {
265                                 groups = "all";
266                                 m->splitAtDash(groups, Groups);
267                                 m->setGroups(Groups);
268                         }
269                         
270                         if (countfile=="") {
271                 if (namefile == "") {
272                     vector<string> files; files.push_back(treefile);
273                     parser.getNameFile(files);
274                 } 
275             }
276                 }
277                 
278         }
279         catch(exception& e) {
280                 m->errorOut(e, "UnifracUnweightedCommand", "UnifracUnweightedCommand");
281                 exit(1);
282         }
283 }
284
285 /***********************************************************/
286 int UnifracUnweightedCommand::execute() {
287         try {
288                 
289                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
290                 
291                 m->setTreeFile(treefile);
292                 
293                 TreeReader* reader;
294         if (countfile == "") { reader = new TreeReader(treefile, groupfile, namefile); }
295         else { reader = new TreeReader(treefile, countfile); }
296         T = reader->getTrees();
297         ct = T[0]->getCountTable();
298         delete reader;
299         
300         map<string, string> variables; 
301                 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(treefile));
302                 sumFile = getOutputFileName("uwsummary",variables);
303                 outputNames.push_back(sumFile); outputTypes["uwsummary"].push_back(sumFile);
304                 m->openOutputFile(sumFile, outSum);
305                 
306                 SharedUtil util;
307                 Groups = m->getGroups();
308                 vector<string> namesGroups = ct->getNamesOfGroups();
309                 util.setGroups(Groups, namesGroups, allGroups, numGroups, "unweighted");        //sets the groups the user wants to analyze
310                 
311                 Unweighted unweighted(includeRoot);
312                 
313                 int start = time(NULL);
314         
315         //set or check size
316         if (subsample) {
317             //user has not set size, set size = smallest samples size
318             if (subsampleSize == -1) { 
319                 vector<string> temp; temp.push_back(Groups[0]);
320                 subsampleSize = ct->getGroupCount(Groups[0]); //num in first group
321                 for (int i = 1; i < Groups.size(); i++) {
322                     int thisSize = ct->getGroupCount(Groups[i]);
323                     if (thisSize < subsampleSize) {     subsampleSize = thisSize;       }
324                 }
325                 m->mothurOut("\nSetting subsample size to " + toString(subsampleSize) + ".\n\n");
326             }else { //eliminate any too small groups
327                 vector<string> newGroups = Groups;
328                 Groups.clear();
329                 for (int i = 0; i < newGroups.size(); i++) {
330                     int thisSize = ct->getGroupCount(newGroups[i]);
331                     
332                     if (thisSize >= subsampleSize) {    Groups.push_back(newGroups[i]); }
333                     else {   m->mothurOut("You have selected a size that is larger than "+newGroups[i]+" number of sequences, removing "+newGroups[i]+".\n"); }
334                 } 
335                 m->setGroups(Groups);
336             }
337         }
338                 
339         util.getCombos(groupComb, Groups, numComp);
340                 m->setGroups(Groups);
341         
342                 if (numGroups == 1) { numComp++; groupComb.push_back(allGroups); }
343         
344                 if (numComp < processors) { processors = numComp;  }
345         
346         if (consensus && (numComp < 2)) { m->mothurOut("consensus can only be used with numComparisions greater than 1, setting consensus=f.\n"); consensus=false; }
347                 
348                 outSum << "Tree#" << '\t' << "Groups" << '\t'  <<  "UWScore" <<'\t';
349                 m->mothurOut("Tree#\tGroups\tUWScore\t");
350                 if (random) { outSum << "UWSig"; m->mothurOut("UWSig"); }
351                 outSum << endl; m->mothurOutEndLine();
352          
353                 //get pscores for users trees
354                 for (int i = 0; i < T.size(); i++) {
355                         if (m->control_pressed) { delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; }outSum.close(); for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]);  } return 0; }
356                         
357             counter = 0;
358                         
359                         if (random)  {  
360                 variables["[filename]"] = outputDir + m->getSimpleName(treefile);
361                 variables["[tag]"] = toString(i+1);
362                 string unFileName = getOutputFileName("unweighted", variables);
363                                 output = new ColumnFile(unFileName, itersString);
364                                 outputNames.push_back(unFileName); outputTypes["unweighted"].push_back(unFileName);
365                         }
366                         
367                         
368                         //get unweighted for users tree
369                         rscoreFreq.resize(numComp);  
370                         rCumul.resize(numComp);  
371                         utreeScores.resize(numComp);  
372                         UWScoreSig.resize(numComp); 
373             
374             vector<double> userData; userData.resize(numComp,0);  //weighted score info for user tree. data[0] = weightedscore AB, data[1] = weightedscore AC...
375
376                         userData = unweighted.getValues(T[i], processors, outputDir);  //userData[0] = unweightedscore
377                 
378                         if (m->control_pressed) { delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output;  } outSum.close();  for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]);  }return 0; }
379                         
380                         //output scores for each combination
381                         for(int k = 0; k < numComp; k++) {
382                                 //saves users score
383                                 utreeScores[k].push_back(userData[k]);
384                                 
385                                 //add users score to validscores
386                                 validScores[userData[k]] = userData[k];
387                 
388                 if (!random) { UWScoreSig[k].push_back(0.0);    }
389                         }
390             
391             if (random) {  runRandomCalcs(T[i], userData);  }
392                         
393                         if (m->control_pressed) { delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output;  } outSum.close(); for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]);  } return 0;  }
394             
395             int startSubsample = time(NULL);
396             
397             //subsample loop
398             vector< vector<double> > calcDistsTotals;  //each iter, each groupCombos dists. this will be used to make .dist files
399             for (int thisIter = 0; thisIter < subsampleIters; thisIter++) { //subsampleIters=0, if subsample=f.
400                 if (m->control_pressed) { break; }
401                 
402                 //copy to preserve old one - would do this in subsample but memory cleanup becomes messy.
403                 CountTable* newCt = new CountTable();
404                  
405                 //uses method of setting groups to doNotIncludeMe
406                 int sampleTime = 0;
407                 if (m->debug) { sampleTime = time(NULL); }
408                 SubSample sample;
409                 Tree* subSampleTree = sample.getSample(T[i], ct, newCt, subsampleSize);
410                 
411                 if (m->debug) { m->mothurOut("[DEBUG]: iter " + toString(thisIter) + " took " + toString(time(NULL) - sampleTime) + " seconds to sample tree.\n"); }
412                 
413                 //call new weighted function
414                 vector<double> iterData; iterData.resize(numComp,0);
415                 Unweighted thisUnweighted(includeRoot);
416                 iterData = thisUnweighted.getValues(subSampleTree, processors, outputDir); //userData[0] = weightedscore
417         
418                 //save data to make ave dist, std dist
419                 calcDistsTotals.push_back(iterData);
420                 
421                 delete newCt;
422                 delete subSampleTree;
423                 
424                 if((thisIter+1) % 100 == 0){    m->mothurOut(toString(thisIter+1)); m->mothurOutEndLine();              }
425             }
426             if (subsample) { m->mothurOut("It took " + toString(time(NULL) - startSubsample) + " secs to run the subsampling."); m->mothurOutEndLine(); }
427             
428             if (m->control_pressed) { delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output;  } outSum.close(); for (int i = 0; i < outputNames.size(); i++) {      m->mothurRemove(outputNames[i]);  } return 0;  }
429
430             if (subsample) {  getAverageSTDMatrices(calcDistsTotals, i); }
431             if (consensus) {  getConsensusTrees(calcDistsTotals, i);  }
432             
433             //print output files
434                         printUWSummaryFile(i);
435                         if (random)  {  printUnweightedFile();  delete output;  }
436                         if (phylip) {   createPhylipFile(i);            }
437                         
438                         rscoreFreq.clear(); 
439                         rCumul.clear();  
440                         validScores.clear(); 
441                         utreeScores.clear();  
442                         UWScoreSig.clear(); 
443                 }
444                 
445
446                 outSum.close();
447                 delete ct; 
448                 for (int i = 0; i < T.size(); i++) { delete T[i]; }
449                 
450                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);  }     return 0; }
451                 
452                 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to run unifrac.unweighted."); m->mothurOutEndLine();
453                 
454                 //set phylip file as new current phylipfile
455                 string current = "";
456                 itTypes = outputTypes.find("phylip");
457                 if (itTypes != outputTypes.end()) {
458                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setPhylipFile(current); }
459                 }
460                 
461                 //set column file as new current columnfile
462                 itTypes = outputTypes.find("column");
463                 if (itTypes != outputTypes.end()) {
464                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setColumnFile(current); }
465                 }
466                 
467                 m->mothurOutEndLine();
468                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
469                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
470                 m->mothurOutEndLine();
471                 
472                 return 0;
473                 
474         }
475         catch(exception& e) {
476                 m->errorOut(e, "UnifracUnweightedCommand", "execute");
477                 exit(1);
478         }
479 }
480 /**************************************************************************************************/
481 int UnifracUnweightedCommand::getAverageSTDMatrices(vector< vector<double> >& dists, int treeNum) {
482         try {
483         //we need to find the average distance and standard deviation for each groups distance
484         
485         //finds sum
486         vector<double> averages; averages.resize(numComp, 0); 
487         for (int thisIter = 0; thisIter < subsampleIters; thisIter++) {
488             for (int i = 0; i < dists[thisIter].size(); i++) {  
489                 averages[i] += dists[thisIter][i];
490             }
491         }
492         
493         //finds average.
494         for (int i = 0; i < averages.size(); i++) {  averages[i] /= (float) subsampleIters; }
495         
496         //find standard deviation
497         vector<double> stdDev; stdDev.resize(numComp, 0);
498         
499         for (int thisIter = 0; thisIter < subsampleIters; thisIter++) { //compute the difference of each dist from the mean, and square the result of each
500             for (int j = 0; j < dists[thisIter].size(); j++) {
501                 stdDev[j] += ((dists[thisIter][j] - averages[j]) * (dists[thisIter][j] - averages[j]));
502             }
503         }
504         for (int i = 0; i < stdDev.size(); i++) {  
505             stdDev[i] /= (float) subsampleIters; 
506             stdDev[i] = sqrt(stdDev[i]);
507         }
508         
509         //make matrix with scores in it
510         vector< vector<double> > avedists;      avedists.resize(m->getNumGroups());
511         for (int i = 0; i < m->getNumGroups(); i++) {
512             avedists[i].resize(m->getNumGroups(), 0.0);
513         }
514         
515         //make matrix with scores in it
516         vector< vector<double> > stddists;      stddists.resize(m->getNumGroups());
517         for (int i = 0; i < m->getNumGroups(); i++) {
518             stddists[i].resize(m->getNumGroups(), 0.0);
519         }
520         
521         //flip it so you can print it
522         int count = 0;
523         for (int r=0; r<m->getNumGroups(); r++) { 
524             for (int l = 0; l < r; l++) {
525                 avedists[r][l] = averages[count];
526                 avedists[l][r] = averages[count];
527                 stddists[r][l] = stdDev[count];
528                 stddists[l][r] = stdDev[count];
529                 count++;
530             }
531         }
532         
533         map<string, string> variables; 
534                 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(treefile));
535         variables["[tag]"] = toString(treeNum+1);
536         variables["[tag2]"] = "unweighted.ave";
537         string aveFileName = getOutputFileName("phylip",variables);
538         if (outputForm != "column") { outputNames.push_back(aveFileName); outputTypes["phylip"].push_back(aveFileName);  }
539         else { outputNames.push_back(aveFileName); outputTypes["column"].push_back(aveFileName);  }
540         ofstream out;
541         m->openOutputFile(aveFileName, out);
542         
543         variables["[tag2]"] = "unweighted.std";
544         string stdFileName = getOutputFileName("phylip",variables);
545         if (outputForm != "column") { outputNames.push_back(stdFileName); outputTypes["phylip"].push_back(stdFileName); }
546         else { outputNames.push_back(stdFileName); outputTypes["column"].push_back(stdFileName); }
547         ofstream outStd;
548         m->openOutputFile(stdFileName, outStd);
549         
550         if ((outputForm == "lt") || (outputForm == "square")) {
551             //output numSeqs
552             out << m->getNumGroups() << endl;
553             outStd << m->getNumGroups() << endl;
554         }
555         
556         //output to file
557         for (int r=0; r<m->getNumGroups(); r++) { 
558             //output name
559             string name = (m->getGroups())[r];
560             if (name.length() < 10) { //pad with spaces to make compatible
561                 while (name.length() < 10) {  name += " ";  }
562             }
563             
564             if (outputForm == "lt") {
565                 out << name << '\t';
566                 outStd << name << '\t';
567                 
568                 //output distances
569                 for (int l = 0; l < r; l++) {   out  << avedists[r][l] << '\t';  outStd  << stddists[r][l] << '\t';}
570                 out << endl;  outStd << endl;
571             }else if (outputForm == "square") {
572                 out << name << '\t';
573                 outStd << name << '\t';
574                 
575                 //output distances
576                 for (int l = 0; l < m->getNumGroups(); l++) {   out  << avedists[r][l] << '\t'; outStd  << stddists[r][l] << '\t'; }
577                 out << endl; outStd << endl;
578             }else{
579                 //output distances
580                 for (int l = 0; l < r; l++) {   
581                     string otherName = (m->getGroups())[l];
582                     if (otherName.length() < 10) { //pad with spaces to make compatible
583                         while (otherName.length() < 10) {  otherName += " ";  }
584                     }
585                     
586                     out  << name << '\t' << otherName << avedists[r][l] << endl;  
587                     outStd  << name << '\t' << otherName << stddists[r][l] << endl; 
588                 }
589             }
590         }
591         out.close();
592         outStd.close();
593         
594         return 0;
595     }
596         catch(exception& e) {
597                 m->errorOut(e, "UnifracUnweightedCommand", "getAverageSTDMatrices");
598                 exit(1);
599         }
600 }
601
602 /**************************************************************************************************/
603 int UnifracUnweightedCommand::getConsensusTrees(vector< vector<double> >& dists, int treeNum) {
604         try {
605         
606         //used in tree constructor 
607         m->runParse = false;
608         
609         //create treemap class from groupmap for tree class to use
610         CountTable newCt;
611         set<string> nameMap;
612         map<string, string> groupMap;
613         set<string> gps;
614         for (int i = 0; i < m->getGroups().size(); i++) { 
615             nameMap.insert(m->getGroups()[i]); 
616             gps.insert(m->getGroups()[i]); 
617             groupMap[m->getGroups()[i]] = m->getGroups()[i];
618         }
619         newCt.createTable(nameMap, groupMap, gps);
620         
621         //clear  old tree names if any
622         m->Treenames.clear();
623         
624         //fills globaldatas tree names
625         m->Treenames = m->getGroups();
626         
627         vector<Tree*> newTrees = buildTrees(dists, treeNum, newCt); //also creates .all.tre file containing the trees created
628         
629         if (m->control_pressed) { return 0; }
630         
631         Consensus con;
632         Tree* conTree = con.getTree(newTrees);
633         
634         //create a new filename
635         map<string, string> variables; 
636                 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(treefile));
637         variables["[tag]"] = toString(treeNum+1);
638         variables["[tag2]"] = "unweighted.cons";
639         string conFile = getOutputFileName("tree",variables);                           
640         outputNames.push_back(conFile); outputTypes["tree"].push_back(conFile); 
641         ofstream outTree;
642         m->openOutputFile(conFile, outTree);
643         
644         if (conTree != NULL) { conTree->print(outTree, "boot"); delete conTree; }
645         outTree.close();
646         
647         return 0;
648         
649     }
650         catch(exception& e) {
651                 m->errorOut(e, "UnifracUnweightedCommand", "getConsensusTrees");
652                 exit(1);
653         }
654 }
655 /**************************************************************************************************/
656
657 vector<Tree*> UnifracUnweightedCommand::buildTrees(vector< vector<double> >& dists, int treeNum, CountTable& myct) {
658         try {
659         
660         vector<Tree*> trees;
661         
662         //create a new filename
663         map<string, string> variables; 
664                 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(treefile));
665         variables["[tag]"] = toString(treeNum+1);
666         variables["[tag2]"] = "unweighted.all";
667         string outputFile = getOutputFileName("tree",variables);                                
668         outputNames.push_back(outputFile); outputTypes["tree"].push_back(outputFile); 
669         
670         ofstream outAll;
671         m->openOutputFile(outputFile, outAll);
672         
673         
674         for (int i = 0; i < dists.size(); i++) { //dists[0] are the dists for the first subsampled tree.
675             
676             if (m->control_pressed) { break; }
677             
678             //make matrix with scores in it
679             vector< vector<double> > sims;      sims.resize(m->getNumGroups());
680             for (int j = 0; j < m->getNumGroups(); j++) {
681                 sims[j].resize(m->getNumGroups(), 0.0);
682             }
683             
684             int count = 0;
685                         for (int r=0; r<m->getNumGroups(); r++) { 
686                                 for (int l = 0; l < r; l++) {
687                     double sim = -(dists[i][count]-1.0);
688                                         sims[r][l] = sim;
689                                         sims[l][r] = sim;
690                                         count++;
691                                 }
692                         }
693             
694             //create tree
695             Tree* tempTree = new Tree(&myct, sims);
696             tempTree->assembleTree();
697             
698             trees.push_back(tempTree);
699             
700             //print tree
701             tempTree->print(outAll);
702         }
703         
704         outAll.close();
705         
706         if (m->control_pressed) {  for (int i = 0; i < trees.size(); i++) {  delete trees[i]; trees[i] = NULL; } m->mothurRemove(outputFile); }
707         
708         return trees;
709     }
710         catch(exception& e) {
711                 m->errorOut(e, "UnifracUnweightedCommand", "buildTrees");
712                 exit(1);
713         }
714 }
715 /**************************************************************************************************/
716
717 int UnifracUnweightedCommand::runRandomCalcs(Tree* thisTree, vector<double> usersScores) {
718         try {
719         vector<double> randomData; randomData.resize(numComp,0); //weighted score info for random trees. data[0] = weightedscore AB, data[1] = weightedscore AC...
720         
721         Unweighted unweighted(includeRoot);
722         
723         //get unweighted scores for random trees - if random is false iters = 0
724         for (int j = 0; j < iters; j++) {
725             
726             //we need a different getValues because when we swap the labels we only want to swap those in each pairwise comparison
727             randomData = unweighted.getValues(thisTree, "", "", processors, outputDir);
728             
729             if (m->control_pressed) { return 0; }
730                         
731             for(int k = 0; k < numComp; k++) {  
732                 //add trees unweighted score to map of scores
733                 map<float,float>::iterator it = rscoreFreq[k].find(randomData[k]);
734                 if (it != rscoreFreq[k].end()) {//already have that score
735                     rscoreFreq[k][randomData[k]]++;
736                 }else{//first time we have seen this score
737                     rscoreFreq[k][randomData[k]] = 1;
738                 }
739                                 
740                 //add randoms score to validscores
741                 validScores[randomData[k]] = randomData[k];
742             }
743         }
744         
745         for(int a = 0; a < numComp; a++) {
746             float rcumul = 1.0000;
747     
748             //this loop fills the cumulative maps and put 0.0000 in the score freq map to make it easier to print.
749             for (map<float,float>::iterator it = validScores.begin(); it != validScores.end(); it++) { 
750                 //make rscoreFreq map and rCumul
751                 map<float,float>::iterator it2 = rscoreFreq[a].find(it->first);
752                 rCumul[a][it->first] = rcumul;
753                 //get percentage of random trees with that info
754                 if (it2 != rscoreFreq[a].end()) {  rscoreFreq[a][it->first] /= iters; rcumul-= it2->second;  }
755                 else { rscoreFreq[a][it->first] = 0.0000; } //no random trees with that score
756             }
757             UWScoreSig[a].push_back(rCumul[a][usersScores[a]]);
758         }
759         
760         return 0;
761         }
762         catch(exception& e) {
763                 m->errorOut(e, "UnifracUnweightedCommand", "runRandomCalcs");
764                 exit(1);
765         }
766 }
767 /***********************************************************/
768 void UnifracUnweightedCommand::printUnweightedFile() {
769         try {
770                 vector<double> data;
771                 vector<string> tags;
772                 
773                 tags.push_back("Score");
774                 tags.push_back("RandFreq"); tags.push_back("RandCumul");
775                         
776                 for(int a = 0; a < numComp; a++) {
777                         output->initFile(groupComb[a], tags);
778                         //print each line
779                         for (map<float,float>::iterator it = validScores.begin(); it != validScores.end(); it++) { 
780                                 data.push_back(it->first);  data.push_back(rscoreFreq[a][it->first]); data.push_back(rCumul[a][it->first]);                                             
781                                 output->output(data);
782                                 data.clear();
783                         } 
784                         output->resetFile();
785                 }
786         }
787         catch(exception& e) {
788                 m->errorOut(e, "UnifracUnweightedCommand", "printUnweightedFile");
789                 exit(1);
790         }
791 }
792
793 /***********************************************************/
794 void UnifracUnweightedCommand::printUWSummaryFile(int i) {
795         try {
796                                 
797                 //format output
798                 outSum.setf(ios::fixed, ios::floatfield); outSum.setf(ios::showpoint);
799                         
800                 //print each line
801
802                 for(int a = 0; a < numComp; a++) {
803                         outSum << i+1 << '\t';
804                         m->mothurOut(toString(i+1) + "\t");
805                         
806                         if (random) {
807                                 if (UWScoreSig[a][0] > (1/(float)iters)) {
808                                         outSum << setprecision(6) << groupComb[a]  << '\t' << utreeScores[a][0] << '\t' << setprecision(itersString.length()) << UWScoreSig[a][0] << endl;
809                                         cout << setprecision(6)  << groupComb[a]  << '\t' << utreeScores[a][0] << '\t' << setprecision(itersString.length()) << UWScoreSig[a][0] << endl; 
810                                         m->mothurOutJustToLog(groupComb[a]  + "\t" + toString(utreeScores[a][0])  + "\t" + toString(UWScoreSig[a][0])+ "\n"); 
811                                 }else {
812                                         outSum << setprecision(6) << groupComb[a]  << '\t' << utreeScores[a][0] << '\t' << setprecision(itersString.length()) << "<" << (1/float(iters)) << endl;
813                                         cout << setprecision(6)  << groupComb[a]  << '\t' << utreeScores[a][0] << '\t' << setprecision(itersString.length()) << "<" << (1/float(iters)) << endl; 
814                                         m->mothurOutJustToLog(groupComb[a]  + "\t" + toString(utreeScores[a][0])  + "\t<" + toString((1/float(iters))) + "\n"); 
815                                 }
816                         }else{
817                                 outSum << setprecision(6) << groupComb[a]  << '\t' << utreeScores[a][0]  << endl;
818                                 cout << setprecision(6)  << groupComb[a]  << '\t' << utreeScores[a][0]  << endl; 
819                                 m->mothurOutJustToLog(groupComb[a]  + "\t" + toString(utreeScores[a][0]) + "\n");
820                         }
821                 }
822                 
823         }
824         catch(exception& e) {
825                 m->errorOut(e, "UnifracUnweightedCommand", "printUWSummaryFile");
826                 exit(1);
827         }
828 }
829 /***********************************************************/
830 void UnifracUnweightedCommand::createPhylipFile(int i) {
831         try {
832                 string phylipFileName;
833         map<string, string> variables; 
834                 variables["[filename]"] = outputDir + m->getSimpleName(treefile);
835         variables["[tag]"] = toString(i+1);
836                 if ((outputForm == "lt") || (outputForm == "square")) {
837             variables["[tag2]"] = "unweighted.phylip";
838                         phylipFileName = getOutputFileName("phylip",variables);
839                         outputNames.push_back(phylipFileName); outputTypes["phylip"].push_back(phylipFileName); 
840                 }else { //column
841             variables["[tag2]"] = "unweighted.column";
842                         phylipFileName = getOutputFileName("column",variables);
843                         outputNames.push_back(phylipFileName); outputTypes["column"].push_back(phylipFileName); 
844                 }
845                 
846                 ofstream out;
847                 m->openOutputFile(phylipFileName, out);
848                 
849                 if ((outputForm == "lt") || (outputForm == "square")) {
850                         //output numSeqs
851                         out << m->getNumGroups() << endl;
852                 }
853                 
854                 //make matrix with scores in it
855                 vector< vector<float> > dists;  dists.resize(m->getNumGroups());
856                 for (int i = 0; i < m->getNumGroups(); i++) {
857                         dists[i].resize(m->getNumGroups(), 0.0);
858                 }
859                 
860                 //flip it so you can print it
861                 int count = 0;
862                 for (int r=0; r<m->getNumGroups(); r++) { 
863                         for (int l = 0; l < r; l++) {
864                                 dists[r][l] = utreeScores[count][0];
865                                 dists[l][r] = utreeScores[count][0];
866                                 count++;
867                         }
868                 }
869                 
870                 //output to file
871                 for (int r=0; r<m->getNumGroups(); r++) { 
872                         //output name
873                         string name = (m->getGroups())[r];
874                         if (name.length() < 10) { //pad with spaces to make compatible
875                                 while (name.length() < 10) {  name += " ";  }
876                         }
877                         
878                         if (outputForm == "lt") {
879                                 out << name << '\t';
880                         
881                                 //output distances
882                                 for (int l = 0; l < r; l++) {   out  << dists[r][l] << '\t';  }
883                                 out << endl;
884                         }else if (outputForm == "square") {
885                                 out << name << '\t';
886                                 
887                                 //output distances
888                                 for (int l = 0; l < m->getNumGroups(); l++) {   out << dists[r][l] << '\t';  }
889                                 out << endl;
890                         }else{
891                                 //output distances
892                                 for (int l = 0; l < r; l++) {   
893                                         string otherName = (m->getGroups())[l];
894                                         if (otherName.length() < 10) { //pad with spaces to make compatible
895                                                 while (otherName.length() < 10) {  otherName += " ";  }
896                                         }
897                                         
898                                         out  << name << '\t' << otherName << dists[r][l] << endl;  
899                                 }
900                         }
901                 }
902                 out.close();
903         }
904         catch(exception& e) {
905                 m->errorOut(e, "UnifracUnweightedCommand", "createPhylipFile");
906                 exit(1);
907         }
908 }
909 /***********************************************************/
910
911
912
913