]> git.donarmstrong.com Git - mothur.git/blob - subsample.cpp
added subsample and consensus parameters to unifrac.weighted command
[mothur.git] / subsample.cpp
1 //
2 //  subsample.cpp
3 //  Mothur
4 //
5 //  Created by Sarah Westcott on 4/2/12.
6 //  Copyright (c) 2012 Schloss Lab. All rights reserved.
7 //
8
9 #include "subsample.h"
10
11 //**********************************************************************************************************************
12 Tree* SubSample::getSample(Tree* T, TreeMap* tmap, map<string, string> whole, int size) {
13     try {
14         Tree* newTree = NULL;
15         
16         vector<string> subsampledSeqs = getSample(tmap, size);
17         map<string, string> sampledNameMap = deconvolute(whole, subsampledSeqs); 
18         
19         //remove seqs not in sample from treemap
20         for (int i = 0; i < tmap->namesOfSeqs.size(); i++) {
21             //is that name in the subsample?
22             int count = 0;
23             for (int j = 0; j < subsampledSeqs.size(); j++) {
24                 if (tmap->namesOfSeqs[i] == subsampledSeqs[j]) { break; } //found it
25                 count++;
26             }
27
28             if (m->control_pressed) { return newTree; }
29             
30             //if you didnt find it, remove it 
31             if (count == subsampledSeqs.size()) { 
32                 tmap->removeSeq(tmap->namesOfSeqs[i]);
33                 i--; //need this because removeSeq removes name from namesOfSeqs
34             }
35         }
36         
37         //create new tree
38         int numUniques = sampledNameMap.size();
39         if (sampledNameMap.size() == 0) { numUniques = subsampledSeqs.size(); }
40         
41         newTree = new Tree(numUniques, tmap); //numNodes, treemap
42         newTree->getSubTree(T, subsampledSeqs, sampledNameMap);
43         
44         return newTree;
45     }
46     catch(exception& e) {
47         m->errorOut(e, "SubSample", "getSample-Tree");
48         exit(1);
49     }
50 }       
51 //**********************************************************************************************************************
52 //assumes whole maps dupName -> uniqueName
53 map<string, string> SubSample::deconvolute(map<string, string> whole, vector<string>& wanted) {
54     try {
55         map<string, string> nameMap;
56         
57         //whole will be empty if user gave no name file, so we don't need to make a new one
58         if (whole.size() == 0) { return nameMap; }
59         
60         vector<string> newWanted;
61         for (int i = 0; i < wanted.size(); i++) {
62             
63             if (m->control_pressed) { break; }
64             
65             string dupName = wanted[i];
66             
67             map<string, string>::iterator itWhole = whole.find(dupName);
68             if (itWhole != whole.end()) {
69                 string repName = itWhole->second;
70                 
71                 //do we already have this rep?
72                 map<string, string>::iterator itName = nameMap.find(repName);
73                 if (itName != nameMap.end()) { //add this seqs to dups list
74                     (itName->second) += "," + dupName;
75                 }else { //first sighting of this seq
76                     nameMap[repName] = dupName;
77                     newWanted.push_back(repName);
78                 }
79             }else { m->mothurOut("[ERROR]: "+dupName+" is not in your name file, please correct.\n"); m->control_pressed = true; }
80         }
81         
82         wanted = newWanted;
83         return nameMap;
84     }
85         catch(exception& e) {
86                 m->errorOut(e, "SubSample", "deconvolute");
87                 exit(1);
88         }
89 }       
90 //**********************************************************************************************************************
91 vector<string> SubSample::getSample(TreeMap* tMap, int size) {
92     try {
93         vector<string> sample;
94         
95         vector<string> Groups = tMap->getNamesOfGroups();    
96         for (int i = 0; i < Groups.size(); i++) {
97             
98             if (m->control_pressed) { break; }
99             
100             vector<string> thisGroup; thisGroup.push_back(Groups[i]);
101             vector<string> thisGroupsSeqs = tMap->getNamesSeqs(thisGroup);
102             int thisSize = thisGroupsSeqs.size();
103             
104             if (thisSize >= size) {     
105                 
106                 random_shuffle(thisGroupsSeqs.begin(), thisGroupsSeqs.end());
107                 
108                 for (int j = 0; j < size; j++) { sample.push_back(thisGroupsSeqs[j]); }
109             }else {  m->mothurOut("[ERROR]: You have selected a size that is larger than "+Groups[i]+" number of sequences.\n"); m->control_pressed = true; }
110         } 
111         
112         return sample;
113     }
114         catch(exception& e) {
115                 m->errorOut(e, "SubSample", "getSample-TreeMap");
116                 exit(1);
117         }
118 }       
119 //**********************************************************************************************************************
120 vector<string> SubSample::getSample(vector<SharedRAbundVector*>& thislookup, int size) {
121         try {
122                 
123                 //save mothurOut's binLabels to restore for next label
124                 vector<string> saveBinLabels = m->currentBinLabels;
125                 
126                 int numBins = thislookup[0]->getNumBins();
127                 for (int i = 0; i < thislookup.size(); i++) {           
128                         int thisSize = thislookup[i]->getNumSeqs();
129                         
130                         if (thisSize != size) {
131                                 
132                                 string thisgroup = thislookup[i]->getGroup();
133                                 
134                                 OrderVector order;
135                                 for(int p=0;p<numBins;p++){
136                                         for(int j=0;j<thislookup[i]->getAbundance(p);j++){
137                                                 order.push_back(p);
138                                         }
139                                 }
140                                 random_shuffle(order.begin(), order.end());
141                                 
142                                 SharedRAbundVector* temp = new SharedRAbundVector(numBins);
143                                 temp->setLabel(thislookup[i]->getLabel());
144                                 temp->setGroup(thislookup[i]->getGroup());
145                                 
146                                 delete thislookup[i];
147                                 thislookup[i] = temp;
148                                 
149                                 
150                                 for (int j = 0; j < size; j++) {
151                                         
152                                         if (m->control_pressed) {  return m->currentBinLabels; }
153                                         
154                                         int bin = order.get(j);
155                                         
156                                         int abund = thislookup[i]->getAbundance(bin);
157                                         thislookup[i]->set(bin, (abund+1), thisgroup);
158                                 }       
159                         }
160                 }
161                 
162                 //subsampling may have created some otus with no sequences in them
163                 eliminateZeroOTUS(thislookup);
164                 
165                 if (m->control_pressed) { return m->currentBinLabels; }
166                 
167                 //save mothurOut's binLabels to restore for next label
168         vector<string> subsampleBinLabels = m->currentBinLabels;
169                 m->currentBinLabels = saveBinLabels;
170                 
171                 return subsampleBinLabels;
172                 
173         }
174         catch(exception& e) {
175                 m->errorOut(e, "SubSample", "getSample-shared");
176                 exit(1);
177         }
178 }       
179 //**********************************************************************************************************************
180 int SubSample::eliminateZeroOTUS(vector<SharedRAbundVector*>& thislookup) {
181         try {
182                 
183                 vector<SharedRAbundVector*> newLookup;
184                 for (int i = 0; i < thislookup.size(); i++) {
185                         SharedRAbundVector* temp = new SharedRAbundVector();
186                         temp->setLabel(thislookup[i]->getLabel());
187                         temp->setGroup(thislookup[i]->getGroup());
188                         newLookup.push_back(temp);
189                 }
190                 
191                 //for each bin
192                 vector<string> newBinLabels;
193                 string snumBins = toString(thislookup[0]->getNumBins());
194                 for (int i = 0; i < thislookup[0]->getNumBins(); i++) {
195                         if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) {  delete newLookup[j];  } return 0; }
196                         
197                         //look at each sharedRabund and make sure they are not all zero
198                         bool allZero = true;
199                         for (int j = 0; j < thislookup.size(); j++) {
200                                 if (thislookup[j]->getAbundance(i) != 0) { allZero = false;  break;  }
201                         }
202                         
203                         //if they are not all zero add this bin
204                         if (!allZero) {
205                                 for (int j = 0; j < thislookup.size(); j++) {
206                                         newLookup[j]->push_back(thislookup[j]->getAbundance(i), thislookup[j]->getGroup());
207                                 }
208                                 //if there is a bin label use it otherwise make one
209                                 string binLabel = "Otu";
210                                 string sbinNumber = toString(i+1);
211                                 if (sbinNumber.length() < snumBins.length()) { 
212                                         int diff = snumBins.length() - sbinNumber.length();
213                                         for (int h = 0; h < diff; h++) { binLabel += "0"; }
214                                 }
215                                 binLabel += sbinNumber; 
216                                 if (i < m->currentBinLabels.size()) {  binLabel = m->currentBinLabels[i]; }
217                                 
218                                 newBinLabels.push_back(binLabel);
219                         }
220                 }
221                 
222                 for (int j = 0; j < thislookup.size(); j++) {  delete thislookup[j];  }
223                 thislookup.clear();
224                 
225                 thislookup = newLookup;
226                 m->currentBinLabels = newBinLabels;
227                 
228                 return 0;
229                 
230         }
231         catch(exception& e) {
232                 m->errorOut(e, "SubSample", "eliminateZeroOTUS");
233                 exit(1);
234         }
235 }
236
237
238 //**********************************************************************************************************************
239
240