]> git.donarmstrong.com Git - mothur.git/blob - subsample.cpp
Merge remote-tracking branch 'mothur/master'
[mothur.git] / subsample.cpp
1 //
2 //  subsample.cpp
3 //  Mothur
4 //
5 //  Created by Sarah Westcott on 4/2/12.
6 //  Copyright (c) 2012 Schloss Lab. All rights reserved.
7 //
8
9 #include "subsample.h"
10 //**********************************************************************************************************************
11 Tree* SubSample::getSample(Tree* T, TreeMap* tmap, TreeMap* newTmap, int size, map<string, string> originalNameMap) {
12     try {
13         Tree* newTree = NULL;
14         
15         map<string, vector<string> > newGroups;
16         vector<string> subsampledSeqs = getSample(tmap, size, newGroups);
17         
18         //remove seqs not in sample from treemap
19         for (map<string, vector<string> >::iterator it = newGroups.begin(); it != newGroups.end(); it++) {
20             for (int i = 0; i < (it->second).size(); i++) {
21                 newTmap->addSeq((it->second)[i], it->first);
22             }
23         }
24         
25         newTree = new Tree(newTmap);
26         newTree->getCopy(T, originalNameMap);
27         
28         return newTree;
29     }
30     catch(exception& e) {
31         m->errorOut(e, "SubSample", "getSample-Tree");
32         exit(1);
33     }
34 }
35 /**********************************************************************************************************************
36 Tree* SubSample::getSample(Tree* T, TreeMap* tmap, map<string, string> whole, int size) {
37     try {
38         Tree* newTree = NULL;
39         
40         vector<string> subsampledSeqs = getSample(tmap, size);
41         map<string, string> sampledNameMap = deconvolute(whole, subsampledSeqs); 
42         
43         //remove seqs not in sample from treemap
44         for (int i = 0; i < tmap->namesOfSeqs.size(); i++) {
45             //is that name in the subsample?
46             int count = 0;
47             for (int j = 0; j < subsampledSeqs.size(); j++) {
48                 if (tmap->namesOfSeqs[i] == subsampledSeqs[j]) { break; } //found it
49                 count++;
50             }
51
52             if (m->control_pressed) { return newTree; }
53             
54             //if you didnt find it, remove it 
55             if (count == subsampledSeqs.size()) { 
56                 tmap->removeSeq(tmap->namesOfSeqs[i]);
57                 i--; //need this because removeSeq removes name from namesOfSeqs
58             }
59         }
60         
61         //create new tree
62         int numUniques = sampledNameMap.size();
63         if (sampledNameMap.size() == 0) { numUniques = subsampledSeqs.size(); }
64         
65         newTree = new Tree(numUniques, tmap); //numNodes, treemap
66         newTree->getSubTree(T, subsampledSeqs, sampledNameMap);
67         
68         return newTree;
69     }
70     catch(exception& e) {
71         m->errorOut(e, "SubSample", "getSample-Tree");
72         exit(1);
73     }
74 }*/
75 //**********************************************************************************************************************
76 //assumes whole maps dupName -> uniqueName
77 map<string, string> SubSample::deconvolute(map<string, string> whole, vector<string>& wanted) {
78     try {
79         map<string, string> nameMap;
80         
81         //whole will be empty if user gave no name file, so we don't need to make a new one
82         if (whole.size() == 0) { return nameMap; }
83         
84         vector<string> newWanted;
85         for (int i = 0; i < wanted.size(); i++) {
86             
87             if (m->control_pressed) { break; }
88             
89             string dupName = wanted[i];
90             
91             map<string, string>::iterator itWhole = whole.find(dupName);
92             if (itWhole != whole.end()) {
93                 string repName = itWhole->second;
94                 
95                 //do we already have this rep?
96                 map<string, string>::iterator itName = nameMap.find(repName);
97                 if (itName != nameMap.end()) { //add this seqs to dups list
98                     (itName->second) += "," + dupName;
99                 }else { //first sighting of this seq
100                     nameMap[repName] = dupName;
101                     newWanted.push_back(repName);
102                 }
103             }else { m->mothurOut("[ERROR]: "+dupName+" is not in your name file, please correct.\n"); m->control_pressed = true; }
104         }
105         
106         wanted = newWanted;
107         return nameMap;
108     }
109         catch(exception& e) {
110                 m->errorOut(e, "SubSample", "deconvolute");
111                 exit(1);
112         }
113 }
114 //**********************************************************************************************************************
115 vector<string> SubSample::getSample(TreeMap* tMap, int size, map<string, vector<string> >& sample) {
116     try {
117         vector<string> temp2;
118         sample["doNotIncludeMe"] = temp2;
119         
120         vector<string> namesInSample;
121         
122         vector<string> Groups = tMap->getNamesOfGroups();    
123         for (int i = 0; i < Groups.size(); i++) {
124             
125             if (m->inUsersGroups(Groups[i], m->getGroups())) {
126                 if (m->control_pressed) { break; }
127                 
128                 vector<string> thisGroup; thisGroup.push_back(Groups[i]);
129                 vector<string> thisGroupsSeqs = tMap->getNamesSeqs(thisGroup);
130                 int thisSize = thisGroupsSeqs.size();
131                 vector<string> temp;
132                 sample[Groups[i]] = temp;
133                 
134                 if (thisSize >= size) { 
135                     
136                     random_shuffle(thisGroupsSeqs.begin(), thisGroupsSeqs.end());
137                     
138                     for (int j = 0; j < size; j++) { sample[Groups[i]].push_back(thisGroupsSeqs[j]); namesInSample.push_back(thisGroupsSeqs[j]); }
139                     for (int j = size; j < thisSize; j++) { sample["doNotIncludeMe"].push_back(thisGroupsSeqs[j]); }
140                     
141                 }else {  m->mothurOut("[ERROR]: You have selected a size that is larger than "+Groups[i]+" number of sequences.\n"); m->control_pressed = true; }
142             }
143         } 
144         
145         return namesInSample;
146     }
147         catch(exception& e) {
148                 m->errorOut(e, "SubSample", "getSample-TreeMap");
149                 exit(1);
150         }
151 }       
152
153 //**********************************************************************************************************************
154 vector<string> SubSample::getSample(TreeMap* tMap, int size) {
155     try {
156         vector<string> sample;
157         
158         vector<string> Groups = tMap->getNamesOfGroups();    
159         for (int i = 0; i < Groups.size(); i++) {
160             
161             if (m->inUsersGroups(Groups[i], m->getGroups())) {
162                 if (m->control_pressed) { break; }
163                 
164                 vector<string> thisGroup; thisGroup.push_back(Groups[i]);
165                 vector<string> thisGroupsSeqs = tMap->getNamesSeqs(thisGroup);
166                 int thisSize = thisGroupsSeqs.size();
167                 
168                 if (thisSize >= size) { 
169                     
170                     random_shuffle(thisGroupsSeqs.begin(), thisGroupsSeqs.end());
171                     
172                     for (int j = 0; j < size; j++) { sample.push_back(thisGroupsSeqs[j]); }
173                 }else {  m->mothurOut("[ERROR]: You have selected a size that is larger than "+Groups[i]+" number of sequences.\n"); m->control_pressed = true; }
174             }
175         } 
176         
177         return sample;
178     }
179         catch(exception& e) {
180                 m->errorOut(e, "SubSample", "getSample-TreeMap");
181                 exit(1);
182         }
183 }       
184 //**********************************************************************************************************************
185 vector<string> SubSample::getSample(TreeMap* tMap, vector<string> Groups) {
186     try {
187         vector<string> sample;
188         
189         //vector<string> Groups = tMap->getNamesOfGroups();    
190         for (int i = 0; i < Groups.size(); i++) {
191             
192             if (m->control_pressed) { break; }
193             
194             vector<string> thisGroup; thisGroup.push_back(Groups[i]);
195             vector<string> thisGroupsSeqs = tMap->getNamesSeqs(thisGroup);
196             int thisSize = thisGroupsSeqs.size();
197                 
198             for (int j = 0; j < thisSize; j++) { sample.push_back(thisGroupsSeqs[j]); }
199         } 
200         
201         return sample;
202     }
203         catch(exception& e) {
204                 m->errorOut(e, "SubSample", "getSample-TreeMap");
205                 exit(1);
206         }
207 }       
208 //**********************************************************************************************************************
209 vector<string> SubSample::getSample(vector<SharedRAbundVector*>& thislookup, int size) {
210         try {
211                 
212                 //save mothurOut's binLabels to restore for next label
213                 vector<string> saveBinLabels = m->currentBinLabels;
214                 
215                 int numBins = thislookup[0]->getNumBins();
216                 for (int i = 0; i < thislookup.size(); i++) {           
217                         int thisSize = thislookup[i]->getNumSeqs();
218                         
219                         if (thisSize != size) {
220                                 
221                                 string thisgroup = thislookup[i]->getGroup();
222                                 
223                                 OrderVector order;
224                                 for(int p=0;p<numBins;p++){
225                                         for(int j=0;j<thislookup[i]->getAbundance(p);j++){
226                                                 order.push_back(p);
227                                         }
228                                 }
229                                 random_shuffle(order.begin(), order.end());
230                                 
231                                 SharedRAbundVector* temp = new SharedRAbundVector(numBins);
232                                 temp->setLabel(thislookup[i]->getLabel());
233                                 temp->setGroup(thislookup[i]->getGroup());
234                                 
235                                 delete thislookup[i];
236                                 thislookup[i] = temp;
237                                 
238                                 
239                                 for (int j = 0; j < size; j++) {
240                                         
241                                         if (m->control_pressed) {  return m->currentBinLabels; }
242                                         
243                                         int bin = order.get(j);
244                                         
245                                         int abund = thislookup[i]->getAbundance(bin);
246                                         thislookup[i]->set(bin, (abund+1), thisgroup);
247                                 }       
248                         }
249                 }
250                 
251                 //subsampling may have created some otus with no sequences in them
252                 eliminateZeroOTUS(thislookup);
253         
254                 if (m->control_pressed) { return m->currentBinLabels; }
255                 
256                 //save mothurOut's binLabels to restore for next label
257         vector<string> subsampleBinLabels = m->currentBinLabels;
258                 m->currentBinLabels = saveBinLabels;
259                 
260                 return subsampleBinLabels;
261                 
262         }
263         catch(exception& e) {
264                 m->errorOut(e, "SubSample", "getSample-shared");
265                 exit(1);
266         }
267 }       
268 //**********************************************************************************************************************
269 int SubSample::eliminateZeroOTUS(vector<SharedRAbundVector*>& thislookup) {
270         try {
271                 
272                 vector<SharedRAbundVector*> newLookup;
273                 for (int i = 0; i < thislookup.size(); i++) {
274                         SharedRAbundVector* temp = new SharedRAbundVector();
275                         temp->setLabel(thislookup[i]->getLabel());
276                         temp->setGroup(thislookup[i]->getGroup());
277                         newLookup.push_back(temp);
278                 }
279                 
280                 //for each bin
281                 vector<string> newBinLabels;
282                 string snumBins = toString(thislookup[0]->getNumBins());
283                 for (int i = 0; i < thislookup[0]->getNumBins(); i++) {
284                         if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) {  delete newLookup[j];  } return 0; }
285                         
286                         //look at each sharedRabund and make sure they are not all zero
287                         bool allZero = true;
288                         for (int j = 0; j < thislookup.size(); j++) {
289                                 if (thislookup[j]->getAbundance(i) != 0) { allZero = false;  break;  }
290                         }
291                         
292                         //if they are not all zero add this bin
293                         if (!allZero) {
294                                 for (int j = 0; j < thislookup.size(); j++) {
295                                         newLookup[j]->push_back(thislookup[j]->getAbundance(i), thislookup[j]->getGroup());
296                                 }
297                                 //if there is a bin label use it otherwise make one
298                                 string binLabel = "Otu";
299                                 string sbinNumber = toString(i+1);
300                                 if (sbinNumber.length() < snumBins.length()) { 
301                                         int diff = snumBins.length() - sbinNumber.length();
302                                         for (int h = 0; h < diff; h++) { binLabel += "0"; }
303                                 }
304                                 binLabel += sbinNumber; 
305                                 if (i < m->currentBinLabels.size()) {  binLabel = m->currentBinLabels[i]; }
306                                 
307                                 newBinLabels.push_back(binLabel);
308                         }
309                 }
310                 
311                 for (int j = 0; j < thislookup.size(); j++) {  delete thislookup[j];  }
312                 thislookup.clear();
313                 
314                 thislookup = newLookup;
315                 m->currentBinLabels = newBinLabels;
316                 
317                 return 0;
318                 
319         }
320         catch(exception& e) {
321                 m->errorOut(e, "SubSample", "eliminateZeroOTUS");
322                 exit(1);
323         }
324 }
325 //**********************************************************************************************************************
326 int SubSample::getSample(SAbundVector*& sabund, int size) {
327         try {
328                 
329         OrderVector* order = new OrderVector();
330         *order = sabund->getOrderVector(NULL);
331         
332                 int numBins = order->getNumBins();
333                 int thisSize = order->getNumSeqs();
334         
335                 if (thisSize > size) {
336                         random_shuffle(order->begin(), order->end());
337                         
338             RAbundVector* rabund = new RAbundVector(numBins);
339                         rabund->setLabel(order->getLabel());
340
341                         for (int j = 0; j < size; j++) {
342                 
343                                 if (m->control_pressed) { delete order; delete rabund; return 0; }
344                                 
345                                 int bin = order->get(j);
346                                 
347                                 int abund = rabund->get(bin);
348                                 rabund->set(bin, (abund+1));
349                         }
350                         
351             delete sabund;
352             sabund = new SAbundVector();
353             *sabund = rabund->getSAbundVector();
354             delete rabund;
355             
356                 }else if (thisSize < size) { m->mothurOut("[ERROR]: The size you requested is larger than the number of sequences in the sabund vector. You requested " + toString(size) + " and you only have " + toString(thisSize) + " seqs in your sabund vector.\n"); m->control_pressed = true; }
357                 
358                 if (m->control_pressed) { return 0; }
359         
360                 delete order;
361                 
362                 return 0;
363                 
364         }
365         catch(exception& e) {
366                 m->errorOut(e, "SubSampleCommand", "getSample");
367                 exit(1);
368         }
369 }                       
370 //**********************************************************************************************************************
371
372