]> git.donarmstrong.com Git - mothur.git/blob - subsample.cpp
major change to the tree class to use the count table class instead of tree map....
[mothur.git] / subsample.cpp
1 //
2 //  subsample.cpp
3 //  Mothur
4 //
5 //  Created by Sarah Westcott on 4/2/12.
6 //  Copyright (c) 2012 Schloss Lab. All rights reserved.
7 //
8
9 #include "subsample.h"
10 //**********************************************************************************************************************
11 Tree* SubSample::getSample(Tree* T, CountTable* ct, CountTable* newCt, int size) {
12     try {
13         Tree* newTree = NULL;
14         
15         //remove seqs not in sample from counttable
16         vector<string> Groups = ct->getNamesOfGroups();
17         newCt->copy(ct); 
18         newCt->addGroup("doNotIncludeMe");
19         
20         map<string, int> doNotIncludeTotals; 
21         vector<string> namesSeqs = ct->getNamesOfSeqs();
22         for (int i = 0; i < namesSeqs.size(); i++) {  doNotIncludeTotals[namesSeqs[i]] = 0; }
23     
24         for (int i = 0; i < Groups.size(); i++) {
25             if (m->inUsersGroups(Groups[i], m->getGroups())) {
26                 if (m->control_pressed) { break; }
27         
28                 int thisSize = ct->getGroupCount(Groups[i]);
29                 
30                 if (thisSize >= size) { 
31                     
32                     vector<string> names = ct->getNamesOfSeqs(Groups[i]);
33                     vector<int> random;
34                     for (int j = 0; j < names.size(); j++) {
35                         int num = ct->getGroupCount(names[j], Groups[i]);
36                         for (int k = 0; k < num; k++) { random.push_back(j); }
37                     }
38                     random_shuffle(random.begin(), random.end());
39                     
40                     vector<int> sampleRandoms; sampleRandoms.resize(names.size(), 0);
41                     for (int j = 0; j < size; j++) { sampleRandoms[random[j]]++; }
42                     for (int j = 0; j < sampleRandoms.size(); j++) {
43                         newCt->setAbund(names[j], Groups[i], sampleRandoms[j]);
44                     }
45                     sampleRandoms.clear(); sampleRandoms.resize(names.size(), 0);
46                     for (int j = size; j < thisSize; j++) { sampleRandoms[random[j]]++; }
47                     for (int j = 0; j < sampleRandoms.size(); j++) {  doNotIncludeTotals[names[j]] += sampleRandoms[j]; }
48                 }else {  m->mothurOut("[ERROR]: You have selected a size that is larger than "+Groups[i]+" number of sequences.\n"); m->control_pressed = true; }
49             }
50
51         }
52         
53         for (map<string, int>::iterator it = doNotIncludeTotals.begin(); it != doNotIncludeTotals.end(); it++) {  
54             newCt->setAbund(it->first, "doNotIncludeMe", it->second);
55         } 
56         
57         newTree = new Tree(newCt);
58         newTree->getCopy(T, true);
59         
60         return newTree;
61     }
62     catch(exception& e) {
63         m->errorOut(e, "SubSample", "getSample-Tree");
64         exit(1);
65     }
66 }
67 //**********************************************************************************************************************
68 //assumes whole maps dupName -> uniqueName
69 map<string, string> SubSample::deconvolute(map<string, string> whole, vector<string>& wanted) {
70     try {
71         map<string, string> nameMap;
72         
73         //whole will be empty if user gave no name file, so we don't need to make a new one
74         if (whole.size() == 0) { return nameMap; }
75         
76         vector<string> newWanted;
77         for (int i = 0; i < wanted.size(); i++) {
78             
79             if (m->control_pressed) { break; }
80             
81             string dupName = wanted[i];
82             
83             map<string, string>::iterator itWhole = whole.find(dupName);
84             if (itWhole != whole.end()) {
85                 string repName = itWhole->second;
86                 
87                 //do we already have this rep?
88                 map<string, string>::iterator itName = nameMap.find(repName);
89                 if (itName != nameMap.end()) { //add this seqs to dups list
90                     (itName->second) += "," + dupName;
91                 }else { //first sighting of this seq
92                     nameMap[repName] = dupName;
93                     newWanted.push_back(repName);
94                 }
95             }else { m->mothurOut("[ERROR]: "+dupName+" is not in your name file, please correct.\n"); m->control_pressed = true; }
96         }
97         
98         wanted = newWanted;
99         return nameMap;
100     }
101         catch(exception& e) {
102                 m->errorOut(e, "SubSample", "deconvolute");
103                 exit(1);
104         }
105 }
106 //**********************************************************************************************************************
107 vector<string> SubSample::getSample(vector<SharedRAbundVector*>& thislookup, int size) {
108         try {
109                 
110                 //save mothurOut's binLabels to restore for next label
111                 vector<string> saveBinLabels = m->currentBinLabels;
112                 
113                 int numBins = thislookup[0]->getNumBins();
114                 for (int i = 0; i < thislookup.size(); i++) {           
115                         int thisSize = thislookup[i]->getNumSeqs();
116                         
117                         if (thisSize != size) {
118                                 
119                                 string thisgroup = thislookup[i]->getGroup();
120                                 
121                                 OrderVector order;
122                                 for(int p=0;p<numBins;p++){
123                                         for(int j=0;j<thislookup[i]->getAbundance(p);j++){
124                                                 order.push_back(p);
125                                         }
126                                 }
127                                 random_shuffle(order.begin(), order.end());
128                                 
129                                 SharedRAbundVector* temp = new SharedRAbundVector(numBins);
130                                 temp->setLabel(thislookup[i]->getLabel());
131                                 temp->setGroup(thislookup[i]->getGroup());
132                                 
133                                 delete thislookup[i];
134                                 thislookup[i] = temp;
135                                 
136                                 
137                                 for (int j = 0; j < size; j++) {
138                                         
139                                         if (m->control_pressed) {  return m->currentBinLabels; }
140                                         
141                                         int bin = order.get(j);
142                                         
143                                         int abund = thislookup[i]->getAbundance(bin);
144                                         thislookup[i]->set(bin, (abund+1), thisgroup);
145                                 }       
146                         }
147                 }
148                 
149                 //subsampling may have created some otus with no sequences in them
150                 eliminateZeroOTUS(thislookup);
151         
152                 if (m->control_pressed) { return m->currentBinLabels; }
153                 
154                 //save mothurOut's binLabels to restore for next label
155         vector<string> subsampleBinLabels = m->currentBinLabels;
156                 m->currentBinLabels = saveBinLabels;
157                 
158                 return subsampleBinLabels;
159                 
160         }
161         catch(exception& e) {
162                 m->errorOut(e, "SubSample", "getSample-shared");
163                 exit(1);
164         }
165 }       
166 //**********************************************************************************************************************
167 int SubSample::eliminateZeroOTUS(vector<SharedRAbundVector*>& thislookup) {
168         try {
169                 
170                 vector<SharedRAbundVector*> newLookup;
171                 for (int i = 0; i < thislookup.size(); i++) {
172                         SharedRAbundVector* temp = new SharedRAbundVector();
173                         temp->setLabel(thislookup[i]->getLabel());
174                         temp->setGroup(thislookup[i]->getGroup());
175                         newLookup.push_back(temp);
176                 }
177                 
178                 //for each bin
179                 vector<string> newBinLabels;
180                 string snumBins = toString(thislookup[0]->getNumBins());
181                 for (int i = 0; i < thislookup[0]->getNumBins(); i++) {
182                         if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) {  delete newLookup[j];  } return 0; }
183                         
184                         //look at each sharedRabund and make sure they are not all zero
185                         bool allZero = true;
186                         for (int j = 0; j < thislookup.size(); j++) {
187                                 if (thislookup[j]->getAbundance(i) != 0) { allZero = false;  break;  }
188                         }
189                         
190                         //if they are not all zero add this bin
191                         if (!allZero) {
192                                 for (int j = 0; j < thislookup.size(); j++) {
193                                         newLookup[j]->push_back(thislookup[j]->getAbundance(i), thislookup[j]->getGroup());
194                                 }
195                                 //if there is a bin label use it otherwise make one
196                                 string binLabel = "Otu";
197                                 string sbinNumber = toString(i+1);
198                                 if (sbinNumber.length() < snumBins.length()) { 
199                                         int diff = snumBins.length() - sbinNumber.length();
200                                         for (int h = 0; h < diff; h++) { binLabel += "0"; }
201                                 }
202                                 binLabel += sbinNumber; 
203                                 if (i < m->currentBinLabels.size()) {  binLabel = m->currentBinLabels[i]; }
204                                 
205                                 newBinLabels.push_back(binLabel);
206                         }
207                 }
208                 
209                 for (int j = 0; j < thislookup.size(); j++) {  delete thislookup[j];  }
210                 thislookup.clear();
211                 
212                 thislookup = newLookup;
213                 m->currentBinLabels = newBinLabels;
214                 
215                 return 0;
216                 
217         }
218         catch(exception& e) {
219                 m->errorOut(e, "SubSample", "eliminateZeroOTUS");
220                 exit(1);
221         }
222 }
223 //**********************************************************************************************************************
224 int SubSample::getSample(SAbundVector*& sabund, int size) {
225         try {
226                 
227         OrderVector* order = new OrderVector();
228         *order = sabund->getOrderVector(NULL);
229         
230                 int numBins = order->getNumBins();
231                 int thisSize = order->getNumSeqs();
232         
233                 if (thisSize > size) {
234                         random_shuffle(order->begin(), order->end());
235                         
236             RAbundVector* rabund = new RAbundVector(numBins);
237                         rabund->setLabel(order->getLabel());
238
239                         for (int j = 0; j < size; j++) {
240                 
241                                 if (m->control_pressed) { delete order; delete rabund; return 0; }
242                                 
243                                 int bin = order->get(j);
244                                 
245                                 int abund = rabund->get(bin);
246                                 rabund->set(bin, (abund+1));
247                         }
248                         
249             delete sabund;
250             sabund = new SAbundVector();
251             *sabund = rabund->getSAbundVector();
252             delete rabund;
253             
254                 }else if (thisSize < size) { m->mothurOut("[ERROR]: The size you requested is larger than the number of sequences in the sabund vector. You requested " + toString(size) + " and you only have " + toString(thisSize) + " seqs in your sabund vector.\n"); m->control_pressed = true; }
255                 
256                 if (m->control_pressed) { return 0; }
257         
258                 delete order;
259                 
260                 return 0;
261                 
262         }
263         catch(exception& e) {
264                 m->errorOut(e, "SubSampleCommand", "getSample");
265                 exit(1);
266         }
267 }                       
268 //**********************************************************************************************************************
269
270