]> git.donarmstrong.com Git - mothur.git/blob - subsample.cpp
working on pam
[mothur.git] / subsample.cpp
1 //
2 //  subsample.cpp
3 //  Mothur
4 //
5 //  Created by Sarah Westcott on 4/2/12.
6 //  Copyright (c) 2012 Schloss Lab. All rights reserved.
7 //
8
9 #include "subsample.h"
10 //**********************************************************************************************************************
11 Tree* SubSample::getSample(Tree* T, CountTable* ct, CountTable* newCt, int size) {
12     try {
13         Tree* newTree = NULL;
14         
15         //remove seqs not in sample from counttable
16         vector<string> Groups = ct->getNamesOfGroups();
17         newCt->copy(ct); 
18         newCt->addGroup("doNotIncludeMe");
19         
20         map<string, int> doNotIncludeTotals; 
21         vector<string> namesSeqs = ct->getNamesOfSeqs();
22         for (int i = 0; i < namesSeqs.size(); i++) {  doNotIncludeTotals[namesSeqs[i]] = 0; }
23     
24         for (int i = 0; i < Groups.size(); i++) {
25             if (m->inUsersGroups(Groups[i], m->getGroups())) {
26                 if (m->control_pressed) { break; }
27         
28                 int thisSize = ct->getGroupCount(Groups[i]);
29                 
30                 if (thisSize >= size) { 
31                     
32                     vector<string> names = ct->getNamesOfSeqs(Groups[i]);
33                     vector<int> random;
34                     for (int j = 0; j < names.size(); j++) {
35                         int num = ct->getGroupCount(names[j], Groups[i]);
36                         for (int k = 0; k < num; k++) { random.push_back(j); }
37                     }
38                     random_shuffle(random.begin(), random.end());
39                     
40                     vector<int> sampleRandoms; sampleRandoms.resize(names.size(), 0);
41                     for (int j = 0; j < size; j++) { sampleRandoms[random[j]]++; }
42                     for (int j = 0; j < sampleRandoms.size(); j++) {
43                         newCt->setAbund(names[j], Groups[i], sampleRandoms[j]);
44                     }
45                     sampleRandoms.clear(); sampleRandoms.resize(names.size(), 0);
46                     for (int j = size; j < thisSize; j++) { sampleRandoms[random[j]]++; }
47                     for (int j = 0; j < sampleRandoms.size(); j++) {  doNotIncludeTotals[names[j]] += sampleRandoms[j]; }
48                 }else {  m->mothurOut("[ERROR]: You have selected a size that is larger than "+Groups[i]+" number of sequences.\n"); m->control_pressed = true; }
49             }
50
51         }
52         
53         for (map<string, int>::iterator it = doNotIncludeTotals.begin(); it != doNotIncludeTotals.end(); it++) {  
54             newCt->setAbund(it->first, "doNotIncludeMe", it->second);
55         } 
56         
57         newTree = new Tree(newCt);
58         newTree->getCopy(T, true);
59         
60         return newTree;
61     }
62     catch(exception& e) {
63         m->errorOut(e, "SubSample", "getSample-Tree");
64         exit(1);
65     }
66 }
67 //**********************************************************************************************************************
68 //assumes whole maps dupName -> uniqueName
69 map<string, string> SubSample::deconvolute(map<string, string> whole, vector<string>& wanted) {
70     try {
71         map<string, string> nameMap;
72         
73         //whole will be empty if user gave no name file, so we don't need to make a new one
74         if (whole.size() == 0) { return nameMap; }
75         
76         vector<string> newWanted;
77         for (int i = 0; i < wanted.size(); i++) {
78             
79             if (m->control_pressed) { break; }
80             
81             string dupName = wanted[i];
82             
83             map<string, string>::iterator itWhole = whole.find(dupName);
84             if (itWhole != whole.end()) {
85                 string repName = itWhole->second;
86                 
87                 //do we already have this rep?
88                 map<string, string>::iterator itName = nameMap.find(repName);
89                 if (itName != nameMap.end()) { //add this seqs to dups list
90                     (itName->second) += "," + dupName;
91                 }else { //first sighting of this seq
92                     nameMap[repName] = dupName;
93                     newWanted.push_back(repName);
94                 }
95             }else { m->mothurOut("[ERROR]: "+dupName+" is not in your name file, please correct.\n"); m->control_pressed = true; }
96         }
97         
98         wanted = newWanted;
99         return nameMap;
100     }
101         catch(exception& e) {
102                 m->errorOut(e, "SubSample", "deconvolute");
103                 exit(1);
104         }
105 }
106 //**********************************************************************************************************************
107 vector<string> SubSample::getSample(vector<SharedRAbundVector*>& thislookup, int size) {
108         try {
109                 
110                 //save mothurOut's binLabels to restore for next label
111                 vector<string> saveBinLabels = m->currentSharedBinLabels;
112                 
113                 int numBins = thislookup[0]->getNumBins();
114                 for (int i = 0; i < thislookup.size(); i++) {           
115                         int thisSize = thislookup[i]->getNumSeqs();
116                         
117                         if (thisSize != size) {
118                                 
119                                 string thisgroup = thislookup[i]->getGroup();
120                                 
121                                 OrderVector order;
122                                 for(int p=0;p<numBins;p++){
123                                         for(int j=0;j<thislookup[i]->getAbundance(p);j++){
124                                                 order.push_back(p);
125                                         }
126                                 }
127                                 random_shuffle(order.begin(), order.end());
128                                 
129                                 SharedRAbundVector* temp = new SharedRAbundVector(numBins);
130                                 temp->setLabel(thislookup[i]->getLabel());
131                                 temp->setGroup(thislookup[i]->getGroup());
132                                 
133                                 delete thislookup[i];
134                                 thislookup[i] = temp;
135                                 
136                                 
137                                 for (int j = 0; j < size; j++) {
138                                         
139                                         if (m->control_pressed) {  return m->currentSharedBinLabels; }
140                                         
141                                         int bin = order.get(j);
142                                         
143                                         int abund = thislookup[i]->getAbundance(bin);
144                                         thislookup[i]->set(bin, (abund+1), thisgroup);
145                                 }       
146                         }
147                 }
148                 
149                 //subsampling may have created some otus with no sequences in them
150                 eliminateZeroOTUS(thislookup);
151         
152                 if (m->control_pressed) { return m->currentSharedBinLabels; }
153                 
154                 //save mothurOut's binLabels to restore for next label
155         vector<string> subsampleBinLabels = m->currentSharedBinLabels;
156                 m->currentSharedBinLabels = saveBinLabels;
157                 
158                 return subsampleBinLabels;
159                 
160         }
161         catch(exception& e) {
162                 m->errorOut(e, "SubSample", "getSample-shared");
163                 exit(1);
164         }
165 }       
166 //**********************************************************************************************************************
167 int SubSample::eliminateZeroOTUS(vector<SharedRAbundVector*>& thislookup) {
168         try {
169                 
170                 vector<SharedRAbundVector*> newLookup;
171                 for (int i = 0; i < thislookup.size(); i++) {
172                         SharedRAbundVector* temp = new SharedRAbundVector();
173                         temp->setLabel(thislookup[i]->getLabel());
174                         temp->setGroup(thislookup[i]->getGroup());
175                         newLookup.push_back(temp);
176                 }
177                 
178                 //for each bin
179                 vector<string> newBinLabels;
180                 string snumBins = toString(thislookup[0]->getNumBins());
181                 for (int i = 0; i < thislookup[0]->getNumBins(); i++) {
182                         if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) {  delete newLookup[j];  } return 0; }
183                         
184                         //look at each sharedRabund and make sure they are not all zero
185                         bool allZero = true;
186                         for (int j = 0; j < thislookup.size(); j++) {
187                                 if (thislookup[j]->getAbundance(i) != 0) { allZero = false;  break;  }
188                         }
189                         
190                         //if they are not all zero add this bin
191                         if (!allZero) {
192                                 for (int j = 0; j < thislookup.size(); j++) {
193                                         newLookup[j]->push_back(thislookup[j]->getAbundance(i), thislookup[j]->getGroup());
194                                 }
195                                 //if there is a bin label use it otherwise make one
196                                 string binLabel = "Otu";
197                                 string sbinNumber = toString(i+1);
198                                 if (sbinNumber.length() < snumBins.length()) { 
199                                         int diff = snumBins.length() - sbinNumber.length();
200                                         for (int h = 0; h < diff; h++) { binLabel += "0"; }
201                                 }
202                                 binLabel += sbinNumber; 
203                                 if (i < m->currentSharedBinLabels.size()) {  binLabel = m->currentSharedBinLabels[i]; }
204                                 
205                                 newBinLabels.push_back(binLabel);
206                         }
207                 }
208                 
209                 for (int j = 0; j < thislookup.size(); j++) {  delete thislookup[j];  }
210                 thislookup.clear();
211                 
212                 thislookup = newLookup;
213                 m->currentSharedBinLabels = newBinLabels;
214                 
215                 return 0;
216                 
217         }
218         catch(exception& e) {
219                 m->errorOut(e, "SubSample", "eliminateZeroOTUS");
220                 exit(1);
221         }
222 }
223 //**********************************************************************************************************************
224 int SubSample::getSample(SAbundVector*& sabund, int size) {
225         try {
226                 
227         OrderVector* order = new OrderVector();
228         *order = sabund->getOrderVector(NULL);
229         
230                 int numBins = order->getNumBins();
231                 int thisSize = order->getNumSeqs();
232         
233                 if (thisSize > size) {
234                         random_shuffle(order->begin(), order->end());
235                         
236             RAbundVector* rabund = new RAbundVector(numBins);
237                         rabund->setLabel(order->getLabel());
238
239                         for (int j = 0; j < size; j++) {
240                 
241                                 if (m->control_pressed) { delete order; delete rabund; return 0; }
242                                 
243                                 int bin = order->get(j);
244                                 
245                                 int abund = rabund->get(bin);
246                                 rabund->set(bin, (abund+1));
247                         }
248                         
249             delete sabund;
250             sabund = new SAbundVector();
251             *sabund = rabund->getSAbundVector();
252             delete rabund;
253             
254                 }else if (thisSize < size) { m->mothurOut("[ERROR]: The size you requested is larger than the number of sequences in the sabund vector. You requested " + toString(size) + " and you only have " + toString(thisSize) + " seqs in your sabund vector.\n"); m->control_pressed = true; }
255                 
256                 if (m->control_pressed) { return 0; }
257         
258                 delete order;
259                 
260                 return 0;
261                 
262         }
263         catch(exception& e) {
264                 m->errorOut(e, "SubSampleCommand", "getSample");
265                 exit(1);
266         }
267 }
268 //**********************************************************************************************************************
269 CountTable SubSample::getSample(CountTable& ct, int size, vector<string> Groups) {
270         try {
271         if (!ct.hasGroupInfo()) { m->mothurOut("[ERROR]: Cannot subsample by group because your count table doesn't have group information.\n"); m->control_pressed = true; }
272             
273         CountTable sampledCt;
274         map<string, vector<int> > tempCount;
275         for (int i = 0; i < Groups.size(); i++) {
276             sampledCt.addGroup(Groups[i]);
277             
278             vector<string> names = ct.getNamesOfSeqs(Groups[i]);
279             vector<string> allNames;
280             for (int j = 0; j < names.size(); j++) {
281                 
282                 if (m->control_pressed) { return sampledCt; }
283                 
284                 int num = ct. getGroupCount(names[j], Groups[i]);
285                 for (int k = 0; k < num; k++) { allNames.push_back(names[j]); }
286             }
287             
288             random_shuffle(allNames.begin(), allNames.end());
289             
290             if (allNames.size() < size) { m->mothurOut("[ERROR]: You have selected a size that is larger than "+Groups[i]+" number of sequences.\n"); m->control_pressed = true; }
291             else{
292                 for (int j = 0; j < size; j++) {
293                     
294                     if (m->control_pressed) { return sampledCt; }
295                     
296                     map<string, vector<int> >::iterator it = tempCount.find(allNames[j]);
297                     
298                     if (it == tempCount.end()) { //we have not seen this sequence at all yet
299                         vector<int> tempGroups; tempGroups.resize(Groups.size(), 0);
300                         tempGroups[i]++;
301                         tempCount[allNames[j]] = tempGroups;
302                     }else{
303                         tempCount[allNames[j]][i]++;
304                     }
305                 }
306             }
307         }
308         
309         //build count table
310         for (map<string, vector<int> >::iterator it = tempCount.begin(); it != tempCount.end();) {
311             sampledCt.push_back(it->first, it->second);
312             tempCount.erase(it++);
313         }
314         
315         return sampledCt;
316     }
317         catch(exception& e) {
318                 m->errorOut(e, "SubSampleCommand", "getSample");
319                 exit(1);
320         }
321 }
322 //**********************************************************************************************************************
323 CountTable SubSample::getSample(CountTable& ct, int size, vector<string> Groups, bool pickedGroups) {
324         try {
325         CountTable sampledCt;
326         if (!ct.hasGroupInfo() && pickedGroups) { m->mothurOut("[ERROR]: Cannot subsample with groups because your count table doesn't have group information.\n"); m->control_pressed = true; return sampledCt; }
327         
328         if (ct.hasGroupInfo()) {
329             map<string, vector<int> > tempCount;
330             vector<item> allNames;
331             map<string, int> groupMap;
332             
333             vector<string> myGroups;
334             if (pickedGroups) { myGroups = Groups; }
335             else {  myGroups = ct.getNamesOfGroups(); }
336             
337             for (int i = 0; i < myGroups.size(); i++) {
338                 sampledCt.addGroup(myGroups[i]);
339                 groupMap[myGroups[i]] = i;
340                 
341                 vector<string> names = ct.getNamesOfSeqs(myGroups[i]);
342                 for (int j = 0; j < names.size(); j++) {
343                     
344                     if (m->control_pressed) { return sampledCt; }
345                     
346                     int num = ct. getGroupCount(names[j], myGroups[i]);
347                     for (int k = 0; k < num; k++) { 
348                         item temp(names[j], myGroups[i]);
349                         allNames.push_back(temp); 
350                     }
351                 }
352             }
353             
354             random_shuffle(allNames.begin(), allNames.end());
355             
356             if (allNames.size() < size) { 
357                 if (pickedGroups) { m->mothurOut("[ERROR]: You have selected a size that is larger than the number of sequences.\n"); } 
358                 else { m->mothurOut("[ERROR]: You have selected a size that is larger than the number of sequences in the groups you chose.\n"); }
359                 m->control_pressed = true; return sampledCt; }
360             else{
361                 for (int j = 0; j < size; j++) {
362                     
363                     if (m->control_pressed) { return sampledCt; }
364                     
365                     map<string, vector<int> >::iterator it = tempCount.find(allNames[j].name);
366                     
367                     if (it == tempCount.end()) { //we have not seen this sequence at all yet
368                         vector<int> tempGroups; tempGroups.resize(myGroups.size(), 0);
369                         tempGroups[groupMap[allNames[j].group]]++;
370                         tempCount[allNames[j].name] = tempGroups;
371                     }else{
372                         tempCount[allNames[j].name][groupMap[allNames[j].group]]++;
373                     }
374                 }
375             }
376             
377             //build count table
378             for (map<string, vector<int> >::iterator it = tempCount.begin(); it != tempCount.end();) {
379                 sampledCt.push_back(it->first, it->second);
380                 tempCount.erase(it++);
381             }
382             
383             //remove empty groups 
384             for (int i = 0; i < myGroups.size(); i++) { if (sampledCt.getGroupCount(myGroups[i]) == 0) { sampledCt.removeGroup(myGroups[i]); } }
385             
386         }else {
387             vector<string> names = ct.getNamesOfSeqs();
388             map<string, int> nameMap;
389             vector<string> allNames;
390             
391             for (int i = 0; i < names.size(); i++) {
392                 int num = ct.getNumSeqs(names[i]);
393                 for (int j = 0; j < num; j++) { allNames.push_back(names[i]); }
394             }
395             
396             if (allNames.size() < size) { m->mothurOut("[ERROR]: You have selected a size that is larger than the number of sequences.\n"); m->control_pressed = true; return sampledCt; }
397             else {
398                 random_shuffle(allNames.begin(), allNames.end());
399                 
400                 for (int j = 0; j < size; j++) {
401                     if (m->control_pressed) { return sampledCt; }
402                     
403                     map<string, int>::iterator it = nameMap.find(allNames[j]);
404                     
405                     //we have not seen this sequence at all yet
406                     if (it == nameMap.end()) { nameMap[allNames[j]] = 1;  }
407                     else{  nameMap[allNames[j]]++;  }
408                 }
409                 
410                 //build count table
411                 for (map<string, int>::iterator it = nameMap.begin(); it != nameMap.end();) {
412                     sampledCt.push_back(it->first, it->second);
413                     nameMap.erase(it++);
414                 }
415             }
416         }
417         
418         return sampledCt;
419     }
420         catch(exception& e) {
421                 m->errorOut(e, "SubSampleCommand", "getSample");
422                 exit(1);
423         }
424 }
425 //**********************************************************************************************************************
426
427