]> git.donarmstrong.com Git - mothur.git/blob - subsample.cpp
fixed segfault in unifrac with subsample. in progress of implementing a version of...
[mothur.git] / subsample.cpp
1 //
2 //  subsample.cpp
3 //  Mothur
4 //
5 //  Created by Sarah Westcott on 4/2/12.
6 //  Copyright (c) 2012 Schloss Lab. All rights reserved.
7 //
8
9 #include "subsample.h"
10 //**********************************************************************************************************************
11 Tree* SubSample::getSample(Tree* T, TreeMap* tmap, map<string, string> whole, int size, map<string, string> originalNameMap) {
12     try {
13         Tree* newTree = NULL;
14         
15         vector<string> subsampledSeqs = getSample(tmap, size);
16         map<string, string> sampledNameMap = deconvolute(whole, subsampledSeqs); 
17         
18         //remove seqs not in sample from treemap
19         for (int i = 0; i < tmap->namesOfSeqs.size(); i++) {
20             //is that name in the subsample?
21             int count = 0;
22             string name = tmap->namesOfSeqs[i];
23             for (int j = 0; j < subsampledSeqs.size(); j++) {
24                 if (name == subsampledSeqs[j]) { break; } //found it
25                 count++;
26             }
27             
28             if (m->control_pressed) { return newTree; }
29             
30             //if you didnt find it, remove it 
31             if (count == subsampledSeqs.size()) { 
32                 tmap->removeSeq(name);
33                 tmap->addSeq(name, "doNotIncludeMe");
34             }
35         }
36         
37         //create new tree
38         int numUniques = sampledNameMap.size();
39         if (sampledNameMap.size() == 0) { numUniques = subsampledSeqs.size(); }
40         
41         newTree = new Tree(tmap);
42         newTree->getCopy(T, originalNameMap, subsampledSeqs);
43         
44         return newTree;
45     }
46     catch(exception& e) {
47         m->errorOut(e, "SubSample", "getSample-Tree");
48         exit(1);
49     }
50 }
51 //**********************************************************************************************************************
52 Tree* SubSample::getSample(Tree* T, TreeMap* tmap, map<string, string> whole, int size) {
53     try {
54         Tree* newTree = NULL;
55         
56         vector<string> subsampledSeqs = getSample(tmap, size);
57         map<string, string> sampledNameMap = deconvolute(whole, subsampledSeqs); 
58         
59         //remove seqs not in sample from treemap
60         for (int i = 0; i < tmap->namesOfSeqs.size(); i++) {
61             //is that name in the subsample?
62             int count = 0;
63             for (int j = 0; j < subsampledSeqs.size(); j++) {
64                 if (tmap->namesOfSeqs[i] == subsampledSeqs[j]) { break; } //found it
65                 count++;
66             }
67
68             if (m->control_pressed) { return newTree; }
69             
70             //if you didnt find it, remove it 
71             if (count == subsampledSeqs.size()) { 
72                 tmap->removeSeq(tmap->namesOfSeqs[i]);
73                 i--; //need this because removeSeq removes name from namesOfSeqs
74             }
75         }
76         
77         //create new tree
78         int numUniques = sampledNameMap.size();
79         if (sampledNameMap.size() == 0) { numUniques = subsampledSeqs.size(); }
80         
81         newTree = new Tree(numUniques, tmap); //numNodes, treemap
82         newTree->getSubTree(T, subsampledSeqs, sampledNameMap);
83         
84         return newTree;
85     }
86     catch(exception& e) {
87         m->errorOut(e, "SubSample", "getSample-Tree");
88         exit(1);
89     }
90 }
91 //**********************************************************************************************************************
92 //assumes whole maps dupName -> uniqueName
93 map<string, string> SubSample::deconvolute(map<string, string> whole, vector<string>& wanted) {
94     try {
95         map<string, string> nameMap;
96         
97         //whole will be empty if user gave no name file, so we don't need to make a new one
98         if (whole.size() == 0) { return nameMap; }
99         
100         vector<string> newWanted;
101         for (int i = 0; i < wanted.size(); i++) {
102             
103             if (m->control_pressed) { break; }
104             
105             string dupName = wanted[i];
106             
107             map<string, string>::iterator itWhole = whole.find(dupName);
108             if (itWhole != whole.end()) {
109                 string repName = itWhole->second;
110                 
111                 //do we already have this rep?
112                 map<string, string>::iterator itName = nameMap.find(repName);
113                 if (itName != nameMap.end()) { //add this seqs to dups list
114                     (itName->second) += "," + dupName;
115                 }else { //first sighting of this seq
116                     nameMap[repName] = dupName;
117                     newWanted.push_back(repName);
118                 }
119             }else { m->mothurOut("[ERROR]: "+dupName+" is not in your name file, please correct.\n"); m->control_pressed = true; }
120         }
121         
122         wanted = newWanted;
123         return nameMap;
124     }
125         catch(exception& e) {
126                 m->errorOut(e, "SubSample", "deconvolute");
127                 exit(1);
128         }
129 }       
130 //**********************************************************************************************************************
131 vector<string> SubSample::getSample(TreeMap* tMap, int size) {
132     try {
133         vector<string> sample;
134         
135         vector<string> Groups = tMap->getNamesOfGroups();    
136         for (int i = 0; i < Groups.size(); i++) {
137             
138             if (m->inUsersGroups(Groups[i], m->getGroups())) {
139                 if (m->control_pressed) { break; }
140                 
141                 vector<string> thisGroup; thisGroup.push_back(Groups[i]);
142                 vector<string> thisGroupsSeqs = tMap->getNamesSeqs(thisGroup);
143                 int thisSize = thisGroupsSeqs.size();
144                 
145                 if (thisSize >= size) { 
146                     
147                     random_shuffle(thisGroupsSeqs.begin(), thisGroupsSeqs.end());
148                     
149                     for (int j = 0; j < size; j++) { sample.push_back(thisGroupsSeqs[j]); }
150                 }else {  m->mothurOut("[ERROR]: You have selected a size that is larger than "+Groups[i]+" number of sequences.\n"); m->control_pressed = true; }
151             }
152         } 
153         
154         return sample;
155     }
156         catch(exception& e) {
157                 m->errorOut(e, "SubSample", "getSample-TreeMap");
158                 exit(1);
159         }
160 }       
161 //**********************************************************************************************************************
162 vector<string> SubSample::getSample(TreeMap* tMap, vector<string> Groups) {
163     try {
164         vector<string> sample;
165         
166         //vector<string> Groups = tMap->getNamesOfGroups();    
167         for (int i = 0; i < Groups.size(); i++) {
168             
169             if (m->control_pressed) { break; }
170             
171             vector<string> thisGroup; thisGroup.push_back(Groups[i]);
172             vector<string> thisGroupsSeqs = tMap->getNamesSeqs(thisGroup);
173             int thisSize = thisGroupsSeqs.size();
174                 
175             for (int j = 0; j < thisSize; j++) { sample.push_back(thisGroupsSeqs[j]); }
176         } 
177         
178         return sample;
179     }
180         catch(exception& e) {
181                 m->errorOut(e, "SubSample", "getSample-TreeMap");
182                 exit(1);
183         }
184 }       
185 //**********************************************************************************************************************
186 vector<string> SubSample::getSample(vector<SharedRAbundVector*>& thislookup, int size) {
187         try {
188                 
189                 //save mothurOut's binLabels to restore for next label
190                 vector<string> saveBinLabels = m->currentBinLabels;
191                 
192                 int numBins = thislookup[0]->getNumBins();
193                 for (int i = 0; i < thislookup.size(); i++) {           
194                         int thisSize = thislookup[i]->getNumSeqs();
195                         
196                         if (thisSize != size) {
197                                 
198                                 string thisgroup = thislookup[i]->getGroup();
199                                 
200                                 OrderVector order;
201                                 for(int p=0;p<numBins;p++){
202                                         for(int j=0;j<thislookup[i]->getAbundance(p);j++){
203                                                 order.push_back(p);
204                                         }
205                                 }
206                                 random_shuffle(order.begin(), order.end());
207                                 
208                                 SharedRAbundVector* temp = new SharedRAbundVector(numBins);
209                                 temp->setLabel(thislookup[i]->getLabel());
210                                 temp->setGroup(thislookup[i]->getGroup());
211                                 
212                                 delete thislookup[i];
213                                 thislookup[i] = temp;
214                                 
215                                 
216                                 for (int j = 0; j < size; j++) {
217                                         
218                                         if (m->control_pressed) {  return m->currentBinLabels; }
219                                         
220                                         int bin = order.get(j);
221                                         
222                                         int abund = thislookup[i]->getAbundance(bin);
223                                         thislookup[i]->set(bin, (abund+1), thisgroup);
224                                 }       
225                         }
226                 }
227                 
228                 //subsampling may have created some otus with no sequences in them
229                 eliminateZeroOTUS(thislookup);
230                 
231                 if (m->control_pressed) { return m->currentBinLabels; }
232                 
233                 //save mothurOut's binLabels to restore for next label
234         vector<string> subsampleBinLabels = m->currentBinLabels;
235                 m->currentBinLabels = saveBinLabels;
236                 
237                 return subsampleBinLabels;
238                 
239         }
240         catch(exception& e) {
241                 m->errorOut(e, "SubSample", "getSample-shared");
242                 exit(1);
243         }
244 }       
245 //**********************************************************************************************************************
246 int SubSample::eliminateZeroOTUS(vector<SharedRAbundVector*>& thislookup) {
247         try {
248                 
249                 vector<SharedRAbundVector*> newLookup;
250                 for (int i = 0; i < thislookup.size(); i++) {
251                         SharedRAbundVector* temp = new SharedRAbundVector();
252                         temp->setLabel(thislookup[i]->getLabel());
253                         temp->setGroup(thislookup[i]->getGroup());
254                         newLookup.push_back(temp);
255                 }
256                 
257                 //for each bin
258                 vector<string> newBinLabels;
259                 string snumBins = toString(thislookup[0]->getNumBins());
260                 for (int i = 0; i < thislookup[0]->getNumBins(); i++) {
261                         if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) {  delete newLookup[j];  } return 0; }
262                         
263                         //look at each sharedRabund and make sure they are not all zero
264                         bool allZero = true;
265                         for (int j = 0; j < thislookup.size(); j++) {
266                                 if (thislookup[j]->getAbundance(i) != 0) { allZero = false;  break;  }
267                         }
268                         
269                         //if they are not all zero add this bin
270                         if (!allZero) {
271                                 for (int j = 0; j < thislookup.size(); j++) {
272                                         newLookup[j]->push_back(thislookup[j]->getAbundance(i), thislookup[j]->getGroup());
273                                 }
274                                 //if there is a bin label use it otherwise make one
275                                 string binLabel = "Otu";
276                                 string sbinNumber = toString(i+1);
277                                 if (sbinNumber.length() < snumBins.length()) { 
278                                         int diff = snumBins.length() - sbinNumber.length();
279                                         for (int h = 0; h < diff; h++) { binLabel += "0"; }
280                                 }
281                                 binLabel += sbinNumber; 
282                                 if (i < m->currentBinLabels.size()) {  binLabel = m->currentBinLabels[i]; }
283                                 
284                                 newBinLabels.push_back(binLabel);
285                         }
286                 }
287                 
288                 for (int j = 0; j < thislookup.size(); j++) {  delete thislookup[j];  }
289                 thislookup.clear();
290                 
291                 thislookup = newLookup;
292                 m->currentBinLabels = newBinLabels;
293                 
294                 return 0;
295                 
296         }
297         catch(exception& e) {
298                 m->errorOut(e, "SubSample", "eliminateZeroOTUS");
299                 exit(1);
300         }
301 }
302
303
304 //**********************************************************************************************************************
305
306