5 // Created by Sarah Westcott on 4/2/12.
6 // Copyright (c) 2012 Schloss Lab. All rights reserved.
10 //**********************************************************************************************************************
11 Tree* SubSample::getSample(Tree* T, CountTable* ct, CountTable* newCt, int size) {
15 //remove seqs not in sample from counttable
16 vector<string> Groups = ct->getNamesOfGroups();
18 newCt->addGroup("doNotIncludeMe");
20 map<string, int> doNotIncludeTotals;
21 vector<string> namesSeqs = ct->getNamesOfSeqs();
22 for (int i = 0; i < namesSeqs.size(); i++) { doNotIncludeTotals[namesSeqs[i]] = 0; }
24 for (int i = 0; i < Groups.size(); i++) {
25 if (m->inUsersGroups(Groups[i], m->getGroups())) {
26 if (m->control_pressed) { break; }
28 int thisSize = ct->getGroupCount(Groups[i]);
30 if (thisSize >= size) {
32 vector<string> names = ct->getNamesOfSeqs(Groups[i]);
34 for (int j = 0; j < names.size(); j++) {
35 int num = ct->getGroupCount(names[j], Groups[i]);
36 for (int k = 0; k < num; k++) { random.push_back(j); }
38 random_shuffle(random.begin(), random.end());
40 vector<int> sampleRandoms; sampleRandoms.resize(names.size(), 0);
41 for (int j = 0; j < size; j++) { sampleRandoms[random[j]]++; }
42 for (int j = 0; j < sampleRandoms.size(); j++) {
43 newCt->setAbund(names[j], Groups[i], sampleRandoms[j]);
45 sampleRandoms.clear(); sampleRandoms.resize(names.size(), 0);
46 for (int j = size; j < thisSize; j++) { sampleRandoms[random[j]]++; }
47 for (int j = 0; j < sampleRandoms.size(); j++) { doNotIncludeTotals[names[j]] += sampleRandoms[j]; }
48 }else { m->mothurOut("[ERROR]: You have selected a size that is larger than "+Groups[i]+" number of sequences.\n"); m->control_pressed = true; }
53 for (map<string, int>::iterator it = doNotIncludeTotals.begin(); it != doNotIncludeTotals.end(); it++) {
54 newCt->setAbund(it->first, "doNotIncludeMe", it->second);
57 newTree = new Tree(newCt);
58 newTree->getCopy(T, true);
63 m->errorOut(e, "SubSample", "getSample-Tree");
67 //**********************************************************************************************************************
68 //assumes whole maps dupName -> uniqueName
69 map<string, string> SubSample::deconvolute(map<string, string> whole, vector<string>& wanted) {
71 map<string, string> nameMap;
73 //whole will be empty if user gave no name file, so we don't need to make a new one
74 if (whole.size() == 0) { return nameMap; }
76 vector<string> newWanted;
77 for (int i = 0; i < wanted.size(); i++) {
79 if (m->control_pressed) { break; }
81 string dupName = wanted[i];
83 map<string, string>::iterator itWhole = whole.find(dupName);
84 if (itWhole != whole.end()) {
85 string repName = itWhole->second;
87 //do we already have this rep?
88 map<string, string>::iterator itName = nameMap.find(repName);
89 if (itName != nameMap.end()) { //add this seqs to dups list
90 (itName->second) += "," + dupName;
91 }else { //first sighting of this seq
92 nameMap[repName] = dupName;
93 newWanted.push_back(repName);
95 }else { m->mothurOut("[ERROR]: "+dupName+" is not in your name file, please correct.\n"); m->control_pressed = true; }
101 catch(exception& e) {
102 m->errorOut(e, "SubSample", "deconvolute");
106 //**********************************************************************************************************************
107 vector<string> SubSample::getSample(vector<SharedRAbundVector*>& thislookup, int size) {
110 //save mothurOut's binLabels to restore for next label
111 vector<string> saveBinLabels = m->currentBinLabels;
113 int numBins = thislookup[0]->getNumBins();
114 for (int i = 0; i < thislookup.size(); i++) {
115 int thisSize = thislookup[i]->getNumSeqs();
117 if (thisSize != size) {
119 string thisgroup = thislookup[i]->getGroup();
122 for(int p=0;p<numBins;p++){
123 for(int j=0;j<thislookup[i]->getAbundance(p);j++){
127 random_shuffle(order.begin(), order.end());
129 SharedRAbundVector* temp = new SharedRAbundVector(numBins);
130 temp->setLabel(thislookup[i]->getLabel());
131 temp->setGroup(thislookup[i]->getGroup());
133 delete thislookup[i];
134 thislookup[i] = temp;
137 for (int j = 0; j < size; j++) {
139 if (m->control_pressed) { return m->currentBinLabels; }
141 int bin = order.get(j);
143 int abund = thislookup[i]->getAbundance(bin);
144 thislookup[i]->set(bin, (abund+1), thisgroup);
149 //subsampling may have created some otus with no sequences in them
150 eliminateZeroOTUS(thislookup);
152 if (m->control_pressed) { return m->currentBinLabels; }
154 //save mothurOut's binLabels to restore for next label
155 vector<string> subsampleBinLabels = m->currentBinLabels;
156 m->currentBinLabels = saveBinLabels;
158 return subsampleBinLabels;
161 catch(exception& e) {
162 m->errorOut(e, "SubSample", "getSample-shared");
166 //**********************************************************************************************************************
167 int SubSample::eliminateZeroOTUS(vector<SharedRAbundVector*>& thislookup) {
170 vector<SharedRAbundVector*> newLookup;
171 for (int i = 0; i < thislookup.size(); i++) {
172 SharedRAbundVector* temp = new SharedRAbundVector();
173 temp->setLabel(thislookup[i]->getLabel());
174 temp->setGroup(thislookup[i]->getGroup());
175 newLookup.push_back(temp);
179 vector<string> newBinLabels;
180 string snumBins = toString(thislookup[0]->getNumBins());
181 for (int i = 0; i < thislookup[0]->getNumBins(); i++) {
182 if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) { delete newLookup[j]; } return 0; }
184 //look at each sharedRabund and make sure they are not all zero
186 for (int j = 0; j < thislookup.size(); j++) {
187 if (thislookup[j]->getAbundance(i) != 0) { allZero = false; break; }
190 //if they are not all zero add this bin
192 for (int j = 0; j < thislookup.size(); j++) {
193 newLookup[j]->push_back(thislookup[j]->getAbundance(i), thislookup[j]->getGroup());
195 //if there is a bin label use it otherwise make one
196 string binLabel = "Otu";
197 string sbinNumber = toString(i+1);
198 if (sbinNumber.length() < snumBins.length()) {
199 int diff = snumBins.length() - sbinNumber.length();
200 for (int h = 0; h < diff; h++) { binLabel += "0"; }
202 binLabel += sbinNumber;
203 if (i < m->currentBinLabels.size()) { binLabel = m->currentBinLabels[i]; }
205 newBinLabels.push_back(binLabel);
209 for (int j = 0; j < thislookup.size(); j++) { delete thislookup[j]; }
212 thislookup = newLookup;
213 m->currentBinLabels = newBinLabels;
218 catch(exception& e) {
219 m->errorOut(e, "SubSample", "eliminateZeroOTUS");
223 //**********************************************************************************************************************
224 int SubSample::getSample(SAbundVector*& sabund, int size) {
227 OrderVector* order = new OrderVector();
228 *order = sabund->getOrderVector(NULL);
230 int numBins = order->getNumBins();
231 int thisSize = order->getNumSeqs();
233 if (thisSize > size) {
234 random_shuffle(order->begin(), order->end());
236 RAbundVector* rabund = new RAbundVector(numBins);
237 rabund->setLabel(order->getLabel());
239 for (int j = 0; j < size; j++) {
241 if (m->control_pressed) { delete order; delete rabund; return 0; }
243 int bin = order->get(j);
245 int abund = rabund->get(bin);
246 rabund->set(bin, (abund+1));
250 sabund = new SAbundVector();
251 *sabund = rabund->getSAbundVector();
254 }else if (thisSize < size) { m->mothurOut("[ERROR]: The size you requested is larger than the number of sequences in the sabund vector. You requested " + toString(size) + " and you only have " + toString(thisSize) + " seqs in your sabund vector.\n"); m->control_pressed = true; }
256 if (m->control_pressed) { return 0; }
263 catch(exception& e) {
264 m->errorOut(e, "SubSampleCommand", "getSample");
268 //**********************************************************************************************************************
269 CountTable SubSample::getSample(CountTable& ct, int size, vector<string> Groups) {
271 if (!ct.hasGroupInfo()) { m->mothurOut("[ERROR]: Cannot subsample by group because your count table doesn't have group information.\n"); m->control_pressed = true; }
273 CountTable sampledCt;
274 map<string, vector<int> > tempCount;
275 for (int i = 0; i < Groups.size(); i++) {
276 sampledCt.addGroup(Groups[i]);
278 vector<string> names = ct.getNamesOfSeqs(Groups[i]);
279 vector<string> allNames;
280 for (int j = 0; j < names.size(); j++) {
282 if (m->control_pressed) { return sampledCt; }
284 int num = ct. getGroupCount(names[j], Groups[i]);
285 for (int k = 0; k < num; k++) { allNames.push_back(names[j]); }
288 random_shuffle(allNames.begin(), allNames.end());
290 if (allNames.size() < size) { m->mothurOut("[ERROR]: You have selected a size that is larger than "+Groups[i]+" number of sequences.\n"); m->control_pressed = true; }
292 for (int j = 0; j < size; j++) {
294 if (m->control_pressed) { return sampledCt; }
296 map<string, vector<int> >::iterator it = tempCount.find(allNames[j]);
298 if (it == tempCount.end()) { //we have not seen this sequence at all yet
299 vector<int> tempGroups; tempGroups.resize(Groups.size(), 0);
301 tempCount[allNames[j]] = tempGroups;
303 tempCount[allNames[j]][i]++;
310 for (map<string, vector<int> >::iterator it = tempCount.begin(); it != tempCount.end();) {
311 sampledCt.push_back(it->first, it->second);
312 tempCount.erase(it++);
317 catch(exception& e) {
318 m->errorOut(e, "SubSampleCommand", "getSample");
322 //**********************************************************************************************************************
323 CountTable SubSample::getSample(CountTable& ct, int size, vector<string> Groups, bool pickedGroups) {
325 CountTable sampledCt;
326 if (!ct.hasGroupInfo() && pickedGroups) { m->mothurOut("[ERROR]: Cannot subsample with groups because your count table doesn't have group information.\n"); m->control_pressed = true; return sampledCt; }
328 if (ct.hasGroupInfo()) {
329 map<string, vector<int> > tempCount;
330 vector<item> allNames;
331 map<string, int> groupMap;
333 vector<string> myGroups;
334 if (pickedGroups) { myGroups = Groups; }
335 else { myGroups = ct.getNamesOfGroups(); }
337 for (int i = 0; i < myGroups.size(); i++) {
338 sampledCt.addGroup(myGroups[i]);
339 groupMap[myGroups[i]] = i;
341 vector<string> names = ct.getNamesOfSeqs(myGroups[i]);
342 for (int j = 0; j < names.size(); j++) {
344 if (m->control_pressed) { return sampledCt; }
346 int num = ct. getGroupCount(names[j], myGroups[i]);
347 for (int k = 0; k < num; k++) {
348 item temp(names[j], myGroups[i]);
349 allNames.push_back(temp);
354 random_shuffle(allNames.begin(), allNames.end());
356 if (allNames.size() < size) {
357 if (pickedGroups) { m->mothurOut("[ERROR]: You have selected a size that is larger than the number of sequences.\n"); }
358 else { m->mothurOut("[ERROR]: You have selected a size that is larger than the number of sequences in the groups you chose.\n"); }
359 m->control_pressed = true; return sampledCt; }
361 for (int j = 0; j < size; j++) {
363 if (m->control_pressed) { return sampledCt; }
365 map<string, vector<int> >::iterator it = tempCount.find(allNames[j].name);
367 if (it == tempCount.end()) { //we have not seen this sequence at all yet
368 vector<int> tempGroups; tempGroups.resize(myGroups.size(), 0);
369 tempGroups[groupMap[allNames[j].group]]++;
370 tempCount[allNames[j].name] = tempGroups;
372 tempCount[allNames[j].name][groupMap[allNames[j].group]]++;
378 for (map<string, vector<int> >::iterator it = tempCount.begin(); it != tempCount.end();) {
379 sampledCt.push_back(it->first, it->second);
380 tempCount.erase(it++);
383 //remove empty groups
384 for (int i = 0; i < myGroups.size(); i++) { if (sampledCt.getGroupCount(myGroups[i]) == 0) { sampledCt.removeGroup(myGroups[i]); } }
387 vector<string> names = ct.getNamesOfSeqs();
388 map<string, int> nameMap;
389 vector<string> allNames;
391 for (int i = 0; i < names.size(); i++) {
392 int num = ct.getNumSeqs(names[i]);
393 for (int j = 0; j < num; j++) { allNames.push_back(names[i]); }
396 if (allNames.size() < size) { m->mothurOut("[ERROR]: You have selected a size that is larger than the number of sequences.\n"); m->control_pressed = true; return sampledCt; }
398 random_shuffle(allNames.begin(), allNames.end());
400 for (int j = 0; j < size; j++) {
401 if (m->control_pressed) { return sampledCt; }
403 map<string, int>::iterator it = nameMap.find(allNames[j]);
405 //we have not seen this sequence at all yet
406 if (it == nameMap.end()) { nameMap[allNames[j]] = 1; }
407 else{ nameMap[allNames[j]]++; }
411 for (map<string, int>::iterator it = nameMap.begin(); it != nameMap.end();) {
412 sampledCt.push_back(it->first, it->second);
420 catch(exception& e) {
421 m->errorOut(e, "SubSampleCommand", "getSample");
425 //**********************************************************************************************************************