X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=subsample.cpp;h=457b7b9d5f6a1273b57e898b3adec90dff5dd9f0;hb=ffc44592ff7ae94f14f9e21f87198e33d323cd1d;hp=d5b4e3ecf19f2114855426350e1ecfbe57780736;hpb=53171f07cc0c0e560e2b4ba2946f690d59fc2dc4;p=mothur.git diff --git a/subsample.cpp b/subsample.cpp index d5b4e3e..457b7b9 100644 --- a/subsample.cpp +++ b/subsample.cpp @@ -7,64 +7,201 @@ // #include "subsample.h" +//********************************************************************************************************************** +Tree* SubSample::getSample(Tree* T, TreeMap* tmap, TreeMap* newTmap, int size, map originalNameMap) { + try { + Tree* newTree = NULL; + + map > newGroups; + vector subsampledSeqs = getSample(tmap, size, newGroups); + + //remove seqs not in sample from treemap + for (map >::iterator it = newGroups.begin(); it != newGroups.end(); it++) { + for (int i = 0; i < (it->second).size(); i++) { + newTmap->addSeq((it->second)[i], it->first); + } + } + + newTree = new Tree(newTmap); + newTree->getCopy(T, originalNameMap); + + return newTree; + } + catch(exception& e) { + m->errorOut(e, "SubSample", "getSample-Tree"); + exit(1); + } +} +/********************************************************************************************************************** +Tree* SubSample::getSample(Tree* T, TreeMap* tmap, map whole, int size) { + try { + Tree* newTree = NULL; + + vector subsampledSeqs = getSample(tmap, size); + map sampledNameMap = deconvolute(whole, subsampledSeqs); + + //remove seqs not in sample from treemap + for (int i = 0; i < tmap->namesOfSeqs.size(); i++) { + //is that name in the subsample? + int count = 0; + for (int j = 0; j < subsampledSeqs.size(); j++) { + if (tmap->namesOfSeqs[i] == subsampledSeqs[j]) { break; } //found it + count++; + } + if (m->control_pressed) { return newTree; } + + //if you didnt find it, remove it + if (count == subsampledSeqs.size()) { + tmap->removeSeq(tmap->namesOfSeqs[i]); + i--; //need this because removeSeq removes name from namesOfSeqs + } + } + + //create new tree + int numUniques = sampledNameMap.size(); + if (sampledNameMap.size() == 0) { numUniques = subsampledSeqs.size(); } + + newTree = new Tree(numUniques, tmap); //numNodes, treemap + newTree->getSubTree(T, subsampledSeqs, sampledNameMap); + + return newTree; + } + catch(exception& e) { + m->errorOut(e, "SubSample", "getSample-Tree"); + exit(1); + } +}*/ //********************************************************************************************************************** -vector SubSample::getSamplePreserve(vector& thislookup, vector& newLabels, int size) { - try { - - vector newlookup; newlookup.resize(thislookup.size(), NULL); +//assumes whole maps dupName -> uniqueName +map SubSample::deconvolute(map whole, vector& wanted) { + try { + map nameMap; - //save mothurOut's binLabels to restore for next label - vector saveBinLabels = m->currentBinLabels; - - int numBins = thislookup[0]->getNumBins(); - for (int i = 0; i < thislookup.size(); i++) { - int thisSize = thislookup[i]->getNumSeqs(); - - if (thisSize != size) { - - string thisgroup = thislookup[i]->getGroup(); - - OrderVector order; - for(int p=0;pgetAbundance(p);j++){ - order.push_back(p); - } - } - random_shuffle(order.begin(), order.end()); - - SharedRAbundVector* temp = new SharedRAbundVector(numBins); - temp->setLabel(thislookup[i]->getLabel()); - temp->setGroup(thislookup[i]->getGroup()); - - newlookup[i] = temp; - - for (int j = 0; j < size; j++) { - - if (m->control_pressed) { return newlookup; } - - int bin = order.get(j); - - int abund = newlookup[i]->getAbundance(bin); - newlookup[i]->set(bin, (abund+1), thisgroup); - } - } - } - - //subsampling may have created some otus with no sequences in them - eliminateZeroOTUS(newlookup); - - if (m->control_pressed) { return newlookup; } - - //save mothurOut's binLabels to restore for next label - newLabels = m->currentBinLabels; - m->currentBinLabels = saveBinLabels; - - return newlookup; - + //whole will be empty if user gave no name file, so we don't need to make a new one + if (whole.size() == 0) { return nameMap; } + + vector newWanted; + for (int i = 0; i < wanted.size(); i++) { + + if (m->control_pressed) { break; } + + string dupName = wanted[i]; + + map::iterator itWhole = whole.find(dupName); + if (itWhole != whole.end()) { + string repName = itWhole->second; + + //do we already have this rep? + map::iterator itName = nameMap.find(repName); + if (itName != nameMap.end()) { //add this seqs to dups list + (itName->second) += "," + dupName; + }else { //first sighting of this seq + nameMap[repName] = dupName; + newWanted.push_back(repName); + } + }else { m->mothurOut("[ERROR]: "+dupName+" is not in your name file, please correct.\n"); m->control_pressed = true; } + } + + wanted = newWanted; + return nameMap; + } + catch(exception& e) { + m->errorOut(e, "SubSample", "deconvolute"); + exit(1); + } +} +//********************************************************************************************************************** +vector SubSample::getSample(TreeMap* tMap, int size, map >& sample) { + try { + vector temp2; + sample["doNotIncludeMe"] = temp2; + + vector namesInSample; + + vector Groups = tMap->getNamesOfGroups(); + for (int i = 0; i < Groups.size(); i++) { + + if (m->inUsersGroups(Groups[i], m->getGroups())) { + if (m->control_pressed) { break; } + + vector thisGroup; thisGroup.push_back(Groups[i]); + vector thisGroupsSeqs = tMap->getNamesSeqs(thisGroup); + int thisSize = thisGroupsSeqs.size(); + vector temp; + sample[Groups[i]] = temp; + + if (thisSize >= size) { + + random_shuffle(thisGroupsSeqs.begin(), thisGroupsSeqs.end()); + + for (int j = 0; j < size; j++) { sample[Groups[i]].push_back(thisGroupsSeqs[j]); namesInSample.push_back(thisGroupsSeqs[j]); } + for (int j = size; j < thisSize; j++) { sample["doNotIncludeMe"].push_back(thisGroupsSeqs[j]); } + + }else { m->mothurOut("[ERROR]: You have selected a size that is larger than "+Groups[i]+" number of sequences.\n"); m->control_pressed = true; } + } + } + + return namesInSample; + } + catch(exception& e) { + m->errorOut(e, "SubSample", "getSample-TreeMap"); + exit(1); } +} + +//********************************************************************************************************************** +vector SubSample::getSample(TreeMap* tMap, int size) { + try { + vector sample; + + vector Groups = tMap->getNamesOfGroups(); + for (int i = 0; i < Groups.size(); i++) { + + if (m->inUsersGroups(Groups[i], m->getGroups())) { + if (m->control_pressed) { break; } + + vector thisGroup; thisGroup.push_back(Groups[i]); + vector thisGroupsSeqs = tMap->getNamesSeqs(thisGroup); + int thisSize = thisGroupsSeqs.size(); + + if (thisSize >= size) { + + random_shuffle(thisGroupsSeqs.begin(), thisGroupsSeqs.end()); + + for (int j = 0; j < size; j++) { sample.push_back(thisGroupsSeqs[j]); } + }else { m->mothurOut("[ERROR]: You have selected a size that is larger than "+Groups[i]+" number of sequences.\n"); m->control_pressed = true; } + } + } + + return sample; + } + catch(exception& e) { + m->errorOut(e, "SubSample", "getSample-TreeMap"); + exit(1); + } +} +//********************************************************************************************************************** +vector SubSample::getSample(TreeMap* tMap, vector Groups) { + try { + vector sample; + + //vector Groups = tMap->getNamesOfGroups(); + for (int i = 0; i < Groups.size(); i++) { + + if (m->control_pressed) { break; } + + vector thisGroup; thisGroup.push_back(Groups[i]); + vector thisGroupsSeqs = tMap->getNamesSeqs(thisGroup); + int thisSize = thisGroupsSeqs.size(); + + for (int j = 0; j < thisSize; j++) { sample.push_back(thisGroupsSeqs[j]); } + } + + return sample; + } catch(exception& e) { - m->errorOut(e, "SubSample", "getSamplePreserve"); + m->errorOut(e, "SubSample", "getSample-TreeMap"); exit(1); } } @@ -124,7 +261,7 @@ vector SubSample::getSample(vector& thislookup, int } catch(exception& e) { - m->errorOut(e, "SubSample", "getSample"); + m->errorOut(e, "SubSample", "getSample-shared"); exit(1); } }