X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=subsample.cpp;h=392f97bd51f1d2ac67d6d0a810a408ab507fdcee;hp=d5b4e3ecf19f2114855426350e1ecfbe57780736;hb=615301e57c25e241356a9c2380648d117709458d;hpb=53171f07cc0c0e560e2b4ba2946f690d59fc2dc4 diff --git a/subsample.cpp b/subsample.cpp index d5b4e3e..392f97b 100644 --- a/subsample.cpp +++ b/subsample.cpp @@ -7,67 +7,102 @@ // #include "subsample.h" +//********************************************************************************************************************** +Tree* SubSample::getSample(Tree* T, CountTable* ct, CountTable* newCt, int size) { + try { + Tree* newTree = NULL; + + //remove seqs not in sample from counttable + vector Groups = ct->getNamesOfGroups(); + newCt->copy(ct); + newCt->addGroup("doNotIncludeMe"); + + map doNotIncludeTotals; + vector namesSeqs = ct->getNamesOfSeqs(); + for (int i = 0; i < namesSeqs.size(); i++) { doNotIncludeTotals[namesSeqs[i]] = 0; } + + for (int i = 0; i < Groups.size(); i++) { + if (m->inUsersGroups(Groups[i], m->getGroups())) { + if (m->control_pressed) { break; } + + int thisSize = ct->getGroupCount(Groups[i]); + + if (thisSize >= size) { + + vector names = ct->getNamesOfSeqs(Groups[i]); + vector random; + for (int j = 0; j < names.size(); j++) { + int num = ct->getGroupCount(names[j], Groups[i]); + for (int k = 0; k < num; k++) { random.push_back(j); } + } + random_shuffle(random.begin(), random.end()); + + vector sampleRandoms; sampleRandoms.resize(names.size(), 0); + for (int j = 0; j < size; j++) { sampleRandoms[random[j]]++; } + for (int j = 0; j < sampleRandoms.size(); j++) { + newCt->setAbund(names[j], Groups[i], sampleRandoms[j]); + } + sampleRandoms.clear(); sampleRandoms.resize(names.size(), 0); + for (int j = size; j < thisSize; j++) { sampleRandoms[random[j]]++; } + for (int j = 0; j < sampleRandoms.size(); j++) { doNotIncludeTotals[names[j]] += sampleRandoms[j]; } + }else { m->mothurOut("[ERROR]: You have selected a size that is larger than "+Groups[i]+" number of sequences.\n"); m->control_pressed = true; } + } + } + + for (map::iterator it = doNotIncludeTotals.begin(); it != doNotIncludeTotals.end(); it++) { + newCt->setAbund(it->first, "doNotIncludeMe", it->second); + } + + newTree = new Tree(newCt); + newTree->getCopy(T, true); + + return newTree; + } + catch(exception& e) { + m->errorOut(e, "SubSample", "getSample-Tree"); + exit(1); + } +} //********************************************************************************************************************** -vector SubSample::getSamplePreserve(vector& thislookup, vector& newLabels, int size) { - try { - - vector newlookup; newlookup.resize(thislookup.size(), NULL); +//assumes whole maps dupName -> uniqueName +map SubSample::deconvolute(map whole, vector& wanted) { + try { + map nameMap; - //save mothurOut's binLabels to restore for next label - vector saveBinLabels = m->currentBinLabels; - - int numBins = thislookup[0]->getNumBins(); - for (int i = 0; i < thislookup.size(); i++) { - int thisSize = thislookup[i]->getNumSeqs(); - - if (thisSize != size) { - - string thisgroup = thislookup[i]->getGroup(); - - OrderVector order; - for(int p=0;pgetAbundance(p);j++){ - order.push_back(p); - } - } - random_shuffle(order.begin(), order.end()); - - SharedRAbundVector* temp = new SharedRAbundVector(numBins); - temp->setLabel(thislookup[i]->getLabel()); - temp->setGroup(thislookup[i]->getGroup()); - - newlookup[i] = temp; - - for (int j = 0; j < size; j++) { - - if (m->control_pressed) { return newlookup; } - - int bin = order.get(j); - - int abund = newlookup[i]->getAbundance(bin); - newlookup[i]->set(bin, (abund+1), thisgroup); - } - } - } - - //subsampling may have created some otus with no sequences in them - eliminateZeroOTUS(newlookup); - - if (m->control_pressed) { return newlookup; } - - //save mothurOut's binLabels to restore for next label - newLabels = m->currentBinLabels; - m->currentBinLabels = saveBinLabels; - - return newlookup; - - } + //whole will be empty if user gave no name file, so we don't need to make a new one + if (whole.size() == 0) { return nameMap; } + + vector newWanted; + for (int i = 0; i < wanted.size(); i++) { + + if (m->control_pressed) { break; } + + string dupName = wanted[i]; + + map::iterator itWhole = whole.find(dupName); + if (itWhole != whole.end()) { + string repName = itWhole->second; + + //do we already have this rep? + map::iterator itName = nameMap.find(repName); + if (itName != nameMap.end()) { //add this seqs to dups list + (itName->second) += "," + dupName; + }else { //first sighting of this seq + nameMap[repName] = dupName; + newWanted.push_back(repName); + } + }else { m->mothurOut("[ERROR]: "+dupName+" is not in your name file, please correct.\n"); m->control_pressed = true; } + } + + wanted = newWanted; + return nameMap; + } catch(exception& e) { - m->errorOut(e, "SubSample", "getSamplePreserve"); + m->errorOut(e, "SubSample", "deconvolute"); exit(1); } -} +} //********************************************************************************************************************** vector SubSample::getSample(vector& thislookup, int size) { try { @@ -113,7 +148,7 @@ vector SubSample::getSample(vector& thislookup, int //subsampling may have created some otus with no sequences in them eliminateZeroOTUS(thislookup); - + if (m->control_pressed) { return m->currentBinLabels; } //save mothurOut's binLabels to restore for next label @@ -124,7 +159,7 @@ vector SubSample::getSample(vector& thislookup, int } catch(exception& e) { - m->errorOut(e, "SubSample", "getSample"); + m->errorOut(e, "SubSample", "getSample-shared"); exit(1); } } @@ -185,8 +220,208 @@ int SubSample::eliminateZeroOTUS(vector& thislookup) { exit(1); } } +//********************************************************************************************************************** +int SubSample::getSample(SAbundVector*& sabund, int size) { + try { + + OrderVector* order = new OrderVector(); + *order = sabund->getOrderVector(NULL); + + int numBins = order->getNumBins(); + int thisSize = order->getNumSeqs(); + + if (thisSize > size) { + random_shuffle(order->begin(), order->end()); + + RAbundVector* rabund = new RAbundVector(numBins); + rabund->setLabel(order->getLabel()); - + for (int j = 0; j < size; j++) { + + if (m->control_pressed) { delete order; delete rabund; return 0; } + + int bin = order->get(j); + + int abund = rabund->get(bin); + rabund->set(bin, (abund+1)); + } + + delete sabund; + sabund = new SAbundVector(); + *sabund = rabund->getSAbundVector(); + delete rabund; + + }else if (thisSize < size) { m->mothurOut("[ERROR]: The size you requested is larger than the number of sequences in the sabund vector. You requested " + toString(size) + " and you only have " + toString(thisSize) + " seqs in your sabund vector.\n"); m->control_pressed = true; } + + if (m->control_pressed) { return 0; } + + delete order; + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "SubSampleCommand", "getSample"); + exit(1); + } +} +//********************************************************************************************************************** +CountTable SubSample::getSample(CountTable& ct, int size, vector Groups) { + try { + if (!ct.hasGroupInfo()) { m->mothurOut("[ERROR]: Cannot subsample by group because your count table doesn't have group information.\n"); m->control_pressed = true; } + + CountTable sampledCt; + map > tempCount; + for (int i = 0; i < Groups.size(); i++) { + sampledCt.addGroup(Groups[i]); + + vector names = ct.getNamesOfSeqs(Groups[i]); + vector allNames; + for (int j = 0; j < names.size(); j++) { + + if (m->control_pressed) { return sampledCt; } + + int num = ct. getGroupCount(names[j], Groups[i]); + for (int k = 0; k < num; k++) { allNames.push_back(names[j]); } + } + + random_shuffle(allNames.begin(), allNames.end()); + + if (allNames.size() < size) { m->mothurOut("[ERROR]: You have selected a size that is larger than "+Groups[i]+" number of sequences.\n"); m->control_pressed = true; } + else{ + for (int j = 0; j < size; j++) { + + if (m->control_pressed) { return sampledCt; } + + map >::iterator it = tempCount.find(allNames[j]); + + if (it == tempCount.end()) { //we have not seen this sequence at all yet + vector tempGroups; tempGroups.resize(Groups.size(), 0); + tempGroups[i]++; + tempCount[allNames[j]] = tempGroups; + }else{ + tempCount[allNames[j]][i]++; + } + } + } + } + + //build count table + for (map >::iterator it = tempCount.begin(); it != tempCount.end();) { + sampledCt.push_back(it->first, it->second); + tempCount.erase(it++); + } + + return sampledCt; + } + catch(exception& e) { + m->errorOut(e, "SubSampleCommand", "getSample"); + exit(1); + } +} +//********************************************************************************************************************** +CountTable SubSample::getSample(CountTable& ct, int size, vector Groups, bool pickedGroups) { + try { + CountTable sampledCt; + if (!ct.hasGroupInfo() && pickedGroups) { m->mothurOut("[ERROR]: Cannot subsample with groups because your count table doesn't have group information.\n"); m->control_pressed = true; return sampledCt; } + + if (ct.hasGroupInfo()) { + map > tempCount; + vector allNames; + map groupMap; + + vector myGroups; + if (pickedGroups) { myGroups = Groups; } + else { myGroups = ct.getNamesOfGroups(); } + + for (int i = 0; i < myGroups.size(); i++) { + sampledCt.addGroup(myGroups[i]); + groupMap[myGroups[i]] = i; + + vector names = ct.getNamesOfSeqs(myGroups[i]); + for (int j = 0; j < names.size(); j++) { + + if (m->control_pressed) { return sampledCt; } + + int num = ct. getGroupCount(names[j], myGroups[i]); + for (int k = 0; k < num; k++) { + item temp(names[j], myGroups[i]); + allNames.push_back(temp); + } + } + } + + random_shuffle(allNames.begin(), allNames.end()); + + if (allNames.size() < size) { + if (pickedGroups) { m->mothurOut("[ERROR]: You have selected a size that is larger than the number of sequences.\n"); } + else { m->mothurOut("[ERROR]: You have selected a size that is larger than the number of sequences in the groups you chose.\n"); } + m->control_pressed = true; return sampledCt; } + else{ + for (int j = 0; j < size; j++) { + + if (m->control_pressed) { return sampledCt; } + + map >::iterator it = tempCount.find(allNames[j].name); + + if (it == tempCount.end()) { //we have not seen this sequence at all yet + vector tempGroups; tempGroups.resize(myGroups.size(), 0); + tempGroups[groupMap[allNames[j].group]]++; + tempCount[allNames[j].name] = tempGroups; + }else{ + tempCount[allNames[j].name][groupMap[allNames[j].group]]++; + } + } + } + + //build count table + for (map >::iterator it = tempCount.begin(); it != tempCount.end();) { + sampledCt.push_back(it->first, it->second); + tempCount.erase(it++); + } + + //remove empty groups + for (int i = 0; i < myGroups.size(); i++) { if (sampledCt.getGroupCount(myGroups[i]) == 0) { sampledCt.removeGroup(myGroups[i]); } } + + }else { + vector names = ct.getNamesOfSeqs(); + map nameMap; + vector allNames; + + for (int i = 0; i < names.size(); i++) { + int num = ct.getNumSeqs(names[i]); + for (int j = 0; j < num; j++) { allNames.push_back(names[i]); } + } + + if (allNames.size() < size) { m->mothurOut("[ERROR]: You have selected a size that is larger than the number of sequences.\n"); m->control_pressed = true; return sampledCt; } + else { + random_shuffle(allNames.begin(), allNames.end()); + + for (int j = 0; j < size; j++) { + if (m->control_pressed) { return sampledCt; } + + map::iterator it = nameMap.find(allNames[j]); + + //we have not seen this sequence at all yet + if (it == nameMap.end()) { nameMap[allNames[j]] = 1; } + else{ nameMap[allNames[j]]++; } + } + + //build count table + for (map::iterator it = nameMap.begin(); it != nameMap.end();) { + sampledCt.push_back(it->first, it->second); + nameMap.erase(it++); + } + } + } + + return sampledCt; + } + catch(exception& e) { + m->errorOut(e, "SubSampleCommand", "getSample"); + exit(1); + } +} //**********************************************************************************************************************