X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=subsample.cpp;h=2eb1d497494849e316b58103f84a2c95a475a832;hb=250e3b11b1c9c1e1ad458ab6c7e71ac2e67e11d9;hp=c55accd618991279bb3a1a0ff6d29e0d0250a81e;hpb=6c2b1e530a5c0bb87040e58a3e410097acdfcc3d;p=mothur.git diff --git a/subsample.cpp b/subsample.cpp index c55accd..2eb1d49 100644 --- a/subsample.cpp +++ b/subsample.cpp @@ -108,7 +108,7 @@ vector SubSample::getSample(vector& thislookup, int try { //save mothurOut's binLabels to restore for next label - vector saveBinLabels = m->currentBinLabels; + vector saveBinLabels = m->currentSharedBinLabels; int numBins = thislookup[0]->getNumBins(); for (int i = 0; i < thislookup.size(); i++) { @@ -136,7 +136,7 @@ vector SubSample::getSample(vector& thislookup, int for (int j = 0; j < size; j++) { - if (m->control_pressed) { return m->currentBinLabels; } + if (m->control_pressed) { return m->currentSharedBinLabels; } int bin = order.get(j); @@ -149,11 +149,11 @@ vector SubSample::getSample(vector& thislookup, int //subsampling may have created some otus with no sequences in them eliminateZeroOTUS(thislookup); - if (m->control_pressed) { return m->currentBinLabels; } + if (m->control_pressed) { return m->currentSharedBinLabels; } //save mothurOut's binLabels to restore for next label - vector subsampleBinLabels = m->currentBinLabels; - m->currentBinLabels = saveBinLabels; + vector subsampleBinLabels = m->currentSharedBinLabels; + m->currentSharedBinLabels = saveBinLabels; return subsampleBinLabels; @@ -200,7 +200,7 @@ int SubSample::eliminateZeroOTUS(vector& thislookup) { for (int h = 0; h < diff; h++) { binLabel += "0"; } } binLabel += sbinNumber; - if (i < m->currentBinLabels.size()) { binLabel = m->currentBinLabels[i]; } + if (i < m->currentSharedBinLabels.size()) { binLabel = m->currentSharedBinLabels[i]; } newBinLabels.push_back(binLabel); } @@ -210,7 +210,7 @@ int SubSample::eliminateZeroOTUS(vector& thislookup) { thislookup.clear(); thislookup = newLookup; - 
m->currentBinLabels = newBinLabels;
+        m->currentSharedBinLabels = newBinLabels;
         return 0;
@@ -264,7 +264,164 @@ int SubSample::getSample(SAbundVector*& sabund, int size) {
         m->errorOut(e, "SubSampleCommand", "getSample");
         exit(1);
     }
-}
+}
+//**********************************************************************************************************************
+// Subsample a count table group by group.
+//
+// For every group in Groups, each sequence name is repeated once per copy that
+// group holds (ct.getGroupCount), the expanded list is shuffled, and the first
+// `size` entries are tallied per sequence name and group index.
+//
+// ct     - count table to draw from; must carry group information.
+// size   - number of sequences to keep from each group.
+// Groups - groups to sample; index i in each result vector belongs to Groups[i].
+//
+// Returns the sampled table. On error (no group information, or a group with
+// fewer than `size` sequences) a message is printed, m->control_pressed is set
+// and the table built so far is returned immediately.
+CountTable SubSample::getSample(CountTable& ct, int size, vector<string> Groups) {
+    try {
+        CountTable sampledCt;
+
+        //nothing can be sampled by group without group information, so stop here
+        if (!ct.hasGroupInfo()) { m->mothurOut("[ERROR]: Cannot subsample by group because your count table doesn't have group information.\n"); m->control_pressed = true; return sampledCt; }
+
+        //sequence name -> sampled count for each entry of Groups
+        map<string, vector<int> > tempCount;
+        for (int i = 0; i < Groups.size(); i++) {
+            sampledCt.addGroup(Groups[i]);
+
+            //one entry per sequence copy in this group
+            vector<string> names = ct.getNamesOfSeqs(Groups[i]);
+            vector<string> allNames;
+            for (int j = 0; j < names.size(); j++) {
+
+                if (m->control_pressed) { return sampledCt; }
+
+                int num = ct.getGroupCount(names[j], Groups[i]);
+                for (int k = 0; k < num; k++) { allNames.push_back(names[j]); }
+            }
+
+            random_shuffle(allNames.begin(), allNames.end());
+
+            //stop at the first group that is too small rather than moving on to the next one
+            if (allNames.size() < size) { m->mothurOut("[ERROR]: You have selected a size that is larger than "+Groups[i]+" number of sequences.\n"); m->control_pressed = true; return sampledCt; }
+
+            //the first `size` shuffled entries are the sample for this group
+            for (int j = 0; j < size; j++) {
+
+                if (m->control_pressed) { return sampledCt; }
+
+                map<string, vector<int> >::iterator it = tempCount.find(allNames[j]);
+
+                if (it == tempCount.end()) { //we have not seen this sequence at all yet
+                    vector<int> tempGroups; tempGroups.resize(Groups.size(), 0);
+                    tempGroups[i]++;
+                    tempCount[allNames[j]] = tempGroups;
+                }else{
+                    it->second[i]++;
+                }
+            }
+        }
+
+        //build count table, erasing each entry once it has been copied
+        for (map<string, vector<int> >::iterator it = tempCount.begin(); it != tempCount.end();) {
+            sampledCt.push_back(it->first, it->second);
+            tempCount.erase(it++);
+        }
+
+        return sampledCt;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "SubSampleCommand", "getSample");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+// Subsample a count table to `size` sequences drawn from one pool.
+//
+// With group information, every sequence copy from the selected groups goes
+// into a single list that is shuffled once, so `size` is the total across the
+// groups, not a per-group quota. Groups left with no sequences are removed
+// from the result. Without group information, sequence names are repeated by
+// ct.getNumSeqs(name) and sampled the same way.
+//
+// ct           - count table to draw from.
+// size         - total number of sequences to keep.
+// Groups       - groups to sample from; used only when pickedGroups is true.
+// pickedGroups - true when the caller chose Groups; false to use every group
+//                returned by ct.getNamesOfGroups().
+//
+// Returns the sampled table. On error (groups requested but the table has
+// none, or `size` larger than the pool) a message is printed,
+// m->control_pressed is set and the table built so far is returned.
+CountTable SubSample::getSample(CountTable& ct, int size, vector<string> Groups, bool pickedGroups) {
+    try {
+        CountTable sampledCt;
+        if (!ct.hasGroupInfo() && pickedGroups) { m->mothurOut("[ERROR]: Cannot subsample with groups because your count table doesn't have group information.\n"); m->control_pressed = true; return sampledCt; }
+
+        if (ct.hasGroupInfo()) {
+            map<string, vector<int> > tempCount;   //sequence name -> sampled count per group index
+            vector<item> allNames;                 //one (name, group) entry per sequence copy
+            map<string, int> groupMap;             //group name -> index into the count vectors
+
+            vector<string> myGroups;
+            if (pickedGroups) { myGroups = Groups; }
+            else { myGroups = ct.getNamesOfGroups(); }
+
+            for (int i = 0; i < myGroups.size(); i++) {
+                sampledCt.addGroup(myGroups[i]);
+                groupMap[myGroups[i]] = i;
+
+                vector<string> names = ct.getNamesOfSeqs(myGroups[i]);
+                for (int j = 0; j < names.size(); j++) {
+
+                    if (m->control_pressed) { return sampledCt; }
+
+                    int num = ct.getGroupCount(names[j], myGroups[i]);
+                    for (int k = 0; k < num; k++) {
+                        item temp(names[j], myGroups[i]);
+                        allNames.push_back(temp);
+                    }
+                }
+            }
+
+            random_shuffle(allNames.begin(), allNames.end());
+
+            if (allNames.size() < size) {
+                //the pool is limited to the chosen groups only when the caller picked groups
+                if (pickedGroups) { m->mothurOut("[ERROR]: You have selected a size that is larger than the number of sequences in the groups you chose.\n"); }
+                else { m->mothurOut("[ERROR]: You have selected a size that is larger than the number of sequences.\n"); }
+                m->control_pressed = true; return sampledCt; }
+            else{
+                for (int j = 0; j < size; j++) {
+
+                    if (m->control_pressed) { return sampledCt; }
+
+                    map<string, vector<int> >::iterator it = tempCount.find(allNames[j].name);
+
+                    if (it == tempCount.end()) { //we have not seen this sequence at all yet
+                        vector<int> tempGroups; tempGroups.resize(myGroups.size(), 0);
+                        tempGroups[groupMap[allNames[j].group]]++;
+                        tempCount[allNames[j].name] = tempGroups;
+                    }else{
+                        tempCount[allNames[j].name][groupMap[allNames[j].group]]++;
+                    }
+                }
+            }
+
+            //build count table, erasing each entry once it has been copied
+            for (map<string, vector<int> >::iterator it = tempCount.begin(); it != tempCount.end();) {
+                sampledCt.push_back(it->first, it->second);
+                tempCount.erase(it++);
+            }
+
+            //remove empty groups
+            for (int i = 0; i < myGroups.size(); i++) { if (sampledCt.getGroupCount(myGroups[i]) == 0) { sampledCt.removeGroup(myGroups[i]); } }
+
+        }else {
+            vector<string> names = ct.getNamesOfSeqs();
+            map<string, int> nameMap;              //sequence name -> sampled count
+            vector<string> allNames;               //one entry per sequence copy
+
+            for (int i = 0; i < names.size(); i++) {
+                int num = ct.getNumSeqs(names[i]);
+                for (int j = 0; j < num; j++) { allNames.push_back(names[i]); }
+            }
+
+            if (allNames.size() < size) { m->mothurOut("[ERROR]: You have selected a size that is larger than the number of sequences.\n"); m->control_pressed = true; return sampledCt; }
+            else {
+                random_shuffle(allNames.begin(), allNames.end());
+
+                for (int j = 0; j < size; j++) {
+                    if (m->control_pressed) { return sampledCt; }
+
+                    map<string, int>::iterator it = nameMap.find(allNames[j]);
+
+                    //we have not seen this sequence at all yet
+                    if (it == nameMap.end()) { nameMap[allNames[j]] = 1; }
+                    else{ nameMap[allNames[j]]++; }
+                }
+
+                //build count table, erasing each entry once it has been copied
+                for (map<string, int>::iterator it = nameMap.begin(); it != nameMap.end();) {
+                    sampledCt.push_back(it->first, it->second);
+                    nameMap.erase(it++);
+                }
+            }
+        }
+
+        return sampledCt;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "SubSampleCommand", "getSample");
+        exit(1);
+    }
+}
 //**********************************************************************************************************************