From: westcott Date: Wed, 23 Jun 2010 17:42:48 +0000 (+0000) Subject: fixed cluster.split with average method X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=3102812d94898439646131cecdb64fc542913c87 fixed cluster.split with average method --- diff --git a/clustersplitcommand.cpp b/clustersplitcommand.cpp index 38e8394..deea384 100644 --- a/clustersplitcommand.cpp +++ b/clustersplitcommand.cpp @@ -237,6 +237,7 @@ int ClusterSplitCommand::execute(){ vector listFileNames; set labels; string singletonName = ""; + double saveCutoff = cutoff; //****************** file prep work ******************************// #ifdef USE_MPI @@ -361,6 +362,10 @@ int ClusterSplitCommand::execute(){ for(int i = 1; i < processors; i++) { int num = dividedNames[i].size(); + double tempCutoff; + MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status); + if (tempCutoff < cutoff) { cutoff = tempCutoff; } + //send list filenames to root process for (int j = 0; j < num; j++) { int lengthList = 0; @@ -429,6 +434,9 @@ int ClusterSplitCommand::execute(){ //process them listFileNames = cluster(myNames, labels); + //send cutoff + MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD); + //send list filenames to root process for (int j = 0; j < num; j++) { char tempListFileName[1024]; @@ -506,6 +514,10 @@ int ClusterSplitCommand::execute(){ ifstream in2; openInputFile(filename, in2); + float tempCutoff; + in2 >> tempCutoff; gobble(in2); + if (tempCutoff < cutoff) { cutoff = tempCutoff; } + while(!in2.eof()) { string tempName; in2 >> tempName; gobble(in2); @@ -521,6 +533,8 @@ int ClusterSplitCommand::execute(){ #endif if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; } + if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine(); } + m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine(); //****************** merge list file and create rabund and sabund files ******************************// @@ -592,9 +606,11 @@ map ClusterSplitCommand::completeListFile(vector listNames, if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) { convert(*it, temp); } else if (*it == "unique") { temp = -1.0; } - - orderFloat.push_back(temp); - labelBin[temp] = numSingleBins; //initialize numbins + + if (temp <= cutoff) { + orderFloat.push_back(temp); + labelBin[temp] = numSingleBins; //initialize numbins + } } //sort order @@ -804,7 +820,8 @@ int ClusterSplitCommand::createProcesses(vector < vector < map > ofstream outLabels; filename = toString(getpid()) + ".temp.labels"; openOutputFile(filename, outLabels); - + + outLabels << cutoff << endl; for (set::iterator it = labels.begin(); it != labels.end(); it++) { outLabels << (*it) << endl; } @@ -841,8 +858,11 @@ vector ClusterSplitCommand::cluster(vector< map > distNa vector listFileNames; + double smallestCutoff = cutoff; + //cluster each distance file for (int i = 0; i < distNames.size(); i++) { + if (m->control_pressed) { return listFileNames; } string thisNamefile = distNames[i].begin()->second; string thisDistFile = distNames[i].begin()->first; @@ -925,7 +945,7 @@ vector ClusterSplitCommand::cluster(vector< map > distNa listFileNames.clear(); return listFileNames; } - cluster->update(cutoff); + cluster->update(saveCutoff); float dist = matrix->getSmallDist(); float rndDist; @@ -973,8 +993,13 @@ vector ClusterSplitCommand::cluster(vector< map > distNa remove(thisDistFile.c_str()); remove(thisNamefile.c_str()); + + if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine(); } + + if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff; } } + cutoff = smallestCutoff; return listFileNames; diff --git a/mgclustercommand.cpp b/mgclustercommand.cpp index 023f214..f8f3c78 100644 --- a/mgclustercommand.cpp +++ b/mgclustercommand.cpp @@ -171,6 +171,8 @@ int MGClusterCommand::execute(){ start = time(NULL); oldList = *list; + map Seq2Bin; + map oldSeq2Bin; if (method == "furthest") { tag = "fn"; } else if (method == "nearest") { tag = "nn"; } @@ -198,6 +200,8 @@ int MGClusterCommand::execute(){ else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, distMatrix, cutoff, method); } else if(method == "average"){ cluster = new AverageLinkage(rabund, list, distMatrix, cutoff, method); } cluster->setMapWanted(true); + Seq2Bin = cluster->getSeqtoBin(); + oldSeq2Bin = Seq2Bin; if (m->control_pressed) { delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; @@ -231,8 +235,8 @@ int MGClusterCommand::execute(){ } else if(rndDist != rndPreviousDist){ if (merge) { - map seq2Bin = cluster->getSeqtoBin(); - ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist); + Seq2Bin = cluster->getSeqtoBin(); + ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist); if (m->control_pressed) { delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp; @@ -250,8 +254,10 @@ int MGClusterCommand::execute(){ } previousDist = dist; + cout << "prev distance = " << previousDist << " dist = " << dist << endl; rndPreviousDist = rndDist; oldList = *list; + oldSeq2Bin = Seq2Bin; } if(previousDist <= 0.0000){ @@ -260,8 +266,8 @@ int MGClusterCommand::execute(){ } else if(rndPreviousDist seq2Bin = cluster->getSeqtoBin(); - ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist); + Seq2Bin = cluster->getSeqtoBin(); + ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist); if (m->control_pressed) { delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp; @@ -301,6 +307,8 @@ int MGClusterCommand::execute(){ //create cluster hcluster = new HCluster(rabund, list, method, distFile, nameMap, cutoff); hcluster->setMapWanted(true); + Seq2Bin = cluster->getSeqtoBin(); + oldSeq2Bin = Seq2Bin; vector seqs; seqs.resize(1); // to start loop //ifstream inHcluster; @@ -351,8 +359,8 @@ int MGClusterCommand::execute(){ } else if((rndDist != rndPreviousDist)){ if (merge) { - map seq2Bin = hcluster->getSeqtoBin(); - ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist); + Seq2Bin = hcluster->getSeqtoBin(); + ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist); if (m->control_pressed) { delete nameMap; delete list; delete rabund; delete hcluster; delete temp; @@ -374,6 +382,7 @@ int MGClusterCommand::execute(){ previousDist = seqs[i].dist; rndPreviousDist = rndDist; oldList = *list; + oldSeq2Bin = Seq2Bin; } } } @@ -385,8 +394,8 @@ int MGClusterCommand::execute(){ } else if(rndPreviousDist seq2Bin = hcluster->getSeqtoBin(); - ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist); + Seq2Bin = hcluster->getSeqtoBin(); + ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist); if (m->control_pressed) { delete nameMap; delete list; delete rabund; delete hcluster; delete temp; @@ -406,8 +415,8 @@ int MGClusterCommand::execute(){ } delete hcluster; - remove(distFile.c_str()); - remove(overlapFile.c_str()); + //remove(distFile.c_str()); + //remove(overlapFile.c_str()); } delete list; @@ -509,15 +518,22 @@ ListVector* MGClusterCommand::mergeOPFs(map binInfo, float dist){ //get names of seqs that overlap string name1 = nameMap->get(overlapNode.seq1); string name2 = nameMap->get(overlapNode.seq2); - + //use binInfo to find out if they are already in the same bin - int binKeep = binInfo[name1]; - int binRemove = binInfo[name2]; + map::iterator itBin1 = binInfo.find(name1); + map::iterator itBin2 = binInfo.find(name2); + + if(itBin1 == binInfo.end()){ cerr << "AAError: Sequence '" << name1 << "' does not have any bin info.\n"; exit(1); } + if(itBin2 == binInfo.end()){ cerr << "ABError: Sequence '" << name2 << "' does not have any bin info.\n"; exit(1); } + + int binKeep = itBin1->second; + int binRemove = itBin2->second; //if not merge bins and update binInfo if(binKeep != binRemove) { + //save names in old bin - string names = list->get(binRemove); + string names = newList->get(binRemove); //merge bins into name1s bin newList->set(binKeep, newList->get(binRemove)+','+newList->get(binKeep)); diff --git a/nameassignment.cpp b/nameassignment.cpp index 0c30898..641be4e 100644 --- a/nameassignment.cpp +++ b/nameassignment.cpp @@ -87,16 +87,35 @@ void NameAssignment::print(ostream& out){ //********************************************************************************************************************** int NameAssignment::get(string key){ - - return (*this)[key]; - + try { + map::iterator itGet = (*this).find(key); + + //if you can't find it + if (itGet == (*this).end()) { return -1; } + + return (*this)[key]; + } + catch(exception& e) { + m->errorOut(e, "NameAssignment", "get"); + exit(1); + } } //********************************************************************************************************************** string NameAssignment::get(int key){ + try { - return reverse[key]; - + map::iterator itGet = reverse.find(key); + + if (itGet == reverse.end()) { return "not found"; } + + return reverse[key]; + + } + catch(exception& e) { + m->errorOut(e, "NameAssignment", "get"); + exit(1); + } } //********************************************************************************************************************** diff --git a/readblast.cpp b/readblast.cpp index 9979a75..4833d01 100644 --- a/readblast.cpp +++ b/readblast.cpp @@ -169,6 +169,7 @@ int ReadBlast::read(NameAssignment* nameMap) { map::iterator itDist; for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) { distance = 1.0 - (it->second / refScore); + //do we already have the distance calculated for b->a map::iterator itA = nameMap->find(currentRow); @@ -176,10 +177,12 @@ int ReadBlast::read(NameAssignment* nameMap) { //if we have it then compare if (itDist != dists[it->first].end()) { + if (distance < 0.0) { cout << currentRow << '\t' << nameMap->get(it->first) << '\t' << "score = " << it->second << " refscore = " << refScore << " distance = " << distance << " distance = " << itDist->second << endl; } + //if you want the minimum blast score ratio, then pick max distance if(minWanted) { distance = max(itDist->second, distance); } else{ distance = min(itDist->second, distance); } - + //is this distance below cutoff if (distance < cutoff) { if (!hclusterWanted) {