]> git.donarmstrong.com Git - mothur.git/commitdiff
fixed cluster.split with average method
authorwestcott <westcott>
Wed, 23 Jun 2010 17:42:48 +0000 (17:42 +0000)
committerwestcott <westcott>
Wed, 23 Jun 2010 17:42:48 +0000 (17:42 +0000)
clustersplitcommand.cpp
mgclustercommand.cpp
nameassignment.cpp
readblast.cpp

index 38e839496536b6ae525c94efd0c5b024c7eee384..deea384a46d58f9f7e3683c88009a3e16a45b760 100644 (file)
@@ -237,6 +237,7 @@ int ClusterSplitCommand::execute(){
                vector<string> listFileNames;
                set<string> labels;
                string singletonName = "";
+               double saveCutoff = cutoff;
 
                //****************** file prep work ******************************//
                #ifdef USE_MPI
@@ -361,6 +362,10 @@ int ClusterSplitCommand::execute(){
                        for(int i = 1; i < processors; i++) { 
                                int num = dividedNames[i].size();
                                
+                               double tempCutoff;
+                               MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
+                               if (tempCutoff < cutoff) { cutoff = tempCutoff; }
+                               
                                //send list filenames to root process
                                for (int j = 0; j < num; j++) {  
                                        int lengthList = 0;
@@ -429,6 +434,9 @@ int ClusterSplitCommand::execute(){
                        //process them
                        listFileNames = cluster(myNames, labels);
                        
+                       //send cutoff
+                       MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
+                       
                        //send list filenames to root process
                        for (int j = 0; j < num; j++) {  
                                char tempListFileName[1024];
@@ -506,6 +514,10 @@ int ClusterSplitCommand::execute(){
                                                ifstream in2;
                                                openInputFile(filename, in2);
                                                
+                                               float tempCutoff;
+                                               in2 >> tempCutoff; gobble(in2);
+                                               if (tempCutoff < cutoff) { cutoff = tempCutoff; }
+                                               
                                                while(!in2.eof()) {
                                                        string tempName;
                                                        in2 >> tempName; gobble(in2);
@@ -521,6 +533,8 @@ int ClusterSplitCommand::execute(){
        #endif  
                if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
                
+               if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine();  }
+               
                m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
                
                //****************** merge list file and create rabund and sabund files ******************************//
@@ -592,9 +606,11 @@ map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames,
 
                        if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) {       convert(*it, temp);     }
                        else if (*it == "unique")                                                                               {       temp = -1.0;            }
-                                               
-                       orderFloat.push_back(temp);
-                       labelBin[temp] = numSingleBins; //initialize numbins 
+                       
+                       if (temp <= cutoff) {
+                               orderFloat.push_back(temp);
+                               labelBin[temp] = numSingleBins; //initialize numbins 
+                       }
                }
        
                //sort order
@@ -804,7 +820,8 @@ int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> >
                                ofstream outLabels;
                                filename = toString(getpid()) + ".temp.labels";
                                openOutputFile(filename, outLabels);
-               
+                               
+                               outLabels << cutoff << endl;
                                for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
                                        outLabels << (*it) << endl;
                                }
@@ -841,8 +858,11 @@ vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNa
                
                vector<string> listFileNames;
                
+               double smallestCutoff = cutoff;
+               
                //cluster each distance file
                for (int i = 0; i < distNames.size(); i++) {
+                       if (m->control_pressed) { return listFileNames; }
                        
                        string thisNamefile = distNames[i].begin()->second;
                        string thisDistFile = distNames[i].begin()->first;
@@ -925,7 +945,7 @@ vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNa
                                        listFileNames.clear(); return listFileNames;
                                }
                
-                               cluster->update(cutoff);
+                               cluster->update(saveCutoff);
        
                                float dist = matrix->getSmallDist();
                                float rndDist;
@@ -973,8 +993,13 @@ vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNa
                        
                        remove(thisDistFile.c_str());
                        remove(thisNamefile.c_str());
+                       
+                       if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();  }
+                       
+                       if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff;  }
                }
                
+               cutoff = smallestCutoff;
                                        
                return listFileNames;
        
index 023f2142deb8584ad3794170b95e1b1d12889529..f8f3c78e6fc68c507bcc896bf0789a2ba2d7e373 100644 (file)
@@ -171,6 +171,8 @@ int MGClusterCommand::execute(){
                
                start = time(NULL);
                oldList = *list;
+               map<string, int> Seq2Bin;
+               map<string, int> oldSeq2Bin;
                
                if (method == "furthest")               { tag = "fn";  }
                else if (method == "nearest")   { tag = "nn";  }
@@ -198,6 +200,8 @@ int MGClusterCommand::execute(){
                        else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, distMatrix, cutoff, method); }
                        else if(method == "average"){   cluster = new AverageLinkage(rabund, list, distMatrix, cutoff, method); }
                        cluster->setMapWanted(true);
+                       Seq2Bin = cluster->getSeqtoBin();
+                       oldSeq2Bin = Seq2Bin;
                        
                        if (m->control_pressed) { 
                                delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster;
@@ -231,8 +235,8 @@ int MGClusterCommand::execute(){
                                }
                                else if(rndDist != rndPreviousDist){
                                        if (merge) {
-                                               map<string, int> seq2Bin = cluster->getSeqtoBin();
-                                               ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
+                                               Seq2Bin = cluster->getSeqtoBin();
+                                               ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
                                                
                                                if (m->control_pressed) { 
                                                        delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp;
@@ -250,8 +254,10 @@ int MGClusterCommand::execute(){
                                }
                                
                                previousDist = dist;
+       cout << "prev distance = " << previousDist << " dist = " << dist << endl;
                                rndPreviousDist = rndDist;
                                oldList = *list;
+                               oldSeq2Bin = Seq2Bin;
                        }
                        
                        if(previousDist <= 0.0000){
@@ -260,8 +266,8 @@ int MGClusterCommand::execute(){
                        }
                        else if(rndPreviousDist<cutoff){
                                if (merge) {
-                                       map<string, int> seq2Bin = cluster->getSeqtoBin();
-                                       ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
+                                       Seq2Bin = cluster->getSeqtoBin();
+                                       ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
                                        
                                        if (m->control_pressed) { 
                                                        delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp;
@@ -301,6 +307,8 @@ int MGClusterCommand::execute(){
                        //create cluster
                        hcluster = new HCluster(rabund, list, method, distFile, nameMap, cutoff);
                        hcluster->setMapWanted(true);
+                       Seq2Bin = cluster->getSeqtoBin();
+                       oldSeq2Bin = Seq2Bin;
                        
                        vector<seqDist> seqs; seqs.resize(1); // to start loop
                        //ifstream inHcluster;
@@ -351,8 +359,8 @@ int MGClusterCommand::execute(){
                                                }
                                                else if((rndDist != rndPreviousDist)){
                                                        if (merge) {
-                                                               map<string, int> seq2Bin = hcluster->getSeqtoBin();
-                                                               ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
+                                                               Seq2Bin = hcluster->getSeqtoBin();
+                                                               ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
                                                                
                                                                if (m->control_pressed) { 
                                                                        delete nameMap;  delete list; delete rabund; delete hcluster; delete temp;
@@ -374,6 +382,7 @@ int MGClusterCommand::execute(){
                                                previousDist = seqs[i].dist;
                                                rndPreviousDist = rndDist;
                                                oldList = *list;
+                                               oldSeq2Bin = Seq2Bin;
                                        }
                                }
                        }
@@ -385,8 +394,8 @@ int MGClusterCommand::execute(){
                        }
                        else if(rndPreviousDist<cutoff){
                                if (merge) {
-                                       map<string, int> seq2Bin = hcluster->getSeqtoBin();
-                                       ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
+                                       Seq2Bin = hcluster->getSeqtoBin();
+                                       ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
                                        
                                        if (m->control_pressed) { 
                                                        delete nameMap; delete list; delete rabund; delete hcluster; delete temp;
@@ -406,8 +415,8 @@ int MGClusterCommand::execute(){
                        }
                        
                        delete hcluster;
-                       remove(distFile.c_str());
-                       remove(overlapFile.c_str());
+                       //remove(distFile.c_str());
+                       //remove(overlapFile.c_str());
                }
                
                delete list; 
@@ -509,15 +518,22 @@ ListVector* MGClusterCommand::mergeOPFs(map<string, int> binInfo, float dist){
                                //get names of seqs that overlap
                                string name1 = nameMap->get(overlapNode.seq1);
                                string name2 = nameMap->get(overlapNode.seq2);
-                               
+                       
                                //use binInfo to find out if they are already in the same bin
-                               int binKeep = binInfo[name1];
-                               int binRemove = binInfo[name2];
+                               map<string, int>::iterator itBin1 = binInfo.find(name1);
+                               map<string, int>::iterator itBin2 = binInfo.find(name2);
+                               
+                               if(itBin1 == binInfo.end()){  cerr << "AAError: Sequence '" << name1 << "' does not have any bin info.\n"; exit(1);  }
+                               if(itBin2 == binInfo.end()){  cerr << "ABError: Sequence '" << name2 << "' does not have any bin info.\n"; exit(1);  }
+
+                               int binKeep = itBin1->second;
+                               int binRemove = itBin2->second;
                                
                                //if not merge bins and update binInfo
                                if(binKeep != binRemove) {
+               
                                        //save names in old bin
-                                       string names = list->get(binRemove);
+                                       string names = newList->get(binRemove);
                                        
                                        //merge bins into name1s bin
                                        newList->set(binKeep, newList->get(binRemove)+','+newList->get(binKeep));
index 0c30898dd4085f5000a68c005ccbeee1bcd3650c..641be4e626fb9c52b9035a979acd95dbe654e35a 100644 (file)
@@ -87,16 +87,35 @@ void NameAssignment::print(ostream& out){
 //**********************************************************************************************************************
 
 int NameAssignment::get(string key){
-       
-       return  (*this)[key];   
-
+       try {
+               map<string, int>::iterator itGet = (*this).find(key);
+               
+               //if you can't find it
+               if (itGet == (*this).end()) { return -1; }
+               
+               return  (*this)[key];   
+       }
+       catch(exception& e) {
+               m->errorOut(e, "NameAssignment", "get");
+               exit(1);
+       }
 }
 //**********************************************************************************************************************
 
 string NameAssignment::get(int key){
+       try {
        
-       return  reverse[key];   
-
+               map<int, string>::iterator itGet = reverse.find(key);
+       
+               if (itGet == reverse.end()) { return "not found"; }
+       
+               return  reverse[key];   
+       
+       }
+       catch(exception& e) {
+               m->errorOut(e, "NameAssignment", "get");
+               exit(1);
+       }
 }
 //**********************************************************************************************************************
 
index 9979a7586f59f3827f96645439b1ae323ed8a733..4833d0125d8a286e0116be5ae0644c33f5443803 100644 (file)
@@ -169,6 +169,7 @@ int ReadBlast::read(NameAssignment* nameMap) {
                                        map<int, float>::iterator itDist;
                                        for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) {  
                                                distance = 1.0 - (it->second / refScore);
+               
                                                
                                                //do we already have the distance calculated for b->a
                                                map<string,int>::iterator itA = nameMap->find(currentRow);
@@ -176,10 +177,12 @@ int ReadBlast::read(NameAssignment* nameMap) {
                                                
                                                //if we have it then compare
                                                if (itDist != dists[it->first].end()) {
+               if (distance < 0.0) { cout << currentRow << '\t' << nameMap->get(it->first) << '\t' << "score = " << it->second << " refscore = " << refScore << " distance = " << distance << " distance = " << itDist->second << endl;        }
+
                                                        //if you want the minimum blast score ratio, then pick max distance
                                                        if(minWanted) {  distance = max(itDist->second, distance);  }
                                                        else{   distance = min(itDist->second, distance);  }
-                                                       
+
                                                        //is this distance below cutoff
                                                        if (distance < cutoff) {
                                                                if (!hclusterWanted) {