]> git.donarmstrong.com Git - mothur.git/commitdiff
made changes to concensus to find a better tree. also fixed minor bug in venn with...
authorwestcott <westcott>
Thu, 2 Jul 2009 15:11:04 +0000 (15:11 +0000)
committerwestcott <westcott>
Thu, 2 Jul 2009 15:11:04 +0000 (15:11 +0000)
Mothur.xcodeproj/project.pbxproj
chimeraseqscommand.cpp [new file with mode: 0644]
chimeraseqscommand.h [new file with mode: 0644]
consensuscommand.cpp
consensuscommand.h
filters.h [new file with mode: 0644]
filterseqscommand.cpp
filterseqscommand.h
venn.cpp

index 79629fcaf01051e48d36cf35cb6341dff509fbad..b8a10577b717fc26f6d167c0fec1589b24bbc3d4 100644 (file)
@@ -70,6 +70,7 @@
                3792948A0F2E258500B9034A /* parsimony.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 379294890F2E258500B9034A /* parsimony.cpp */; };
                3799A9500FD6A58C00E33EDE /* distancedb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 3799A94C0FD6A58C00E33EDE /* distancedb.cpp */; };
                3799A9510FD6A58C00E33EDE /* seqsummarycommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 3799A94E0FD6A58C00E33EDE /* seqsummarycommand.cpp */; };
+               379D3D510FF90E090068C1C0 /* chimeraseqscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 379D3D500FF90E090068C1C0 /* chimeraseqscommand.cpp */; };
                37AD4CE40F28AEA300AA2D49 /* sharedlistvector.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 37AD4CE30F28AEA300AA2D49 /* sharedlistvector.cpp */; };
                37AD4DBB0F28E2FE00AA2D49 /* tree.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 37AD4DBA0F28E2FE00AA2D49 /* tree.cpp */; };
                37AD4DCA0F28F3DD00AA2D49 /* readtree.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 37AD4DC90F28F3DD00AA2D49 /* readtree.cpp */; };
                3799A94D0FD6A58C00E33EDE /* distancedb.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = distancedb.hpp; sourceTree = SOURCE_ROOT; };
                3799A94E0FD6A58C00E33EDE /* seqsummarycommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = seqsummarycommand.cpp; sourceTree = SOURCE_ROOT; };
                3799A94F0FD6A58C00E33EDE /* seqsummarycommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = seqsummarycommand.h; sourceTree = SOURCE_ROOT; };
+               379D3D4F0FF90E090068C1C0 /* chimeraseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeraseqscommand.h; sourceTree = "<group>"; };
+               379D3D500FF90E090068C1C0 /* chimeraseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeraseqscommand.cpp; sourceTree = "<group>"; };
+               379D3D6B0FF917F40068C1C0 /* filters.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = filters.h; sourceTree = "<group>"; };
                37AD4CE20F28AEA300AA2D49 /* sharedlistvector.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sharedlistvector.h; sourceTree = SOURCE_ROOT; };
                37AD4CE30F28AEA300AA2D49 /* sharedlistvector.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sharedlistvector.cpp; sourceTree = SOURCE_ROOT; };
                37AD4DB90F28E2FE00AA2D49 /* tree.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = tree.h; sourceTree = SOURCE_ROOT; };
                                37C753F40FB34C0300DBD02E /* eachgapignore.h */,
                                373C69960FC1E63600137ACD /* efron.cpp */,
                                373C69970FC1E63600137ACD /* efron.h */,
+                               379D3D6B0FF917F40068C1C0 /* filters.h */,
                                EB9303F70F53517300E8EF26 /* geom.h */,
                                EB9303F80F53517300E8EF26 /* geom.cpp */,
                                378C1AF10FB0644D004D63F5 /* goodscoverage.h */,
                                37D927CB0F21331F001D4494 /* collectsharedcommand.cpp */,
                                377326640FAF16E0007ABB8B /* consensuscommand.h */,
                                377326630FAF16E0007ABB8B /* consensuscommand.cpp */,
+                               379D3D4F0FF90E090068C1C0 /* chimeraseqscommand.h */,
+                               379D3D500FF90E090068C1C0 /* chimeraseqscommand.cpp */,
                                37B28F660F27590100808A62 /* deconvolutecommand.h */,
                                37B28F670F27590100808A62 /* deconvolutecommand.cpp */,
                                37C753CC0FB3415200DBD02E /* distancecommand.h */,
                                378598740FDD4C1500EF9D03 /* heatmapsim.cpp in Sources */,
                                378599100FDD7E8E00EF9D03 /* optionparser.cpp in Sources */,
                                7E5A17AF0FE57292003C6A03 /* mergefilecommand.cpp in Sources */,
+                               379D3D510FF90E090068C1C0 /* chimeraseqscommand.cpp in Sources */,
                        );
                        runOnlyForDeploymentPostprocessing = 0;
                };
diff --git a/chimeraseqscommand.cpp b/chimeraseqscommand.cpp
new file mode 100644 (file)
index 0000000..b810b9f
--- /dev/null
@@ -0,0 +1,230 @@
+/*
+ *  chimeraseqscommand.cpp
+ *  Mothur
+ *
+ *  Created by Sarah Westcott on 6/29/09.
+ *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
+ *
+ */
+
+#include "chimeraseqscommand.h"
+
+//***************************************************************************************************************
+
+ChimeraSeqsCommand::ChimeraSeqsCommand(string option){
+       try {
+               abort = false;
+               
+               //allow user to run help
+               if(option == "help") { help(); abort = true; }
+               
+               else {
+                       //valid paramters for this command
+                       string Array[] =  {"fasta", "filter", "correction", "processors", "method" };
+                       vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
+                       
+                       OptionParser parser(option);
+                       map<string,string> parameters = parser.getParameters();
+                       
+                       ValidParameters validParameter;
+                       
+                       //check to make sure all parameters are valid for command
+                       for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
+                               if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
+                       }
+                       
+                       //check for required parameters
+                       fastafile = validParameter.validFile(parameters, "fasta", true);
+                       if (fastafile == "not open") { abort = true; }
+                       else if (fastafile == "not found") { fastafile = ""; mothurOut("fasta is a required parameter for the chimera.seqs command."); mothurOutEndLine(); abort = true;  }     
+                       
+                       string temp;
+                       temp = validParameter.validFile(parameters, "filter", false);                   if (temp == "not found") { temp = "F"; }
+                       filter = isTrue(temp);
+                       
+                       temp = validParameter.validFile(parameters, "correction", false);               if (temp == "not found") { temp = "T"; }
+                       correction = isTrue(temp);
+                       
+                       temp = validParameter.validFile(parameters, "processors", true);                if (temp == "not found") { temp = "1"; }
+                       convert(temp, processors);
+                       
+                       method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "bellerophon"; }
+
+               }
+       }
+       catch(exception& e) {
+               errorOut(e, "ChimeraSeqsCommand", "ChimeraSeqsCommand");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+
+void ChimeraSeqsCommand::help(){
+       try {
+               mothurOut("The chimera.seqs command reads a fastafile and creates a sorted priority score list of potentially chimeric sequences (ideally, the sequences should already be aligned).\n");
+               mothurOut("The chimera.seqs command parameters are fasta, filter, correction, processors and method.  fasta is required.\n");
+               mothurOut("The filter parameter allows you to specify if you would like to apply a 50% soft filter.  The default is false. \n");
+               mothurOut("The correction parameter allows you to .....  The default is true. \n");
+               mothurOut("The processors parameter allows you to specify how many processors you would like to use.  The default is 1. \n");
+               mothurOut("The method parameter allows you to specify the method for finding chimeric sequences.  The default is bellerophon. \n");
+               mothurOut("The chimera.seqs command should be in the following format: \n");
+               mothurOut("chimera.seqs(fasta=yourFastaFile, filter=yourFilter, correction=yourCorrection, processors=yourProcessors, method=bellerophon) \n");
+               mothurOut("Example: chimera.seqs(fasta=AD.align, filter=True, correction=true, processors=2, method=yourMethod) \n");
+               mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");        
+       }
+       catch(exception& e) {
+               errorOut(e, "ChimeraSeqsCommand", "help");
+               exit(1);
+       }
+}
+
+//***************************************************************************************************************
+
+ChimeraSeqsCommand::~ChimeraSeqsCommand(){     /*      do nothing      */      }
+
+//***************************************************************************************************************
+
+int ChimeraSeqsCommand::execute(){
+       try{
+               
+               if (abort == true) { return 0; }
+               
+               //do soft filter
+               if (filter)  {
+                       string optionString = "fasta=" + fastafile + ", soft=50.0, vertical=F";
+                       filterSeqs = new FilterSeqsCommand(optionString);
+                       filterSeqs->execute();
+                       delete filterSeqs;
+                       
+                       //reset fastafile to filtered file
+                       fastafile = getRootName(fastafile) + "filter.fasta";
+               }
+               
+               //read in sequences
+               readSeqs();
+               
+               //int numSeqs = seqs.size();
+               
+               //find average midpoint of seqs
+               midpoint = findAverageMidPoint();
+                               
+               //this should be parallelized
+               //generatePreferences();
+               
+                               
+               //output results to screen                                              
+               mothurOutEndLine();
+               mothurOut("\t\t"); mothurOutEndLine();
+               //mothurOut("Minimum:\t" + toString(startPosition[0]) + "\t" + toString(endPosition[0]) + "\t" + toString(seqLength[0]) + "\t" + toString(ambigBases[0]) + "\t" + toString(longHomoPolymer[0])); mothurOutEndLine();
+               //mothurOut("2.5%-tile:\t" + toString(startPosition[ptile0_25]) + "\t" + toString(endPosition[ptile0_25]) + "\t" + toString(seqLength[ptile0_25]) + "\t" + toString(ambigBases[ptile0_25]) + "\t"+ toString(longHomoPolymer[ptile0_25])); mothurOutEndLine();
+               //mothurOut("25%-tile:\t" + toString(startPosition[ptile25]) + "\t" + toString(endPosition[ptile25]) + "\t" + toString(seqLength[ptile25]) + "\t" + toString(ambigBases[ptile25]) + "\t" + toString(longHomoPolymer[ptile25])); mothurOutEndLine();
+               //mothurOut("Median: \t" + toString(startPosition[ptile50]) + "\t" + toString(endPosition[ptile50]) + "\t" + toString(seqLength[ptile50]) + "\t" + toString(ambigBases[ptile50]) + "\t" + toString(longHomoPolymer[ptile50])); mothurOutEndLine();
+               //mothurOut("75%-tile:\t" + toString(startPosition[ptile75]) + "\t" + toString(endPosition[ptile75]) + "\t" + toString(seqLength[ptile75]) + "\t" + toString(ambigBases[ptile75]) + "\t" + toString(longHomoPolymer[ptile75])); mothurOutEndLine();
+               //mothurOut("97.5%-tile:\t" + toString(startPosition[ptile97_5]) + "\t" + toString(endPosition[ptile97_5]) + "\t" + toString(seqLength[ptile97_5]) + "\t" + toString(ambigBases[ptile97_5]) + "\t" + toString(longHomoPolymer[ptile97_5])); mothurOutEndLine();
+               //mothurOut("Maximum:\t" + toString(startPosition[ptile100]) + "\t" + toString(endPosition[ptile100]) + "\t" + toString(seqLength[ptile100]) + "\t" + toString(ambigBases[ptile100]) + "\t" + toString(longHomoPolymer[ptile100])); mothurOutEndLine();
+               //mothurOut("# of Seqs:\t" + toString(numSeqs)); mothurOutEndLine();
+               
+               //outSummary.close();
+               return 0;
+       }
+       catch(exception& e) {
+               errorOut(e, "ChimeraSeqsCommand", "execute");
+               exit(1);
+       }
+}
+
+//***************************************************************************************************************
+void ChimeraSeqsCommand::readSeqs(){
+       try {
+               ifstream inFASTA;
+               openInputFile(fastafile, inFASTA);
+               
+               //read in seqs and store in vector
+               while(!inFASTA.eof()){
+                       Sequence current(inFASTA);
+                       
+                       seqs.push_back(current);
+                       
+                       gobble(inFASTA);
+               }
+               inFASTA.close();
+
+       }
+       catch(exception& e) {
+               errorOut(e, "ChimeraSeqsCommand", "readSeqs");
+               exit(1);
+       }
+}
+
+
+//***************************************************************************************************************
+int ChimeraSeqsCommand::findAverageMidPoint(){
+       try {
+               int totalMids = 0;
+               int averageMid = 0;
+               
+               //loop through the seqs and find midpoint
+               for (int i = 0; i < seqs.size(); i++) {
+                       
+                       //get unaligned sequence
+                       seqs[i].setUnaligned(seqs[i].getUnaligned());  //if you read an aligned file the unaligned is really aligned, so we need to make sure its unaligned
+                       
+                       string unaligned = seqs[i].getUnaligned();
+                       string aligned = seqs[i].getAligned();
+                       
+                       //find midpoint of this seq
+                       int count = 0;
+                       int thismid = 0;
+                       for (int j = 0; j < aligned.length(); j++) {
+                               
+                               thismid++;
+                               
+                               //if you are part of the unaligned sequence increment
+                               if (isalpha(aligned[j])) {  count++;  }
+                               
+                               //if you have reached the halfway point stop
+                               if (count >= (unaligned.length() / 2)) { break; }
+                       }
+                       
+                       //add this mid to total
+                       totalMids += thismid;
+               
+               }
+               
+               averageMid = (totalMids / seqs.size());
+               
+               return averageMid; 
+       
+       
+       }
+       catch(exception& e) {
+               errorOut(e, "ChimeraSeqsCommand", "findAverageMidPoint");
+               exit(1);
+       }
+}
+
+/***************************************************************************************************************
+int ChimeraSeqsCommand::createSparseMatrix(int startLine, int endLine, SparseMatrix* sparse){
+       try {
+
+               for(int i=startLine; i<endLine; i++){
+                       
+                       for(int j=0;j<i;j++){
+                       
+                               distCalculator->calcDist(seqs.get(i), seqs.get(j));
+                               double dist = distCalculator->getDist();
+                               
+                               
+                               
+                       }
+                       
+       
+               return 1;
+       }
+       catch(exception& e) {
+               errorOut(e, "ChimeraSeqsCommand", "createSparseMatrix");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+
diff --git a/chimeraseqscommand.h b/chimeraseqscommand.h
new file mode 100644 (file)
index 0000000..455ac92
--- /dev/null
@@ -0,0 +1,57 @@
+#ifndef CHIMERACOMMAND_H
+#define CHIMERACOMMAND_H
+
+/*
+ *  chimeraseqscommand.h
+ *  Mothur
+ *
+ *  Created by Sarah Westcott on 6/29/09.
+ *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
+ *
+ */
+
+#include "mothur.h"
+#include "command.hpp"
+#include "filterseqscommand.h"
+#include "sequence.hpp"
+
+
+
+/***********************************************************/
+
+class ChimeraSeqsCommand : public Command {
+public:
+       ChimeraSeqsCommand(string);
+       ~ChimeraSeqsCommand();
+       int execute();
+       void help();
+       
+private:
+       //Dist* distCalculator;
+       
+       struct Preference {
+               string leftParent;
+               string rightParent;
+               float score;
+
+       };
+
+
+       bool abort;
+       string method, fastafile;
+       bool filter, correction;
+       int processors, midpoint;
+       FilterSeqsCommand* filterSeqs;
+       vector<Sequence> seqs;
+       vector<Preference> pref;
+       
+       int findAverageMidPoint();
+       void readSeqs();
+       
+
+};
+
+/***********************************************************/
+
+#endif
+
index 9a90219d14c3809a687e053851440971eecb2e9b..8c34068bba76f4347d8783b35c74af69979921ea 100644 (file)
@@ -17,18 +17,9 @@ ConcensusCommand::ConcensusCommand(string fileroot){
                abort = false;
                
                filename = fileroot;
-               //allow user to run help
-               //if(option == "help") { help(); abort = true; }
                
-               //else {
-                       //if (option != "") { mothurOut("There are no valid parameters for the consensus command."); mothurOutEndLine(); abort = true; }
-                       
-               //      //no trees were read
-               //      if (globaldata->gTree.size() == 0) {  mothurOut("You must execute the read.tree command, before you may use the consensus command."); mothurOutEndLine(); abort = true;  }
-                       //else { 
-                        t = globaldata->gTree;
-                        //     }
-               //}
+               t = globaldata->gTree;
+       
        }
        catch(exception& e) {
                errorOut(e, "ConcensusCommand", "ConcensusCommand");
@@ -97,9 +88,10 @@ int ConcensusCommand::execute(){
                out2 << "Species in Order: " << endl << endl;
                for (int i = 0; i < treeSet.size(); i++) {  out2 << i+1 << ".  " << treeSet[i] << endl; }
                
-               vector<string> temp; 
                //output sets included
                out2 << endl << "Sets included in the consensus tree:" << endl << endl;
+               
+               vector<string> temp;
                for (it2 = nodePairsInTree.begin(); it2 != nodePairsInTree.end(); it2++) {
                        //only output pairs not leaves
                        if (it2->first.size() > 1) { 
@@ -112,6 +104,7 @@ int ConcensusCommand::execute(){
                                        //find spot 
                                        int index = findSpot(it2->first[i]);
                                        temp[index] = "*";
+                                       //temp[index] = it2->first[i] + "  ";
                                }
                                
                                //output temp
@@ -173,8 +166,8 @@ int ConcensusCommand::buildConcensusTree(vector<string> nodeSet) {
                //terminate recursion
                }else if (count == numNodes) { return 0; }
                else {
-                       leftChildSet = getNextAvailableSet(nodeSet);
-                       rightChildSet = getRestSet(nodeSet, leftChildSet);
+                       //finds best child pair
+                       leftChildSet = getNextAvailableSet(nodeSet, rightChildSet);
                        int left = buildConcensusTree(leftChildSet);
                        int right = buildConcensusTree(rightChildSet);
                        consensusTree->tree[count].setChildren(left, right);
@@ -221,6 +214,7 @@ void ConcensusCommand::getSets() {
                        }
                }
                
+               
                //add each leaf to terminate recursion in consensus
                //you want the leaves in there but with insignifigant sightings value so it is added last
                //for each leaf node get descendant info.
@@ -238,33 +232,70 @@ void ConcensusCommand::getSets() {
                }
 
                sort(treeSet.begin(), treeSet.end());
+               
+               
+               map< vector<string>, int> nodePairsCopy = nodePairs;
+               
+               
+               //set initial rating on pairs to sightings + subgroup sightings
+               while (nodePairsCopy.size() != 0) {
+               
+                       vector<string> small = getSmallest(nodePairsCopy);
+                       
+                       int subgrouprate = getSubgroupRating(small);
+               
+                       nodePairsInitialRate[small] = nodePairs[small] + subgrouprate;
+                       
+                       nodePairsCopy.erase(small);
+               }
+               
        }
        catch(exception& e) {
                errorOut(e, "ConcensusCommand", "getSets");
                exit(1);
        }
 }
+//**********************************************************************************************************************
+vector<string> ConcensusCommand::getSmallest(map< vector<string>, int> nodes) {
+       try{
+               vector<string> smallest = nodes.begin()->first;
+               int smallsize = smallest.size();
+               
+               for(it2 = nodes.begin(); it2 != nodes.end(); it2++) {
+                       if(it2->first.size() < smallsize) { smallsize = it2->first.size();  smallest = it2->first;  }
+               }
+               
+               return smallest;
+       }
+       catch(exception& e) {
+               errorOut(e, "ConcensusCommand", "getSmallest");
+               exit(1);
+       }
+}
 
 //**********************************************************************************************************************
-vector<string> ConcensusCommand::getNextAvailableSet(vector<string> bigset) {
+vector<string> ConcensusCommand::getNextAvailableSet(vector<string> bigset, vector<string>& rest) {
        try {
+//cout << "new call " << endl << endl << endl;
                vector<string> largest; largest.clear();
-               int largestSighting = -1;
-               
-               //go through the sets
-               for (it2 = nodePairs.begin(); it2 != nodePairs.end(); it2++) {
-                       //are you a subset of bigset
-                       if (isSubset(bigset, it2->first) == true) {
-                       
-                               //are you the largest. if you are the same size as current largest refer to sighting
-                               if (it2->first.size() > largest.size()) { largest = it2->first;  largestSighting = it2->second; }
-                               else if (it2->first.size() == largest.size()) {
-                                       if (it2->second > largestSighting) { largest = it2->first;  largestSighting = it2->second; }
-                               }
-                               
-                       }
+               rest.clear();
+               
+               //if you are just 2 groups
+               if (bigset.size() == 2)  {   
+                       rest.push_back(bigset[0]);
+                       largest.push_back(bigset[1]);
+               }else{
+                       rest = bestSplit[bigset][0];
+                       largest = bestSplit[bigset][1];
                }
                
+               
+               //save for printing out later and for branch lengths
+               nodePairsInTree[rest] = nodePairs[rest];
+               
+               //delete whatever set you return because it is no longer available
+               nodePairs.erase(rest);
+
                //save for printing out later and for branch lengths
                nodePairsInTree[largest] = nodePairs[largest];
                
@@ -280,6 +311,73 @@ vector<string> ConcensusCommand::getNextAvailableSet(vector<string> bigset) {
        }
 }
 
+/**********************************************************************************************************************/
+int ConcensusCommand::getSubgroupRating(vector<string> group) {
+       try {
+               map< vector<string>, int>::iterator ittemp;
+               map< vector< vector<string> > , int >::iterator it3;
+               int rate = 0;
+               
+               // ***********************************************************************************//
+               //1. this function must be called passing it littlest sets to biggest 
+               //              since it the rating is made from your sighting plus you best splits rating
+               //2. it saves the top pair to use later
+               // ***********************************************************************************//
+
+               
+               if (group.size() < 3) {  return rate;  }
+               
+               map< vector<string>, int> possiblePairing;  //this is all the subsets of group
+               
+               //go through the sets
+               for (it2 = nodePairs.begin(); it2 != nodePairs.end(); it2++) {
+                       //are you a subset of bigset, then save in possiblePairings
+                       if (isSubset(group, it2->first) == true) {  possiblePairing[it2->first] = it2->second;  }
+               }               
+       
+               map< vector< vector<string> > , int > rating;
+               
+               while (possiblePairing.size() != 0) {
+               
+                       it2 = possiblePairing.begin(); 
+                       vector<string> temprest = getRestSet(group, it2->first);
+                       
+                       //is the rest a set available in possiblePairings
+                       ittemp = possiblePairing.find(temprest);
+                       if (ittemp != possiblePairing.end()) {  //if the rest is in the possible pairings then add this pair to rating map
+                               vector< vector<string> > temprate;
+                               temprate.push_back(it2->first);  temprate.push_back(temprest);
+                               
+                               rating[temprate] = (nodePairsInitialRate[it2->first] + nodePairsInitialRate[temprest]);
+                               
+                               //erase so you dont add 1,2 and 2,1.
+                               possiblePairing.erase(temprest);
+                       }
+                       
+                       possiblePairing.erase(it2);
+               }
+
+
+               it3 = rating.begin();
+               rate = it3->second;
+               vector< vector<string> > topPair = it3->first;
+               
+               //choose the split with the best rating
+               for (it3 = rating.begin(); it3 != rating.end(); it3++) {
+                       
+                       if (it3->second > rate) {  rate = it3->second;  topPair = it3->first;  }
+               }
+               
+               bestSplit[group] = topPair;
+               
+               return rate;
+       }
+       catch(exception& e) {
+               errorOut(e, "ConcensusCommand", "getSubgroupRating");
+               exit(1);
+       }
+}
+
 //**********************************************************************************************************************
 vector<string> ConcensusCommand::getRestSet(vector<string> bigset, vector<string> subset) {
        try {
@@ -294,12 +392,6 @@ vector<string> ConcensusCommand::getRestSet(vector<string> bigset, vector<string
                        //its not in the subset so put it in the rest
                        if (inSubset == false) { rest.push_back(bigset[i]); }
                }
-               
-               //save for printing out later and for branch lengths
-               nodePairsInTree[rest] = nodePairs[rest];
-               
-               //delete whatever set you return because it is no longer available
-               nodePairs.erase(rest);
 
                return rest;
        
@@ -314,6 +406,9 @@ vector<string> ConcensusCommand::getRestSet(vector<string> bigset, vector<string
 bool ConcensusCommand::isSubset(vector<string> bigset, vector<string> subset) {
        try {
                
+       
+               if (subset.size() > bigset.size()) { return false;  }
+               
                //check if each guy in suset is also in bigset
                for (int i = 0; i < subset.size(); i++) {
                        bool match = false;
index 53b8094b440be3e757bff24209ca97022a7433df..145bea5bdaf24410229cb42e889e7f02cacdd9c8 100644 (file)
@@ -35,6 +35,8 @@ private:
                                                                                // ie. combos FI and EGK would create nodePairs[vector containing F and I] = 1; nodePairs[vector containing E, G and K] = 1
                                                                                // if you saw the combo FI again in another tree you would then update nodePairs[vector containing F and I] = 2;
                                                                                // requires vectors to be sorted to find key.
+       map< vector<string>, vector< vector<string> > > bestSplit;  //maps a group to its best split
+       map< vector<string>, int > nodePairsInitialRate;
        map< vector<string>, int > nodePairsInTree;
        map<string, int>::iterator it;
        map< vector<string>, int>::iterator it2;
@@ -43,7 +45,9 @@ private:
        int numNodes, numLeaves, count;  //count is the next available spot in the tree vector
                                                                                
        void getSets();
-       vector<string> getNextAvailableSet(vector<string>);  //gets next largest and highest rated set that is a subset of the set passed in.
+       int getSubgroupRating(vector<string>);
+       vector<string> getSmallest(map< vector<string>, int>);
+       vector<string> getNextAvailableSet(vector<string>, vector<string>&);  
        vector<string> getRestSet(vector<string>, vector<string>);
        bool isSubset(vector<string>, vector<string>); 
        int findSpot(string); 
diff --git a/filters.h b/filters.h
new file mode 100644 (file)
index 0000000..4c39607
--- /dev/null
+++ b/filters.h
@@ -0,0 +1,102 @@
+#ifndef FILTERS_H
+#define FILTERS_H
+
+/*
+ *  filters.h
+ *  Mothur
+ *
+ *  Created by Sarah Westcott on 6/29/09.
+ *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
+ *
+ */
+
+#include "mothur.h"
+#include "sequence.hpp"
+
+
+/***********************************************************************/
+
+class Filters {
+
+public:
+       Filters() {};
+       ~Filters(){};
+               
+       string getFilter()                      {       return filter;          }
+       void setFilter(string s)        {  filter = s;                  }
+       void setLength(int l)           { alignmentLength = l;  }
+       void setSoft(float s)           {               soft = s;               }
+       void setTrump(float t)          {               trump = t;              }
+       void setNumSeqs(int num)        {       numSeqs = num;          }
+       
+       void initialize() {
+               a.assign(alignmentLength, 0);
+               t.assign(alignmentLength, 0);
+               g.assign(alignmentLength, 0);
+               c.assign(alignmentLength, 0);
+               gap.assign(alignmentLength, 0);
+       }
+
+       void doSoft() { 
+               int threshold = int (soft * numSeqs);
+       
+               for(int i=0;i<alignmentLength;i++){
+                       if(a[i] < threshold && t[i] < threshold && g[i] < threshold && c[i] < threshold){       filter[i] = 0;  }
+               }
+       }
+       
+       void doVertical() {
+
+               for(int i=0;i<alignmentLength;i++){
+                       if(gap[i] == numSeqs)   {       filter[i] = '0';        }
+               }
+       
+       }
+       
+       void doTrump(Sequence seq) {
+       
+               string curAligned = seq.getAligned();
+
+               for(int j = 0; j < alignmentLength; j++) {
+                       if(curAligned[j] == trump){
+                               filter[j] = '0';
+                       }
+               }
+
+       }
+
+       void doHard(string hard) {
+               ifstream fileHandle;
+               openInputFile(hard, fileHandle);
+       
+               fileHandle >> filter;
+       
+               fileHandle.close();
+       }
+
+       void getFreqs(Sequence seq) {
+       
+               string curAligned = seq.getAligned();
+       
+               for(int j=0;j<alignmentLength;j++){
+                       if(toupper(curAligned[j]) == 'A')                                                                               {       a[j]++;         }
+                       else if(toupper(curAligned[j]) == 'T' || toupper(curAligned[j]) == 'U') {       t[j]++;         }
+                       else if(toupper(curAligned[j]) == 'G')                                                                  {       g[j]++;         }
+                       else if(toupper(curAligned[j]) == 'C')                                                                  {       c[j]++;         }
+                       else if(curAligned[j] == '-' || curAligned[j] == '.')                                   {       gap[j]++;       }
+               }
+       }
+               
+protected:
+       string filter;
+       vector<int> a, t, g, c, gap;
+       int alignmentLength, numSeqs;
+       float soft;
+       char trump;
+
+};
+
+/***********************************************************************/
+
+#endif
+
index ad491aa1252ccb28553b12daf0f353533428d531..68450fb15a7b140803aab2d54d8c5bc84e7ce688 100644 (file)
@@ -56,6 +56,13 @@ FilterSeqsCommand::FilterSeqsCommand(string option){
                        
                        numSeqs = 0;
                        
+                       if (abort == false) {
+                       
+                               if (soft != 0)                  {  F.setSoft(soft);             }
+                               if (trump != '*')               {  F.setTrump(trump);   }
+                       
+                       }
+                                               
                }
                
        }
@@ -90,70 +97,6 @@ void FilterSeqsCommand::help(){
 
 /**************************************************************************************/
 
-void FilterSeqsCommand::doHard() {
-       
-       ifstream fileHandle;
-       openInputFile(hard, fileHandle);
-       
-       fileHandle >> filter;
-       
-       fileHandle.close();
-
-}
-
-/**************************************************************************************/
-
-void FilterSeqsCommand::doTrump(Sequence seq) {
-       
-       string curAligned = seq.getAligned();
-
-       for(int j = 0; j < alignmentLength; j++) {
-               if(curAligned[j] == trump){
-                       filter[j] = '0';
-               }
-       }
-
-}
-
-/**************************************************************************************/
-
-void FilterSeqsCommand::doVertical() {
-
-       for(int i=0;i<alignmentLength;i++){
-               if(gap[i] == numSeqs)   {       filter[i] = '0';        }
-       }
-       
-}
-
-/**************************************************************************************/
-
-void FilterSeqsCommand::doSoft() {
-       
-       int threshold = int (soft * numSeqs);
-       
-       for(int i=0;i<alignmentLength;i++){
-               if(a[i] < threshold && t[i] < threshold && g[i] < threshold && c[i] < threshold){       filter[i] = 0;  }
-       }
-}
-
-/**************************************************************************************/
-
-void FilterSeqsCommand::getFreqs(Sequence seq) {
-
-       string curAligned = seq.getAligned();;
-       
-       for(int j=0;j<alignmentLength;j++){
-               if(toupper(curAligned[j]) == 'A')                                                                               {       a[j]++;         }
-               else if(toupper(curAligned[j]) == 'T' || toupper(curAligned[j]) == 'U') {       t[j]++;         }
-               else if(toupper(curAligned[j]) == 'G')                                                                  {       g[j]++;         }
-               else if(toupper(curAligned[j]) == 'C')                                                                  {       c[j]++;         }
-               else if(curAligned[j] == '-' || curAligned[j] == '.')                                   {       gap[j]++;       }
-       }
-       
-}
-
-/**************************************************************************************/
-
 int FilterSeqsCommand::execute() {     
        try {
        
@@ -166,31 +109,33 @@ int FilterSeqsCommand::execute() {
                alignmentLength = testSeq.getAlignLength();
                inFASTA.seekg(0);
                
+               F.setLength(alignmentLength);
+               
                if(soft != 0 || isTrue(vertical)){
-                       a.assign(alignmentLength, 0);
-                       t.assign(alignmentLength, 0);
-                       g.assign(alignmentLength, 0);
-                       c.assign(alignmentLength, 0);
-                       gap.assign(alignmentLength, 0);
+                       F.initialize();
                }
                
-               if(hard.compare("") != 0)       {       doHard();               }
-               else                                            {       filter = string(alignmentLength, '1');  }
+               if(hard.compare("") != 0)       {       F.doHard(hard);         }
+               else                                            {       F.setFilter(string(alignmentLength, '1'));      }
 
                if(trump != '*' || isTrue(vertical) || soft != 0){
                        while(!inFASTA.eof()){  //read through and create the filter...
                                Sequence seq(inFASTA);
-                               if(trump != '*'){       doTrump(seq);   }
-                               if(isTrue(vertical) || soft != 0){      getFreqs(seq);  }
+                               if(trump != '*'){       F.doTrump(seq); }
+                               if(isTrue(vertical) || soft != 0){      F.getFreqs(seq);        }
                                numSeqs++;
                                cout.flush();
                        }
                
                }
                inFASTA.close();
+               F.setNumSeqs(numSeqs);
+               
+               
+               if(isTrue(vertical) == 1)       {       F.doVertical(); }
+               if(soft != 0)                           {       F.doSoft();             }
                
-               if(isTrue(vertical) == 1)       {       doVertical();   }
-               if(soft != 0)   {       doSoft();               }                       
+               filter = F.getFilter();
 
                ofstream outFilter;
                string filterFile = getRootName(fastafile) + "filter";
index cfa56b39df6a4001014e1bc42048bed50bede8fa..611cbbc1b226c13d859273dc0cb356bb3b3d13d5 100644 (file)
@@ -11,6 +11,7 @@
  */
 
 #include "command.hpp"
+#include "filters.h"
 
 class Sequence;
 class FilterSeqsCommand : public Command {
@@ -22,11 +23,6 @@ public:
        void help();
        
 private:
-       void doHard();
-       void doTrump(Sequence);
-       void doVertical();
-       void doSoft();
-       void getFreqs(Sequence);
        string vertical, filter, fastafile, hard;       
        int alignmentLength;
 
@@ -35,6 +31,8 @@ private:
        float soft;
        int numSeqs;
        
+       Filters F;
+               
        vector<int> a, t, g, c, gap;
 
 };
index 2c40c25ed7fadfcc50958ba1a1c87e124a939cf6..217dd3594081703b6cf7a28857cd553b62bcfaec 100644 (file)
--- a/venn.cpp
+++ b/venn.cpp
@@ -380,78 +380,105 @@ void Venn::getPic(vector<SharedRAbundVector*> lookup, vector<Calculator*> vCalcs
                                        //get estimates for numA
                                        data = singleCalc->getValues(sabundA);
                                        numA = data[0];
+       //cout << "num a = " << numA << endl;   
                        
                                        //get estimates for numB
                                        data = singleCalc->getValues(sabundB);
                                        numB = data[0];
-                               
+       //cout << "num b = " << numB << endl;                           
                                        //get estimates for numC
                                        data = singleCalc->getValues(sabundC);
                                        numC = data[0];
-                               
+       //cout << "num c = " << numC << endl;                           
                                        //get estimates for numD
                                        data = singleCalc->getValues(sabundD);
                                        numD = data[0];
-
+//cout << "num d = " << numD << endl;  
 
                                        //get estimates for pairs
                                        subset.clear();
                                        subset.push_back(lookup[0]); subset.push_back(lookup[1]);
                                        data = vCalcs[i]->getValues(subset);
                                        sharedAB = data[0];
-                               
+       //cout << "num ab = " << sharedAB << endl;                      
                                        subset.clear();
                                        subset.push_back(lookup[0]); subset.push_back(lookup[2]);
                                        data = vCalcs[i]->getValues(subset);
                                        sharedAC = data[0];
-                               
+       //cout << "num ac = " << sharedAC << endl;                              
                                        subset.clear();
                                        subset.push_back(lookup[0]); subset.push_back(lookup[3]);
                                        data = vCalcs[i]->getValues(subset);
                                        sharedAD = data[0];
-                               
+       //cout << "num ad = " << sharedAD << endl;                      
                                        subset.clear();
                                        subset.push_back(lookup[1]); subset.push_back(lookup[2]);
                                        data = vCalcs[i]->getValues(subset);
                                        sharedBC = data[0];
-                               
+       //cout << "num bc = " << sharedBC << endl;                              
                                        subset.clear();
                                        subset.push_back(lookup[1]); subset.push_back(lookup[3]);
                                        data = vCalcs[i]->getValues(subset);
                                        sharedBD = data[0];
-                               
+               //cout << "num bd = " << sharedBD << endl;                                              
                                        subset.clear();
                                        subset.push_back(lookup[2]); subset.push_back(lookup[3]);
                                        data = vCalcs[i]->getValues(subset);
                                        sharedCD = data[0];
-                               
-                               
+                                               
+       //cout << "num cd = " << sharedCD << endl;                              
                                        //get estimates for combos of 3
                                        subset.clear();
                                        subset.push_back(lookup[0]); subset.push_back(lookup[1]); subset.push_back(lookup[2]);
                                        data = vCalcs[i]->getValues(subset);
                                        sharedABC = data[0];
-                               
+               //cout << "num abc = " << sharedABC << endl;                                    
                                        subset.clear();
                                        subset.push_back(lookup[0]); subset.push_back(lookup[2]); subset.push_back(lookup[3]);
                                        data = vCalcs[i]->getValues(subset);
                                        sharedACD = data[0];
-                               
+                       //cout << "num acd = " << sharedACD << endl;    
                                        subset.clear();
                                        subset.push_back(lookup[1]); subset.push_back(lookup[2]); subset.push_back(lookup[3]);
                                        data = vCalcs[i]->getValues(subset);
                                        sharedBCD = data[0];
-                               
+               //cout << "num bcd = " << sharedBCD << endl;            
                                        subset.clear();
                                        subset.push_back(lookup[0]); subset.push_back(lookup[1]); subset.push_back(lookup[3]);
                                        data = vCalcs[i]->getValues(subset);
                                        sharedABD = data[0];
-
+//cout << "num abd = " << sharedABD << endl;
                                        //get estimate for all four
                                        data = vCalcs[i]->getValues(lookup);
                                        sharedABCD = data[0];
-
-               
+               //cout << "num abcd = " << sharedABCD << endl << endl;                  
+                                       //make adjustments
+                                       sharedABC = sharedABC - sharedABCD;
+                       //cout << "num abc = " << sharedABC << endl;            
+                                       sharedABD = sharedABD - sharedABCD;
+                               //cout << "num abd = " << sharedABD << endl;
+                                       sharedACD = sharedACD - sharedABCD;
+                               //cout << "num acd = " << sharedACD << endl;
+                                       sharedBCD = sharedBCD - sharedABCD;
+                               //cout << "num bcd = " << sharedBCD << endl;
+                                       
+                                       sharedAB = sharedAB - sharedABC - sharedABCD - sharedABD; // cout << "num ab = " << sharedAB << endl;
+                                       sharedAC = sharedAC - sharedABC - sharedABCD - sharedACD; // cout << "num ac = " << sharedAC << endl;
+                                       sharedAD = sharedAD - sharedABD - sharedABCD - sharedACD; // cout << "num ad = " << sharedAD << endl;
+                                       
+                                       sharedBC = sharedBC - sharedABC - sharedABCD - sharedBCD; // cout << "num bc = " << sharedBC << endl;
+                                       sharedBD = sharedBD - sharedABD - sharedABCD - sharedBCD; // cout << "num bd = " << sharedBD << endl; 
+                                       sharedCD = sharedCD - sharedACD - sharedABCD - sharedBCD; // cout << "num cd = " << sharedCD << endl;
+                                       
+                                       numA = numA - sharedAB - sharedAC - sharedAD - sharedABCD - sharedABC - sharedACD - sharedABD;
+                       //cout << "num a = " << numA << endl;           
+                                       numB = numB - sharedAB - sharedBC - sharedBD - sharedABCD - sharedABC - sharedABD - sharedBCD;
+                       //cout << "num b = " << numB << endl;           
+                                       numC = numC - sharedAC - sharedBC - sharedCD - sharedABCD - sharedABC - sharedACD - sharedBCD;
+                       //cout << "num c = " << numC << endl;           
+                                       numD = numD - sharedAD - sharedBD - sharedCD - sharedABCD - sharedBCD - sharedACD - sharedABD;
+                       //cout << "num d = " << numD << endl;           
+                                       
                                        //image window
                                        outsvg << "<svg width=\"100%\" height=\"100%\" viewBox=\"0 0 700 700\" >\n";
                                        outsvg << "<g>\n";
@@ -478,8 +505,8 @@ void Venn::getPic(vector<SharedRAbundVector*> lookup, vector<Calculator*> vCalcs
                                        outsvg << "<text fill=\"black\" class=\"seri\" x=\"" + toString(215 - ((int)toString(sharedBC).length() / 2)) + "\" y=\"190\">" + toString(sharedBC) + "</text>\n";  
                                        outsvg << "<text fill=\"black\" class=\"seri\"  x=\"" + toString(150 - ((int)toString(numD).length() / 2)) + "\"   y=\"230\">" + toString(numD) + "</text>\n";  
                                        outsvg << "<text fill=\"black\" class=\"seri\"  x=\"" + toString(150 - ((int)lookup[3]->getGroup().length() / 2)) + "\"   y=\"210\">" + lookup[3]->getGroup() + "</text>\n"; 
-                                       outsvg << "<text fill=\"black\" class=\"seri\" x=\"" + toString(240 - ((int)toString(sharedBC).length() / 2)) + "\" y=\"325\">" + toString(sharedAD) + "</text>\n"; 
-                                       outsvg << "<text fill=\"black\" class=\"seri\" x=\"" + toString(470 - ((int)toString(sharedBC).length() / 2)) + "\" y=\"325\">" + toString(sharedBD) + "</text>\n";
+                                       outsvg << "<text fill=\"black\" class=\"seri\" x=\"" + toString(240 - ((int)toString(sharedAD).length() / 2)) + "\" y=\"325\">" + toString(sharedAD) + "</text>\n"; 
+                                       outsvg << "<text fill=\"black\" class=\"seri\" x=\"" + toString(470 - ((int)toString(sharedBD).length() / 2)) + "\" y=\"325\">" + toString(sharedBD) + "</text>\n";
                                        outsvg << "<text fill=\"black\" class=\"seri\" x=\"" + toString(350 - ((int)toString(sharedCD).length() / 2)) + "\" y=\"430\">" + toString(sharedCD) + "</text>\n"; 
                                        outsvg << "<text fill=\"black\" class=\"seri\" x=\"" + toString(275 - ((int)toString(sharedABD).length() / 2)) + "\" y=\"240\">" + toString(sharedABD) + "</text>\n"; 
                                        outsvg << "<text fill=\"black\" class=\"seri\" x=\"" + toString(400 - ((int)toString(sharedBCD).length() / 2)) + "\" y=\"360\">" + toString(sharedBCD) + "</text>\n";