]> git.donarmstrong.com Git - mothur.git/commitdiff
pat's mods to morisitahorn and pre.cluster
authorpschloss <pschloss>
Tue, 19 Jan 2010 14:20:22 +0000 (14:20 +0000)
committerpschloss <pschloss>
Tue, 19 Jan 2010 14:20:22 +0000 (14:20 +0000)
Mothur.xcodeproj/project.pbxproj
classifyseqscommand.cpp
fileoutput.cpp
heatmapsim.cpp
preclustercommand.cpp
preclustercommand.h
screenseqscommand.cpp
sharedmorisitahorn.cpp
trimseqscommand.cpp

index 7f8568e03a0f9bdbd282ba3a87f38107359e44dc..8e21f64a61dbedfb71cf3a279161398b0fe6bfc9 100644 (file)
                A7D86C7C10FE09AB00865661 /* formatcolumn.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = formatcolumn.cpp; sourceTree = SOURCE_ROOT; };
                A7D86C8A10FE09FE00865661 /* formatphylip.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = formatphylip.h; sourceTree = SOURCE_ROOT; };
                A7D86C8B10FE09FE00865661 /* formatphylip.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = formatphylip.cpp; sourceTree = SOURCE_ROOT; };
-               A7E0AAB410D2886D00CF407B /* mgclustercommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mgclustercommand.h; sourceTree = "<group>"; };
-               A7E0AAB510D2886D00CF407B /* mgclustercommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mgclustercommand.cpp; sourceTree = "<group>"; };
+               A7E0AAB410D2886D00CF407B /* mgclustercommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mgclustercommand.h; sourceTree = SOURCE_ROOT; };
+               A7E0AAB510D2886D00CF407B /* mgclustercommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mgclustercommand.cpp; sourceTree = SOURCE_ROOT; };
                A7E4A781106913F900688F62 /* getsharedotucommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getsharedotucommand.h; sourceTree = SOURCE_ROOT; };
                A7E4A782106913F900688F62 /* getsharedotucommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getsharedotucommand.cpp; sourceTree = SOURCE_ROOT; };
                A7E4A822106A9AD700688F62 /* maligner.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = maligner.h; sourceTree = SOURCE_ROOT; };
                1DEB923608733DC60010E9CD /* Debug */ = {
                        isa = XCBuildConfiguration;
                        buildSettings = {
+                               ARCHS = "$(ARCHS_STANDARD_64_BIT_PRE_XCODE_3_1)";
+                               ARCHS_STANDARD_64_BIT_PRE_XCODE_3_1 = x86_64;
                                GCC_OPTIMIZATION_LEVEL = 3;
                                GCC_WARN_ABOUT_MISSING_NEWLINE = YES;
                                GCC_WARN_ABOUT_RETURN_TYPE = YES;
                1DEB923708733DC60010E9CD /* Release */ = {
                        isa = XCBuildConfiguration;
                        buildSettings = {
-                               ARCHS = (
-                                       ppc,
-                                       i386,
-                               );
+                               ARCHS = "$(ARCHS_STANDARD_64_BIT_PRE_XCODE_3_1)";
+                               ARCHS_STANDARD_64_BIT_PRE_XCODE_3_1 = x86_64;
                                GCC_OPTIMIZATION_LEVEL = 3;
                                GCC_WARN_ABOUT_MISSING_NEWLINE = YES;
                                GCC_WARN_ABOUT_MISSING_PROTOTYPES = YES;
index 39dda9ffd5fb29bb872ce4ccb205882d69926c66..3e4f6ad3b36f4a137ceef8cb774b905b238c2f20 100644 (file)
@@ -82,6 +82,7 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option){
                        
                        namefile = validParameter.validFile(parameters, "name", false);
                        if (namefile == "not found") { namefile = "";  }
+
                        else { 
                                splitAtDash(namefile, namefileNames);
                                
index 004ed8313511c2dd42d4b87aca93c0ad5750bce5..cd3a1c74084de46dd7f5c7805d3425f0d0dd3ecb 100644 (file)
@@ -306,7 +306,7 @@ void OneColumnFile::initFile(string label){
                }
                else{
                        openOutputFile(outName, outFile);
-                       outFile << "numsequences\t" << label << endl;
+                       outFile << "numsampled\t" << label << endl;
                }
        
                outFile.setf(ios::fixed, ios::floatfield);
index 88a49ed93512369a5c3d0e9d7c17fde7304d5f7f..c68d0cc413855650efc86f2370ad7ab82508e6b7 100644 (file)
@@ -50,7 +50,8 @@ void HeatMapSim::getPic(vector<SharedRAbundVector*> lookup, vector<Calculator*>
                        }
                        
                        sims.clear();
-                       double biggest = -1;
+//                     double biggest = -1;
+                       double biggest = 1;
                        float scaler;
 
                        //get sim for each comparison and save them so you can find the relative similairity
@@ -65,7 +66,7 @@ void HeatMapSim::getPic(vector<SharedRAbundVector*> lookup, vector<Calculator*>
                                                sims.push_back(data[0]);
                                        
                                                //save biggest similairity to set relative sim
-                                               if (data[0] > biggest) { biggest = data[0]; }
+//                                             if (data[0] > biggest) { biggest = data[0]; }
                                }
                        }
                        
@@ -171,6 +172,7 @@ void HeatMapSim::getPic(vector< vector<double> > dists, vector<string> groups) {
 
 void HeatMapSim::printLegend(int y, float maxSim) {
        try {
+               maxSim = 1;
                
                //output legend and color labels
                //go through map and give each score a color value
@@ -180,22 +182,22 @@ void HeatMapSim::printLegend(int y, float maxSim) {
                //prints legend
                for (int i = 1; i < 255; i++) {
                        color = toHex(int((float)(i)));
-                       outsvg << "<rect fill=\"#" + color + "0000\" stroke=\"#" + color + "0000\" x=\"" + toString(x) + "\" y=\"" + toString(y) + "\" width=\"1\" height=\"10\"/>\n";
-                       x += 1;
+                       outsvg << "<rect fill=\"#" + color + "0000\" stroke=\"#" + color + "0000\" x=\"" + toString(x) + "\" y=\"" + toString(y) + "\" width=\"3\" height=\"30\"/>\n";
+                       x += 3;
                }
                
                float scaler = maxSim / 5.0;
                
                //prints legend labels
-               x = 10;
-               for (int i = 1; i<=5; i++) {
+               x = 0;
+               for (int i = 0; i<=5; i++) {
                        float label = scaler*i;
                        label = int(label * 1000 + 0.5);
                        label /= 1000.0;
-                       string text = toString(label, 3);
+                       string text = toString(label, 1);
                        
                        outsvg << "<text fill=\"black\" class=\"seri\" x=\"" + toString(x) + "\" y=\"" + toString(y-3) + "\">" + text + "</text>\n";
-                       x += 60;
+                       x += 153;
                }
        }
        
index 479a862935430545757ef48c6da660913782cfa2..66fd8500e3c7d583604106e0408e15bc36dc4a92 100644 (file)
@@ -43,9 +43,10 @@ PreClusterCommand::PreClusterCommand(string option){
                        //check for optional parameter and set defaults
                        // ...at some point should added some additional type checking...
                        namefile = validParameter.validFile(parameters, "name", true);
+
                        if (namefile == "not found") { namefile =  "";  }
                        else if (namefile == "not open") { abort = true; }      
-                       else {  readNameFile();  }
+//                     else {  readNameFile();  }
                        
                        string temp     = validParameter.validFile(parameters, "diffs", false);                         if(temp == "not found"){        temp = "1"; }
                        convert(temp, diffs); 
@@ -87,68 +88,62 @@ int PreClusterCommand::execute(){
                if (abort == true) { return 0; }
                
                //reads fasta file and return number of seqs
-               int numSeqs = readSeqs(); //fills alignSeqs and makes all seqs active
+               int numSeqs = readNamesFASTA(); //fills alignSeqs and makes all seqs active
        
                if (numSeqs == 0) { mothurOut("Error reading fasta file...please correct."); mothurOutEndLine(); return 0;  }
                if (diffs > length) { mothurOut("Error: diffs is greater than your sequence length."); mothurOutEndLine(); return 0;  }
                
                //clear sizes since you only needed this info to build the alignSeqs seqPNode structs
-               sizes.clear();
+//             sizes.clear();
        
                //sort seqs by number of identical seqs
                sort(alignSeqs.begin(), alignSeqs.end(), comparePriority);
        
                //go through active list and cluster everthing you can, until all nodes are inactive
                //taking advantage of the fact that maps are already sorted
-               map<string, bool>::iterator itActive;
-               map<string, bool>::iterator it2Active;
+//             map<string, bool>::iterator itActive;
+//             map<string, bool>::iterator it2Active;
                int count = 0;
                
-               for (int i = 0; i < alignSeqs.size(); i++) {
-               
-                       //are you active
-                       itActive = active.find(alignSeqs[i].seq.getName());
+               for (int i = 0; i < numSeqs; i++) {
                        
-                       if (itActive != active.end()) {  //this sequence has not been merged yet
+                       //are you active
+                       //                      itActive = active.find(alignSeqs[i].seq.getName());
                        
-                               //try to merge it with all smaller seqs
-                               for (int j = i; j < alignSeqs.size(); j++) {
-                                       
-                                       if (i != j) {
-                                               //are you active
-                                               it2Active = active.find(alignSeqs[j].seq.getName());
-                                               if (it2Active != active.end()) {  //this sequence has not been merged yet
-                                                       //are you within "diff" bases
-                                                       int mismatch = calcMisMatches(alignSeqs[i].seq.getAligned(), alignSeqs[j].seq.getAligned());
-                                                       
-                                                       if (mismatch <= diffs) {
-                                                               //merge
-                                                               names[alignSeqs[i].seq.getName()] += "," + names[alignSeqs[j].seq.getName()];
-                                                       
-                                                               //remove from active list
-                                                               active.erase(it2Active);
-                                                               
-                                                               //set numIdentical to 0, so you only print the representative seqs in the fasta file
-                                                               alignSeqs[j].numIdentical = 0;
-                                                               count++;
-                                                       }
-                                               }//end if j active
-                                       }//end if i != j
-                               }//end for loop
+                       if (alignSeqs[i].active) {  //this sequence has not been merged yet
                                
-                               //remove from active list 
-                               active.erase(itActive);
+                               //try to merge it with all smaller seqs
+                               for (int j = i+1; j < numSeqs; j++) {
+                                       if (alignSeqs[j].active) {  //this sequence has not been merged yet
+                                               //are you within "diff" bases
+                                               int mismatch = calcMisMatches(alignSeqs[i].seq.getAligned(), alignSeqs[j].seq.getAligned());
+                                               
+                                               if (mismatch <= diffs) {
+                                                       //merge
+                                                       alignSeqs[i].names += ',' + alignSeqs[j].names;
+                                                       alignSeqs[i].numIdentical += alignSeqs[j].numIdentical;
+
+                                                       alignSeqs[j].active = 0;
+                                                       alignSeqs[j].numIdentical = 0;
+                                                       count++;
+                                               }
+                                       }//end if j active
+                               }//end if i != j
+                       
+                       //remove from active list 
+                               alignSeqs[i].active = 0;
                        }//end if active i
+                       if(i % 100 == 0)        { cout << i << '\t' << numSeqs - count << '\t' << count << endl;        }
                }
        
                string newFastaFile = getRootName(fastafile) + "precluster" + getExtension(fastafile);
                string newNamesFile = getRootName(fastafile) + "precluster.names";
                
-               printData(newFastaFile, newNamesFile);
                
                mothurOut("Total number of sequences before precluster was " + toString(alignSeqs.size()) + "."); mothurOutEndLine();
                mothurOut("pre.cluster removed " + toString(count) + " sequences."); mothurOutEndLine(); 
-               
+               printData(newFastaFile, newNamesFile);
+
                return 0;
                
        }
@@ -158,65 +153,85 @@ int PreClusterCommand::execute(){
        }
 }
 /**************************************************************************************************/
-int PreClusterCommand::readSeqs(){
+int PreClusterCommand::readFASTA(){
        try {
-               ifstream inFasta;
-               openInputFile(fastafile, inFasta);
-               length = 0;
-               map<string, string>::iterator it;
-                               
-               while (!inFasta.eof()) {
-                       Sequence temp(inFasta);  //read seq
-                       
-                       if (temp.getName() != "") {  
-                               if (namefile != "") {
-                                       //make sure fasta and name files match
-                                       it = names.find(temp.getName());
-                                       if (it == names.end()) { mothurOut(temp.getName() + " is not in your names file, please correct."); mothurOutEndLine(); exit(1); }
-                               }else { sizes[temp.getName()] = 1; }
-                               
-                               seqPNode tempNode(sizes[temp.getName()], temp);
-                               alignSeqs.push_back(tempNode); 
-                               active[temp.getName()] = true;
-                       }
-                       gobble(inFasta);
-               }
-               inFasta.close();
-               
-               if (alignSeqs.size() != 0) {  length = alignSeqs[0].seq.getAligned().length();  }
-               
+//             ifstream inFasta;
+//             openInputFile(fastafile, inFasta);
+//             length = 0;
+////           map<string, string>::iterator it;
+//
+//             while (!inFasta.eof()) {
+//                     Sequence temp(inFasta);  //read seq
+//                     
+//                     if (temp.getName() != "") {  
+//                             if (namefile != "") {
+//                                     //make sure fasta and name files match
+//                                     it = names.find(temp.getName());
+//                                     if (it == names.end()) { mothurOut(temp.getName() + " is not in your names file, please correct."); mothurOutEndLine(); exit(1); }
+//                             }else { sizes[temp.getName()] = 1; }
+//                             
+//                             seqPNode tempNode(sizes[temp.getName()], temp);
+//                             alignSeqs.push_back(tempNode); 
+//                             active[temp.getName()] = true;
+//                     }
+//                     gobble(inFasta);
+//             }
+//             inFasta.close();
+//             
+//             if (alignSeqs.size() != 0) {  length = alignSeqs[0].seq.getAligned().length();  }
+//             
                return alignSeqs.size();
        }
        catch(exception& e) {
-               errorOut(e, "PreClusterCommand", "readSeqs");
+               errorOut(e, "PreClusterCommand", "readFASTA");
                exit(1);
        }
 }
 /**************************************************************************************************/
-void PreClusterCommand::readNameFile(){
+
+int PreClusterCommand::readNamesFASTA(){
        try {
-               ifstream in;
-               openInputFile(namefile, in);
-               string firstCol, secondCol;
-                               
-               while (!in.eof()) {
-                       in >> firstCol >> secondCol; gobble(in);
-                       names[firstCol] = secondCol;
+               ifstream inNames;
+               ifstream inFasta;
+               
+               openInputFile(namefile, inNames);
+               openInputFile(fastafile, inFasta);
+               
+               string firstCol, secondCol, nameString;
+               length = 0;
+               
+               while (inFasta && inNames) {
+       
+                       inNames >> firstCol >> secondCol;
+                       nameString = secondCol;
+                       
+                       gobble(inNames);
                        int size = 1;
                        while (secondCol.find_first_of(',') != -1) { 
                                size++;
                                secondCol = secondCol.substr(secondCol.find_first_of(',')+1, secondCol.length());
                        }
-                       sizes[firstCol] = size;
+                       Sequence seq(inFasta);
+                       if (seq.getName() != firstCol) { mothurOut(seq.getName() + " is not in your names file, please correct."); mothurOutEndLine(); exit(1); }
+                       else{
+                               seqPNode tempNode(size, seq, nameString);
+                               alignSeqs.push_back(tempNode);
+                               if (seq.getAligned().length() > length) {  length = alignSeqs[0].seq.getAligned().length();  }
+                       }                       
                }
-               in.close();
+               inFasta.close();
+               inNames.close();
+               return alignSeqs.size();
        }
+       
        catch(exception& e) {
-               errorOut(e, "PreClusterCommand", "readNameFile");
+               errorOut(e, "PreClusterCommand", "readNamesFASTA");
                exit(1);
        }
 }
+
 /**************************************************************************************************/
+
 int PreClusterCommand::calcMisMatches(string seq1, string seq2){
        try {
                int numBad = 0;
@@ -234,23 +249,22 @@ int PreClusterCommand::calcMisMatches(string seq1, string seq2){
                exit(1);
        }
 }
+
 /**************************************************************************************************/
+
 void PreClusterCommand::printData(string newfasta, string newname){
        try {
                ofstream outFasta;
                ofstream outNames;
+               
                openOutputFile(newfasta, outFasta);
                openOutputFile(newname, outNames);
                                
-               map<string, string>::iterator itNames;
                
                for (int i = 0; i < alignSeqs.size(); i++) {
                        if (alignSeqs[i].numIdentical != 0) {
                                alignSeqs[i].seq.printSequence(outFasta); 
-                               
-                               itNames = names.find(alignSeqs[i].seq.getName());
-                               
-                               outNames << itNames->first << '\t' << itNames->second << endl;
+                               outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl;
                        }
                }
                
index bb666cdd1bd93fd9b17740e69d68d7a5ec2ff1ff..5113fa44e0e5b848a22d52c6c8c03f29a646ffd0 100644 (file)
 struct seqPNode {
        int numIdentical;
        Sequence seq;
+       string names;
+       bool active;
        seqPNode() {}
-       seqPNode(int s, Sequence q) : numIdentical(s), seq(q) {}
+       seqPNode(int n, Sequence s, string nm) : numIdentical(n), seq(s), names(nm), active(1) {}
        ~seqPNode() {}
 };
 /************************************************************/
@@ -38,13 +40,13 @@ private:
        bool abort;
        string fastafile, namefile;
        vector<seqPNode> alignSeqs; //maps the number of identical seqs to a sequence
-       map<string, string> names; //represents the names file first column maps to second column
-       map<string, int> sizes;  //this map a seq name to the number of identical seqs in the names file
-       map<string, bool> active; //maps sequence name to whether it has already been merged or not.
+//     map<string, string> names; //represents the names file first column maps to second column
+//     map<string, int> sizes;  //this map a seq name to the number of identical seqs in the names file
+//     map<string, bool> active; //maps sequence name to whether it has already been merged or not.
        
-       int readSeqs();
+       int readFASTA();
+       int readNamesFASTA();
        int calcMisMatches(string, string);
-       void readNameFile();
        void printData(string, string); //fasta filename, names file name
 };
 
index 9225e992bab6e17d0c3228cc36ced9a2505933e7..32426f3092d17468aa2f2599528f915423954376 100644 (file)
@@ -151,6 +151,7 @@ int ScreenSeqsCommand::execute(){
                        gobble(inFASTA);
                }       
                if(namefile != "" && groupfile != "")   {       screenNameGroupFile(badSeqNames);       }       // this screens both names and groups
+               else if(namefile != "")                                 {       screenNameGroupFile(badSeqNames);       }
                else if(groupfile != "")                                {       screenGroupFile(badSeqNames);           }       // this screens just the groups
                if(alignreport != "")                                   {       screenAlignReport(badSeqNames);         }
                
index f9c1b6be1035ef85e6ea0e2c25be118cd99b9c97..d5cdeeccd05dbe8570ad09c5617c3ef106a7be78 100644 (file)
@@ -14,7 +14,7 @@ EstOutput MorHorn::getValues(vector<SharedRAbundVector*> shared) {
        try {   
                data.resize(1,0);
                
-               int Atotal, Btotal, tempA, tempB;
+               float Atotal, Btotal, tempA, tempB;
                Atotal = 0; Btotal = 0; 
                float  morhorn, sumSharedA, sumSharedB, a, b, d;
                morhorn = 0.0; sumSharedA = 0.0; sumSharedB = 0.0; a = 0.0; b = 0.0; d = 0.0;
@@ -26,23 +26,21 @@ EstOutput MorHorn::getValues(vector<SharedRAbundVector*> shared) {
                        Btotal += shared[1]->getAbundance(i);
                }
                
-               //calculate the theta denominator sums
+               //calculate the denominator sums
                for (int j = 0; j < shared[0]->size(); j++) {
                        //store in temps to avoid multiple repetitive function calls
                        tempA = shared[0]->getAbundance(j);
                        tempB = shared[1]->getAbundance(j);
+                       float relA = tempA / Atotal;
+                       float relB = tempB / Btotal;
                        
-                       a += tempA * tempA;
-                       b += tempB * tempB;
-                       d += tempA * tempB;
+                       a += relA * relA;
+                       b += relB * relB;
+                       d += relA * relB;
                }
 
-               a /= double(Atotal * Atotal);
-               b /= double(Btotal * Btotal);
-               d /= double(Atotal * Btotal);
-               
                morhorn = (2 * d) / (a + b);
-               
+
                if (isnan(morhorn) || isinf(morhorn)) { morhorn = 0; }
                
                data[0] = morhorn;
index f7773244bc37d888d4c44818cdd8dd1d020444ba..a6950aca22d383fdba9dde8d719d0430e62e32d4 100644 (file)
@@ -185,15 +185,17 @@ int TrimSeqsCommand::execute(){
                                        }
                                        if(!success)                    {       trashCode += 'q';                                                               }
                                }
+                               
                                if(barcodes.size() != 0){
-                                       
                                        success = stripBarcode(currSeq, group);
                                        if(!success){   trashCode += 'b';       }
                                }
+                               
                                if(numFPrimers != 0){
                                        success = stripForward(currSeq);
                                        if(!success){   trashCode += 'f';       }
                                }
+                               
                                if(numRPrimers != 0){
                                        success = stripReverse(currSeq);
                                        if(!success){   trashCode += 'r';       }